1 /**
2  *  vmx.c - The Intel VT-x driver for Dune
3  *
4  * This file is derived from Linux KVM VT-x support.
5  * Copyright (C) 2006 Qumranet, Inc.
6  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7  *
8  * Original Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This modified version is simpler because it avoids the following
13  * features that are not requirements for Dune:
14  *  * Real-mode emulation
15  *  * Nested VT-x support
16  *  * I/O hardware emulation
17  *  * Any of the more esoteric X86 features and registers
18  *  * KVM-specific functionality
19  *
20  * In essence we provide only the minimum functionality needed to run
21  * a process in vmx non-root mode rather than the full hardware emulation
22  * needed to support an entire OS.
23  *
24  * This driver is a research prototype and as such has the following
25  * limitations:
26  *
27  * FIXME: Backward compatibility is currently a non-goal, and only recent
28  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
29  * driver.
30  *
31  * FIXME: Eventually we should handle concurrent users of VT-x more
32  * gracefully instead of requiring exclusive access. This would allow
33  * Dune to interoperate with KVM and other HV solutions.
34  *
35  * FIXME: We need to support hotplugged physical CPUs.
36  *
37  * Authors:
38  *   Adam Belay   <abelay@stanford.edu>
39  */
40
41 /* Basic flow.
42  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
43  * You're left with the feeling that they got partway through and realized they needed one for each of:
44  *
45  * 1) your CPU is going to be capable of running VMs, and you need state for that.
46  *
47  * 2) you're about to start a guest, and you need state for that.
48  *
49  * So there is "get the cpu set up to be able to run VMs" stuff, and
50  * "now let's start a guest" stuff.  In Akaros, CPUs will always be set up
51  * to run a VM if that is possible. Processes can flip themselves into
52  * a VM and that will require another VMCS.
53  *
54  * So: at kernel startup time, the SMP boot stuff calls
55  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
56  * in the case of this file is intel_vmm_init. That runs code
57  * that sets up stuff for ALL sockets, based on the capabilities of
58  * the socket it runs on. If any cpu supports vmx, it assumes they all
59  * do. That's a realistic assumption. So the call_function_all is kind
60  * of stupid, really; it could just see what's on the current cpu and
61  * assume it's on all. HOWEVER: there are systems in the wild that
62  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
63  * might as well allow for the chance that we'll only allow VMMCPs on a
64  * subset (not implemented yet, however).  So: probe all CPUs, get a
65  * count of how many support VMX and, for now, assume they all do
66  * anyway.
67  *
68  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
69  * which contains all the naughty bits settings for all the cpus that can run a VM.
70  * Realistically, all VMX-capable cpus in a system will have identical configurations.
71  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
72  *
73  * Configure the msr_bitmap. This is the bitmap of MSRs which the
74  * guest can manipulate.  Currently, we only allow GS and FS base.
75  *
76  * Reserve bit 0 in the vpid bitmap, as guests cannot use that.
77  *
78  * Set up what we call the vmxarea. The vmxarea is per-cpu, not
79  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
80  * there is the revision id. The vmxarea is page-sized per cpu and
81  * page-aligned. Note that it can be smaller, but why bother? We know
82  * the max size and alignment, and it's convenient.
83  *
84  * Now that it is set up, enable vmx on all cpus. This involves
85  * testing VMXE in cr4, to see if we've been here before (TODO: delete
86  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
87  * do a VM, then setting VMXE in cr4, calling vmxon (which executes a vmxon
88  * instruction), and syncing vpids and epts.  Now the CPU is ready
89  * to host guests.
90  *
91  * Setting up a guest.
92  * We divide this into two things: vmm_proc_init and vm_run.
93  * Currently, on Intel, vmm_proc_init does nothing.
94  *
95  * vm_run is really complicated. It is called with a coreid, rip, rsp,
96  * cr3, and flags.  On intel, it calls vmx_launch. vmx_launch is set
97  * up for a few test cases. If rip is 1, it sets the guest rip to
98  * a function which derefs 0 and should exit with a failure. If rip is 0,
99  * it points the guest at an infinite loop.
100  *
101  * The sequence of operations:
102  * create a vcpu
103  * while (1) {
104  * get a vcpu
105  * disable irqs (required or you can't enter the VM)
106  * vmx_run_vcpu()
107  * enable irqs
108  * manage the vm exit
109  * }
110  *
111  * get a vcpu
112  * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
113  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
114  *
115  * If it's not the same, see if the vcpu thinks it is loaded on another core. If so, call
116  * __vmx_get_cpu_helper on that cpu, to free it up. Else vmcs_clear the vcpu's
117  * vmcs. Then vmcs_load the vmcs for the vcpu on this cpu,
118  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
119  *
120  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
121  * of inline assembly with embedded CPP crap. I suspect we'll want to
122  * un-inline it someday, but maybe not.  It's called with a vcpu
123  * struct from which it loads guest state, and to which it stores
124  * non-virtualized host state. It issues a vmlaunch or vmresume
125  * instruction as appropriate, and on return it evaluates whether the
126  * launch/resume had an error in that operation. Note this is NOT the
127  * same as an error while in the virtual machine; this is an error in
128  * startup due to misconfiguration. Depending on what is returned, it's
129  * either a failed vm startup or an exit for one of many reasons.
130  *
131  */
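
/* A minimal sketch (not compiled in) of how the pieces described above fit
 * together.  It assumes the helpers defined later in this file; the
 * example_launch() wrapper itself, its arguments, and the error handling are
 * illustrative only, not the actual interface. */
#if 0
static int example_launch(struct proc *p, uint64_t rip, uint64_t rsp,
                          uint64_t cr3)
{
	struct vmx_vcpu *vcpu;
	int reason, ret;

	/* Boot-time: intel_vmm_init() already ran setup_vmcs_config() and
	 * enabled VMX on every core, so only per-guest state is needed here. */
	vcpu = vmx_create_vcpu(p);
	if (!vcpu)
		return -ENOMEM;
	vmx_get_cpu(vcpu);
	vmcs_writel(GUEST_RIP, rip);
	vmcs_writel(GUEST_RSP, rsp);
	vmcs_writel(GUEST_CR3, cr3);
	vmx_put_cpu(vcpu);
	while (1) {
		vmx_get_cpu(vcpu);
		disable_irq();	/* required before vmlaunch/vmresume */
		reason = vmx_run_vcpu(vcpu);
		enable_irq();
		vmx_put_cpu(vcpu);
		/* dispatch on 'reason' (EPT violation, CPUID, etc.) here */
		if (vcpu->shutdown)
			break;
	}
	ret = vcpu->ret_code;
	vmx_destroy_vcpu(vcpu);
	return ret;
}
#endif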
132
133 /* basically: only rename those globals that might conflict
134  * with existing names. Leave all else the same.
135  * this code is more modern than the other code, yet still
136  * well encapsulated, it seems.
137  */
138 #include <kmalloc.h>
139 #include <string.h>
140 #include <stdio.h>
141 #include <assert.h>
142 #include <error.h>
143 #include <pmap.h>
144 #include <sys/queue.h>
145 #include <smp.h>
146 #include <kref.h>
147 #include <atomic.h>
148 #include <alarm.h>
149 #include <event.h>
150 #include <umem.h>
151 #include <bitops.h>
152 #include <arch/types.h>
153 #include <syscall.h>
154
155 #include "vmx.h"
156 #include "../vmm.h"
157
158 #include "compat.h"
159 #include "cpufeature.h"
160
161 #define currentcpu (&per_cpu_info[core_id()])
162
163 /* Set once we determine the hardware supports VMX.  It should only ever
164  * go from FALSE to TRUE; if it somehow changes after that, you failed.
165  */
166 static bool has_vmx = FALSE;
167
168 /*
169  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
170  * away by decrementing the array size.
171  */
172 static const uint32_t vmx_msr_index[] = {
173 #ifdef CONFIG_X86_64
174         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
175 #endif
176         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
177 };
178 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
179
180 /* TEMPORARY TEST HACK EPT */
181 void *ept;
182 uint64_t eptp;
183 /* END HACKQUE */
184
185 static DECLARE_BITMAP(vmx_vpid_bitmap, /*VMX_NR_VPIDS*/ 65536);
186 static spinlock_t vmx_vpid_lock;
187
188 static unsigned long *msr_bitmap;
189
190 static struct vmcs_config {
191         int size;
192         int order;
193         uint32_t revision_id;
194         uint32_t pin_based_exec_ctrl;
195         uint32_t cpu_based_exec_ctrl;
196         uint32_t cpu_based_2nd_exec_ctrl;
197         uint32_t vmexit_ctrl;
198         uint32_t vmentry_ctrl;
199 } vmcs_config;
200
201 struct vmx_capability vmx_capability;
202
203 static inline bool cpu_has_secondary_exec_ctrls(void)
204 {
205         return vmcs_config.cpu_based_exec_ctrl &
206                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
207 }
208
209 static inline bool cpu_has_vmx_vpid(void)
210 {
211         return vmcs_config.cpu_based_2nd_exec_ctrl &
212                 SECONDARY_EXEC_ENABLE_VPID;
213 }
214
215 static inline bool cpu_has_vmx_invpcid(void)
216 {
217         return vmcs_config.cpu_based_2nd_exec_ctrl &
218                 SECONDARY_EXEC_ENABLE_INVPCID;
219 }
220
221 static inline bool cpu_has_vmx_invvpid_single(void)
222 {
223         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
224 }
225
226 static inline bool cpu_has_vmx_invvpid_global(void)
227 {
228         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
229 }
230
231 static inline bool cpu_has_vmx_ept(void)
232 {
233         return vmcs_config.cpu_based_2nd_exec_ctrl &
234                 SECONDARY_EXEC_ENABLE_EPT;
235 }
236
237 static inline bool cpu_has_vmx_invept_individual_addr(void)
238 {
239         return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
240 }
241
242 static inline bool cpu_has_vmx_invept_context(void)
243 {
244         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
245 }
246
247 static inline bool cpu_has_vmx_invept_global(void)
248 {
249         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
250 }
251
252 static inline bool cpu_has_vmx_ept_ad_bits(void)
253 {
254         return vmx_capability.ept & VMX_EPT_AD_BIT;
255 }
256
257 static inline void __invept(int ext, uint64_t eptp, gpa_t gpa)
258 {
259         struct {
260                 uint64_t eptp, gpa;
261         } operand = {eptp, gpa};
262
263         asm volatile (ASM_VMX_INVEPT
264                         /* CF==1 or ZF==1 --> rc = -1 */
265                         "; ja 1f ; ud2 ; 1:\n"
266                         : : "a" (&operand), "c" (ext) : "cc", "memory");
267 }
268
269 static inline void ept_sync_global(void)
270 {
271         if (cpu_has_vmx_invept_global())
272                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
273 }
274
275 static inline void ept_sync_context(uint64_t eptp)
276 {
277         if (cpu_has_vmx_invept_context())
278                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
279         else
280                 ept_sync_global();
281 }
282
283 static inline void ept_sync_individual_addr(uint64_t eptp, gpa_t gpa)
284 {
285         if (cpu_has_vmx_invept_individual_addr())
286                 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
287                                 eptp, gpa);
288         else
289                 ept_sync_context(eptp);
290 }
291
292 static inline void __vmxon(uint64_t addr)
293 {
294         asm volatile (ASM_VMX_VMXON_RAX
295                         : : "a"(&addr), "m"(addr)
296                         : "memory", "cc");
297 }
298
299 static inline void __vmxoff(void)
300 {
301         asm volatile (ASM_VMX_VMXOFF : : : "cc");
302 }
303
304 static inline void __invvpid(int ext, uint16_t vpid, gva_t gva)
305 {
306     struct {
307         uint64_t vpid : 16;
308         uint64_t rsvd : 48;
309         uint64_t gva;
310     } operand = { vpid, 0, gva };
311
312     asm volatile (ASM_VMX_INVVPID
313                   /* CF==1 or ZF==1 --> rc = -1 */
314                   "; ja 1f ; ud2 ; 1:"
315                   : : "a"(&operand), "c"(ext) : "cc", "memory");
316 }
317
318 static inline void vpid_sync_vcpu_single(uint16_t vpid)
319 {
320         if (vpid == 0) {
321                 return;
322         }
323
324         if (cpu_has_vmx_invvpid_single())
325                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
326 }
327
328 static inline void vpid_sync_vcpu_global(void)
329 {
330         if (cpu_has_vmx_invvpid_global())
331                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
332 }
333
334 static inline void vpid_sync_context(uint16_t vpid)
335 {
336         if (cpu_has_vmx_invvpid_single())
337                 vpid_sync_vcpu_single(vpid);
338         else
339                 vpid_sync_vcpu_global();
340 }
341
342 static void vmcs_clear(struct vmcs *vmcs)
343 {
344         uint64_t phys_addr = PADDR(vmcs);
345         uint8_t error;
346
347         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
348                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
349                       : "cc", "memory");
350         if (error)
351                 printk("vmclear fail: %p/%llx\n",
352                        vmcs, phys_addr);
353 }
354
355 static void vmcs_load(struct vmcs *vmcs)
356 {
357         uint64_t phys_addr = PADDR(vmcs);
358         uint8_t error;
359
360         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
361                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
362                         : "cc", "memory");
363         if (error)
364                 printk("vmptrld %p/%llx failed\n",
365                        vmcs, phys_addr);
366 }
367
368 /* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */
369 static physaddr_t vmcs_get_current(void)
370 {
371         physaddr_t vmcs_paddr;
372         /* RAX contains the addr of the location to store the VMCS pointer.  The
373          * compiler doesn't know the ASM will deref that pointer, hence the =m */
374         asm volatile (ASM_VMX_VMPTRST_RAX : "=m"(vmcs_paddr) : "a"(&vmcs_paddr));
375         return vmcs_paddr;
376 }
377
378 __always_inline unsigned long vmcs_readl(unsigned long field)
379 {
380         unsigned long value;
381
382         asm volatile (ASM_VMX_VMREAD_RDX_RAX
383                       : "=a"(value) : "d"(field) : "cc");
384         return value;
385 }
386
387 __always_inline uint16_t vmcs_read16(unsigned long field)
388 {
389         return vmcs_readl(field);
390 }
391
392 static __always_inline uint32_t vmcs_read32(unsigned long field)
393 {
394         return vmcs_readl(field);
395 }
396
397 static __always_inline uint64_t vmcs_read64(unsigned long field)
398 {
399 #ifdef CONFIG_X86_64
400         return vmcs_readl(field);
401 #else
402         return vmcs_readl(field) | ((uint64_t)vmcs_readl(field+1) << 32);
403 #endif
404 }
405
406 void vmwrite_error(unsigned long field, unsigned long value)
407 {
408         printk("vmwrite error: reg %lx value %lx (err %d)\n",
409                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
410 }
411
412 void vmcs_writel(unsigned long field, unsigned long value)
413 {
414         uint8_t error;
415
416         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
417                        : "=q"(error) : "a"(value), "d"(field) : "cc");
418         if (error)
419                 vmwrite_error(field, value);
420 }
421
422 static void vmcs_write16(unsigned long field, uint16_t value)
423 {
424         vmcs_writel(field, value);
425 }
426
427 static void vmcs_write32(unsigned long field, uint32_t value)
428 {
429         vmcs_writel(field, value);
430 }
431
432 static void vmcs_write64(unsigned long field, uint64_t value)
433 {
434         vmcs_writel(field, value);
435 }
436
437 static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt,
438                                       uint32_t msr, uint32_t *result)
439 {
440         uint32_t vmx_msr_low, vmx_msr_high;
441         uint32_t ctl = ctl_min | ctl_opt;
442         uint64_t vmx_msr = read_msr(msr);
443         vmx_msr_low = vmx_msr;
444         vmx_msr_high = vmx_msr>>32;
445
446         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
447         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
448
449         /* Ensure minimum (required) set of control bits are supported. */
450         if (ctl_min & ~ctl) {
451                 return -EIO;
452         }
453
454         *result = ctl;
455         return 0;
456 }
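
/* Worked example (illustrative numbers, not from real hardware): suppose the
 * capability MSR reads low = 0x12 and high = 0x7f.  A bit set in 'low' is
 * forced on; a bit clear in 'high' may not be set.  Requesting min = 0x4 and
 * opt = 0x100 gives ctl = (0x104 & 0x7f) | 0x12 = 0x16: the optional 0x100
 * bit is silently dropped, the required 0x4 bit survives, and the call
 * succeeds with *result = 0x16.  If 'high' were 0x7b (bit 0x4 clear), ctl
 * would come out 0x12, min & ~ctl would be nonzero, and we'd return -EIO. */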
457
458 static bool allow_1_setting(uint32_t msr, uint32_t ctl)
459 {
460         uint32_t vmx_msr_low, vmx_msr_high;
461
462         rdmsr(msr, vmx_msr_low, vmx_msr_high);
463         return vmx_msr_high & ctl;
464 }
465
466 static void setup_vmcs_config(void *p)
467 {
468         int *ret = p;
469         struct vmcs_config *vmcs_conf = &vmcs_config;
470         uint32_t vmx_msr_low, vmx_msr_high;
471         uint32_t min, opt, min2, opt2;
472         uint32_t _pin_based_exec_control = 0;
473         uint32_t _cpu_based_exec_control = 0;
474         uint32_t _cpu_based_2nd_exec_control = 0;
475         uint32_t _vmexit_control = 0;
476         uint32_t _vmentry_control = 0;
477
478         *ret = -EIO;
479         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
480         opt = PIN_BASED_VIRTUAL_NMIS;
481         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
482                                 &_pin_based_exec_control) < 0) {
483                 return;
484         }
485
486         min =
487               CPU_BASED_CR8_LOAD_EXITING |
488               CPU_BASED_CR8_STORE_EXITING |
489               CPU_BASED_CR3_LOAD_EXITING |
490               CPU_BASED_CR3_STORE_EXITING |
491               CPU_BASED_MOV_DR_EXITING |
492               CPU_BASED_USE_TSC_OFFSETING |
493               CPU_BASED_MWAIT_EXITING |
494               CPU_BASED_MONITOR_EXITING |
495               CPU_BASED_INVLPG_EXITING;
496
497         min |= CPU_BASED_HLT_EXITING;
498
499         opt = CPU_BASED_TPR_SHADOW |
500               CPU_BASED_USE_MSR_BITMAPS |
501               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
502         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
503                                 &_cpu_based_exec_control) < 0) {
504                 return;
505         }
506
507         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
508                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
509                                            ~CPU_BASED_CR8_STORE_EXITING;
510
511         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
512                 min2 = 
513                         SECONDARY_EXEC_ENABLE_VPID |
514                         SECONDARY_EXEC_ENABLE_EPT |
515                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
516                 opt2 =  SECONDARY_EXEC_WBINVD_EXITING |
517                         SECONDARY_EXEC_RDTSCP |
518                         SECONDARY_EXEC_ENABLE_INVPCID;
519                 if (adjust_vmx_controls(min2, opt2,
520                                         MSR_IA32_VMX_PROCBASED_CTLS2,
521                                         &_cpu_based_2nd_exec_control) < 0) {
522                         return;
523                 }
524         }
525
526         if (!(_cpu_based_2nd_exec_control &
527                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
528                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
529
530         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
531                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
532                    enabled */
533                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
534                                              CPU_BASED_CR3_STORE_EXITING |
535                                              CPU_BASED_INVLPG_EXITING);
536                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
537                       vmx_capability.ept, vmx_capability.vpid);
538         }
539
540         min = 0;
541
542         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
543
544 //      opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
545         opt = 0;
546         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
547                                 &_vmexit_control) < 0) {
548                 return;
549         }
550
551         min = 0;
552 //      opt = VM_ENTRY_LOAD_IA32_PAT;
553         opt = 0;
554         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
555                                 &_vmentry_control) < 0) {
556                 return;
557         }
558
559         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
560
561         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
562         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) {
563                 return;
564         }
565
566         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
567         if (vmx_msr_high & (1u<<16)) {
568                 printk("64-bit CPUs always have VMX_BASIC_MSR[48]==0. FAILS!\n");
569                 return;
570         }
571
572         /* Require Write-Back (WB) memory type for VMCS accesses. */
573         if (((vmx_msr_high >> 18) & 15) != 6) {
574                 printk("NO WB!\n");
575                 return;
576         }
577
578         vmcs_conf->size = vmx_msr_high & 0x1fff;
579         vmcs_conf->order = LOG2_UP(nr_pages(vmcs_conf->size));
580         vmcs_conf->revision_id = vmx_msr_low;
581         printk("vmcs_conf size %d order %d rev %d\n",
582                vmcs_conf->size, vmcs_conf->order,
583                vmcs_conf->revision_id);
584
585         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
586         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
587         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
588         vmcs_conf->vmexit_ctrl         = _vmexit_control;
589         vmcs_conf->vmentry_ctrl        = _vmentry_control;
590
591         vmx_capability.has_load_efer =
592                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
593                                 VM_ENTRY_LOAD_IA32_EFER)
594                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
595                                    VM_EXIT_LOAD_IA32_EFER);
596
597         /* Now that we've done all the setup we can do, verify
598          * that we have all the capabilities we need. These tests
599          * are done last presumably because all the work done above
600          * affects some of them.
601          */
602
603         if (!vmx_capability.has_load_efer) {
604                 printk("CPU lacks ability to load EFER register\n");
605                 return;
606         }
607
608         printk("CPU has all needed capabilities\n");
609         *ret = 0;
610 }
611
612 static struct vmcs *__vmx_alloc_vmcs(int node)
613 {
614         struct vmcs *vmcs;
615
616         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
617         if (!vmcs)
618                 return 0;
619         memset(vmcs, 0, vmcs_config.size);
620         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
621         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
622         return vmcs;
623 }
624
625 /**
626  * vmx_alloc_vmcs - allocates a VMCS region
627  *
628  * NOTE: Assumes the new region will be used by the current CPU.
629  *
630  * Returns a valid VMCS region.
631  */
632 static struct vmcs *vmx_alloc_vmcs(void)
633 {
634         return __vmx_alloc_vmcs(node_id());
635 }
636
637 /**
638  * vmx_free_vmcs - frees a VMCS region
639  */
640 static void vmx_free_vmcs(struct vmcs *vmcs)
641 {
642   //free_pages((unsigned long)vmcs, vmcs_config.order);
643 }
644
645 /*
646  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
647  * will not change in the lifetime of the guest.
648  * Note that host-state that does change is set elsewhere. E.g., host-state
649  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
650  */
651 static void vmx_setup_constant_host_state(void)
652 {
653         uint32_t low32, high32;
654         unsigned long tmpl;
655         pseudodesc_t dt;
656
657         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);  /* 22.2.3 */
658         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
659         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
660
661         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
662         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
663         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
664         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
665         vmcs_write16(HOST_TR_SELECTOR, GD_TSS);  /* 22.2.4 */
666
667         native_store_idt(&dt);
668         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);   /* 22.2.4 */
669
670         asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
671         vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
672
673         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
674         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
675         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
676         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
677
678         rdmsr(MSR_EFER, low32, high32);
679         vmcs_write32(HOST_IA32_EFER, low32);
680
681         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
682                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
683                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
684         }
685
686         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
687         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
688
689         /* TODO: This (at least gs) is per cpu */
690         rdmsrl(MSR_FS_BASE, tmpl);
691         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
692         rdmsrl(MSR_GS_BASE, tmpl);
693         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
694 }
695
696 static inline uint16_t vmx_read_ldt(void)
697 {
698         uint16_t ldt;
699         asm("sldt %0" : "=g"(ldt));
700         return ldt;
701 }
702
703 static unsigned long segment_base(uint16_t selector)
704 {
705         pseudodesc_t *gdt = &currentcpu->host_gdt;
706         struct desc_struct *d;
707         unsigned long table_base;
708         unsigned long v;
709
710         if (!(selector & ~3)) {
711                 return 0;
712         }
713
714         table_base = gdt->pd_base;
715
716         if (selector & 4) {           /* from ldt */
717                 uint16_t ldt_selector = vmx_read_ldt();
718
719                 if (!(ldt_selector & ~3)) {
720                         return 0;
721                 }
722
723                 table_base = segment_base(ldt_selector);
724         }
725         d = (struct desc_struct *)(table_base + (selector & ~7));
726         v = get_desc_base(d);
727 #ifdef CONFIG_X86_64
728        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
729                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
730 #endif
731         return v;
732 }
733
734 static inline unsigned long vmx_read_tr_base(void)
735 {
736         uint16_t tr;
737         asm("str %0" : "=g"(tr));
738         return segment_base(tr);
739 }
740
741 static void __vmx_setup_cpu(void)
742 {
743         pseudodesc_t *gdt = &currentcpu->host_gdt;
744         unsigned long sysenter_esp;
745         unsigned long tmpl;
746
747         /*
748          * Linux uses per-cpu TSS and GDT, so set these when switching
749          * processors.
750          */
751         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
752         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);   /* 22.2.4 */
753
754         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
755         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
756
757         rdmsrl(MSR_FS_BASE, tmpl);
758         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
759         rdmsrl(MSR_GS_BASE, tmpl);
760         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
761 }
762
763 static void __vmx_get_cpu_helper(struct hw_trapframe *hw_tf, void *ptr)
764 {
765         struct vmx_vcpu *vcpu = ptr;
766
767         if (core_id() != vcpu->cpu)
768                 panic("%s: core_id() %d != vcpu->cpu %d\n",
769                       __func__, core_id(), vcpu->cpu);
770
771         vmcs_clear(vcpu->vmcs);
772         if (currentcpu->local_vcpu == vcpu)
773                 currentcpu->local_vcpu = NULL;
774 }
775
776 /**
777  * vmx_get_cpu - called before using a cpu
778  * @vcpu: VCPU that will be loaded.
779  *
780  * Disables preemption. Call vmx_put_cpu() when finished.
781  */
782 static void vmx_get_cpu(struct vmx_vcpu *vcpu)
783 {
784         int cur_cpu = core_id();
785         handler_wrapper_t *w;
786
787         if (currentcpu->local_vcpu)
788                 panic("get_cpu: currentcpu->localvcpu was non-NULL");
789         if (currentcpu->local_vcpu != vcpu) {
790                 currentcpu->local_vcpu = vcpu;
791
792                 if (vcpu->cpu != cur_cpu) {
793                         if (vcpu->cpu >= 0) {
794                                 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
795                         } else
796                                 vmcs_clear(vcpu->vmcs);
797
798                         vpid_sync_context(vcpu->vpid);
799                         ept_sync_context(eptp);
800
801                         vcpu->launched = 0;
802                         vmcs_load(vcpu->vmcs);
803                         __vmx_setup_cpu();
804                         vcpu->cpu = cur_cpu;
805                 } else {
806                         vmcs_load(vcpu->vmcs);
807                 }
808         }
809 }
810
811 /**
812  * vmx_put_cpu - called after using a cpu
813  * @vcpu: VCPU that was loaded.
814  */
815 static void vmx_put_cpu(struct vmx_vcpu *vcpu)
816 {
817         if (core_id() != vcpu->cpu)
818                 panic("%s: core_id() %d != vcpu->cpu %d\n",
819                       __func__, core_id(), vcpu->cpu);
820
821         if (currentcpu->local_vcpu != vcpu)
822                 panic("vmx_put_cpu: asked to clear something not ours");
823
824
825         vpid_sync_context(vcpu->vpid);
826         ept_sync_context(eptp);
827         vmcs_clear(vcpu->vmcs);
828         vcpu->cpu = -1;
829         currentcpu->local_vcpu = NULL;
830         //put_cpu();
831 }
832
833 static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr)
834 {
835         struct vmx_vcpu *vcpu = ptr;
836
837         ept_sync_context(eptp);
838 }
839
840 struct sync_addr_args {
841         struct vmx_vcpu *vcpu;
842         gpa_t gpa;
843 };
844
845 static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr)
846 {
847         struct sync_addr_args *args = ptr;
848
849 //      ept_sync_individual_addr(
850
851 }
852
853 /**
854  * vmx_ept_sync_vcpu - used to evict everything in the EPT
855  * @vcpu: the vcpu
856  */
857 void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu)
858 {
859         handler_wrapper_t *w;
860
861         smp_call_function_single(vcpu->cpu,
862                 __vmx_sync_helper, (void *) vcpu, &w);
863
864         if (smp_call_wait(w)) {
865                 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
866         }
867
868
869 }
870
871 /**
872  * vmx_ept_sync_individual_addr - used to evict an individual address
873  * @vcpu: the vcpu
874  * @gpa: the guest-physical address
875  */
876 void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa)
877 {
878         struct sync_addr_args args;
879         args.vcpu = vcpu;
880         args.gpa = gpa;
881
882         handler_wrapper_t *w;
883
884
885         smp_call_function_single(vcpu->cpu,
886                                  __vmx_sync_individual_addr_helper, (void *) &args, &w);
887
888         if (smp_call_wait(w)) {
889                 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
890         }
891
892 }
893
894 /**
895  * vmx_dump_cpu - prints the CPU state
896  * @vcpu: VCPU to print
897  */
898 static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
899 {
900
901         unsigned long flags;
902
903         vmx_get_cpu(vcpu);
904         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
905         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
906         flags = vmcs_readl(GUEST_RFLAGS);
907         vmx_put_cpu(vcpu);
908
909         printk("--- Begin VCPU Dump ---\n");
910         printk("CPU %d VPID %d\n", vcpu->cpu, vcpu->vpid);
911         printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
912                vcpu->regs.tf_rip, flags);
913         printk("RAX 0x%016lx RCX 0x%016lx\n",
914                 vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
915         printk("RDX 0x%016lx RBX 0x%016lx\n",
916                 vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
917         printk("RSP 0x%016lx RBP 0x%016lx\n",
918                 vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
919         printk("RSI 0x%016lx RDI 0x%016lx\n",
920                 vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
921         printk("R8  0x%016lx R9  0x%016lx\n",
922                 vcpu->regs.tf_r8, vcpu->regs.tf_r9);
923         printk("R10 0x%016lx R11 0x%016lx\n",
924                 vcpu->regs.tf_r10, vcpu->regs.tf_r11);
925         printk("R12 0x%016lx R13 0x%016lx\n",
926                 vcpu->regs.tf_r12, vcpu->regs.tf_r13);
927         printk("R14 0x%016lx R15 0x%016lx\n",
928                 vcpu->regs.tf_r14, vcpu->regs.tf_r15);
929         printk("--- End VCPU Dump ---\n");
930
931 }
932
933 uint64_t construct_eptp(unsigned long root_hpa)
934 {
935         uint64_t eptp;
936
937         /* TODO write the value reading from MSR */
938         eptp = VMX_EPT_DEFAULT_MT |
939                 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
940         if (cpu_has_vmx_ept_ad_bits())
941                 eptp |= VMX_EPT_AD_ENABLE_BIT;
942         eptp |= (root_hpa & PAGE_MASK);
943
944         return eptp;
945 }
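
/* Illustrative sketch of how the EPTP built above gets used, given the
 * TEMPORARY TEST HACK globals 'ept' and 'eptp' near the top of this file.
 * The allocation step is only sketched; all that matters here is a
 * page-aligned, zeroed EPT root whose PADDR() feeds construct_eptp():
 *
 *	ept = <one page-aligned, zeroed page for the EPT root>;
 *	eptp = construct_eptp(PADDR(ept));
 *	...
 *	vmcs_write64(EPT_POINTER, eptp);	// done in vmx_setup_vmcs()
 */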
946
947 /**
948  * vmx_setup_initial_guest_state - configures the initial state of guest registers
949  */
950 static void vmx_setup_initial_guest_state(void)
951 {
952         unsigned long tmpl;
953         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
954                             X86_CR4_PGE | X86_CR4_OSFXSR;
955         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
956 #if 0
957         do we need it
958         if (boot_cpu_has(X86_FEATURE_PCID))
959                 cr4 |= X86_CR4_PCIDE;
960         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
961                 cr4 |= X86_CR4_OSXSAVE;
962 #endif
963         /* we almost certainly have this */
964         /* we'll go sour if we don't. */
965         if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
966                 cr4 |= X86_CR4_RDWRGSFS;
967
968         /* configure control and data registers */
969         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
970                                X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
971         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
972                                      X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
973         vmcs_writel(GUEST_CR3, rcr3());
974         vmcs_writel(GUEST_CR4, cr4);
975         vmcs_writel(CR4_READ_SHADOW, cr4);
976         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
977                                      EFER_SCE | EFER_FFXSR);
978         vmcs_writel(GUEST_GDTR_BASE, 0);
979         vmcs_writel(GUEST_GDTR_LIMIT, 0);
980         vmcs_writel(GUEST_IDTR_BASE, 0);
981         vmcs_writel(GUEST_IDTR_LIMIT, 0);
982         vmcs_writel(GUEST_RIP, 0xdeadbeef);
983         vmcs_writel(GUEST_RSP, 0xdeadbeef);
984         vmcs_writel(GUEST_RFLAGS, 0x02);
985         vmcs_writel(GUEST_DR7, 0);
986
987         /* guest segment bases */
988         vmcs_writel(GUEST_CS_BASE, 0);
989         vmcs_writel(GUEST_DS_BASE, 0);
990         vmcs_writel(GUEST_ES_BASE, 0);
991         vmcs_writel(GUEST_GS_BASE, 0);
992         vmcs_writel(GUEST_SS_BASE, 0);
993         rdmsrl(MSR_FS_BASE, tmpl);
994         vmcs_writel(GUEST_FS_BASE, tmpl);
995
996         /* guest segment access rights */
997         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
998         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
999         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
1000         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
1001         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
1002         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
1003
1004         /* guest segment limits */
1005         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
1006         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
1007         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
1008         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
1009         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
1010         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
1011
1012         /* configure segment selectors */
1013         vmcs_write16(GUEST_CS_SELECTOR, 0);
1014         vmcs_write16(GUEST_DS_SELECTOR, 0);
1015         vmcs_write16(GUEST_ES_SELECTOR, 0);
1016         vmcs_write16(GUEST_FS_SELECTOR, 0);
1017         vmcs_write16(GUEST_GS_SELECTOR, 0);
1018         vmcs_write16(GUEST_SS_SELECTOR, 0);
1019         vmcs_write16(GUEST_TR_SELECTOR, 0);
1020
1021         /* guest LDTR */
1022         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1023         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1024         vmcs_writel(GUEST_LDTR_BASE, 0);
1025         vmcs_writel(GUEST_LDTR_LIMIT, 0);
1026
1027         /* guest TSS */
1028         vmcs_writel(GUEST_TR_BASE, 0);
1029         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1030         vmcs_writel(GUEST_TR_LIMIT, 0xff);
1031
1032         /* initialize sysenter */
1033         vmcs_write32(GUEST_SYSENTER_CS, 0);
1034         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1035         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1036
1037         /* other random initialization */
1038         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1039         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1040         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1041         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1042         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1043 }
1044
1045 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
1046 {
1047         int f = sizeof(unsigned long);
1048         /*
1049          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1050          * have the write-low and read-high bitmap offsets the wrong way round.
1051          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1052          */
1053         if (msr <= 0x1fff) {
1054                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
1055                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
1056         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1057                 msr &= 0x1fff;
1058                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
1059                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
1060         }
1061 }
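
/* Illustrative use: the flow comment at the top says we currently only let
 * the guest touch the FS and GS base MSRs without exiting.  With this helper
 * that would look like the following (hypothetical call site; the real setup
 * happens via setup_msr() below):
 *
 *	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
 *	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
 *
 * Both MSRs are in the 0xc0000000 range, so after masking with 0x1fff they
 * land in the read-high (0x400) and write-high (0xc00) bitmap regions. */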
1062
1063 static void setup_msr(struct vmx_vcpu *vcpu)
1064 {
1065         int set[] = { MSR_LSTAR };
1066         struct vmx_msr_entry *e;
1067         int sz = sizeof(set) / sizeof(*set);
1068         int i;
1069
1070         //BUILD_BUG_ON(sz > NR_AUTOLOAD_MSRS);
1071
1072         vcpu->msr_autoload.nr = sz;
1073
1074         /* XXX enable only MSRs in set */
1075         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1076
1077         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1078         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1079         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1080
1081         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1082         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1083         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1084
1085         for (i = 0; i < sz; i++) {
1086                 uint64_t val;
1087
1088                 e = &vcpu->msr_autoload.host[i];
1089                 e->index = set[i];
1090                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1091                 rdmsrl(e->index, val);
1092                 e->value = val;
1093
1094                 e = &vcpu->msr_autoload.guest[i];
1095                 e->index = set[i];
1096                 e->value = 0xDEADBEEF;
1097         }
1098 }
1099
1100 /**
1101  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1102  */
1103 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
1104 {
1105         vmcs_write16(VIRTUAL_PROCESSOR_ID, vcpu->vpid);
1106         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1107
1108         /* Control */
1109         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1110                 vmcs_config.pin_based_exec_ctrl);
1111
1112         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1113                 vmcs_config.cpu_based_exec_ctrl);
1114
1115         if (cpu_has_secondary_exec_ctrls()) {
1116                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1117                              vmcs_config.cpu_based_2nd_exec_ctrl);
1118         }
1119
1120         vmcs_write64(EPT_POINTER, eptp);
1121
1122         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1123         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1124         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1125
1126         setup_msr(vcpu);
1127 #if 0
1128         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1129                 uint32_t msr_low, msr_high;
1130                 uint64_t host_pat;
1131                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
1132                 host_pat = msr_low | ((uint64_t) msr_high << 32);
1133                 /* Write the default value follow host pat */
1134                 vmcs_write64(GUEST_IA32_PAT, host_pat);
1135                 /* Keep arch.pat sync with GUEST_IA32_PAT */
1136                 vmx->vcpu.arch.pat = host_pat;
1137         }
1138 #endif
1139 #if 0
1140         for (int i = 0; i < NR_VMX_MSR; ++i) {
1141                 uint32_t index = vmx_msr_index[i];
1142                 uint32_t data_low, data_high;
1143                 int j = vmx->nmsrs;
1144                 // TODO we should have read/writemsr_safe
1145 #if 0
1146                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1147                         continue;
1148                 if (wrmsr_safe(index, data_low, data_high) < 0)
1149                         continue;
1150 #endif
1151                 vmx->guest_msrs[j].index = i;
1152                 vmx->guest_msrs[j].data = 0;
1153                 vmx->guest_msrs[j].mask = -1ull;
1154                 ++vmx->nmsrs;
1155         }
1156 #endif
1157
1158         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1159
1160         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1161         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1162
1163         vmcs_writel(CR0_GUEST_HOST_MASK, ~0ul);
1164         vmcs_writel(CR4_GUEST_HOST_MASK, ~0ul);
1165
1166         //kvm_write_tsc(&vmx->vcpu, 0);
1167         vmcs_writel(TSC_OFFSET, 0);
1168
1169         vmx_setup_constant_host_state();
1170 }
1171
1172 /**
1173  * vmx_allocate_vpid - reserves a vpid and sets it in the VCPU
1174  * @vmx: the VCPU
1175  */
1176 static int vmx_allocate_vpid(struct vmx_vcpu *vmx)
1177 {
1178         int vpid;
1179
1180         vmx->vpid = 0;
1181
1182         spin_lock(&vmx_vpid_lock);
1183         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
1184         if (vpid < VMX_NR_VPIDS) {
1185                 vmx->vpid = vpid;
1186                 __set_bit(vpid, vmx_vpid_bitmap);
1187         }
1188         spin_unlock(&vmx_vpid_lock);
1189
1190         return vpid >= VMX_NR_VPIDS;
1191 }
1192
1193 /**
1194  * vmx_free_vpid - frees a vpid
1195  * @vmx: the VCPU
1196  */
1197 static void vmx_free_vpid(struct vmx_vcpu *vmx)
1198 {
1199         spin_lock(&vmx_vpid_lock);
1200         if (vmx->vpid != 0)
1201                 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
1202         spin_unlock(&vmx_vpid_lock);
1203 }
1204
1205 /**
1206  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1207  *
1208  * Returns: A new VCPU structure
1209  */
1210 struct vmx_vcpu *vmx_create_vcpu(struct proc *p)
1211 {
1212         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1213         if (!vcpu) {
1214                 return NULL;
1215         }
1216
1217         memset(vcpu, 0, sizeof(*vcpu));
1218
1219         vcpu->proc = p; /* uncounted (weak) reference */
1220         vcpu->vmcs = vmx_alloc_vmcs();
1221         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1222         if (!vcpu->vmcs)
1223                 goto fail_vmcs;
1224
1225         if (vmx_allocate_vpid(vcpu))
1226                 goto fail_vpid;
1227
1228         printd("%d: vmx_create_vcpu: vpid %d\n", core_id(), vcpu->vpid);
1229         vcpu->cpu = -1;
1230
1231         vmx_get_cpu(vcpu);
1232         vmx_setup_vmcs(vcpu);
1233         vmx_setup_initial_guest_state();
1234         vmx_put_cpu(vcpu);
1235
1236 #if 0
1237         if (cpu_has_vmx_ept_ad_bits()) {
1238                 vcpu->ept_ad_enabled = true;
1239                 printk("vmx: enabled EPT A/D bits");
1240         }
1241         if (vmx_create_ept(vcpu->gv))
1242                 goto fail_ept;
1243 #endif
1244
1245         return vcpu;
1246
1247 fail_ept:
1248         vmx_free_vpid(vcpu);
1249 fail_vpid:
1250         vmx_free_vmcs(vcpu->vmcs);
1251 fail_vmcs:
1252         kfree(vcpu);
1253         return NULL;
1254 }
1255
1256 /**
1257  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1258  * @vcpu: the VCPU to destroy
1259  */
1260 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
1261 {
1262         vmx_get_cpu(vcpu);
1263         ept_sync_context(eptp);
1264         memset(ept, 0, PGSIZE);
1265         vmx_put_cpu(vcpu);
1266         vmx_free_vpid(vcpu);
1267         vmx_free_vmcs(vcpu->vmcs);
1268         kfree(vcpu);
1269 }
1270
1271 /**
1272  * vmx_task_vcpu - returns a pointer to the proc's vcpu or NULL.
1273  * @p: the process
1274  */
1275 static inline struct vmx_vcpu *vmx_task_vcpu(struct proc *p)
1276 {
1277         struct dune_struct *dune = p->virtinfo;
1278         return dune ? dune->vcpu : NULL;
1279 }
1280
1281 /**
1282  * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1283  *
1284  * In the contexts where this is used the vcpu pointer should never be NULL.
1285  */
1286 static inline struct vmx_vcpu *vmx_current_vcpu(void)
1287 {
1288         struct vmx_vcpu *vcpu = vmx_task_vcpu(current);
1289         if (!vcpu)
1290                 panic("%s: core_id %d: no vcpu", __func__, core_id());
1291         return vcpu;
1292 }
1293
1294
1295 /**
1296  * vmx_run_vcpu - launches the CPU into non-root mode
1297  * We ONLY support 64-bit guests.
1298  * @vcpu: the vmx instance to launch
1299  */
1300 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1301 {
1302         asm(
1303                 /* Store host registers */
1304                 "push %%rdx; push %%rbp;"
1305                 "push %%rcx \n\t" /* placeholder for guest rcx */
1306                 "push %%rcx \n\t"
1307                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1308                 "je 1f \n\t"
1309                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1310                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1311                 "1: \n\t"
1312                 /* Reload cr2 if changed */
1313                 "mov %c[cr2](%0), %%rax \n\t"
1314                 "mov %%cr2, %%rdx \n\t"
1315                 "cmp %%rax, %%rdx \n\t"
1316                 "je 2f \n\t"
1317                 "mov %%rax, %%cr2 \n\t"
1318                 "2: \n\t"
1319                 /* Check if vmlaunch or vmresume is needed */
1320                 "cmpl $0, %c[launched](%0) \n\t"
1321                 /* Load guest registers.  Don't clobber flags. */
1322                 "mov %c[rax](%0), %%rax \n\t"
1323                 "mov %c[rbx](%0), %%rbx \n\t"
1324                 "mov %c[rdx](%0), %%rdx \n\t"
1325                 "mov %c[rsi](%0), %%rsi \n\t"
1326                 "mov %c[rdi](%0), %%rdi \n\t"
1327                 "mov %c[rbp](%0), %%rbp \n\t"
1328                 "mov %c[r8](%0),  %%r8  \n\t"
1329                 "mov %c[r9](%0),  %%r9  \n\t"
1330                 "mov %c[r10](%0), %%r10 \n\t"
1331                 "mov %c[r11](%0), %%r11 \n\t"
1332                 "mov %c[r12](%0), %%r12 \n\t"
1333                 "mov %c[r13](%0), %%r13 \n\t"
1334                 "mov %c[r14](%0), %%r14 \n\t"
1335                 "mov %c[r15](%0), %%r15 \n\t"
1336                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1337
1338                 /* Enter guest mode */
1339                 "jne .Llaunched \n\t"
1340                 ASM_VMX_VMLAUNCH "\n\t"
1341                 "jmp .Lkvm_vmx_return \n\t"
1342                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1343                 ".Lkvm_vmx_return: "
1344                 /* Save guest registers, load host registers, keep flags */
1345                 "mov %0, %c[wordsize](%%rsp) \n\t"
1346                 "pop %0 \n\t"
1347                 "mov %%rax, %c[rax](%0) \n\t"
1348                 "mov %%rbx, %c[rbx](%0) \n\t"
1349                 "popq %c[rcx](%0) \n\t"
1350                 "mov %%rdx, %c[rdx](%0) \n\t"
1351                 "mov %%rsi, %c[rsi](%0) \n\t"
1352                 "mov %%rdi, %c[rdi](%0) \n\t"
1353                 "mov %%rbp, %c[rbp](%0) \n\t"
1354                 "mov %%r8,  %c[r8](%0) \n\t"
1355                 "mov %%r9,  %c[r9](%0) \n\t"
1356                 "mov %%r10, %c[r10](%0) \n\t"
1357                 "mov %%r11, %c[r11](%0) \n\t"
1358                 "mov %%r12, %c[r12](%0) \n\t"
1359                 "mov %%r13, %c[r13](%0) \n\t"
1360                 "mov %%r14, %c[r14](%0) \n\t"
1361                 "mov %%r15, %c[r15](%0) \n\t"
1362                 "mov %%rax, %%r10 \n\t"
1363                 "mov %%rdx, %%r11 \n\t"
1364
1365                 "mov %%cr2, %%rax   \n\t"
1366                 "mov %%rax, %c[cr2](%0) \n\t"
1367
1368                 "pop  %%rbp; pop  %%rdx \n\t"
1369                 "setbe %c[fail](%0) \n\t"
1370                 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1371                 "mov %%rax, %%ds \n\t"
1372                 "mov %%rax, %%es \n\t"
1373               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1374                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1375                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1376                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1377                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1378                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1379                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1380                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1381                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1382                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1383                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1384                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1385                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1386                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1387                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1388                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1389                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1390                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1391                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1392                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1393                 [wordsize]"i"(sizeof(unsigned long))
1394               : "cc", "memory"
1395                 , "rax", "rbx", "rdi", "rsi"
1396                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1397         );
1398
1399         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1400         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1401         printk("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1402                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1403         /* FIXME: do we need to set up other flags? */
1404         vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1405                       X86_EFLAGS_IF | 0x2;
1406
1407         vcpu->regs.tf_cs = GD_UT;
1408         vcpu->regs.tf_ss = GD_UD;
1409
1410         vcpu->launched = 1;
1411
1412         if (vcpu->fail) {
1413                 printk("failure detected (err %x)\n",
1414                        vmcs_read32(VM_INSTRUCTION_ERROR));
1415                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1416         }
1417
1418         return vmcs_read32(VM_EXIT_REASON);
1419
1420 #if 0
1421         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1422         vmx_complete_atomic_exit(vmx);
1423         vmx_recover_nmi_blocking(vmx);
1424         vmx_complete_interrupts(vmx);
1425 #endif
1426 }
1427
1428 static void vmx_step_instruction(void)
1429 {
1430         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1431                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1432 }
1433
1434 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
1435 {
1436         unsigned long gva, gpa;
1437         int exit_qual, ret = -1;
1438         page_t *page;
1439
1440         vmx_get_cpu(vcpu);
1441         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1442         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1443         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1444         printk("ept: gva %016lx, gpa %016lx\n", gva, gpa);
1445
1446         vmx_put_cpu(vcpu);
1447
1448         // this is a total hack, for testing things.
1449         // note that we only care about the gpa, and the
1450         // gpa is our process virtual address. 
1451         // Confused yet?
1452         page = page_lookup(current->env_pgdir, (void *)gpa, NULL);
1453         printk("Lookup %p returns %p\n", gpa, page);
1454         if (page) {
1455                 uint64_t hpa = page2pa(page);
1456                 printk("hpa for %p is %p\n", gpa, hpa);
1457                 ret = vmx_do_ept_fault(ept, gpa, hpa, exit_qual);
1458                 printk("vmx_do_ept_fault returns %d\n", ret);
1459         }
1460
1461         if (ret) {
1462                 printk("page fault failure "
1463                        "GPA: 0x%lx, GVA: 0x%lx\n",
1464                        gpa, gva);
1465                 vmx_dump_cpu(vcpu);
1466         }
1467
1468         return ret;
1469 }
1470
1471 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
1472 {
1473         unsigned int eax, ebx, ecx, edx;
1474
1475         eax = vcpu->regs.tf_rax;
1476         ecx = vcpu->regs.tf_rcx;
1477         cpuid(eax, ecx, &eax, &ebx, &ecx, &edx);
1478         vcpu->regs.tf_rax = eax;
1479         vcpu->regs.tf_rbx = ebx;
1480         vcpu->regs.tf_rcx = ecx;
1481         vcpu->regs.tf_rdx = edx;
1482 }
1483
1484 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
1485 {
1486         uint32_t intr_info;
1487
1488         vmx_get_cpu(vcpu);
1489         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1490         vmx_put_cpu(vcpu);
1491
1492         printk("vmx (VPID %d): got an exception\n", vcpu->vpid);
1493         printk("vmx (VPID %d): pid %d\n", vcpu->vpid,
1494                          current->pid);
1495         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1496                 return 0;
1497         }
1498
1499         printk("unhandled nmi, intr_info %x\n", intr_info);
1500         return -EIO;
1501 }
1502
1503
1504 static void noop(void) {
1505         __asm__ __volatile__ ("1: jmp 1b");
1506 }
1507
1508 static void fail(void) {
1509         __asm__ __volatile__ ("movq $0xdeadbeef, %rbx; movq 0, %rax");
1510 }
1511
1512 static unsigned long stack[512];
1513 /**
1514  * vmx_launch - the main loop for a VMX Dune process
1515  * @conf: the launch configuration
1516  */
1517 int vmx_launch(struct dune_config *conf)
1518 {
1519         int ret;
1520         struct dune_struct dune;
1521         struct vmx_vcpu *vcpu;
1522         int i = 0;
1523         unsigned long rip = conf->rip;
1524         unsigned long rsp = conf->rsp;
1525         unsigned long cr3 = conf->cr3;
1526         int errors = 0;
1527
1528         if (conf->rip < 4096 ) {
1529                 // testing.
1530                 switch(conf->rip) {
1531                 default:
1532                         rip = (uint64_t)noop + 4;
1533                         break;
1534                 case 1:
1535                         rip = (uint64_t)fail + 4;
1536                         break;
1537                 }
1538         }
1539
1540         if (conf->cr3 == 0) {
1541                 cr3 = rcr3();
1542         }
1543
1544         /* sanity checking.  -- later
1545         ret = ept_check_page(ept, rip);
1546         if (ret) {
1547                 printk("0x%x is not mapped in the ept!\n", rip);
1548                 errors++;
1549         }
1550         ret = ept_check_page(ept, rsp);
1551         if (ret) {
1552                 printk("0x%x is not mapped in the ept!\n", rsp);
1553                 errors++;
1554         }
1555         */
1556         if (errors) {
1557                 return -EINVAL;
1558         }
1559
1560
	printk("RUNNING: %s: rip %p rsp %p cr3 %p\n",
	       __func__, rip, rsp, cr3);
1563         vcpu = vmx_create_vcpu();
1564         if (!vcpu) {
1565                 return -ENOMEM;
1566         }
1567
1568         vmx_get_cpu(vcpu);
1569         vmcs_writel(GUEST_RIP, rip);
1570         vmcs_writel(GUEST_RSP, rsp);
1571         vmcs_writel(GUEST_CR3, cr3);
1572         vmx_put_cpu(vcpu);
1573
1574         printk("created VCPU (VPID %d): pid %d\n",
1575                vcpu->vpid, current->pid);
1576
1577         vcpu->ret_code = -1;
1578
1579         if (current->virtinfo)
1580                 printk("vmx_launch: current->virtinfo is NOT NULL (%p)\n", current->virtinfo);
1581         //WARN_ON(current->virtinfo != NULL);
1582         dune.vcpu = vcpu;
1583
1584         current->virtinfo = &dune;
1585
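	/* Main run loop: enter the guest with irqs disabled, then decode the
	 * exit reason returned by vmx_run_vcpu() and either handle it here or
	 * mark the vcpu for shutdown. */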
1586         while (1) {
1587                 vmx_get_cpu(vcpu);
1588
1589                 // TODO: manage the fpu when we restart.
1590
1591                 // TODO: see if we need to exit before we go much further.
1592                 disable_irq();
1593                 ret = vmx_run_vcpu(vcpu);
1594                 enable_irq();
1595                 vmx_put_cpu(vcpu);
1596
		if (ret == EXIT_REASON_VMCALL) {
			vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
			printk("guest vmcall (system call); not handled yet\n");
1600                 } else if (ret == EXIT_REASON_CPUID)
1601                         vmx_handle_cpuid(vcpu);
1602                 else if (ret == EXIT_REASON_EPT_VIOLATION) {
1603                         if (vmx_handle_ept_violation(vcpu))
1604                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1605                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1606                         if (vmx_handle_nmi_exception(vcpu))
1607                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1608                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1609                         printk("External interrupt\n");
1610                 } else {
1611                         printk("unhandled exit: reason %x, exit qualification %x\n",
1612                                ret, vmcs_read32(EXIT_QUALIFICATION));
1613                         vmx_dump_cpu(vcpu);
1614                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1615                 }
1616
1617                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1618                  * similar to how proc_restartcore/smp_idle only restart the pcpui
1619                  * cur_ctx, we need to do the same, via the VMCS resume business. */
1620
1621                 if (vcpu->shutdown)
1622                         break;
1623         }
1624
1625         printk("RETURN. ip %016lx sp %016lx\n",
1626                 vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1627         current->virtinfo = NULL;
1628
1629         /*
1630          * Return both the reason for the shutdown and a status value.
1631          * The exit() and exit_group() system calls only need 8 bits for
1632          * the status but we allow 16 bits in case we might want to
1633          * return more information for one of the other shutdown reasons.
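	 * For example, a ret_code of 3 under shutdown reason R packs to
	 * (R << 16) | 3.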
1634          */
1635         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
1636
1637         printk("destroying VCPU (VPID %d): pid %d\n",
1638                         vcpu->vpid, current->pid);
1639
1640         vmx_destroy_vcpu(vcpu);
1641
1642         return ret;
1643 }
1644
1645 /**
1646  * __vmx_enable - low-level enable of VMX mode on the current CPU
1647  * @vmxon_buf: an opaque buffer for use as the VMXON region
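 *
 * Checks (and if necessary sets) the lock bits in MSR_IA32_FEATURE_CONTROL,
 * sets CR4.VMXE, executes VMXON, then flushes global VPID and EPT state.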
1648  */
static int __vmx_enable(struct vmcs *vmxon_buf)
1650 {
1651         uint64_t phys_addr = PADDR(vmxon_buf);
1652         uint64_t old, test_bits;
1653
1654         if (rcr4() & X86_CR4_VMXE) {
		panic("VMX is already enabled (CR4.VMXE was set)");
1656                 return -EBUSY;
1657         }
1658
1659         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1660
1661         test_bits = FEATURE_CONTROL_LOCKED;
1662         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1663
1664         if (0) // tboot_enabled())
1665                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1666
1667         if ((old & test_bits) != test_bits) {
1668                 /* If it's locked, then trying to set it will cause a GPF.
1669                  * No Dune for you!
1670                  */
1671                 if (old & FEATURE_CONTROL_LOCKED) {
1672                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1673                         return -1;
1674                 }
1675
1676                 /* enable and lock */
1677                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1678         }
1679         lcr4(rcr4() | X86_CR4_VMXE);
1680
1681         __vmxon(phys_addr);
1682         vpid_sync_vcpu_global();
1683         ept_sync_global();
1684
1685         return 0;
1686 }
1687
/**
 * vmx_enable - enables VMX mode on the current CPU
 *
 * Uses the per-cpu VMXON scratchpad already allocated by setup_vmxarea().
 */
1694 static void vmx_enable(void)
1695 {
1696         struct vmcs *vmxon_buf = currentcpu->vmxarea;
1697         int ret;
1698
1699         ret = __vmx_enable(vmxon_buf);
1700         if (ret)
1701                 goto failed;
1702
1703         currentcpu->vmx_enabled = 1;
1704         // TODO: do we need this?
1705         store_gdt(&currentcpu->host_gdt);
1706
1707         printk("VMX enabled on CPU %d\n", core_id());
1708         return;
1709
1710 failed:
1711         has_vmx = FALSE;
1712         printk("failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1713 }
1714
1715 /**
1716  * vmx_disable - disables VMX mode on the current CPU
1717  */
1718 static void vmx_disable(void *unused)
1719 {
1720         if (currentcpu->vmx_enabled) {
1721                 __vmxoff();
1722                 lcr4(rcr4() & ~X86_CR4_VMXE);
1723                 currentcpu->vmx_enabled = 0;
1724         }
1725 }
1726
/* Probe the current cpu to see if it can do vmx.
 * Returns TRUE if VT-x is supported, FALSE otherwise.
 */
1730 static bool probe_cpu_vmx(void)
1731 {
1732         /* The best way to test this code is:
1733          * wrmsr -p <cpu> 0x3a 1
1734          * This will lock vmx off; then modprobe dune.
1735          * Frequently, however, systems have all 0x3a registers set to 5,
1736          * meaning testing is impossible, as vmx can not be disabled.
1737          * We have to simulate it being unavailable in most cases.
1738          * The 'test' variable provides an easy way to simulate
1739          * unavailability of vmx on some, none, or all cpus.
1740          */
1741         if (!cpu_has_vmx()) {
1742                 printk("Machine does not support VT-x\n");
1743                 return FALSE;
1744         } else {
1745                 printk("Machine supports VT-x\n");
1746                 return TRUE;
1747         }
1748 }
1749
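/* Allocate this core's VMXON region (the "vmxarea") and stash it in per-cpu
 * data for vmx_enable() to use later. */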
static void setup_vmxarea(void)
{
	struct vmcs *vmxon_buf;

	printd("Set up vmxarea for cpu %d\n", core_id());
	vmxon_buf = __vmx_alloc_vmcs(node_id());
	if (!vmxon_buf) {
		printk("setup_vmxarea failed on core %d\n", core_id());
		return;
	}
	currentcpu->vmxarea = vmxon_buf;
}
1761
/**
 * intel_vmm_init sets up the physical core data areas that are required to
 * run a vm at all.  These data areas are not connected to any specific user
 * process; they externalize what would otherwise be a very large ball of
 * state inside the CPU.
 */
1768 int intel_vmm_init(void)
1769 {
	int ret;

	if (!probe_cpu_vmx()) {
1773                 return -EOPNOTSUPP;
1774         }
1775
1776         setup_vmcs_config(&ret);
1777
1778         if (ret) {
1779                 printk("setup_vmcs_config failed: %d\n", ret);
1780                 return ret;
1781         }
1782
1783         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
1784         if (!msr_bitmap) {
1785                 printk("Could not allocate msr_bitmap\n");
1786                 return -ENOMEM;
1787         }
1788         /* FIXME: do we need APIC virtualization (flexpriority?) */
1789
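	/* Start with every MSR intercepted (all bits set), then clear the bits
	 * for the few MSRs the guest is allowed to access directly. */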
1790         memset(msr_bitmap, 0xff, PAGE_SIZE);
1791         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
1792         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
1793
1794         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
1795
	/* TEMPORARY hack so we can do some basic VM testing.  Create an ept
	 * and look for faults on it. */
1798         ept = kpage_zalloc_addr();
1799         eptp = construct_eptp(PADDR(ept));
1800         printk("ept is %p and eptp is %p\n", ept, eptp);
1801         return ret;
1802 }
1803
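/* Per-core init: give this core a VMXON region and enable VMX.  Assumed to be
 * called on each core during bringup, after intel_vmm_init() has run. */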
1804 int intel_vmm_pcpu_init(void)
1805 {
1806         setup_vmxarea();
1807         vmx_enable();
1808         return 0;
1809 }