VMM: Fixes vmmcp freezes
[akaros.git] / kern / arch / x86 / vmm / intel / vmx.c
1 /**
2  *  vmx.c - The Intel VT-x driver for Dune
3  *
4  * This file is derived from Linux KVM VT-x support.
5  * Copyright (C) 2006 Qumranet, Inc.
6  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7  *
8  * Original Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This modified version is simpler because it avoids the following
13  * features that are not requirements for Dune:
14  *  * Real-mode emulation
15  *  * Nested VT-x support
16  *  * I/O hardware emulation
17  *  * Any of the more esoteric X86 features and registers
18  *  * KVM-specific functionality
19  *
20  * In essence we provide only the minimum functionality needed to run
21  * a process in vmx non-root mode rather than the full hardware emulation
22  * needed to support an entire OS.
23  *
24  * This driver is a research prototype and as such has the following
25  * limitations:
26  *
 27  * FIXME: Backward compatibility is currently a non-goal, and only recent
28  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
29  * driver.
30  *
 31  * FIXME: Eventually we should handle concurrent users of VT-x more
32  * gracefully instead of requiring exclusive access. This would allow
33  * Dune to interoperate with KVM and other HV solutions.
34  *
35  * FIXME: We need to support hotplugged physical CPUs.
36  *
37  * Authors:
38  *   Adam Belay   <abelay@stanford.edu>
39  */
40
41 /* Basic flow.
 42  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things;
 43  * you're left with the feeling that they got partway through and realized they needed one for each:
 44  *
45  * 1) your CPU is going to be capable of running VMs, and you need state for that.
46  *
47  * 2) you're about to start a guest, and you need state for that.
48  *
 49  * So there is 'get the cpu set up to be able to run VMs' stuff, and 'now
 50  * let's start a guest' stuff.  In Akaros, CPUs will always be set up
51  * to run a VM if that is possible. Processes can flip themselves into
52  * a VM and that will require another VMCS.
53  *
54  * So: at kernel startup time, the SMP boot stuff calls
55  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
56  * in the case of this file is intel_vmm_init. That does some code
57  * that sets up stuff for ALL sockets, based on the capabilities of
58  * the socket it runs on. If any cpu supports vmx, it assumes they all
59  * do. That's a realistic assumption. So the call_function_all is kind
60  * of stupid, really; it could just see what's on the current cpu and
 61  * assume it's on all. HOWEVER: there are systems in the wild that
 62  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
 63  * might as well allow for the chance that we'll only allow VMMCPs on a
 64  * subset (not implemented yet, however).  So: probe all CPUs, get a
65  * count of how many support VMX and, for now, assume they all do
66  * anyway.
67  *
68  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
69  * which contains all the naughty bits settings for all the cpus that can run a VM.
70  * Realistically, all VMX-capable cpus in a system will have identical configurations.
71  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
72  *
73  * configure the msr_bitmap. This is the bitmap of MSRs which the
74  * guest can manipulate.  Currently, we only allow GS and FS base.
75  *
76  * Reserve bit 0 in the vpid bitmap as guests can not use that
77  *
 78  * Set up what we call the vmxarea. The vmxarea is per-cpu, not
 79  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
 80  * there is the revision id. The vmxarea is page-sized per cpu and
 81  * page-aligned. Note that it can be smaller, but why bother? We know
 82  * the max size and alignment, and it's convenient.
83  *
84  * Now that it is set up, enable vmx on all cpus. This involves
85  * testing VMXE in cr4, to see if we've been here before (TODO: delete
86  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
 87  * do a VM, then setting VMXE in cr4, calling vmxon (which issues a vmxon
 88  * instruction), and syncing vpids and epts.  Now the CPU is ready
89  * to host guests.
90  *
91  * Setting up a guest.
92  * We divide this into two things: vmm_proc_init and vm_run.
93  * Currently, on Intel, vmm_proc_init does nothing.
94  *
95  * vm_run is really complicated. It is called with a coreid, rip, rsp,
96  * cr3, and flags.  On intel, it calls vmx_launch. vmx_launch is set
97  * up for a few test cases. If rip is 1, it sets the guest rip to
98  * a function which will deref 0 and should exit with failure 2. If rip is 0,
99  * it calls an infinite loop in the guest.
100  *
101  * The sequence of operations:
102  * create a vcpu
103  * while (1) {
104  * get a vcpu
105  * disable irqs (required or you can't enter the VM)
106  * vmx_run_vcpu()
107  * enable irqs
108  * manage the vm exit
109  * }
110  *
111  * get a vcpu
 112  * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
113  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
114  *
 115  * If it's not the same, see if the vcpu thinks it is on some other core. If it is, call
 116  * __vmx_get_cpu_helper on that core, to free it up. Else vmcs_clear the vcpu's vmcs
 117  * here. Then vmcs_load the vmcs for vcpu on this cpu,
118  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
119  *
 120  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
121  * of inline assembly with embedded CPP crap. I suspect we'll want to
122  * un-inline it someday, but maybe not.  It's called with a vcpu
123  * struct from which it loads guest state, and to which it stores
124  * non-virtualized host state. It issues a vmlaunch or vmresume
 125  * instruction as appropriate and, on return, evaluates whether the
 126  * launch/resume itself had an error. Note this is NOT the
 127  * same as an error while in the virtual machine; this is an error in
 128  * startup due to misconfiguration. Depending on what is returned it's
 129  * either a failed vm startup or an exit for any of many reasons.
130  *
131  */
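/* To make the flow above concrete, here is a minimal sketch (illustration
 * only, not part of the driver) of how a caller drives that loop once the
 * per-cpu setup has run, assuming the entry points keep the shapes they
 * have in this file:
 *
 *        struct dune_config conf = { .rip = 0, .rsp = 0, .cr3 = 0 };
 *
 *        // rip values below 4096 select the built-in test guests described
 *        // later in this file; cr3 == 0 means "reuse the caller's cr3".
 *        int ret = vmx_launch(&conf);
 *
 * vmx_launch creates a vcpu, pins it with vmx_get_cpu, runs it with irqs
 * disabled via vmx_run_vcpu, and dispatches on the VM exit reason, exactly
 * as the while (1) pseudocode above outlines.
 */
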
132 void monitor(void *);
133 /* basically: only rename those globals that might conflict
134  * with existing names. Leave all else the same.
135  * this code is more modern than the other code, yet still
136  * well encapsulated, it seems.
137  */
138 #include <kmalloc.h>
139 #include <string.h>
140 #include <stdio.h>
141 #include <assert.h>
142 #include <error.h>
143 #include <pmap.h>
144 #include <sys/queue.h>
145 #include <smp.h>
146 #include <kref.h>
147 #include <atomic.h>
148 #include <alarm.h>
149 #include <event.h>
150 #include <umem.h>
151 #include <bitops.h>
152 #include <arch/types.h>
153 #include <syscall.h>
154
155 #include "vmx.h"
156 #include "../vmm.h"
157
158 #include "compat.h"
159 #include "cpufeature.h"
160
161 #define currentcpu (&per_cpu_info[core_id()])
162
163 /* This should be set exactly once, at init. If it somehow gets set more
164  * than once, you failed.
165  */
166 static bool has_vmx = FALSE;
167
168 /* TEMPORARY TEST HACK EPT */
169 void *ept;
170 uint64_t eptp;
171 /* END HACKQUE */
172
173 static DECLARE_BITMAP(vmx_vpid_bitmap, /*VMX_NR_VPIDS*/ 65536);
174 static spinlock_t vmx_vpid_lock;
175
176 static unsigned long *msr_bitmap;
177
178 static struct vmcs_config {
179         int size;
180         int order;
181         uint32_t revision_id;
182         uint32_t pin_based_exec_ctrl;
183         uint32_t cpu_based_exec_ctrl;
184         uint32_t cpu_based_2nd_exec_ctrl;
185         uint32_t vmexit_ctrl;
186         uint32_t vmentry_ctrl;
187 } vmcs_config;
188
189 struct vmx_capability vmx_capability;
190
191 static inline bool cpu_has_secondary_exec_ctrls(void)
192 {
193         return vmcs_config.cpu_based_exec_ctrl &
194                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
195 }
196
197 static inline bool cpu_has_vmx_vpid(void)
198 {
199         return vmcs_config.cpu_based_2nd_exec_ctrl &
200                 SECONDARY_EXEC_ENABLE_VPID;
201 }
202
203 static inline bool cpu_has_vmx_invpcid(void)
204 {
205         return vmcs_config.cpu_based_2nd_exec_ctrl &
206                 SECONDARY_EXEC_ENABLE_INVPCID;
207 }
208
209 static inline bool cpu_has_vmx_invvpid_single(void)
210 {
211         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
212 }
213
214 static inline bool cpu_has_vmx_invvpid_global(void)
215 {
216         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
217 }
218
219 static inline bool cpu_has_vmx_ept(void)
220 {
221         return vmcs_config.cpu_based_2nd_exec_ctrl &
222                 SECONDARY_EXEC_ENABLE_EPT;
223 }
224
225 static inline bool cpu_has_vmx_invept_individual_addr(void)
226 {
227         return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
228 }
229
230 static inline bool cpu_has_vmx_invept_context(void)
231 {
232         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
233 }
234
235 static inline bool cpu_has_vmx_invept_global(void)
236 {
237         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
238 }
239
240 static inline bool cpu_has_vmx_ept_ad_bits(void)
241 {
242         return vmx_capability.ept & VMX_EPT_AD_BIT;
243 }
244
245 static inline void __invept(int ext, uint64_t eptp, gpa_t gpa)
246 {
247         struct {
248                 uint64_t eptp, gpa;
249         } operand = {eptp, gpa};
250
251         asm volatile (ASM_VMX_INVEPT
252                         /* CF==1 or ZF==1 --> rc = -1 */
253                         "; ja 1f ; ud2 ; 1:\n"
254                         : : "a" (&operand), "c" (ext) : "cc", "memory");
255 }
256
257 static inline void ept_sync_global(void)
258 {
259         if (cpu_has_vmx_invept_global())
260                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
261 }
262
263 static inline void ept_sync_context(uint64_t eptp)
264 {
265         if (cpu_has_vmx_invept_context())
266                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
267         else
268                 ept_sync_global();
269 }
270
271 static inline void ept_sync_individual_addr(uint64_t eptp, gpa_t gpa)
272 {
273         if (cpu_has_vmx_invept_individual_addr())
274                 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
275                                 eptp, gpa);
276         else
277                 ept_sync_context(eptp);
278 }
279
280 static inline void __vmxon(uint64_t addr)
281 {
282         asm volatile (ASM_VMX_VMXON_RAX
283                         : : "a"(&addr), "m"(addr)
284                         : "memory", "cc");
285 }
286
287 static inline void __vmxoff(void)
288 {
289         asm volatile (ASM_VMX_VMXOFF : : : "cc");
290 }
291
292 static inline void __invvpid(int ext, uint16_t vpid, gva_t gva)
293 {
294         struct {
295                 uint64_t vpid : 16;
296                 uint64_t rsvd : 48;
297                 uint64_t gva;
298         } operand = { vpid, 0, gva };
299
300         asm volatile (ASM_VMX_INVVPID
301                       /* CF==1 or ZF==1 --> rc = -1 */
302                       "; ja 1f ; ud2 ; 1:"
303                       : : "a"(&operand), "c"(ext) : "cc", "memory");
304 }
305
306 static inline void vpid_sync_vcpu_single(uint16_t vpid)
307 {
308         if (vpid == 0) {
309                 return;
310         }
311
312         if (cpu_has_vmx_invvpid_single())
313                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
314 }
315
316 static inline void vpid_sync_vcpu_global(void)
317 {
318         if (cpu_has_vmx_invvpid_global())
319                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
320 }
321
322 static inline void vpid_sync_context(uint16_t vpid)
323 {
324         if (cpu_has_vmx_invvpid_single())
325                 vpid_sync_vcpu_single(vpid);
326         else
327                 vpid_sync_vcpu_global();
328 }
329
330 static void vmcs_clear(struct vmcs *vmcs)
331 {
332         uint64_t phys_addr = PADDR(vmcs);
333         uint8_t error;
334
335         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
336                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
337                       : "cc", "memory");
338         if (error)
339                 printk("vmclear fail: %p/%llx\n",
340                        vmcs, phys_addr);
341 }
342
343 static void vmcs_load(struct vmcs *vmcs)
344 {
345         uint64_t phys_addr = PADDR(vmcs);
346         uint8_t error;
347
348         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
349                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
350                         : "cc", "memory");
351         if (error)
352                 printk("vmptrld %p/%llx failed\n",
353                        vmcs, phys_addr);
354 }
355
356 /* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */
357 static physaddr_t vmcs_get_current(void)
358 {
359         physaddr_t vmcs_paddr;
360         /* RAX contains the addr of the location to store the VMCS pointer.  The
361          * compiler doesn't know the ASM will deref that pointer, hence the =m */
362         asm volatile (ASM_VMX_VMPTRST_RAX : "=m"(vmcs_paddr) : "a"(&vmcs_paddr));
363         return vmcs_paddr;
364 }
365
366 __always_inline unsigned long vmcs_readl(unsigned long field)
367 {
368         unsigned long value;
369
370         asm volatile (ASM_VMX_VMREAD_RDX_RAX
371                       : "=a"(value) : "d"(field) : "cc");
372         return value;
373 }
374
375 __always_inline uint16_t vmcs_read16(unsigned long field)
376 {
377         return vmcs_readl(field);
378 }
379
380 static __always_inline uint32_t vmcs_read32(unsigned long field)
381 {
382         return vmcs_readl(field);
383 }
384
385 static __always_inline uint64_t vmcs_read64(unsigned long field)
386 {
387 #ifdef CONFIG_X86_64
388         return vmcs_readl(field);
389 #else
390         return vmcs_readl(field) | ((uint64_t)vmcs_readl(field+1) << 32);
391 #endif
392 }
393
394 void vmwrite_error(unsigned long field, unsigned long value)
395 {
396         printk("vmwrite error: reg %lx value %lx (err %d)\n",
397                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
398         /* not available so ...
399         dump_stack();
400         */
401         monitor(NULL);
402 }
403
404 void vmcs_writel(unsigned long field, unsigned long value)
405 {
406         uint8_t error;
407
408         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
409                        : "=q"(error) : "a"(value), "d"(field) : "cc");
410         if (error)
411                 vmwrite_error(field, value);
412 }
413
414 static void vmcs_write16(unsigned long field, uint16_t value)
415 {
416         vmcs_writel(field, value);
417 }
418
419 static void vmcs_write32(unsigned long field, uint32_t value)
420 {
421         vmcs_writel(field, value);
422 }
423
424 static void vmcs_write64(unsigned long field, uint64_t value)
425 {
426         vmcs_writel(field, value);
427 #ifndef CONFIG_X86_64
428         asm volatile ("");
429         vmcs_writel(field+1, value >> 32);
430 #endif
431 }
432
433
434 static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt,
435                                       uint32_t msr, uint32_t *result)
436 {
437         uint32_t vmx_msr_low, vmx_msr_high;
438         uint32_t ctl = ctl_min | ctl_opt;
439         uint64_t vmx_msr = read_msr(msr);
440         vmx_msr_low = vmx_msr;
441         vmx_msr_high = vmx_msr>>32;
442
443         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
444         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
445
446         /* Ensure minimum (required) set of control bits are supported. */
447         if (ctl_min & ~ctl) {
448                 return -EIO;
449         }
450
451         *result = ctl;
452         return 0;
453 }
454
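/* A worked example of the allowed-0/allowed-1 encoding handled above (a
 * sketch, not extra configuration performed by this driver): to require
 * HLT exiting but merely prefer the TPR shadow, one would call
 *
 *        uint32_t ctl;
 *
 *        if (adjust_vmx_controls(CPU_BASED_HLT_EXITING,
 *                                CPU_BASED_TPR_SHADOW,
 *                                MSR_IA32_VMX_PROCBASED_CTLS, &ctl) < 0)
 *                return;        // the CPU cannot provide a required bit
 *
 * The high dword of the capability MSR clears any bit the CPU cannot set
 * to 1; the low dword forces on any bit the CPU requires to be 1.  The
 * call fails only when a bit in ctl_min ends up cleared.
 */
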
455 static  bool allow_1_setting(uint32_t msr, uint32_t ctl)
456 {
457         uint32_t vmx_msr_low, vmx_msr_high;
458
459         rdmsr(msr, vmx_msr_low, vmx_msr_high);
460         return vmx_msr_high & ctl;
461 }
462
463 static  void setup_vmcs_config(void *p)
464 {
465         int *ret = p;
466         struct vmcs_config *vmcs_conf = &vmcs_config;
467         uint32_t vmx_msr_low, vmx_msr_high;
468         uint32_t min, opt, min2, opt2;
469         uint32_t _pin_based_exec_control = 0;
470         uint32_t _cpu_based_exec_control = 0;
471         uint32_t _cpu_based_2nd_exec_control = 0;
472         uint32_t _vmexit_control = 0;
473         uint32_t _vmentry_control = 0;
474
475         *ret = -EIO;
476         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
477         opt = PIN_BASED_VIRTUAL_NMIS;
478         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
479                                 &_pin_based_exec_control) < 0) {
480                 return;
481         }
482
483         min =
484               CPU_BASED_CR8_LOAD_EXITING |
485               CPU_BASED_CR8_STORE_EXITING |
486               CPU_BASED_CR3_LOAD_EXITING |
487               CPU_BASED_CR3_STORE_EXITING |
488               CPU_BASED_MOV_DR_EXITING |
489               CPU_BASED_USE_TSC_OFFSETING |
490               CPU_BASED_MWAIT_EXITING |
491               CPU_BASED_MONITOR_EXITING |
492               CPU_BASED_INVLPG_EXITING;
493
494         min |= CPU_BASED_HLT_EXITING;
495
496         opt = CPU_BASED_TPR_SHADOW |
497               CPU_BASED_USE_MSR_BITMAPS |
498               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
499         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
500                                 &_cpu_based_exec_control) < 0) {
501                 return;
502         }
503
504         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
505                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
506                                            ~CPU_BASED_CR8_STORE_EXITING;
507
508         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
509                 min2 = 
510                         SECONDARY_EXEC_ENABLE_VPID |
511                         SECONDARY_EXEC_ENABLE_EPT |
512                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
513                 opt2 =  SECONDARY_EXEC_WBINVD_EXITING |
514                         SECONDARY_EXEC_RDTSCP |
515                         SECONDARY_EXEC_ENABLE_INVPCID;
516                 if (adjust_vmx_controls(min2, opt2,
517                                         MSR_IA32_VMX_PROCBASED_CTLS2,
518                                         &_cpu_based_2nd_exec_control) < 0) {
 519                         return;
 520                 }
521         }
522
523         if (!(_cpu_based_2nd_exec_control &
524                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
525                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
526
527         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
528                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
529                    enabled */
530                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
531                                              CPU_BASED_CR3_STORE_EXITING |
532                                              CPU_BASED_INVLPG_EXITING);
533                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
534                       vmx_capability.ept, vmx_capability.vpid);
535         }
536
537         min = 0;
538
539         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
540
541 //      opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
542         opt = 0;
543         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
544                                 &_vmexit_control) < 0) {
545                 return;
546         }
547
548         min = 0;
549 //      opt = VM_ENTRY_LOAD_IA32_PAT;
550         opt = 0;
551         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
552                                 &_vmentry_control) < 0) {
553                 return;
554         }
555
556         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
557
558         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
559         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) {
560                 return;
561         }
562
563         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
564         if (vmx_msr_high & (1u<<16)) {
565                 printk("64-bit CPUs always have VMX_BASIC_MSR[48]==0. FAILS!\n");
566                 return;
567         }
568
569         /* Require Write-Back (WB) memory type for VMCS accesses. */
570         if (((vmx_msr_high >> 18) & 15) != 6) {
571                 printk("NO WB!\n");
572                 return;
573         }
574
575         vmcs_conf->size = vmx_msr_high & 0x1fff;
576         vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
577         vmcs_conf->revision_id = vmx_msr_low;
578         printk("vmcs_conf size %d order %d rev %d\n",
579                vmcs_conf->size, vmcs_conf->order,
580                vmcs_conf->revision_id);
581
582         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
583         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
584         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
585         vmcs_conf->vmexit_ctrl         = _vmexit_control;
586         vmcs_conf->vmentry_ctrl        = _vmentry_control;
587
588         vmx_capability.has_load_efer =
589                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
590                                 VM_ENTRY_LOAD_IA32_EFER)
591                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
592                                    VM_EXIT_LOAD_IA32_EFER);
593
594         /* Now that we've done all the setup we can do, verify
595          * that we have all the capabilities we need. These tests
596          * are done last presumably because all the work done above
597          * affects some of them.
598          */
599
600         if (!vmx_capability.has_load_efer) {
601                 printk("CPU lacks ability to load EFER register\n");
602                 return;
603         }
604
605         printk("CPU has all needed capabilities\n");
606         *ret = 0;
607 }
608
609 static struct vmcs *__vmx_alloc_vmcs(int node)
610 {
611         struct vmcs *vmcs;
612
613         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
614         if (!vmcs)
615                 return 0;
616         memset(vmcs, 0, vmcs_config.size);
617         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
618         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
619         return vmcs;
620 }
621
622 /**
623  * vmx_alloc_vmcs - allocates a VMCS region
624  *
625  * NOTE: Assumes the new region will be used by the current CPU.
626  *
627  * Returns a valid VMCS region.
628  */
629 static struct vmcs *vmx_alloc_vmcs(void)
630 {
631         return __vmx_alloc_vmcs(node_id());
632 }
633
634 /**
635  * vmx_free_vmcs - frees a VMCS region
636  */
637 static void vmx_free_vmcs(struct vmcs *vmcs)
638 {
639   //free_pages((unsigned long)vmcs, vmcs_config.order);
640 }
641
642 /*
643  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
644  * will not change in the lifetime of the guest.
645  * Note that host-state that does change is set elsewhere. E.g., host-state
646  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
647  */
648 static void vmx_setup_constant_host_state(void)
649 {
650         uint32_t low32, high32;
651         unsigned long tmpl;
652         pseudodesc_t dt;
653
654         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);  /* 22.2.3 */
655         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
656         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
657
658         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
659         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
660         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
661         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
662         vmcs_write16(HOST_TR_SELECTOR, GD_TSS);  /* 22.2.4 */
663
664         native_store_idt(&dt);
665         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);   /* 22.2.4 */
666
667         asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
668         vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
669
670         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
671         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
672         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
673         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
674
675         rdmsr(MSR_EFER, low32, high32);
676         vmcs_write32(HOST_IA32_EFER, low32);
677
678         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
679                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
680                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
681         }
682
683         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
684         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
685
686         /* TODO: This (at least gs) is per cpu */
687 #ifdef CONFIG_X86_64
688         rdmsrl(MSR_FS_BASE, tmpl);
689         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
690         rdmsrl(MSR_GS_BASE, tmpl);
691         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
692 #else
693         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
694         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
695 #endif
696 }
697
698 static inline uint16_t vmx_read_ldt(void)
699 {
700         uint16_t ldt;
701         asm("sldt %0" : "=g"(ldt));
702         return ldt;
703 }
704
705 static unsigned long segment_base(uint16_t selector)
706 {
707         pseudodesc_t *gdt = &currentcpu->host_gdt;
708         struct desc_struct *d;
709         unsigned long table_base;
710         unsigned long v;
711
712         if (!(selector & ~3)) {
713                 return 0;
714         }
715
716         table_base = gdt->pd_base;
717
718         if (selector & 4) {           /* from ldt */
719                 uint16_t ldt_selector = vmx_read_ldt();
720
721                 if (!(ldt_selector & ~3)) {
722                         return 0;
723                 }
724
725                 table_base = segment_base(ldt_selector);
726         }
727         d = (struct desc_struct *)(table_base + (selector & ~7));
728         v = get_desc_base(d);
729 #ifdef CONFIG_X86_64
730        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
731                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
732 #endif
733         return v;
734 }
735
736 static inline unsigned long vmx_read_tr_base(void)
737 {
738         uint16_t tr;
739         asm("str %0" : "=g"(tr));
740         return segment_base(tr);
741 }
742
743 static void __vmx_setup_cpu(void)
744 {
745         pseudodesc_t *gdt = &currentcpu->host_gdt;
746         unsigned long sysenter_esp;
747         unsigned long tmpl;
748
749         /*
750          * Linux uses per-cpu TSS and GDT, so set these when switching
751          * processors.
752          */
753         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
754         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);   /* 22.2.4 */
755
756         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
757         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
758
759         rdmsrl(MSR_FS_BASE, tmpl);
760         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
761         rdmsrl(MSR_GS_BASE, tmpl);
762         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
763 }
764
765 static void __vmx_get_cpu_helper(struct hw_trapframe *hw_tf, void *ptr)
766 {
767         struct vmx_vcpu *vcpu = ptr;
768
769         if (core_id() != vcpu->cpu)
770                 panic("%s: core_id() %d != vcpu->cpu %d\n",
771                       __func__, core_id(), vcpu->cpu);
772
773         vmcs_clear(vcpu->vmcs);
774         if (currentcpu->local_vcpu == vcpu)
775                 currentcpu->local_vcpu = NULL;
776 }
777
778 /**
779  * vmx_get_cpu - called before using a cpu
780  * @vcpu: VCPU that will be loaded.
781  *
782  * Disables preemption. Call vmx_put_cpu() when finished.
783  */
784 static void vmx_get_cpu(struct vmx_vcpu *vcpu)
785 {
786         int cur_cpu = core_id();
787         handler_wrapper_t *w;
788
789         //printk("currentcpu->local_vcpu %p vcpu %p\n",
790                 //currentcpu->local_vcpu, vcpu);
791         if (currentcpu->local_vcpu != vcpu) {
792                 currentcpu->local_vcpu = vcpu;
793
794                 if (vcpu->cpu != cur_cpu) {
795                         if (vcpu->cpu >= 0) {
796                                 smp_call_function_single(vcpu->cpu,
797                                                          __vmx_get_cpu_helper, (void *) vcpu, &w);
798                                 if (smp_call_wait(w))
799                                         printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
800                         } else
801                                 vmcs_clear(vcpu->vmcs);
802
803 //                      vpid_sync_context(vcpu->vpid);
804 //                      ept_sync_context(current->vmm->
805
806                         vcpu->launched = 0;
807                         vmcs_load(vcpu->vmcs);
808                         __vmx_setup_cpu();
809                         vcpu->cpu = cur_cpu;
810                 } else {
811                         vmcs_load(vcpu->vmcs);
812                 }
813         }
814 }
815
816 /**
817  * vmx_put_cpu - called after using a cpu
818  * @vcpu: VCPU that was loaded.
819  */
820 static void vmx_put_cpu(struct vmx_vcpu *vcpu)
821 {
822         //put_cpu();
823 }
824
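/* The pairing idiom used throughout this file (a sketch of the pattern,
 * not new functionality): any VMCS access must be bracketed by
 * vmx_get_cpu/vmx_put_cpu so the right VMCS is current on this core.
 *
 *        vmx_get_cpu(vcpu);              // pin and VMPTRLD this vcpu's vmcs
 *        vmcs_writel(GUEST_RIP, rip);    // ...reads/writes of VMCS fields...
 *        vmx_put_cpu(vcpu);
 */
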
825 static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr)
826 {
827         struct vmx_vcpu *vcpu = ptr;
828
829 //      ept_sync_context(current);
830 }
831
832 struct sync_addr_args {
833         struct vmx_vcpu *vcpu;
834         gpa_t gpa;
835 };
836
837 static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr)
838 {
839         struct sync_addr_args *args = ptr;
840
841 //      ept_sync_individual_addr(
842
843 }
844
845 /**
 846  * vmx_ept_sync_vcpu - used to evict everything in the EPT
847  * @vcpu: the vcpu
848  */
849 void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu)
850 {
851         handler_wrapper_t *w;
852
853         smp_call_function_single(vcpu->cpu,
854                 __vmx_sync_helper, (void *) vcpu, &w);
855
856         if (smp_call_wait(w)) {
857                 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
858         }
859
860
861 }
862
863 /**
864  * vmx_ept_sync_individual_addr - used to evict an individual address
865  * @vcpu: the vcpu
866  * @gpa: the guest-physical address
867  */
868 void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa)
869 {
870         struct sync_addr_args args;
871         args.vcpu = vcpu;
872         args.gpa = gpa;
873
874         handler_wrapper_t *w;
875
876
877         smp_call_function_single(vcpu->cpu,
878                                  __vmx_sync_individual_addr_helper, (void *) &args, &w);
879
880         if (smp_call_wait(w)) {
881                 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
882         }
883
884 }
885
886 /**
887  * vmx_dump_cpu - prints the CPU state
888  * @vcpu: VCPU to print
889  */
890 static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
891 {
892
893         unsigned long flags;
894
895         vmx_get_cpu(vcpu);
896         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
897         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
898         flags = vmcs_readl(GUEST_RFLAGS);
899         vmx_put_cpu(vcpu);
900
901         printk("--- Begin VCPU Dump ---\n");
902         printk("CPU %d VPID %d\n", vcpu->cpu, vcpu->vpid);
903         printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
904                vcpu->regs.tf_rip, flags);
905         printk("RAX 0x%016lx RCX 0x%016lx\n",
906                 vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
907         printk("RDX 0x%016lx RBX 0x%016lx\n",
908                 vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
909         printk("RSP 0x%016lx RBP 0x%016lx\n",
910                 vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
911         printk("RSI 0x%016lx RDI 0x%016lx\n",
912                 vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
913         printk("R8  0x%016lx R9  0x%016lx\n",
914                 vcpu->regs.tf_r8, vcpu->regs.tf_r9);
915         printk("R10 0x%016lx R11 0x%016lx\n",
916                 vcpu->regs.tf_r10, vcpu->regs.tf_r11);
917         printk("R12 0x%016lx R13 0x%016lx\n",
918                 vcpu->regs.tf_r12, vcpu->regs.tf_r13);
919         printk("R14 0x%016lx R15 0x%016lx\n",
920                 vcpu->regs.tf_r14, vcpu->regs.tf_r15);
921         printk("--- End VCPU Dump ---\n");
922
923 }
924
925 uint64_t construct_eptp(unsigned long root_hpa)
926 {
927         uint64_t eptp;
928
 929         /* TODO: derive this value by reading the capability MSR */
930         eptp = VMX_EPT_DEFAULT_MT |
931                 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
932         if (cpu_has_vmx_ept_ad_bits())
933                 eptp |= VMX_EPT_AD_ENABLE_BIT;
934         eptp |= (root_hpa & PAGE_MASK);
935
936         return eptp;
937 }
938
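/* Typical use of construct_eptp (a sketch; it assumes an EPT root table
 * has already been allocated, and the name ept_pml4 below is hypothetical):
 *
 *        void *ept_pml4 = get_cont_pages_node(node_id(), 0, KMALLOC_WAIT);
 *
 *        memset(ept_pml4, 0, PAGE_SIZE);
 *        eptp = construct_eptp(PADDR(ept_pml4));
 *
 * The resulting value is what vmx_setup_vmcs writes into EPT_POINTER.
 */
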
939 /**
940  * vmx_setup_initial_guest_state - configures the initial state of guest registers
941  */
942 static void vmx_setup_initial_guest_state(void)
943 {
944         unsigned long tmpl;
945         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
946                             X86_CR4_PGE | X86_CR4_OSFXSR;
947         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
948 #if 0
949         do we need it
950         if (boot_cpu_has(X86_FEATURE_PCID))
951                 cr4 |= X86_CR4_PCIDE;
952         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
953                 cr4 |= X86_CR4_OSXSAVE;
954 #endif
955         /* we almost certainly have this */
956         /* we'll go sour if we don't. */
957         if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
958                 cr4 |= X86_CR4_RDWRGSFS;
959
960         /* configure control and data registers */
961         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
962                                X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
963         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
964                                      X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
965         vmcs_writel(GUEST_CR3, rcr3());
966         vmcs_writel(GUEST_CR4, cr4);
967         vmcs_writel(CR4_READ_SHADOW, cr4);
968         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
969                                      EFER_SCE | EFER_FFXSR);
970         vmcs_writel(GUEST_GDTR_BASE, 0);
971         vmcs_writel(GUEST_GDTR_LIMIT, 0);
972         vmcs_writel(GUEST_IDTR_BASE, 0);
973         vmcs_writel(GUEST_IDTR_LIMIT, 0);
974         vmcs_writel(GUEST_RIP, 0xdeadbeef);
975         vmcs_writel(GUEST_RSP, 0xdeadbeef);
976         vmcs_writel(GUEST_RFLAGS, 0x02);
977         vmcs_writel(GUEST_DR7, 0);
978
979         /* guest segment bases */
980         vmcs_writel(GUEST_CS_BASE, 0);
981         vmcs_writel(GUEST_DS_BASE, 0);
982         vmcs_writel(GUEST_ES_BASE, 0);
983         vmcs_writel(GUEST_GS_BASE, 0);
984         vmcs_writel(GUEST_SS_BASE, 0);
985         rdmsrl(MSR_FS_BASE, tmpl);
986         vmcs_writel(GUEST_FS_BASE, tmpl);
987
988         /* guest segment access rights */
989         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
990         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
991         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
992         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
993         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
994         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
995
996         /* guest segment limits */
997         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
998         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
999         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
1000         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
1001         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
1002         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
1003
1004         /* configure segment selectors */
1005         vmcs_write16(GUEST_CS_SELECTOR, 0);
1006         vmcs_write16(GUEST_DS_SELECTOR, 0);
1007         vmcs_write16(GUEST_ES_SELECTOR, 0);
1008         vmcs_write16(GUEST_FS_SELECTOR, 0);
1009         vmcs_write16(GUEST_GS_SELECTOR, 0);
1010         vmcs_write16(GUEST_SS_SELECTOR, 0);
1011         vmcs_write16(GUEST_TR_SELECTOR, 0);
1012
1013         /* guest LDTR */
1014         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1015         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1016         vmcs_writel(GUEST_LDTR_BASE, 0);
1017         vmcs_writel(GUEST_LDTR_LIMIT, 0);
1018
1019         /* guest TSS */
1020         vmcs_writel(GUEST_TR_BASE, 0);
1021         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1022         vmcs_writel(GUEST_TR_LIMIT, 0xff);
1023
1024         /* initialize sysenter */
1025         vmcs_write32(GUEST_SYSENTER_CS, 0);
1026         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1027         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1028
1029         /* other random initialization */
1030         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1031         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1032         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1033         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1034         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1035 }
1036
1037 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
1038 {
1039         int f = sizeof(unsigned long);
1040         /*
1041          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1042          * have the write-low and read-high bitmap offsets the wrong way round.
1043          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1044          */
1045         if (msr <= 0x1fff) {
1046                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
1047                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
1048         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1049                 msr &= 0x1fff;
1050                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
1051                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
1052         }
1053 }
1054
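/* Worked example (illustration only): setup_msr below passes MSR_LSTAR
 * (0xc0000082), which falls in the high range, so the code above masks it
 * to 0x82 and clears bit 0x82 in both the read-high bitmap (byte offset
 * 0x400) and the write-high bitmap (byte offset 0xc00).  With
 * CPU_BASED_USE_MSR_BITMAPS enabled, a guest rdmsr/wrmsr of that MSR then
 * proceeds without a VM exit.
 */
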
1055 static void setup_msr(struct vmx_vcpu *vcpu)
1056 {
1057         int set[] = { MSR_LSTAR };
1058         struct vmx_msr_entry *e;
1059         int sz = sizeof(set) / sizeof(*set);
1060         int i;
1061
1062         //BUILD_BUG_ON(sz > NR_AUTOLOAD_MSRS);
1063
1064         vcpu->msr_autoload.nr = sz;
1065
1066         /* XXX enable only MSRs in set */
1067         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1068
1069         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1070         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1071         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1072
1073         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1074         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1075         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1076
1077         for (i = 0; i < sz; i++) {
1078                 uint64_t val;
1079
1080                 e = &vcpu->msr_autoload.host[i];
1081                 e->index = set[i];
1082                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1083                 rdmsrl(e->index, val);
1084                 e->value = val;
1085
1086                 e = &vcpu->msr_autoload.guest[i];
1087                 e->index = set[i];
1088                 e->value = 0xDEADBEEF;
1089         }
1090 }
1091
1092 /**
1093  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1094  */
1095 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
1096 {
1097         vmcs_write16(VIRTUAL_PROCESSOR_ID, vcpu->vpid);
1098         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1099
1100         /* Control */
1101         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1102                 vmcs_config.pin_based_exec_ctrl);
1103
1104         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1105                 vmcs_config.cpu_based_exec_ctrl);
1106
1107         if (cpu_has_secondary_exec_ctrls()) {
1108                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1109                              vmcs_config.cpu_based_2nd_exec_ctrl);
1110         }
1111
1112         vmcs_write64(EPT_POINTER, eptp);
1113
1114         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1115         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1116         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1117
1118         setup_msr(vcpu);
1119 #if 0
1120         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1121                 uint32_t msr_low, msr_high;
1122                 uint64_t host_pat;
1123                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
1124                 host_pat = msr_low | ((uint64_t) msr_high << 32);
1125                 /* Write the default value follow host pat */
1126                 vmcs_write64(GUEST_IA32_PAT, host_pat);
1127                 /* Keep arch.pat sync with GUEST_IA32_PAT */
1128                 vmx->vcpu.arch.pat = host_pat;
1129         }
1130
1131         for (i = 0; i < NR_VMX_MSR; ++i) {
1132                 uint32_t index = vmx_msr_index[i];
1133                 uint32_t data_low, data_high;
1134                 int j = vmx->nmsrs;
1135
1136                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1137                         continue;
1138                 if (wrmsr_safe(index, data_low, data_high) < 0)
1139                         continue;
1140                 vmx->guest_msrs[j].index = i;
1141                 vmx->guest_msrs[j].data = 0;
1142                 vmx->guest_msrs[j].mask = -1ull;
1143                 ++vmx->nmsrs;
1144         }
1145 #endif
1146
1147         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1148
1149         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1150         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1151
1152         vmcs_writel(CR0_GUEST_HOST_MASK, ~0ul);
1153         vmcs_writel(CR4_GUEST_HOST_MASK, ~0ul);
1154
1155         //kvm_write_tsc(&vmx->vcpu, 0);
1156         vmcs_writel(TSC_OFFSET, 0);
1157
1158         vmx_setup_constant_host_state();
1159 }
1160
1161 /**
1162  * vmx_allocate_vpid - reserves a vpid and sets it in the VCPU
1163  * @vmx: the VCPU
1164  */
1165 static int vmx_allocate_vpid(struct vmx_vcpu *vmx)
1166 {
1167         int vpid;
1168
1169         vmx->vpid = 0;
1170
1171         spin_lock(&vmx_vpid_lock);
1172         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
1173         if (vpid < VMX_NR_VPIDS) {
1174                 vmx->vpid = vpid;
1175                 __set_bit(vpid, vmx_vpid_bitmap);
1176         }
1177         spin_unlock(&vmx_vpid_lock);
1178
1179         return vpid >= VMX_NR_VPIDS;
1180 }
1181
1182 /**
1183  * vmx_free_vpid - frees a vpid
1184  * @vmx: the VCPU
1185  */
1186 static void vmx_free_vpid(struct vmx_vcpu *vmx)
1187 {
1188         spin_lock(&vmx_vpid_lock);
1189         if (vmx->vpid != 0)
1190                 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
1191         spin_unlock(&vmx_vpid_lock);
1192 }
1193
1194 /**
1195  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1196  *
1197  * Returns: A new VCPU structure
1198  */
1199 struct vmx_vcpu *vmx_create_vcpu(void)
1200 {
1201         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1202         if (!vcpu) {
1203                 return NULL;
1204         }
1205
1206         memset(vcpu, 0, sizeof(*vcpu));
1207
1208         vcpu->vmcs = vmx_alloc_vmcs();
1209         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1210         if (!vcpu->vmcs)
1211                 goto fail_vmcs;
1212
1213         if (vmx_allocate_vpid(vcpu))
1214                 goto fail_vpid;
1215
1216         printd("%d: vmx_create_vcpu: vpid %d\n", core_id(), vcpu->vpid);
1217         vcpu->cpu = -1;
1218
1219         vmx_get_cpu(vcpu);
1220         vmx_setup_vmcs(vcpu);
1221         vmx_setup_initial_guest_state();
1222         vmx_put_cpu(vcpu);
1223
1224 #if 0
1225         if (cpu_has_vmx_ept_ad_bits()) {
1226                 vcpu->ept_ad_enabled = true;
1227                 printk("vmx: enabled EPT A/D bits");
1228         }
1229         if (vmx_create_ept(vcpu->gv))
1230                 goto fail_ept;
1231 #endif
1232
1233         return vcpu;
1234
1235 fail_ept:
1236         vmx_free_vpid(vcpu);
1237 fail_vpid:
1238         vmx_free_vmcs(vcpu->vmcs);
1239 fail_vmcs:
1240         kfree(vcpu);
1241         return NULL;
1242 }
1243
1244 /**
1245  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1246  * @vcpu: the VCPU to destroy
1247  */
1248 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
1249 {
1250         // needs to be done when we tear down the gv. vmx_destroy_ept(vcpu->gv);
1251         vmx_get_cpu(vcpu);
1252 //      ept_sync_context
1253         vmcs_clear(vcpu->vmcs);
1254         currentcpu->local_vcpu = NULL;
1255         vmx_put_cpu(vcpu);
1256         vmx_free_vpid(vcpu);
1257         vmx_free_vmcs(vcpu->vmcs);
1258         kfree(vcpu);
1259 }
1260
1261 /**
1262  * vmx_task_vcpu - returns a pointer to the task's vcpu or NULL.
1263  * @p: the process
1264  */
1265 static inline struct vmx_vcpu *vmx_task_vcpu(struct proc *p)
1266 {
1267         struct dune_struct *dune = p->virtinfo;
1268         return dune ? dune->vcpu : NULL;
1269 }
1270
1271 /**
1272  * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1273  *
1274  * In the contexts where this is used the vcpu pointer should never be NULL.
1275  */
1276 static inline struct vmx_vcpu *vmx_current_vcpu(void)
1277 {
1278         struct vmx_vcpu *vcpu = vmx_task_vcpu(current);
1279         if (! vcpu)
1280                 panic("%s: core_id %d: no vcpu", __func__, core_id());
1281         return vcpu;
1282 }
1283
1284
1285 /**
1286  * vmx_run_vcpu - launches the CPU into non-root mode
1287  * We ONLY support 64-bit guests.
1288  * @vcpu: the vmx instance to launch
1289  */
1290 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1291 {
1292         asm(
1293                 /* Store host registers */
1294                 "push %%rdx; push %%rbp;"
1295                 "push %%rcx \n\t" /* placeholder for guest rcx */
1296                 "push %%rcx \n\t"
1297                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1298                 "je 1f \n\t"
1299                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1300                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1301                 "1: \n\t"
1302                 /* Reload cr2 if changed */
1303                 "mov %c[cr2](%0), %%rax \n\t"
1304                 "mov %%cr2, %%rdx \n\t"
1305                 "cmp %%rax, %%rdx \n\t"
1306                 "je 2f \n\t"
1307                 "mov %%rax, %%cr2 \n\t"
1308                 "2: \n\t"
1309                 /* Check if vmlaunch or vmresume is needed */
1310                 "cmpl $0, %c[launched](%0) \n\t"
1311                 /* Load guest registers.  Don't clobber flags. */
1312                 "mov %c[rax](%0), %%rax \n\t"
1313                 "mov %c[rbx](%0), %%rbx \n\t"
1314                 "mov %c[rdx](%0), %%rdx \n\t"
1315                 "mov %c[rsi](%0), %%rsi \n\t"
1316                 "mov %c[rdi](%0), %%rdi \n\t"
1317                 "mov %c[rbp](%0), %%rbp \n\t"
1318                 "mov %c[r8](%0),  %%r8  \n\t"
1319                 "mov %c[r9](%0),  %%r9  \n\t"
1320                 "mov %c[r10](%0), %%r10 \n\t"
1321                 "mov %c[r11](%0), %%r11 \n\t"
1322                 "mov %c[r12](%0), %%r12 \n\t"
1323                 "mov %c[r13](%0), %%r13 \n\t"
1324                 "mov %c[r14](%0), %%r14 \n\t"
1325                 "mov %c[r15](%0), %%r15 \n\t"
1326                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1327
1328                 /* Enter guest mode */
1329                 "jne .Llaunched \n\t"
1330                 ASM_VMX_VMLAUNCH "\n\t"
1331                 "jmp .Lkvm_vmx_return \n\t"
1332                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1333                 ".Lkvm_vmx_return: "
1334                 /* Save guest registers, load host registers, keep flags */
1335                 "mov %0, %c[wordsize](%%rsp) \n\t"
1336                 "pop %0 \n\t"
1337                 "mov %%rax, %c[rax](%0) \n\t"
1338                 "mov %%rbx, %c[rbx](%0) \n\t"
1339                 "popq %c[rcx](%0) \n\t"
1340                 "mov %%rdx, %c[rdx](%0) \n\t"
1341                 "mov %%rsi, %c[rsi](%0) \n\t"
1342                 "mov %%rdi, %c[rdi](%0) \n\t"
1343                 "mov %%rbp, %c[rbp](%0) \n\t"
1344                 "mov %%r8,  %c[r8](%0) \n\t"
1345                 "mov %%r9,  %c[r9](%0) \n\t"
1346                 "mov %%r10, %c[r10](%0) \n\t"
1347                 "mov %%r11, %c[r11](%0) \n\t"
1348                 "mov %%r12, %c[r12](%0) \n\t"
1349                 "mov %%r13, %c[r13](%0) \n\t"
1350                 "mov %%r14, %c[r14](%0) \n\t"
1351                 "mov %%r15, %c[r15](%0) \n\t"
1352                 "mov %%rax, %%r10 \n\t"
1353                 "mov %%rdx, %%r11 \n\t"
1354
1355                 "mov %%cr2, %%rax   \n\t"
1356                 "mov %%rax, %c[cr2](%0) \n\t"
1357
1358                 "pop  %%rbp; pop  %%rdx \n\t"
1359                 "setbe %c[fail](%0) \n\t"
1360
1361                 "mov $" /*__stringify(GD_UD) */"16"", %%rax \n\t"
1362                 "mov %%rax, %%ds \n\t"
1363                 "mov %%rax, %%es \n\t"
1364               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1365                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1366                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1367                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1368                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1369                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1370                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1371                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1372                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1373                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1374                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1375                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1376                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1377                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1378                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1379                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1380                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1381                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1382                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1383                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1384                 [wordsize]"i"(sizeof(unsigned long))
1385               : "cc", "memory"
1386                 , "rax", "rbx", "rdi", "rsi"
1387                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1388         );
1389
1390         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1391         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1392         printk("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1393                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1394         /* FIXME: do we need to set up other flags? */
1395         vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1396                       X86_EFLAGS_IF | 0x2;
1397         //monitor(NULL);
1398
1399         vcpu->regs.tf_cs = GD_UT;
1400         vcpu->regs.tf_ss = GD_UD;
1401
1402         vcpu->launched = 1;
1403
1404         if (vcpu->fail) {
1405                 printk("failure detected (err %x)\n",
1406                        vmcs_read32(VM_INSTRUCTION_ERROR));
1407                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1408         }
1409
1410         return vmcs_read32(VM_EXIT_REASON);
1411
1412 #if 0
1413         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1414         vmx_complete_atomic_exit(vmx);
1415         vmx_recover_nmi_blocking(vmx);
1416         vmx_complete_interrupts(vmx);
1417 #endif
1418 }
1419
1420 static void vmx_step_instruction(void)
1421 {
1422         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1423                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1424 }
1425
1426 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
1427 {
1428         unsigned long gva, gpa;
1429         int exit_qual, ret = -1;
1430         page_t *page;
1431
1432         vmx_get_cpu(vcpu);
1433         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1434         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1435         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1436         printk("ept: gva %016lx, gpa %016lx\n", gva, gpa);
1437
1438         vmx_put_cpu(vcpu);
1439
1440         // this is a total hack, for testing things.
1441         // note that we only care about the gpa, and the
1442         // gpa is our process virtual address. 
1443         // Confused yet?
1444         page = page_lookup(current->env_pgdir, (void *)gpa, NULL);
1445         printk("Lookup %p returns %p\n", gpa, page);
1446         if (page) {
1447                 uint64_t hpa = page2pa(page);
1448                 printk("hpa for %p is %p\n", gpa, hpa);
1449                 ret = vmx_do_ept_fault(ept, gpa, hpa, exit_qual);
1450                 printk("vmx_do_ept_fault returns %d\n", ret);
1451         }
1452
1453         if (ret) {
1454                 printk("page fault failure "
1455                        "GPA: 0x%lx, GVA: 0x%lx\n",
1456                        gpa, gva);
1457                 vmx_dump_cpu(vcpu);
1458         }
1459
1460         return ret;
1461 }
1462
1463 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
1464 {
1465         unsigned int eax, ebx, ecx, edx;
1466
1467         eax = vcpu->regs.tf_rax;
1468         ecx = vcpu->regs.tf_rcx;
1469         cpuid(0, 2, &eax, &ebx, &ecx, &edx);
1470         vcpu->regs.tf_rax = eax;
1471         vcpu->regs.tf_rbx = ebx;
1472         vcpu->regs.tf_rcx = ecx;
1473         vcpu->regs.tf_rdx = edx;
1474 }
1475
1476 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
1477 {
1478         uint32_t intr_info;
1479
1480         vmx_get_cpu(vcpu);
1481         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1482         vmx_put_cpu(vcpu);
1483
1484         printk("vmx (VPID %d): got an exception\n", vcpu->vpid);
1485         printk("vmx (VPID %d): pid %d\n", vcpu->vpid,
1486                          current->pid);
1487         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1488                 return 0;
1489         }
1490
1491         printk("unhandled nmi, intr_info %x\n", intr_info);
1492         return -EIO;
1493 }
1494
1495
1496 static void noop(void) {
1497         __asm__ __volatile__ ("1: jmp 1b");
1498 }
1499
1500 static void fail(void) {
1501         __asm__ __volatile__ ("movq $0xdeadbeef, %rbx; movq 0, %rax");
1502 }
1503
1504 static unsigned long stack[512];
1505 /**
1506  * vmx_launch - the main loop for a VMX Dune process
1507  * @conf: the launch configuration
1508  */
1509 int vmx_launch(struct dune_config *conf)
1510 {
1511         int ret;
1512         struct dune_struct dune;
1513         struct vmx_vcpu *vcpu;
1514         int i = 0;
1515         unsigned long rip = conf->rip;
1516         unsigned long rsp = conf->rsp;
1517         unsigned long cr3 = conf->cr3;
1518         int errors = 0;
1519
1520         if (conf->rip < 4096 ) {
1521                 // testing.
1522                 switch(conf->rip) {
1523                 default:
1524                         rip = (uint64_t)noop + 4;
1525                         break;
1526                 case 1:
1527                         rip = (uint64_t)fail + 4;
1528                         break;
1529                 }
1530         }
1531
1532         if (conf->cr3 == 0) {
1533                 cr3 = rcr3();
1534         }
1535
1536         /* sanity checking.  -- later
1537         ret = ept_check_page(ept, rip);
1538         if (ret) {
1539                 printk("0x%x is not mapped in the ept!\n", rip);
1540                 errors++;
1541         }
1542         ret = ept_check_page(ept, rsp);
1543         if (ret) {
1544                 printk("0x%x is not mapped in the ept!\n", rsp);
1545                 errors++;
1546         }
1547         */
1548         if (errors) {
1549                 return -EINVAL;
1550         }
1551
1552
1553         printk("RUNNING: %s: rip %p rsp %p cr3 %p \n",
1554                __func__, rip, rsp, cr3);
1555         vcpu = vmx_create_vcpu();
1556         if (!vcpu) {
1557                 return -ENOMEM;
1558         }
1559
1560         vmx_get_cpu(vcpu);
1561         vmcs_writel(GUEST_RIP, rip);
1562         vmcs_writel(GUEST_RSP, rsp);
1563         vmcs_writel(GUEST_CR3, cr3);
1564         vmx_put_cpu(vcpu);
1565
1566         printk("created VCPU (VPID %d): pid %d\n",
1567                vcpu->vpid, current->pid);
1568
1569         vcpu->ret_code = -1;
1570
1571         if (current->virtinfo)
1572                 printk("vmx_launch: current->virtinfo is NOT NULL (%p)\n", current->virtinfo);
1573         //WARN_ON(current->virtinfo != NULL);
1574         dune.vcpu = vcpu;
1575
1576         current->virtinfo = &dune;
1577
1578         while (1) {
1579                 vmx_get_cpu(vcpu);
1580
1581                 // TODO: manage the fpu when we restart.
1582
1583                 // TODO: see if we need to exit before we go much further.
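                /* Enter the guest with interrupts disabled.  A host interrupt
                 * arriving while the guest runs forces a VM exit (see the
                 * EXIT_REASON_EXTERNAL_INTERRUPT case below) and gets serviced
                 * once enable_irq() re-enables interrupts after the exit. */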
1584                 disable_irq();
1585                 ret = vmx_run_vcpu(vcpu);
1586                 enable_irq();
1587
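                /* VMCALL and CPUID exits leave the guest RIP pointing at the
                 * trapping instruction; step past it so the guest does not
                 * just re-execute it when we resume. */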
1588                 if (ret == EXIT_REASON_VMCALL ||
1589                     ret == EXIT_REASON_CPUID) {
1590                         vmx_step_instruction();
1591                 }
1592
1593                 vmx_put_cpu(vcpu);
1594
1595                 if (ret == EXIT_REASON_VMCALL) {
1596                         printk("system call! WTF\n");
1597                 } else if (ret == EXIT_REASON_CPUID)
1598                         vmx_handle_cpuid(vcpu);
1599                 else if (ret == EXIT_REASON_EPT_VIOLATION) {
1600                         if (vmx_handle_ept_violation(vcpu))
1601                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1602                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1603                         if (vmx_handle_nmi_exception(vcpu))
1604                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1605                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1606                         printk("External interrupt\n");
1607                 } else {
1608                         printk("unhandled exit: reason %x, exit qualification %x\n",
1609                                ret, vmcs_read32(EXIT_QUALIFICATION));
1610                         vmx_dump_cpu(vcpu);
1611                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1612                 }
1613
1614                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1615                  * similar to how proc_restartcore/smp_idle only restart the pcpui
1616                  * cur_ctx, we need to do the same, via the VMCS resume business. */
1617
1618                 if (vcpu->shutdown)
1619                         break;
1620         }
1621
1622         printk("RETURN. ip %016lx sp %016lx\n",
1623                 vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1624         monitor(NULL);
1625         current->virtinfo = NULL;
1626
1627         /*
1628          * Return both the reason for the shutdown and a status value.
1629          * The exit() and exit_group() system calls only need 8 bits for
1630          * the status but we allow 16 bits in case we might want to
1631          * return more information for one of the other shutdown reasons.
1632          */
1633         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
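        /* For example, a shutdown reason of 2 with a ret_code of 3 packs to 0x20003. */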
1634
1635         printk("destroying VCPU (VPID %d): pid %d\n",
1636                         vcpu->vpid, current->pid);
1637
1638         vmx_destroy_vcpu(vcpu);
1639
1640         return ret;
1641 }
1642
1643 /**
1644  * __vmx_enable - low-level enable of VMX mode on the current CPU
1645  * @vmxon_buf: an opaque buffer for use as the VMXON region
1646  */
1647 static int __vmx_enable(struct vmcs *vmxon_buf)
1648 {
1649         uint64_t phys_addr = PADDR(vmxon_buf);
1650         uint64_t old, test_bits;
1651
1652         if (rcr4() & X86_CR4_VMXE) {
1653                 panic("__vmx_enable: CR4.VMXE was already set");
1654                 return -EBUSY;
1655         }
1656
1657         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1658
1659         test_bits = FEATURE_CONTROL_LOCKED;
1660         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1661
1662         if (0) // tboot_enabled()
1663                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1664
1665         if ((old & test_bits) != test_bits) {
1666                 /* If it's locked, then trying to set it will cause a GPF.
1667                  * No Dune for you!
1668                  */
1669                 if (old & FEATURE_CONTROL_LOCKED) {
1670                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1671                         return -1;
1672                 }
1673
1674                 /* enable and lock */
1675                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1676         }
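        /* CR4.VMXE must be set before VMXON may be executed. */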
1677         lcr4(rcr4() | X86_CR4_VMXE);
1678
1679         __vmxon(phys_addr);
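        /* Flush any stale VPID-tagged TLB entries and EPT-derived mappings left
         * over from previous VMX use on this core. */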
1680         vpid_sync_vcpu_global();
1681         ept_sync_global();
1682
1683         return 0;
1684 }
1685
1686 /**
1687  * vmx_enable - enables VMX mode on the current CPU
1688  *
1689  * Uses the per-cpu VMXON region already allocated by setup_vmxarea() and
1690  * switches this core into VMX root operation.
1691  */
1692 static void vmx_enable(void)
1693 {
1694         struct vmcs *vmxon_buf = currentcpu->vmxarea;
1695         int ret;
1696
1697         ret = __vmx_enable(vmxon_buf);
1698         if (ret)
1699                 goto failed;
1700
1701         currentcpu->vmx_enabled = 1;
1702         // TODO: do we need this?
1703         store_gdt(&currentcpu->host_gdt);
1704
1705         printk("VMX enabled on CPU %d\n", core_id());
1706         return;
1707
1708 failed:
1709         has_vmx = FALSE;
1710         printk("failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1711 }
1712
1713 /**
1714  * vmx_disable - disables VMX mode on the current CPU
1715  */
1716 static void vmx_disable(void *unused)
1717 {
1718         if (currentcpu->vmx_enabled) {
1719                 __vmxoff();
1720                 lcr4(rcr4() & ~X86_CR4_VMXE);
1721                 currentcpu->vmx_enabled = 0;
1722         }
1723 }
1724
1725 /* Probe the current cpu to see whether it can do vmx.
1726  * Returns TRUE if VMX is supported, FALSE otherwise.
1727  */
1728 static bool probe_cpu_vmx(void)
1729 {
1730         /* The best way to test this code is:
1731          * wrmsr -p <cpu> 0x3a 1
1732          * This will lock vmx off; then modprobe dune.
1733          * Frequently, however, systems have all 0x3a registers set to 5,
1734          * meaning testing is impossible, as vmx cannot be disabled.
1735          * We have to simulate it being unavailable in most cases.
1736          * The 'test' variable provides an easy way to simulate
1737          * unavailability of vmx on some, none, or all cpus.
1738          */
1739         if (!cpu_has_vmx()) {
1740                 printk("Machine does not support VT-x\n");
1741                 return FALSE;
1742         } else {
1743                 printk("Machine supports VT-x\n");
1744                 return TRUE;
1745         }
1746 }
1747
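/* Allocate this core's VMXON region and stash it in currentcpu->vmxarea so
 * vmx_enable() can hand it to VMXON. */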
1748 static void setup_vmxarea(void)
1749 {
1750         struct vmcs *vmxon_buf;
1751         printd("Set up vmxarea for cpu %d\n", core_id());
1752         vmxon_buf = __vmx_alloc_vmcs(node_id());
1753         if (!vmxon_buf) {
1754                 printk("setup_vmxarea failed on core %d\n", core_id());
1755                 return;
1756         }
1757         currentcpu->vmxarea = vmxon_buf;
1758 }
1759
1760 /**
1761  * intel_vmm_init sets up the per-physical-core data areas that are required
1762  * to run a VM at all. These data areas are not tied to any specific user
1763  * process. Instead, they externalize what would otherwise be a very large
1764  * ball of state living inside the CPU.
1765  */
1766 int intel_vmm_init(void)
1767 {
1768         int ret;
1769
1770         if (!probe_cpu_vmx()) {
1771                 return -EOPNOTSUPP;
1772         }
1773
1774         setup_vmcs_config(&ret);
1775
1776         if (ret) {
1777                 printk("setup_vmcs_config failed: %d\n", ret);
1778                 return ret;
1779         }
1780
1781         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
1782         if (!msr_bitmap) {
1783                 printk("Could not allocate msr_bitmap\n");
1784                 return -ENOMEM;
1785         }
1786         /* FIXME: do we need APIC virtualization (flexpriority?) */
1787
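        /* Start with every bit set, which makes the guest exit on any MSR
         * access, then clear the bits for the few MSRs the guest may touch
         * directly. */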
1788         memset(msr_bitmap, 0xff, PAGE_SIZE);
1789         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
1790         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
1791
1792         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
1793
1794         /* TEMPORARY hack so we can do some basic VM testing: create an ept
1795          * and look for faults on it. */
1796         ept = kpage_zalloc_addr();
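        /* construct_eptp() presumably packs PADDR(ept) together with the EPT
         * memory-type and page-walk-length fields the hardware expects in an
         * EPT pointer (see the Intel SDM's EPTP format). */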
1797         eptp = construct_eptp(PADDR(ept));
1798         printk("ept is %p and eptp is %p\n", ept, eptp);
1799         return ret;
1800 }
1801
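/* Per-core counterpart of intel_vmm_init: allocate this core's VMXON region
 * and switch the core into VMX root operation. */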
1802 int intel_vmm_pcpu_init(void)
1803 {
1804         setup_vmxarea();
1805         vmx_enable();
1806         return 0;
1807 }