x86: Fixes init for machines that do not have VMX
[akaros.git] / kern / arch / x86 / vmm / intel / vmx.c
1 /**
2  *  vmx.c - The Intel VT-x driver for Dune
3  *
4  * This file is derived from Linux KVM VT-x support.
5  * Copyright (C) 2006 Qumranet, Inc.
6  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7  *
8  * Original Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This modified version is simpler because it avoids the following
13  * features that are not requirements for Dune:
14  *  * Real-mode emulation
15  *  * Nested VT-x support
16  *  * I/O hardware emulation
17  *  * Any of the more esoteric X86 features and registers
18  *  * KVM-specific functionality
19  *
20  * In essence we provide only the minimum functionality needed to run
21  * a process in vmx non-root mode rather than the full hardware emulation
22  * needed to support an entire OS.
23  *
24  * This driver is a research prototype and as such has the following
25  * limitations:
26  *
27  * FIXME: Backward compatibility is currently a non-goal, and only recent
28  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
29  * driver.
30  *
31  * FIXME: Eventually we should handle concurrent users of VT-x more
32  * gracefully instead of requiring exclusive access. This would allow
33  * Dune to interoperate with KVM and other HV solutions.
34  *
35  * FIXME: We need to support hotplugged physical CPUs.
36  *
37  * Authors:
38  *   Adam Belay   <abelay@stanford.edu>
39  */
40
41 /* Basic flow.
42  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
43  * You're left with the feeling that they got part way through and realized they had to have one for
44  *
45  * 1) your CPU is going to be capable of running VMs, and you need state for that.
46  *
47  * 2) you're about to start a guest, and you need state for that.
48  *
49  * So there is 'get the cpu set up to be able to run VMs' stuff, and 'now
50  * let's start a guest' stuff.  In Akaros, CPUs will always be set up
51  * to run a VM if that is possible. Processes can flip themselves into
52  * a VM and that will require another VMCS.
53  *
54  * So: at kernel startup time, the SMP boot stuff calls
55  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
56  * in the case of this file is intel_vmm_init. That does some code
57  * that sets up stuff for ALL sockets, based on the capabilities of
58  * the socket it runs on. If any cpu supports vmx, it assumes they all
59  * do. That's a realistic assumption. So the call_function_all is kind
60  * of stupid, really; it could just see what's on the current cpu and
61  * assume it's on all. HOWEVER: there are systems in the wilde that
62  * assume it's on all. HOWEVER: there are systems in the wild that
63  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
64  * might as well allow for the chance that we'll only allow VMMCPs on a
65  * count of how many support VMX and, for now, assume they all do
66  * anyway.
67  *
68  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
69  * which contains all the naughty bits settings for all the cpus that can run a VM.
70  * Realistically, all VMX-capable cpus in a system will have identical configurations.
71  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
72  *
73  * Configure the msr_bitmap. This is the bitmap of MSRs which the
74  * guest can manipulate.  Currently, we only allow GS and FS base.
75  *
76  * Reserve bit 0 in the vpid bitmap, as guests cannot use that vpid.
77  *
78  * Set up what we call the vmxarea. The vmxarea is per-cpu, not
79  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
80  * there is the revision id. The vmxarea is page-sized per cpu and
81  * page-aligned. Note that it can be smaller, but why bother? We know
82  * the max size and alignment, and it's convenient.
83  *
84  * Now that it is set up, enable vmx on all cpus. This involves
85  * testing VMXE in cr4, to see if we've been here before (TODO: delete
86  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
87  * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
88  * instruction), and syncing vpids and epts.  Now the CPU is ready
89  * to host guests.
90  *
91  * Setting up a guest.
92  * We divide this into two things: vmm_proc_init and vm_run.
93  * Currently, on Intel, vmm_proc_init does nothing.
94  *
95  * vm_run is really complicated. It is called with a coreid, rip, rsp,
96  * cr3, and flags.  On Intel, it calls vmx_launch. vmx_launch is set
97  * up for a few test cases. If rip is 1, it sets the guest rip to
98  * a function which will deref 0 and should exit with a failure. If rip is 0,
99  * it runs an infinite loop in the guest.
100  *
101  * The sequence of operations:
102  * create a vcpu
103  * while (1) {
104  * get a vcpu
105  * disable irqs (required or you can't enter the VM)
106  * vmx_run_vcpu()
107  * enable irqs
108  * manage the vm exit
109  * }
110  *
111  * get a vcpu
112  * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
113  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
114  *
115  * If it's not the same, see if the vcpu thinks it is loaded on another core. If so, call
116  * __vmx_get_cpu_helper on that cpu, to free it up. Else just vmcs_clear the vcpu's
117  * vmcs. Then vmcs_load the vmcs for the vcpu on this cpu,
118  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
119  *
120  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
121  * of inline assembly with embedded CPP crap. I suspect we'll want to
122  * un-inline it someday, but maybe not.  It's called with a vcpu
123  * struct from which it loads guest state, and to which it stores
124  * non-virtualized host state. It issues a vmlaunch or vmresume
125  * instruction as appropriate, and on return, it evaluates whether the
126  * launch/resume itself had an error. Note this is NOT the
127  * same as an error while in the virtual machine; this is an error in
128  * startup due to misconfiguration. Depending on what is returned, it's
129  * either a failed vm startup or an exit for any of many reasons.
130  *
131  */
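/* Illustrative sketch (not compiled): roughly the per-cpu "enable vmx"
 * sequence described above -- check CR4.VMXE, check MSR_IA32_FEATURE_CONTROL,
 * set CR4.VMXE, then vmxon.  The helper name, the lcr4() write, and the
 * FEATURE_CONTROL_* bit macro are illustrative assumptions only; the real
 * code is __vmx_enable() further down in this file, and the lock-bit handling
 * of the feature-control MSR is glossed over here.
 */
#if 0
static int example_enable_vmx_on_this_core(struct vmcs *vmxarea)
{
        uint64_t feat = read_msr(MSR_IA32_FEATURE_CONTROL);

        if (rcr4() & X86_CR4_VMXE)
                return -EBUSY;          /* someone already enabled VMX here */
        if (!(feat & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX))
                return -ENODEV;         /* BIOS did not allow VMXON */
        lcr4(rcr4() | X86_CR4_VMXE);
        __vmxon(PADDR(vmxarea));        /* vmxarea carries just the revision id */
        return 0;
}
#endif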
132 void monitor(void *);
133 /* basically: only rename those globals that might conflict
134  * with existing names. Leave all else the same.
135  * this code is more modern than the other code, yet still
136  * well encapsulated, it seems.
137  */
138 #include <kmalloc.h>
139 #include <string.h>
140 #include <stdio.h>
141 #include <assert.h>
142 #include <error.h>
143 #include <pmap.h>
144 #include <sys/queue.h>
145 #include <smp.h>
146 #include <kref.h>
147 #include <atomic.h>
148 #include <alarm.h>
149 #include <event.h>
150 #include <umem.h>
151 #include <bitops.h>
152 #include <arch/types.h>
153 #include <syscall.h>
154
155 #include "vmx.h"
156 #include "../vmm.h"
157
158 #include "compat.h"
159 #include "cpufeature.h"
160
161 #define currentcpu (&per_cpu_info[core_id()])
162
163 /* This is set once VMX support has been detected; it should only ever go
164  * from FALSE to TRUE.
165  */
166 static bool has_vmx = FALSE;
167
168 /* TEMPORARY TEST HACK EPT */
169 void *ept;
170 uint64_t eptp;
171 /* END HACKQUE */
172
173 static DECLARE_BITMAP(vmx_vpid_bitmap, /*VMX_NR_VPIDS*/ 65536);
174 static spinlock_t vmx_vpid_lock;
175
176 static unsigned long *msr_bitmap;
177
178 static struct vmcs_config {
179         int size;
180         int order;
181         uint32_t revision_id;
182         uint32_t pin_based_exec_ctrl;
183         uint32_t cpu_based_exec_ctrl;
184         uint32_t cpu_based_2nd_exec_ctrl;
185         uint32_t vmexit_ctrl;
186         uint32_t vmentry_ctrl;
187 } vmcs_config;
188
189 struct vmx_capability vmx_capability;
190
191 static inline bool cpu_has_secondary_exec_ctrls(void)
192 {
193         return vmcs_config.cpu_based_exec_ctrl &
194                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
195 }
196
197 static inline bool cpu_has_vmx_vpid(void)
198 {
199         return vmcs_config.cpu_based_2nd_exec_ctrl &
200                 SECONDARY_EXEC_ENABLE_VPID;
201 }
202
203 static inline bool cpu_has_vmx_invpcid(void)
204 {
205         return vmcs_config.cpu_based_2nd_exec_ctrl &
206                 SECONDARY_EXEC_ENABLE_INVPCID;
207 }
208
209 static inline bool cpu_has_vmx_invvpid_single(void)
210 {
211         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
212 }
213
214 static inline bool cpu_has_vmx_invvpid_global(void)
215 {
216         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
217 }
218
219 static inline bool cpu_has_vmx_ept(void)
220 {
221         return vmcs_config.cpu_based_2nd_exec_ctrl &
222                 SECONDARY_EXEC_ENABLE_EPT;
223 }
224
225 static inline bool cpu_has_vmx_invept_individual_addr(void)
226 {
227         return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
228 }
229
230 static inline bool cpu_has_vmx_invept_context(void)
231 {
232         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
233 }
234
235 static inline bool cpu_has_vmx_invept_global(void)
236 {
237         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
238 }
239
240 static inline bool cpu_has_vmx_ept_ad_bits(void)
241 {
242         return vmx_capability.ept & VMX_EPT_AD_BIT;
243 }
244
245 static inline void __invept(int ext, uint64_t eptp, gpa_t gpa)
246 {
247         struct {
248                 uint64_t eptp, gpa;
249         } operand = {eptp, gpa};
250
251         asm volatile (ASM_VMX_INVEPT
252                         /* CF==1 or ZF==1 --> rc = -1 */
253                         "; ja 1f ; ud2 ; 1:\n"
254                         : : "a" (&operand), "c" (ext) : "cc", "memory");
255 }
256
257 static inline void ept_sync_global(void)
258 {
259         if (cpu_has_vmx_invept_global())
260                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
261 }
262
263 static inline void ept_sync_context(uint64_t eptp)
264 {
265         if (cpu_has_vmx_invept_context())
266                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
267         else
268                 ept_sync_global();
269 }
270
271 static inline void ept_sync_individual_addr(uint64_t eptp, gpa_t gpa)
272 {
273         if (cpu_has_vmx_invept_individual_addr())
274                 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
275                                 eptp, gpa);
276         else
277                 ept_sync_context(eptp);
278 }
279
280 static inline void __vmxon(uint64_t addr)
281 {
282         asm volatile (ASM_VMX_VMXON_RAX
283                         : : "a"(&addr), "m"(addr)
284                         : "memory", "cc");
285 }
286
287 static inline void __vmxoff(void)
288 {
289         asm volatile (ASM_VMX_VMXOFF : : : "cc");
290 }
291
292 static inline void __invvpid(int ext, uint16_t vpid, gva_t gva)
293 {
294     struct {
295         uint64_t vpid : 16;
296         uint64_t rsvd : 48;
297         uint64_t gva;
298     } operand = { vpid, 0, gva };
299
300     asm volatile (ASM_VMX_INVVPID
301                   /* CF==1 or ZF==1 --> rc = -1 */
302                   "; ja 1f ; ud2 ; 1:"
303                   : : "a"(&operand), "c"(ext) : "cc", "memory");
304 }
305
306 static inline void vpid_sync_vcpu_single(uint16_t vpid)
307 {
308         if (vpid == 0) {
309                 return;
310         }
311
312         if (cpu_has_vmx_invvpid_single())
313                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
314 }
315
316 static inline void vpid_sync_vcpu_global(void)
317 {
318         if (cpu_has_vmx_invvpid_global())
319                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
320 }
321
322 static inline void vpid_sync_context(uint16_t vpid)
323 {
324         if (cpu_has_vmx_invvpid_single())
325                 vpid_sync_vcpu_single(vpid);
326         else
327                 vpid_sync_vcpu_global();
328 }
329
330 static void vmcs_clear(struct vmcs *vmcs)
331 {
332         uint64_t phys_addr = PADDR(vmcs);
333         uint8_t error;
334
335         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
336                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
337                       : "cc", "memory");
338         if (error)
339                 printk("vmclear fail: %p/%llx\n",
340                        vmcs, phys_addr);
341 }
342
343 static void vmcs_load(struct vmcs *vmcs)
344 {
345         uint64_t phys_addr = PADDR(vmcs);
346         uint8_t error;
347
348         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
349                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
350                         : "cc", "memory");
351         if (error)
352                 printk("vmptrld %p/%llx failed\n",
353                        vmcs, phys_addr);
354 }
355
356
357 __always_inline unsigned long vmcs_readl(unsigned long field)
358 {
359         unsigned long value;
360
361         asm volatile (ASM_VMX_VMREAD_RDX_RAX
362                       : "=a"(value) : "d"(field) : "cc");
363         return value;
364 }
365
366 __always_inline uint16_t vmcs_read16(unsigned long field)
367 {
368         return vmcs_readl(field);
369 }
370
371 static __always_inline uint32_t vmcs_read32(unsigned long field)
372 {
373         return vmcs_readl(field);
374 }
375
376 static __always_inline uint64_t vmcs_read64(unsigned long field)
377 {
378 #ifdef CONFIG_X86_64
379         return vmcs_readl(field);
380 #else
381         return vmcs_readl(field) | ((uint64_t)vmcs_readl(field+1) << 32);
382 #endif
383 }
384
385 void vmwrite_error(unsigned long field, unsigned long value)
386 {
387         printk("vmwrite error: reg %lx value %lx (err %d)\n",
388                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
389         /* not available so ...
390         dump_stack();
391         */
392         monitor(NULL);
393 }
394
395 void vmcs_writel(unsigned long field, unsigned long value)
396 {
397         uint8_t error;
398
399         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
400                        : "=q"(error) : "a"(value), "d"(field) : "cc");
401         if (error)
402                 vmwrite_error(field, value);
403 }
404
405 static void vmcs_write16(unsigned long field, uint16_t value)
406 {
407         vmcs_writel(field, value);
408 }
409
410 static void vmcs_write32(unsigned long field, uint32_t value)
411 {
412         vmcs_writel(field, value);
413 }
414
415 static void vmcs_write64(unsigned long field, uint64_t value)
416 {
417         vmcs_writel(field, value);
418 #ifndef CONFIG_X86_64
419         asm volatile ("");
420         vmcs_writel(field+1, value >> 32);
421 #endif
422 }
423
424
425 static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt,
426                                       uint32_t msr, uint32_t *result)
427 {
428         uint32_t vmx_msr_low, vmx_msr_high;
429         uint32_t ctl = ctl_min | ctl_opt;
430         uint64_t vmx_msr = read_msr(msr);
431         vmx_msr_low = vmx_msr;
432         vmx_msr_high = vmx_msr>>32;
433
434         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
435         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
436
437         /* Ensure minimum (required) set of control bits are supported. */
438         if (ctl_min & ~ctl) {
439                 return -EIO;
440         }
441
442         *result = ctl;
443         return 0;
444 }
445
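/* Worked example of the allowed-0/allowed-1 logic in adjust_vmx_controls()
 * above, using made-up MSR halves.  The high half of a VMX capability MSR has
 * a 1 for every control bit the hardware allows to be 1; the low half has a 1
 * for every bit the hardware forces to 1. */
#if 0
        uint32_t lo = 0x00000016;       /* hypothetical "must be one" bits */
        uint32_t hi = 0x0000f7ff;       /* hypothetical "may be one" bits */
        uint32_t ctl = ctl_min | ctl_opt;

        ctl &= hi;      /* drop requested bits the hardware cannot set */
        ctl |= lo;      /* add bits the hardware insists on */
        if (ctl_min & ~ctl)
                return -EIO;    /* a required control is unavailable */
#endif
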
446 static  bool allow_1_setting(uint32_t msr, uint32_t ctl)
447 {
448         uint32_t vmx_msr_low, vmx_msr_high;
449
450         rdmsr(msr, vmx_msr_low, vmx_msr_high);
451         return vmx_msr_high & ctl;
452 }
453
454 static  void setup_vmcs_config(void *p)
455 {
456         int *ret = p;
457         struct vmcs_config *vmcs_conf = &vmcs_config;
458         uint32_t vmx_msr_low, vmx_msr_high;
459         uint32_t min, opt, min2, opt2;
460         uint32_t _pin_based_exec_control = 0;
461         uint32_t _cpu_based_exec_control = 0;
462         uint32_t _cpu_based_2nd_exec_control = 0;
463         uint32_t _vmexit_control = 0;
464         uint32_t _vmentry_control = 0;
465
466         *ret = -EIO;
467         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
468         opt = PIN_BASED_VIRTUAL_NMIS;
469         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
470                                 &_pin_based_exec_control) < 0) {
471                 return;
472         }
473
474         min =
475               CPU_BASED_CR8_LOAD_EXITING |
476               CPU_BASED_CR8_STORE_EXITING |
477               CPU_BASED_CR3_LOAD_EXITING |
478               CPU_BASED_CR3_STORE_EXITING |
479               CPU_BASED_MOV_DR_EXITING |
480               CPU_BASED_USE_TSC_OFFSETING |
481               CPU_BASED_MWAIT_EXITING |
482               CPU_BASED_MONITOR_EXITING |
483               CPU_BASED_INVLPG_EXITING;
484
485         min |= CPU_BASED_HLT_EXITING;
486
487         opt = CPU_BASED_TPR_SHADOW |
488               CPU_BASED_USE_MSR_BITMAPS |
489               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
490         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
491                                 &_cpu_based_exec_control) < 0) {
492                 return;
493         }
494
495         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
496                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
497                                            ~CPU_BASED_CR8_STORE_EXITING;
498
499         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
500                 min2 = 
501                         SECONDARY_EXEC_ENABLE_VPID |
502                         SECONDARY_EXEC_ENABLE_EPT |
503                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
504                 opt2 =  SECONDARY_EXEC_WBINVD_EXITING |
505                         SECONDARY_EXEC_RDTSCP |
506                         SECONDARY_EXEC_ENABLE_INVPCID;
507                 if (adjust_vmx_controls(min2, opt2,
508                                         MSR_IA32_VMX_PROCBASED_CTLS2,
509                                         &_cpu_based_2nd_exec_control) < 0) {
510                         return;
511                 }
512         }
513
514         if (!(_cpu_based_2nd_exec_control &
515                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
516                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
517
518         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
519                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
520                    enabled */
521                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
522                                              CPU_BASED_CR3_STORE_EXITING |
523                                              CPU_BASED_INVLPG_EXITING);
524                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
525                       vmx_capability.ept, vmx_capability.vpid);
526         }
527
528         min = 0;
529
530         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
531
532 //      opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
533         opt = 0;
534         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
535                                 &_vmexit_control) < 0) {
536                 return;
537         }
538
539         min = 0;
540 //      opt = VM_ENTRY_LOAD_IA32_PAT;
541         opt = 0;
542         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
543                                 &_vmentry_control) < 0) {
544                 return;
545         }
546
547         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
548
549         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
550         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) {
551                 return;
552         }
553
554         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
555         if (vmx_msr_high & (1u<<16)) {
556                 printk("64-bit CPUs always have VMX_BASIC_MSR[48]==0. FAILS!\n");
557                 return;
558         }
559
560         /* Require Write-Back (WB) memory type for VMCS accesses. */
561         if (((vmx_msr_high >> 18) & 15) != 6) {
562                 printk("NO WB!\n");
563                 return;
564         }
565
566         vmcs_conf->size = vmx_msr_high & 0x1fff;
567         vmcs_conf->order = LOG2_UP(vmcs_config.size>> PAGE_SHIFT);
568         vmcs_conf->revision_id = vmx_msr_low;
569         printk("vmcs_conf size %d order %d rev %d\n",
570                vmcs_conf->size, vmcs_conf->order,
571                vmcs_conf->revision_id);
572
573         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
574         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
575         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
576         vmcs_conf->vmexit_ctrl         = _vmexit_control;
577         vmcs_conf->vmentry_ctrl        = _vmentry_control;
578
579         vmx_capability.has_load_efer =
580                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
581                                 VM_ENTRY_LOAD_IA32_EFER)
582                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
583                                    VM_EXIT_LOAD_IA32_EFER);
584
585         /* Now that we've done all the setup we can do, verify
586          * that we have all the capabilities we need. These tests
587          * are done last presumably because all the work done above
588          * affects some of them.
589          */
590
591         if (!vmx_capability.has_load_efer) {
592                 printk("CPU lacks ability to load EFER register\n");
593                 return;
594         }
595
596         printk("CPU has all needed capabilities\n");
597         *ret = 0;
598 }
599
600 static struct vmcs *__vmx_alloc_vmcs(int node)
601 {
602         struct vmcs *vmcs;
603
604         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
605         if (!vmcs)
606                 return 0;
607         memset(vmcs, 0, vmcs_config.size);
608         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
609         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
610         return vmcs;
611 }
612
613 /**
614  * vmx_alloc_vmcs - allocates a VMCS region
615  *
616  * NOTE: Assumes the new region will be used by the current CPU.
617  *
618  * Returns a valid VMCS region.
619  */
620 static struct vmcs *vmx_alloc_vmcs(void)
621 {
622         return __vmx_alloc_vmcs(node_id());
623 }
624
625 /**
626  * vmx_free_vmcs - frees a VMCS region
627  */
628 static void vmx_free_vmcs(struct vmcs *vmcs)
629 {
630   //free_pages((unsigned long)vmcs, vmcs_config.order);
631 }
632
633 /*
634  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
635  * will not change in the lifetime of the guest.
636  * Note that host-state that does change is set elsewhere. E.g., host-state
637  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
638  */
639 static void vmx_setup_constant_host_state(void)
640 {
641         uint32_t low32, high32;
642         unsigned long tmpl;
643         pseudodesc_t dt;
644
645         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);  /* 22.2.3 */
646         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
647         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
648
649         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
650         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
651         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
652         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
653         vmcs_write16(HOST_TR_SELECTOR, GD_TSS*8);  /* 22.2.4 */
654
655         native_store_idt(&dt);
656         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);   /* 22.2.4 */
657
658         asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
659         vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
660
661         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
662         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
663         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
664         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
665
666         rdmsr(MSR_EFER, low32, high32);
667         vmcs_write32(HOST_IA32_EFER, low32);
668
669         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
670                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
671                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
672         }
673
674         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
675         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
676
677 #ifdef CONFIG_X86_64
678         rdmsrl(MSR_FS_BASE, tmpl);
679         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
680         rdmsrl(MSR_GS_BASE, tmpl);
681         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
682 #else
683         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
684         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
685 #endif
686 }
687
688 static inline uint16_t vmx_read_ldt(void)
689 {
690         uint16_t ldt;
691         asm("sldt %0" : "=g"(ldt));
692         return ldt;
693 }
694
695 static unsigned long segment_base(uint16_t selector)
696 {
697         pseudodesc_t *gdt = &currentcpu->host_gdt;
698         struct desc_struct *d;
699         unsigned long table_base;
700         unsigned long v;
701
702         if (!(selector & ~3)) {
703                 return 0;
704         }
705
706         table_base = gdt->pd_base;
707
708         if (selector & 4) {           /* from ldt */
709                 uint16_t ldt_selector = vmx_read_ldt();
710
711                 if (!(ldt_selector & ~3)) {
712                         return 0;
713                 }
714
715                 table_base = segment_base(ldt_selector);
716         }
717         d = (struct desc_struct *)(table_base + (selector & ~7));
718         v = get_desc_base(d);
719 #ifdef CONFIG_X86_64
720        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
721                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
722 #endif
723         return v;
724 }
725
726 static inline unsigned long vmx_read_tr_base(void)
727 {
728         uint16_t tr;
729         asm("str %0" : "=g"(tr));
730         return segment_base(tr);
731 }
732
733 static void __vmx_setup_cpu(void)
734 {
735         pseudodesc_t *gdt = &currentcpu->host_gdt;
736         unsigned long sysenter_esp;
737         unsigned long tmpl;
738
739         /*
740          * Linux uses per-cpu TSS and GDT, so set these when switching
741          * processors.
742          */
743         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
744         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);   /* 22.2.4 */
745
746         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
747         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
748
749         rdmsrl(MSR_FS_BASE, tmpl);
750         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
751         rdmsrl(MSR_GS_BASE, tmpl);
752         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
753 }
754
755 static void __vmx_get_cpu_helper(struct hw_trapframe *hw_tf, void *ptr)
756 {
757         struct vmx_vcpu *vcpu = ptr;
758
759         if (core_id() != vcpu->cpu)
760                 panic("%s: core_id() %d != vcpu->cpu %d\n",
761                       __func__, core_id(), vcpu->cpu);
762
763         vmcs_clear(vcpu->vmcs);
764         if (currentcpu->local_vcpu == vcpu)
765                 currentcpu->local_vcpu = NULL;
766 }
767
768 /**
769  * vmx_get_cpu - called before using a cpu
770  * @vcpu: VCPU that will be loaded.
771  *
772  * Disables preemption. Call vmx_put_cpu() when finished.
773  */
774 static void vmx_get_cpu(struct vmx_vcpu *vcpu)
775 {
776         int cur_cpu = core_id();
777         handler_wrapper_t *w;
778
779         //printk("currentcpu->local_vcpu %p vcpu %p\n",
780                 //currentcpu->local_vcpu, vcpu);
781         if (currentcpu->local_vcpu != vcpu) {
782                 currentcpu->local_vcpu = vcpu;
783
784                 if (vcpu->cpu != cur_cpu) {
785                         if (vcpu->cpu >= 0) {
786                                 smp_call_function_single(vcpu->cpu,
787                                                          __vmx_get_cpu_helper, (void *) vcpu, &w);
788                                 if (smp_call_wait(w))
789                                         printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
790                         } else
791                                 vmcs_clear(vcpu->vmcs);
792
793 //                      vpid_sync_context(vcpu->vpid);
794 //                      ept_sync_context(current->vmm->
795
796                         vcpu->launched = 0;
797                         vmcs_load(vcpu->vmcs);
798                         __vmx_setup_cpu();
799                         vcpu->cpu = cur_cpu;
800                 } else {
801                         vmcs_load(vcpu->vmcs);
802                 }
803         }
804 }
805
806 /**
807  * vmx_put_cpu - called after using a cpu
808  * @vcpu: VCPU that was loaded.
809  */
810 static void vmx_put_cpu(struct vmx_vcpu *vcpu)
811 {
812         //put_cpu();
813 }
814
815 static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr)
816 {
817         struct vmx_vcpu *vcpu = ptr;
818
819 //      ept_sync_context(current);
820 }
821
822 struct sync_addr_args {
823         struct vmx_vcpu *vcpu;
824         gpa_t gpa;
825 };
826
827 static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr)
828 {
829         struct sync_addr_args *args = ptr;
830
831 //      ept_sync_individual_addr(
832
833 }
834
835 /**
836  * vmx_ept_sync_vcpu - used to evict everything in the EPT
837  * @vcpu: the vcpu
838  */
839 void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu)
840 {
841         handler_wrapper_t *w;
842
843         smp_call_function_single(vcpu->cpu,
844                 __vmx_sync_helper, (void *) vcpu, &w);
845
846         if (smp_call_wait(w)) {
847                 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
848         }
849
850
851 }
852
853 /**
854  * vmx_ept_sync_individual_addr - used to evict an individual address
855  * @vcpu: the vcpu
856  * @gpa: the guest-physical address
857  */
858 void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa)
859 {
860         struct sync_addr_args args;
861         args.vcpu = vcpu;
862         args.gpa = gpa;
863
864         handler_wrapper_t *w;
865
866
867         smp_call_function_single(vcpu->cpu,
868                                  __vmx_sync_individual_addr_helper, (void *) &args, &w);
869
870         if (smp_call_wait(w)) {
871                 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
872         }
873
874 }
875
876 /**
877  * vmx_dump_cpu - prints the CPU state
878  * @vcpu: VCPU to print
879  */
880 static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
881 {
882
883         unsigned long flags;
884
885         vmx_get_cpu(vcpu);
886         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
887         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
888         flags = vmcs_readl(GUEST_RFLAGS);
889         vmx_put_cpu(vcpu);
890
891         printk("--- Begin VCPU Dump ---\n");
892         printk("CPU %d VPID %d\n", vcpu->cpu, vcpu->vpid);
893         printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
894                vcpu->regs.tf_rip, flags);
895         printk("RAX 0x%016lx RCX 0x%016lx\n",
896                 vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
897         printk("RDX 0x%016lx RBX 0x%016lx\n",
898                 vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
899         printk("RSP 0x%016lx RBP 0x%016lx\n",
900                 vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
901         printk("RSI 0x%016lx RDI 0x%016lx\n",
902                 vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
903         printk("R8  0x%016lx R9  0x%016lx\n",
904                 vcpu->regs.tf_r8, vcpu->regs.tf_r9);
905         printk("R10 0x%016lx R11 0x%016lx\n",
906                 vcpu->regs.tf_r10, vcpu->regs.tf_r11);
907         printk("R12 0x%016lx R13 0x%016lx\n",
908                 vcpu->regs.tf_r12, vcpu->regs.tf_r13);
909         printk("R14 0x%016lx R15 0x%016lx\n",
910                 vcpu->regs.tf_r14, vcpu->regs.tf_r15);
911         printk("--- End VCPU Dump ---\n");
912
913 }
914
915 uint64_t construct_eptp(unsigned long root_hpa)
916 {
917         uint64_t eptp;
918
919         /* TODO: derive this value by reading the MSR instead of using defaults */
920         eptp = VMX_EPT_DEFAULT_MT |
921                 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
922         if (cpu_has_vmx_ept_ad_bits())
923                 eptp |= VMX_EPT_AD_ENABLE_BIT;
924         eptp |= (root_hpa & PAGE_MASK);
925
926         return eptp;
927 }
928
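/* Usage sketch: how the eptp global above would typically be derived from the
 * root of an EPT paging structure.  'ept' stands in for the page-aligned EPT
 * top-level table from the temporary hack near the top of this file; the
 * actual VMCS write happens in vmx_setup_vmcs() below. */
#if 0
        eptp = construct_eptp(PADDR(ept));
        vmcs_write64(EPT_POINTER, eptp);
#endif
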
929 /**
930  * vmx_setup_initial_guest_state - configures the initial state of guest registers
931  */
932 static void vmx_setup_initial_guest_state(void)
933 {
934         unsigned long tmpl;
935         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
936                             X86_CR4_PGE | X86_CR4_OSFXSR;
937         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
938 #if 0
939         do we need it
940         if (boot_cpu_has(X86_FEATURE_PCID))
941                 cr4 |= X86_CR4_PCIDE;
942         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
943                 cr4 |= X86_CR4_OSXSAVE;
944 #endif
945         /* we almost certainly have this */
946         /* we'll go sour if we don't. */
947         if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
948                 cr4 |= X86_CR4_RDWRGSFS;
949
950         /* configure control and data registers */
951         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
952                                X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
953         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
954                                      X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
955         vmcs_writel(GUEST_CR3, rcr3());
956         vmcs_writel(GUEST_CR4, cr4);
957         vmcs_writel(CR4_READ_SHADOW, cr4);
958         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
959                                      EFER_SCE | EFER_FFXSR);
960         vmcs_writel(GUEST_GDTR_BASE, 0);
961         vmcs_writel(GUEST_GDTR_LIMIT, 0);
962         vmcs_writel(GUEST_IDTR_BASE, 0);
963         vmcs_writel(GUEST_IDTR_LIMIT, 0);
964         vmcs_writel(GUEST_RIP, 0xdeadbeef);
965         vmcs_writel(GUEST_RSP, 0xdeadbeef);
966         vmcs_writel(GUEST_RFLAGS, 0x02);
967         vmcs_writel(GUEST_DR7, 0);
968
969         /* guest segment bases */
970         vmcs_writel(GUEST_CS_BASE, 0);
971         vmcs_writel(GUEST_DS_BASE, 0);
972         vmcs_writel(GUEST_ES_BASE, 0);
973         vmcs_writel(GUEST_GS_BASE, 0);
974         vmcs_writel(GUEST_SS_BASE, 0);
975         rdmsrl(MSR_FS_BASE, tmpl);
976         vmcs_writel(GUEST_FS_BASE, tmpl);
977
978         /* guest segment access rights */
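        /* (Decoding the AR-byte values used below, per the VMCS access-rights
         * layout: 0xA09B = G=1, L=1 (64-bit), P=1, S=1, DPL=0, type 0xB
         * (execute/read code, accessed); 0xA093 is the same but with type 0x3
         * (read/write data, accessed).) */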
979         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
980         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
981         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
982         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
983         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
984         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
985
986         /* guest segment limits */
987         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
988         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
989         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
990         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
991         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
992         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
993
994         /* configure segment selectors */
995         vmcs_write16(GUEST_CS_SELECTOR, 0);
996         vmcs_write16(GUEST_DS_SELECTOR, 0);
997         vmcs_write16(GUEST_ES_SELECTOR, 0);
998         vmcs_write16(GUEST_FS_SELECTOR, 0);
999         vmcs_write16(GUEST_GS_SELECTOR, 0);
1000         vmcs_write16(GUEST_SS_SELECTOR, 0);
1001         vmcs_write16(GUEST_TR_SELECTOR, 0);
1002
1003         /* guest LDTR */
1004         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1005         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1006         vmcs_writel(GUEST_LDTR_BASE, 0);
1007         vmcs_writel(GUEST_LDTR_LIMIT, 0);
1008
1009         /* guest TSS */
1010         vmcs_writel(GUEST_TR_BASE, 0);
1011         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1012         vmcs_writel(GUEST_TR_LIMIT, 0xff);
1013
1014         /* initialize sysenter */
1015         vmcs_write32(GUEST_SYSENTER_CS, 0);
1016         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1017         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1018
1019         /* other random initialization */
1020         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1021         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1022         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1023         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1024         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1025 }
1026
1027 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
1028 {
1029         int f = sizeof(unsigned long);
1030         /*
1031          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1032          * have the write-low and read-high bitmap offsets the wrong way round.
1033          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1034          */
1035         if (msr <= 0x1fff) {
1036                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
1037                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
1038         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1039                 msr &= 0x1fff;
1040                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
1041                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
1042         }
1043 }
1044
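/* Usage sketch: the comment at the top of this file says the intent is to let
 * the guest touch FS and GS base directly.  Opening those up would look like
 * the calls below (MSR_FS_BASE/MSR_GS_BASE are 0xc0000100/0xc0000101, so they
 * fall in the "high" quadrants of the bitmap); setup_msr() below currently
 * only does this for MSR_LSTAR. */
#if 0
        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
#endif
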
1045 static void setup_msr(struct vmx_vcpu *vcpu)
1046 {
1047         int set[] = { MSR_LSTAR };
1048         struct vmx_msr_entry *e;
1049         int sz = sizeof(set) / sizeof(*set);
1050         int i;
1051
1052         //BUILD_BUG_ON(sz > NR_AUTOLOAD_MSRS);
1053
1054         vcpu->msr_autoload.nr = sz;
1055
1056         /* XXX enable only MSRs in set */
1057         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1058
1059         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1060         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1061         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1062
1063         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1064         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1065         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1066
1067         for (i = 0; i < sz; i++) {
1068                 uint64_t val;
1069
1070                 e = &vcpu->msr_autoload.host[i];
1071                 e->index = set[i];
1072                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1073                 rdmsrl(e->index, val);
1074                 e->value = val;
1075
1076                 e = &vcpu->msr_autoload.guest[i];
1077                 e->index = set[i];
1078                 e->value = 0xDEADBEEF;
1079         }
1080 }
1081
1082 /**
1083  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1084  */
1085 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
1086 {
1087         vmcs_write16(VIRTUAL_PROCESSOR_ID, vcpu->vpid);
1088         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1089
1090         /* Control */
1091         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1092                 vmcs_config.pin_based_exec_ctrl);
1093
1094         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1095                 vmcs_config.cpu_based_exec_ctrl);
1096
1097         if (cpu_has_secondary_exec_ctrls()) {
1098                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1099                              vmcs_config.cpu_based_2nd_exec_ctrl);
1100         }
1101
1102         vmcs_write64(EPT_POINTER, eptp);
1103
1104         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1105         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1106         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1107
1108         setup_msr(vcpu);
1109 #if 0
1110         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1111                 uint32_t msr_low, msr_high;
1112                 uint64_t host_pat;
1113                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
1114                 host_pat = msr_low | ((uint64_t) msr_high << 32);
1115                 /* Write the default value follow host pat */
1116                 vmcs_write64(GUEST_IA32_PAT, host_pat);
1117                 /* Keep arch.pat sync with GUEST_IA32_PAT */
1118                 vmx->vcpu.arch.pat = host_pat;
1119         }
1120
1121         for (i = 0; i < NR_VMX_MSR; ++i) {
1122                 uint32_t index = vmx_msr_index[i];
1123                 uint32_t data_low, data_high;
1124                 int j = vmx->nmsrs;
1125
1126                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1127                         continue;
1128                 if (wrmsr_safe(index, data_low, data_high) < 0)
1129                         continue;
1130                 vmx->guest_msrs[j].index = i;
1131                 vmx->guest_msrs[j].data = 0;
1132                 vmx->guest_msrs[j].mask = -1ull;
1133                 ++vmx->nmsrs;
1134         }
1135 #endif
1136
1137         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1138
1139         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1140         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1141
1142         vmcs_writel(CR0_GUEST_HOST_MASK, ~0ul);
1143         vmcs_writel(CR4_GUEST_HOST_MASK, ~0ul);
1144
1145         //kvm_write_tsc(&vmx->vcpu, 0);
1146         vmcs_writel(TSC_OFFSET, 0);
1147
1148         vmx_setup_constant_host_state();
1149 }
1150
1151 /**
1152  * vmx_allocate_vpid - reserves a vpid and sets it in the VCPU
1153  * @vmx: the VCPU
1154  */
1155 static int vmx_allocate_vpid(struct vmx_vcpu *vmx)
1156 {
1157         int vpid;
1158
1159         vmx->vpid = 0;
1160
1161         spin_lock(&vmx_vpid_lock);
1162         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
1163         if (vpid < VMX_NR_VPIDS) {
1164                 vmx->vpid = vpid;
1165                 __set_bit(vpid, vmx_vpid_bitmap);
1166         }
1167         spin_unlock(&vmx_vpid_lock);
1168
1169         return vpid >= VMX_NR_VPIDS;
1170 }
1171
1172 /**
1173  * vmx_free_vpid - frees a vpid
1174  * @vmx: the VCPU
1175  */
1176 static void vmx_free_vpid(struct vmx_vcpu *vmx)
1177 {
1178         spin_lock(&vmx_vpid_lock);
1179         if (vmx->vpid != 0)
1180                 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
1181         spin_unlock(&vmx_vpid_lock);
1182 }
1183
1184 /**
1185  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1186  *
1187  * Returns: A new VCPU structure
1188  */
1189 struct vmx_vcpu *vmx_create_vcpu(void)
1190 {
1191         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1192         if (!vcpu) {
1193                 return NULL;
1194         }
1195
1196         memset(vcpu, 0, sizeof(*vcpu));
1197
1198         vcpu->vmcs = vmx_alloc_vmcs();
1199         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1200         if (!vcpu->vmcs)
1201                 goto fail_vmcs;
1202
1203         if (vmx_allocate_vpid(vcpu))
1204                 goto fail_vpid;
1205
1206         printd("%d: vmx_create_vcpu: vpid %d\n", core_id(), vcpu->vpid);
1207         vcpu->cpu = -1;
1208
1209         vmx_get_cpu(vcpu);
1210         vmx_setup_vmcs(vcpu);
1211         vmx_setup_initial_guest_state();
1212         vmx_put_cpu(vcpu);
1213
1214 #if 0
1215         if (cpu_has_vmx_ept_ad_bits()) {
1216                 vcpu->ept_ad_enabled = true;
1217                 printk("vmx: enabled EPT A/D bits");
1218         }
1219         if (vmx_create_ept(vcpu->gv))
1220                 goto fail_ept;
1221 #endif
1222
1223         return vcpu;
1224
1225 fail_ept:
1226         vmx_free_vpid(vcpu);
1227 fail_vpid:
1228         vmx_free_vmcs(vcpu->vmcs);
1229 fail_vmcs:
1230         kfree(vcpu);
1231         return NULL;
1232 }
1233
1234 /**
1235  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1236  * @vcpu: the VCPU to destroy
1237  */
1238 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
1239 {
1240         // needs to be done when we tear down the gv. vmx_destroy_ept(vcpu->gv);
1241         vmx_get_cpu(vcpu);
1242 //      ept_sync_context
1243         vmcs_clear(vcpu->vmcs);
1244         currentcpu->local_vcpu = NULL;
1245         vmx_put_cpu(vcpu);
1246         vmx_free_vpid(vcpu);
1247         vmx_free_vmcs(vcpu->vmcs);
1248         kfree(vcpu);
1249 }
1250
1251 /**
1252  * vmx_task_vcpu - returns a pointer to the task's vcpu or NULL.
1253  * @p: the process
1254  */
1255 static inline struct vmx_vcpu *vmx_task_vcpu(struct proc *p)
1256 {
1257         struct dune_struct *dune = p->virtinfo;
1258         return dune ? dune->vcpu : NULL;
1259 }
1260
1261 /**
1262  * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1263  *
1264  * In the contexts where this is used the vcpu pointer should never be NULL.
1265  */
1266 static inline struct vmx_vcpu *vmx_current_vcpu(void)
1267 {
1268         struct vmx_vcpu *vcpu = vmx_task_vcpu(current);
1269         if (! vcpu)
1270                 panic("%s: core_id %d: no vcpu", __func__, core_id());
1271         return vcpu;
1272 }
1273
1274
1275 /**
1276  * vmx_run_vcpu - launches the CPU into non-root mode
1277  * We ONLY support 64-bit guests.
1278  * @vcpu: the vmx instance to launch
1279  */
1280 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1281 {
1282         asm(
1283                 /* Store host registers */
1284                 "push %%rdx; push %%rbp;"
1285                 "push %%rcx \n\t" /* placeholder for guest rcx */
1286                 "push %%rcx \n\t"
1287                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1288                 "je 1f \n\t"
1289                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1290                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1291                 "1: \n\t"
1292                 /* Reload cr2 if changed */
1293                 "mov %c[cr2](%0), %%rax \n\t"
1294                 "mov %%cr2, %%rdx \n\t"
1295                 "cmp %%rax, %%rdx \n\t"
1296                 "je 2f \n\t"
1297                 "mov %%rax, %%cr2 \n\t"
1298                 "2: \n\t"
1299                 /* Check if vmlaunch or vmresume is needed */
1300                 "cmpl $0, %c[launched](%0) \n\t"
1301                 /* Load guest registers.  Don't clobber flags. */
1302                 "mov %c[rax](%0), %%rax \n\t"
1303                 "mov %c[rbx](%0), %%rbx \n\t"
1304                 "mov %c[rdx](%0), %%rdx \n\t"
1305                 "mov %c[rsi](%0), %%rsi \n\t"
1306                 "mov %c[rdi](%0), %%rdi \n\t"
1307                 "mov %c[rbp](%0), %%rbp \n\t"
1308                 "mov %c[r8](%0),  %%r8  \n\t"
1309                 "mov %c[r9](%0),  %%r9  \n\t"
1310                 "mov %c[r10](%0), %%r10 \n\t"
1311                 "mov %c[r11](%0), %%r11 \n\t"
1312                 "mov %c[r12](%0), %%r12 \n\t"
1313                 "mov %c[r13](%0), %%r13 \n\t"
1314                 "mov %c[r14](%0), %%r14 \n\t"
1315                 "mov %c[r15](%0), %%r15 \n\t"
1316                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1317
1318                 /* Enter guest mode */
1319                 "jne .Llaunched \n\t"
1320                 ASM_VMX_VMLAUNCH "\n\t"
1321                 "jmp .Lkvm_vmx_return \n\t"
1322                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1323                 ".Lkvm_vmx_return: "
1324                 /* Save guest registers, load host registers, keep flags */
1325                 "mov %0, %c[wordsize](%%rsp) \n\t"
1326                 "pop %0 \n\t"
1327                 "mov %%rax, %c[rax](%0) \n\t"
1328                 "mov %%rbx, %c[rbx](%0) \n\t"
1329                 "popq %c[rcx](%0) \n\t"
1330                 "mov %%rdx, %c[rdx](%0) \n\t"
1331                 "mov %%rsi, %c[rsi](%0) \n\t"
1332                 "mov %%rdi, %c[rdi](%0) \n\t"
1333                 "mov %%rbp, %c[rbp](%0) \n\t"
1334                 "mov %%r8,  %c[r8](%0) \n\t"
1335                 "mov %%r9,  %c[r9](%0) \n\t"
1336                 "mov %%r10, %c[r10](%0) \n\t"
1337                 "mov %%r11, %c[r11](%0) \n\t"
1338                 "mov %%r12, %c[r12](%0) \n\t"
1339                 "mov %%r13, %c[r13](%0) \n\t"
1340                 "mov %%r14, %c[r14](%0) \n\t"
1341                 "mov %%r15, %c[r15](%0) \n\t"
1342                 "mov %%rax, %%r10 \n\t"
1343                 "mov %%rdx, %%r11 \n\t"
1344
1345                 "mov %%cr2, %%rax   \n\t"
1346                 "mov %%rax, %c[cr2](%0) \n\t"
1347
1348                 "pop  %%rbp; pop  %%rdx \n\t"
1349                 "setbe %c[fail](%0) \n\t"
1350
1351                 "mov $" /*__stringify(GD_UD) */"16"", %%rax \n\t"
1352                 "mov %%rax, %%ds \n\t"
1353                 "mov %%rax, %%es \n\t"
1354               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1355                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1356                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1357                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1358                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1359                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1360                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1361                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1362                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1363                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1364                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1365                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1366                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1367                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1368                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1369                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1370                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1371                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1372                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1373                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1374                 [wordsize]"i"(sizeof(unsigned long))
1375               : "cc", "memory"
1376                 , "rax", "rbx", "rdi", "rsi"
1377                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1378         );
1379
1380         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1381         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1382         printk("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1383                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1384         /* FIXME: do we need to set up other flags? */
1385         vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1386                       X86_EFLAGS_IF | 0x2;
1387         //monitor(NULL);
1388
1389         vcpu->regs.tf_cs = GD_UT;
1390         vcpu->regs.tf_ss = GD_UD;
1391
1392         vcpu->launched = 1;
1393
1394         if (vcpu->fail) {
1395                 printk("failure detected (err %x)\n",
1396                        vmcs_read32(VM_INSTRUCTION_ERROR));
1397                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1398         }
1399
1400         return vmcs_read32(VM_EXIT_REASON);
1401
1402 #if 0
1403         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1404         vmx_complete_atomic_exit(vmx);
1405         vmx_recover_nmi_blocking(vmx);
1406         vmx_complete_interrupts(vmx);
1407 #endif
1408 }
1409
1410 static void vmx_step_instruction(void)
1411 {
1412         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1413                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1414 }
1415
1416 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
1417 {
1418         unsigned long gva, gpa;
1419         int exit_qual, ret = -1;
1420         page_t *page;
1421
1422         vmx_get_cpu(vcpu);
1423         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1424         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1425         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1426         printk("ept: gva %016lx, gpa %016lx\n", gva, gpa);
1427
1428         vmx_put_cpu(vcpu);
1429
1430         // this is a total hack, for testing things.
1431         // note that we only care about the gpa, and the
1432         // gpa is our process virtual address. 
1433         // Confused yet?
1434         page = page_lookup(current->env_pgdir, (void *)gpa, NULL);
1435         printk("Lookup %p returns %p\n", gpa, page);
1436         if (page) {
1437                 uint64_t hpa = page2pa(page);
1438                 printk("hpa for %p is %p\n", gpa, hpa);
1439                 ret = vmx_do_ept_fault(ept, gpa, hpa, exit_qual);
1440                 printk("vmx_do_ept_fault returns %d\n", ret);
1441         }
1442
1443         if (ret) {
1444                 printk("page fault failure "
1445                        "GPA: 0x%lx, GVA: 0x%lx\n",
1446                        gpa, gva);
1447                 vmx_dump_cpu(vcpu);
1448         }
1449
1450         return ret;
1451 }
1452
1453 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
1454 {
1455         unsigned int eax, ebx, ecx, edx;
1456
1457         eax = vcpu->regs.tf_rax;
1458         ecx = vcpu->regs.tf_rcx;
1459         cpuid(eax, ecx, &eax, &ebx, &ecx, &edx);
1460         vcpu->regs.tf_rax = eax;
1461         vcpu->regs.tf_rbx = ebx;
1462         vcpu->regs.tf_rcx = ecx;
1463         vcpu->regs.tf_rdx = edx;
1464 }
1465
1466 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
1467 {
1468         uint32_t intr_info;
1469
1470         vmx_get_cpu(vcpu);
1471         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1472         vmx_put_cpu(vcpu);
1473
1474         printk("vmx (VPID %d): got an exception\n", vcpu->vpid);
1475         printk("vmx (VPID %d): pid %d\n", vcpu->vpid,
1476                          current->pid);
1477         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1478                 return 0;
1479         }
1480
1481         printk("unhandled nmi, intr_info %x\n", intr_info);
1482         return -EIO;
1483 }
1484
1485
1486 static void noop(void) {
1487         __asm__ __volatile__ ("1: jmp 1b");
1488 }
1489
1490 static void fail(void) {
1491         __asm__ __volatile__ ("movq $0xdeadbeef, %rbx; movq 0, %rax");
1492 }
1493
1494 static unsigned long stack[512];
1495 /**
1496  * vmx_launch - the main loop for a VMX Dune process
1497  * @conf: the launch configuration
1498  */
1499 int vmx_launch(struct dune_config *conf)
1500 {
1501         int ret;
1502         struct dune_struct dune;
1503         struct vmx_vcpu *vcpu;
1504         int i = 0;
1505         unsigned long rip = conf->rip;
1506         unsigned long rsp = conf->rsp;
1507         unsigned long cr3 = conf->cr3;
1508         int errors = 0;
1509
1510         if (conf->rip < 4096 ) {
1511                 // testing.
1512                 switch(conf->rip) {
1513                 default:
1514                         rip = (uint64_t)noop + 4;
1515                         break;
1516                 case 1:
1517                         rip = (uint64_t)fail + 4;
1518                         break;
1519                 }
1520         }
1521
1522         if (conf->cr3 == 0) {
1523                 cr3 = rcr3();
1524         }
1525
1526         /* sanity checking.  -- later
1527         ret = ept_check_page(ept, rip);
1528         if (ret) {
1529                 printk("0x%x is not mapped in the ept!\n", rip);
1530                 errors++;
1531         }
1532         ret = ept_check_page(ept, rsp);
1533         if (ret) {
1534                 printk("0x%x is not mapped in the ept!\n", rsp);
1535                 errors++;
1536         }
1537         */
1538         if (errors) {
1539                 return -EINVAL;
1540         }
1541
1542
1543         printk("RUNNING: %s: rip %p rsp %p cr3 %p \n",
1544                __func__, rip, rsp, cr3);
1545         vcpu = vmx_create_vcpu();
1546         if (!vcpu) {
1547                 return -ENOMEM;
1548         }
1549
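        /* vmx_get_cpu() makes this vcpu's VMCS current on this core, so the
         * vmcs_writel() calls below apply to this guest; vmx_put_cpu()
         * releases it again. */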
1550         vmx_get_cpu(vcpu);
1551         vmcs_writel(GUEST_RIP, rip);
1552         vmcs_writel(GUEST_RSP, rsp);
1553         vmcs_writel(GUEST_CR3, cr3);
1554         vmx_put_cpu(vcpu);
1555
1556         printk("created VCPU (VPID %d): pid %d\n",
1557                vcpu->vpid, current->pid);
1558
1559         vcpu->ret_code = -1;
1560
1561         if (current->virtinfo)
1562                 printk("vmx_launch: current->virtinfo is NOT NULL (%p)\n", current->virtinfo);
1563         //WARN_ON(current->virtinfo != NULL);
1564         dune.vcpu = vcpu;
1565
1566         current->virtinfo = &dune;
1567
1568         while (1) {
1569                 vmx_get_cpu(vcpu);
1570
1571                 // TODO: manage the fpu when we restart.
1572
1573                 // TODO: see if we need to exit before we go much further.
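                /* Run the guest with host interrupts disabled; an external
                 * interrupt that arrives while the guest runs forces a VM exit
                 * (EXIT_REASON_EXTERNAL_INTERRUPT) and brings us back here,
                 * where we re-enable interrupts before handling the exit. */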
1574                 disable_irq();
1575                 ret = vmx_run_vcpu(vcpu);
1576                 enable_irq();
1577
1578                 if (ret == EXIT_REASON_VMCALL ||
1579                     ret == EXIT_REASON_CPUID) {
1580                         vmx_step_instruction();
1581                 }
1582
1583                 vmx_put_cpu(vcpu);
1584
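                /* Dispatch on the exit reason.  VMCALL and CPUID already had
                 * the guest RIP advanced above; handlers that fail set
                 * vcpu->shutdown, which ends the run loop below. */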
1585                 if (ret == EXIT_REASON_VMCALL) {
1586                         printk("system call! WTF\n");
1587                 } else if (ret == EXIT_REASON_CPUID)
1588                         vmx_handle_cpuid(vcpu);
1589                 else if (ret == EXIT_REASON_EPT_VIOLATION) {
1590                         if (vmx_handle_ept_violation(vcpu))
1591                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1592                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1593                         if (vmx_handle_nmi_exception(vcpu))
1594                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1595                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1596                         printk("External interrupt\n");
1597                 } else {
1598                         printk("unhandled exit: reason %x, exit qualification %x\n",
1599                                ret, vmcs_read32(EXIT_QUALIFICATION));
1600                         vmx_dump_cpu(vcpu);
1601                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1602                 }
1603
1604                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1605                  * similar to how proc_restartcore/smp_idle only restart the pcpui
1606                  * cur_ctx, we need to do the same, via the VMCS resume business. */
1607
1608                 if (vcpu->shutdown)
1609                         break;
1610         }
1611
1612         printk("RETURN. ip %016lx sp %016lx\n",
1613                 vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1614         monitor(NULL);
1615         current->virtinfo = NULL;
1616
1617         /*
1618          * Return both the reason for the shutdown and a status value.
1619          * The exit() and exit_group() system calls only need 8 bits for
1620          * the status but we allow 16 bits in case we might want to
1621          * return more information for one of the other shutdown reasons.
1622          */
1623         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
1624
1625         printk("destroying VCPU (VPID %d): pid %d\n",
1626                vcpu->vpid, current->pid);
1627
1628         vmx_destroy_vcpu(vcpu);
1629
1630         return ret;
1631 }
1632
1633 /**
1634  * __vmx_enable - low-level enable of VMX mode on the current CPU
1635  * @vmxon_buf: an opaque buffer for use as the VMXON region
1636  */
1637 static int __vmx_enable(struct vmcs *vmxon_buf)
1638 {
1639         uint64_t phys_addr = PADDR(vmxon_buf);
1640         uint64_t old, test_bits;
1641
1642         if (rcr4() & X86_CR4_VMXE) {
1643                 panic("CR4.VMXE is already set; this should never happen");
1644                 return -EBUSY;
1645         }
1646
1647         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1648
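        /* IA32_FEATURE_CONTROL must have both the lock bit and the
         * "VMXON outside SMX" enable bit set before VMXON will work.  If the
         * BIOS already locked the MSR without enabling VMX, there is nothing
         * we can do; otherwise we set the bits ourselves and lock the MSR. */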
1649         test_bits = FEATURE_CONTROL_LOCKED;
1650         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1651
1652         if (0) // tboot_enabled())
1653                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1654
1655         if ((old & test_bits) != test_bits) {
1656                 /* If it's locked, then trying to set it will cause a GPF.
1657                  * No Dune for you!
1658                  */
1659                 if (old & FEATURE_CONTROL_LOCKED) {
1660                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1661                         return -1;
1662                 }
1663
1664                 /* enable and lock */
1665                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1666         }
1667         lcr4(rcr4() | X86_CR4_VMXE);
1668
1669         __vmxon(phys_addr);
1670         vpid_sync_vcpu_global();
1671         ept_sync_global();
1672
1673         return 0;
1674 }
1675
1676 /**
1677  * vmx_enable - enables VMX mode on the current CPU
1678  * Called on each core from intel_vmm_pcpu_init().
1679  *
1680  * Sets up the necessary state for enable (e.g., a scratchpad for VMXON).
1681  */
1682 static void vmx_enable(void)
1683 {
1684         struct vmcs *vmxon_buf = currentcpu->vmxarea;
1685         int ret;
1686
1687         ret = __vmx_enable(vmxon_buf);
1688         if (ret)
1689                 goto failed;
1690
1691         currentcpu->vmx_enabled = 1;
1692         // TODO: do we need this?
1693         store_gdt(&currentcpu->host_gdt);
1694
1695         printk("VMX enabled on CPU %d\n", core_id());
1696         return;
1697
1698 failed:
1699         has_vmx = FALSE;
1700         printk("failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1701 }
1702
1703 /**
1704  * vmx_disable - disables VMX mode on the current CPU
1705  */
1706 static void vmx_disable(void *unused)
1707 {
1708         if (currentcpu->vmx_enabled) {
1709                 __vmxoff();
1710                 lcr4(rcr4() & ~X86_CR4_VMXE);
1711                 currentcpu->vmx_enabled = 0;
1712         }
1713 }
1714
1715 /* Probe the cpu to see whether it can do vmx.
1716  * Returns TRUE if VT-x is supported, FALSE otherwise.
1717  */
1718 static bool probe_cpu_vmx(void)
1719 {
1720         /* The best way to test this code is:
1721          * wrmsr -p <cpu> 0x3a 1
1722          * This will lock vmx off; then modprobe dune.
1723          * Frequently, however, systems have all 0x3a registers set to 5,
1724          * meaning testing is impossible, as vmx cannot be disabled.
1725          * We have to simulate it being unavailable in most cases.
1726          * The 'test' variable provides an easy way to simulate
1727          * unavailability of vmx on some, none, or all cpus.
1728          */
1729         if (!cpu_has_vmx()) {
1730                 printk("Machine does not support VT-x\n");
1731                 return FALSE;
1732         } else {
1733                 printk("Machine supports VT-x\n");
1734                 return TRUE;
1735         }
1736 }
1737
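/* Allocate this core's VMXON region and stash it in per-cpu state so that
 * vmx_enable() can hand it to VMXON later.  Runs once per core from
 * intel_vmm_pcpu_init(). */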
1738 static void setup_vmxarea(void)
1739 {
1740         struct vmcs *vmxon_buf;
1741         printd("Set up vmxarea for cpu %d\n", core_id());
1742         vmxon_buf = __vmx_alloc_vmcs(node_id());
1743         if (!vmxon_buf) {
1744                 printk("setup_vmxarea failed on core %d\n", core_id());
1745                 return;
1746         }
1747         currentcpu->vmxarea = vmxon_buf;
1748 }
1749
1750 /**
1751  * intel_vmm_init sets up the physical core data areas that are required to run a VM at all.
1752  * These data areas are not connected to a specific user process in any way. Instead,
1753  * they are in some sense externalizing what would otherwise be a very large ball of
1754  * state that would be inside the CPU.
1755  */
1756 int intel_vmm_init(void)
1757 {
1758         int ret;
1759
1760         if (!probe_cpu_vmx()) {
1761                 return -EOPNOTSUPP;
1762         }
1763
1764         setup_vmcs_config(&ret);
1765
1766         if (ret) {
1767                 printk("setup_vmcs_config failed: %d\n", ret);
1768                 return ret;
1769         }
1770
1771         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
1772         if (!msr_bitmap) {
1773                 printk("Could not allocate msr_bitmap\n");
1774                 return -ENOMEM;
1775         }
1776         /* FIXME: do we need APIC virtualization (flexpriority?) */
1777
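        /* In the MSR bitmap a set bit means "intercept accesses to this MSR",
         * so start with everything intercepted and then punch holes for the
         * MSRs the guest may touch directly (currently only FS and GS base). */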
1778         memset(msr_bitmap, 0xff, PAGE_SIZE);
1779         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
1780         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
1781
1782         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
1783
1784         /* TEMPORARY hack so we can do some basic VM testing. Create an ept
1785          * and look for faults on it. */
1786         ept = kpage_zalloc_addr();
1787         eptp = construct_eptp(PADDR(ept));
1788         printk("ept is %p and eptp is %p\n", ept, eptp);
1789         return ret;
1790 }
1791
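/* Per-core counterpart of intel_vmm_init(): allocates this core's VMXON region
 * and switches the core into VMX root operation. */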
1792 int intel_vmm_pcpu_init(void)
1793 {
1794         setup_vmxarea();
1795         vmx_enable();
1796         return 0;
1797 }