VMM: EPT checks, init, and basic usage
kern/arch/x86/vmm/intel/vmx.c
1 /**
2  *  vmx.c - The Intel VT-x driver for Dune
3  *
4  * This file is derived from Linux KVM VT-x support.
5  * Copyright (C) 2006 Qumranet, Inc.
6  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7  *
8  * Original Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This modified version is simpler because it avoids the following
13  * features that are not requirements for Dune:
14  *  * Real-mode emulation
15  *  * Nested VT-x support
16  *  * I/O hardware emulation
17  *  * Any of the more esoteric X86 features and registers
18  *  * KVM-specific functionality
19  *
20  * In essence we provide only the minimum functionality needed to run
21  * a process in vmx non-root mode rather than the full hardware emulation
22  * needed to support an entire OS.
23  *
24  * This driver is a research prototype and as such has the following
25  * limitations:
26  *
 27  * FIXME: Backward compatibility is currently a non-goal, and only recent
28  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
29  * driver.
30  *
 31  * FIXME: Eventually we should handle concurrent users of VT-x more
32  * gracefully instead of requiring exclusive access. This would allow
33  * Dune to interoperate with KVM and other HV solutions.
34  *
35  * FIXME: We need to support hotplugged physical CPUs.
36  *
37  * Authors:
38  *   Adam Belay   <abelay@stanford.edu>
39  */
40
41 /* Basic flow.
42  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
 43  * You're left with the feeling that they got part way through and realized they had to have one for each of two purposes:
44  *
45  * 1) your CPU is going to be capable of running VMs, and you need state for that.
46  *
47  * 2) you're about to start a guest, and you need state for that.
48  *
 49  * So there is the "get the cpu set up to be able to run VMs" stuff, and the
 50  * "now let's start a guest" stuff.  In Akaros, CPUs will always be set up
51  * to run a VM if that is possible. Processes can flip themselves into
52  * a VM and that will require another VMCS.
53  *
54  * So: at kernel startup time, the SMP boot stuff calls
55  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
 56  * in the case of this file is intel_vmm_init. That runs some code
57  * that sets up stuff for ALL sockets, based on the capabilities of
58  * the socket it runs on. If any cpu supports vmx, it assumes they all
59  * do. That's a realistic assumption. So the call_function_all is kind
60  * of stupid, really; it could just see what's on the current cpu and
 61  * assume it's on all. HOWEVER: there are systems in the wild that
 62  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
 63  * might as well allow for the chance that we'll only allow VMMCPs on a
 64  * subset (not implemented yet, however).  So: probe all CPUs, get a
65  * count of how many support VMX and, for now, assume they all do
66  * anyway.
67  *
68  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
69  * which contains all the naughty bits settings for all the cpus that can run a VM.
70  * Realistically, all VMX-capable cpus in a system will have identical configurations.
71  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
72  *
73  * configure the msr_bitmap. This is the bitmap of MSRs which the
74  * guest can manipulate.  Currently, we only allow GS and FS base.
75  *
76  * Reserve bit 0 in the vpid bitmap as guests can not use that
77  *
 78  * Set up what we call the vmxarea. The vmxarea is per-cpu, not
 79  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
 80  * there is the revision id. The vmxarea is page-sized per cpu and
 81  * page-aligned. Note that it can be smaller, but why bother? We know
 82  * the max size and alignment, and it's convenient.
83  *
84  * Now that it is set up, enable vmx on all cpus. This involves
85  * testing VMXE in cr4, to see if we've been here before (TODO: delete
86  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
 87  * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
88  * instruction), and syncing vpid's and ept's.  Now the CPU is ready
89  * to host guests.
90  *
91  * Setting up a guest.
92  * We divide this into two things: vmm_proc_init and vm_run.
93  * Currently, on Intel, vmm_proc_init does nothing.
94  *
95  * vm_run is really complicated. It is called with a coreid, rip, rsp,
96  * cr3, and flags.  On intel, it calls vmx_launch. vmx_launch is set
97  * up for a few test cases. If rip is 1, it sets the guest rip to
 98  * a function which will deref 0 and should exit with a failure. If rip is 0,
99  * it calls an infinite loop in the guest.
100  *
101  * The sequence of operations:
102  * create a vcpu
103  * while (1) {
104  * get a vcpu
105  * disable irqs (required or you can't enter the VM)
106  * vmx_run_vcpu()
107  * enable irqs
108  * manage the vm exit
109  * }
110  *
111  * get a vcpu
 112  * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
113  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
114  *
115  * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
116  * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
 117  * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
118  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
119  *
 120  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
121  * of inline assembly with embedded CPP crap. I suspect we'll want to
122  * un-inline it someday, but maybe not.  It's called with a vcpu
123  * struct from which it loads guest state, and to which it stores
124  * non-virtualized host state. It issues a vmlaunch or vmresume
 125  * instruction as appropriate, and on return, it evaluates whether the
 126  * launch/resume itself had an error. Note this is NOT the
 127  * same as an error while in the virtual machine; this is an error in
 128  * startup due to misconfiguration. Depending on what is returned it's
 129  * either a failed vm startup or an exit for any of many reasons.
130  *
131  */
132
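/* A rough sketch of how the pieces below are meant to be driven.  The
 * dune_config fields and the vcpu lookup are taken from vmx_launch() below;
 * how the vcpu actually ends up in the proc's guest_pcores is not shown in
 * this file and is assumed here:
 *
 *	// boot: vmm_init() -> intel_vmm_init(): setup_vmcs_config(), the
 *	// msr_bitmap, the per-cpu vmxarea, then vmxon on every core.
 *
 *	struct vmx_vcpu *vcpu = vmx_create_vcpu(p);	// per-guest VMCS + vpid
 *	// ... vcpu stashed so current->vmm.guest_pcores[0] finds it ...
 *	struct dune_config conf = {.rip = entry, .rsp = stack_top, .cr3 = 0};
 *	vmx_launch(&conf);	// get cpu, run vcpu, handle the exit, repeat
 */
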
133 /* basically: only rename those globals that might conflict
134  * with existing names. Leave all else the same.
135  * this code is more modern than the other code, yet still
136  * well encapsulated, it seems.
137  */
138 #include <kmalloc.h>
139 #include <string.h>
140 #include <stdio.h>
141 #include <assert.h>
142 #include <error.h>
143 #include <pmap.h>
144 #include <sys/queue.h>
145 #include <smp.h>
146 #include <kref.h>
147 #include <atomic.h>
148 #include <alarm.h>
149 #include <event.h>
150 #include <umem.h>
151 #include <bitops.h>
152 #include <arch/types.h>
153 #include <syscall.h>
154
155 #include "vmx.h"
156 #include "../vmm.h"
157
158 #include "compat.h"
159 #include "cpufeature.h"
160
161 #define currentcpu (&per_cpu_info[core_id()])
162
163 /*
164  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
165  * away by decrementing the array size.
166  */
167 static const uint32_t vmx_msr_index[] = {
168 #ifdef CONFIG_X86_64
169         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
170 #endif
171         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
172 };
173 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
174
175 static DECLARE_BITMAP(vmx_vpid_bitmap, /*VMX_NR_VPIDS*/ 65536);
176 static spinlock_t vmx_vpid_lock;
177
178 static unsigned long *msr_bitmap;
179
180 static struct vmcs_config {
181         int size;
182         int order;
183         uint32_t revision_id;
184         uint32_t pin_based_exec_ctrl;
185         uint32_t cpu_based_exec_ctrl;
186         uint32_t cpu_based_2nd_exec_ctrl;
187         uint32_t vmexit_ctrl;
188         uint32_t vmentry_ctrl;
189 } vmcs_config;
190
191 struct vmx_capability vmx_capability;
192
193 static inline bool cpu_has_secondary_exec_ctrls(void)
194 {
195         return vmcs_config.cpu_based_exec_ctrl &
196                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
197 }
198
199 static inline bool cpu_has_vmx_vpid(void)
200 {
201         return vmcs_config.cpu_based_2nd_exec_ctrl &
202                 SECONDARY_EXEC_ENABLE_VPID;
203 }
204
205 static inline bool cpu_has_vmx_invpcid(void)
206 {
207         return vmcs_config.cpu_based_2nd_exec_ctrl &
208                 SECONDARY_EXEC_ENABLE_INVPCID;
209 }
210
211 static inline bool cpu_has_vmx_invvpid_single(void)
212 {
213         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
214 }
215
216 static inline bool cpu_has_vmx_invvpid_global(void)
217 {
218         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
219 }
220
221 static inline bool cpu_has_vmx_ept(void)
222 {
223         return vmcs_config.cpu_based_2nd_exec_ctrl &
224                 SECONDARY_EXEC_ENABLE_EPT;
225 }
226
227 static inline bool cpu_has_vmx_invept(void)
228 {
229         return vmx_capability.ept & VMX_EPT_INVEPT_BIT;
230 }
231
232 /* the SDM (2015-01) doesn't mention this ability (still?) */
233 static inline bool cpu_has_vmx_invept_individual_addr(void)
234 {
235         return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
236 }
237
238 static inline bool cpu_has_vmx_invept_context(void)
239 {
240         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
241 }
242
243 static inline bool cpu_has_vmx_invept_global(void)
244 {
245         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
246 }
247
248 static inline bool cpu_has_vmx_ept_ad_bits(void)
249 {
250         return vmx_capability.ept & VMX_EPT_AD_BIT;
251 }
252
253 static inline bool cpu_has_vmx_ept_execute_only(void)
254 {
255         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
256 }
257
258 static inline bool cpu_has_vmx_eptp_uncacheable(void)
259 {
260         return vmx_capability.ept & VMX_EPTP_UC_BIT;
261 }
262
263 static inline bool cpu_has_vmx_eptp_writeback(void)
264 {
265         return vmx_capability.ept & VMX_EPTP_WB_BIT;
266 }
267
268 static inline bool cpu_has_vmx_ept_2m_page(void)
269 {
270         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
271 }
272
273 static inline bool cpu_has_vmx_ept_1g_page(void)
274 {
275         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
276 }
277
278 static inline bool cpu_has_vmx_ept_4levels(void)
279 {
280         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
281 }
282
283 static inline void __invept(int ext, uint64_t eptp, gpa_t gpa)
284 {
285         struct {
286                 uint64_t eptp, gpa;
287         } operand = {eptp, gpa};
288
289         asm volatile (ASM_VMX_INVEPT
290                         /* CF==1 or ZF==1 --> rc = -1 */
291                         "; ja 1f ; ud2 ; 1:\n"
292                         : : "a" (&operand), "c" (ext) : "cc", "memory");
293 }
294
295 /* We assert support for the global flush during ept_init() */
296 static inline void ept_sync_global(void)
297 {
298         __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
299 }
300
301 static inline void ept_sync_context(uint64_t eptp)
302 {
303         if (cpu_has_vmx_invept_context())
304                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
305         else
306                 ept_sync_global();
307 }
308
309 void ept_flush(uint64_t eptp)
310 {
311         ept_sync_context(eptp);
312 }
313
314 static inline void ept_sync_individual_addr(uint64_t eptp, gpa_t gpa)
315 {
316         if (cpu_has_vmx_invept_individual_addr())
317                 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
318                                 eptp, gpa);
319         else
320                 ept_sync_context(eptp);
321 }
322
323 static inline void __vmxon(uint64_t addr)
324 {
325         asm volatile (ASM_VMX_VMXON_RAX
326                         : : "a"(&addr), "m"(addr)
327                         : "memory", "cc");
328 }
329
330 static inline void __vmxoff(void)
331 {
332         asm volatile (ASM_VMX_VMXOFF : : : "cc");
333 }
334
335 static inline void __invvpid(int ext, uint16_t vpid, gva_t gva)
336 {
337         struct {
338                 uint64_t vpid : 16;
339                 uint64_t rsvd : 48;
340                 uint64_t gva;
341         } operand = { vpid, 0, gva };
342
343         asm volatile (ASM_VMX_INVVPID
344                       /* CF==1 or ZF==1 --> rc = -1 */
345                       "; ja 1f ; ud2 ; 1:"
346                       : : "a"(&operand), "c"(ext) : "cc", "memory");
347 }
348
349 static inline void vpid_sync_vcpu_single(uint16_t vpid)
350 {
351         if (vpid == 0) {
352                 return;
353         }
354
355         if (cpu_has_vmx_invvpid_single())
356                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
357 }
358
359 static inline void vpid_sync_vcpu_global(void)
360 {
361         if (cpu_has_vmx_invvpid_global())
362                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
363 }
364
365 static inline void vpid_sync_context(uint16_t vpid)
366 {
367         if (cpu_has_vmx_invvpid_single())
368                 vpid_sync_vcpu_single(vpid);
369         else
370                 vpid_sync_vcpu_global();
371 }
372
373 static inline uint64_t vcpu_get_eptp(struct vmx_vcpu *vcpu)
374 {
375         return vcpu->proc->env_pgdir.eptp;
376 }
377
378 static void vmcs_clear(struct vmcs *vmcs)
379 {
380         uint64_t phys_addr = PADDR(vmcs);
381         uint8_t error;
382
383         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
384                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
385                       : "cc", "memory");
386         if (error)
387                 printk("vmclear fail: %p/%llx\n",
388                        vmcs, phys_addr);
389 }
390
391 static void vmcs_load(struct vmcs *vmcs)
392 {
393         uint64_t phys_addr = PADDR(vmcs);
394         uint8_t error;
395
396         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
397                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
398                         : "cc", "memory");
399         if (error)
400                 printk("vmptrld %p/%llx failed\n",
401                        vmcs, phys_addr);
402 }
403
404 /* Returns the paddr of the current CPU's VMCS region, or -1 if none. */
405 static physaddr_t vmcs_get_current(void)
406 {
407         physaddr_t vmcs_paddr;
408         /* RAX contains the addr of the location to store the VMCS pointer.  The
409          * compiler doesn't know the ASM will deref that pointer, hence the =m */
410         asm volatile (ASM_VMX_VMPTRST_RAX : "=m"(vmcs_paddr) : "a"(&vmcs_paddr));
411         return vmcs_paddr;
412 }
413
414 __always_inline unsigned long vmcs_readl(unsigned long field)
415 {
416         unsigned long value;
417
418         asm volatile (ASM_VMX_VMREAD_RDX_RAX
419                       : "=a"(value) : "d"(field) : "cc");
420         return value;
421 }
422
423 __always_inline uint16_t vmcs_read16(unsigned long field)
424 {
425         return vmcs_readl(field);
426 }
427
428 static __always_inline uint32_t vmcs_read32(unsigned long field)
429 {
430         return vmcs_readl(field);
431 }
432
433 static __always_inline uint64_t vmcs_read64(unsigned long field)
434 {
435 #ifdef CONFIG_X86_64
436         return vmcs_readl(field);
437 #else
438         return vmcs_readl(field) | ((uint64_t)vmcs_readl(field+1) << 32);
439 #endif
440 }
441
442 void vmwrite_error(unsigned long field, unsigned long value)
443 {
444         printk("vmwrite error: reg %lx value %lx (err %d)\n",
445                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
446 }
447
448 void vmcs_writel(unsigned long field, unsigned long value)
449 {
450         uint8_t error;
451
452         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
453                        : "=q"(error) : "a"(value), "d"(field) : "cc");
454         if (error)
455                 vmwrite_error(field, value);
456 }
457
458 static void vmcs_write16(unsigned long field, uint16_t value)
459 {
460         vmcs_writel(field, value);
461 }
462
463 static void vmcs_write32(unsigned long field, uint32_t value)
464 {
465         vmcs_writel(field, value);
466 }
467
468 static void vmcs_write64(unsigned long field, uint64_t value)
469 {
470         vmcs_writel(field, value);
471 }
472
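/* Reads the given VMX capability MSR and computes the final control value:
 * bits that must be 1 are forced on, and the requested (ctl_min | ctl_opt)
 * bits are kept only if the hardware allows them.  Returns -EIO if any
 * required (ctl_min) bit is not supported. */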
473 static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt,
474                                       uint32_t msr, uint32_t *result)
475 {
476         uint32_t vmx_msr_low, vmx_msr_high;
477         uint32_t ctl = ctl_min | ctl_opt;
478         uint64_t vmx_msr = read_msr(msr);
479         vmx_msr_low = vmx_msr;
480         vmx_msr_high = vmx_msr>>32;
481
482         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
483         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
484
485         /* Ensure minimum (required) set of control bits are supported. */
486         if (ctl_min & ~ctl) {
487                 return -EIO;
488         }
489
490         *result = ctl;
491         return 0;
492 }
493
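/* Returns true if the capability MSR allows the given control bit to be 1,
 * i.e. the bit is set in the MSR's high dword of allowed-1 settings. */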
494 static  bool allow_1_setting(uint32_t msr, uint32_t ctl)
495 {
496         uint32_t vmx_msr_low, vmx_msr_high;
497
498         rdmsr(msr, vmx_msr_low, vmx_msr_high);
499         return vmx_msr_high & ctl;
500 }
501
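/* Fills the global vmcs_config and vmx_capability from the VMX capability
 * MSRs; intended to be run on a core via an smp call (see the flow comment
 * at the top of this file).  *p is an int return slot: it is set to -EIO up
 * front and to 0 only if every required feature is present. */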
502 static  void setup_vmcs_config(void *p)
503 {
504         int *ret = p;
505         struct vmcs_config *vmcs_conf = &vmcs_config;
506         uint32_t vmx_msr_low, vmx_msr_high;
507         uint32_t min, opt, min2, opt2;
508         uint32_t _pin_based_exec_control = 0;
509         uint32_t _cpu_based_exec_control = 0;
510         uint32_t _cpu_based_2nd_exec_control = 0;
511         uint32_t _vmexit_control = 0;
512         uint32_t _vmentry_control = 0;
513
514         *ret = -EIO;
515         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
516         opt = PIN_BASED_VIRTUAL_NMIS;
517         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
518                                 &_pin_based_exec_control) < 0) {
519                 return;
520         }
521
522         min =
523               CPU_BASED_CR8_LOAD_EXITING |
524               CPU_BASED_CR8_STORE_EXITING |
525               CPU_BASED_CR3_LOAD_EXITING |
526               CPU_BASED_CR3_STORE_EXITING |
527               CPU_BASED_MOV_DR_EXITING |
528               CPU_BASED_USE_TSC_OFFSETING |
529               CPU_BASED_MWAIT_EXITING |
530               CPU_BASED_MONITOR_EXITING |
531               CPU_BASED_INVLPG_EXITING;
532
533         min |= CPU_BASED_HLT_EXITING;
534
535         opt = CPU_BASED_TPR_SHADOW |
536               CPU_BASED_USE_MSR_BITMAPS |
537               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
538         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
539                                 &_cpu_based_exec_control) < 0) {
540                 return;
541         }
542
543         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
544                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
545                                            ~CPU_BASED_CR8_STORE_EXITING;
546
547         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
548                 min2 = 
549                         SECONDARY_EXEC_ENABLE_VPID |
550                         SECONDARY_EXEC_ENABLE_EPT |
551                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
552                 opt2 =  SECONDARY_EXEC_WBINVD_EXITING |
553                         SECONDARY_EXEC_RDTSCP |
554                         SECONDARY_EXEC_ENABLE_INVPCID;
555                 if (adjust_vmx_controls(min2, opt2,
556                                         MSR_IA32_VMX_PROCBASED_CTLS2,
557                                         &_cpu_based_2nd_exec_control) < 0) {
558                         return;
559                 }
560         }
561
562         if (!(_cpu_based_2nd_exec_control &
563                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
564                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
565
566         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
567                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
568                    enabled */
569                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
570                                              CPU_BASED_CR3_STORE_EXITING |
571                                              CPU_BASED_INVLPG_EXITING);
572                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
573                       vmx_capability.ept, vmx_capability.vpid);
574         }
575
576         min = 0;
577
578         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
579
580 //      opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
581         opt = 0;
582         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
583                                 &_vmexit_control) < 0) {
584                 return;
585         }
586
587         min = 0;
588 //      opt = VM_ENTRY_LOAD_IA32_PAT;
589         opt = 0;
590         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
591                                 &_vmentry_control) < 0) {
592                 return;
593         }
594
595         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
596
597         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
598         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) {
599                 return;
600         }
601
602         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
603         if (vmx_msr_high & (1u<<16)) {
604                 printk("64-bit CPUs always have VMX_BASIC_MSR[48]==0. FAILS!\n");
605                 return;
606         }
607
608         /* Require Write-Back (WB) memory type for VMCS accesses. */
609         if (((vmx_msr_high >> 18) & 15) != 6) {
610                 printk("NO WB!\n");
611                 return;
612         }
613
614         vmcs_conf->size = vmx_msr_high & 0x1fff;
615         vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
616         vmcs_conf->revision_id = vmx_msr_low;
617         printk("vmcs_conf size %d order %d rev %d\n",
618                vmcs_conf->size, vmcs_conf->order,
619                vmcs_conf->revision_id);
620
621         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
622         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
623         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
624         vmcs_conf->vmexit_ctrl         = _vmexit_control;
625         vmcs_conf->vmentry_ctrl        = _vmentry_control;
626
627         vmx_capability.has_load_efer =
628                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
629                                 VM_ENTRY_LOAD_IA32_EFER)
630                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
631                                    VM_EXIT_LOAD_IA32_EFER);
632
633         /* Now that we've done all the setup we can do, verify
634          * that we have all the capabilities we need. These tests
635          * are done last presumably because all the work done above
636          * affects some of them.
637          */
638
639         if (!vmx_capability.has_load_efer) {
640                 printk("CPU lacks ability to load EFER register\n");
641                 return;
642         }
643
644         *ret = 0;
645 }
646
647 static struct vmcs *__vmx_alloc_vmcs(int node)
648 {
649         struct vmcs *vmcs;
650
651         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
652         if (!vmcs)
653                 return 0;
654         memset(vmcs, 0, vmcs_config.size);
655         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
656         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
657         return vmcs;
658 }
659
660 /**
661  * vmx_alloc_vmcs - allocates a VMCS region
662  *
663  * NOTE: Assumes the new region will be used by the current CPU.
664  *
665  * Returns a valid VMCS region.
666  */
667 static struct vmcs *vmx_alloc_vmcs(void)
668 {
669         return __vmx_alloc_vmcs(node_id());
670 }
671
672 /**
673  * vmx_free_vmcs - frees a VMCS region
674  */
675 static void vmx_free_vmcs(struct vmcs *vmcs)
676 {
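        /* FIXME: freeing is currently disabled, so VMCS regions are leaked
         * until the call below is re-enabled. */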
677   //free_pages((unsigned long)vmcs, vmcs_config.order);
678 }
679
680 /*
681  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
682  * will not change in the lifetime of the guest.
683  * Note that host-state that does change is set elsewhere. E.g., host-state
684  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
685  */
686 static void vmx_setup_constant_host_state(void)
687 {
688         uint32_t low32, high32;
689         unsigned long tmpl;
690         pseudodesc_t dt;
691
692         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);  /* 22.2.3 */
693         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
694         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
695
696         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
697         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
698         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
699         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
700         vmcs_write16(HOST_TR_SELECTOR, GD_TSS);  /* 22.2.4 */
701
702         native_store_idt(&dt);
703         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);   /* 22.2.4 */
704
705         asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
706         vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
707
708         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
709         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
710         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
711         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
712
713         rdmsr(MSR_EFER, low32, high32);
714         vmcs_write32(HOST_IA32_EFER, low32);
715
716         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
717                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
718                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
719         }
720
721         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
722         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
723
724         /* TODO: This (at least gs) is per cpu */
725         rdmsrl(MSR_FS_BASE, tmpl);
726         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
727         rdmsrl(MSR_GS_BASE, tmpl);
728         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
729 }
730
731 static inline uint16_t vmx_read_ldt(void)
732 {
733         uint16_t ldt;
734         asm("sldt %0" : "=g"(ldt));
735         return ldt;
736 }
737
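/* Returns the base address of the segment named by @selector, looking up the
 * descriptor in the current cpu's GDT (or in the LDT for LDT selectors). */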
738 static unsigned long segment_base(uint16_t selector)
739 {
740         pseudodesc_t *gdt = &currentcpu->host_gdt;
741         struct desc_struct *d;
742         unsigned long table_base;
743         unsigned long v;
744
745         if (!(selector & ~3)) {
746                 return 0;
747         }
748
749         table_base = gdt->pd_base;
750
751         if (selector & 4) {           /* from ldt */
752                 uint16_t ldt_selector = vmx_read_ldt();
753
754                 if (!(ldt_selector & ~3)) {
755                         return 0;
756                 }
757
758                 table_base = segment_base(ldt_selector);
759         }
760         d = (struct desc_struct *)(table_base + (selector & ~7));
761         v = get_desc_base(d);
762 #ifdef CONFIG_X86_64
763        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
764                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
765 #endif
766         return v;
767 }
768
769 static inline unsigned long vmx_read_tr_base(void)
770 {
771         uint16_t tr;
772         asm("str %0" : "=g"(tr));
773         return segment_base(tr);
774 }
775
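/* Writes the per-cpu pieces of host state (TR and GDT bases, sysenter ESP,
 * FS/GS bases) into the currently loaded VMCS.  Called from vmx_get_cpu()
 * after a vcpu moves to a new core. */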
776 static void __vmx_setup_cpu(void)
777 {
778         pseudodesc_t *gdt = &currentcpu->host_gdt;
779         unsigned long sysenter_esp;
780         unsigned long tmpl;
781
782         /*
783          * Linux uses per-cpu TSS and GDT, so set these when switching
784          * processors.
785          */
786         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
787         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);   /* 22.2.4 */
788
789         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
790         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
791
792         rdmsrl(MSR_FS_BASE, tmpl);
793         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
794         rdmsrl(MSR_GS_BASE, tmpl);
795         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
796 }
797
798 /**
799  * vmx_get_cpu - called before using a cpu
800  * @vcpu: VCPU that will be loaded.
801  *
802  * Disables preemption. Call vmx_put_cpu() when finished.
803  */
804 static void vmx_get_cpu(struct vmx_vcpu *vcpu)
805 {
806         int cur_cpu = core_id();
807         handler_wrapper_t *w;
808
809         if (currentcpu->local_vcpu)
810                 panic("get_cpu: currentcpu->local_vcpu was non-NULL");
811         if (currentcpu->local_vcpu != vcpu) {
812                 currentcpu->local_vcpu = vcpu;
813
814                 if (vcpu->cpu != cur_cpu) {
815                         if (vcpu->cpu >= 0) {
816                                 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
817                         } else
818                                 vmcs_clear(vcpu->vmcs);
819
820                         vpid_sync_context(vcpu->vpid);
821                         ept_sync_context(vcpu_get_eptp(vcpu));
822
823                         vcpu->launched = 0;
824                         vmcs_load(vcpu->vmcs);
825                         __vmx_setup_cpu();
826                         vcpu->cpu = cur_cpu;
827                 } else {
828                         vmcs_load(vcpu->vmcs);
829                 }
830         }
831 }
832
833 /**
834  * vmx_put_cpu - called after using a cpu
835  * @vcpu: VCPU that was loaded.
836  */
837 static void vmx_put_cpu(struct vmx_vcpu *vcpu)
838 {
839         if (core_id() != vcpu->cpu)
840                 panic("%s: core_id() %d != vcpu->cpu %d\n",
841                       __func__, core_id(), vcpu->cpu);
842
843         if (currentcpu->local_vcpu != vcpu)
844                 panic("vmx_put_cpu: asked to clear something not ours");
845
846
847         vpid_sync_context(vcpu->vpid);
848         ept_sync_context(vcpu_get_eptp(vcpu));
849         vmcs_clear(vcpu->vmcs);
850         vcpu->cpu = -1;
851         currentcpu->local_vcpu = NULL;
852         //put_cpu();
853 }
854
855 static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr)
856 {
857         struct vmx_vcpu *vcpu = ptr;
858
859         ept_sync_context(vcpu_get_eptp(vcpu));
860 }
861
862 struct sync_addr_args {
863         struct vmx_vcpu *vcpu;
864         gpa_t gpa;
865 };
866
867 static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr)
868 {
869         struct sync_addr_args *args = ptr;
870
871 //      ept_sync_individual_addr(
872
873 }
874
875 /**
876  * vmx_ept_sync_vcpu - used to evict everything in the EPT
877  * @vcpu: the vcpu
878  */
879 void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu)
880 {
881         handler_wrapper_t *w;
882
883         smp_call_function_single(vcpu->cpu,
884                 __vmx_sync_helper, (void *) vcpu, &w);
885
886         if (smp_call_wait(w)) {
887                 printk("%s: smp_call_wait failed. Expect a panic.\n", __func__);
888         }
889
890
891 }
892
893 /**
894  * vmx_ept_sync_individual_addr - used to evict an individual address
895  * @vcpu: the vcpu
896  * @gpa: the guest-physical address
897  */
898 void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa)
899 {
900         struct sync_addr_args args;
901         handler_wrapper_t *w;
902
903         args.vcpu = vcpu;
904         args.gpa = gpa;
905
907         smp_call_function_single(vcpu->cpu,
908                                  __vmx_sync_individual_addr_helper, (void *) &args, &w);
909
910         if (smp_call_wait(w)) {
911                 printk("%s: smp_call_wait failed. Expect a panic.\n", __func__);
912         }
913
914 }
915
916 /**
917  * vmx_dump_cpu - prints the CPU state
918  * @vcpu: VCPU to print
919  */
920 static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
921 {
922
923         unsigned long flags;
924
925         vmx_get_cpu(vcpu);
926         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
927         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
928         flags = vmcs_readl(GUEST_RFLAGS);
929         vmx_put_cpu(vcpu);
930
931         printk("--- Begin VCPU Dump ---\n");
932         printk("CPU %d VPID %d\n", vcpu->cpu, vcpu->vpid);
933         printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
934                vcpu->regs.tf_rip, flags);
935         printk("RAX 0x%016lx RCX 0x%016lx\n",
936                 vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
937         printk("RDX 0x%016lx RBX 0x%016lx\n",
938                 vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
939         printk("RSP 0x%016lx RBP 0x%016lx\n",
940                 vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
941         printk("RSI 0x%016lx RDI 0x%016lx\n",
942                 vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
943         printk("R8  0x%016lx R9  0x%016lx\n",
944                 vcpu->regs.tf_r8, vcpu->regs.tf_r9);
945         printk("R10 0x%016lx R11 0x%016lx\n",
946                 vcpu->regs.tf_r10, vcpu->regs.tf_r11);
947         printk("R12 0x%016lx R13 0x%016lx\n",
948                 vcpu->regs.tf_r12, vcpu->regs.tf_r13);
949         printk("R14 0x%016lx R15 0x%016lx\n",
950                 vcpu->regs.tf_r14, vcpu->regs.tf_r15);
951         printk("--- End VCPU Dump ---\n");
952
953 }
954
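/* Builds an EPT pointer value from the root page's physical address, using
 * write-back memory type, a 4-level walk, and A/D bits when supported. */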
955 uint64_t construct_eptp(physaddr_t root_hpa)
956 {
957         uint64_t eptp;
958
959         /* set WB memory and 4 levels of walk.  we checked these in ept_init */
960         eptp = VMX_EPT_MEM_TYPE_WB |
961                (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
962         if (cpu_has_vmx_ept_ad_bits())
963                 eptp |= VMX_EPT_AD_ENABLE_BIT;
964         eptp |= (root_hpa & PAGE_MASK);
965
966         return eptp;
967 }
968
969 /**
970  * vmx_setup_initial_guest_state - configures the initial state of guest registers
971  */
972 static void vmx_setup_initial_guest_state(void)
973 {
974         unsigned long tmpl;
975         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
976                             X86_CR4_PGE | X86_CR4_OSFXSR;
977         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
978 #if 0
979         do we need it
980         if (boot_cpu_has(X86_FEATURE_PCID))
981                 cr4 |= X86_CR4_PCIDE;
982         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
983                 cr4 |= X86_CR4_OSXSAVE;
984 #endif
985         /* we almost certainly have this */
986         /* we'll go sour if we don't. */
987         if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
988                 cr4 |= X86_CR4_RDWRGSFS;
989
990         /* configure control and data registers */
991         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
992                                X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
993         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
994                                      X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
995         vmcs_writel(GUEST_CR3, rcr3());
996         vmcs_writel(GUEST_CR4, cr4);
997         vmcs_writel(CR4_READ_SHADOW, cr4);
998         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
999                                      EFER_SCE | EFER_FFXSR);
1000         vmcs_writel(GUEST_GDTR_BASE, 0);
1001         vmcs_writel(GUEST_GDTR_LIMIT, 0);
1002         vmcs_writel(GUEST_IDTR_BASE, 0);
1003         vmcs_writel(GUEST_IDTR_LIMIT, 0);
1004         vmcs_writel(GUEST_RIP, 0xdeadbeef);
1005         vmcs_writel(GUEST_RSP, 0xdeadbeef);
1006         vmcs_writel(GUEST_RFLAGS, 0x02);
1007         vmcs_writel(GUEST_DR7, 0);
1008
1009         /* guest segment bases */
1010         vmcs_writel(GUEST_CS_BASE, 0);
1011         vmcs_writel(GUEST_DS_BASE, 0);
1012         vmcs_writel(GUEST_ES_BASE, 0);
1013         vmcs_writel(GUEST_GS_BASE, 0);
1014         vmcs_writel(GUEST_SS_BASE, 0);
1015         rdmsrl(MSR_FS_BASE, tmpl);
1016         vmcs_writel(GUEST_FS_BASE, tmpl);
1017
1018         /* guest segment access rights */
1019         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
1020         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
1021         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
1022         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
1023         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
1024         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
1025
1026         /* guest segment limits */
1027         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
1028         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
1029         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
1030         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
1031         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
1032         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
1033
1034         /* configure segment selectors */
1035         vmcs_write16(GUEST_CS_SELECTOR, 0);
1036         vmcs_write16(GUEST_DS_SELECTOR, 0);
1037         vmcs_write16(GUEST_ES_SELECTOR, 0);
1038         vmcs_write16(GUEST_FS_SELECTOR, 0);
1039         vmcs_write16(GUEST_GS_SELECTOR, 0);
1040         vmcs_write16(GUEST_SS_SELECTOR, 0);
1041         vmcs_write16(GUEST_TR_SELECTOR, 0);
1042
1043         /* guest LDTR */
1044         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1045         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1046         vmcs_writel(GUEST_LDTR_BASE, 0);
1047         vmcs_writel(GUEST_LDTR_LIMIT, 0);
1048
1049         /* guest TSS */
1050         vmcs_writel(GUEST_TR_BASE, 0);
1051         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1052         vmcs_writel(GUEST_TR_LIMIT, 0xff);
1053
1054         /* initialize sysenter */
1055         vmcs_write32(GUEST_SYSENTER_CS, 0);
1056         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1057         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1058
1059         /* other random initialization */
1060         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1061         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1062         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1063         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1064         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1065 }
1066
1067 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
1068 {
1069         int f = sizeof(unsigned long);
1070         /*
1071          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1072          * have the write-low and read-high bitmap offsets the wrong way round.
1073          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1074          */
1075         if (msr <= 0x1fff) {
1076                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
1077                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
1078         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1079                 msr &= 0x1fff;
1080                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
1081                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
1082         }
1083 }
1084
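/* Points the VMCS at the global msr_bitmap and sets up the MSR autoload
 * areas so the MSRs in 'set' (currently just MSR_LSTAR) are saved/restored
 * across entry and exit: host entries hold the current host values, guest
 * entries start at 0xDEADBEEF, and the intercept for each MSR is disabled. */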
1085 static void setup_msr(struct vmx_vcpu *vcpu)
1086 {
1087         int set[] = { MSR_LSTAR };
1088         struct vmx_msr_entry *e;
1089         int sz = sizeof(set) / sizeof(*set);
1090         int i;
1091
1092         //BUILD_BUG_ON(sz > NR_AUTOLOAD_MSRS);
1093
1094         vcpu->msr_autoload.nr = sz;
1095
1096         /* XXX enable only MSRs in set */
1097         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1098
1099         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1100         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1101         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1102
1103         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1104         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1105         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1106
1107         for (i = 0; i < sz; i++) {
1108                 uint64_t val;
1109
1110                 e = &vcpu->msr_autoload.host[i];
1111                 e->index = set[i];
1112                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1113                 rdmsrl(e->index, val);
1114                 e->value = val;
1115
1116                 e = &vcpu->msr_autoload.guest[i];
1117                 e->index = set[i];
1118                 e->value = 0xDEADBEEF;
1119         }
1120 }
1121
1122 /**
1123  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1124  */
1125 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
1126 {
1127         vmcs_write16(VIRTUAL_PROCESSOR_ID, vcpu->vpid);
1128         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1129
1130         /* Control */
1131         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1132                 vmcs_config.pin_based_exec_ctrl);
1133
1134         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1135                 vmcs_config.cpu_based_exec_ctrl);
1136
1137         if (cpu_has_secondary_exec_ctrls()) {
1138                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1139                              vmcs_config.cpu_based_2nd_exec_ctrl);
1140         }
1141
1142         vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));
1143
1144         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1145         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1146         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1147
1148         setup_msr(vcpu);
1149 #if 0
1150         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1151                 uint32_t msr_low, msr_high;
1152                 uint64_t host_pat;
1153                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
1154                 host_pat = msr_low | ((uint64_t) msr_high << 32);
1155                 /* Write the default value follow host pat */
1156                 vmcs_write64(GUEST_IA32_PAT, host_pat);
1157                 /* Keep arch.pat sync with GUEST_IA32_PAT */
1158                 vmx->vcpu.arch.pat = host_pat;
1159         }
1160 #endif
1161 #if 0
1162         for (int i = 0; i < NR_VMX_MSR; ++i) {
1163                 uint32_t index = vmx_msr_index[i];
1164                 uint32_t data_low, data_high;
1165                 int j = vmx->nmsrs;
1166                 // TODO we should have read/writemsr_safe
1167 #if 0
1168                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1169                         continue;
1170                 if (wrmsr_safe(index, data_low, data_high) < 0)
1171                         continue;
1172 #endif
1173                 vmx->guest_msrs[j].index = i;
1174                 vmx->guest_msrs[j].data = 0;
1175                 vmx->guest_msrs[j].mask = -1ull;
1176                 ++vmx->nmsrs;
1177         }
1178 #endif
1179
1180         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1181
1182         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1183         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1184
1185         vmcs_writel(CR0_GUEST_HOST_MASK, ~0ul);
1186         vmcs_writel(CR4_GUEST_HOST_MASK, ~0ul);
1187
1188         //kvm_write_tsc(&vmx->vcpu, 0);
1189         vmcs_writel(TSC_OFFSET, 0);
1190
1191         vmx_setup_constant_host_state();
1192 }
1193
1194 /**
1195  * vmx_allocate_vpid - reserves a vpid and sets it in the VCPU
1196  * @vmx: the VCPU
1197  */
1198 static int vmx_allocate_vpid(struct vmx_vcpu *vmx)
1199 {
1200         int vpid;
1201
1202         vmx->vpid = 0;
1203
1204         spin_lock(&vmx_vpid_lock);
1205         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
1206         if (vpid < VMX_NR_VPIDS) {
1207                 vmx->vpid = vpid;
1208                 __set_bit(vpid, vmx_vpid_bitmap);
1209         }
1210         spin_unlock(&vmx_vpid_lock);
1211
1212         return vpid >= VMX_NR_VPIDS;
1213 }
1214
1215 /**
1216  * vmx_free_vpid - frees a vpid
1217  * @vmx: the VCPU
1218  */
1219 static void vmx_free_vpid(struct vmx_vcpu *vmx)
1220 {
1221         spin_lock(&vmx_vpid_lock);
1222         if (vmx->vpid != 0)
1223                 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
1224         spin_unlock(&vmx_vpid_lock);
1225 }
1226
1227 /**
1228  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1229  *
1230  * Returns: A new VCPU structure
1231  */
1232 struct vmx_vcpu *vmx_create_vcpu(struct proc *p)
1233 {
1234         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1235         if (!vcpu) {
1236                 return NULL;
1237         }
1238
1239         memset(vcpu, 0, sizeof(*vcpu));
1240
1241         vcpu->proc = p; /* uncounted (weak) reference */
1242         vcpu->vmcs = vmx_alloc_vmcs();
1243         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1244         if (!vcpu->vmcs)
1245                 goto fail_vmcs;
1246
1247         if (vmx_allocate_vpid(vcpu))
1248                 goto fail_vpid;
1249
1250         printd("%d: vmx_create_vcpu: vpid %d\n", core_id(), vcpu->vpid);
1251         vcpu->cpu = -1;
1252
1253         vmx_get_cpu(vcpu);
1254         vmx_setup_vmcs(vcpu);
1255         vmx_setup_initial_guest_state();
1256         vmx_put_cpu(vcpu);
1257
1258         return vcpu;
1259
1260 fail_ept:
1261         vmx_free_vpid(vcpu);
1262 fail_vpid:
1263         vmx_free_vmcs(vcpu->vmcs);
1264 fail_vmcs:
1265         kfree(vcpu);
1266         return NULL;
1267 }
1268
1269 /**
1270  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1271  * @vcpu: the VCPU to destroy
1272  */
1273 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
1274 {
1275         vmx_free_vpid(vcpu);
1276         vmx_free_vmcs(vcpu->vmcs);
1277         kfree(vcpu);
1278 }
1279
1280 /**
1281  * vmx_task_vcpu - returns a pointer to the proc's vcpu or NULL.
1282  * @p: the process
1283  */
1284 static inline struct vmx_vcpu *vmx_task_vcpu(struct proc *p)
1285 {
1286         struct dune_struct *dune = current->virtinfo;
1287         return dune ? dune->vcpu : NULL;
1288 }
1289
1290 /**
1291  * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1292  *
1293  * In the contexts where this is used the vcpu pointer should never be NULL.
1294  */
1295 static inline struct vmx_vcpu *vmx_current_vcpu(void)
1296 {
1297         struct vmx_vcpu *vcpu = vmx_task_vcpu(current);
1298         if (! vcpu)
1299                 panic("%s: core_id %d: no vcpu", __func__, core_id());
1300         return vcpu;
1301 }
1302
1303
1304 /**
1305  * vmx_run_vcpu - launches the CPU into non-root mode
1306  * We ONLY support 64-bit guests.
1307  * @vcpu: the vmx instance to launch
1308  */
1309 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1310 {
1311         asm(
1312                 /* Store host registers */
1313                 "push %%rdx; push %%rbp;"
1314                 "push %%rcx \n\t" /* placeholder for guest rcx */
1315                 "push %%rcx \n\t"
1316                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1317                 "je 1f \n\t"
1318                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1319                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1320                 "1: \n\t"
1321                 /* Reload cr2 if changed */
1322                 "mov %c[cr2](%0), %%rax \n\t"
1323                 "mov %%cr2, %%rdx \n\t"
1324                 "cmp %%rax, %%rdx \n\t"
1325                 "je 2f \n\t"
1326                 "mov %%rax, %%cr2 \n\t"
1327                 "2: \n\t"
1328                 /* Check if vmlaunch or vmresume is needed */
1329                 "cmpl $0, %c[launched](%0) \n\t"
1330                 /* Load guest registers.  Don't clobber flags. */
1331                 "mov %c[rax](%0), %%rax \n\t"
1332                 "mov %c[rbx](%0), %%rbx \n\t"
1333                 "mov %c[rdx](%0), %%rdx \n\t"
1334                 "mov %c[rsi](%0), %%rsi \n\t"
1335                 "mov %c[rdi](%0), %%rdi \n\t"
1336                 "mov %c[rbp](%0), %%rbp \n\t"
1337                 "mov %c[r8](%0),  %%r8  \n\t"
1338                 "mov %c[r9](%0),  %%r9  \n\t"
1339                 "mov %c[r10](%0), %%r10 \n\t"
1340                 "mov %c[r11](%0), %%r11 \n\t"
1341                 "mov %c[r12](%0), %%r12 \n\t"
1342                 "mov %c[r13](%0), %%r13 \n\t"
1343                 "mov %c[r14](%0), %%r14 \n\t"
1344                 "mov %c[r15](%0), %%r15 \n\t"
1345                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1346
1347                 /* Enter guest mode */
1348                 "jne .Llaunched \n\t"
1349                 ASM_VMX_VMLAUNCH "\n\t"
1350                 "jmp .Lkvm_vmx_return \n\t"
1351                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1352                 ".Lkvm_vmx_return: "
1353                 /* Save guest registers, load host registers, keep flags */
1354                 "mov %0, %c[wordsize](%%rsp) \n\t"
1355                 "pop %0 \n\t"
1356                 "mov %%rax, %c[rax](%0) \n\t"
1357                 "mov %%rbx, %c[rbx](%0) \n\t"
1358                 "popq %c[rcx](%0) \n\t"
1359                 "mov %%rdx, %c[rdx](%0) \n\t"
1360                 "mov %%rsi, %c[rsi](%0) \n\t"
1361                 "mov %%rdi, %c[rdi](%0) \n\t"
1362                 "mov %%rbp, %c[rbp](%0) \n\t"
1363                 "mov %%r8,  %c[r8](%0) \n\t"
1364                 "mov %%r9,  %c[r9](%0) \n\t"
1365                 "mov %%r10, %c[r10](%0) \n\t"
1366                 "mov %%r11, %c[r11](%0) \n\t"
1367                 "mov %%r12, %c[r12](%0) \n\t"
1368                 "mov %%r13, %c[r13](%0) \n\t"
1369                 "mov %%r14, %c[r14](%0) \n\t"
1370                 "mov %%r15, %c[r15](%0) \n\t"
1371                 "mov %%rax, %%r10 \n\t"
1372                 "mov %%rdx, %%r11 \n\t"
1373
1374                 "mov %%cr2, %%rax   \n\t"
1375                 "mov %%rax, %c[cr2](%0) \n\t"
1376
1377                 "pop  %%rbp; pop  %%rdx \n\t"
1378                 "setbe %c[fail](%0) \n\t"
1379                 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1380                 "mov %%rax, %%ds \n\t"
1381                 "mov %%rax, %%es \n\t"
1382               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1383                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1384                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1385                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1386                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1387                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1388                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1389                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1390                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1391                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1392                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1393                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1394                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1395                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1396                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1397                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1398                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1399                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1400                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1401                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1402                 [wordsize]"i"(sizeof(unsigned long))
1403               : "cc", "memory"
1404                 , "rax", "rbx", "rdi", "rsi"
1405                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1406         );
1407
1408         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1409         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1410         printk("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1411                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1412         /* FIXME: do we need to set up other flags? */
1413         vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1414                       X86_EFLAGS_IF | 0x2;
1415
1416         vcpu->regs.tf_cs = GD_UT;
1417         vcpu->regs.tf_ss = GD_UD;
1418
1419         vcpu->launched = 1;
1420
1421         if (vcpu->fail) {
1422                 printk("failure detected (err %x)\n",
1423                        vmcs_read32(VM_INSTRUCTION_ERROR));
1424                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1425         }
1426
1427         return vmcs_read32(VM_EXIT_REASON);
1428
1429 #if 0
1430         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1431         vmx_complete_atomic_exit(vmx);
1432         vmx_recover_nmi_blocking(vmx);
1433         vmx_complete_interrupts(vmx);
1434 #endif
1435 }
1436
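/* Advances the guest RIP past the instruction that caused the current exit. */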
1437 static void vmx_step_instruction(void)
1438 {
1439         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1440                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1441 }
1442
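/* Handles an EPT violation by treating the faulting guest-physical address as
 * a virtual address in the current process (see the 'total hack' note below),
 * looking up its backing page, and installing that mapping in the EPT via
 * vmx_do_ept_fault().  Returns nonzero on failure. */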
1443 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
1444 {
1445         unsigned long gva, gpa;
1446         int exit_qual, ret = -1;
1447         page_t *page;
1448
1449         vmx_get_cpu(vcpu);
1450         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1451         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1452         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1453         printk("ept: gva %016lx, gpa %016lx\n", gva, gpa);
1454
1455         vmx_put_cpu(vcpu);
1456
1457         // this is a total hack, for testing things.
1458         // note that we only care about the gpa, and the
1459         // gpa is our process virtual address. 
1460         // Confused yet?
1461         page = page_lookup(current->env_pgdir, (void *)gpa, NULL);
1462         printk("Lookup %p returns %p\n", gpa, page);
1463         if (page) {
1464                 uint64_t hpa = page2pa(page);
1465                 printk("hpa for %p is %p\n", gpa, hpa);
1466                 ret = vmx_do_ept_fault(vcpu->proc->env_pgdir.epte, gpa, hpa, exit_qual);
1467                 printk("vmx_do_ept_fault returns %d\n", ret);
1468         }
1469
1470         if (ret) {
1471                 printk("page fault failure "
1472                        "GPA: 0x%lx, GVA: 0x%lx\n",
1473                        gpa, gva);
1474                 vmx_dump_cpu(vcpu);
1475         }
1476
1477         return ret;
1478 }
1479
1480 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
1481 {
1482         unsigned int eax, ebx, ecx, edx;
1483
1484         eax = vcpu->regs.tf_rax;
1485         ecx = vcpu->regs.tf_rcx;
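        /* Note: the guest's requested leaf/subleaf (eax/ecx above) is read
         * but not passed to cpuid() below, so every guest CPUID gets the
         * same host leaf back. */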
1486         cpuid(0, 2, &eax, &ebx, &ecx, &edx);
1487         vcpu->regs.tf_rax = eax;
1488         vcpu->regs.tf_rbx = ebx;
1489         vcpu->regs.tf_rcx = ecx;
1490         vcpu->regs.tf_rdx = edx;
1491 }
1492
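/* Handles an exception-or-NMI exit: NMIs are simply swallowed (return 0);
 * anything else is reported as unhandled and returns -EIO. */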
1493 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
1494 {
1495         uint32_t intr_info;
1496
1497         vmx_get_cpu(vcpu);
1498         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1499         vmx_put_cpu(vcpu);
1500
1501         printk("vmx (VPID %d): got an exception\n", vcpu->vpid);
1502         printk("vmx (VPID %d): pid %d\n", vcpu->vpid,
1503                          current->pid);
1504         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1505                 return 0;
1506         }
1507
1508         printk("unhandled nmi, intr_info %x\n", intr_info);
1509         return -EIO;
1510 }
1511
1512
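/* Guest test payloads used by vmx_launch() when conf->rip is a small value:
 * noop() spins forever; fail() dereferences address 0, which should trigger
 * an exit (e.g. an EPT violation) so we can test the exit path. */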
1513 static void noop(void) {
1514         __asm__ __volatile__ ("1: jmp 1b");
1515 }
1516
1517 static void fail(void) {
1518         __asm__ __volatile__ ("movq $0xdeadbeef, %rbx; movq 0, %rax");
1519 }
1520
1521 static unsigned long stack[512];
1522 /**
1523  * vmx_launch - the main loop for a VMX Dune process
1524  * @conf: the launch configuration
1525  */
1526 int vmx_launch(struct dune_config *conf)
1527 {
1528         int ret;
1529         struct dune_struct dune;
1530         struct vmx_vcpu *vcpu;
1531         int i = 0;
1532         unsigned long rip = conf->rip;
1533         unsigned long rsp = conf->rsp;
1534         unsigned long cr3 = conf->cr3;
1535         int errors = 0;
1536
        if (conf->rip < 4096) {
                /* Testing mode: treat rip as a selector for one of the
                 * built-in payloads above. The +4 skips past the function
                 * prologue (push/mov of %rbp) so the payload never touches
                 * the stack. */
                switch (conf->rip) {
                default:
                        rip = (uint64_t)noop + 4;
                        break;
                case 1:
                        rip = (uint64_t)fail + 4;
                        break;
                }
        }
1548
1549         if (conf->cr3 == 0) {
1550                 cr3 = rcr3();
1551         }
1552
1553         /* sanity checking.  -- later
1554         ret = ept_check_page(ept, rip);
1555         if (ret) {
1556                 printk("0x%x is not mapped in the ept!\n", rip);
1557                 errors++;
1558         }
1559         ret = ept_check_page(ept, rsp);
1560         if (ret) {
1561                 printk("0x%x is not mapped in the ept!\n", rsp);
1562                 errors++;
1563         }
1564         */
1565         if (errors) {
1566                 return -EINVAL;
1567         }
1568
1569
        printk("RUNNING: %s: rip %p rsp %p cr3 %p\n",
               __func__, rip, rsp, cr3);
1572         /* TODO: dirty hack til we have VMM contexts */
1573         vcpu = current->vmm.guest_pcores[0];
1574         if (!vcpu) {
1575                 printk("Failed to get a CPU!\n");
1576                 return -ENOMEM;
1577         }
1578
1579         vmx_get_cpu(vcpu);
1580         vmcs_writel(GUEST_RIP, rip);
1581         vmcs_writel(GUEST_RSP, rsp);
1582         vmcs_writel(GUEST_CR3, cr3);
1583         vmx_put_cpu(vcpu);
1584
1585         vcpu->ret_code = -1;
1586
1587         if (current->virtinfo)
1588                 printk("vmx_launch: current->virtinfo is NOT NULL (%p)\n", current->virtinfo);
1589         //WARN_ON(current->virtinfo != NULL);
1590         dune.vcpu = vcpu;
1591
1592         current->virtinfo = &dune;
1593
1594         while (1) {
1595                 vmx_get_cpu(vcpu);
1596
1597                 // TODO: manage the fpu when we restart.
1598
1599                 // TODO: see if we need to exit before we go much further.
1600                 disable_irq();
1601                 ret = vmx_run_vcpu(vcpu);
1602                 enable_irq();
1603                 vmx_put_cpu(vcpu);
1604
                if (ret == EXIT_REASON_VMCALL) {
                        vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
                        printk("vmcall (guest system call): not handled yet\n");
1608                 } else if (ret == EXIT_REASON_CPUID)
1609                         vmx_handle_cpuid(vcpu);
1610                 else if (ret == EXIT_REASON_EPT_VIOLATION) {
1611                         if (vmx_handle_ept_violation(vcpu))
1612                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1613                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1614                         if (vmx_handle_nmi_exception(vcpu))
1615                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1616                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1617                         printk("External interrupt\n");
                } else {
                        /* The VMCS must be loaded on this core before we can
                         * read the exit qualification. */
                        vmx_get_cpu(vcpu);
                        printk("unhandled exit: reason %x, exit qualification %x\n",
                               ret, vmcs_read32(EXIT_QUALIFICATION));
                        vmx_put_cpu(vcpu);
                        vmx_dump_cpu(vcpu);
                        vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1623                 }
1624
1625                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1626                  * similar to how proc_restartcore/smp_idle only restart the pcpui
1627                  * cur_ctx, we need to do the same, via the VMCS resume business. */
1628
1629                 if (vcpu->shutdown)
1630                         break;
1631         }
1632
1633         printk("RETURN. ip %016lx sp %016lx\n",
1634                 vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1635         current->virtinfo = NULL;
1636
1637         /*
1638          * Return both the reason for the shutdown and a status value.
1639          * The exit() and exit_group() system calls only need 8 bits for
1640          * the status but we allow 16 bits in case we might want to
1641          * return more information for one of the other shutdown reasons.
1642          */
1643         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
1644
1645         return ret;
1646 }
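
/* For illustration only: a hypothetical caller could unpack the combined
 * value built above (shutdown reason in the high 16 bits, guest ret_code in
 * the low 16 bits) like this. This helper is a sketch, not part of the
 * driver's interface: */
static inline void vmx_launch_decode_ret(int ret, int *shutdown, int *status)
{
        *shutdown = (ret >> 16) & 0xffff;  /* e.g. SHUTDOWN_EPT_VIOLATION */
        *status = ret & 0xffff;            /* low 16 bits of vcpu->ret_code */
}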
1647
1648 /**
1649  * __vmx_enable - low-level enable of VMX mode on the current CPU
1650  * @vmxon_buf: an opaque buffer for use as the VMXON region
1651  */
static int __vmx_enable(struct vmcs *vmxon_buf)
1653 {
1654         uint64_t phys_addr = PADDR(vmxon_buf);
1655         uint64_t old, test_bits;
1656
        if (rcr4() & X86_CR4_VMXE) {
                panic("VMX is already enabled on this core (CR4.VMXE is set)");
                return -EBUSY;
        }
1661
1662         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1663
1664         test_bits = FEATURE_CONTROL_LOCKED;
1665         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1666
        if (0) /* tboot_enabled() */
1668                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1669
1670         if ((old & test_bits) != test_bits) {
1671                 /* If it's locked, then trying to set it will cause a GPF.
1672                  * No Dune for you!
1673                  */
1674                 if (old & FEATURE_CONTROL_LOCKED) {
1675                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1676                         return -1;
1677                 }
1678
1679                 /* enable and lock */
1680                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1681         }
1682         lcr4(rcr4() | X86_CR4_VMXE);
1683
1684         __vmxon(phys_addr);
1685         vpid_sync_vcpu_global();
1686         ept_sync_global();
1687
1688         return 0;
1689 }
1690
/**
 * vmx_enable - enables VMX mode on the current CPU
 *
 * Uses the per-cpu VMXON region set up by setup_vmxarea() and turns on VMX
 * operation for this core via __vmx_enable().
 */
1697 static void vmx_enable(void)
1698 {
1699         struct vmcs *vmxon_buf = currentcpu->vmxarea;
1700         int ret;
1701
1702         ret = __vmx_enable(vmxon_buf);
1703         if (ret)
1704                 goto failed;
1705
1706         currentcpu->vmx_enabled = 1;
1707         // TODO: do we need this?
1708         store_gdt(&currentcpu->host_gdt);
1709
1710         printk("VMX enabled on CPU %d\n", core_id());
1711         return;
1712
1713 failed:
1714         printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1715 }
1716
1717 /**
1718  * vmx_disable - disables VMX mode on the current CPU
1719  */
1720 static void vmx_disable(void *unused)
1721 {
1722         if (currentcpu->vmx_enabled) {
1723                 __vmxoff();
1724                 lcr4(rcr4() & ~X86_CR4_VMXE);
1725                 currentcpu->vmx_enabled = 0;
1726         }
1727 }
1728
/* Probe the current cpu to see if it can do vmx.
 * Returns TRUE if VT-x is supported, FALSE otherwise.
 */
1732 static bool probe_cpu_vmx(void)
1733 {
        /* The best way to test this code is:
         * wrmsr -p <cpu> 0x3a 1
         * This locks vmx off on <cpu>; then load the driver.
         * Frequently, however, systems ship with all 0x3a registers set to 5,
         * in which case vmx cannot be disabled and this path is hard to test.
         * In most cases we have to simulate vmx being unavailable; a 'test'
         * override (not implemented here yet) would be an easy way to fake
         * unavailability of vmx on some, none, or all cpus (see the sketch
         * after this function).
         */
1743         if (!cpu_has_vmx()) {
1744                 printk("Machine does not support VT-x\n");
1745                 return FALSE;
1746         } else {
1747                 printk("Machine supports VT-x\n");
1748                 return TRUE;
1749         }
1750 }
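
/* A rough sketch of the 'test' override mentioned in probe_cpu_vmx(): a knob
 * for pretending that some or all cores lack VT-x, so the failure paths can
 * be exercised even when MSR 0x3a is locked with vmx enabled. The names are
 * illustrative and this is not wired into the driver (hence the #if 0): */
#if 0
enum vmx_probe_override {
        VMX_PROBE_NORMAL,       /* trust cpu_has_vmx() */
        VMX_PROBE_FORCE_OFF,    /* pretend this core has no VT-x */
};
static enum vmx_probe_override vmx_probe_test = VMX_PROBE_NORMAL;

static bool probe_cpu_vmx_with_override(void)
{
        if (vmx_probe_test == VMX_PROBE_FORCE_OFF)
                return FALSE;
        return cpu_has_vmx();
}
#endif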
1751
static void setup_vmxarea(void)
{
        struct vmcs *vmxon_buf;

        printd("Set up vmxarea for cpu %d\n", core_id());
        vmxon_buf = __vmx_alloc_vmcs(node_id());
        if (!vmxon_buf) {
                printk("setup_vmxarea failed on core %d (node %d)\n",
                       core_id(), node_id());
                return;
        }
        currentcpu->vmxarea = vmxon_buf;
}
1763
1764 static int ept_init(void)
1765 {
1766         if (!cpu_has_vmx_ept()) {
1767                 printk("VMX doesn't support EPT!\n");
1768                 return -1;
1769         }
1770         if (!cpu_has_vmx_eptp_writeback()) {
1771                 printk("VMX EPT doesn't support WB memory!\n");
1772                 return -1;
1773         }
1774         if (!cpu_has_vmx_ept_4levels()) {
1775                 printk("VMX EPT doesn't support 4 level walks!\n");
1776                 return -1;
1777         }
1778         switch (arch_max_jumbo_page_shift()) {
1779                 case PML3_SHIFT:
1780                         if (!cpu_has_vmx_ept_1g_page()) {
1781                                 printk("VMX EPT doesn't support 1 GB pages!\n");
1782                                 return -1;
1783                         }
1784                         break;
1785                 case PML2_SHIFT:
1786                         if (!cpu_has_vmx_ept_2m_page()) {
1787                                 printk("VMX EPT doesn't support 2 MB pages!\n");
1788                                 return -1;
1789                         }
1790                         break;
1791                 default:
                        printk("Unexpected max jumbo page shift %d\n",
                               arch_max_jumbo_page_shift());
1794                         return -1;
1795         }
1796         if (!cpu_has_vmx_ept_ad_bits()) {
1797                 printk("VMX EPT doesn't support accessed/dirty!\n");
1798                 /* TODO: set the pmap_ops accordingly */
1799         }
1800         if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
1801                 printk("VMX EPT can't invalidate PTEs/TLBs!\n");
1802                 return -1;
1803         }
1804
1805         return 0;
1806 }
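
/* For reference, a minimal sketch of where the cpu_has_vmx_ept_*() checks
 * above ultimately get their answers: the IA32_VMX_EPT_VPID_CAP MSR. This
 * assumes the KVM-style MSR_IA32_VMX_EPT_VPID_CAP define (0x48c) and the SDM
 * bit layout (e.g. bit 6: 4-level walks, bit 14: WB memory type, bit 16:
 * 2MB pages, bit 17: 1GB pages, bit 21: accessed/dirty bits, bit 26:
 * all-context INVEPT). Illustrative only, not part of the driver: */
static inline bool ept_cap_bit_sketch(int bit)
{
        uint64_t cap;

        rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, cap);
        return (cap >> bit) & 1;
}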
1807
/**
 * intel_vmm_init sets up the physical core data areas that are required to
 * run a vm at all. These data areas are not connected to a specific user
 * process in any way. Instead, they are in some sense externalizing what
 * would otherwise be a very large ball of state that would be inside the CPU.
 */
1814 int intel_vmm_init(void)
1815 {
        int ret;
1817
        if (!probe_cpu_vmx()) {
1819                 return -EOPNOTSUPP;
1820         }
1821
1822         setup_vmcs_config(&ret);
1823
1824         if (ret) {
1825                 printk("setup_vmcs_config failed: %d\n", ret);
1826                 return ret;
1827         }
1828
1829         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
1830         if (!msr_bitmap) {
1831                 printk("Could not allocate msr_bitmap\n");
1832                 return -ENOMEM;
1833         }
1834         /* FIXME: do we need APIC virtualization (flexpriority?) */
1835
        /* A set bit in the bitmap means "intercept this MSR": start by
         * intercepting everything, then clear the bits for FS and GS base so
         * the guest can access those directly. */
        memset(msr_bitmap, 0xff, PAGE_SIZE);
        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
1839
1840         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
1841
1842         if ((ret = ept_init())) {
1843                 printk("EPT init failed, %d\n", ret);
1844                 return ret;
1845         }
1846         printk("VMX setup succeeded\n");
1847         return 0;
1848 }
1849
1850 int intel_vmm_pcpu_init(void)
1851 {
1852         setup_vmxarea();
1853         vmx_enable();
1854         return 0;
1855 }