vmmcp: open up cr4; fix cpuid handling
[akaros.git] / kern / arch / x86 / vmm / intel / vmx.c
1 //#define DEBUG
2 /**
3  *  vmx.c - The Intel VT-x driver for Dune
4  *
5  * This file is derived from Linux KVM VT-x support.
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8  *
9  * Original Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This modified version is simpler because it avoids the following
14  * features that are not requirements for Dune:
15  *  * Real-mode emulation
16  *  * Nested VT-x support
17  *  * I/O hardware emulation
18  *  * Any of the more esoteric X86 features and registers
19  *  * KVM-specific functionality
20  *
21  * In essence we provide only the minimum functionality needed to run
22  * a process in vmx non-root mode rather than the full hardware emulation
23  * needed to support an entire OS.
24  *
25  * This driver is a research prototype and as such has the following
26  * limitations:
27  *
28  * FIXME: Backward compatibility is currently a non-goal, and only recent
29  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
30  * driver.
31  *
32  * FIXME: Eventually we should handle concurrent users of VT-x more
33  * gracefully instead of requiring exclusive access. This would allow
34  * Dune to interoperate with KVM and other HV solutions.
35  *
36  * FIXME: We need to support hotplugged physical CPUs.
37  *
38  * Authors:
39  *   Adam Belay   <abelay@stanford.edu>
40  */
41
42 /* Basic flow.
43  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
44  * You're left with the feeling that they got part way through and realized they had to have one for each of these:
45  *
46  * 1) your CPU is going to be capable of running VMs, and you need state for that.
47  *
48  * 2) you're about to start a guest, and you need state for that.
49  *
50  * So there is 'get the cpu set up to be able to run VMs' stuff, and
51  * 'now let's start a guest' stuff.  In Akaros, CPUs will always be set up
52  * to run a VM if that is possible. Processes can flip themselves into
53  * a VM and that will require another VMCS.
54  *
55  * So: at kernel startup time, the SMP boot stuff calls
56  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
57  * in the case of this file is intel_vmm_init. That does some code
58  * that sets up stuff for ALL sockets, based on the capabilities of
59  * the socket it runs on. If any cpu supports vmx, it assumes they all
60  * do. That's a realistic assumption. So the call_function_all is kind
61  * of stupid, really; it could just see what's on the current cpu and
62  * assume it's on all. HOWEVER: there are systems in the wilde that
63  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
64  * might as well allow for the chance that wel'll only all VMMCPs on a
65  * subset (not implemented yet however).  So: probe all CPUs, get a
66  * count of how many support VMX and, for now, assume they all do
67  * anyway.
68  *
69  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
70  * which contains all the naughty bits settings for all the cpus that can run a VM.
71  * Realistically, all VMX-capable cpus in a system will have identical configurations.
72  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
73  *
74  * configure the msr_bitmap. This is the bitmap of MSRs which the
75  * guest can manipulate.  Currently, we only allow GS and FS base.
76  *
77  * Reserve bit 0 in the vpid bitmap as guests can not use that
78  *
79  * Set up what we call the vmxarea. The vmxarea is per-cpu, not
80  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
81  * there is the revision id. The vmxarea is page-sized per cpu and
82  * page-aligned. Note that it can be smaller, but why bother? We know
83  * the max size and alignment, and it's convenient.
84  *
85  * Now that it is set up, enable vmx on all cpus. This involves
86  * testing VMXE in cr4, to see if we've been here before (TODO: delete
87  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
88  * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
89  * instruction), and syncing vpid's and ept's.  Now the CPU is ready
90  * to host guests.
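 *
 * A rough sketch of that per-cpu enable path (the helper and flag names
 * here are illustrative, not necessarily the exact ones used in this file):
 *
 *      if (!(read_msr(MSR_IA32_FEATURE_CONTROL) & FEATURE_VMX_ENABLED))
 *              return -1;                   // BIOS left VMX locked off
 *      lcr4(rcr4() | X86_CR4_VMXE);         // flip CR4.VMXE
 *      vmxon(vmxarea_paddr);                // enter VMX root operation
 *      ept_sync_global();                   // flush stale EPT/VPID mappings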
91  *
92  * Setting up a guest.
93  * We divide this into two things: vmm_proc_init and vm_run.
94  * Currently, on Intel, vmm_proc_init does nothing.
95  *
96  * vm_run is really complicated. It is called with a coreid, rip, rsp,
97  * cr3, and flags.  On intel, it calls vmx_launch. vmx_launch is set
98  * up for a few test cases. If rip is 1, it sets the guest rip to
99  * a function which will deref 0 and should exit with failure 2. If rip is 0,
100  * it calls an infinite loop in the guest.
101  *
102  * The sequence of operations:
103  * create a vcpu
104  * while (1) {
105  * get a vcpu
106  * disable irqs (required or you can't enter the VM)
107  * vmx_run_vcpu()
108  * enable irqs
109  * manage the vm exit
110  * }
111  *
112  * get a vcpu
113  * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
114  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
115  *
116  * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
117  * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
118  * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
119  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
120  *
121  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
122  * of inline assembly with embedded CPP crap. I suspect we'll want to
123  * un-inline it someday, but maybe not.  It's called with a vcpu
124  * struct from which it loads guest state, and to which it stores
125  * non-virtualized host state. It issues a vmlaunch or vmresume
126  * instruction as appropriate, and on return, it evaluates whether the
127  * launch/resume had an error in that operation. Note this is NOT the
128  * same as an error while in the virtual machine; this is an error in
129  * startup due to misconfiguration. Depending on what is returned, it's
130  * either a failed vm startup or an exit for any of many reasons.
131  *
132  */
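
/* A minimal sketch of that loop, in the spirit of vmx_launch() further down
 * in this file.  handle_exit() here is a stand-in for the exit-reason
 * dispatch that vmx_launch() does inline; it is not a real function:
 *
 *      vcpu = vmx_create_vcpu(p);
 *      while (1) {
 *              vmx_get_cpu(vcpu);
 *              disable_irq();
 *              ret = vmx_run_vcpu(vcpu);
 *              enable_irq();
 *              vmx_put_cpu(vcpu);
 *              if (handle_exit(vcpu, ret))
 *                      break;
 *      }
 */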
133
134 /* basically: only rename those globals that might conflict
135  * with existing names. Leave all else the same.
136  * this code is more modern than the other code, yet still
137  * well encapsulated, it seems.
138  */
139 #include <kmalloc.h>
140 #include <string.h>
141 #include <stdio.h>
142 #include <assert.h>
143 #include <error.h>
144 #include <pmap.h>
145 #include <sys/queue.h>
146 #include <smp.h>
147 #include <kref.h>
148 #include <atomic.h>
149 #include <alarm.h>
150 #include <event.h>
151 #include <umem.h>
152 #include <bitops.h>
153 #include <arch/types.h>
154 #include <syscall.h>
155
156 #include "vmx.h"
157 #include "../vmm.h"
158 #include <ros/vmm.h>
159
160 #include "cpufeature.h"
161
162 #define currentcpu (&per_cpu_info[core_id()])
163
164 static unsigned long *msr_bitmap;
165
166 int x86_ept_pte_fix_ups = 0;
167
168 struct vmx_capability vmx_capability;
169 struct vmcs_config vmcs_config;
170
171 static int autoloaded_msrs[] = {
172         MSR_KERNEL_GS_BASE,
173         MSR_LSTAR,
174         MSR_STAR,
175         MSR_SFMASK,
176 };
177
178 static char *cr_access_type[] = {
179         "move to cr",
180         "move from cr",
181         "clts",
182         "lmsw"
183 };
184
185 static char *cr_gpr[] = {
186         "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
187         "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
188 };
189
190 static int guest_cr_num[16] = {
191         GUEST_CR0,
192         -1,
193         -1,
194         GUEST_CR3,
195         GUEST_CR4,
196         -1,
197         -1,
198         -1,
199         -1, /* 8? */
200         -1, -1, -1, -1, -1, -1, -1
201 };
202 __always_inline unsigned long vmcs_readl(unsigned long field);
203 /* See section 24-3 of The Good Book */
204 void show_cr_access(uint64_t val) {
205         int crnr = val & 0xf;
206         int type = (val>>4) & 3;
207         int reg = (val >> 8) & 0xf;
208         printk("%s: %d: ", cr_access_type[type], crnr);
209         if (type < 2) {
210                 printk("%s", cr_gpr[reg]);
211                 if (guest_cr_num[crnr] > -1) {
212                         printk(": 0x%lx", vmcs_readl(guest_cr_num[crnr]));
213                 }
214         }
215         printk("\n");
216 }
217
218 void ept_flush(uint64_t eptp)
219 {
220         ept_sync_context(eptp);
221 }
222
223 static void vmcs_clear(struct vmcs *vmcs)
224 {
225         uint64_t phys_addr = PADDR(vmcs);
226         uint8_t error;
227
228         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
229                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
230                       : "cc", "memory");
231         if (error)
232                 printk("vmclear fail: %p/%llx\n",
233                        vmcs, phys_addr);
234 }
235
236 static void vmcs_load(struct vmcs *vmcs)
237 {
238         uint64_t phys_addr = PADDR(vmcs);
239         uint8_t error;
240
241         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
242                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
243                         : "cc", "memory");
244         if (error)
245                 printk("vmptrld %p/%llx failed\n",
246                        vmcs, phys_addr);
247 }
248
249 /* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */
250 static physaddr_t vmcs_get_current(void)
251 {
252         physaddr_t vmcs_paddr;
253         /* RAX contains the addr of the location to store the VMCS pointer.  The
254          * compiler doesn't know the ASM will deref that pointer, hence the =m */
255         asm volatile (ASM_VMX_VMPTRST_RAX : "=m"(vmcs_paddr) : "a"(&vmcs_paddr));
256         return vmcs_paddr;
257 }
258
259 __always_inline unsigned long vmcs_readl(unsigned long field)
260 {
261         unsigned long value;
262
263         asm volatile (ASM_VMX_VMREAD_RDX_RAX
264                       : "=a"(value) : "d"(field) : "cc");
265         return value;
266 }
267
268 __always_inline uint16_t vmcs_read16(unsigned long field)
269 {
270         return vmcs_readl(field);
271 }
272
273 static __always_inline uint32_t vmcs_read32(unsigned long field)
274 {
275         return vmcs_readl(field);
276 }
277
278 static __always_inline uint64_t vmcs_read64(unsigned long field)
279 {
280         return vmcs_readl(field);
281 }
282
283 void vmwrite_error(unsigned long field, unsigned long value)
284 {
285         printk("vmwrite error: reg %lx value %lx (err %d)\n",
286                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
287 }
288
289 void vmcs_writel(unsigned long field, unsigned long value)
290 {
291         uint8_t error;
292
293         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
294                        : "=q"(error) : "a"(value), "d"(field) : "cc");
295         if (error)
296                 vmwrite_error(field, value);
297 }
298
299 static void vmcs_write16(unsigned long field, uint16_t value)
300 {
301         vmcs_writel(field, value);
302 }
303
304 static void vmcs_write32(unsigned long field, uint32_t value)
305 {
306         vmcs_writel(field, value);
307 }
308
309 static void vmcs_write64(unsigned long field, uint64_t value)
310 {
311         vmcs_writel(field, value);
312 }
313
314 /*
315  * A note on Things You Can't Make Up.
316  * or
317  * "George, you can type this shit, but you can't say it" -- Harrison Ford
318  *
319  * There are 5 VMCS 32-bit words that control guest permissions. If
320  * you set these correctly, you've got a guest that will behave. If
321  * you get even one bit wrong, you've got a guest that will chew your
322  * leg off. Some bits must be 1, some must be 0, and some can be set
323  * either way. To add to the fun, the docs are sort of a docudrama or,
324  * as the quote goes, "interesting if true."
325  *
326  * To determine what bit can be set in what VMCS 32-bit control word,
327  * there are 5 corresponding 64-bit MSRs.  And, to make it even more
328  * fun, the standard set of MSRs have errors in them, i.e. report
329  * incorrect values, for legacy reasons, and so you are supposed to
330  * "look around" to another set, which have correct bits in
331  * them. There are four such 'correct' registers, and they have _TRUE_
332  * in the names as you can see below. We test for the value of VMCS
333  * control bits in the _TRUE_ registers if possible. The fifth
334  * register, CPU Secondary Exec Controls, which came later, needs no
335  * _TRUE_ variant.
336  *
337  * For each MSR, the high 32 bits tell you what bits can be "1" by a
338  * "1" in that position; the low 32 bits tell you what bit can be "0"
339  * by a "0" in that position. So, for each of 32 bits in a given VMCS
340  * control word, there is a pair of bits in an MSR that tells you what
341  * values it can take. The two bits, of which there are *four*
342  * combinations, describe the *three* possible operations on a
343  * bit. The two bits, taken together, form an untruth table: There are
344  * three possibilities: The VMCS bit can be set to 0 or 1, or it can
345  * only be 0, or only 1. The fourth combination is not supposed to
346  * happen.
347  *
348  * So: there is the 1 bit from the upper 32 bits of the msr.
349  * If this bit is set, then the bit can be 1. If clear, it can not be 1.
350  *
351  * Then there is the 0 bit, from low 32 bits. If clear, the VMCS bit
352  * can be 0. If 1, the VMCS bit can not be 0.
353  *
354  * SO, let's call the 1 bit R1, and the 0 bit R0, we have:
355  *  R1 R0
356  *  0 0 -> must be 0
357  *  1 0 -> can be 1, can be 0
358  *  0 1 -> can not be 1, can not be 0. --> JACKPOT! Not seen yet.
359  *  1 1 -> must be one.
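 *
 * A worked example with made-up values: if the MSR reads back with
 * low = 0x00000016 and high = 0x0000007f, then reserved_1 = low & high
 * = 0x16 (those bits must be 1), reserved_0 = ~low & ~high = 0xffffff80
 * (those bits must be 0), and changeable_bits = ~(reserved_0 | reserved_1)
 * = 0x69 (the bits we get to choose).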
360  *
361  * It's also pretty hard to know what you can and can't set, and
362  * that's led to inadvertent opening of permissions at times.  Because
363  * of this complexity we've decided on the following: the driver must
364  * define EVERY bit, UNIQUELY, for each of the 5 registers, that it wants
365  * set. Further, for any bit that's settable, the driver must specify
366  * a setting; for any bit that's reserved, the driver settings must
367  * match that bit. If there are reserved bits we don't specify, that's
368  * ok; we'll take them as is.
369  *
370  * We use a set-means-set, and set-means-clear model, i.e. we use a
371  * 32-bit word to contain the bits we want to be 1, indicated by one;
372  * and another 32-bit word in which a bit we want to be 0 is indicated
373  * by a 1. This allows us to easily create masks of all bits we're
374  * going to set, for example.
375  *
376  * We have two 32-bit numbers for each 32-bit VMCS field: bits we want
377  * set and bits we want clear.  If you read the MSR for that field,
378  * compute the reserved 0 and 1 settings, and | them together, they
379  * need to result in 0xffffffff. You can see that we can create other
380  * tests for conflicts (i.e. overlap).
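 *
 * Continuing the made-up example above: set_to_1 | set_to_0 would have to
 * cover the changeable 0x69 (they may also restate reserved bits), and
 * ORing them with reserved_0 (0xffffff80) and reserved_1 (0x16) must give
 * 0xffffffff.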
381  *
382  * At this point, I've tested check_vmxec_controls in every way
383  * possible, because I kept screwing the bitfields up. You'll get a nice
384  * error and it won't work at all, which is what we want: a
385  * failure-prone setup, where even errors that might result in correct
386  * values are caught -- "right answer, wrong method, zero credit." If there's
387  * weirdness in the bits, we don't want to run.
388  */
389
390 static bool check_vmxec_controls(struct vmxec const *v, bool have_true_msr,
391                                  uint32_t *result)
392 {
393         bool err = false;
394         uint32_t vmx_msr_low, vmx_msr_high;
395         uint32_t reserved_0, reserved_1, changeable_bits;
396
397         if (have_true_msr)
398                 rdmsr(v->truemsr, vmx_msr_low, vmx_msr_high);
399         else
400                 rdmsr(v->msr, vmx_msr_low, vmx_msr_high);
401
402         if (vmx_msr_low & ~vmx_msr_high)
403                 warn("JACKPOT: Conflicting VMX ec ctls for %s, high 0x%08x low 0x%08x",
404                      v->name, vmx_msr_high, vmx_msr_low);
405
406         reserved_0 = (~vmx_msr_low) & (~vmx_msr_high);
407         reserved_1 = vmx_msr_low & vmx_msr_high;
408         changeable_bits = ~(reserved_0 | reserved_1);
409
410         /*
411          * this is very much as follows:
412          * accept the things I cannot change,
413          * change the things I can,
414          * know the difference.
415          */
416
417         /* Conflict. Don't try to both set and reset bits. */
418         if (v->set_to_0 & v->set_to_1) {
419                 printk("%s: set to 0 (0x%x) and set to 1 (0x%x) overlap: 0x%x\n",
420                        v->name, v->set_to_0, v->set_to_1, v->set_to_0 & v->set_to_1);
421                 err = true;
422         }
423
424         /* coverage */
425         if (((v->set_to_0 | v->set_to_1) & changeable_bits) !=
426             changeable_bits) {
427                 printk("%s: Need to cover 0x%x and have 0x%x,0x%x\n",
428                        v->name, changeable_bits, v->set_to_0,  v->set_to_1);
429                 err = true;
430         }
431
432         if ((v->set_to_0 | v->set_to_1 | reserved_0 | reserved_1) !=
433             0xffffffff) {
434                 printk("%s: incomplete coverage: have 0x%x, want 0x%x\n",
435                        v->name, v->set_to_0 | v->set_to_1 |
436                        reserved_0 | reserved_1, 0xffffffff);
437                 err = true;
438         }
439
440         /* Don't try to change bits that can't be changed. */
441         if ((v->set_to_0 & (reserved_0 | changeable_bits)) != v->set_to_0) {
442                 printk("%s: set to 0 (0x%x) can't be done\n", v->name,
443                         v->set_to_0);
444                 err = true;
445         }
446
447         if ((v->set_to_1 & (reserved_1 | changeable_bits)) != v->set_to_1) {
448                 printk("%s: set to 1 (0x%x) can't be done\n",
449                        v->name, v->set_to_1);
450                 err = true;
451         }
452
453         /* If there's been any error at all, spill our guts and return. */
454         if (err) {
455                 printk("%s: vmx_msr_high 0x%x, vmx_msr_low 0x%x, ",
456                        v->name, vmx_msr_high, vmx_msr_low);
457                 printk("set_to_1 0x%x,set_to_0 0x%x,reserved_1 0x%x",
458                        v->set_to_1, v->set_to_0, reserved_1);
459                 printk(" reserved_0 0x%x", reserved_0);
460                 printk(" changeable_bits 0x%x\n", changeable_bits);
461                 return false;
462         }
463
464         *result = v->set_to_1 | reserved_1;
465
466         printd("%s: check_vmxec_controls succeeds with result 0x%x\n",
467                v->name, *result);
468         return true;
469 }
470
471 /*
472  * We're trying to make this as readable as possible. Realistically, it will
473  * rarely if ever change, if the past is any guide.
474  */
475 static const struct vmxec pbec = {
476         .name = "Pin Based Execution Controls",
477         .msr = MSR_IA32_VMX_PINBASED_CTLS,
478         .truemsr = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
479
480         .set_to_1 = (PIN_BASED_EXT_INTR_MASK |
481                      PIN_BASED_NMI_EXITING |
482                      PIN_BASED_VIRTUAL_NMIS),
483
484         .set_to_0 = (PIN_BASED_VMX_PREEMPTION_TIMER |
485                      PIN_BASED_POSTED_INTR),
486 };
487
488 static const struct vmxec cbec = {
489         .name = "CPU Based Execution Controls",
490         .msr = MSR_IA32_VMX_PROCBASED_CTLS,
491         .truemsr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
492
493         .set_to_1 = (CPU_BASED_HLT_EXITING |
494                      CPU_BASED_INVLPG_EXITING |
495                      CPU_BASED_MWAIT_EXITING |
496                      CPU_BASED_RDPMC_EXITING |
497                      CPU_BASED_CR8_LOAD_EXITING |
498                      CPU_BASED_CR8_STORE_EXITING |
499                      CPU_BASED_MOV_DR_EXITING |
500                      CPU_BASED_UNCOND_IO_EXITING |
501                      CPU_BASED_USE_MSR_BITMAPS |
502                      CPU_BASED_MONITOR_EXITING |
503                      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS),
504
505         .set_to_0 = (CPU_BASED_VIRTUAL_INTR_PENDING |
506                      CPU_BASED_USE_TSC_OFFSETING |
507                      CPU_BASED_RDTSC_EXITING |
508                      CPU_BASED_CR3_LOAD_EXITING |
509                      CPU_BASED_CR3_STORE_EXITING |
510                      CPU_BASED_TPR_SHADOW |
511                      CPU_BASED_VIRTUAL_NMI_PENDING |
512                      CPU_BASED_MONITOR_TRAP |
513                      CPU_BASED_PAUSE_EXITING |
514                      CPU_BASED_USE_IO_BITMAPS),
515 };
516
517 static const struct vmxec cb2ec = {
518         .name = "CPU Based 2nd Execution Controls",
519         .msr = MSR_IA32_VMX_PROCBASED_CTLS2,
520         .truemsr = MSR_IA32_VMX_PROCBASED_CTLS2,
521
522         .set_to_1 = (SECONDARY_EXEC_ENABLE_EPT |
523                      SECONDARY_EXEC_WBINVD_EXITING),
524
525         .set_to_0 = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
526                      SECONDARY_EXEC_DESCRIPTOR_EXITING |
527                      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
528                      SECONDARY_EXEC_ENABLE_VPID |
529                      SECONDARY_EXEC_UNRESTRICTED_GUEST |
530                      SECONDARY_EXEC_APIC_REGISTER_VIRT |
531                      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
532                      SECONDARY_EXEC_PAUSE_LOOP_EXITING |
533                      SECONDARY_EXEC_RDRAND_EXITING |
534                      SECONDARY_EXEC_ENABLE_INVPCID |
535                      SECONDARY_EXEC_ENABLE_VMFUNC |
536                      SECONDARY_EXEC_SHADOW_VMCS |
537                      SECONDARY_EXEC_RDSEED_EXITING |
538                      SECONDARY_EPT_VE |
539                      /* TODO: re enable this via a "Want" struct
540                         member at some point */
541                      SECONDARY_EXEC_RDTSCP |
542                      SECONDARY_ENABLE_XSAV_RESTORE)
543 };
544
545 static const struct vmxec vmentry = {
546         .name = "VMENTRY controls",
547         .msr = MSR_IA32_VMX_ENTRY_CTLS,
548         .truemsr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
549         /* exact order from vmx.h; only the first two are enabled. */
550
551         .set_to_1 =  (VM_ENTRY_LOAD_DEBUG_CONTROLS | /* can't set to 0 */
552                       VM_ENTRY_LOAD_IA32_EFER |
553                       VM_ENTRY_IA32E_MODE),
554
555         .set_to_0 = (VM_ENTRY_SMM |
556                      VM_ENTRY_DEACT_DUAL_MONITOR |
557                      VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
558                      VM_ENTRY_LOAD_IA32_PAT),
559 };
560
561 static const struct vmxec vmexit = {
562         .name = "VMEXIT controls",
563         .msr = MSR_IA32_VMX_EXIT_CTLS,
564         .truemsr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
565
566         .set_to_1 = (VM_EXIT_SAVE_DEBUG_CONTROLS | /* can't set to 0 */
567                      VM_EXIT_SAVE_IA32_EFER |
568                      VM_EXIT_LOAD_IA32_EFER |
569                      VM_EXIT_HOST_ADDR_SPACE_SIZE), /* 64 bit */
570
571         .set_to_0 = (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
572                      VM_EXIT_ACK_INTR_ON_EXIT |
573                      VM_EXIT_SAVE_IA32_PAT |
574                      VM_EXIT_LOAD_IA32_PAT |
575                      VM_EXIT_SAVE_VMX_PREEMPTION_TIMER),
576 };
577
578 static void setup_vmcs_config(void *p)
579 {
580         int *ret = p;
581         struct vmcs_config *vmcs_conf = &vmcs_config;
582         uint32_t vmx_msr_high;
583         uint64_t vmx_msr;
584         bool have_true_msrs = false;
585         bool ok;
586
587         *ret = -EIO;
588
589         vmx_msr = read_msr(MSR_IA32_VMX_BASIC);
590         vmx_msr_high = vmx_msr >> 32;
591
592         /*
593          * If bit 55 (VMX_BASIC_TRUE_CTLS) is set, then we
594          * can go for the true MSRs.  Else, we ask you to get a better CPU.
595          */
596         if (vmx_msr & VMX_BASIC_TRUE_CTLS) {
597                 have_true_msrs = true;
598                 printd("Running with TRUE MSRs\n");
599         } else {
600                 printk("Running with non-TRUE MSRs, this is old hardware\n");
601         }
602
603         /*
604          * Don't worry that one or more of these might fail and leave
605          * the VMCS in some kind of incomplete state. If one of these
606          * fails, the caller is going to discard the VMCS.
607          * It is written this way to ensure we get results of all tests and avoid
608          * BMAFR behavior.
609          */
610         ok = check_vmxec_controls(&pbec, have_true_msrs,
611                                   &vmcs_conf->pin_based_exec_ctrl);
612         ok = check_vmxec_controls(&cbec, have_true_msrs,
613                                   &vmcs_conf->cpu_based_exec_ctrl) && ok;
614         ok = check_vmxec_controls(&cb2ec, have_true_msrs,
615                                   &vmcs_conf->cpu_based_2nd_exec_ctrl) && ok;
616         ok = check_vmxec_controls(&vmentry, have_true_msrs,
617                                   &vmcs_conf->vmentry_ctrl) && ok;
618         ok = check_vmxec_controls(&vmexit, have_true_msrs,
619                                   &vmcs_conf->vmexit_ctrl) && ok;
620         if (! ok) {
621                 printk("vmxexec controls are no good.\n");
622                 return;
623         }
624
625         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
626         if ((vmx_msr_high & 0x1fff) > PGSIZE) {
627                 printk("(vmx_msr_high & 0x1fff) is 0x%x, > PAGE_SIZE 0x%x\n",
628                        vmx_msr_high & 0x1fff, PGSIZE);
629                 return;
630         }
631
632         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
633         if (vmx_msr & VMX_BASIC_64) {
634                 printk("VMX doesn't support 64 bit width!\n");
635                 return;
636         }
637
638         if (((vmx_msr & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT)
639             != VMX_BASIC_MEM_TYPE_WB) {
640                 printk("VMX doesn't support WB memory for VMCS accesses!\n");
641                 return;
642         }
643
644         vmcs_conf->size = vmx_msr_high & 0x1fff;
645         vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
646         vmcs_conf->revision_id = (uint32_t)vmx_msr;
647
648         /* Read in the caps for runtime checks.  This MSR is only available if
649          * secondary controls and ept or vpid is on, which we check earlier */
650         rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid);
651
652         *ret = 0;
653 }
654
655 static struct vmcs *__vmx_alloc_vmcs(int node)
656 {
657         struct vmcs *vmcs;
658
659         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
660         if (!vmcs)
661                 return 0;
662         memset(vmcs, 0, vmcs_config.size);
663         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
664         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
665         return vmcs;
666 }
667
668 /**
669  * vmx_alloc_vmcs - allocates a VMCS region
670  *
671  * NOTE: Assumes the new region will be used by the current CPU.
672  *
673  * Returns a valid VMCS region.
674  */
675 static struct vmcs *vmx_alloc_vmcs(void)
676 {
677         return __vmx_alloc_vmcs(numa_id());
678 }
679
680 /**
681  * vmx_free_vmcs - frees a VMCS region
682  */
683 static void vmx_free_vmcs(struct vmcs *vmcs)
684 {
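        /* NOTE: the free below is currently commented out, so VMCS pages are
         * leaked when a vcpu is destroyed. */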
685   //free_pages((unsigned long)vmcs, vmcs_config.order);
686 }
687
688 /*
689  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
690  * will not change in the lifetime of the guest.
691  * Note that host-state that does change is set elsewhere. E.g., host-state
692  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
693  */
694 static void vmx_setup_constant_host_state(void)
695 {
696         uint32_t low32, high32;
697         unsigned long tmpl;
698         pseudodesc_t dt;
699
700         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);  /* 22.2.3 */
701         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
702         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
703
704         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
705         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
706         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
707         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
708         vmcs_write16(HOST_TR_SELECTOR, GD_TSS);  /* 22.2.4 */
709
710         native_store_idt(&dt);
711         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);   /* 22.2.4 */
712
713         asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
714         vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
715
716         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
717         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
718         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
719         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
720
721         rdmsr(MSR_EFER, low32, high32);
722         vmcs_write32(HOST_IA32_EFER, low32);
723
724         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
725                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
726                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
727         }
728
729         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
730         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
731
732         /* TODO: This (at least gs) is per cpu */
733         rdmsrl(MSR_FS_BASE, tmpl);
734         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
735         rdmsrl(MSR_GS_BASE, tmpl);
736         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
737 }
738
739 static inline uint16_t vmx_read_ldt(void)
740 {
741         uint16_t ldt;
742         asm("sldt %0" : "=g"(ldt));
743         return ldt;
744 }
745
746 static unsigned long segment_base(uint16_t selector)
747 {
748         pseudodesc_t *gdt = &currentcpu->host_gdt;
749         struct desc_struct *d;
750         unsigned long table_base;
751         unsigned long v;
752
753         if (!(selector & ~3)) {
754                 return 0;
755         }
756
757         table_base = gdt->pd_base;
758
759         if (selector & 4) {           /* from ldt */
760                 uint16_t ldt_selector = vmx_read_ldt();
761
762                 if (!(ldt_selector & ~3)) {
763                         return 0;
764                 }
765
766                 table_base = segment_base(ldt_selector);
767         }
768         d = (struct desc_struct *)(table_base + (selector & ~7));
769         v = get_desc_base(d);
770         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
771                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
772         return v;
773 }
774
775 static inline unsigned long vmx_read_tr_base(void)
776 {
777         uint16_t tr;
778         asm("str %0" : "=g"(tr));
779         return segment_base(tr);
780 }
781
782 static void __vmx_setup_cpu(void)
783 {
784         pseudodesc_t *gdt = &currentcpu->host_gdt;
785         unsigned long sysenter_esp;
786         unsigned long tmpl;
787
788         /*
789          * Linux uses per-cpu TSS and GDT, so set these when switching
790          * processors.
791          */
792         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
793         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);   /* 22.2.4 */
794
795         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
796         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
797
798         rdmsrl(MSR_FS_BASE, tmpl);
799         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
800         rdmsrl(MSR_GS_BASE, tmpl);
801         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
802 }
803
804 /**
805  * vmx_get_cpu - called before using a cpu
806  * @vcpu: VCPU that will be loaded.
807  *
808  * Disables preemption. Call vmx_put_cpu() when finished.
809  */
810 static void vmx_get_cpu(struct vmx_vcpu *vcpu)
811 {
812         int cur_cpu = core_id();
813         handler_wrapper_t *w;
814
815         if (currentcpu->local_vcpu)
816                 panic("get_cpu: currentcpu->localvcpu was non-NULL");
817         if (currentcpu->local_vcpu != vcpu) {
818                 currentcpu->local_vcpu = vcpu;
819
820                 if (vcpu->cpu != cur_cpu) {
821                         if (vcpu->cpu >= 0) {
822                                 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
823                         } else
824                                 vmcs_clear(vcpu->vmcs);
825
826                         ept_sync_context(vcpu_get_eptp(vcpu));
827
828                         vcpu->launched = 0;
829                         vmcs_load(vcpu->vmcs);
830                         __vmx_setup_cpu();
831                         vcpu->cpu = cur_cpu;
832                 } else {
833                         vmcs_load(vcpu->vmcs);
834                 }
835         }
836 }
837
838 /**
839  * vmx_put_cpu - called after using a cpu
840  * @vcpu: VCPU that was loaded.
841  */
842 static void vmx_put_cpu(struct vmx_vcpu *vcpu)
843 {
844         if (core_id() != vcpu->cpu)
845                 panic("%s: core_id() %d != vcpu->cpu %d\n",
846                       __func__, core_id(), vcpu->cpu);
847
848         if (currentcpu->local_vcpu != vcpu)
849                 panic("vmx_put_cpu: asked to clear something not ours");
850
851         ept_sync_context(vcpu_get_eptp(vcpu));
852         vmcs_clear(vcpu->vmcs);
853         vcpu->cpu = -1;
854         currentcpu->local_vcpu = NULL;
855         //put_cpu();
856 }
857
858 /**
859  * vmx_dump_cpu - prints the CPU state
860  * @vcpu: VCPU to print
861  */
862 static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
863 {
864
865         unsigned long flags;
866
867         vmx_get_cpu(vcpu);
868         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
869         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
870         flags = vmcs_readl(GUEST_RFLAGS);
871         vmx_put_cpu(vcpu);
872
873         printk("--- Begin VCPU Dump ---\n");
874         printk("CPU %d VPID %d\n", vcpu->cpu, 0);
875         printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
876                vcpu->regs.tf_rip, flags);
877         printk("RAX 0x%016lx RCX 0x%016lx\n",
878                 vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
879         printk("RDX 0x%016lx RBX 0x%016lx\n",
880                 vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
881         printk("RSP 0x%016lx RBP 0x%016lx\n",
882                 vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
883         printk("RSI 0x%016lx RDI 0x%016lx\n",
884                 vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
885         printk("R8  0x%016lx R9  0x%016lx\n",
886                 vcpu->regs.tf_r8, vcpu->regs.tf_r9);
887         printk("R10 0x%016lx R11 0x%016lx\n",
888                 vcpu->regs.tf_r10, vcpu->regs.tf_r11);
889         printk("R12 0x%016lx R13 0x%016lx\n",
890                 vcpu->regs.tf_r12, vcpu->regs.tf_r13);
891         printk("R14 0x%016lx R15 0x%016lx\n",
892                 vcpu->regs.tf_r14, vcpu->regs.tf_r15);
893         printk("--- End VCPU Dump ---\n");
894
895 }
896
897 uint64_t construct_eptp(physaddr_t root_hpa)
898 {
899         uint64_t eptp;
900
901         /* set WB memory and 4 levels of walk.  we checked these in ept_init */
902         eptp = VMX_EPT_MEM_TYPE_WB |
903                (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
904         if (cpu_has_vmx_ept_ad_bits())
905                 eptp |= VMX_EPT_AD_ENABLE_BIT;
906         eptp |= (root_hpa & PAGE_MASK);
907
908         return eptp;
909 }
910
911 /**
912  * vmx_setup_initial_guest_state - configures the initial state of guest registers
913  */
914 static void vmx_setup_initial_guest_state(void)
915 {
916         unsigned long tmpl;
917         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
918                             X86_CR4_PGE | X86_CR4_OSFXSR;
919         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
920 #if 0
921         do we need it
922         if (boot_cpu_has(X86_FEATURE_PCID))
923                 cr4 |= X86_CR4_PCIDE;
924         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
925                 cr4 |= X86_CR4_OSXSAVE;
926 #endif
927         /* we almost certainly have this */
928         /* we'll go sour if we don't. */
929         if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
930                 cr4 |= X86_CR4_RDWRGSFS;
931
932         /* configure control and data registers */
933         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
934                                X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
935         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
936                                      X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
937         vmcs_writel(GUEST_CR3, rcr3());
938         vmcs_writel(GUEST_CR4, cr4);
939         vmcs_writel(CR4_READ_SHADOW, cr4);
940         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
941                     EFER_SCE /*| EFER_FFXSR*/);
942         vmcs_writel(GUEST_GDTR_BASE, 0);
943         vmcs_writel(GUEST_GDTR_LIMIT, 0);
944         vmcs_writel(GUEST_IDTR_BASE, 0);
945         vmcs_writel(GUEST_IDTR_LIMIT, 0);
946         vmcs_writel(GUEST_RIP, 0xdeadbeef);
947         vmcs_writel(GUEST_RSP, 0xdeadbeef);
948         vmcs_writel(GUEST_RFLAGS, 0x02);
949         vmcs_writel(GUEST_DR7, 0);
950
951         /* guest segment bases */
952         vmcs_writel(GUEST_CS_BASE, 0);
953         vmcs_writel(GUEST_DS_BASE, 0);
954         vmcs_writel(GUEST_ES_BASE, 0);
955         vmcs_writel(GUEST_GS_BASE, 0);
956         vmcs_writel(GUEST_SS_BASE, 0);
957         rdmsrl(MSR_FS_BASE, tmpl);
958         vmcs_writel(GUEST_FS_BASE, tmpl);
959
960         /* guest segment access rights */
961         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
962         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
963         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
964         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
965         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
966         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
967
968         /* guest segment limits */
969         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
970         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
971         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
972         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
973         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
974         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
975
976         /* configure segment selectors */
977         vmcs_write16(GUEST_CS_SELECTOR, 0);
978         vmcs_write16(GUEST_DS_SELECTOR, 0);
979         vmcs_write16(GUEST_ES_SELECTOR, 0);
980         vmcs_write16(GUEST_FS_SELECTOR, 0);
981         vmcs_write16(GUEST_GS_SELECTOR, 0);
982         vmcs_write16(GUEST_SS_SELECTOR, 0);
983         vmcs_write16(GUEST_TR_SELECTOR, 0);
984
985         /* guest LDTR */
986         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
987         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
988         vmcs_writel(GUEST_LDTR_BASE, 0);
989         vmcs_writel(GUEST_LDTR_LIMIT, 0);
990
991         /* guest TSS */
992         vmcs_writel(GUEST_TR_BASE, 0);
993         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
994         vmcs_writel(GUEST_TR_LIMIT, 0xff);
995
996         /* initialize sysenter */
997         vmcs_write32(GUEST_SYSENTER_CS, 0);
998         vmcs_writel(GUEST_SYSENTER_ESP, 0);
999         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1000
1001         /* other random initialization */
1002         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1003         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1004         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1005         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1006         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1007 }
1008
1009 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
1010 {
1011         int f = sizeof(unsigned long);
1012         /*
1013          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1014          * have the write-low and read-high bitmap offsets the wrong way round.
1015          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1016          */
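        /* For example, MSR_KERNEL_GS_BASE (0xc0000102) falls in the high
         * range: after masking with 0x1fff it becomes bit 0x102 of the
         * read-high (offset 0x400) and write-high (offset 0xc00) regions. */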
1017         if (msr <= 0x1fff) {
1018                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
1019                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
1020         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1021                 msr &= 0x1fff;
1022                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
1023                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
1024         }
1025 }
1026
1027 static void vcpu_print_autoloads(struct vmx_vcpu *vcpu)
1028 {
1029         struct vmx_msr_entry *e;
1030         int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1031         printk("Host Autoloads:\n-------------------\n");
1032         for (int i = 0; i < sz; i++) {
1033                 e = &vcpu->msr_autoload.host[i];
1034                 printk("\tMSR 0x%08x: %p\n", e->index, e->value);
1035         }
1036         printk("Guest Autoloads:\n-------------------\n");
1037         for (int i = 0; i < sz; i++) {
1038                 e = &vcpu->msr_autoload.guest[i];
1039                 printk("\tMSR 0x%08x %p\n", e->index, e->value);
1040         }
1041 }
1042
1043 static void dumpmsrs(void)
1044 {
1045         int i;
1046         int set[] = {
1047                 MSR_LSTAR,
1048                 MSR_FS_BASE,
1049                 MSR_GS_BASE,
1050                 MSR_KERNEL_GS_BASE,
1051                 MSR_SFMASK,
1052                 MSR_IA32_PEBS_ENABLE
1053         };
1054         for(i = 0; i < ARRAY_SIZE(set); i++) {
1055                 printk("%p: %p\n", set[i], read_msr(set[i]));
1056         }
1057         printk("core id %d\n", core_id());
1058 }
1059
1060 /* Notes on autoloading.  We can't autoload FS_BASE or GS_BASE, according to the
1061  * manual, but that's because they are automatically saved and restored when all
1062  * of the other architectural registers are saved and restored, such as cs, ds,
1063  * es, and other fun things. (See 24.4.1).  We need to make sure we don't
1064  * accidentally intercept them too, since they are magically autoloaded.
1065  *
1066  * We'll need to be careful of any MSR we neither autoload nor intercept
1067  * whenever we vmenter/vmexit, and we intercept by default.
1068  *
1069  * Other MSRs, such as MSR_IA32_PEBS_ENABLE, only work on certain
1070  * architectures. */
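/* Concretely, the three autoload pointers written in setup_msr() below map
 * the arrays like so: VM_EXIT_MSR_STORE_ADDR -> msr_autoload.guest (guest
 * values saved on exit), VM_EXIT_MSR_LOAD_ADDR -> msr_autoload.host (host
 * values restored on exit), and VM_ENTRY_MSR_LOAD_ADDR -> msr_autoload.guest
 * (guest values loaded on entry). */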
1071 static void setup_msr(struct vmx_vcpu *vcpu)
1072 {
1073         struct vmx_msr_entry *e;
1074         int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1075         int i;
1076
1077         static_assert((sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs)) <=
1078                       NR_AUTOLOAD_MSRS);
1079
1080         vcpu->msr_autoload.nr = sz;
1081
1082         /* Since PADDR(msr_bitmap) is non-zero, and the bitmap is all 0xff, we now
1083          * intercept all MSRs */
1084         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1085
1086         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1087         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1088         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1089
1090         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1091         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1092         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1093
1094         for (i = 0; i < sz; i++) {
1095                 uint64_t val;
1096
1097                 e = &vcpu->msr_autoload.host[i];
1098                 e->index = autoloaded_msrs[i];
1099                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1100                 rdmsrl(e->index, val);
1101                 e->value = val;
1102                 printk("host index %p val %p\n", e->index, e->value);
1103
1104                 e = &vcpu->msr_autoload.guest[i];
1105                 e->index = autoloaded_msrs[i];
1106                 e->value = 0xDEADBEEF;
1107                 printk("guest index %p val %p\n", e->index, e->value);
1108         }
1109 }
1110
1111 /**
1112  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1113  */
1114 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
1115 {
1116         vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
1117         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1118
1119         /* Control */
1120         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1121                 vmcs_config.pin_based_exec_ctrl);
1122
1123         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1124                 vmcs_config.cpu_based_exec_ctrl);
1125
1126         if (cpu_has_secondary_exec_ctrls()) {
1127                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1128                              vmcs_config.cpu_based_2nd_exec_ctrl);
1129         }
1130
1131         vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));
1132
1133         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1134         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1135         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1136
1137         setup_msr(vcpu);
1138
1139         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1140
1141         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1142         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1143
1144         vmcs_writel(CR0_GUEST_HOST_MASK, 0); // ~0ul);
1145         vmcs_writel(CR4_GUEST_HOST_MASK, 0); // ~0ul);
1146
1147         //kvm_write_tsc(&vmx->vcpu, 0);
1148         vmcs_writel(TSC_OFFSET, 0);
1149
1150         vmx_setup_constant_host_state();
1151 }
1152
1153 /**
1154  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1155  *
1156  * Returns: A new VCPU structure
1157  */
1158 struct vmx_vcpu *vmx_create_vcpu(struct proc *p)
1159 {
1160         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1161         if (!vcpu) {
1162                 return NULL;
1163         }
1164
1165         memset(vcpu, 0, sizeof(*vcpu));
1166
1167         vcpu->proc = p; /* uncounted (weak) reference */
1168         vcpu->vmcs = vmx_alloc_vmcs();
1169         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1170         if (!vcpu->vmcs)
1171                 goto fail_vmcs;
1172
1173         vcpu->cpu = -1;
1174
1175         vmx_get_cpu(vcpu);
1176         vmx_setup_vmcs(vcpu);
1177         vmx_setup_initial_guest_state();
1178         vmx_put_cpu(vcpu);
1179
1180         return vcpu;
1181
1182 fail_vmcs:
1183         kfree(vcpu);
1184         return NULL;
1185 }
1186
1187 /**
1188  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1189  * @vcpu: the VCPU to destroy
1190  */
1191 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
1192 {
1193         vmx_free_vmcs(vcpu->vmcs);
1194         kfree(vcpu);
1195 }
1196
1197 /**
1198  * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1199  *
1200  * In the contexts where this is used the vcpu pointer should never be NULL.
1201  */
1202 static inline struct vmx_vcpu *vmx_current_vcpu(void)
1203 {
1204         struct vmx_vcpu *vcpu = currentcpu->local_vcpu;
1205         if (!vcpu)
1206                 panic("Core has no vcpu!");
1207         return vcpu;
1208 }
1209
1210 /**
1211  * vmx_run_vcpu - launches the CPU into non-root mode
1212  * We ONLY support 64-bit guests.
1213  * @vcpu: the vmx instance to launch
1214  */
1215 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1216 {
1217         asm(
1218                 /* Store host registers */
1219                 "push %%rdx; push %%rbp;"
1220                 "push %%rcx \n\t" /* placeholder for guest rcx */
1221                 "push %%rcx \n\t"
1222                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1223                 "je 1f \n\t"
1224                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1225                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1226                 "1: \n\t"
1227                 /* Reload cr2 if changed */
1228                 "mov %c[cr2](%0), %%rax \n\t"
1229                 "mov %%cr2, %%rdx \n\t"
1230                 "cmp %%rax, %%rdx \n\t"
1231                 "je 2f \n\t"
1232                 "mov %%rax, %%cr2 \n\t"
1233                 "2: \n\t"
1234                 /* Check if vmlaunch or vmresume is needed */
1235                 "cmpl $0, %c[launched](%0) \n\t"
1236                 /* Load guest registers.  Don't clobber flags. */
1237                 "mov %c[rax](%0), %%rax \n\t"
1238                 "mov %c[rbx](%0), %%rbx \n\t"
1239                 "mov %c[rdx](%0), %%rdx \n\t"
1240                 "mov %c[rsi](%0), %%rsi \n\t"
1241                 "mov %c[rdi](%0), %%rdi \n\t"
1242                 "mov %c[rbp](%0), %%rbp \n\t"
1243                 "mov %c[r8](%0),  %%r8  \n\t"
1244                 "mov %c[r9](%0),  %%r9  \n\t"
1245                 "mov %c[r10](%0), %%r10 \n\t"
1246                 "mov %c[r11](%0), %%r11 \n\t"
1247                 "mov %c[r12](%0), %%r12 \n\t"
1248                 "mov %c[r13](%0), %%r13 \n\t"
1249                 "mov %c[r14](%0), %%r14 \n\t"
1250                 "mov %c[r15](%0), %%r15 \n\t"
1251                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1252
1253                 /* Enter guest mode */
1254                 "jne .Llaunched \n\t"
1255                 ASM_VMX_VMLAUNCH "\n\t"
1256                 "jmp .Lkvm_vmx_return \n\t"
1257                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1258                 ".Lkvm_vmx_return: "
1259                 /* Save guest registers, load host registers, keep flags */
1260                 "mov %0, %c[wordsize](%%rsp) \n\t"
1261                 "pop %0 \n\t"
1262                 "mov %%rax, %c[rax](%0) \n\t"
1263                 "mov %%rbx, %c[rbx](%0) \n\t"
1264                 "popq %c[rcx](%0) \n\t"
1265                 "mov %%rdx, %c[rdx](%0) \n\t"
1266                 "mov %%rsi, %c[rsi](%0) \n\t"
1267                 "mov %%rdi, %c[rdi](%0) \n\t"
1268                 "mov %%rbp, %c[rbp](%0) \n\t"
1269                 "mov %%r8,  %c[r8](%0) \n\t"
1270                 "mov %%r9,  %c[r9](%0) \n\t"
1271                 "mov %%r10, %c[r10](%0) \n\t"
1272                 "mov %%r11, %c[r11](%0) \n\t"
1273                 "mov %%r12, %c[r12](%0) \n\t"
1274                 "mov %%r13, %c[r13](%0) \n\t"
1275                 "mov %%r14, %c[r14](%0) \n\t"
1276                 "mov %%r15, %c[r15](%0) \n\t"
1277                 "mov %%rax, %%r10 \n\t"
1278                 "mov %%rdx, %%r11 \n\t"
1279
1280                 "mov %%cr2, %%rax   \n\t"
1281                 "mov %%rax, %c[cr2](%0) \n\t"
1282
1283                 "pop  %%rbp; pop  %%rdx \n\t"
1284                 "setbe %c[fail](%0) \n\t"
1285                 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1286                 "mov %%rax, %%ds \n\t"
1287                 "mov %%rax, %%es \n\t"
1288               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1289                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1290                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1291                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1292                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1293                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1294                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1295                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1296                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1297                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1298                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1299                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1300                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1301                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1302                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1303                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1304                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1305                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1306                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1307                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1308                 [wordsize]"i"(sizeof(unsigned long))
1309               : "cc", "memory"
1310                 , "rax", "rbx", "rdi", "rsi"
1311                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1312         );
1313
1314         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1315         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1316         printd("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1317                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1318         /* FIXME: do we need to set up other flags? */
1319         vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1320                       X86_EFLAGS_IF | 0x2;
1321
1322         vcpu->regs.tf_cs = GD_UT;
1323         vcpu->regs.tf_ss = GD_UD;
1324
1325         vcpu->launched = 1;
1326
1327         if (vcpu->fail) {
1328                 printk("failure detected (err %x)\n",
1329                        vmcs_read32(VM_INSTRUCTION_ERROR));
1330                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1331         }
1332
1333         return vmcs_read32(VM_EXIT_REASON);
1334
1335 #if 0
1336         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1337         vmx_complete_atomic_exit(vmx);
1338         vmx_recover_nmi_blocking(vmx);
1339         vmx_complete_interrupts(vmx);
1340 #endif
1341 }
1342
1343 static void vmx_step_instruction(void)
1344 {
1345         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1346                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1347 }
1348
1349 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
1350 {
1351         unsigned long gva, gpa;
1352         int exit_qual, ret = -1;
1353         page_t *page;
1354
1355         vmx_get_cpu(vcpu);
1356         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1357         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1358         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1359
1360         vmx_put_cpu(vcpu);
1361
1362         int prot = 0;
1363         prot |= exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
1364         prot |= exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
1365         prot |= exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
1366         ret = handle_page_fault(current, gpa, prot);
1367
1368         if (ret) {
1369                 printk("EPT page fault failure %d, GPA: %p, GVA: %p\n", ret, gpa, gva);
1370                 vmx_dump_cpu(vcpu);
1371         }
1372
1373         return ret;
1374 }
1375
1376 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
1377 {
1378         unsigned int eax, ebx, ecx, edx;
1379
1380         eax = vcpu->regs.tf_rax;
1381         ecx = vcpu->regs.tf_rcx;
1382         cpuid(eax, ecx, &eax, &ebx, &ecx, &edx);
1383         vcpu->regs.tf_rax = eax;
1384         vcpu->regs.tf_rbx = ebx;
1385         vcpu->regs.tf_rcx = ecx;
1386         vcpu->regs.tf_rdx = edx;
1387 }
1388
1389 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
1390 {
1391         uint32_t intr_info;
1392
1393         vmx_get_cpu(vcpu);
1394         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1395         vmx_put_cpu(vcpu);
1396
1397         printk("vmx (vcpu %p): got an exception\n", vcpu);
1398         printk("vmx (vcpu %p): pid %d\n", vcpu, vcpu->proc->pid);
1399         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1400                 return 0;
1401         }
1402
1403         printk("unhandled nmi, intr_info %x\n", intr_info);
1404         return -EIO;
1405 }
1406
1407 /**
1408  * vmx_launch - the main loop for a VMX Dune process
1409  * @conf: the launch configuration
1410  */
1411 int vmx_launch(uint64_t rip, uint64_t rsp, uint64_t cr3)
1412 {
1413         int ret;
1414         struct vmx_vcpu *vcpu;
1415         int errors = 0;
1416
1417         printd("RUNNING: %s: rip %p rsp %p cr3 %p \n",
1418                __func__, rip, rsp, cr3);
1419         /* TODO: dirty hack til we have VMM contexts */
1420         vcpu = current->vmm.guest_pcores[0];
1421         if (!vcpu) {
1422                 printk("Failed to get a CPU!\n");
1423                 return -ENOMEM;
1424         }
1425
1426         /* We need to prep the host's autoload region for our current core.  Right
1427          * now, the only autoloaded MSR that varies at runtime (in this case, per
1428          * core) is KERN_GS_BASE. */
1429         rdmsrl(MSR_KERNEL_GS_BASE, vcpu->msr_autoload.host[0].value);
1430         /* If cr3 is set, it means 'set everything'; otherwise it means 'start where you left off'. */
1431         if (cr3) {
1432                 vmx_get_cpu(vcpu);
1433                 vmcs_writel(GUEST_RIP, rip);
1434                 vmcs_writel(GUEST_RSP, rsp);
1435                 vmcs_writel(GUEST_CR3, cr3);
1436                 vmx_put_cpu(vcpu);
1437         }
1438
1439         vcpu->ret_code = -1;
1440
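        /* Main exit loop: pin the VMCS to this core, enter the guest with IRQs
         * disabled, then dispatch on the exit reason.  We keep re-entering the
         * guest until an exit we can't (or don't) handle sets vcpu->shutdown. */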
1441         while (1) {
1442                 vmx_get_cpu(vcpu);
1443
1444                 // TODO: manage the fpu when we restart.
1445
1446                 // TODO: see if we need to exit before we go much further.
1447                 disable_irq();
1448                 //dumpmsrs();
1449                 ret = vmx_run_vcpu(vcpu);
1450                 //dumpmsrs();
1451                 enable_irq();
1452                 vmx_put_cpu(vcpu);
1453
1454                 if (ret == EXIT_REASON_VMCALL) {
1455                         if (current->vmm.flags & VMM_VMCALL_PRINTF) {
1456                                 uint8_t byte = vcpu->regs.tf_rdi;
1457                                 printd("System call\n");
1458 #ifdef DEBUG
1459                                 vmx_dump_cpu(vcpu);
1460 #endif
1461                                 printk("%c", byte);
1462                                 // VMCALL is a 3-byte insn (0f 01 c1); manually advance RIP past it
1463                                 vmx_get_cpu(vcpu);
1464                                 vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + 3);
1465                                 vmx_put_cpu(vcpu);
1466                         } else {
1467                                 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1468                                 uint8_t byte = vcpu->regs.tf_rdi;
1469                                 printk("%p %c\n", vcpu->regs.tf_rdi, byte);
1470                                 vmx_dump_cpu(vcpu);
1471                                 printd("system call! WTF\n");
1472                         }
1473                 } else if (ret == EXIT_REASON_CR_ACCESS) {
1474                         show_cr_access(vmcs_read32(EXIT_QUALIFICATION));
1475                         vmx_dump_cpu(vcpu);
1476                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1477                 } else if (ret == EXIT_REASON_CPUID) {
1478                         vmx_handle_cpuid(vcpu);
1479                         vmx_get_cpu(vcpu);
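                        /* CPUID is a two-byte instruction (0f a2); step past it. */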
1480                         vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + 2);
1481                         vmx_put_cpu(vcpu);
1482                 } else if (ret == EXIT_REASON_EPT_VIOLATION) {
1483                         if (vmx_handle_ept_violation(vcpu))
1484                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1485                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1486                         if (vmx_handle_nmi_exception(vcpu))
1487                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1488                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1489                         printd("External interrupt\n");
1490                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1491                 } else {
1492                         printk("unhandled exit: reason 0x%x, exit qualification 0x%x\n",
1493                                ret, vmcs_read32(EXIT_QUALIFICATION));
1494                         vmx_dump_cpu(vcpu);
1495                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1496                 }
1497
1498                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1499                  * similar to how proc_restartcore/smp_idle only restart the pcpui
1500                  * cur_ctx, we need to do the same, via the VMCS resume business. */
1501
1502                 if (vcpu->shutdown)
1503                         break;
1504         }
1505
1506         printd("RETURN. ip %016lx sp %016lx\n",
1507                 vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1508 //      hexdump((void *)vcpu->regs.tf_rsp, 128 * 8);
1509         /*
1510          * Return both the reason for the shutdown and a status value.
1511          * The exit() and exit_group() system calls only need 8 bits for
1512          * the status but we allow 16 bits in case we might want to
1513          * return more information for one of the other shutdown reasons.
1514          */
1515         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
1516
1517         return ret;
1518 }
1519
1520 /**
1521  * __vmx_enable - low-level enable of VMX mode on the current CPU
1522  * @vmxon_buf: an opaque buffer for use as the VMXON region
1523  */
1524 static int __vmx_enable(struct vmcs *vmxon_buf)
1525 {
1526         uint64_t phys_addr = PADDR(vmxon_buf);
1527         uint64_t old, test_bits;
1528
1529         if (rcr4() & X86_CR4_VMXE) {
1530                 panic("CR4.VMXE was already set; this should never happen");
1531                 return -EBUSY;
1532         }
1533
1534         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1535
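        /* VMXON is only legal once IA32_FEATURE_CONTROL has the lock bit set along
         * with the 'VMXON outside SMX' enable bit.  Firmware normally sets and
         * locks these; if it hasn't, we try to set them ourselves below. */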
1536         test_bits = FEATURE_CONTROL_LOCKED;
1537         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1538
1539         if (0) /* tboot_enabled() */
1540                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1541
1542         if ((old & test_bits) != test_bits) {
1543                 /* If it's locked, then trying to set it will cause a GPF.
1544                  * No Dune for you!
1545                  */
1546                 if (old & FEATURE_CONTROL_LOCKED) {
1547                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1548                         return -1;
1549                 }
1550
1551                 /* enable and lock */
1552                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1553         }
1554         lcr4(rcr4() | X86_CR4_VMXE);
1555
1556         __vmxon(phys_addr);
1557         vpid_sync_vcpu_global();        /* good idea, even if we aren't using vpids */
1558         ept_sync_global();
1559
1560         return 0;
1561 }
1562
1563 /**
1564  * vmx_enable - enables VMX mode on the current CPU
1565  *
1566  * Enters VMX operation using the per-cpu scratchpad for VMXON that
1567  * setup_vmxarea() allocated earlier.
1568  */
1569 static void vmx_enable(void)
1570 {
1571         struct vmcs *vmxon_buf = currentcpu->vmxarea;
1572         int ret;
1573
1574         ret = __vmx_enable(vmxon_buf);
1575         if (ret)
1576                 goto failed;
1577
1578         currentcpu->vmx_enabled = 1;
1579         // TODO: do we need this?
1580         store_gdt(&currentcpu->host_gdt);
1581
1582         printk("VMX enabled on CPU %d\n", core_id());
1583         return;
1584
1585 failed:
1586         printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1587 }
1588
1589 /**
1590  * vmx_disable - disables VMX mode on the current CPU
1591  */
1592 static void vmx_disable(void *unused)
1593 {
1594         if (currentcpu->vmx_enabled) {
1595                 __vmxoff();
1596                 lcr4(rcr4() & ~X86_CR4_VMXE);
1597                 currentcpu->vmx_enabled = 0;
1598         }
1599 }
1600
1601 /* Probe the cpus to see which ones can do vmx.
1602  * Return FALSE if the machine lacks VT-x support, TRUE otherwise.
1603  */
1604 static bool probe_cpu_vmx(void)
1605 {
1606         /* The best way to test this code is:
1607          * wrmsr -p <cpu> 0x3a 1
1608          * This locks vmx off; then load the driver.
1609          * Frequently, however, systems have all 0x3a registers set to 5,
1610          * meaning testing is impossible, as vmx cannot be disabled.
1611          * We have to simulate it being unavailable in most cases.
1612          * The 'test' variable provides an easy way to simulate
1613          * unavailability of vmx on some, none, or all cpus.
1614          */
1615         if (!cpu_has_vmx()) {
1616                 printk("Machine does not support VT-x\n");
1617                 return FALSE;
1618         } else {
1619                 printk("Machine supports VT-x\n");
1620                 return TRUE;
1621         }
1622 }
1623
1624 static void setup_vmxarea(void)
1625 {
1626         struct vmcs *vmxon_buf;
1627         printd("Set up vmxarea for cpu %d\n", core_id());
1628         vmxon_buf = __vmx_alloc_vmcs(core_id());
1629         if (!vmxon_buf) {
1630                 printk("setup_vmxarea failed on node %d\n", core_id());
1631                 return;
1632         }
1633         currentcpu->vmxarea = vmxon_buf;
1634 }
1635
1636 static int ept_init(void)
1637 {
1638         if (!cpu_has_vmx_ept()) {
1639                 printk("VMX doesn't support EPT!\n");
1640                 return -1;
1641         }
1642         if (!cpu_has_vmx_eptp_writeback()) {
1643                 printk("VMX EPT doesn't support WB memory!\n");
1644                 return -1;
1645         }
1646         if (!cpu_has_vmx_ept_4levels()) {
1647                 printk("VMX EPT doesn't support 4 level walks!\n");
1648                 return -1;
1649         }
1650         switch (arch_max_jumbo_page_shift()) {
1651                 case PML3_SHIFT:
1652                         if (!cpu_has_vmx_ept_1g_page()) {
1653                                 printk("VMX EPT doesn't support 1 GB pages!\n");
1654                                 return -1;
1655                         }
1656                         break;
1657                 case PML2_SHIFT:
1658                         if (!cpu_has_vmx_ept_2m_page()) {
1659                                 printk("VMX EPT doesn't support 2 MB pages!\n");
1660                                 return -1;
1661                         }
1662                         break;
1663                 default:
1664                         printk("Unexpected jumbo page size %d\n",
1665                                arch_max_jumbo_page_shift());
1666                         return -1;
1667         }
1668         if (!cpu_has_vmx_ept_ad_bits()) {
1669                 printk("VMX EPT doesn't support accessed/dirty!\n");
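                /* Not fatal: the hardware won't maintain accessed/dirty for us,
                 * so record them in x86_ept_pte_fix_ups, which the EPT PTE
                 * construction code presumably ORs into new entries. */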
1670                 x86_ept_pte_fix_ups |= EPTE_A | EPTE_D;
1671         }
1672         if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
1673                 printk("VMX EPT can't invalidate PTEs/TLBs!\n");
1674                 return -1;
1675         }
1676
1677         return 0;
1678 }
1679
1680 /**
1681  * intel_vmm_init sets up the physical-core data areas that are required to run
1682  * a VM at all.  These data areas are not connected to any specific user process.
1683  * Instead, they externalize what would otherwise be a very large ball of state
1684  * that would live inside the CPU.
1685  */
1686 int intel_vmm_init(void)
1687 {
1688         int ret;
1689
1690         if (!probe_cpu_vmx()) {
1691                 return -EOPNOTSUPP;
1692         }
1693
1694         setup_vmcs_config(&ret);
1695
1696         if (ret) {
1697                 printk("setup_vmcs_config failed: %d\n", ret);
1698                 return ret;
1699         }
1700
1701         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
1702         if (!msr_bitmap) {
1703                 printk("Could not allocate msr_bitmap\n");
1704                 return -ENOMEM;
1705         }
1706         /* FIXME: do we need APIC virtualization (flexpriority?) */
1707
1708         memset(msr_bitmap, 0xff, PAGE_SIZE);
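        /* A set bit in the MSR bitmap forces a VM exit on that MSR access, so
         * 0xff everywhere means 'intercept everything' by default. */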
1709         /* These are the only MSRs that are not autoloaded and not intercepted */
1710         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
1711         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
1712         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_EFER);
1713
1714         if ((ret = ept_init())) {
1715                 printk("EPT init failed, %d\n", ret);
1716                 return ret;
1717         }
1718         printk("VMX setup succeeded\n");
1719         return 0;
1720 }
1721
1722 int intel_vmm_pcpu_init(void)
1723 {
1724         setup_vmxarea();
1725         vmx_enable();
1726         return 0;
1727 }