1 /**
2  *  vmx.c - The Intel VT-x driver for Dune
3  *
4  * This file is derived from Linux KVM VT-x support.
5  * Copyright (C) 2006 Qumranet, Inc.
6  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7  *
8  * Original Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This modified version is simpler because it avoids the following
13  * features that are not requirements for Dune:
14  *  * Real-mode emulation
15  *  * Nested VT-x support
16  *  * I/O hardware emulation
17  *  * Any of the more esoteric X86 features and registers
18  *  * KVM-specific functionality
19  *
20  * In essence we provide only the minimum functionality needed to run
21  * a process in vmx non-root mode rather than the full hardware emulation
22  * needed to support an entire OS.
23  *
24  * This driver is a research prototype and as such has the following
25  * limitations:
26  *
27  * FIXME: Backward compatibility is currently a non-goal, and only recent
28  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
29  * driver.
30  *
31  * FIXME: Eventually we should handle concurrent users of VT-x more
32  * gracefully instead of requiring exclusive access. This would allow
33  * Dune to interoperate with KVM and other HV solutions.
34  *
35  * FIXME: We need to support hotplugged physical CPUs.
36  *
37  * Authors:
38  *   Adam Belay   <abelay@stanford.edu>
39  */
40
41 /* Basic flow.
42  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
43  * You're left with the feeling that they got partway through and realized they needed one for each of:
44  *
45  * 1) your CPU is going to be capable of running VMs, and you need state for that.
46  *
47  * 2) you're about to start a guest, and you need state for that.
48  *
49  * So there is "get the cpu set up to be able to run VMs" stuff, and
50  * "now let's start a guest" stuff.  In Akaros, CPUs will always be set up
51  * to run a VM if that is possible. Processes can flip themselves into
52  * a VM and that will require another VMCS.
53  *
54  * So: at kernel startup time, the SMP boot stuff calls
55  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
56  * in the case of this file is intel_vmm_init. That runs some code
57  * that sets up stuff for ALL sockets, based on the capabilities of
58  * the socket it runs on. If any cpu supports vmx, it assumes they all
59  * do. That's a realistic assumption. So the call_function_all is kind
60  * of stupid, really; it could just see what's on the current cpu and
61  * assume it's on all. HOWEVER: there are systems in the wilde that
62  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
63  * might as well allow for the chance that wel'll only all VMMCPs on a
64  * subset (not implemented yet however).  So: probe all CPUs, get a
65  * count of how many support VMX and, for now, assume they all do
66  * anyway.
67  *
68  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
69  * which contains all the naughty bits settings for all the cpus that can run a VM.
70  * Realistically, all VMX-capable cpus in a system will have identical configurations.
71  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
72  *
73  * configure the msr_bitmap. This is the bitmap of MSRs which the
74  * guest can manipulate.  Currently, we only allow GS and FS base.
75  *
76  * Reserve bit 0 in the vpid bitmap as guests can not use that
77  *
78  * Set up what we call the vmxarea. The vmxarea is per-cpu, not
79  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
80  * there is the revision id. The vmxarea is page-sized per cpu and
81  * page-aligned. Note that it can be smaller, but why bother? We know
82  * the max size and alignment, and it's convenient.
83  *
84  * Now that it is set up, enable vmx on all cpus. This involves
85  * testing VMXE in cr4, to see if we've been here before (TODO: delete
86  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
87  * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
88  * instruction), and syncing vpid's and ept's.  Now the CPU is ready
89  * to host guests.
90  *
91  * Setting up a guest.
92  * We divide this into two things: vmm_proc_init and vm_run.
93  * Currently, on Intel, vmm_proc_init does nothing.
94  *
95  * vm_run is really complicated. It is called with a coreid, rip, rsp,
96  * cr3, and flags.  On intel, it calls vmx_launch. In vmx_launch, a
97  * non-zero cr3 means 'set everything': load the given rip, rsp, and
98  * cr3 into the guest.  A cr3 of 0 means 'start where you left off',
99  * and the existing guest state is used as-is.
100  *
101  * The sequence of operations:
102  * create a vcpu
103  * while (1) {
104  * get a vcpu
105  * disable irqs (required or you can't enter the VM)
106  * vmx_run_vcpu()
107  * enable irqs
108  * manage the vm exit
109  * }
110  *
111  * get a vcpu
112  * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
113  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
114  *
115  * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
116  * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
117  * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
118  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
119  *
120  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
121  * of inline assembly with embedded CPP crap. I suspect we'll want to
122  * un-inline it someday, but maybe not.  It's called with a vcpu
123  * struct from which it loads guest state, and to which it stores
124  * non-virtualized host state. It issues a vmlaunch or vmresume
125  * instruction as appropriate and, on return, checks whether the
126  * launch/resume itself had an error. Note this is NOT the
127  * same as an error while in the virtual machine; this is an error in
128  * startup due to misconfiguration. Depending on what is returned it's
129  * either a failed vm startup or an exit for any of many reasons.
130  *
131  */
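
/* A rough sketch of the guest-side flow described above, using functions
 * defined later in this file.  Purely illustrative: error handling and the
 * exit-reason dispatch are omitted, and handle_exit() is a hypothetical
 * helper standing in for the switch statement in vmx_launch().
 *
 *	struct vmx_vcpu *vcpu = vmx_create_vcpu(p);
 *
 *	while (1) {
 *		vmx_get_cpu(vcpu);		// VMPTRLD; migrate the VMCS if needed
 *		disable_irq();			// required, or you can't enter the VM
 *		ret = vmx_run_vcpu(vcpu);	// vmlaunch or vmresume
 *		enable_irq();
 *		vmx_put_cpu(vcpu);		// VMCLEAR; detach from this core
 *		if (handle_exit(vcpu, ret))	// hypothetical exit dispatcher
 *			break;
 *	}
 */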
132
133 /* Basically: only rename those globals that might conflict
134  * with existing names. Leave all else the same.
135  * This code is more modern than the other code, yet still
136  * well encapsulated, it seems.
137  */
138 #include <kmalloc.h>
139 #include <string.h>
140 #include <stdio.h>
141 #include <assert.h>
142 #include <error.h>
143 #include <pmap.h>
144 #include <sys/queue.h>
145 #include <smp.h>
146 #include <kref.h>
147 #include <atomic.h>
148 #include <alarm.h>
149 #include <event.h>
150 #include <umem.h>
151 #include <bitops.h>
152 #include <arch/types.h>
153 #include <syscall.h>
154
155 #include "vmx.h"
156 #include "../vmm.h"
157
158 #include "cpufeature.h"
159
160 #define currentcpu (&per_cpu_info[core_id()])
161
162 /*
163  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
164  * away by decrementing the array size.
165  */
166 static const uint32_t vmx_msr_index[] = {
167 #ifdef CONFIG_X86_64
168         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
169 #endif
170         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
171 };
172 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
173
174 static unsigned long *msr_bitmap;
175
176 int x86_ept_pte_fix_ups = 0;
177
178 struct vmx_capability vmx_capability;
179 struct vmcs_config vmcs_config;
180
181 void ept_flush(uint64_t eptp)
182 {
183         ept_sync_context(eptp);
184 }
185
186 static void vmcs_clear(struct vmcs *vmcs)
187 {
188         uint64_t phys_addr = PADDR(vmcs);
189         uint8_t error;
190
191         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
192                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
193                       : "cc", "memory");
194         if (error)
195                 printk("vmclear fail: %p/%llx\n",
196                        vmcs, phys_addr);
197 }
198
199 static void vmcs_load(struct vmcs *vmcs)
200 {
201         uint64_t phys_addr = PADDR(vmcs);
202         uint8_t error;
203
204         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
205                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
206                         : "cc", "memory");
207         if (error)
208                 printk("vmptrld %p/%llx failed\n",
209                        vmcs, phys_addr);
210 }
211
212 /* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */
213 static physaddr_t vmcs_get_current(void)
214 {
215         physaddr_t vmcs_paddr;
216         /* RAX contains the addr of the location to store the VMCS pointer.  The
217          * compiler doesn't know the ASM will deref that pointer, hence the =m */
218         asm volatile (ASM_VMX_VMPTRST_RAX : "=m"(vmcs_paddr) : "a"(&vmcs_paddr));
219         return vmcs_paddr;
220 }
221
222 __always_inline unsigned long vmcs_readl(unsigned long field)
223 {
224         unsigned long value;
225
226         asm volatile (ASM_VMX_VMREAD_RDX_RAX
227                       : "=a"(value) : "d"(field) : "cc");
228         return value;
229 }
230
231 __always_inline uint16_t vmcs_read16(unsigned long field)
232 {
233         return vmcs_readl(field);
234 }
235
236 static __always_inline uint32_t vmcs_read32(unsigned long field)
237 {
238         return vmcs_readl(field);
239 }
240
241 static __always_inline uint64_t vmcs_read64(unsigned long field)
242 {
243 #ifdef CONFIG_X86_64
244         return vmcs_readl(field);
245 #else
246         return vmcs_readl(field) | ((uint64_t)vmcs_readl(field+1) << 32);
247 #endif
248 }
249
250 void vmwrite_error(unsigned long field, unsigned long value)
251 {
252         printk("vmwrite error: reg %lx value %lx (err %d)\n",
253                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
254 }
255
256 void vmcs_writel(unsigned long field, unsigned long value)
257 {
258         uint8_t error;
259
260         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
261                        : "=q"(error) : "a"(value), "d"(field) : "cc");
262         if (error)
263                 vmwrite_error(field, value);
264 }
265
266 static void vmcs_write16(unsigned long field, uint16_t value)
267 {
268         vmcs_writel(field, value);
269 }
270
271 static void vmcs_write32(unsigned long field, uint32_t value)
272 {
273         vmcs_writel(field, value);
274 }
275
276 static void vmcs_write64(unsigned long field, uint64_t value)
277 {
278         vmcs_writel(field, value);
279 }
280
281 /*
282  * A note on Things You Can't Make Up.
283  * or
284  * "George, you can type this shit, but you can't say it" -- Harrison Ford
285  *
286  * There are 5 VMCS 32-bit words that control guest permissions. If
287  * you set these correctly, you've got a guest that will behave. If
288  * you get even one bit wrong, you've got a guest that will chew your
289  * leg off. Some bits must be 1, some must be 0, and some can be set
290  * either way. To add to the fun, the docs are sort of a docudrama or,
291  * as the quote goes, "interesting if true."
292  *
293  * To determine what bit can be set in what VMCS 32-bit control word,
294  * there are 5 corresponding 64-bit MSRs.  And, to make it even more
295  * fun, the standard set of MSRs have errors in them, i.e. report
296  * incorrect values, for legacy reasons, and so you are supposed to
297  * "look around" to another set, which have correct bits in
298  * them. There are four such 'correct' registers, and they have _TRUE_
299  * in the names as you can see below. We test for the value of VMCS
300  * control bits in the _TRUE_ registers if possible. The fifth
301  * register, CPU Secondary Exec Controls, which came later, needs no
302  * _TRUE_ variant.
303  *
304  * For each MSR, the high 32 bits tell you what bits can be "1" by a
305  * "1" in that position; the low 32 bits tell you what bit can be "0"
306  * by a "0" in that position. So, for each of 32 bits in a given VMCS
307  * control word, there is a pair of bits in an MSR that tells you what
308  * values it can take. The two bits, of which there are *four*
309  * combinations, describe the *three* possible operations on a
310  * bit. Taken together, they form an untruth table: there are
311  * three possibilities: the VMCS bit can be set to 0 or 1, or it can
312  * only be 0, or only 1. The fourth combination is not supposed to
313  * happen.
314  *
315  * So: there is the 1 bit from the upper 32 bits of the msr.
316  * If this bit is set, then the bit can be 1. If clear, it can not be 1.
317  *
318  * Then there is the 0 bit, from low 32 bits. If clear, the VMCS bit
319  * can be 0. If 1, the VMCS bit can not be 0.
320  *
321  * SO, let's call the 1 bit R1, and the 0 bit R0, we have:
322  *  R1 R0
323  *  0 0 -> must be 0
324  *  1 0 -> can be 1, can be 0
325  *  0 1 -> can not be 1, can not be 0. --> JACKPOT! Not seen yet.
326  *  1 1 -> must be one.
327  *
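 * A worked example, with made-up MSR values: suppose the rdmsr gives
 * low = 0x00000016 and high = 0x0000003f.  Then, following the math in
 * check_vmxec_controls() below:
 *	reserved_1      = low & high                  = 0x00000016  (must be 1)
 *	reserved_0      = ~low & ~high                = 0xffffffc0  (must be 0)
 *	changeable_bits = ~(reserved_0 | reserved_1)  = 0x00000029  (our choice)
 * and reserved_0 | reserved_1 | changeable_bits == 0xffffffff, which is
 * exactly the coverage that the checks below insist on.
 *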
328  * It's also pretty hard to know what you can and can't set, and
329  * that's led to inadvertent opening of permissions at times.  Because
330  * of this complexity we've decided on the following: the driver must
331  * define EVERY bit, UNIQUELY, for each of the 5 registers, that it wants
332  * set. Further, for any bit that's settable, the driver must specify
333  * a setting; for any bit that's reserved, the driver settings must
334  * match that bit. If there are reserved bits we don't specify, that's
335  * ok; we'll take them as is.
336  *
337  * We use a set-means-set, and set-means-clear model, i.e. we use a
338  * 32-bit word to contain the bits we want to be 1, indicated by one;
339  * and another 32-bit word in which a bit we want to be 0 is indicated
340  * by a 1. This allows us to easily create masks of all bits we're
341  * going to set, for example.
342  *
343  * We have two 32-bit numbers for each 32-bit VMCS field: bits we want
344  * set and bits we want clear.  If you read the MSR for that field,
345  * compute the reserved 0 and 1 settings, and | them together, they
346  * need to result in 0xffffffff. You can see that we can create other
347  * tests for conflicts (i.e. overlap).
348  *
349  * At this point, I've tested check_vmxec_controls in every way
350  * possible, because I kept screwing the bitfields up. If you get it wrong,
351  * you'll get a nice error and it won't run at all, which is what we want: a
352  * failure-prone setup, where even errors that might result in correct
353  * values are caught -- "right answer, wrong method, zero credit." If there's
354  * weirdness in the bits, we don't want to run.
355  */
356
357 static bool check_vmxec_controls(struct vmxec const *v, bool have_true_msr,
358                                  uint32_t *result)
359 {
360         bool err = false;
361         uint32_t vmx_msr_low, vmx_msr_high;
362         uint32_t reserved_0, reserved_1, changeable_bits;
363
364         if (have_true_msr)
365                 rdmsr(v->truemsr, vmx_msr_low, vmx_msr_high);
366         else
367                 rdmsr(v->msr, vmx_msr_low, vmx_msr_high);
368
369         if (vmx_msr_low & ~vmx_msr_high)
370                 warn("JACKPOT: Conflicting VMX ec ctls for %s, high 0x%08x low 0x%08x",
371                      v->name, vmx_msr_high, vmx_msr_low);
372
373         reserved_0 = (~vmx_msr_low) & (~vmx_msr_high);
374         reserved_1 = vmx_msr_low & vmx_msr_high;
375         changeable_bits = ~(reserved_0 | reserved_1);
376
377         /*
378          * this is very much as follows:
379          * accept the things I cannot change,
380          * change the things I can,
381          * know the difference.
382          */
383
384         /* Conflict. Don't try to both set and reset bits. */
385         if (v->set_to_0 & v->set_to_1) {
386                 printk("%s: set to 0 (0x%x) and set to 1 (0x%x) overlap: 0x%x\n",
387                        v->name, v->set_to_0, v->set_to_1, v->set_to_0 & v->set_to_1);
388                 err = true;
389         }
390
391         /* coverage */
392         if (((v->set_to_0 | v->set_to_1) & changeable_bits) !=
393             changeable_bits) {
394                 printk("%s: Need to cover 0x%x and have 0x%x,0x%x\n",
395                        v->name, changeable_bits, v->set_to_0,  v->set_to_1);
396                 err = true;
397         }
398
399         if ((v->set_to_0 | v->set_to_1 | reserved_0 | reserved_1) !=
400             0xffffffff) {
401                 printk("%s: incomplete coverage: have 0x%x, want 0x%x\n",
402                        v->name, v->set_to_0 | v->set_to_1 |
403                        reserved_0 | reserved_1, 0xffffffff);
404                 err = true;
405         }
406
407         /* Don't try to change bits that can't be changed. */
408         if ((v->set_to_0 & (reserved_0 | changeable_bits)) != v->set_to_0) {
409                 printk("%s: set to 0 (0x%x) can't be done\n", v->name,
410                         v->set_to_0);
411                 err = true;
412         }
413
414         if ((v->set_to_1 & (reserved_1 | changeable_bits)) != v->set_to_1) {
415                 printk("%s: set to 1 (0x%x) can't be done\n",
416                        v->name, v->set_to_1);
417                 err = true;
418         }
419
420         /* If there's been any error at all, spill our guts and return. */
421         if (err) {
422                 printk("%s: vmx_msr_high 0x%x, vmx_msr_low 0x%x, ",
423                        v->name, vmx_msr_high, vmx_msr_low);
424                 printk("set_to_1 0x%x,set_to_0 0x%x,reserved_1 0x%x",
425                        v->set_to_1, v->set_to_0, reserved_1);
426                 printk(" reserved_0 0x%x", reserved_0);
427                 printk(" changeable_bits 0x%x\n", changeable_bits);
428                 return false;
429         }
430
431         *result = v->set_to_1 | reserved_1;
432
433         printd("%s: check_vmxec_controls succeeds with result 0x%x\n",
434                v->name, *result);
435         return true;
436 }
437
438 /*
439  * We're trying to make this as readable as possible. Realistically, it will
440  * rarely if ever change, if the past is any guide.
441  */
442 static const struct vmxec pbec = {
443         .name = "Pin Based Execution Controls",
444         .msr = MSR_IA32_VMX_PINBASED_CTLS,
445         .truemsr = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
446
447         .set_to_1 = (PIN_BASED_EXT_INTR_MASK |
448                      PIN_BASED_NMI_EXITING |
449                      PIN_BASED_VIRTUAL_NMIS),
450
451         .set_to_0 = (PIN_BASED_VMX_PREEMPTION_TIMER |
452                      PIN_BASED_POSTED_INTR),
453 };
454
455 static const struct vmxec cbec = {
456         .name = "CPU Based Execution Controls",
457         .msr = MSR_IA32_VMX_PROCBASED_CTLS,
458         .truemsr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
459
460         .set_to_1 = (CPU_BASED_HLT_EXITING |
461                      CPU_BASED_INVLPG_EXITING |
462                      CPU_BASED_MWAIT_EXITING |
463                      CPU_BASED_RDPMC_EXITING |
464                      CPU_BASED_CR8_LOAD_EXITING |
465                      CPU_BASED_CR8_STORE_EXITING |
466                      CPU_BASED_MOV_DR_EXITING |
467                      CPU_BASED_UNCOND_IO_EXITING |
468                      CPU_BASED_USE_MSR_BITMAPS |
469                      CPU_BASED_MONITOR_EXITING |
470                      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS),
471
472         .set_to_0 = (CPU_BASED_VIRTUAL_INTR_PENDING |
473                      CPU_BASED_USE_TSC_OFFSETING |
474                      CPU_BASED_RDTSC_EXITING |
475                      CPU_BASED_CR3_LOAD_EXITING |
476                      CPU_BASED_CR3_STORE_EXITING |
477                      CPU_BASED_TPR_SHADOW |
478                      CPU_BASED_VIRTUAL_NMI_PENDING |
479                      CPU_BASED_MONITOR_TRAP |
480                      CPU_BASED_PAUSE_EXITING |
481                      CPU_BASED_USE_IO_BITMAPS),
482 };
483
484 static const struct vmxec cb2ec = {
485         .name = "CPU Based 2nd Execution Controls",
486         .msr = MSR_IA32_VMX_PROCBASED_CTLS2,
487         .truemsr = MSR_IA32_VMX_PROCBASED_CTLS2,
488
489         .set_to_1 = (SECONDARY_EXEC_ENABLE_EPT |
490                      SECONDARY_EXEC_RDTSCP |
491                      SECONDARY_EXEC_WBINVD_EXITING),
492
493         .set_to_0 = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
494                      SECONDARY_EXEC_DESCRIPTOR_EXITING |
495                      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
496                      SECONDARY_EXEC_ENABLE_VPID |
497                      SECONDARY_EXEC_UNRESTRICTED_GUEST |
498                      SECONDARY_EXEC_APIC_REGISTER_VIRT |
499                      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
500                      SECONDARY_EXEC_PAUSE_LOOP_EXITING |
501                      SECONDARY_EXEC_RDRAND_EXITING |
502                      SECONDARY_EXEC_ENABLE_INVPCID |
503                      SECONDARY_EXEC_ENABLE_VMFUNC |
504                      SECONDARY_EXEC_SHADOW_VMCS |
505                      SECONDARY_EXEC_RDSEED_EXITING |
506                      SECONDARY_EPT_VE |
507                      SECONDARY_ENABLE_XSAV_RESTORE)
508 };
509
510 static const struct vmxec vmentry = {
511         .name = "VMENTRY controls",
512         .msr = MSR_IA32_VMX_ENTRY_CTLS,
513         .truemsr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
514         /* exact order from vmx.h; only the first two are enabled. */
515
516         .set_to_1 =  (VM_ENTRY_LOAD_DEBUG_CONTROLS | /* can't set to 0 */
517                       VM_ENTRY_IA32E_MODE),
518
519         .set_to_0 = (VM_ENTRY_SMM |
520                      VM_ENTRY_DEACT_DUAL_MONITOR |
521                      VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
522                      VM_ENTRY_LOAD_IA32_PAT |
523                      VM_ENTRY_LOAD_IA32_EFER),
524 };
525
526 static const struct vmxec vmexit = {
527         .name = "VMEXIT controls",
528         .msr = MSR_IA32_VMX_EXIT_CTLS,
529         .truemsr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
530
531         .set_to_1 = (VM_EXIT_SAVE_DEBUG_CONTROLS | /* can't set to 0 */
532                      VM_EXIT_HOST_ADDR_SPACE_SIZE), /* 64 bit */
533
534         .set_to_0 = (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
535                      VM_EXIT_ACK_INTR_ON_EXIT |
536                      VM_EXIT_SAVE_IA32_PAT |
537                      VM_EXIT_LOAD_IA32_PAT |
538                      VM_EXIT_SAVE_IA32_EFER |
539                      VM_EXIT_LOAD_IA32_EFER |
540                      VM_EXIT_SAVE_VMX_PREEMPTION_TIMER),
541 };
542
543 static void setup_vmcs_config(void *p)
544 {
545         int *ret = p;
546         struct vmcs_config *vmcs_conf = &vmcs_config;
547         uint32_t vmx_msr_high;
548         uint64_t vmx_msr;
549         bool have_true_msrs = false;
550         bool ok;
551
552         *ret = -EIO;
553
554         vmx_msr = read_msr(MSR_IA32_VMX_BASIC);
555         vmx_msr_high = vmx_msr >> 32;
556
557         /*
558          * If bit 55 (VMX_BASIC_TRUE_CTLS) is set, then we
559          * can go for the true MSRs.  Else, we fall back to the legacy MSRs.
560          */
561         if (vmx_msr & VMX_BASIC_TRUE_CTLS) {
562                 have_true_msrs = true;
563                 printd("Running with TRUE MSRs\n");
564         } else {
565                 printk("Running with non-TRUE MSRs, this is old hardware\n");
566         }
567
568         /*
569          * Don't worry that one or more of these might fail and leave
570          * the VMCS in some kind of incomplete state. If one of these
571          * fails, the caller is going to discard the VMCS.
572          * It is written this way to ensure we get results of all tests and avoid
573          * BMAFR behavior.
574          */
575         ok = check_vmxec_controls(&pbec, have_true_msrs,
576                                   &vmcs_conf->pin_based_exec_ctrl);
577         ok = check_vmxec_controls(&cbec, have_true_msrs,
578                                   &vmcs_conf->cpu_based_exec_ctrl) && ok;
579         ok = check_vmxec_controls(&cb2ec, have_true_msrs,
580                                   &vmcs_conf->cpu_based_2nd_exec_ctrl) && ok;
581         ok = check_vmxec_controls(&vmentry, have_true_msrs,
582                                   &vmcs_conf->vmentry_ctrl) && ok;
583         ok = check_vmxec_controls(&vmexit, have_true_msrs,
584                                   &vmcs_conf->vmexit_ctrl) && ok;
585         if (!ok) {
586                 printk("vmxec controls are no good.\n");
587                 return;
588         }
589
590         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
591         if ((vmx_msr_high & 0x1fff) > PGSIZE) {
592                 printk("(vmx_msr_high & 0x1fff) is 0x%x, > PGSIZE 0x%x\n",
593                        vmx_msr_high & 0x1fff, PGSIZE);
594                 return;
595         }
596
597         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
598         if (vmx_msr & VMX_BASIC_64) {
599                 printk("VMX doesn't support 64 bit width!\n");
600                 return;
601         }
602
603         if (((vmx_msr & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT)
604             != VMX_BASIC_MEM_TYPE_WB) {
605                 printk("VMX doesn't support WB memory for VMCS accesses!\n");
606                 return;
607         }
608
609         vmcs_conf->size = vmx_msr_high & 0x1fff;
610         vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
611         vmcs_conf->revision_id = (uint32_t)vmx_msr;
612
613         /* Read in the caps for runtime checks.  This MSR is only available if
614          * secondary controls and ept or vpid is on, which we check earlier */
615         rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid);
616
617         *ret = 0;
618 }
619
620 static struct vmcs *__vmx_alloc_vmcs(int node)
621 {
622         struct vmcs *vmcs;
623
624         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
625         if (!vmcs)
626                 return 0;
627         memset(vmcs, 0, vmcs_config.size);
628         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
629         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
630         return vmcs;
631 }
632
633 /**
634  * vmx_alloc_vmcs - allocates a VMCS region
635  *
636  * NOTE: Assumes the new region will be used by the current CPU.
637  *
638  * Returns a valid VMCS region.
639  */
640 static struct vmcs *vmx_alloc_vmcs(void)
641 {
642         return __vmx_alloc_vmcs(node_id());
643 }
644
645 /**
646  * vmx_free_vmcs - frees a VMCS region
647  */
648 static void vmx_free_vmcs(struct vmcs *vmcs)
649 {
650   //free_pages((unsigned long)vmcs, vmcs_config.order);
651 }
652
653 /*
654  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
655  * will not change in the lifetime of the guest.
656  * Note that host-state that does change is set elsewhere. E.g., host-state
657  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
658  */
659 static void vmx_setup_constant_host_state(void)
660 {
661         uint32_t low32, high32;
662         unsigned long tmpl;
663         pseudodesc_t dt;
664
665         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);  /* 22.2.3 */
666         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
667         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
668
669         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
670         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
671         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
672         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
673         vmcs_write16(HOST_TR_SELECTOR, GD_TSS);  /* 22.2.4 */
674
675         native_store_idt(&dt);
676         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);   /* 22.2.4 */
677
678         asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
679         vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
680
681         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
682         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
683         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
684         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
685
686         rdmsr(MSR_EFER, low32, high32);
687         vmcs_write32(HOST_IA32_EFER, low32);
688
689         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
690                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
691                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
692         }
693
694         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
695         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
696
697         /* TODO: This (at least gs) is per cpu */
698         rdmsrl(MSR_FS_BASE, tmpl);
699         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
700         rdmsrl(MSR_GS_BASE, tmpl);
701         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
702 }
703
704 static inline uint16_t vmx_read_ldt(void)
705 {
706         uint16_t ldt;
707         asm("sldt %0" : "=g"(ldt));
708         return ldt;
709 }
710
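/* Compute the linear base address of @selector by walking the descriptor
 * tables: the current GDT, or the LDT when the selector's TI bit (bit 2) is
 * set.  On 64-bit, system descriptors (LDT/TSS) keep the upper 32 bits of
 * the base in base3, which gets folded in.  Used (via vmx_read_tr_base())
 * to fill in the host TR base in the VMCS host-state area. */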
711 static unsigned long segment_base(uint16_t selector)
712 {
713         pseudodesc_t *gdt = &currentcpu->host_gdt;
714         struct desc_struct *d;
715         unsigned long table_base;
716         unsigned long v;
717
718         if (!(selector & ~3)) {
719                 return 0;
720         }
721
722         table_base = gdt->pd_base;
723
724         if (selector & 4) {           /* from ldt */
725                 uint16_t ldt_selector = vmx_read_ldt();
726
727                 if (!(ldt_selector & ~3)) {
728                         return 0;
729                 }
730
731                 table_base = segment_base(ldt_selector);
732         }
733         d = (struct desc_struct *)(table_base + (selector & ~7));
734         v = get_desc_base(d);
735 #ifdef CONFIG_X86_64
736        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
737                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
738 #endif
739         return v;
740 }
741
742 static inline unsigned long vmx_read_tr_base(void)
743 {
744         uint16_t tr;
745         asm("str %0" : "=g"(tr));
746         return segment_base(tr);
747 }
748
749 static void __vmx_setup_cpu(void)
750 {
751         pseudodesc_t *gdt = &currentcpu->host_gdt;
752         unsigned long sysenter_esp;
753         unsigned long tmpl;
754
755         /*
756          * Linux uses per-cpu TSS and GDT, so set these when switching
757          * processors.
758          */
759         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
760         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);   /* 22.2.4 */
761
762         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
763         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
764
765         rdmsrl(MSR_FS_BASE, tmpl);
766         vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
767         rdmsrl(MSR_GS_BASE, tmpl);
768         vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
769 }
770
771 /**
772  * vmx_get_cpu - called before using a cpu
773  * @vcpu: VCPU that will be loaded.
774  *
775  * Disables preemption. Call vmx_put_cpu() when finished.
776  */
777 static void vmx_get_cpu(struct vmx_vcpu *vcpu)
778 {
779         int cur_cpu = core_id();
780         handler_wrapper_t *w;
781
782         if (currentcpu->local_vcpu)
783                 panic("get_cpu: currentcpu->local_vcpu was non-NULL");
784         if (currentcpu->local_vcpu != vcpu) {
785                 currentcpu->local_vcpu = vcpu;
786
787                 if (vcpu->cpu != cur_cpu) {
788                         if (vcpu->cpu >= 0) {
789                                 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
790                         } else
791                                 vmcs_clear(vcpu->vmcs);
792
793                         ept_sync_context(vcpu_get_eptp(vcpu));
794
795                         vcpu->launched = 0;
796                         vmcs_load(vcpu->vmcs);
797                         __vmx_setup_cpu();
798                         vcpu->cpu = cur_cpu;
799                 } else {
800                         vmcs_load(vcpu->vmcs);
801                 }
802         }
803 }
804
805 /**
806  * vmx_put_cpu - called after using a cpu
807  * @vcpu: VCPU that was loaded.
808  */
809 static void vmx_put_cpu(struct vmx_vcpu *vcpu)
810 {
811         if (core_id() != vcpu->cpu)
812                 panic("%s: core_id() %d != vcpu->cpu %d\n",
813                       __func__, core_id(), vcpu->cpu);
814
815         if (currentcpu->local_vcpu != vcpu)
816                 panic("vmx_put_cpu: asked to clear something not ours");
817
818         ept_sync_context(vcpu_get_eptp(vcpu));
819         vmcs_clear(vcpu->vmcs);
820         vcpu->cpu = -1;
821         currentcpu->local_vcpu = NULL;
822         //put_cpu();
823 }
824
825 /**
826  * vmx_dump_cpu - prints the CPU state
827  * @vcpu: VCPU to print
828  */
829 static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
830 {
831
832         unsigned long flags;
833
834         vmx_get_cpu(vcpu);
835         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
836         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
837         flags = vmcs_readl(GUEST_RFLAGS);
838         vmx_put_cpu(vcpu);
839
840         printk("--- Begin VCPU Dump ---\n");
841         printk("CPU %d VPID %d\n", vcpu->cpu, 0);
842         printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
843                vcpu->regs.tf_rip, flags);
844         printk("RAX 0x%016lx RCX 0x%016lx\n",
845                 vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
846         printk("RDX 0x%016lx RBX 0x%016lx\n",
847                 vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
848         printk("RSP 0x%016lx RBP 0x%016lx\n",
849                 vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
850         printk("RSI 0x%016lx RDI 0x%016lx\n",
851                 vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
852         printk("R8  0x%016lx R9  0x%016lx\n",
853                 vcpu->regs.tf_r8, vcpu->regs.tf_r9);
854         printk("R10 0x%016lx R11 0x%016lx\n",
855                 vcpu->regs.tf_r10, vcpu->regs.tf_r11);
856         printk("R12 0x%016lx R13 0x%016lx\n",
857                 vcpu->regs.tf_r12, vcpu->regs.tf_r13);
858         printk("R14 0x%016lx R15 0x%016lx\n",
859                 vcpu->regs.tf_r14, vcpu->regs.tf_r15);
860         printk("--- End VCPU Dump ---\n");
861
862 }
863
864 uint64_t construct_eptp(physaddr_t root_hpa)
865 {
866         uint64_t eptp;
867
868         /* set WB memory and 4 levels of walk.  we checked these in ept_init */
869         eptp = VMX_EPT_MEM_TYPE_WB |
870                (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
871         if (cpu_has_vmx_ept_ad_bits())
872                 eptp |= VMX_EPT_AD_ENABLE_BIT;
873         eptp |= (root_hpa & PAGE_MASK);
874
875         return eptp;
876 }
877
878 /**
879  * vmx_setup_initial_guest_state - configures the initial state of guest registers
880  */
881 static void vmx_setup_initial_guest_state(void)
882 {
883         unsigned long tmpl;
884         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
885                             X86_CR4_PGE | X86_CR4_OSFXSR;
886         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
887 #if 0
888         do we need it
889         if (boot_cpu_has(X86_FEATURE_PCID))
890                 cr4 |= X86_CR4_PCIDE;
891         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
892                 cr4 |= X86_CR4_OSXSAVE;
893 #endif
894         /* we almost certainly have this */
895         /* we'll go sour if we don't. */
896         if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
897                 cr4 |= X86_CR4_RDWRGSFS;
898
899         /* configure control and data registers */
900         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
901                                X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
902         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
903                                      X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
904         vmcs_writel(GUEST_CR3, rcr3());
905         vmcs_writel(GUEST_CR4, cr4);
906         vmcs_writel(CR4_READ_SHADOW, cr4);
907         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
908                                      EFER_SCE | EFER_FFXSR);
909         vmcs_writel(GUEST_GDTR_BASE, 0);
910         vmcs_writel(GUEST_GDTR_LIMIT, 0);
911         vmcs_writel(GUEST_IDTR_BASE, 0);
912         vmcs_writel(GUEST_IDTR_LIMIT, 0);
913         vmcs_writel(GUEST_RIP, 0xdeadbeef);
914         vmcs_writel(GUEST_RSP, 0xdeadbeef);
915         vmcs_writel(GUEST_RFLAGS, 0x02);
916         vmcs_writel(GUEST_DR7, 0);
917
918         /* guest segment bases */
919         vmcs_writel(GUEST_CS_BASE, 0);
920         vmcs_writel(GUEST_DS_BASE, 0);
921         vmcs_writel(GUEST_ES_BASE, 0);
922         vmcs_writel(GUEST_GS_BASE, 0);
923         vmcs_writel(GUEST_SS_BASE, 0);
924         rdmsrl(MSR_FS_BASE, tmpl);
925         vmcs_writel(GUEST_FS_BASE, tmpl);
926
927         /* guest segment access rights */
928         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
929         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
930         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
931         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
932         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
933         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
934
935         /* guest segment limits */
936         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
937         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
938         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
939         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
940         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
941         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
942
943         /* configure segment selectors */
944         vmcs_write16(GUEST_CS_SELECTOR, 0);
945         vmcs_write16(GUEST_DS_SELECTOR, 0);
946         vmcs_write16(GUEST_ES_SELECTOR, 0);
947         vmcs_write16(GUEST_FS_SELECTOR, 0);
948         vmcs_write16(GUEST_GS_SELECTOR, 0);
949         vmcs_write16(GUEST_SS_SELECTOR, 0);
950         vmcs_write16(GUEST_TR_SELECTOR, 0);
951
952         /* guest LDTR */
953         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
954         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
955         vmcs_writel(GUEST_LDTR_BASE, 0);
956         vmcs_writel(GUEST_LDTR_LIMIT, 0);
957
958         /* guest TSS */
959         vmcs_writel(GUEST_TR_BASE, 0);
960         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
961         vmcs_writel(GUEST_TR_LIMIT, 0xff);
962
963         /* initialize sysenter */
964         vmcs_write32(GUEST_SYSENTER_CS, 0);
965         vmcs_writel(GUEST_SYSENTER_ESP, 0);
966         vmcs_writel(GUEST_SYSENTER_EIP, 0);
967
968         /* other random initialization */
969         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
970         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
971         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
972         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
973         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
974 }
975
976 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
977 {
978         int f = sizeof(unsigned long);
979         /*
980          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
981          * have the write-low and read-high bitmap offsets the wrong way round.
982          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
983          */
984         if (msr <= 0x1fff) {
985                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
986                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
987         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
988                 msr &= 0x1fff;
989                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
990                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
991         }
992 }
993
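/* Set up the MSR bitmap and the MSR autoload/autostore areas for this vcpu.
 * MSRs listed in 'set' are passed through to the guest (no exit) via the
 * bitmap, and also get autoload entries: on VM exit the CPU stores the
 * guest's values (into the guest array) and loads the host's (from the host
 * array); on VM entry it loads the guest's values.  Guest values start as
 * 0xDEADBEEF so stale state is easy to spot. */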
994 static void setup_msr(struct vmx_vcpu *vcpu)
995 {
996         int set[] = { MSR_LSTAR };
997         struct vmx_msr_entry *e;
998         int sz = sizeof(set) / sizeof(*set);
999         int i;
1000
1001         //BUILD_BUG_ON(sz > NR_AUTOLOAD_MSRS);
1002
1003         vcpu->msr_autoload.nr = sz;
1004
1005         /* XXX enable only MSRs in set */
1006         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1007
1008         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1009         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1010         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1011
1012         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1013         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1014         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1015
1016         for (i = 0; i < sz; i++) {
1017                 uint64_t val;
1018
1019                 e = &vcpu->msr_autoload.host[i];
1020                 e->index = set[i];
1021                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1022                 rdmsrl(e->index, val);
1023                 e->value = val;
1024
1025                 e = &vcpu->msr_autoload.guest[i];
1026                 e->index = set[i];
1027                 e->value = 0xDEADBEEF;
1028         }
1029 }
1030
1031 /**
1032  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1033  */
1034 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
1035 {
1036         vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
1037         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1038
1039         /* Control */
1040         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1041                 vmcs_config.pin_based_exec_ctrl);
1042
1043         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1044                 vmcs_config.cpu_based_exec_ctrl);
1045
1046         if (cpu_has_secondary_exec_ctrls()) {
1047                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1048                              vmcs_config.cpu_based_2nd_exec_ctrl);
1049         }
1050
1051         vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));
1052
1053         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1054         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1055         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1056
1057         setup_msr(vcpu);
1058 #if 0
1059         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1060                 uint32_t msr_low, msr_high;
1061                 uint64_t host_pat;
1062                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
1063                 host_pat = msr_low | ((uint64_t) msr_high << 32);
1064                 /* Write the default value follow host pat */
1065                 vmcs_write64(GUEST_IA32_PAT, host_pat);
1066                 /* Keep arch.pat sync with GUEST_IA32_PAT */
1067                 vmx->vcpu.arch.pat = host_pat;
1068         }
1069 #endif
1070 #if 0
1071         for (int i = 0; i < NR_VMX_MSR; ++i) {
1072                 uint32_t index = vmx_msr_index[i];
1073                 uint32_t data_low, data_high;
1074                 int j = vmx->nmsrs;
1075                 // TODO we should have read/writemsr_safe
1076 #if 0
1077                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1078                         continue;
1079                 if (wrmsr_safe(index, data_low, data_high) < 0)
1080                         continue;
1081 #endif
1082                 vmx->guest_msrs[j].index = i;
1083                 vmx->guest_msrs[j].data = 0;
1084                 vmx->guest_msrs[j].mask = -1ull;
1085                 ++vmx->nmsrs;
1086         }
1087 #endif
1088
1089         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1090
1091         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1092         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1093
1094         vmcs_writel(CR0_GUEST_HOST_MASK, ~0ul);
1095         vmcs_writel(CR4_GUEST_HOST_MASK, ~0ul);
1096
1097         //kvm_write_tsc(&vmx->vcpu, 0);
1098         vmcs_writel(TSC_OFFSET, 0);
1099
1100         vmx_setup_constant_host_state();
1101 }
1102
1103 /**
1104  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1105  *
1106  * Returns: A new VCPU structure
1107  */
1108 struct vmx_vcpu *vmx_create_vcpu(struct proc *p)
1109 {
1110         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1111         if (!vcpu) {
1112                 return NULL;
1113         }
1114
1115         memset(vcpu, 0, sizeof(*vcpu));
1116
1117         vcpu->proc = p; /* uncounted (weak) reference */
1118         vcpu->vmcs = vmx_alloc_vmcs();
1119         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1120         if (!vcpu->vmcs)
1121                 goto fail_vmcs;
1122
1123         vcpu->cpu = -1;
1124
1125         vmx_get_cpu(vcpu);
1126         vmx_setup_vmcs(vcpu);
1127         vmx_setup_initial_guest_state();
1128         vmx_put_cpu(vcpu);
1129
1130         return vcpu;
1131
1132 fail_vmcs:
1133         kfree(vcpu);
1134         return NULL;
1135 }
1136
1137 /**
1138  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1139  * @vcpu: the VCPU to destroy
1140  */
1141 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
1142 {
1143         vmx_free_vmcs(vcpu->vmcs);
1144         kfree(vcpu);
1145 }
1146
1147 /**
1148  * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1149  *
1150  * In the contexts where this is used the vcpu pointer should never be NULL.
1151  */
1152 static inline struct vmx_vcpu *vmx_current_vcpu(void)
1153 {
1154         struct vmx_vcpu *vcpu = currentcpu->local_vcpu;
1155         if (!vcpu)
1156                 panic("Core has no vcpu!");
1157         return vcpu;
1158 }
1159
1160 /**
1161  * vmx_run_vcpu - launches the CPU into non-root mode
1162  * We ONLY support 64-bit guests.
1163  * @vcpu: the vmx instance to launch
1164  */
1165 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1166 {
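        /* A rough map of the asm below (best-effort description): %0 is the
         * vcpu pointer (in %rcx) and %rdx holds the HOST_RSP field encoding.
         * We save %rdx/%rbp and push %rcx twice: the first push is a
         * placeholder that later receives the guest's %rcx, the second is
         * popped back into %0 after exit so we can find the vcpu again.  If
         * %rsp moved since last time, vmwrite it into HOST_RSP.  Reload %cr2
         * if the guest's saved value differs, load the guest GPRs, then
         * vmlaunch (first run) or vmresume.  At .Lkvm_vmx_return we spill
         * the guest GPRs and %cr2 back into the vcpu, restore %rbp/%rdx,
         * and set vcpu->fail via setbe if the launch/resume itself failed. */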
1167         asm(
1168                 /* Store host registers */
1169                 "push %%rdx; push %%rbp;"
1170                 "push %%rcx \n\t" /* placeholder for guest rcx */
1171                 "push %%rcx \n\t"
1172                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1173                 "je 1f \n\t"
1174                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1175                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1176                 "1: \n\t"
1177                 /* Reload cr2 if changed */
1178                 "mov %c[cr2](%0), %%rax \n\t"
1179                 "mov %%cr2, %%rdx \n\t"
1180                 "cmp %%rax, %%rdx \n\t"
1181                 "je 2f \n\t"
1182                 "mov %%rax, %%cr2 \n\t"
1183                 "2: \n\t"
1184                 /* Check if vmlaunch or vmresume is needed */
1185                 "cmpl $0, %c[launched](%0) \n\t"
1186                 /* Load guest registers.  Don't clobber flags. */
1187                 "mov %c[rax](%0), %%rax \n\t"
1188                 "mov %c[rbx](%0), %%rbx \n\t"
1189                 "mov %c[rdx](%0), %%rdx \n\t"
1190                 "mov %c[rsi](%0), %%rsi \n\t"
1191                 "mov %c[rdi](%0), %%rdi \n\t"
1192                 "mov %c[rbp](%0), %%rbp \n\t"
1193                 "mov %c[r8](%0),  %%r8  \n\t"
1194                 "mov %c[r9](%0),  %%r9  \n\t"
1195                 "mov %c[r10](%0), %%r10 \n\t"
1196                 "mov %c[r11](%0), %%r11 \n\t"
1197                 "mov %c[r12](%0), %%r12 \n\t"
1198                 "mov %c[r13](%0), %%r13 \n\t"
1199                 "mov %c[r14](%0), %%r14 \n\t"
1200                 "mov %c[r15](%0), %%r15 \n\t"
1201                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1202
1203                 /* Enter guest mode */
1204                 "jne .Llaunched \n\t"
1205                 ASM_VMX_VMLAUNCH "\n\t"
1206                 "jmp .Lkvm_vmx_return \n\t"
1207                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1208                 ".Lkvm_vmx_return: "
1209                 /* Save guest registers, load host registers, keep flags */
1210                 "mov %0, %c[wordsize](%%rsp) \n\t"
1211                 "pop %0 \n\t"
1212                 "mov %%rax, %c[rax](%0) \n\t"
1213                 "mov %%rbx, %c[rbx](%0) \n\t"
1214                 "popq %c[rcx](%0) \n\t"
1215                 "mov %%rdx, %c[rdx](%0) \n\t"
1216                 "mov %%rsi, %c[rsi](%0) \n\t"
1217                 "mov %%rdi, %c[rdi](%0) \n\t"
1218                 "mov %%rbp, %c[rbp](%0) \n\t"
1219                 "mov %%r8,  %c[r8](%0) \n\t"
1220                 "mov %%r9,  %c[r9](%0) \n\t"
1221                 "mov %%r10, %c[r10](%0) \n\t"
1222                 "mov %%r11, %c[r11](%0) \n\t"
1223                 "mov %%r12, %c[r12](%0) \n\t"
1224                 "mov %%r13, %c[r13](%0) \n\t"
1225                 "mov %%r14, %c[r14](%0) \n\t"
1226                 "mov %%r15, %c[r15](%0) \n\t"
1227                 "mov %%rax, %%r10 \n\t"
1228                 "mov %%rdx, %%r11 \n\t"
1229
1230                 "mov %%cr2, %%rax   \n\t"
1231                 "mov %%rax, %c[cr2](%0) \n\t"
1232
1233                 "pop  %%rbp; pop  %%rdx \n\t"
1234                 "setbe %c[fail](%0) \n\t"
1235                 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1236                 "mov %%rax, %%ds \n\t"
1237                 "mov %%rax, %%es \n\t"
1238               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1239                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1240                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1241                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1242                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1243                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1244                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1245                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1246                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1247                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1248                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1249                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1250                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1251                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1252                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1253                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1254                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1255                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1256                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1257                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1258                 [wordsize]"i"(sizeof(unsigned long))
1259               : "cc", "memory"
1260                 , "rax", "rbx", "rdi", "rsi"
1261                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1262         );
1263
1264         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1265         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1266         printk("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1267                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1268         /* FIXME: do we need to set up other flags? */
1269         vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1270                       X86_EFLAGS_IF | 0x2;
1271
1272         vcpu->regs.tf_cs = GD_UT;
1273         vcpu->regs.tf_ss = GD_UD;
1274
1275         vcpu->launched = 1;
1276
1277         if (vcpu->fail) {
1278                 printk("failure detected (err %x)\n",
1279                        vmcs_read32(VM_INSTRUCTION_ERROR));
1280                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1281         }
1282
1283         return vmcs_read32(VM_EXIT_REASON);
1284
1285 #if 0
1286         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1287         vmx_complete_atomic_exit(vmx);
1288         vmx_recover_nmi_blocking(vmx);
1289         vmx_complete_interrupts(vmx);
1290 #endif
1291 }
1292
1293 static void vmx_step_instruction(void)
1294 {
1295         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1296                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1297 }
1298
1299 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
1300 {
1301         unsigned long gva, gpa;
1302         int exit_qual, ret = -1;
1303         page_t *page;
1304
1305         vmx_get_cpu(vcpu);
1306         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1307         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1308         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1309
1310         vmx_put_cpu(vcpu);
1311
1312         int prot = 0;
1313         prot |= exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
1314         prot |= exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
1315         prot |= exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
1316         ret = handle_page_fault(current, gpa, prot);
1317
1318         if (ret) {
1319                 printk("EPT page fault failure GPA: %p, GVA: %p\n", gpa, gva);
1320                 vmx_dump_cpu(vcpu);
1321         }
1322
1323         return ret;
1324 }
1325
1326 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
1327 {
1328         unsigned int eax, ebx, ecx, edx;
1329
1330         eax = vcpu->regs.tf_rax;
1331         ecx = vcpu->regs.tf_rcx;
1332         cpuid(eax, ecx, &eax, &ebx, &ecx, &edx); /* reflect the guest's leaf/subleaf */
1333         vcpu->regs.tf_rax = eax;
1334         vcpu->regs.tf_rbx = ebx;
1335         vcpu->regs.tf_rcx = ecx;
1336         vcpu->regs.tf_rdx = edx;
1337 }
1338
1339 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
1340 {
1341         uint32_t intr_info;
1342
1343         vmx_get_cpu(vcpu);
1344         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1345         vmx_put_cpu(vcpu);
1346
1347         printk("vmx (vcpu %p): got an exception\n", vcpu);
1348         printk("vmx (vcpu %p): pid %d\n", vcpu, vcpu->proc->pid);
1349         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1350                 return 0;
1351         }
1352
1353         printk("unhandled nmi, intr_info %x\n", intr_info);
1354         return -EIO;
1355 }
1356
1357 /**
1358  * vmx_launch - the main loop for a VMX Dune process
1359  * @conf: the launch configuration
1360  */
1361 int vmx_launch(uint64_t rip, uint64_t rsp, uint64_t cr3)
1362 {
1363         int ret;
1364         struct vmx_vcpu *vcpu;
1365         int i = 0;
1366         int errors = 0;
1367
1368         printk("RUNNING: %s: rip %p rsp %p cr3 %p \n",
1369                __func__, rip, rsp, cr3);
1370         /* TODO: dirty hack til we have VMM contexts */
1371         vcpu = current->vmm.guest_pcores[0];
1372         if (!vcpu) {
1373                 printk("Failed to get a CPU!\n");
1374                 return -ENOMEM;
1375         }
1376
1377         /* if cr3 is set, means 'set everything', else means 'start where you left off' */
1378         if (cr3) {
1379                 vmx_get_cpu(vcpu);
1380                 vmcs_writel(GUEST_RIP, rip);
1381                 vmcs_writel(GUEST_RSP, rsp);
1382                 vmcs_writel(GUEST_CR3, cr3);
1383                 vmx_put_cpu(vcpu);
1384         }
1385
1386         vcpu->ret_code = -1;
1387
1388         while (1) {
1389                 vmx_get_cpu(vcpu);
1390
1391                 // TODO: manage the fpu when we restart.
1392
1393                 // TODO: see if we need to exit before we go much further.
1394                 disable_irq();
1395                 ret = vmx_run_vcpu(vcpu);
1396                 enable_irq();
1397                 vmx_put_cpu(vcpu);
1398
1399                 if (ret == EXIT_REASON_VMCALL) {
1400                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1401                         printk("system call! WTF\n");
1402                 } else if (ret == EXIT_REASON_CPUID)
1403                         vmx_handle_cpuid(vcpu);
1404                 else if (ret == EXIT_REASON_EPT_VIOLATION) {
1405                         if (vmx_handle_ept_violation(vcpu))
1406                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1407                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1408                         if (vmx_handle_nmi_exception(vcpu))
1409                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1410                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1411                         printk("External interrupt\n");
1412                 } else {
1413                         printk("unhandled exit: reason 0x%x, exit qualification 0x%x\n",
1414                                ret, vmcs_read32(EXIT_QUALIFICATION));
1415                         vmx_dump_cpu(vcpu);
1416                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1417                 }
1418
1419                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1420                  * similar to how proc_restartcore/smp_idle only restart the pcpui
1421                  * cur_ctx, we need to do the same, via the VMCS resume business. */
1422
1423                 if (vcpu->shutdown)
1424                         break;
1425         }
1426
1427         printk("RETURN. ip %016lx sp %016lx\n",
1428                 vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1429
1430         /*
1431          * Return both the reason for the shutdown and a status value.
1432          * The exit() and exit_group() system calls only need 8 bits for
1433  * the status, but we allow 16 bits in case we want to return more
1434  * information for other shutdown reasons (see the decode sketch below).
1435          */
1436         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
1437
1438         return ret;
1439 }
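
/*
 * Illustrative sketch, not part of the driver: a hypothetical caller of
 * vmx_launch().  A non-zero cr3 programs a fresh RIP/RSP/CR3 into the VMCS;
 * a zero cr3 resumes the guest where it left off.  The return value packs
 * the SHUTDOWN_* reason into the upper 16 bits and the guest's exit status
 * into the lower 16, as described above.
 */
static inline int __example_run_guest(uint64_t rip, uint64_t rsp, uint64_t cr3)
{
        int ret = vmx_launch(rip, rsp, cr3);
        int reason, status;

        if (ret < 0)
                return ret;                     /* vmx_launch() itself failed */
        reason = (ret >> 16) & 0xffff;          /* one of the SHUTDOWN_* values */
        status = ret & 0xffff;                  /* guest-supplied exit status */
        printk("guest stopped: reason %d, status %d\n", reason, status);
        return status;
}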
1440
1441 /**
1442  * __vmx_enable - low-level enable of VMX mode on the current CPU
1443  * @vmxon_buf: an opaque buffer for use as the VMXON region
1444  */
1445 static int __vmx_enable(struct vmcs *vmxon_buf)
1446 {
1447         uint64_t phys_addr = PADDR(vmxon_buf);
1448         uint64_t old, test_bits;
1449
1450         if (rcr4() & X86_CR4_VMXE) {
1451                 panic("CR4.VMXE was already set; this should never happen");
1452                 return -EBUSY;
1453         }
1454
1455         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1456
1457         test_bits = FEATURE_CONTROL_LOCKED;
1458         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1459
1460         if (0) // tboot_enabled())
1461                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1462
1463         if ((old & test_bits) != test_bits) {
1464                 /* If it's locked, then trying to set it will cause a GPF.
1465                  * No Dune for you!
1466                  */
1467                 if (old & FEATURE_CONTROL_LOCKED) {
1468                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1469                         return -1;
1470                 }
1471
1472                 /* enable and lock */
1473                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1474         }
1475         lcr4(rcr4() | X86_CR4_VMXE);
1476
1477         __vmxon(phys_addr);
1478         vpid_sync_vcpu_global();        /* good idea, even if we aren't using vpids */
1479         ept_sync_global();
1480
1481         return 0;
1482 }
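
/*
 * Illustrative sketch, not part of the driver: the MSR_IA32_FEATURE_CONTROL
 * decision made in __vmx_enable(), pulled out as a standalone predicate.  If
 * the BIOS locked the MSR without enabling VMXON outside SMX, VMX cannot be
 * used on this cpu; if the MSR is unlocked, the kernel can set and lock the
 * bits itself (which __vmx_enable() does).
 */
static inline bool __example_vmx_allowed_by_bios(void)
{
        uint64_t fc;

        rdmsrl(MSR_IA32_FEATURE_CONTROL, fc);
        if (fc & FEATURE_CONTROL_LOCKED)
                return (fc & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) != 0;
        return TRUE;    /* unlocked: we can enable and lock it ourselves */
}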
1483
1484 /**
1485  * vmx_enable - enables VMX mode on the current CPU
1486  *
1487  * Uses the per-cpu scratchpad set up by setup_vmxarea() as the VMXON
1488  * region, and enables VMX on this core via __vmx_enable().
1489  */
1490 static void vmx_enable(void)
1491 {
1492         struct vmcs *vmxon_buf = currentcpu->vmxarea;
1493         int ret;
1494
1495         ret = __vmx_enable(vmxon_buf);
1496         if (ret)
1497                 goto failed;
1498
1499         currentcpu->vmx_enabled = 1;
1500         // TODO: do we need this?
1501         store_gdt(&currentcpu->host_gdt);
1502
1503         printk("VMX enabled on CPU %d\n", core_id());
1504         return;
1505
1506 failed:
1507         printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1508 }
1509
1510 /**
1511  * vmx_disable - disables VMX mode on the current CPU
1512  */
1513 static void vmx_disable(void *unused)
1514 {
1515         if (currentcpu->vmx_enabled) {
1516                 __vmxoff();
1517                 lcr4(rcr4() & ~X86_CR4_VMXE);
1518                 currentcpu->vmx_enabled = 0;
1519         }
1520 }
1521
1522 /* Probe the current cpu to see whether it can do vmx.
1523  * Returns TRUE if it can, FALSE if it cannot.
1524  */
1525 static bool probe_cpu_vmx(void)
1526 {
1527         /* The best way to test this code is:
1528          * wrmsr -p <cpu> 0x3a 1
1529          * This will lock vmx off; then modprobe dune.
1530          * Frequently, however, systems have all 0x3a registers set to 5,
1531          * meaning testing is impossible, as vmx cannot be disabled.
1532          * We would have to simulate it being unavailable in most cases;
1533          * the 'test' variable that once simulated vmx being unavailable on
1534          * some, none, or all cpus was removed along with the testing code.
1535          */
1536         if (!cpu_has_vmx()) {
1537                 printk("Machine does not support VT-x\n");
1538                 return FALSE;
1539         } else {
1540                 printk("Machine supports VT-x\n");
1541                 return TRUE;
1542         }
1543 }
1544
1545 static void setup_vmxarea(void)
1546 {
1547         struct vmcs *vmxon_buf;
1548         printd("Set up vmxarea for cpu %d\n", core_id());
1549         vmxon_buf = __vmx_alloc_vmcs(node_id());
1550         if (!vmxon_buf) {
1551                 printk("setup_vmxarea failed on core %d\n", core_id());
1552                 return;
1553         }
1554         currentcpu->vmxarea = vmxon_buf;
1555 }
1556
1557 static int ept_init(void)
1558 {
1559         if (!cpu_has_vmx_ept()) {
1560                 printk("VMX doesn't support EPT!\n");
1561                 return -1;
1562         }
1563         if (!cpu_has_vmx_eptp_writeback()) {
1564                 printk("VMX EPT doesn't support WB memory!\n");
1565                 return -1;
1566         }
1567         if (!cpu_has_vmx_ept_4levels()) {
1568                 printk("VMX EPT doesn't support 4 level walks!\n");
1569                 return -1;
1570         }
1571         switch (arch_max_jumbo_page_shift()) {
1572                 case PML3_SHIFT:
1573                         if (!cpu_has_vmx_ept_1g_page()) {
1574                                 printk("VMX EPT doesn't support 1 GB pages!\n");
1575                                 return -1;
1576                         }
1577                         break;
1578                 case PML2_SHIFT:
1579                         if (!cpu_has_vmx_ept_2m_page()) {
1580                                 printk("VMX EPT doesn't support 2 MB pages!\n");
1581                                 return -1;
1582                         }
1583                         break;
1584                 default:
1585                         printk("Unexpected jumbo page shift %d\n",
1586                                arch_max_jumbo_page_shift());
1587                         return -1;
1588         }
1589         if (!cpu_has_vmx_ept_ad_bits()) {
1590                 printk("VMX EPT doesn't support accessed/dirty!\n");
1591                 x86_ept_pte_fix_ups |= EPTE_A | EPTE_D;
1592         }
1593         if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
1594                 printk("VMX EPT can't invalidate PTEs/TLBs!\n");
1595                 return -1;
1596         }
1597
1598         return 0;
1599 }
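
/*
 * Illustrative sketch, not part of the driver: how the capabilities checked
 * in ept_init() end up encoded in an EPT pointer (EPTP).  Per the SDM, bits
 * 2:0 hold the memory type (6 = write-back), bits 5:3 hold the page-walk
 * length minus one (3 for the required 4-level walks), bit 6 enables
 * accessed/dirty tracking, and the high bits hold the physical address of
 * the root EPT table.  This is a sketch only; the driver's real EPTP
 * construction is not shown here.
 */
static inline uint64_t __example_build_eptp(uint64_t ept_root_paddr)
{
        uint64_t eptp = ept_root_paddr & ~((uint64_t)PGSIZE - 1);

        eptp |= 6;              /* write-back memory type */
        eptp |= 3 << 3;         /* page-walk length - 1, i.e. 4-level */
        if (cpu_has_vmx_ept_ad_bits())
                eptp |= 1 << 6; /* enable accessed/dirty bits */
        return eptp;
}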
1600
1601 /**
1602  * intel_vmm_init sets up the physical-core data areas that are required to run
1603  * a VM at all.  These data areas are not connected to a specific user process;
1604  * they essentially externalize what would otherwise be a very large ball of
1605  * state that would live inside the CPU.
1606  */
1607 int intel_vmm_init(void)
1608 {
1609         int ret;
1610
1611         if (!probe_cpu_vmx()) {
1612                 return -EOPNOTSUPP;
1613         }
1614
1615         setup_vmcs_config(&ret);
1616
1617         if (ret) {
1618                 printk("setup_vmcs_config failed: %d\n", ret);
1619                 return ret;
1620         }
1621
1622         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
1623         if (!msr_bitmap) {
1624                 printk("Could not allocate msr_bitmap\n");
1625                 return -ENOMEM;
1626         }
1627         /* FIXME: do we need APIC virtualization (flexpriority?) */
1628
1629         memset(msr_bitmap, 0xff, PAGE_SIZE);
1630         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
1631         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
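
        /*
         * All-ones in the bitmap means every guest MSR access causes an exit;
         * __vmx_disable_intercept_for_msr() then punches per-MSR holes.  As a
         * purely illustrative (and unaudited) example, letting the guest touch
         * one more MSR without exiting would be another call of the same shape:
         *
         *      __vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE);
         */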
1632
1633         if ((ret = ept_init())) {
1634                 printk("EPT init failed, %d\n", ret);
1635                 return ret;
1636         }
1637         printk("VMX setup succeeded\n");
1638         return 0;
1639 }
1640
1641 int intel_vmm_pcpu_init(void)
1642 {
1643         setup_vmxarea();
1644         vmx_enable();
1645         return 0;
1646 }