vmmcp: final set of changes for PCI emulation
[akaros.git] / kern / arch / x86 / vmm / intel / vmx.c
1 //#define DEBUG
2 /**
3  *  vmx.c - The Intel VT-x driver for Dune
4  *
5  * This file is derived from Linux KVM VT-x support.
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8  *
9  * Original Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This modified version is simpler because it avoids the following
14  * features that are not requirements for Dune:
15  *  * Real-mode emulation
16  *  * Nested VT-x support
17  *  * I/O hardware emulation
18  *  * Any of the more esoteric X86 features and registers
19  *  * KVM-specific functionality
20  *
21  * In essence we provide only the minimum functionality needed to run
22  * a process in vmx non-root mode rather than the full hardware emulation
23  * needed to support an entire OS.
24  *
25  * This driver is a research prototype and as such has the following
26  * limitations:
27  *
28  * FIXME: Backward compatibility is currently a non-goal, and only recent
29  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
30  * driver.
31  *
32  * FIXME: Eventually we should handle concurrent users of VT-x more
33  * gracefully instead of requiring exclusive access. This would allow
34  * Dune to interoperate with KVM and other HV solutions.
35  *
36  * FIXME: We need to support hotplugged physical CPUs.
37  *
38  * Authors:
39  *   Adam Belay   <abelay@stanford.edu>
40  */
41
42 /* Basic flow.
43  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
44  * You're left with the feeling that they got part way through and realized they had to have one for
45  *
46  * 1) your CPU is going to be capable of running VMs, and you need state for that.
47  *
48  * 2) you're about to start a guest, and you need state for that.
49  *
50  * So there is "get the cpu set up to be able to run VMs" stuff, and
51  * "now let's start a guest" stuff.  In Akaros, CPUs will always be set up
52  * to run a VM if that is possible. Processes can flip themselves into
53  * a VM and that will require another VMCS.
54  *
55  * So: at kernel startup time, the SMP boot stuff calls
56  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
57  * in the case of this file is intel_vmm_init. That does some code
58  * that sets up stuff for ALL sockets, based on the capabilities of
59  * the socket it runs on. If any cpu supports vmx, it assumes they all
60  * do. That's a realistic assumption. So the call_function_all is kind
61  * of stupid, really; it could just see what's on the current cpu and
62  * assume it's on all. HOWEVER: there are systems in the wild that
63  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
64  * might as well allow for the chance that we'll only allow VMMCPs on a
65  * subset (not implemented yet however).  So: probe all CPUs, get a
66  * count of how many support VMX and, for now, assume they all do
67  * anyway.
68  *
69  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
70  * which contains all the naughty bits settings for all the cpus that can run a VM.
71  * Realistically, all VMX-capable cpus in a system will have identical configurations.
72  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
73  *
74  * configure the msr_bitmap. This is the bitmap of MSRs which the
75  * guest can manipulate.  Currently, we only allow GS and FS base.
76  *
77  * Reserve bit 0 in the vpid bitmap as guests can not use that
78  *
79  * Set up the what we call the vmxarea. The vmxarea is per-cpu, not
80  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
81  * there is the revision area. The vmxarea is page-sized per cpu and
82  * page-aligned. Note that it can be smaller, but why bother? We know
83  * the max size and alignment, and it's convenient.
84  *
85  * Now that it is set up, enable vmx on all cpus. This involves
86  * testing VMXE in cr4, to see if we've been here before (TODO: delete
87  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
88  * do a VM, then setting the VMXE in cr4, calling vmxon (does a vmxon
89  * instruction), and syncing vpid's and ept's.  Now the CPU is ready
90  * to host guests.
91  *
92  * Setting up a guest.
93  * We divide this into two things: vmm_proc_init and vm_run.
94  * Currently, on Intel, vmm_proc_init does nothing.
95  *
96  * vm_run is really complicated. It is called with a coreid, rip, rsp,
97  * cr3, and flags.  On intel, it calls vmx_launch. vmx_launch is set
98  * up for a few test cases. If rip is 1, it sets the guest rip to
99  * a function which will deref 0 and should exit with failure 2. If rip is 0,
100  * it calls an infinite loop in the guest.
101  *
102  * The sequence of operations:
103  * create a vcpu
104  * while (1) {
105  * get a vcpu
106  * disable irqs (required or you can't enter the VM)
107  * vmx_run_vcpu()
108  * enable irqs
109  * manage the vm exit
110  * }
111  *
112  * get a vcpu
113  * See if the current cpu has a vcpu. If so, and is the same as the vcpu we want,
114  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
115  *
116  * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
117  * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
118  * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
119  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
120  *
121  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
122  * of inline assembly with embedded CPP crap. I suspect we'll want to
123  * un-inline it someday, but maybe not.  It's called with a vcpu
124  * struct from which it loads guest state, and to which it stores
125  * non-virtualized host state. It issues a vmlaunch or vmresume
126  * instruction depending, and on return, it evaluates whether the
127  * launch/resume had an error in that operation. Note this is NOT the
128  * same as an error while in the virtual machine; this is an error in
129  * startup due to misconfiguration. Depending on what is returned, it's
130  * either a failed vm startup or an exit for one of many reasons.
131  *
132  */
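/* Illustrative only: a sketch of the loop described above, using the
 * vmx_get_cpu/vmx_put_cpu helpers defined later in this file.  The
 * vmx_create_vcpu() and handle_exit() names here are hypothetical
 * stand-ins for whatever the launch path actually uses, and error
 * handling is elided.
 *
 *      struct vmx_vcpu *vcpu = vmx_create_vcpu();      // hypothetical
 *      while (1) {
 *              vmx_get_cpu(vcpu);              // VMPTRLD + per-cpu setup
 *              disable_irq();                  // can't enter the VM with irqs on
 *              ret = vmx_run_vcpu(vcpu);       // vmlaunch/vmresume
 *              enable_irq();
 *              vmx_put_cpu(vcpu);              // VMCLEAR, detach from this core
 *              if (handle_exit(vcpu, ret))     // hypothetical exit handler
 *                      break;
 *      }
 */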
133
134 /* basically: only rename those globals that might conflict
135  * with existing names. Leave all else the same.
136  * this code is more modern than the other code, yet still
137  * well encapsulated, it seems.
138  */
139 #include <kmalloc.h>
140 #include <string.h>
141 #include <stdio.h>
142 #include <assert.h>
143 #include <error.h>
144 #include <pmap.h>
145 #include <sys/queue.h>
146 #include <smp.h>
147 #include <kref.h>
148 #include <atomic.h>
149 #include <alarm.h>
150 #include <event.h>
151 #include <umem.h>
152 #include <bitops.h>
153 #include <arch/types.h>
154 #include <syscall.h>
155 #include <arch/io.h>
156
157 #include "vmx.h"
158 #include "../vmm.h"
159 #include <ros/vmm.h>
160
161 #include "cpufeature.h"
162
163 #define currentcpu (&per_cpu_info[core_id()])
164
165 static unsigned long *msr_bitmap;
166 #define VMX_IO_BITMAP_ORDER             4       /* 64 KB */
167 #define VMX_IO_BITMAP_SZ                (1 << (VMX_IO_BITMAP_ORDER + PGSHIFT))
168 static unsigned long *io_bitmap;
169
170 int x86_ept_pte_fix_ups = 0;
171
172 struct vmx_capability vmx_capability;
173 struct vmcs_config vmcs_config;
174
175 static int autoloaded_msrs[] = {
176         MSR_KERNEL_GS_BASE,
177         MSR_LSTAR,
178         MSR_STAR,
179         MSR_SFMASK,
180 };
181
182 static char *cr_access_type[] = {
183         "move to cr",
184         "move from cr",
185         "clts",
186         "lmsw"
187 };
188
189 static char *cr_gpr[] = {
190         "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
191         "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
192 };
193
194 static int guest_cr_num[16] = {
195         GUEST_CR0,
196         -1,
197         -1,
198         GUEST_CR3,
199         GUEST_CR4,
200         -1,
201         -1,
202         -1,
203         -1,     /* 8? */
204         -1, -1, -1, -1, -1, -1, -1
205 };
206
207 __always_inline unsigned long vmcs_readl(unsigned long field);
208 /* See section 24-3 of The Good Book */
209 void
210 show_cr_access(uint64_t val)
211 {
212         int crnr = val & 0xf;
213         int type = (val >> 4) & 3;
214         int reg = (val >> 8) & 0xf;     /* GPR is in bits 11:8 of the qualification */
215         printk("%s: %d: ", cr_access_type[type], crnr);
216         if (type < 2) {
217                 printk("%s", cr_gpr[reg]);
218                 if (guest_cr_num[crnr] > -1) {
219                         printk(": 0x%lx", vmcs_readl(guest_cr_num[crnr]));
220                 }
221         }
222         printk("\n");
223 }
224
225 void
226 ept_flush(uint64_t eptp)
227 {
228         ept_sync_context(eptp);
229 }
230
231 static void
232 vmcs_clear(struct vmcs *vmcs)
233 {
234         uint64_t phys_addr = PADDR(vmcs);
235         uint8_t error;
236
237         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0":"=qm"(error):"a"(&phys_addr),
238                                   "m"(phys_addr)
239                                   :"cc", "memory");
240         if (error)
241                 printk("vmclear fail: %p/%llx\n", vmcs, phys_addr);
242 }
243
244 static void
245 vmcs_load(struct vmcs *vmcs)
246 {
247         uint64_t phys_addr = PADDR(vmcs);
248         uint8_t error;
249
250         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0":"=qm"(error):"a"(&phys_addr),
251                                   "m"(phys_addr)
252                                   :"cc", "memory");
253         if (error)
254                 printk("vmptrld %p/%llx failed\n", vmcs, phys_addr);
255 }
256
257 /* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */
258 static physaddr_t
259 vmcs_get_current(void)
260 {
261         physaddr_t vmcs_paddr;
262         /* RAX contains the addr of the location to store the VMCS pointer.  The
263          * compiler doesn't know the ASM will deref that pointer, hence the =m */
264         asm volatile (ASM_VMX_VMPTRST_RAX:"=m"(vmcs_paddr):"a"(&vmcs_paddr));
265         return vmcs_paddr;
266 }
267
268 __always_inline unsigned long
269 vmcs_readl(unsigned long field)
270 {
271         unsigned long value;
272
273         asm volatile (ASM_VMX_VMREAD_RDX_RAX:"=a"(value):"d"(field):"cc");
274         return value;
275 }
276
277 __always_inline uint16_t
278 vmcs_read16(unsigned long field)
279 {
280         return vmcs_readl(field);
281 }
282
283 static __always_inline uint32_t
284 vmcs_read32(unsigned long field)
285 {
286         return vmcs_readl(field);
287 }
288
289 static __always_inline uint64_t
290 vmcs_read64(unsigned long field)
291 {
292         return vmcs_readl(field);
293 }
294
295 void
296 vmwrite_error(unsigned long field, unsigned long value)
297 {
298         printk("vmwrite error: reg %lx value %lx (err %d)\n",
299                    field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
300 }
301
302 void
303 vmcs_writel(unsigned long field, unsigned long value)
304 {
305         uint8_t error;
306
307         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0":"=q"(error):"a"(value),
308                                   "d"(field):"cc");
309         if (error)
310                 vmwrite_error(field, value);
311 }
312
313 static void
314 vmcs_write16(unsigned long field, uint16_t value)
315 {
316         vmcs_writel(field, value);
317 }
318
319 static void
320 vmcs_write32(unsigned long field, uint32_t value)
321 {
322         vmcs_writel(field, value);
323 }
324
325 static void
326 vmcs_write64(unsigned long field, uint64_t value)
327 {
328         vmcs_writel(field, value);
329 }
330
331 /*
332  * A note on Things You Can't Make Up.
333  * or
334  * "George, you can type this shit, but you can't say it" -- Harrison Ford
335  *
336  * There are 5 VMCS 32-bit words that control guest permissions. If
337  * you set these correctly, you've got a guest that will behave. If
338  * you get even one bit wrong, you've got a guest that will chew your
339  * leg off. Some bits must be 1, some must be 0, and some can be set
340  * either way. To add to the fun, the docs are sort of a docudrama or,
341  * as the quote goes, "interesting if true."
342  *
343  * To determine what bit can be set in what VMCS 32-bit control word,
344  * there are 5 corresponding 64-bit MSRs.  And, to make it even more
345  * fun, the standard set of MSRs have errors in them, i.e. report
346  * incorrect values, for legacy reasons, and so you are supposed to
347  * "look around" to another set, which have correct bits in
348  * them. There are four such 'correct' registers, and they have _TRUE_
349  * in the names as you can see below. We test for the value of VMCS
350  * control bits in the _TRUE_ registers if possible. The fifth
351  * register, CPU Secondary Exec Controls, which came later, needs no
352  * _TRUE_ variant.
353  *
354  * For each MSR, the high 32 bits tell you what bits can be "1" by a
355  * "1" in that position; the low 32 bits tell you what bit can be "0"
356  * by a "0" in that position. So, for each of 32 bits in a given VMCS
357  * control word, there is a pair of bits in an MSR that tells you what
358  * values it can take. The two bits, of which there are *four*
359  * combinations, describe the *three* possible operations on a
360  * bit. The two bits, taken together, form an untruth table: There are
361  * three possibilities: The VMCS bit can be set to 0 or 1, or it can
362  * only be 0, or only 1. The fourth combination is not supposed to
363  * happen.
364  *
365  * So: there is the 1 bit from the upper 32 bits of the msr.
366  * If this bit is set, then the bit can be 1. If clear, it can not be 1.
367  *
368  * Then there is the 0 bit, from low 32 bits. If clear, the VMCS bit
369  * can be 0. If 1, the VMCS bit can not be 0.
370  *
371  * SO, let's call the 1 bit R1, and the 0 bit R0, we have:
372  *  R1 R0
373  *  0 0 -> must be 0
374  *  1 0 -> can be 1, can be 0
375  *  0 1 -> can not be 1, can not be 0. --> JACKPOT! Not seen yet.
376  *  1 1 -> must be one.
377  *
378  * It's also pretty hard to know what you can and can't set, and
379  * that's led to inadvertent opening of permissions at times.  Because
380  * of this complexity we've decided on the following: the driver must
381  * define EVERY bit, UNIQUELY, for each of the 5 registers, that it wants
382  * set. Further, for any bit that's settable, the driver must specify
383  * a setting; for any bit that's reserved, the driver settings must
384  * match that bit. If there are reserved bits we don't specify, that's
385  * ok; we'll take them as is.
386  *
387  * We use a set-means-set, and set-means-clear model, i.e. we use a
388  * 32-bit word to contain the bits we want to be 1, indicated by one;
389  * and another 32-bit word in which a bit we want to be 0 is indicated
390  * by a 1. This allows us to easily create masks of all bits we're
391  * going to set, for example.
392  *
393  * We have two 32-bit numbers for each 32-bit VMCS field: bits we want
394  * set and bits we want clear.  If you read the MSR for that field,
395  * compute the reserved 0 and 1 settings, and | them together, they
396  * need to result in 0xffffffff. You can see that we can create other
397  * tests for conflicts (i.e. overlap).
398  *
399  * At this point, I've tested check_vmxec_controls in every way
400  * possible, because I kept screwing the bitfields up. You'll get a nice
401  * error and it won't work at all, which is what we want: a
402  * failure-prone setup, where even errors that might result in correct
403  * values are caught -- "right answer, wrong method, zero credit." If there's
404  * weirdness in the bits, we don't want to run.
405  */
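/* A tiny worked example of the rules above, using made-up MSR words
 * (not values from any real CPU):
 *
 *      vmx_msr_high = 0x0000000f   (1 here means "may be set to 1")
 *      vmx_msr_low  = 0x00000005   (0 here means "may be set to 0")
 *
 *      reserved_1      = low & high    = 0x00000005   bits 0,2 must be 1
 *      reserved_0      = ~low & ~high  = 0xfffffff0   bits 4..31 must be 0
 *      changeable_bits = ~(r1 | r0)    = 0x0000000a   bits 1,3 are our call
 *
 * check_vmxec_controls() below insists that set_to_0/set_to_1 cover
 * exactly the changeable bits and never contradict the reserved ones.
 */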
406
407 static bool
408 check_vmxec_controls(struct vmxec const *v, bool have_true_msr,
409                                          uint32_t * result)
410 {
411         bool err = false;
412         uint32_t vmx_msr_low, vmx_msr_high;
413         uint32_t reserved_0, reserved_1, changeable_bits;
414
415         if (have_true_msr)
416                 rdmsr(v->truemsr, vmx_msr_low, vmx_msr_high);
417         else
418                 rdmsr(v->msr, vmx_msr_low, vmx_msr_high);
419
420         if (vmx_msr_low & ~vmx_msr_high)
421                 warn("JACKPOT: Conflicting VMX ec ctls for %s, high 0x%08x low 0x%08x",
422                          v->name, vmx_msr_high, vmx_msr_low);
423
424         reserved_0 = (~vmx_msr_low) & (~vmx_msr_high);
425         reserved_1 = vmx_msr_low & vmx_msr_high;
426         changeable_bits = ~(reserved_0 | reserved_1);
427
428         /*
429          * this is very much as follows:
430          * accept the things I cannot change,
431          * change the things I can,
432          * know the difference.
433          */
434
435         /* Conflict. Don't try to both set and reset bits. */
436         if (v->set_to_0 & v->set_to_1) {
437                 printk("%s: set to 0 (0x%x) and set to 1 (0x%x) overlap: 0x%x\n",
438                            v->name, v->set_to_0, v->set_to_1, v->set_to_0 & v->set_to_1);
439                 err = true;
440         }
441
442         /* coverage */
443         if (((v->set_to_0 | v->set_to_1) & changeable_bits) != changeable_bits) {
444                 printk("%s: Need to cover 0x%x and have 0x%x,0x%x\n",
445                            v->name, changeable_bits, v->set_to_0, v->set_to_1);
446                 err = true;
447         }
448
449         if ((v->set_to_0 | v->set_to_1 | reserved_0 | reserved_1) != 0xffffffff) {
450                 printk("%s: incomplete coverage: have 0x%x, want 0x%x\n",
451                            v->name, v->set_to_0 | v->set_to_1 |
452                            reserved_0 | reserved_1, 0xffffffff);
453                 err = true;
454         }
455
456         /* Don't try to change bits that can't be changed. */
457         if ((v->set_to_0 & (reserved_0 | changeable_bits)) != v->set_to_0) {
458                 printk("%s: set to 0 (0x%x) can't be done\n", v->name, v->set_to_0);
459                 err = true;
460         }
461
462         if ((v->set_to_1 & (reserved_1 | changeable_bits)) != v->set_to_1) {
463                 printk("%s: set to 1 (0x%x) can't be done\n", v->name, v->set_to_1);
464                 err = true;
465         }
466
467         /* If there's been any error at all, spill our guts and return. */
468         if (err) {
469                 printk("%s: vmx_msr_high 0x%x, vmx_msr_low 0x%x, ",
470                            v->name, vmx_msr_high, vmx_msr_low);
471                 printk("set_to_1 0x%x,set_to_0 0x%x,reserved_1 0x%x",
472                            v->set_to_1, v->set_to_0, reserved_1);
473                 printk(" reserved_0 0x%x", reserved_0);
474                 printk(" changeable_bits 0x%x\n", changeable_bits);
475                 return false;
476         }
477
478         *result = v->set_to_1 | reserved_1;
479
480         printd("%s: check_vmxec_controls succeeds with result 0x%x\n",
481                    v->name, *result);
482         return true;
483 }
484
485 /*
486  * We're trying to make this as readable as possible. Realistically, it will
487  * rarely if ever change, if the past is any guide.
488  */
489 static const struct vmxec pbec = {
490         .name = "Pin Based Execution Controls",
491         .msr = MSR_IA32_VMX_PINBASED_CTLS,
492         .truemsr = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
493
494         .set_to_1 = (PIN_BASED_EXT_INTR_MASK |
495                      PIN_BASED_NMI_EXITING |
496                      PIN_BASED_VIRTUAL_NMIS),
497
498         .set_to_0 = (PIN_BASED_VMX_PREEMPTION_TIMER |
499                      PIN_BASED_POSTED_INTR),
500 };
501
502 static const struct vmxec cbec = {
503         .name = "CPU Based Execution Controls",
504         .msr = MSR_IA32_VMX_PROCBASED_CTLS,
505         .truemsr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
506
507         .set_to_1 = (CPU_BASED_HLT_EXITING |
508                      CPU_BASED_MWAIT_EXITING |
509                      CPU_BASED_RDPMC_EXITING |
510                      CPU_BASED_CR8_LOAD_EXITING |
511                      CPU_BASED_CR8_STORE_EXITING |
512                      CPU_BASED_USE_MSR_BITMAPS |
513                      CPU_BASED_MONITOR_EXITING |
514                      CPU_BASED_USE_IO_BITMAPS |
515                      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS),
516
517         .set_to_0 = (CPU_BASED_VIRTUAL_INTR_PENDING |
518                      CPU_BASED_INVLPG_EXITING |
519                      CPU_BASED_USE_TSC_OFFSETING |
520                      CPU_BASED_RDTSC_EXITING |
521                      CPU_BASED_CR3_LOAD_EXITING |
522                      CPU_BASED_CR3_STORE_EXITING |
523                      CPU_BASED_TPR_SHADOW |
524                      CPU_BASED_MOV_DR_EXITING |
525                      CPU_BASED_VIRTUAL_NMI_PENDING |
526                      CPU_BASED_MONITOR_TRAP |
527                      CPU_BASED_PAUSE_EXITING |
528                      CPU_BASED_UNCOND_IO_EXITING),
529 };
530
531 static const struct vmxec cb2ec = {
532         .name = "CPU Based 2nd Execution Controls",
533         .msr = MSR_IA32_VMX_PROCBASED_CTLS2,
534         .truemsr = MSR_IA32_VMX_PROCBASED_CTLS2,
535
536         .set_to_1 = (SECONDARY_EXEC_ENABLE_EPT |
537                      SECONDARY_EXEC_WBINVD_EXITING),
538
539         .set_to_0 = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
540                      SECONDARY_EXEC_DESCRIPTOR_EXITING |
541                      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
542                      SECONDARY_EXEC_ENABLE_VPID |
543                      SECONDARY_EXEC_UNRESTRICTED_GUEST |
544                      SECONDARY_EXEC_APIC_REGISTER_VIRT |
545                      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
546                      SECONDARY_EXEC_PAUSE_LOOP_EXITING |
547                      SECONDARY_EXEC_RDRAND_EXITING |
548                      SECONDARY_EXEC_ENABLE_INVPCID |
549                      SECONDARY_EXEC_ENABLE_VMFUNC |
550                      SECONDARY_EXEC_SHADOW_VMCS |
551                      SECONDARY_EXEC_RDSEED_EXITING |
552                      SECONDARY_EPT_VE |
553                      /* TODO: re enable this via a "Want" struct
554                         member at some point */
555                      SECONDARY_EXEC_RDTSCP |
556                      SECONDARY_ENABLE_XSAV_RESTORE)
557 };
558
559 static const struct vmxec vmentry = {
560         .name = "VMENTRY controls",
561         .msr = MSR_IA32_VMX_ENTRY_CTLS,
562         .truemsr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
563         /* exact order from vmx.h; only the first two are enabled. */
564
565         .set_to_1 =  (VM_ENTRY_LOAD_DEBUG_CONTROLS | /* can't set to 0 */
566                       VM_ENTRY_LOAD_IA32_EFER |
567                       VM_ENTRY_IA32E_MODE),
568
569         .set_to_0 = (VM_ENTRY_SMM |
570                      VM_ENTRY_DEACT_DUAL_MONITOR |
571                      VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
572                      VM_ENTRY_LOAD_IA32_PAT),
573 };
574
575 static const struct vmxec vmexit = {
576         .name = "VMEXIT controls",
577         .msr = MSR_IA32_VMX_EXIT_CTLS,
578         .truemsr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
579
580         .set_to_1 = (VM_EXIT_SAVE_DEBUG_CONTROLS |      /* can't set to 0 */
581                                  VM_EXIT_SAVE_IA32_EFER | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_HOST_ADDR_SPACE_SIZE),       /* 64 bit */
582
583         .set_to_0 = (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
584                                  VM_EXIT_ACK_INTR_ON_EXIT |
585                                  VM_EXIT_SAVE_IA32_PAT |
586                                  VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER),
587 };
588
589 static void
590 setup_vmcs_config(void *p)
591 {
592         int *ret = p;
593         struct vmcs_config *vmcs_conf = &vmcs_config;
594         uint32_t vmx_msr_high;
595         uint64_t vmx_msr;
596         bool have_true_msrs = false;
597         bool ok;
598
599         *ret = -EIO;
600
601         vmx_msr = read_msr(MSR_IA32_VMX_BASIC);
602         vmx_msr_high = vmx_msr >> 32;
603
604         /*
605          * If bit 55 (VMX_BASIC_TRUE_CTLS) is set, then we
606          * can go for the true MSRs.  Else, we ask you to get a better CPU.
607          */
608         if (vmx_msr & VMX_BASIC_TRUE_CTLS) {
609                 have_true_msrs = true;
610                 printd("Running with TRUE MSRs\n");
611         } else {
612                 printk("Running with non-TRUE MSRs, this is old hardware\n");
613         }
614
615         /*
616          * Don't worry that one or more of these might fail and leave
617          * the VMCS in some kind of incomplete state. If one of these
618          * fails, the caller is going to discard the VMCS.
619          * It is written this way to ensure we get results of all tests and avoid
620          * BMAFR behavior.
621          */
622         ok = check_vmxec_controls(&pbec, have_true_msrs,
623                                   &vmcs_conf->pin_based_exec_ctrl);
624         ok = check_vmxec_controls(&cbec, have_true_msrs,
625                                   &vmcs_conf->cpu_based_exec_ctrl) && ok;
626         /* Only check cb2ec if we're still ok, o/w we may GPF */
627         ok = ok && check_vmxec_controls(&cb2ec, have_true_msrs,
628                                         &vmcs_conf->cpu_based_2nd_exec_ctrl);
629         ok = check_vmxec_controls(&vmentry, have_true_msrs,
630                                   &vmcs_conf->vmentry_ctrl) && ok;
631         ok = check_vmxec_controls(&vmexit, have_true_msrs,
632                                   &vmcs_conf->vmexit_ctrl) && ok;
633         if (!ok) {
634                 printk("vmxec controls are no good.\n");
635                 return;
636         }
637
638         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
639         if ((vmx_msr_high & 0x1fff) > PGSIZE) {
640                 printk("(vmx_msr_high & 0x1fff) is 0x%x, > PGSIZE 0x%x\n",
641                            vmx_msr_high & 0x1fff, PGSIZE);
642                 return;
643         }
644
645         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
646         if (vmx_msr & VMX_BASIC_64) {
647                 printk("VMX doesn't support 64 bit width!\n");
648                 return;
649         }
650
651         if (((vmx_msr & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT)
652                 != VMX_BASIC_MEM_TYPE_WB) {
653                 printk("VMX doesn't support WB memory for VMCS accesses!\n");
654                 return;
655         }
656
657         vmcs_conf->size = vmx_msr_high & 0x1fff;
658         vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
659         vmcs_conf->revision_id = (uint32_t) vmx_msr;
660
661         /* Read in the caps for runtime checks.  This MSR is only available if
662          * secondary controls and ept or vpid is on, which we check earlier */
663         rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid);
664
665         *ret = 0;
666 }
667
668 static struct vmcs *
669 __vmx_alloc_vmcs(int node)
670 {
671         struct vmcs *vmcs;
672
673         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
674         if (!vmcs)
675                 return 0;
676         memset(vmcs, 0, vmcs_config.size);
677         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
678         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
679         return vmcs;
680 }
681
682 /**
683  * vmx_alloc_vmcs - allocates a VMCS region
684  *
685  * NOTE: Assumes the new region will be used by the current CPU.
686  *
687  * Returns a valid VMCS region.
688  */
689 static struct vmcs *
690 vmx_alloc_vmcs(void)
691 {
692         return __vmx_alloc_vmcs(numa_id());
693 }
694
695 /**
696  * vmx_free_vmcs - frees a VMCS region
697  */
698 static void
699 vmx_free_vmcs(struct vmcs *vmcs)
700 {
701         //free_pages((unsigned long)vmcs, vmcs_config.order);
702 }
703
704 /*
705  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
706  * will not change in the lifetime of the guest.
707  * Note that host-state that does change is set elsewhere. E.g., host-state
708  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
709  */
710 static void
711 vmx_setup_constant_host_state(void)
712 {
713         uint32_t low32, high32;
714         unsigned long tmpl;
715         pseudodesc_t dt;
716
717         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);    /* 22.2.3 */
718         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
719         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
720
721         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
722         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
723         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
724         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
725         vmcs_write16(HOST_TR_SELECTOR, GD_TSS); /* 22.2.4 */
726
727         native_store_idt(&dt);
728         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);        /* 22.2.4 */
729
730         asm("mov $.Lkvm_vmx_return, %0":"=r"(tmpl));
731         vmcs_writel(HOST_RIP, tmpl);    /* 22.2.5 */
732
733         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
734         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
735         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
736         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);      /* 22.2.3 */
737
738         rdmsr(MSR_EFER, low32, high32);
739         vmcs_write32(HOST_IA32_EFER, low32);
740
741         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
742                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
743                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
744         }
745
746         vmcs_write16(HOST_FS_SELECTOR, 0);      /* 22.2.4 */
747         vmcs_write16(HOST_GS_SELECTOR, 0);      /* 22.2.4 */
748
749         /* TODO: This (at least gs) is per cpu */
750         rdmsrl(MSR_FS_BASE, tmpl);
751         vmcs_writel(HOST_FS_BASE, tmpl);        /* 22.2.4 */
752         rdmsrl(MSR_GS_BASE, tmpl);
753         vmcs_writel(HOST_GS_BASE, tmpl);        /* 22.2.4 */
754 }
755
756 static inline uint16_t
757 vmx_read_ldt(void)
758 {
759         uint16_t ldt;
760         asm("sldt %0" : "=g"(ldt));
761         return ldt;
762 }
763
764 static unsigned long
765 segment_base(uint16_t selector)
766 {
767         pseudodesc_t *gdt = &currentcpu->host_gdt;
768         struct desc_struct *d;
769         unsigned long table_base;
770         unsigned long v;
771
772         if (!(selector & ~3)) {
773                 return 0;
774         }
775
776         table_base = gdt->pd_base;
777
778         if (selector & 4) {     /* from ldt */
779                 uint16_t ldt_selector = vmx_read_ldt();
780
781                 if (!(ldt_selector & ~3)) {
782                         return 0;
783                 }
784
785                 table_base = segment_base(ldt_selector);
786         }
787         d = (struct desc_struct *)(table_base + (selector & ~7));
788         v = get_desc_base(d);
789         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
790                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
791         return v;
792 }
793
794 static inline unsigned long
795 vmx_read_tr_base(void)
796 {
797         uint16_t tr;
798         asm("str %0" : "=g"(tr));
799         return segment_base(tr);
800 }
801
802 static void
803 __vmx_setup_cpu(void)
804 {
805         pseudodesc_t *gdt = &currentcpu->host_gdt;
806         unsigned long sysenter_esp;
807         unsigned long tmpl;
808
809         /*
810          * Linux uses per-cpu TSS and GDT, so set these when switching
811          * processors.
812          */
813         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base());  /* 22.2.4 */
814         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);      /* 22.2.4 */
815
816         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
817         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
818
819         rdmsrl(MSR_FS_BASE, tmpl);
820         vmcs_writel(HOST_FS_BASE, tmpl);        /* 22.2.4 */
821         rdmsrl(MSR_GS_BASE, tmpl);
822         vmcs_writel(HOST_GS_BASE, tmpl);        /* 22.2.4 */
823 }
824
825 /**
826  * vmx_get_cpu - called before using a cpu
827  * @vcpu: VCPU that will be loaded.
828  *
829  * Disables preemption. Call vmx_put_cpu() when finished.
830  */
831 static void
832 vmx_get_cpu(struct vmx_vcpu *vcpu)
833 {
834         int cur_cpu = core_id();
835         handler_wrapper_t *w;
836
837         if (currentcpu->local_vcpu)
838                 panic("get_cpu: currentcpu->localvcpu was non-NULL");
839         if (currentcpu->local_vcpu != vcpu) {
840                 currentcpu->local_vcpu = vcpu;
841
842                 if (vcpu->cpu != cur_cpu) {
843                         if (vcpu->cpu >= 0) {
844                                 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
845                         } else
846                                 vmcs_clear(vcpu->vmcs);
847
848                         ept_sync_context(vcpu_get_eptp(vcpu));
849
850                         vcpu->launched = 0;
851                         vmcs_load(vcpu->vmcs);
852                         __vmx_setup_cpu();
853                         vcpu->cpu = cur_cpu;
854                 } else {
855                         vmcs_load(vcpu->vmcs);
856                 }
857         }
858 }
859
860 /**
861  * vmx_put_cpu - called after using a cpu
862  * @vcpu: VCPU that was loaded.
863  */
864 static void
865 vmx_put_cpu(struct vmx_vcpu *vcpu)
866 {
867         if (core_id() != vcpu->cpu)
868                 panic("%s: core_id() %d != vcpu->cpu %d\n",
869                           __func__, core_id(), vcpu->cpu);
870
871         if (currentcpu->local_vcpu != vcpu)
872                 panic("vmx_put_cpu: asked to clear something not ours");
873
874         ept_sync_context(vcpu_get_eptp(vcpu));
875         vmcs_clear(vcpu->vmcs);
876         vcpu->cpu = -1;
877         currentcpu->local_vcpu = NULL;
878         //put_cpu();
879 }
880
881 /**
882  * vmx_dump_cpu - prints the CPU state
883  * @vcpu: VCPU to print
884  */
885 static void
886 vmx_dump_cpu(struct vmx_vcpu *vcpu)
887 {
888
889         unsigned long flags;
890
891         vmx_get_cpu(vcpu);
892         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
893         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
894         flags = vmcs_readl(GUEST_RFLAGS);
895         vmx_put_cpu(vcpu);
896
897         printk("--- Begin VCPU Dump ---\n");
898         printk("CPU %d VPID %d\n", vcpu->cpu, 0);
899         printk("RIP 0x%016lx RFLAGS 0x%08lx\n", vcpu->regs.tf_rip, flags);
900         printk("RAX 0x%016lx RCX 0x%016lx\n", vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
901         printk("RDX 0x%016lx RBX 0x%016lx\n", vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
902         printk("RSP 0x%016lx RBP 0x%016lx\n", vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
903         printk("RSI 0x%016lx RDI 0x%016lx\n", vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
904         printk("R8  0x%016lx R9  0x%016lx\n", vcpu->regs.tf_r8, vcpu->regs.tf_r9);
905         printk("R10 0x%016lx R11 0x%016lx\n", vcpu->regs.tf_r10, vcpu->regs.tf_r11);
906         printk("R12 0x%016lx R13 0x%016lx\n", vcpu->regs.tf_r12, vcpu->regs.tf_r13);
907         printk("R14 0x%016lx R15 0x%016lx\n", vcpu->regs.tf_r14, vcpu->regs.tf_r15);
908         printk("--- End VCPU Dump ---\n");
909
910 }
911
912 uint64_t
913 construct_eptp(physaddr_t root_hpa)
914 {
915         uint64_t eptp;
916
917         /* set WB memory and 4 levels of walk.  we checked these in ept_init */
918         eptp = VMX_EPT_MEM_TYPE_WB | (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
919         if (cpu_has_vmx_ept_ad_bits())
920                 eptp |= VMX_EPT_AD_ENABLE_BIT;
921         eptp |= (root_hpa & PAGE_MASK);
922
923         return eptp;
924 }
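/* For reference, the EPTP layout being assembled above: bits 2:0 are the
 * EPT paging-structure memory type (6 = write-back), bits 5:3 are the
 * page-walk length minus one (3 for a 4-level walk), bit 6 enables the
 * accessed/dirty flags, and the upper bits hold the 4KB-aligned physical
 * address of the EPT PML4 table.  A hypothetical example, not from a
 * real machine: a PML4 at phys 0x1234000 with A/D support gives
 *
 *      eptp = 0x1234000 | (3 << 3) | 6 | (1 << 6) = 0x123405e
 */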
925
926 /**
927  * vmx_setup_initial_guest_state - configures the initial state of guest registers
928  */
929 static void
930 vmx_setup_initial_guest_state(void)
931 {
932         unsigned long tmpl;
933         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
934                 X86_CR4_PGE | X86_CR4_OSFXSR;
935         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
936 #if 0
937         /* do we need it? */
938         if (boot_cpu_has(X86_FEATURE_PCID))
939                 cr4 |= X86_CR4_PCIDE;
940         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
941                 cr4 |= X86_CR4_OSXSAVE;
942 #endif
943         /* we almost certainly have this */
944         /* we'll go sour if we don't. */
945         if (1)  //boot_cpu_has(X86_FEATURE_FSGSBASE))
946                 cr4 |= X86_CR4_RDWRGSFS;
947
948         /* configure control and data registers */
949         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
950                                 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
951         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
952                                 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
953         vmcs_writel(GUEST_CR3, rcr3());
954         vmcs_writel(GUEST_CR4, cr4);
955         vmcs_writel(CR4_READ_SHADOW, cr4);
956         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
957                                 EFER_SCE /*| EFER_FFXSR */ );
958         vmcs_writel(GUEST_GDTR_BASE, 0);
959         vmcs_writel(GUEST_GDTR_LIMIT, 0);
960         vmcs_writel(GUEST_IDTR_BASE, 0);
961         vmcs_writel(GUEST_IDTR_LIMIT, 0);
962         vmcs_writel(GUEST_RIP, 0xdeadbeef);
963         vmcs_writel(GUEST_RSP, 0xdeadbeef);
964         vmcs_writel(GUEST_RFLAGS, 0x02);
965         vmcs_writel(GUEST_DR7, 0);
966
967         /* guest segment bases */
968         vmcs_writel(GUEST_CS_BASE, 0);
969         vmcs_writel(GUEST_DS_BASE, 0);
970         vmcs_writel(GUEST_ES_BASE, 0);
971         vmcs_writel(GUEST_GS_BASE, 0);
972         vmcs_writel(GUEST_SS_BASE, 0);
973         rdmsrl(MSR_FS_BASE, tmpl);
974         vmcs_writel(GUEST_FS_BASE, tmpl);
975
976         /* guest segment access rights */
977         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
978         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
979         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
980         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
981         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
982         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
983
984         /* guest segment limits */
985         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
986         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
987         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
988         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
989         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
990         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
991
992         /* configure segment selectors */
993         vmcs_write16(GUEST_CS_SELECTOR, 0);
994         vmcs_write16(GUEST_DS_SELECTOR, 0);
995         vmcs_write16(GUEST_ES_SELECTOR, 0);
996         vmcs_write16(GUEST_FS_SELECTOR, 0);
997         vmcs_write16(GUEST_GS_SELECTOR, 0);
998         vmcs_write16(GUEST_SS_SELECTOR, 0);
999         vmcs_write16(GUEST_TR_SELECTOR, 0);
1000
1001         /* guest LDTR */
1002         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1003         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1004         vmcs_writel(GUEST_LDTR_BASE, 0);
1005         vmcs_writel(GUEST_LDTR_LIMIT, 0);
1006
1007         /* guest TSS */
1008         vmcs_writel(GUEST_TR_BASE, 0);
1009         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1010         vmcs_writel(GUEST_TR_LIMIT, 0xff);
1011
1012         /* initialize sysenter */
1013         vmcs_write32(GUEST_SYSENTER_CS, 0);
1014         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1015         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1016
1017         /* other random initialization */
1018         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1019         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1020         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1021         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1022         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1023 }
1024
1025 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1026                                             uint32_t msr) {
1027         int f = sizeof(unsigned long);
1028         /*
1029          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1030          * have the write-low and read-high bitmap offsets the wrong way round.
1031          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1032          */
1033         if (msr <= 0x1fff) {
1034                 __clear_bit(msr, msr_bitmap + 0x000 / f);       /* read-low */
1035                 __clear_bit(msr, msr_bitmap + 0x800 / f);       /* write-low */
1036         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1037                 msr &= 0x1fff;
1038                 __clear_bit(msr, msr_bitmap + 0x400 / f);       /* read-high */
1039                 __clear_bit(msr, msr_bitmap + 0xc00 / f);       /* write-high */
1040         }
1041 }
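/* A hedged usage sketch, matching the policy described at the top of
 * this file (only the FS/GS base MSRs are left to the guest); the real
 * call sites (presumably in the vmx init code) are not shown in this
 * excerpt:
 *
 *      __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
 *      __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
 *
 * Everything still set in the bitmap keeps causing exits and ends up in
 * msrio() below.
 */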
1042
1043 /* note the io_bitmap is big enough for the 64K port space. */
1044 static void __vmx_disable_intercept_for_io(unsigned long *io_bitmap,
1045                                            uint16_t port) {
1046         __clear_bit(port, io_bitmap);
1047 }
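/* Similarly, a hypothetical example (not a call made in this file): to
 * let the guest drive the legacy COM1 UART directly instead of exiting,
 * one could clear its ports from the bitmap:
 *
 *      for (int port = 0x3f8; port <= 0x3ff; port++)
 *              __vmx_disable_intercept_for_io(io_bitmap, port);
 */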
1048
1049 static void vcpu_print_autoloads(struct vmx_vcpu *vcpu) {
1050         struct vmx_msr_entry *e;
1051         int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1052         printk("Host Autoloads:\n-------------------\n");
1053         for (int i = 0; i < sz; i++) {
1054                 e = &vcpu->msr_autoload.host[i];
1055                 printk("\tMSR 0x%08x: %p\n", e->index, e->value);
1056         }
1057         printk("Guest Autoloads:\n-------------------\n");
1058         for (int i = 0; i < sz; i++) {
1059                 e = &vcpu->msr_autoload.guest[i];
1060                 printk("\tMSR 0x%08x %p\n", e->index, e->value);
1061         }
1062 }
1063
1064 static void dumpmsrs(void) {
1065         int i;
1066         int set[] = {
1067                 MSR_LSTAR,
1068                 MSR_FS_BASE,
1069                 MSR_GS_BASE,
1070                 MSR_KERNEL_GS_BASE,
1071                 MSR_SFMASK,
1072                 MSR_IA32_PEBS_ENABLE
1073         };
1074         for (i = 0; i < ARRAY_SIZE(set); i++) {
1075                 printk("%p: %p\n", set[i], read_msr(set[i]));
1076         }
1077         printk("core id %d\n", core_id());
1078 }
1079
1080 /* emulated msr. For now, an msr value and a pointer to a helper that
1081  * performs the requested operation.
1082  */
1083 struct emmsr {
1084         uint32_t reg;
1085         char *name;
1086         int (*f) (struct vmx_vcpu * vcpu, struct emmsr *, uint32_t, uint32_t);
1087         bool written;
1088         uint32_t edx, eax;
1089 };
1090
1091 int emsr_miscenable(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1092                     uint32_t);
1093 int emsr_mustmatch(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1094                    uint32_t);
1095 int emsr_readonly(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1096                   uint32_t);
1097 int emsr_readzero(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1098                   uint32_t);
1099 int emsr_fakewrite(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1100                    uint32_t);
1101 int emsr_ok(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t, uint32_t);
1102
1103 struct emmsr emmsrs[] = {
1104         {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
1105         {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
1106         {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
1107         {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
1108         {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
1109         {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
1110         {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
1111         {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
1112          emsr_fakewrite},
1113         {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
1114          emsr_fakewrite},
1115         {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
1116          emsr_fakewrite},
1117         {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
1118          emsr_fakewrite},
1119         {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
1120          emsr_fakewrite},
1121         {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
1122          emsr_fakewrite},
1123         {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
1124         {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
1125         {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
1126         {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
1127         {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
1128         {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},
1129
1130         // grumble. 
1131         {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
1132         {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
1133         // louder.
1134         {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
1135         // aaaaaahhhhhhhhhhhhhhhhhhhhh
1136         {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
1137         {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
1138         // unsafe.
1139         {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fakewrite},
1140
1141         // mostly harmless.
1142         {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
1143         {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},
1144 };
1145
1146 static uint64_t set_low32(uint64_t hi, uint32_t lo)
1147 {
1148         return (hi & 0xffffffff00000000ULL) | lo;
1149 }
1150
1151 static uint64_t set_low16(uint64_t hi, uint16_t lo)
1152 {
1153         return (hi & 0xffffffffffff0000ULL) | lo;
1154 }
1155
1156 static uint64_t set_low8(uint64_t hi, uint8_t lo)
1157 {
1158         return (hi & 0xffffffffffffff00ULL) | lo;
1159 }
1160
1161 /* this may be the only register that needs special handling.
1162  * If there are others, then we might want to extend the emmsr struct.
1163  */
1164 int emsr_miscenable(struct vmx_vcpu *vcpu, struct emmsr *msr,
1165                     uint32_t opcode, uint32_t qual) {
1166         uint32_t eax, edx;
1167         rdmsr(msr->reg, eax, edx);
1168         /* we just let them read the misc msr for now. */
1169         if (opcode == EXIT_REASON_MSR_READ) {
1170                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1171                 vcpu->regs.tf_rax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1172                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1173                 return 0;
1174         } else {
1175                 /* if they are writing what is already written, that's ok. */
1176                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1177                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1178                         return 0;
1179         }
1180         printk
1181                 ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
1182                  msr->name, (uint32_t) vcpu->regs.tf_rdx,
1183                  (uint32_t) vcpu->regs.tf_rax, edx, eax);
1184         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1185 }
1186
1187 int emsr_mustmatch(struct vmx_vcpu *vcpu, struct emmsr *msr,
1188                    uint32_t opcode, uint32_t qual) {
1189         uint32_t eax, edx;
1190         rdmsr(msr->reg, eax, edx);
1191         /* reads of the real value are fine. */
1192         if (opcode == EXIT_REASON_MSR_READ) {
1193                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1194                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1195                 return 0;
1196         } else {
1197                 /* if they are writing what is already written, that's ok. */
1198                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1199                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1200                         return 0;
1201         }
1202         printk
1203                 ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
1204                  msr->name, (uint32_t) vcpu->regs.tf_rdx,
1205                  (uint32_t) vcpu->regs.tf_rax, edx, eax);
1206         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1207 }
1208
1209 int emsr_ok(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1210             uint32_t qual) {
1211         if (opcode == EXIT_REASON_MSR_READ) {
1212                 rdmsr(msr->reg, vcpu->regs.tf_rdx, vcpu->regs.tf_rax);
1213         } else {
1214                 uint64_t val =
1215                         (uint64_t) vcpu->regs.tf_rdx << 32 | vcpu->regs.tf_rax;
1216                 write_msr(msr->reg, val);
1217         }
1218         return 0;
1219 }
1220
1221 int emsr_readonly(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1222                   uint32_t qual) {
1223         uint32_t eax, edx;
1224         rdmsr((uint32_t) vcpu->regs.tf_rcx, eax, edx);
1225         /* reads are allowed; writes are rejected below. */
1226         if (opcode == EXIT_REASON_MSR_READ) {
1227                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1228                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1229                 return 0;
1230         }
1231
1232         printk("%s: Tried to write a readonly register\n", msr->name);
1233         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1234 }
1235
1236 int emsr_readzero(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1237                   uint32_t qual) {
1238         if (opcode == EXIT_REASON_MSR_READ) {
1239                 vcpu->regs.tf_rax = 0;
1240                 vcpu->regs.tf_rdx = 0;
1241                 return 0;
1242         }
1243
1244         printk("%s: Tried to write a readonly register\n", msr->name);
1245         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1246 }
1247
1248 /* pretend to write it, but don't write it. */
1249 int emsr_fakewrite(struct vmx_vcpu *vcpu, struct emmsr *msr,
1250                    uint32_t opcode, uint32_t qual) {
1251         uint32_t eax, edx;
1252         if (!msr->written) {
1253                 rdmsr(msr->reg, eax, edx);
1254         } else {
1255                 edx = msr->edx;
1256                 eax = msr->eax;
1257         }
1258         /* reads return the real value, or the last fake-written one. */
1259         if (opcode == EXIT_REASON_MSR_READ) {
1260                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1261                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1262                 return 0;
1263         } else {
1264                 /* if they are writing what is already written, that's ok. */
1265                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1266                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1267                         return 0;
1268                 msr->edx = vcpu->regs.tf_rdx;
1269                 msr->eax = vcpu->regs.tf_rax;
1270                 msr->written = true;
1271         }
1272         return 0;
1273 }
1274
1275 static int
1276 msrio(struct vmx_vcpu *vcpu, uint32_t opcode, uint32_t qual) {
1277         int i;
1278         for (i = 0; i < ARRAY_SIZE(emmsrs); i++) {
1279                 if (emmsrs[i].reg != vcpu->regs.tf_rcx)
1280                         continue;
1281                 return emmsrs[i].f(vcpu, &emmsrs[i], opcode, qual);
1282         }
1283         printk("msrio for 0x%lx failed\n", vcpu->regs.tf_rcx);
1284         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1285 }
1286
1287 /* crude PCI bus. Just enough to get virtio working. I would rather not add to this. */
1288 struct pciconfig {
1289         uint32_t registers[256];
1290 };
1291
1292 /* just index by devfn, i.e. 8 bits */
1293 struct pciconfig pcibus[] = {
1294         /* linux requires that devfn 0 be a bridge. 
1295          * 00:00.0 Host bridge: Intel Corporation 440BX/ZX/DX - 82443BX/ZX/DX Host bridge (rev 01)
1296          */
1297         {
1298                 {0x71908086, 0x02000006, 0x06000001},
1299         },
1300 };
1301 /* cf8 is a single-threaded resource. */
1302 static uint32_t cf8;
1303 static uint32_t allones = (uint32_t)-1;
1304
1305 /* Return a pointer to the 32-bit "register" in the "pcibus", given an address. Uses cf8.
1306  * Only for readonly access.
1307  * This will fail if we ever want to do writes, but we don't.
1308  */
1309 void regp(uint32_t **reg)
1310 {
1311         *reg = &allones;
1312         int devfn = (cf8>>8) & 0xff;
1313         //printk("devfn %d\n", devfn);
1314         if (devfn < ARRAY_SIZE(pcibus))
1315                 *reg = &pcibus[devfn].registers[(cf8>>2)&0x3f];
1316         //printk("-->regp *reg 0x%lx\n", **reg);
1317 }
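/* For reference, the standard 0xcf8 CONFIG_ADDRESS layout the shifts
 * above rely on: bit 31 = enable, bits 23:16 = bus, bits 15:8 = devfn
 * (device in 15:11, function in 10:8), bits 7:2 = dword register index.
 * A hypothetical example, not from a real trace:
 *
 *      cf8 = 0x80000008  ->  bus 0, devfn 0, register index 2,
 *      so regp() hands back &pcibus[0].registers[2].
 */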
1318
1319 static uint32_t configaddr(uint32_t val)
1320 {
1321         printk("%s 0x%lx\n", __func__, val);
1322         cf8 = val;
1323         return 0;
1324 }
1325
1326 static uint32_t configread32(uint32_t edx, uint64_t *reg)
1327 {
1328         uint32_t *r = &cf8;
1329         regp(&r);
1330         *reg = set_low32(*reg, *r);
1331         printk("%s: 0x%lx 0x%lx, 0x%lx 0x%lx\n", __func__, cf8, edx, r, *reg);
1332         return 0;
1333 }
1334
1335 static uint32_t configread16(uint32_t edx, uint64_t *reg)
1336 {
1337         uint64_t val;
1338         int which = ((edx&2)>>1) * 16;
1339         configread32(edx, &val);
1340         val >>= which;
1341         *reg = set_low16(*reg, val);
1342         printk("%s: 0x%lx, 0x%lx 0x%lx\n", __func__, edx, val, *reg);
1343         return 0;
1344 }
1345
1346 static uint32_t configread8(uint32_t edx, uint64_t *reg)
1347 {
1348         uint64_t val;
1349         int which = (edx&3) * 8;
1350         configread32(edx, &val);
1351         val >>= which;
1352         *reg = set_low8(*reg, val);
1353         printk("%s: 0x%lx, 0x%lx 0x%lx\n", __func__, edx, val, *reg);
1354         return 0;
1355 }
1356
1357 static int configwrite32(uint32_t addr, uint32_t val)
1358 {
1359         uint32_t *r = &cf8;
1360         regp(&r);
1361         *r = val;
1362         printk("%s 0x%lx 0x%lx\n", __func__, addr, val);
1363         return 0;
1364 }
1365
1366 static int configwrite16(uint32_t addr, uint16_t val)
1367 {
1368         printk("%s 0x%lx 0x%lx\n", __func__, addr, val);
1369         return 0;
1370 }
1371
1372 static int configwrite8(uint32_t addr, uint8_t val)
1373 {
1374         printk("%s 0x%lx 0x%lx\n", __func__, addr, val);
1375         return 0;
1376 }
1377
1378 /* this is very minimal. It needs to move to vmm/io.c but we don't
1379  * know if this minimal approach will even be workable. It only (for
1380  * now) handles pci config space. We'd like to hope that's all we will
1381  * need.
1382  * It would have been nice had intel encoded the IO exit info as nicely as they
1383  * encoded, some of the other exits.
1384  */
1385 static int io(struct vmx_vcpu *vcpu, int *advance)
1386 {
1387
1388         /* Get a pointer to the memory at %rip. This is quite messy and part of the
1389          * reason we don't want to do this at all. It sucks. Would have been nice
1390          * had linux had an option to ONLY do mmio config space access, but no such
1391          * luck.
1392          */
1393         uint8_t *ip8 = NULL;
1394         uint16_t *ip16;
1395         uintptr_t ip;
1396         uint32_t edx;
1397         /* For now, we're going to be a bit crude. In the kernel, paddr is roughly vaddr, so we just
1398          * blow away the upper 34 bits and take the rest as our address.
1399          */
1400         ip = vcpu->regs.tf_rip & 0x3fffffff;
1401         edx = vcpu->regs.tf_rdx;
1402         ip8 = (void *)ip;
1403         ip16 = (void *)ip;
1404         //printk("io: ip16 %p\n", *ip16, edx);
1405
1406         if (*ip8 == 0xef) {
1407                 *advance = 1;
1408                 /* out at %edx */
1409                 if (edx == 0xcf8) {
1410                         //printk("Set cf8 ");
1411                         return configaddr(vcpu->regs.tf_rax);
1412                 }
1413                 if (edx == 0xcfc) {
1414                         //printk("Set cfc ");
1415                         return configwrite32(edx, vcpu->regs.tf_rax);
1416                 }
1417                 printk("unhandled IO address dx @%p is 0x%x\n", ip8, edx);
1418                 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1419         }
1420         // out %al, %dx
1421         if (*ip8 == 0xee) {
1422                 *advance = 1;
1423                 /* out %al, %dx */
1424                 if (edx == 0xcfb) { // special!
1425                         printk("Just ignore the damned cfb write\n");
1426                         return 0;
1427                 }
1428                 if ((edx&~3) == 0xcfc) {
1429                         //printk("ignoring write to cfc ");
1430                         return 0;
1431                 }
1432                 printk("unhandled IO address dx @%p is 0x%x\n", ip8, edx);
1433                 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1434         }
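        /* in %al, %dx: byte read, assumed to target the config data port */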
1435         if (*ip8 == 0xec) {
1436                 *advance = 1;
1437                 //printk("configread8 ");
1438                 return configread8(edx, &vcpu->regs.tf_rax);
1439         }
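        /* in %eax, %dx: dword read, either a 0xcf8 address readback or config data */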
1440         if (*ip8 == 0xed) {
1441                 *advance = 1;
1442                 if (edx == 0xcf8) {
1443                         //printk("read cf8 0x%lx\n", vcpu->regs.tf_rax);
1444                         vcpu->regs.tf_rax = cf8;
1445                         return 0;
1446                 }
1447                 //printk("configread32 ");
1448                 return configread32(edx, &vcpu->regs.tf_rax);
1449         }
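        /* 0x66 prefix + 0xed: in %ax, %dx (word read of the config data port) */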
1450         if (*ip16 == 0xed66) {
1451                 *advance = 2;
1452                 //printk("configread16 ");
1453                 return configread16(edx, &vcpu->regs.tf_rax);
1454         }
1455         printk("unknown IO %p %x %x\n", ip8, *ip8, *ip16);
1456         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1457 }
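/* For reference: the value latched at port 0xcf8 uses the standard PCI
 * configuration mechanism #1 encoding.  A decode sketch (the field names are
 * illustrative only; nothing else in this file uses them):
 *
 *   static void decode_cf8(uint32_t addr)
 *   {
 *           int enable = (addr >> 31) & 1;    // must be set for config cycles
 *           int bus    = (addr >> 16) & 0xff;
 *           int dev    = (addr >> 11) & 0x1f;
 *           int func   = (addr >> 8) & 0x7;
 *           int reg    = addr & 0xfc;         // dword-aligned register offset
 *
 *           printk("cf8: en %d %02x:%02x.%d reg 0x%x\n",
 *                  enable, bus, dev, func, reg);
 *   }
 *
 * The guest then reads or writes the selected dword (or a byte/word lane of
 * it) at ports 0xcfc-0xcff, which is what the configread and configwrite
 * helpers above emulate. */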
1458
1459 /* Notes on autoloading.  We can't autoload FS_BASE or GS_BASE, according to the
1460  * manual, but that's because they are automatically saved and restored when all
1461  * of the other architectural registers are saved and restored, such as cs, ds,
1462  * es, and other fun things. (See 24.4.1).  We need to make sure we don't
1463  * accidentally intercept them too, since they are magically autoloaded.
1464  *
1465  * We'll need to be careful of any MSR we neither autoload nor intercept
1466  * whenever we vmenter/vmexit, and we intercept by default.
1467  *
1468  * Other MSRs, such as MSR_IA32_PEBS_ENABLE, only work on certain
1469  * architectures. */
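/* For reference, each slot in the autoload areas filled in below uses the
 * 16-byte entry format the SDM specifies for MSR-load/store areas (a sketch;
 * the real struct vmx_msr_entry definition lives elsewhere in the vmx code):
 *
 *   struct vmx_msr_entry {
 *           uint32_t index;        // MSR number
 *           uint32_t reserved;     // must be zero
 *           uint64_t value;        // loaded/stored on entry/exit
 *   };
 */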
1470 static void setup_msr(struct vmx_vcpu *vcpu) {
1471         struct vmx_msr_entry *e;
1472         int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1473         int i;
1474
1475         static_assert((sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs)) <=
1476                       NR_AUTOLOAD_MSRS);
1477
1478         vcpu->msr_autoload.nr = sz;
1479
1480         /* Since PADDR(msr_bitmap) is non-zero, and the bitmap is all 0xff, we now
1481          * intercept all MSRs */
1482         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
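        /* Bitmap layout, per the SDM: the 4K page is four 1K quarters --
         * read-low, read-high, write-low, write-high -- and bit N of a quarter
         * covers MSR N (or 0xc0000000 + N for the "high" quarters).  A set bit
         * means "exit on access", so the all-0xff page above traps every MSR by
         * default; __vmx_disable_intercept_for_msr() clears the relevant bits. */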
1483
1484         vmcs_write64(IO_BITMAP_A, PADDR(io_bitmap));
1485         vmcs_write64(IO_BITMAP_B, PADDR((uintptr_t)io_bitmap +
1486                                         (VMX_IO_BITMAP_SZ / 2)));
1487
1488         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1489         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1490         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1491
1492         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1493         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1494         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1495
1496         for (i = 0; i < sz; i++) {
1497                 uint64_t val;
1498
1499                 e = &vcpu->msr_autoload.host[i];
1500                 e->index = autoloaded_msrs[i];
1501                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1502                 rdmsrl(e->index, val);
1503                 e->value = val;
1504                 printk("host index %p val %p\n", e->index, e->value);
1505
1506                 e = &vcpu->msr_autoload.guest[i];
1507                 e->index = autoloaded_msrs[i];
1508                 e->value = 0xDEADBEEF;
1509                 printk("guest index %p val %p\n", e->index, e->value);
1510         }
1511 }
1512
1513 /**
1514  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1515  */
1516 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu) {
1517         vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
1518         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1519
1520         /* Control */
1521         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1522                      vmcs_config.pin_based_exec_ctrl);
1523
1524         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1525                      vmcs_config.cpu_based_exec_ctrl);
1526
1527         if (cpu_has_secondary_exec_ctrls()) {
1528                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1529                              vmcs_config.cpu_based_2nd_exec_ctrl);
1530         }
1531
1532         vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));
1533
1534         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1535         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1536         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1537
1538         setup_msr(vcpu);
1539
1540         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1541
1542         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1543         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1544
1545         vmcs_writel(CR0_GUEST_HOST_MASK, 0);    // ~0ul);
1546         vmcs_writel(CR4_GUEST_HOST_MASK, 0);    // ~0ul);
1547
1548         //kvm_write_tsc(&vmx->vcpu, 0);
1549         vmcs_writel(TSC_OFFSET, 0);
1550
1551         vmx_setup_constant_host_state();
1552 }
1553
1554 /**
1555  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1556  *
1557  * Returns: A new VCPU structure
1558  */
1559 struct vmx_vcpu *vmx_create_vcpu(struct proc *p) {
1560         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1561         if (!vcpu) {
1562                 return NULL;
1563         }
1564
1565         memset(vcpu, 0, sizeof(*vcpu));
1566
1567         vcpu->proc = p; /* uncounted (weak) reference */
1568         vcpu->vmcs = vmx_alloc_vmcs();
1569         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1570         if (!vcpu->vmcs)
1571                 goto fail_vmcs;
1572
1573         vcpu->cpu = -1;
1574
1575         vmx_get_cpu(vcpu);
1576         vmx_setup_vmcs(vcpu);
1577         vmx_setup_initial_guest_state();
1578         vmx_put_cpu(vcpu);
1579
1580         return vcpu;
1581
1582 fail_vmcs:
1583         kfree(vcpu);
1584         return NULL;
1585 }
1586
1587 /**
1588  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1589  * @vcpu: the VCPU to destroy
1590  */
1591 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu) {
1592         vmx_free_vmcs(vcpu->vmcs);
1593         kfree(vcpu);
1594 }
1595
1596 /**
1597  * vmx_current_vcpu - returns a pointer to the vcpu for the calling core.
1598  *
1599  * In the contexts where this is used the vcpu pointer should never be NULL.
1600  */
1601 static inline struct vmx_vcpu *vmx_current_vcpu(void) {
1602         struct vmx_vcpu *vcpu = currentcpu->local_vcpu;
1603         if (!vcpu)
1604                 panic("Core has no vcpu!");
1605         return vcpu;
1606 }
1607
1608 /**
1609  * vmx_run_vcpu - launches the CPU into non-root mode
1610  * We ONLY support 64-bit guests.
1611  * @vcpu: the vmx instance to launch
1612  */
1613 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1614 {
1615         asm(
1616                 /* Store host registers */
1617                 "push %%rdx; push %%rbp;"
1618                 "push %%rcx \n\t" /* placeholder for guest rcx */
1619                 "push %%rcx \n\t"
1620                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1621                 "je 1f \n\t"
1622                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1623                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1624                 "1: \n\t"
1625                 /* Reload cr2 if changed */
1626                 "mov %c[cr2](%0), %%rax \n\t"
1627                 "mov %%cr2, %%rdx \n\t"
1628                 "cmp %%rax, %%rdx \n\t"
1629                 "je 2f \n\t"
1630                 "mov %%rax, %%cr2 \n\t"
1631                 "2: \n\t"
1632                 /* Check if vmlaunch or vmresume is needed */
1633                 "cmpl $0, %c[launched](%0) \n\t"
1634                 /* Load guest registers.  Don't clobber flags. */
1635                 "mov %c[rax](%0), %%rax \n\t"
1636                 "mov %c[rbx](%0), %%rbx \n\t"
1637                 "mov %c[rdx](%0), %%rdx \n\t"
1638                 "mov %c[rsi](%0), %%rsi \n\t"
1639                 "mov %c[rdi](%0), %%rdi \n\t"
1640                 "mov %c[rbp](%0), %%rbp \n\t"
1641                 "mov %c[r8](%0),  %%r8  \n\t"
1642                 "mov %c[r9](%0),  %%r9  \n\t"
1643                 "mov %c[r10](%0), %%r10 \n\t"
1644                 "mov %c[r11](%0), %%r11 \n\t"
1645                 "mov %c[r12](%0), %%r12 \n\t"
1646                 "mov %c[r13](%0), %%r13 \n\t"
1647                 "mov %c[r14](%0), %%r14 \n\t"
1648                 "mov %c[r15](%0), %%r15 \n\t"
1649                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1650
1651                 /* Enter guest mode */
1652                 "jne .Llaunched \n\t"
1653                 ASM_VMX_VMLAUNCH "\n\t"
1654                 "jmp .Lkvm_vmx_return \n\t"
1655                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1656                 ".Lkvm_vmx_return: "
1657                 /* Save guest registers, load host registers, keep flags */
1658                 "mov %0, %c[wordsize](%%rsp) \n\t"
1659                 "pop %0 \n\t"
1660                 "mov %%rax, %c[rax](%0) \n\t"
1661                 "mov %%rbx, %c[rbx](%0) \n\t"
1662                 "popq %c[rcx](%0) \n\t"
1663                 "mov %%rdx, %c[rdx](%0) \n\t"
1664                 "mov %%rsi, %c[rsi](%0) \n\t"
1665                 "mov %%rdi, %c[rdi](%0) \n\t"
1666                 "mov %%rbp, %c[rbp](%0) \n\t"
1667                 "mov %%r8,  %c[r8](%0) \n\t"
1668                 "mov %%r9,  %c[r9](%0) \n\t"
1669                 "mov %%r10, %c[r10](%0) \n\t"
1670                 "mov %%r11, %c[r11](%0) \n\t"
1671                 "mov %%r12, %c[r12](%0) \n\t"
1672                 "mov %%r13, %c[r13](%0) \n\t"
1673                 "mov %%r14, %c[r14](%0) \n\t"
1674                 "mov %%r15, %c[r15](%0) \n\t"
1675                 "mov %%rax, %%r10 \n\t"
1676                 "mov %%rdx, %%r11 \n\t"
1677
1678                 "mov %%cr2, %%rax   \n\t"
1679                 "mov %%rax, %c[cr2](%0) \n\t"
1680
1681                 "pop  %%rbp; pop  %%rdx \n\t"
1682                 "setbe %c[fail](%0) \n\t"
1683                 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1684                 "mov %%rax, %%ds \n\t"
1685                 "mov %%rax, %%es \n\t"
1686               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1687                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1688                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1689                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1690                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1691                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1692                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1693                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1694                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1695                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1696                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1697                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1698                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1699                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1700                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1701                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1702                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1703                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1704                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1705                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1706                 [wordsize]"i"(sizeof(unsigned long))
1707               : "cc", "memory"
1708                 , "rax", "rbx", "rdi", "rsi"
1709                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1710         );
1711
1712         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1713         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1714         printd("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1715                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1716         /* FIXME: do we need to set up other flags? */
1717         vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1718                 X86_EFLAGS_IF | 0x2;
1719
1720         vcpu->regs.tf_cs = GD_UT;
1721         vcpu->regs.tf_ss = GD_UD;
1722
1723         vcpu->launched = 1;
1724
1725         if (vcpu->fail) {
1726                 printk("failure detected (err %x)\n",
1727                        vmcs_read32(VM_INSTRUCTION_ERROR));
1728                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1729         }
1730
1731         return vmcs_read32(VM_EXIT_REASON);
1732
1733 #if 0
1734         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1735         vmx_complete_atomic_exit(vmx);
1736         vmx_recover_nmi_blocking(vmx);
1737         vmx_complete_interrupts(vmx);
1738 #endif
1739 }
1740
1741 static void vmx_step_instruction(void) {
1742         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1743                     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1744 }
1745
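/* EPT violations are treated as ordinary guest-physical page faults: the
 * access-type bits in the exit qualification are translated to PROT_* flags
 * and handed to handle_page_fault() for the current process. */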
1746 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu) {
1747         unsigned long gva, gpa;
1748         int exit_qual, ret = -1;
1749         page_t *page;
1750
1751         vmx_get_cpu(vcpu);
1752         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1753         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1754         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1755
1756         vmx_put_cpu(vcpu);
1757
1758         int prot = 0;
1759         prot |= exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
1760         prot |= exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
1761         prot |= exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
1762         ret = handle_page_fault(current, gpa, prot);
1763
1764         if (ret) {
1765                 printk("EPT page fault failure %d, GPA: %p, GVA: %p\n", ret, gpa,
1766                        gva);
1767                 vmx_dump_cpu(vcpu);
1768         }
1769
1770         return ret;
1771 }
1772
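/* CPUID exits are handled by reflecting the host's CPUID leaves straight back
 * to the guest; nothing is masked or virtualized yet. */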
1773 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu) {
1774         unsigned int eax, ebx, ecx, edx;
1775
1776         eax = vcpu->regs.tf_rax;
1777         ecx = vcpu->regs.tf_rcx;
1778         cpuid(eax, ecx, &eax, &ebx, &ecx, &edx);
1779         vcpu->regs.tf_rax = eax;
1780         vcpu->regs.tf_rbx = ebx;
1781         vcpu->regs.tf_rcx = ecx;
1782         vcpu->regs.tf_rdx = edx;
1783 }
1784
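/* Exception/NMI exits: an NMI that fired while we were in the guest is simply
 * swallowed (return 0); any other exception is unhandled and returns -EIO. */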
1785 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu) {
1786         uint32_t intr_info;
1787
1788         vmx_get_cpu(vcpu);
1789         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1790         vmx_put_cpu(vcpu);
1791
1792         printk("vmx (vcpu %p): got an exception\n", vcpu);
1793         printk("vmx (vcpu %p): pid %d\n", vcpu, vcpu->proc->pid);
1794         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1795                 return 0;
1796         }
1797
1798         printk("unhandled nmi, intr_info %x\n", intr_info);
1799         return -EIO;
1800 }
1801
1802 /**
1803  * vmx_launch - the main loop for a VMX Dune process
1804  * @conf: the launch configuration
1805  */
1806 int vmx_launch(uint64_t rip, uint64_t rsp, uint64_t cr3) {
1807         int ret;
1808         struct vmx_vcpu *vcpu;
1809         int errors = 0;
1810         int advance;
1811
1812         printd("RUNNING: %s: rip %p rsp %p cr3 %p \n", __func__, rip, rsp, cr3);
1813         /* TODO: dirty hack til we have VMM contexts */
1814         vcpu = current->vmm.guest_pcores[0];
1815         if (!vcpu) {
1816                 printk("Failed to get a CPU!\n");
1817                 return -ENOMEM;
1818         }
1819
1820         /* We need to prep the host's autoload region for our current core.  Right
1821          * now, the only autoloaded MSR that varies at runtime (in this case per
1822          * core) is MSR_KERNEL_GS_BASE. */
1823         rdmsrl(MSR_KERNEL_GS_BASE, vcpu->msr_autoload.host[0].value);
1824         /* If cr3 is set, it means 'set everything'; otherwise, 'start where you left off'. */
1825         if (cr3) {
1826                 vmx_get_cpu(vcpu);
1827                 vmcs_writel(GUEST_RIP, rip);
1828                 vmcs_writel(GUEST_RSP, rsp);
1829                 vmcs_writel(GUEST_CR3, cr3);
1830                 vmx_put_cpu(vcpu);
1831         }
1832
1833         vcpu->ret_code = -1;
1834
1835         while (1) {
1836                 advance = 0;
1837                 vmx_get_cpu(vcpu);
1838
1839                 // TODO: manage the fpu when we restart.
1840
1841                 // TODO: see if we need to exit before we go much further.
1842                 disable_irq();
1843                 //dumpmsrs();
1844                 ret = vmx_run_vcpu(vcpu);
1845                 //dumpmsrs();
1846                 enable_irq();
1847                 vmx_put_cpu(vcpu);
1848
1849                 if (ret == EXIT_REASON_VMCALL) {
1850                         if (current->vmm.flags & VMM_VMCALL_PRINTF) {
1851                                 uint8_t byte = vcpu->regs.tf_rdi;
1852                                 printd("System call\n");
1853 #ifdef DEBUG
1854                                 vmx_dump_cpu(vcpu);
1855 #endif
1856                                 advance = 3;
1857                                 printk("%c", byte);
1858                                 // adjust the RIP
1859                         } else {
1860                                 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1861                                 uint8_t byte = vcpu->regs.tf_rdi;
1862                                 printk("%p %c\n", (void *)vcpu->regs.tf_rdi, byte);
1863                                 vmx_dump_cpu(vcpu);
1864                                 printd("system call! WTF\n");
1865                         }
1866                 } else if (ret == EXIT_REASON_CR_ACCESS) {
1867                         show_cr_access(vmcs_read32(EXIT_QUALIFICATION));
1868                         vmx_dump_cpu(vcpu);
1869                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1870                 } else if (ret == EXIT_REASON_CPUID) {
1871                         vmx_handle_cpuid(vcpu);
1872                         vmx_get_cpu(vcpu);
1873                         vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + 2);
1874                         vmx_put_cpu(vcpu);
1875                 } else if (ret == EXIT_REASON_EPT_VIOLATION) {
1876                         if (vmx_handle_ept_violation(vcpu))
1877                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1878                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1879                         if (vmx_handle_nmi_exception(vcpu))
1880                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1881                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1882                         printd("External interrupt\n");
1883                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1884                 } else if (ret == EXIT_REASON_MSR_READ) {
1885                         printd("msr read\n");
1886                         vmx_dump_cpu(vcpu);
1887                         vcpu->shutdown =
1888                                 msrio(vcpu, ret, vmcs_read32(EXIT_QUALIFICATION));
1889                         advance = 2;
1890                 } else if (ret == EXIT_REASON_MSR_WRITE) {
1891                         printd("msr write\n");
1892                         vmx_dump_cpu(vcpu);
1893                         vcpu->shutdown =
1894                                 msrio(vcpu, ret, vmcs_read32(EXIT_QUALIFICATION));
1895                         advance = 2;
1896                 } else if (ret == EXIT_REASON_IO_INSTRUCTION) {
1897                         /* we never wanted to do this. But virtio
1898                          * requires pci config space emulation. */
1899                         vcpu->shutdown = io(vcpu, &advance);
1900                 } else {
1901                         printk("unhandled exit: reason 0x%x, exit qualification 0x%x\n",
1902                                ret, vmcs_read32(EXIT_QUALIFICATION));
1903                         vmx_dump_cpu(vcpu);
1904                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1905                 }
1906
1907                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1908                  * similar to how proc_restartcore/smp_idle only restart the pcpui
1909                  * cur_ctx, we need to do the same, via the VMCS resume business. */
1910                 if (vcpu->shutdown)
1911                         break;
1912
1913                 if (advance) {
1914                         vmx_get_cpu(vcpu);
1915                         vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + advance);
1916                         vmx_put_cpu(vcpu);
1917                 }
1918         }
1919
1920         printd("RETURN. ip %016lx sp %016lx\n",
1921                vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1922 //  hexdump((void *)vcpu->regs.tf_rsp, 128 * 8);
1923         /*
1924          * Return both the reason for the shutdown and a status value.
1925          * The exit() and exit_group() system calls only need 8 bits for
1926          * the status but we allow 16 bits in case we might want to
1927          * return more information for one of the other shutdown reasons.
1928          */
1929         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
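        /* A caller that wants the pieces back unpacks them the obvious way (a
         * sketch; the names on the receiving side are up to the caller):
         *
         *   int shutdown_reason = ret >> 16;
         *   int exit_status     = ret & 0xffff;
         */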
1930
1931         return ret;
1932 }
1933
1934 /**
1935  * __vmx_enable - low-level enable of VMX mode on the current CPU
1936  * @vmxon_buf: an opaque buffer for use as the VMXON region
1937  */
1938 static int __vmx_enable(struct vmcs *vmxon_buf) {
1939         uint64_t phys_addr = PADDR(vmxon_buf);
1940         uint64_t old, test_bits;
1941
1942         if (rcr4() & X86_CR4_VMXE) {
1943                 panic("This should never happen");
1944                 return -EBUSY;
1945         }
1946
1947         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1948
1949         test_bits = FEATURE_CONTROL_LOCKED;
1950         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1951
1952         if (0)  // tboot_enabled())
1953                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1954
1955         if ((old & test_bits) != test_bits) {
1956                 /* If it's locked, then trying to set it will cause a GPF.
1957                  * No Dune for you!
1958                  */
1959                 if (old & FEATURE_CONTROL_LOCKED) {
1960                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1961                         return -1;
1962                 }
1963
1964                 /* enable and lock */
1965                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1966         }
1967         lcr4(rcr4() | X86_CR4_VMXE);
1968
1969         __vmxon(phys_addr);
1970         vpid_sync_vcpu_global();        /* good idea, even if we aren't using vpids */
1971         ept_sync_global();
1972
1973         return 0;
1974 }
1975
1976 /**
1977  * vmx_enable - enables VMX mode on the current CPU
1979  *
1980  * Sets up necessary state for enable (e.g. a scratchpad for VMXON.)
1981  */
1982 static void vmx_enable(void) {
1983         struct vmcs *vmxon_buf = currentcpu->vmxarea;
1984         int ret;
1985
1986         ret = __vmx_enable(vmxon_buf);
1987         if (ret)
1988                 goto failed;
1989
1990         currentcpu->vmx_enabled = 1;
1991         // TODO: do we need this?
1992         store_gdt(&currentcpu->host_gdt);
1993
1994         printk("VMX enabled on CPU %d\n", core_id());
1995         return;
1996
1997 failed:
1998         printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1999 }
2000
2001 /**
2002  * vmx_disable - disables VMX mode on the current CPU
2003  */
2004 static void vmx_disable(void *unused) {
2005         if (currentcpu->vmx_enabled) {
2006                 __vmxoff();
2007                 lcr4(rcr4() & ~X86_CR4_VMXE);
2008                 currentcpu->vmx_enabled = 0;
2009         }
2010 }
2011
2012 /* Probe the current cpu to see whether it can do vmx.
2013  * Returns TRUE if VT-x is supported, FALSE otherwise.
2014  */
2015 static bool probe_cpu_vmx(void) {
2016         /* The best way to test this code is:
2017          * wrmsr -p <cpu> 0x3a 1
2018          * This will lock vmx off; then modprobe dune.
2019          * Frequently, however, systems have all 0x3a registers set to 5,
2020          * meaning testing is impossible, as vmx can not be disabled.
2021          * We have to simulate it being unavailable in most cases.
2022          * The 'test' variable provides an easy way to simulate
2023          * unavailability of vmx on some, none, or all cpus.
2024          */
2025         if (!cpu_has_vmx()) {
2026                 printk("Machine does not support VT-x\n");
2027                 return FALSE;
2028         } else {
2029                 printk("Machine supports VT-x\n");
2030                 return TRUE;
2031         }
2032 }
2033
2034 static void setup_vmxarea(void) {
2035         struct vmcs *vmxon_buf;
2036         printd("Set up vmxarea for cpu %d\n", core_id());
2037         vmxon_buf = __vmx_alloc_vmcs(core_id());
2038         if (!vmxon_buf) {
2039                 printk("setup_vmxarea failed on node %d\n", core_id());
2040                 return;
2041         }
2042         currentcpu->vmxarea = vmxon_buf;
2043 }
2044
2045 static int ept_init(void) {
2046         if (!cpu_has_vmx_ept()) {
2047                 printk("VMX doesn't support EPT!\n");
2048                 return -1;
2049         }
2050         if (!cpu_has_vmx_eptp_writeback()) {
2051                 printk("VMX EPT doesn't support WB memory!\n");
2052                 return -1;
2053         }
2054         if (!cpu_has_vmx_ept_4levels()) {
2055                 printk("VMX EPT doesn't support 4 level walks!\n");
2056                 return -1;
2057         }
2058         switch (arch_max_jumbo_page_shift()) {
2059         case PML3_SHIFT:
2060                 if (!cpu_has_vmx_ept_1g_page()) {
2061                         printk("VMX EPT doesn't support 1 GB pages!\n");
2062                         return -1;
2063                 }
2064                 break;
2065         case PML2_SHIFT:
2066                 if (!cpu_has_vmx_ept_2m_page()) {
2067                         printk("VMX EPT doesn't support 2 MB pages!\n");
2068                         return -1;
2069                 }
2070                 break;
2071         default:
2072                 printk("Unexpected jumbo page size %d\n",
2073                        arch_max_jumbo_page_shift());
2074                 return -1;
2075         }
2076         if (!cpu_has_vmx_ept_ad_bits()) {
2077                 printk("VMX EPT doesn't support accessed/dirty!\n");
2078                 x86_ept_pte_fix_ups |= EPTE_A | EPTE_D;
2079         }
2080         if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
2081                 printk("VMX EPT can't invalidate PTEs/TLBs!\n");
2082                 return -1;
2083         }
2084
2085         return 0;
2086 }
2087
2088 /**
2089  * intel_vmm_init sets up the physical core data areas that are required to run a VM at all.
2090  * These data areas are not connected to a specific user process in any way. Instead,
2091  * they are in some sense an externalization of what would otherwise be a very large ball
2092  * of state inside the CPU.
2093  */
2094 int intel_vmm_init(void) {
2095         int r, cpu, ret;
2096
2097         if (!probe_cpu_vmx()) {
2098                 return -EOPNOTSUPP;
2099         }
2100
2101         setup_vmcs_config(&ret);
2102
2103         if (ret) {
2104                 printk("setup_vmcs_config failed: %d\n", ret);
2105                 return ret;
2106         }
2107
2108         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
2109         if (!msr_bitmap) {
2110                 printk("Could not allocate msr_bitmap\n");
2111                 return -ENOMEM;
2112         }
2113         io_bitmap = (unsigned long *)get_cont_pages(VMX_IO_BITMAP_ORDER,
2114                                                     KMALLOC_WAIT);
2115         if (!io_bitmap) {
2116                 printk("Could not allocate io_bitmap\n");
2117                 kfree(msr_bitmap);
2118                 return -ENOMEM;
2119         }
2120         /* FIXME: do we need APIC virtualization (flexpriority?) */
2121
2122         memset(msr_bitmap, 0xff, PAGE_SIZE);
2123         memset(io_bitmap, 0xff, VMX_IO_BITMAP_SZ);
2124
2125         /* These are the only MSRs that are not autoloaded and not intercepted */
2126         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
2127         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
2128         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_EFER);
2129
2130         /* TODO: this might be dangerous, since they can do more than just read the
2131          * CMOS */
2132         __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_IDX);
2133         __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_DATA);
2134
2135         if ((ret = ept_init())) {
2136                 printk("EPT init failed, %d\n", ret);
2137                 return ret;
2138         }
2139         printk("VMX setup succeeded\n");
2140         return 0;
2141 }
2142
2143 int intel_vmm_pcpu_init(void) {
2144         setup_vmxarea();
2145         vmx_enable();
2146         return 0;
2147 }
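/* Bring-up ordering (a sketch; the actual call sites live in
 * k/a/x86/vmm/vmm.c's vmm_init and per-cpu init paths):
 *
 *   // once, during SMP boot:
 *   if (intel_vmm_init())
 *           return;                 // no VT-x, no VMs
 *   // then, on every core that will host guests:
 *   intel_vmm_pcpu_init();          // VMXON region + CR4.VMXE for this core
 */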