Re-enabled event injection via vmctl during vmx RESUME
[akaros.git] kern/arch/x86/vmm/intel/vmx.c
1 //#define DEBUG
2 /**
3  *  vmx.c - The Intel VT-x driver for Dune
4  *
5  * This file is derived from Linux KVM VT-x support.
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8  *
9  * Original Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This modified version is simpler because it avoids the following
14  * features that are not requirements for Dune:
15  *  * Real-mode emulation
16  *  * Nested VT-x support
17  *  * I/O hardware emulation
18  *  * Any of the more esoteric X86 features and registers
19  *  * KVM-specific functionality
20  *
21  * In essence we provide only the minimum functionality needed to run
22  * a process in vmx non-root mode rather than the full hardware emulation
23  * needed to support an entire OS.
24  *
25  * This driver is a research prototype and as such has the following
26  * limitations:
27  *
28  * FIXME: Backward compatibility is currently a non-goal, and only recent
29  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
30  * driver.
31  *
32  * FIXME: Eventually we should handle concurrent users of VT-x more
33  * gracefully instead of requiring exclusive access. This would allow
34  * Dune to interoperate with KVM and other HV solutions.
35  *
36  * FIXME: We need to support hotplugged physical CPUs.
37  *
38  * Authors:
39  *   Adam Belay   <abelay@stanford.edu>
40  */
41
42 /* Basic flow.
43  * Yep, it's confusing. This is in part because the VMCS is used for two different things,
44  * and you're left with the feeling that they got partway through and realized they needed one for each:
45  *
46  * 1) your CPU is going to be capable of running VMs, and you need state for that.
47  *
48  * 2) you're about to start a guest, and you need state for that.
49  *
50  * So there is get cpu set up to be able to run VMs stuff, and now
51  * let's start a guest stuff.  In Akaros, CPUs will always be set up
52  * to run a VM if that is possible. Processes can flip themselves into
53  * a VM and that will require another VMCS.
54  *
55  * So: at kernel startup time, the SMP boot stuff calls
56  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
57  * in the case of this file is intel_vmm_init. That runs code that
58  * sets up state for ALL sockets, based on the capabilities of the
59  * socket it runs on. If any cpu supports vmx, it assumes they all
60  * do. That's a realistic assumption. So the call_function_all is kind
61  * of stupid, really; it could just see what's on the current cpu and
62  * assume it's on all. HOWEVER: there are systems in the wild that
63  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
64  * might as well allow for the chance that we'll only allow VMMCPs on a
65  * subset (not implemented yet, however).  So: probe all CPUs, get a
66  * count of how many support VMX and, for now, assume they all do
67  * anyway.
68  *
69  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
70  * which contains all the naughty bits settings for all the cpus that can run a VM.
71  * Realistically, all VMX-capable cpus in a system will have identical configurations.
72  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
73  *
74  * configure the msr_bitmap. This is the bitmap of MSRs which the
75  * guest can manipulate.  Currently, we only allow GS and FS base.
76  *
77  * Reserve bit 0 in the vpid bitmap, as guests cannot use that vpid.
78  *
79  * Set up what we call the vmxarea. The vmxarea is per-cpu, not
80  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
81  * there is the revision id. The vmxarea is page-sized per cpu and
82  * page-aligned. Note that it can be smaller, but why bother? We know
83  * the max size and alignment, and it's convenient.
84  *
85  * Now that it is set up, enable vmx on all cpus. This involves
86  * testing VMXE in cr4, to see if we've been here before (TODO: delete
87  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
88  * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
89  * instruction), and syncing vpid's and ept's.  Now the CPU is ready
90  * to host guests.
91  *
92  * Setting up a guest.
93  * We divide this into two things: vmm_proc_init and vm_run.
94  * Currently, on Intel, vmm_proc_init does nothing.
95  *
96  * vm_run is really complicated. It is called with a coreid and a
97  * vmctl struct. On intel, it calls vmx_launch. vmx_launch is set
98  * up for a few test cases. If rip is 1, it sets the guest rip to
99  * a function which will deref 0 and should exit with failure 2. If rip is 0,
100  * it calls an infinite loop in the guest.
101  *
102  * The sequence of operations:
103  * create a vcpu
104  * while (1) {
105  * get a vcpu
106  * disable irqs (required or you can't enter the VM)
107  * vmx_run_vcpu()
108  * enable irqs
109  * manage the vm exit
110  * }
111  *
112  * get a vcpu
113  * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
114  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
115  *
116  * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
117  * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
118  * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
119  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
120  *
121  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
122  * of inline assembly with embedded CPP crap. I suspect we'll want to
123  * un-inline it someday, but maybe not.  It's called with a vcpu
124  * struct from which it loads guest state, and to which it stores
125  * non-virtualized host state. It issues a vmlaunch or vmresume
126  * instruction depending, and on return, it evaluates whether the
127  * launch/resume had an error in that operation. Note this is NOT the
128  * same as an error while in the virtual machine; this is an error in
129  * startup due to misconfiguration. Depending on what is returned it's
130  * either a failed vm startup or an exit for any of many reasons.
131  *
132  */
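
/* A minimal sketch of the loop described above, control flow only.  The real
 * entry path is vmx_launch/vmx_run_vcpu further down in this file; irq on/off
 * and handle_exit below are shorthand here, not the real function names.
 *
 *      struct vmx_vcpu *vcpu = vmx_create_vcpu(p);
 *      while (1) {
 *              vmx_get_cpu(vcpu);              // VMPTRLD this vcpu's VMCS
 *              <disable irqs>                  // can't enter the VM with irqs on
 *              ret = vmx_run_vcpu(vcpu);       // VMLAUNCH or VMRESUME
 *              <enable irqs>
 *              if (handle_exit(vcpu, ret))     // hypothetical exit dispatcher
 *                      break;
 *      }
 *      vmx_put_cpu(vcpu);
 */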
133
134 /* basically: only rename those globals that might conflict
135  * with existing names. Leave all else the same.
136  * this code is more modern than the other code, yet still
137  * This code is more modern than the other code, yet still
138  */
139 #include <kmalloc.h>
140 #include <string.h>
141 #include <stdio.h>
142 #include <assert.h>
143 #include <error.h>
144 #include <pmap.h>
145 #include <sys/queue.h>
146 #include <smp.h>
147 #include <kref.h>
148 #include <atomic.h>
149 #include <alarm.h>
150 #include <event.h>
151 #include <umem.h>
152 #include <bitops.h>
153 #include <arch/types.h>
154 #include <syscall.h>
155 #include <arch/io.h>
156
157 #include <ros/vmm.h>
158 #include "vmx.h"
159 #include "../vmm.h"
160
161 #include "cpufeature.h"
162
163 #include <trap.h>
164
165 #include <smp.h>
166
167 #define currentcpu (&per_cpu_info[core_id()])
168
169 /* debug stuff == remove later. It's not even multivm safe. */
170 uint64_t idtr;
171 int debug = 0;
172
173 // END debug
174 static unsigned long *msr_bitmap;
175 #define VMX_IO_BITMAP_ORDER             4       /* 64 KB */
176 #define VMX_IO_BITMAP_SZ                (1 << (VMX_IO_BITMAP_ORDER + PGSHIFT))
177 static unsigned long *io_bitmap;
178
179 int x86_ept_pte_fix_ups = 0;
180
181 struct vmx_capability vmx_capability;
182 struct vmcs_config vmcs_config;
183
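/* MSRs swapped between host and guest values on every vmentry/vmexit via the
 * VMCS autoload/autostore lists (see setup_msr() below).  These are the
 * syscall-entry MSRs and the swapgs base, which differ between host and guest
 * and are not part of the guest/host state the VMCS loads and saves itself. */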
184 static int autoloaded_msrs[] = {
185         MSR_KERNEL_GS_BASE,
186         MSR_LSTAR,
187         MSR_STAR,
188         MSR_SFMASK,
189 };
190
191 static char *cr_access_type[] = {
192         "move to cr",
193         "move from cr",
194         "clts",
195         "lmsw"
196 };
197
198 static char *cr_gpr[] = {
199         "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
200         "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
201 };
202
203 static int guest_cr_num[16] = {
204         GUEST_CR0,
205         -1,
206         -1,
207         GUEST_CR3,
208         GUEST_CR4,
209         -1,
210         -1,
211         -1,
212         -1,     /* 8? */
213         -1, -1, -1, -1, -1, -1, -1
214 };
215
216 __always_inline unsigned long vmcs_readl(unsigned long field);
217 /* See section 24-3 of The Good Book */
218 void
219 show_cr_access(uint64_t val)
220 {
221         int crnr = val & 0xf;
222         int type = (val >> 4) & 3;
223         int reg = (val >> 11) & 0xf;
224         printk("%s: %d: ", cr_access_type[type], crnr);
225         if (type < 2) {
226                 printk("%s", cr_gpr[reg]);
227                 if (guest_cr_num[crnr] > -1) {
228                         printk(": 0x%lx", vmcs_readl(guest_cr_num[crnr]));
229                 }
230         }
231         printk("\n");
232 }
233
234 void
235 ept_flush(uint64_t eptp)
236 {
237         ept_sync_context(eptp);
238 }
239
240 static void
241 vmcs_clear(struct vmcs *vmcs)
242 {
243         uint64_t phys_addr = PADDR(vmcs);
244         uint8_t error;
245
246         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0":"=qm"(error):"a"(&phys_addr),
247                                   "m"(phys_addr)
248                                   :"cc", "memory");
249         if (error)
250                 printk("vmclear fail: %p/%llx\n", vmcs, phys_addr);
251 }
252
253 static void
254 vmcs_load(struct vmcs *vmcs)
255 {
256         uint64_t phys_addr = PADDR(vmcs);
257         uint8_t error;
258
259         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0":"=qm"(error):"a"(&phys_addr),
260                                   "m"(phys_addr)
261                                   :"cc", "memory");
262         if (error)
263                 printk("vmptrld %p/%llx failed\n", vmcs, phys_addr);
264 }
265
266 /* Returns the physical address of the current CPU's VMCS region, or -1 if none. */
267 static physaddr_t
268 vmcs_get_current(void)
269 {
270         physaddr_t vmcs_paddr;
271         /* RAX contains the addr of the location to store the VMCS pointer.  The
272          * compiler doesn't know the ASM will deref that pointer, hence the =m */
273         asm volatile (ASM_VMX_VMPTRST_RAX:"=m"(vmcs_paddr):"a"(&vmcs_paddr));
274         return vmcs_paddr;
275 }
276
277 __always_inline unsigned long
278 vmcs_readl(unsigned long field)
279 {
280         unsigned long value;
281
282         asm volatile (ASM_VMX_VMREAD_RDX_RAX:"=a"(value):"d"(field):"cc");
283         return value;
284 }
285
286 __always_inline uint16_t
287 vmcs_read16(unsigned long field)
288 {
289         return vmcs_readl(field);
290 }
291
292 static __always_inline uint32_t
293 vmcs_read32(unsigned long field)
294 {
295         return vmcs_readl(field);
296 }
297
298 static __always_inline uint64_t
299 vmcs_read64(unsigned long field)
300 {
301         return vmcs_readl(field);
302 }
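
/* Note: on x86_64, VMREAD/VMWRITE take natural-width (64-bit) register
 * operands and the field encoding determines how many bits are significant,
 * so the 16/32/64-bit accessors here are just truncating/widening wrappers
 * around vmcs_readl()/vmcs_writel(). */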
303
304 void
305 vmwrite_error(unsigned long field, unsigned long value)
306 {
307         printk("vmwrite error: reg %lx value %lx (err %d)\n",
308                    field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
309 }
310
311 void
312 vmcs_writel(unsigned long field, unsigned long value)
313 {
314         uint8_t error;
315
316         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0":"=q"(error):"a"(value),
317                                   "d"(field):"cc");
318         if (error)
319                 vmwrite_error(field, value);
320 }
321
322 static void
323 vmcs_write16(unsigned long field, uint16_t value)
324 {
325         vmcs_writel(field, value);
326 }
327
328 static void
329 vmcs_write32(unsigned long field, uint32_t value)
330 {
331         vmcs_writel(field, value);
332 }
333
334 static void
335 vmcs_write64(unsigned long field, uint64_t value)
336 {
337         vmcs_writel(field, value);
338 }
339
340 void vapic_status_dump_kernel(void *vapic);
341
342 /*
343  * A note on Things You Can't Make Up.
344  * or
345  * "George, you can type this shit, but you can't say it" -- Harrison Ford
346  *
347  * There are 5 VMCS 32-bit words that control guest permissions. If
348  * you set these correctly, you've got a guest that will behave. If
349  * you get even one bit wrong, you've got a guest that will chew your
350  * leg off. Some bits must be 1, some must be 0, and some can be set
351  * either way. To add to the fun, the docs are sort of a docudrama or,
352  * as the quote goes, "interesting if true."
353  *
354  * To determine what bit can be set in what VMCS 32-bit control word,
355  * there are 5 corresponding 64-bit MSRs.  And, to make it even more
356  * fun, the standard set of MSRs have errors in them, i.e. report
357  * incorrect values, for legacy reasons, and so you are supposed to
358  * "look around" to another set, which have correct bits in
359  * them. There are four such 'correct' registers, and they have _TRUE_
360  * in the names as you can see below. We test for the value of VMCS
361  * control bits in the _TRUE_ registers if possible. The fifth
362  * register, CPU Secondary Exec Controls, which came later, needs no
363  * _TRUE_ variant.
364  *
365  * For each MSR, the high 32 bits tell you what bits can be "1" by a
366  * "1" in that position; the low 32 bits tell you what bit can be "0"
367  * by a "0" in that position. So, for each of 32 bits in a given VMCS
368  * control word, there is a pair of bits in an MSR that tells you what
369  * values it can take. The two bits, of which there are *four*
370  * combinations, describe the *three* possible operations on a
371  * bit. The two bits, taken together, form an untruth table: There are
372  * three possibilities: The VMCS bit can be set to 0 or 1, or it can
373  * only be 0, or only 1. The fourth combination is not supposed to
374  * happen.
375  *
376  * So: there is the 1 bit from the upper 32 bits of the msr.
377  * If this bit is set, then the bit can be 1. If clear, it can not be 1.
378  *
379  * Then there is the 0 bit, from low 32 bits. If clear, the VMCS bit
380  * can be 0. If 1, the VMCS bit can not be 0.
381  *
382  * SO, let's call the 1 bit R1, and the 0 bit R0, we have:
383  *  R1 R0
384  *  0 0 -> must be 0
385  *  1 0 -> can be 1, can be 0
386  *  0 1 -> can not be 1, can not be 0. --> JACKPOT! Not seen yet.
387  *  1 1 -> must be one.
388  *
389  * It's also pretty hard to know what you can and can't set, and
390  * that's led to inadvertent opening of permissions at times.  Because
391  * of this complexity we've decided on the following: the driver must
392  * define EVERY bit, UNIQUELY, for each of the 5 registers, that it wants
393  * set. Further, for any bit that's settable, the driver must specify
394  * a setting; for any bit that's reserved, the driver settings must
395  * match that bit. If there are reserved bits we don't specify, that's
396  * ok; we'll take them as is.
397  *
398  * We use a set-means-set, and set-means-clear model, i.e. we use a
399  * 32-bit word to contain the bits we want to be 1, indicated by one;
400  * and another 32-bit word in which a bit we want to be 0 is indicated
401  * by a 1. This allows us to easily create masks of all bits we're
402  * going to set, for example.
403  *
404  * We have two 32-bit numbers for each 32-bit VMCS field: bits we want
405  * set and bits we want clear.  If you read the MSR for that field,
406  * compute the reserved 0 and 1 settings, and | them together with the
407  * driver's settings, the result must be 0xffffffff. We can create other
408  * tests for conflicts (i.e. overlap).
409  *
410  * At this point, I've tested check_vmx_controls in every way
411  * possible, because I kept screwing the bitfields up. You'll get a nice
412  * error and it won't work at all, which is what we want: a
413  * failure-prone setup, where even errors that might result in correct
414  * values are caught -- "right answer, wrong method, zero credit." If there's
415  * weirdness in the bits, we don't want to run.
416  * The try_set stuff adds particular ugliness but we have to have it.
417  */
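
/* A worked example with made-up numbers: suppose rdmsr on one of these MSRs
 * returned vmx_msr_low = 0x00000016 and vmx_msr_high = 0x0000a016.  Then, in
 * check_vmxec_controls below:
 *      reserved_1      = low & high        = 0x00000016   (must be 1)
 *      reserved_0      = ~low & ~high      = 0xffff5fe9   (must be 0)
 *      changeable_bits = ~(res_0 | res_1)  = 0x0000a000   (driver's choice)
 * and any bit set in low but clear in high would be the JACKPOT case the
 * code warns about. */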
418
419 static bool
420 check_vmxec_controls(struct vmxec const *v, bool have_true_msr,
421                                          uint32_t * result)
422 {
423         bool err = false;
424         uint32_t vmx_msr_low, vmx_msr_high;
425         uint32_t reserved_0, reserved_1, changeable_bits, try0, try1;
426
427         if (have_true_msr)
428                 rdmsr(v->truemsr, vmx_msr_low, vmx_msr_high);
429         else
430                 rdmsr(v->msr, vmx_msr_low, vmx_msr_high);
431
432         if (vmx_msr_low & ~vmx_msr_high)
433                 warn("JACKPOT: Conflicting VMX ec ctls for %s, high 0x%08x low 0x%08x",
434                          v->name, vmx_msr_high, vmx_msr_low);
435
436         reserved_0 = (~vmx_msr_low) & (~vmx_msr_high);
437         reserved_1 = vmx_msr_low & vmx_msr_high;
438         changeable_bits = ~(reserved_0 | reserved_1);
439
440         /*
441          * this is very much as follows:
442          * accept the things I cannot change,
443          * change the things I can,
444          * know the difference.
445          */
446
447         /* Conflict. Don't try to both set and reset bits. */
448         if ((v->must_be_1 & (v->must_be_0 | v->try_set_1 | v->try_set_0)) ||
449             (v->must_be_0 & (v->try_set_1 | v->try_set_0)) ||
450             (v->try_set_1 & v->try_set_0)) {
451                 printk("%s: must 0 (0x%x) and must be 1 (0x%x) and try_set_0 (0x%x) and try_set_1 (0x%x) overlap\n",
452                        v->name, v->must_be_0, v->must_be_1, v->try_set_0, v->try_set_1);
453                 err = true;
454         }
455
456         /* coverage */
457         if (((v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1) & changeable_bits) != changeable_bits) {
458                 printk("%s: Need to cover 0x%x and have 0x%x,0x%x,0x%x,0x%x\n",
459                        v->name, changeable_bits, v->must_be_0, v->must_be_1, v->try_set_0, v->try_set_1);
460                 err = true;
461         }
462
463         if ((v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1 | reserved_0 | reserved_1) != 0xffffffff) {
464                 printk("%s: incomplete coverage: have 0x%x, want 0x%x\n",
465                        v->name, v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1 |
466                        reserved_0 | reserved_1, 0xffffffff);
467                 err = true;
468         }
469
470         /* Don't try to change bits that can't be changed. */
471         if ((v->must_be_0 & (reserved_0 | changeable_bits)) != v->must_be_0) {
472                 printk("%s: set to 0 (0x%x) can't be done\n", v->name, v->must_be_0);
473                 err = true;
474         }
475
476         if ((v->must_be_1 & (reserved_1 | changeable_bits)) != v->must_be_1) {
477                 printk("%s: set to 1 (0x%x) can't be done\n", v->name, v->must_be_1);
478                 err = true;
479         }
480         // Note we don't REQUIRE that try_set_0 or try_set_1 be possible. We just want to try it.
481
482         // Clear bits in try_set that can't be set.
483         try1 = v->try_set_1 & (reserved_1 | changeable_bits);
484
485         /* If there's been any error at all, spill our guts and return. */
486         if (err) {
487                 printk("%s: vmx_msr_high 0x%x, vmx_msr_low 0x%x, ",
488                            v->name, vmx_msr_high, vmx_msr_low);
489                 printk("must_be_0 0x%x, try_set_0 0x%x, reserved_0 0x%x, ",
490                            v->must_be_0, v->try_set_0, reserved_0);
491                 printk("must_be_1 0x%x, try_set_1 0x%x, reserved_1 0x%x,",
492                            v->must_be_1, v->try_set_1, reserved_1);
493                 printk(" reserved_0 0x%x", reserved_0);
494                 printk(" changeable_bits 0x%x\n", changeable_bits);
495                 return false;
496         }
497
498         *result = v->must_be_1 | try1 | reserved_1;
499
500         printk("%s: check_vmxec_controls succeeds with result 0x%x\n",
501                    v->name, *result);
502         return true;
503 }
504
505 /*
506  * We're trying to make this as readable as possible. Realistically, it will
507  * rarely if ever change, if the past is any guide.
508  */
509 static const struct vmxec pbec = {
510         .name = "Pin Based Execution Controls",
511         .msr = MSR_IA32_VMX_PINBASED_CTLS,
512         .truemsr = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
513
514         .must_be_1 = (PIN_BASED_EXT_INTR_MASK |
515                      PIN_BASED_NMI_EXITING |
516                      PIN_BASED_VIRTUAL_NMIS |
517                      PIN_BASED_POSTED_INTR),
518
519         .must_be_0 = (PIN_BASED_VMX_PREEMPTION_TIMER),
520 };
521
522 static const struct vmxec cbec = {
523         .name = "CPU Based Execution Controls",
524         .msr = MSR_IA32_VMX_PROCBASED_CTLS,
525         .truemsr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
526
527         .must_be_1 = (//CPU_BASED_MWAIT_EXITING |
528                         CPU_BASED_HLT_EXITING |
529                      CPU_BASED_TPR_SHADOW |
530                      CPU_BASED_RDPMC_EXITING |
531                      CPU_BASED_CR8_LOAD_EXITING |
532                      CPU_BASED_CR8_STORE_EXITING |
533                      CPU_BASED_USE_MSR_BITMAPS |
534                      CPU_BASED_USE_IO_BITMAPS |
535                      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS),
536
537         .must_be_0 = (
538                         CPU_BASED_MWAIT_EXITING |
539                         CPU_BASED_VIRTUAL_INTR_PENDING |
540                      CPU_BASED_INVLPG_EXITING |
541                      CPU_BASED_USE_TSC_OFFSETING |
542                      CPU_BASED_RDTSC_EXITING |
543                      CPU_BASED_CR3_LOAD_EXITING |
544                      CPU_BASED_CR3_STORE_EXITING |
545                      CPU_BASED_MOV_DR_EXITING |
546                      CPU_BASED_VIRTUAL_NMI_PENDING |
547                      CPU_BASED_MONITOR_TRAP |
548                      CPU_BASED_PAUSE_EXITING |
549                      CPU_BASED_UNCOND_IO_EXITING),
550
551         .try_set_0 = (CPU_BASED_MONITOR_EXITING)
552 };
553
554 static const struct vmxec cb2ec = {
555         .name = "CPU Based 2nd Execution Controls",
556         .msr = MSR_IA32_VMX_PROCBASED_CTLS2,
557         .truemsr = MSR_IA32_VMX_PROCBASED_CTLS2,
558
559         .must_be_1 = (SECONDARY_EXEC_ENABLE_EPT |
560                      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
561                      SECONDARY_EXEC_APIC_REGISTER_VIRT |
562                      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
563                      SECONDARY_EXEC_WBINVD_EXITING),
564
565         .must_be_0 = (
566                      //SECONDARY_EXEC_APIC_REGISTER_VIRT |
567                      //SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
568                      SECONDARY_EXEC_DESCRIPTOR_EXITING |
569                      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
570                      SECONDARY_EXEC_ENABLE_VPID |
571                      SECONDARY_EXEC_UNRESTRICTED_GUEST |
572                      SECONDARY_EXEC_PAUSE_LOOP_EXITING |
573                      SECONDARY_EXEC_RDRAND_EXITING |
574                      SECONDARY_EXEC_ENABLE_INVPCID |
575                      SECONDARY_EXEC_ENABLE_VMFUNC |
576                      SECONDARY_EXEC_SHADOW_VMCS |
577                      SECONDARY_EXEC_RDSEED_EXITING |
578                      SECONDARY_EPT_VE |
579                      SECONDARY_ENABLE_XSAV_RESTORE),
580
581         .try_set_1 = SECONDARY_EXEC_RDTSCP,
582
583         // mystery bit.
584         .try_set_0 = 0x2000000
585
586 };
587
588 static const struct vmxec vmentry = {
589         .name = "VMENTRY controls",
590         .msr = MSR_IA32_VMX_ENTRY_CTLS,
591         .truemsr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
592         /* exact order from vmx.h; only a few are enabled. */
593
594         .must_be_1 =  (VM_ENTRY_LOAD_DEBUG_CONTROLS | /* can't set to 0 */
595                       VM_ENTRY_LOAD_IA32_EFER |
596                       VM_ENTRY_IA32E_MODE),
597
598         .must_be_0 = (VM_ENTRY_SMM |
599                      VM_ENTRY_DEACT_DUAL_MONITOR |
600                      VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
601                      VM_ENTRY_LOAD_IA32_PAT),
602 };
603
604 static const struct vmxec vmexit = {
605         .name = "VMEXIT controls",
606         .msr = MSR_IA32_VMX_EXIT_CTLS,
607         .truemsr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
608
609         .must_be_1 = (VM_EXIT_SAVE_DEBUG_CONTROLS |     /* can't set to 0 */
610                                  VM_EXIT_ACK_INTR_ON_EXIT |
611                                  VM_EXIT_SAVE_IA32_EFER |
612                                 VM_EXIT_LOAD_IA32_EFER |
613                                 VM_EXIT_HOST_ADDR_SPACE_SIZE),  /* 64 bit */
614
615         .must_be_0 = (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
616                                 // VM_EXIT_ACK_INTR_ON_EXIT |
617                                  VM_EXIT_SAVE_IA32_PAT |
618                                  VM_EXIT_LOAD_IA32_PAT |
619                                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER),
620 };
621
622 static void
623 setup_vmcs_config(void *p)
624 {
625         int *ret = p;
626         struct vmcs_config *vmcs_conf = &vmcs_config;
627         uint32_t vmx_msr_high;
628         uint64_t vmx_msr;
629         bool have_true_msrs = false;
630         bool ok;
631
632         *ret = -EIO;
633
634         vmx_msr = read_msr(MSR_IA32_VMX_BASIC);
635         vmx_msr_high = vmx_msr >> 32;
636
637         /*
638          * If bit 55 (VMX_BASIC_TRUE_CTLS) is set, then we
639          * can go for the true MSRs.  Else, we ask you to get a better CPU.
640          */
641         if (vmx_msr & VMX_BASIC_TRUE_CTLS) {
642                 have_true_msrs = true;
643                 printd("Running with TRUE MSRs\n");
644         } else {
645                 printk("Running with non-TRUE MSRs, this is old hardware\n");
646         }
647
648         /*
649          * Don't worry that one or more of these might fail and leave
650          * the VMCS in some kind of incomplete state. If one of these
651          * fails, the caller is going to discard the VMCS.
652          * It is written this way to ensure we get results of all tests and avoid
653          * BMAFR behavior.
654          */
655         ok = check_vmxec_controls(&pbec, have_true_msrs,
656                                   &vmcs_conf->pin_based_exec_ctrl);
657         ok = check_vmxec_controls(&cbec, have_true_msrs,
658                                   &vmcs_conf->cpu_based_exec_ctrl) && ok;
659         /* Only check cb2ec if we're still ok, o/w we may GPF */
660         ok = ok && check_vmxec_controls(&cb2ec, have_true_msrs,
661                                         &vmcs_conf->cpu_based_2nd_exec_ctrl);
662         ok = check_vmxec_controls(&vmentry, have_true_msrs,
663                                   &vmcs_conf->vmentry_ctrl) && ok;
664         ok = check_vmxec_controls(&vmexit, have_true_msrs,
665                                   &vmcs_conf->vmexit_ctrl) && ok;
666         if (!ok) {
667                 printk("vmxec controls are no good.\n");
668                 return;
669         }
670
671         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
672         if ((vmx_msr_high & 0x1fff) > PGSIZE) {
673                 printk("(vmx_msr_high & 0x1fff) is 0x%x, > PGSIZE 0x%x\n",
674                            vmx_msr_high & 0x1fff, PGSIZE);
675                 return;
676         }
677
678         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
679         if (vmx_msr & VMX_BASIC_64) {
680                 printk("VMX doesn't support 64 bit width!\n");
681                 return;
682         }
683
684         if (((vmx_msr & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT)
685                 != VMX_BASIC_MEM_TYPE_WB) {
686                 printk("VMX doesn't support WB memory for VMCS accesses!\n");
687                 return;
688         }
689
690         vmcs_conf->size = vmx_msr_high & 0x1fff;
691         vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
692         vmcs_conf->revision_id = (uint32_t) vmx_msr;
693
694         /* Read in the caps for runtime checks.  This MSR is only available if
695          * secondary controls and ept or vpid is on, which we check earlier */
696         rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid);
697
698         *ret = 0;
699 }
700
701 static struct vmcs *
702 __vmx_alloc_vmcs(int node)
703 {
704         struct vmcs *vmcs;
705
706         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
707         if (!vmcs)
708                 return 0;
709         memset(vmcs, 0, vmcs_config.size);
710         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
711         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
712         return vmcs;
713 }
714
715 /**
716  * vmx_alloc_vmcs - allocates a VMCS region
717  *
718  * NOTE: Assumes the new region will be used by the current CPU.
719  *
720  * Returns a valid VMCS region.
721  */
722 static struct vmcs *
723 vmx_alloc_vmcs(void)
724 {
725         return __vmx_alloc_vmcs(numa_id());
726 }
727
728 /**
729  * vmx_free_vmcs - frees a VMCS region
730  */
731 static void
732 vmx_free_vmcs(struct vmcs *vmcs)
733 {
734         //free_pages((unsigned long)vmcs, vmcs_config.order);
735 }
736
737 /*
738  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
739  * will not change in the lifetime of the guest.
740  * Note that host-state that does change is set elsewhere. E.g., host-state
741  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
742  */
743 static void
744 vmx_setup_constant_host_state(void)
745 {
746         uint32_t low32, high32;
747         unsigned long tmpl;
748         pseudodesc_t dt;
749
750         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);    /* 22.2.3 */
751         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
752         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
753
754         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
755         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
756         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
757         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
758         vmcs_write16(HOST_TR_SELECTOR, GD_TSS); /* 22.2.4 */
759
760         native_store_idt(&dt);
761         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);        /* 22.2.4 */
762
763         asm("mov $.Lkvm_vmx_return, %0":"=r"(tmpl));
764         vmcs_writel(HOST_RIP, tmpl);    /* 22.2.5 */
765
766         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
767         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
768         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
769         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);      /* 22.2.3 */
770
771         rdmsr(MSR_EFER, low32, high32);
772         vmcs_write32(HOST_IA32_EFER, low32);
773
774         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
775                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
776                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
777         }
778
779         vmcs_write16(HOST_FS_SELECTOR, 0);      /* 22.2.4 */
780         vmcs_write16(HOST_GS_SELECTOR, 0);      /* 22.2.4 */
781
782         /* TODO: This (at least gs) is per cpu */
783         rdmsrl(MSR_FS_BASE, tmpl);
784         vmcs_writel(HOST_FS_BASE, tmpl);        /* 22.2.4 */
785         rdmsrl(MSR_GS_BASE, tmpl);
786         vmcs_writel(HOST_GS_BASE, tmpl);        /* 22.2.4 */
787 }
788
789 static inline uint16_t
790 vmx_read_ldt(void)
791 {
792         uint16_t ldt;
793         asm("sldt %0" : "=g"(ldt));
794         return ldt;
795 }
796
797 static unsigned long
798 segment_base(uint16_t selector)
799 {
800         pseudodesc_t *gdt = &currentcpu->host_gdt;
801         struct desc_struct *d;
802         unsigned long table_base;
803         unsigned long v;
804
805         if (!(selector & ~3)) {
806                 return 0;
807         }
808
809         table_base = gdt->pd_base;
810
811         if (selector & 4) {     /* from ldt */
812                 uint16_t ldt_selector = vmx_read_ldt();
813
814                 if (!(ldt_selector & ~3)) {
815                         return 0;
816                 }
817
818                 table_base = segment_base(ldt_selector);
819         }
820         d = (struct desc_struct *)(table_base + (selector & ~7));
821         v = get_desc_base(d);
822         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
823                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
824         return v;
825 }
826
827 static inline unsigned long
828 vmx_read_tr_base(void)
829 {
830         uint16_t tr;
831         asm("str %0" : "=g"(tr));
832         return segment_base(tr);
833 }
834
835 static void
836 __vmx_setup_cpu(void)
837 {
838         pseudodesc_t *gdt = &currentcpu->host_gdt;
839         unsigned long sysenter_esp;
840         unsigned long tmpl;
841
842         /*
843          * Linux uses per-cpu TSS and GDT, so set these when switching
844          * processors.
845          */
846         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base());  /* 22.2.4 */
847         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);      /* 22.2.4 */
848
849         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
850         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
851
852         rdmsrl(MSR_FS_BASE, tmpl);
853         vmcs_writel(HOST_FS_BASE, tmpl);        /* 22.2.4 */
854         rdmsrl(MSR_GS_BASE, tmpl);
855         vmcs_writel(HOST_GS_BASE, tmpl);        /* 22.2.4 */
856 }
857
858 /**
859  * vmx_get_cpu - called before using a cpu
860  * @vcpu: VCPU that will be loaded.
861  *
862  * Disables preemption. Call vmx_put_cpu() when finished.
863  */
864 static void
865 vmx_get_cpu(struct vmx_vcpu *vcpu)
866 {
867         int cur_cpu = core_id();
868         handler_wrapper_t *w;
869
870         if (currentcpu->local_vcpu)
871                 panic("get_cpu: currentcpu->localvcpu was non-NULL");
872         if (currentcpu->local_vcpu != vcpu) {
873                 currentcpu->local_vcpu = vcpu;
874
875                 if (vcpu->cpu != cur_cpu) {
876                         if (vcpu->cpu >= 0) {
877                                 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
878                         } else
879                                 vmcs_clear(vcpu->vmcs);
880
881                         ept_sync_context(vcpu_get_eptp(vcpu));
882
883                         vcpu->launched = 0;
884                         vmcs_load(vcpu->vmcs);
885                         __vmx_setup_cpu();
886                         vcpu->cpu = cur_cpu;
887                 } else {
888                         vmcs_load(vcpu->vmcs);
889                 }
890         }
891 }
892
893 /**
894  * vmx_put_cpu - called after using a cpu
895  * @vcpu: VCPU that was loaded.
896  */
897 static void
898 vmx_put_cpu(struct vmx_vcpu *vcpu)
899 {
900         if (core_id() != vcpu->cpu)
901                 panic("%s: core_id() %d != vcpu->cpu %d\n",
902                           __func__, core_id(), vcpu->cpu);
903
904         if (currentcpu->local_vcpu != vcpu)
905                 panic("vmx_put_cpu: asked to clear something not ours");
906
907         ept_sync_context(vcpu_get_eptp(vcpu));
908         vmcs_clear(vcpu->vmcs);
909         vcpu->cpu = -1;
910         currentcpu->local_vcpu = NULL;
911         //put_cpu();
912 }
913
914 /**
915  * vmx_dump_cpu - prints the CPU state
916  * @vcpu: VCPU to print
917  */
918 static void
919 vmx_dump_cpu(struct vmx_vcpu *vcpu)
920 {
921
922         unsigned long flags;
923
924         vmx_get_cpu(vcpu);
925         printk("GUEST_INTERRUPTIBILITY_INFO: 0x%08x\n", vmcs_read32(GUEST_INTERRUPTIBILITY_INFO));
926         printk("VM_ENTRY_INTR_INFO_FIELD 0x%08x\n", vmcs_read32(VM_ENTRY_INTR_INFO_FIELD));
927         printk("EXIT_QUALIFICATION 0x%08x\n", vmcs_read32(EXIT_QUALIFICATION));
928         printk("VM_EXIT_REASON 0x%08x\n", vmcs_read32(VM_EXIT_REASON));
929         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
930         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
931         flags = vmcs_readl(GUEST_RFLAGS);
932         vmx_put_cpu(vcpu);
933
934         printk("--- Begin VCPU Dump ---\n");
935         printk("CPU %d VPID %d\n", vcpu->cpu, 0);
936         printk("RIP 0x%016lx RFLAGS 0x%08lx\n", vcpu->regs.tf_rip, flags);
937         printk("RAX 0x%016lx RCX 0x%016lx\n", vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
938         printk("RDX 0x%016lx RBX 0x%016lx\n", vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
939         printk("RSP 0x%016lx RBP 0x%016lx\n", vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
940         printk("RSI 0x%016lx RDI 0x%016lx\n", vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
941         printk("R8  0x%016lx R9  0x%016lx\n", vcpu->regs.tf_r8, vcpu->regs.tf_r9);
942         printk("R10 0x%016lx R11 0x%016lx\n", vcpu->regs.tf_r10, vcpu->regs.tf_r11);
943         printk("R12 0x%016lx R13 0x%016lx\n", vcpu->regs.tf_r12, vcpu->regs.tf_r13);
944         printk("R14 0x%016lx R15 0x%016lx\n", vcpu->regs.tf_r14, vcpu->regs.tf_r15);
945         printk("--- End VCPU Dump ---\n");
946
947 }
948
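/* Build an EPT pointer for the given root page table.  Per the SDM, EPTP bits
 * 2:0 hold the EPT paging-structure memory type (6 = write-back), bits 5:3
 * hold the page-walk length minus one (3 for a 4-level walk), bit 6 enables
 * the EPT accessed/dirty flags, and bits 12 and up hold the physical address
 * of the top-level table. */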
949 uint64_t
950 construct_eptp(physaddr_t root_hpa)
951 {
952         uint64_t eptp;
953
954         /* set WB memory and 4 levels of walk.  we checked these in ept_init */
955         eptp = VMX_EPT_MEM_TYPE_WB | (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
956         if (cpu_has_vmx_ept_ad_bits())
957                 eptp |= VMX_EPT_AD_ENABLE_BIT;
958         eptp |= (root_hpa & PAGE_MASK);
959
960         return eptp;
961 }
962
963 /**
964  * vmx_setup_initial_guest_state - configures the initial state of guest registers
965  */
966 static void
967 vmx_setup_initial_guest_state(void)
968 {
969         unsigned long tmpl;
970         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
971                 X86_CR4_PGE | X86_CR4_OSFXSR;
972         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
973 #if 0
974         /* do we need it? */
975         if (boot_cpu_has(X86_FEATURE_PCID))
976                 cr4 |= X86_CR4_PCIDE;
977         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
978                 cr4 |= X86_CR4_OSXSAVE;
979 #endif
980         /* we almost certainly have this */
981         /* we'll go sour if we don't. */
982         if (1)  //boot_cpu_has(X86_FEATURE_FSGSBASE))
983                 cr4 |= X86_CR4_RDWRGSFS;
984
985         /* configure control and data registers */
986         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
987                                 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
988         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
989                                 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
990         vmcs_writel(GUEST_CR3, rcr3());
991         vmcs_writel(GUEST_CR4, cr4);
992         vmcs_writel(CR4_READ_SHADOW, cr4);
993         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
994                                 EFER_SCE /*| EFER_FFXSR */ );
995         vmcs_writel(GUEST_GDTR_BASE, 0);
996         vmcs_writel(GUEST_GDTR_LIMIT, 0);
997         vmcs_writel(GUEST_IDTR_BASE, 0);
998         vmcs_writel(GUEST_IDTR_LIMIT, 0);
999         vmcs_writel(GUEST_RIP, 0xdeadbeef);
1000         vmcs_writel(GUEST_RSP, 0xdeadbeef);
1001         vmcs_writel(GUEST_RFLAGS, 0x02);
1002         vmcs_writel(GUEST_DR7, 0);
1003
1004         /* guest segment bases */
1005         vmcs_writel(GUEST_CS_BASE, 0);
1006         vmcs_writel(GUEST_DS_BASE, 0);
1007         vmcs_writel(GUEST_ES_BASE, 0);
1008         vmcs_writel(GUEST_GS_BASE, 0);
1009         vmcs_writel(GUEST_SS_BASE, 0);
1010         rdmsrl(MSR_FS_BASE, tmpl);
1011         vmcs_writel(GUEST_FS_BASE, tmpl);
1012
1013         /* guest segment access rights */
1014         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
1015         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
1016         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
1017         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
1018         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
1019         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
1020
1021         /* guest segment limits */
1022         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
1023         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
1024         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
1025         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
1026         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
1027         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
1028
1029         /* configure segment selectors */
1030         vmcs_write16(GUEST_CS_SELECTOR, 0);
1031         vmcs_write16(GUEST_DS_SELECTOR, 0);
1032         vmcs_write16(GUEST_ES_SELECTOR, 0);
1033         vmcs_write16(GUEST_FS_SELECTOR, 0);
1034         vmcs_write16(GUEST_GS_SELECTOR, 0);
1035         vmcs_write16(GUEST_SS_SELECTOR, 0);
1036         vmcs_write16(GUEST_TR_SELECTOR, 0);
1037
1038         /* guest LDTR */
1039         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1040         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1041         vmcs_writel(GUEST_LDTR_BASE, 0);
1042         vmcs_writel(GUEST_LDTR_LIMIT, 0);
1043
1044         /* guest TSS */
1045         vmcs_writel(GUEST_TR_BASE, 0);
1046         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1047         vmcs_writel(GUEST_TR_LIMIT, 0xff);
1048
1049         /* initialize sysenter */
1050         vmcs_write32(GUEST_SYSENTER_CS, 0);
1051         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1052         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1053
1054         /* other random initialization */
1055         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1056         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1057         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1058         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1059         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1060
1061         /* Initialize posted interrupt notification vector */
1062         vmcs_write16(POSTED_NOTIFICATION_VEC, I_VMMCP_POSTED);
1063 }
1064
1065 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1066                                             uint32_t msr) {
1067         int f = sizeof(unsigned long);
1068         /*
1069          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1070          * have the write-low and read-high bitmap offsets the wrong way round.
1071          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1072          */
1073         if (msr <= 0x1fff) {
1074                 __clear_bit(msr, msr_bitmap + 0x000 / f);       /* read-low */
1075                 __clear_bit(msr, msr_bitmap + 0x800 / f);       /* write-low */
1076         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1077                 msr &= 0x1fff;
1078                 __clear_bit(msr, msr_bitmap + 0x400 / f);       /* read-high */
1079                 __clear_bit(msr, msr_bitmap + 0xc00 / f);       /* write-high */
1080         }
1081 }
1082
1083 /* note the io_bitmap is big enough for the 64K port space. */
1084 static void __vmx_disable_intercept_for_io(unsigned long *io_bitmap,
1085                                            uint16_t port) {
1086         __clear_bit(port, io_bitmap);
1087 }
1088
1089 static void vcpu_print_autoloads(struct vmx_vcpu *vcpu) {
1090         struct vmx_msr_entry *e;
1091         int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1092         printk("Host Autoloads:\n-------------------\n");
1093         for (int i = 0; i < sz; i++) {
1094                 e = &vcpu->msr_autoload.host[i];
1095                 printk("\tMSR 0x%08x: %p\n", e->index, e->value);
1096         }
1097         printk("Guest Autoloads:\n-------------------\n");
1098         for (int i = 0; i < sz; i++) {
1099                 e = &vcpu->msr_autoload.guest[i];
1100                 printk("\tMSR 0x%08x %p\n", e->index, e->value);
1101         }
1102 }
1103
1104 static void dumpmsrs(void) {
1105         int i;
1106         int set[] = {
1107                 MSR_LSTAR,
1108                 MSR_FS_BASE,
1109                 MSR_GS_BASE,
1110                 MSR_KERNEL_GS_BASE,
1111                 MSR_SFMASK,
1112                 MSR_IA32_PEBS_ENABLE
1113         };
1114         for (i = 0; i < ARRAY_SIZE(set); i++) {
1115                 printk("%p: %p\n", set[i], read_msr(set[i]));
1116         }
1117         printk("core id %d\n", core_id());
1118 }
1119
1120 /* emulated msr. For now, an msr value and a pointer to a helper that
1121  * performs the requested operation.
1122  */
1123 struct emmsr {
1124         uint32_t reg;
1125         char *name;
1126         int (*f) (struct vmx_vcpu * vcpu, struct emmsr *, uint32_t, uint32_t);
1127         bool written;
1128         uint32_t edx, eax;
1129 };
1130
1131 int emsr_miscenable(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1132                     uint32_t);
1133 int emsr_mustmatch(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1134                    uint32_t);
1135 int emsr_readonly(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1136                   uint32_t);
1137 int emsr_readzero(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1138                   uint32_t);
1139 int emsr_fakewrite(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1140                    uint32_t);
1141 int emsr_ok(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t, uint32_t);
1142
1143 int emsr_fake_apicbase(struct vmx_vcpu *vcpu, struct emmsr *msr,
1144                    uint32_t opcode, uint32_t qual);
1145
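/* Convention for the emsr_* handlers (defined below): return 0 if the MSR
 * access was handled (or harmlessly faked/ignored), and
 * SHUTDOWN_UNHANDLED_EXIT_REASON if it could not be satisfied and the guest
 * should be shut down. */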
1146 struct emmsr emmsrs[] = {
1147         {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
1148         {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
1149         {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
1150         {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
1151         {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
1152         {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
1153         {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
1154         {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
1155          emsr_fakewrite},
1156         {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
1157          emsr_fakewrite},
1158         {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
1159          emsr_fakewrite},
1160         {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
1161          emsr_fakewrite},
1162         {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
1163          emsr_fakewrite},
1164         {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
1165          emsr_fakewrite},
1166         {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
1167         {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
1168         {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
1169         {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
1170         {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
1171         {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},
1172
1173         // grumble.
1174         {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
1175         {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
1176         // louder.
1177         {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
1178         // aaaaaahhhhhhhhhhhhhhhhhhhhh
1179         {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
1180         {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
1181         {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_ok},
1182         // unsafe.
1183         {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase},
1184
1185         // mostly harmless.
1186         {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
1187         {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},
1188
1189         // TBD
1190         {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite},
1191 };
1192
1193 static uint64_t set_low32(uint64_t hi, uint32_t lo)
1194 {
1195         return (hi & 0xffffffff00000000ULL) | lo;
1196 }
1197
1198 static uint64_t set_low16(uint64_t hi, uint16_t lo)
1199 {
1200         return (hi & 0xffffffffffff0000ULL) | lo;
1201 }
1202
1203 static uint64_t set_low8(uint64_t hi, uint8_t lo)
1204 {
1205         return (hi & 0xffffffffffffff00ULL) | lo;
1206 }
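
/* These helpers replace only the low bits of a saved guest register.  For
 * example, on a RDMSR exit the guest expects the result in EDX:EAX, so the
 * emsr_* handlers below do:
 *      vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
 *      vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
 * leaving the upper halves of the saved registers untouched. */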
1207
1208 /* this may be the only register that needs special handling.
1209  * If there are others then we might want to extend the emmsr struct.
1210  */
1211 int emsr_miscenable(struct vmx_vcpu *vcpu, struct emmsr *msr,
1212                     uint32_t opcode, uint32_t qual) {
1213         uint32_t eax, edx;
1214         rdmsr(msr->reg, eax, edx);
1215         /* we just let them read the misc msr for now. */
1216         if (opcode == EXIT_REASON_MSR_READ) {
1217                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1218                 vcpu->regs.tf_rax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1219                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1220                 return 0;
1221         } else {
1222                 /* if they are writing what is already written, that's ok. */
1223                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1224                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1225                         return 0;
1226         }
1227         printk
1228                 ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
1229                  msr->name, (uint32_t) vcpu->regs.tf_rdx,
1230                  (uint32_t) vcpu->regs.tf_rax, edx, eax);
1231         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1232 }
1233
1234 int emsr_mustmatch(struct vmx_vcpu *vcpu, struct emmsr *msr,
1235                    uint32_t opcode, uint32_t qual) {
1236         uint32_t eax, edx;
1237         rdmsr(msr->reg, eax, edx);
1238         /* we just let them read the real msr for now. */
1239         if (opcode == EXIT_REASON_MSR_READ) {
1240                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1241                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1242                 return 0;
1243         } else {
1244                 /* if they are writing what is already written, that's ok. */
1245                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1246                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1247                         return 0;
1248         }
1249         printk
1250                 ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
1251                  msr->name, (uint32_t) vcpu->regs.tf_rdx,
1252                  (uint32_t) vcpu->regs.tf_rax, edx, eax);
1253         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1254 }
1255
1256 int emsr_ok(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1257             uint32_t qual) {
1258         if (opcode == EXIT_REASON_MSR_READ) {
1259                 rdmsr(msr->reg, vcpu->regs.tf_rdx, vcpu->regs.tf_rax);
1260         } else {
1261                 uint64_t val =
1262                         (uint64_t) vcpu->regs.tf_rdx << 32 | vcpu->regs.tf_rax;
1263                 write_msr(msr->reg, val);
1264         }
1265         return 0;
1266 }
1267
1268 int emsr_readonly(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1269                   uint32_t qual) {
1270         uint32_t eax, edx;
1271         rdmsr((uint32_t) vcpu->regs.tf_rcx, eax, edx);
1272         /* we just let them read the real msr for now. */
1273         if (opcode == EXIT_REASON_MSR_READ) {
1274                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1275                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1276                 return 0;
1277         }
1278
1279         printk("%s: Tried to write a readonly register\n", msr->name);
1280         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1281 }
1282
1283 int emsr_readzero(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1284                   uint32_t qual) {
1285         if (opcode == EXIT_REASON_MSR_READ) {
1286                 vcpu->regs.tf_rax = 0;
1287                 vcpu->regs.tf_rdx = 0;
1288                 return 0;
1289         }
1290
1291         printk("%s: Tried to write a readonly register\n", msr->name);
1292         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1293 }
1294
1295 /* pretend to write it, but don't write it. */
1296 int emsr_fakewrite(struct vmx_vcpu *vcpu, struct emmsr *msr,
1297                    uint32_t opcode, uint32_t qual) {
1298         uint32_t eax, edx;
1299         if (!msr->written) {
1300                 rdmsr(msr->reg, eax, edx);
1301         } else {
1302                 edx = msr->edx;
1303                 eax = msr->eax;
1304         }
1305         /* reads return the value from the last fake write, or the real msr if never written. */
1306         if (opcode == EXIT_REASON_MSR_READ) {
1307                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1308                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1309                 return 0;
1310         } else {
1311                 /* if they are writing what is already written, that's ok. */
1312                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1313                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1314                         return 0;
1315                 msr->edx = vcpu->regs.tf_rdx;
1316                 msr->eax = vcpu->regs.tf_rax;
1317                 msr->written = true;
1318         }
1319         return 0;
1320 }
1321
1322 /* pretend to write it, but don't write it. */
1323 int emsr_fake_apicbase(struct vmx_vcpu *vcpu, struct emmsr *msr,
1324                    uint32_t opcode, uint32_t qual) {
1325         uint32_t eax, edx;
1326         if (!msr->written) {
1327                 //rdmsr(msr->reg, eax, edx);
1328                 /* TODO: tightly coupled to the addr in vmrunkernel.  We want this func
1329                  * to return the val that vmrunkernel put into the VMCS. */
1330                 eax = 0xfee00900;
1331                 edx = 0;
1332         } else {
1333                 edx = msr->edx;
1334                 eax = msr->eax;
1335         }
1336         /* reads return the value from the last fake write, or the canned default above. */
1337         if (opcode == EXIT_REASON_MSR_READ) {
1338                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1339                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1340                 return 0;
1341         } else {
1342                 /* if they are writing what is already written, that's ok. */
1343                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1344                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1345                         return 0;
1346                 msr->edx = vcpu->regs.tf_rdx;
1347                 msr->eax = vcpu->regs.tf_rax;
1348                 msr->written = true;
1349         }
1350         return 0;
1351 }
1352
1353
1354 static int
1355 msrio(struct vmx_vcpu *vcpu, uint32_t opcode, uint32_t qual) {
1356         int i;
1357         for (i = 0; i < ARRAY_SIZE(emmsrs); i++) {
1358                 if (emmsrs[i].reg != vcpu->regs.tf_rcx)
1359                         continue;
1360                 return emmsrs[i].f(vcpu, &emmsrs[i], opcode, qual);
1361         }
1362         printk("msrio for 0x%lx failed\n", vcpu->regs.tf_rcx);
1363         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1364 }
1365
1366 /* Notes on autoloading.  We can't autoload FS_BASE or GS_BASE, according to the
1367  * manual, but that's because they are automatically saved and restored when all
1368  * of the other architectural registers are saved and restored, such as cs, ds,
1369  * es, and other fun things. (See 24.4.1).  We need to make sure we don't
1370  * accidentally intercept them too, since they are magically autoloaded.
1371  *
1372  * We'll need to be careful of any MSR we neither autoload nor intercept
1373  * whenever we vmenter/vmexit, and we intercept by default.
1374  *
1375  * Other MSRs, such as MSR_IA32_PEBS_ENABLE, only work on certain
1376  * architectures. */
1377 static void setup_msr(struct vmx_vcpu *vcpu) {
1378         struct vmx_msr_entry *e;
1379         int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1380         int i;
1381
1382         static_assert((sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs)) <=
1383                       NR_AUTOLOAD_MSRS);
1384
1385         vcpu->msr_autoload.nr = sz;
1386
1387         /* Since PADDR(msr_bitmap) is non-zero, and the bitmap is all 0xff, we now
1388          * intercept all MSRs */
1389         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1390
1391         vmcs_write64(IO_BITMAP_A, PADDR(io_bitmap));
1392         vmcs_write64(IO_BITMAP_B, PADDR((uintptr_t)io_bitmap +
1393                                         (VMX_IO_BITMAP_SZ / 2)));
1394
1395         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1396         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1397         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1398
1399         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1400         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1401         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1402
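        /* Per the SDM: the exit MSR-load area holds host values restored on
         * vmexit, the exit MSR-store area receives the guest's values on vmexit,
         * and the entry MSR-load area holds the guest values loaded on vmentry.
         * We point the store and entry-load areas at the same guest[] array so
         * the guest's MSRs round-trip across exits. */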
1403         for (i = 0; i < sz; i++) {
1404                 uint64_t val;
1405
1406                 e = &vcpu->msr_autoload.host[i];
1407                 e->index = autoloaded_msrs[i];
1408                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1409                 rdmsrl(e->index, val);
1410                 e->value = val;
1411                 printk("host index %p val %p\n", e->index, e->value);
1412
1413                 e = &vcpu->msr_autoload.guest[i];
1414                 e->index = autoloaded_msrs[i];
1415                 e->value = 0xDEADBEEF;
1416                 printk("guest index %p val %p\n", e->index, e->value);
1417         }
1418 }
1419
1420 /**
1421  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1422  */
1423 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu) {
1424         vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
1425         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1426
1427         /* Control */
1428         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1429                      vmcs_config.pin_based_exec_ctrl);
1430
1431         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1432                      vmcs_config.cpu_based_exec_ctrl);
1433
1434         if (cpu_has_secondary_exec_ctrls()) {
1435                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1436                              vmcs_config.cpu_based_2nd_exec_ctrl);
1437         }
1438
1439         vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));
1440
1441         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1442         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1443         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1444
1445         setup_msr(vcpu);
1446
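        /* We only support 64-bit guests (see vmx_run_vcpu), so every vmentry
         * requests IA-32e guest mode. */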
1447         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1448
1449         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1450         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1451
1452         vmcs_writel(CR0_GUEST_HOST_MASK, 0);    // ~0ul);
1453         vmcs_writel(CR4_GUEST_HOST_MASK, 0);    // ~0ul);
1454
1455         //kvm_write_tsc(&vmx->vcpu, 0);
1456         vmcs_writel(TSC_OFFSET, 0);
1457
1458         vmx_setup_constant_host_state();
1459 }
1460
1461 /**
1462  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1463  *
1464  * Returns: A new VCPU structure
1465  */
1466 struct vmx_vcpu *vmx_create_vcpu(struct proc *p) {
1467         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1468         if (!vcpu) {
1469                 return NULL;
1470         }
1471
1472         memset(vcpu, 0, sizeof(*vcpu));
1473
1474         vcpu->proc = p; /* uncounted (weak) reference */
1475         vcpu->vmcs = vmx_alloc_vmcs();
1476         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1477         if (!vcpu->vmcs)
1478                 goto fail_vmcs;
1479
1480         vcpu->cpu = -1;
1481
1482         vmx_get_cpu(vcpu);
1483         vmx_setup_vmcs(vcpu);
1484         vmx_setup_initial_guest_state();
1485         vmx_put_cpu(vcpu);
1486
1487         return vcpu;
1488
1489 fail_vmcs:
1490         kfree(vcpu);
1491         return NULL;
1492 }
1493
1494 /**
1495  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1496  * @vcpu: the VCPU to destroy
1497  */
1498 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu) {
1499         vmx_free_vmcs(vcpu->vmcs);
1500         kfree(vcpu);
1501 }
1502
1503 /**
1504  * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1505  *
1506  * In the contexts where this is used the vcpu pointer should never be NULL.
1507  */
1508 static inline struct vmx_vcpu *vmx_current_vcpu(void) {
1509         struct vmx_vcpu *vcpu = currentcpu->local_vcpu;
1510         if (!vcpu)
1511                 panic("Core has no vcpu!");
1512         return vcpu;
1513 }
1514
1515 /**
1516  * vmx_run_vcpu - launches the CPU into non-root mode
1517  * We ONLY support 64-bit guests.
1518  * @vcpu: the vmx instance to launch
1519  */
1520 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1521 {
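        /* Rough contract for the inline asm below: %0 (RCX) carries the vcpu
         * pointer and RDX carries the HOST_RSP field encoding for vmwrite.
         * Guest GPRs are loaded from vcpu->regs before vmlaunch/vmresume and
         * stored back after the exit; RSP, RIP, and RFLAGS live in the VMCS
         * and are read out afterwards with vmcs_readl. */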
1522         asm(
1523                 /* Store host registers */
1524                 "push %%rdx; push %%rbp;"
1525                 "push %%rcx \n\t" /* placeholder for guest rcx */
1526                 "push %%rcx \n\t"
1527                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1528                 "je 1f \n\t"
1529                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1530                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1531                 "1: \n\t"
1532                 /* Reload cr2 if changed */
1533                 "mov %c[cr2](%0), %%rax \n\t"
1534                 "mov %%cr2, %%rdx \n\t"
1535                 "cmp %%rax, %%rdx \n\t"
1536                 "je 2f \n\t"
1537                 "mov %%rax, %%cr2 \n\t"
1538                 "2: \n\t"
1539                 /* Check if vmlaunch or vmresume is needed */
1540                 "cmpl $0, %c[launched](%0) \n\t"
1541                 /* Load guest registers.  Don't clobber flags. */
1542                 "mov %c[rax](%0), %%rax \n\t"
1543                 "mov %c[rbx](%0), %%rbx \n\t"
1544                 "mov %c[rdx](%0), %%rdx \n\t"
1545                 "mov %c[rsi](%0), %%rsi \n\t"
1546                 "mov %c[rdi](%0), %%rdi \n\t"
1547                 "mov %c[rbp](%0), %%rbp \n\t"
1548                 "mov %c[r8](%0),  %%r8  \n\t"
1549                 "mov %c[r9](%0),  %%r9  \n\t"
1550                 "mov %c[r10](%0), %%r10 \n\t"
1551                 "mov %c[r11](%0), %%r11 \n\t"
1552                 "mov %c[r12](%0), %%r12 \n\t"
1553                 "mov %c[r13](%0), %%r13 \n\t"
1554                 "mov %c[r14](%0), %%r14 \n\t"
1555                 "mov %c[r15](%0), %%r15 \n\t"
1556                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1557
1558                 /* Enter guest mode */
1559                 "jne .Llaunched \n\t"
1560                 ASM_VMX_VMLAUNCH "\n\t"
1561                 "jmp .Lkvm_vmx_return \n\t"
1562                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1563                 ".Lkvm_vmx_return: "
1564                 /* Save guest registers, load host registers, keep flags */
1565                 "mov %0, %c[wordsize](%%rsp) \n\t"
1566                 "pop %0 \n\t"
1567                 "mov %%rax, %c[rax](%0) \n\t"
1568                 "mov %%rbx, %c[rbx](%0) \n\t"
1569                 "popq %c[rcx](%0) \n\t"
1570                 "mov %%rdx, %c[rdx](%0) \n\t"
1571                 "mov %%rsi, %c[rsi](%0) \n\t"
1572                 "mov %%rdi, %c[rdi](%0) \n\t"
1573                 "mov %%rbp, %c[rbp](%0) \n\t"
1574                 "mov %%r8,  %c[r8](%0) \n\t"
1575                 "mov %%r9,  %c[r9](%0) \n\t"
1576                 "mov %%r10, %c[r10](%0) \n\t"
1577                 "mov %%r11, %c[r11](%0) \n\t"
1578                 "mov %%r12, %c[r12](%0) \n\t"
1579                 "mov %%r13, %c[r13](%0) \n\t"
1580                 "mov %%r14, %c[r14](%0) \n\t"
1581                 "mov %%r15, %c[r15](%0) \n\t"
1582                 "mov %%rax, %%r10 \n\t"
1583                 "mov %%rdx, %%r11 \n\t"
1584
1585                 "mov %%cr2, %%rax   \n\t"
1586                 "mov %%rax, %c[cr2](%0) \n\t"
1587
1588                 "pop  %%rbp; pop  %%rdx \n\t"
1589                 "setbe %c[fail](%0) \n\t"
1590                 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1591                 "mov %%rax, %%ds \n\t"
1592                 "mov %%rax, %%es \n\t"
1593               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1594                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1595                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1596                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1597                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1598                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1599                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1600                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1601                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1602                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1603                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1604                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1605                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1606                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1607                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1608                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1609                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1610                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1611                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1612                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1613                 [wordsize]"i"(sizeof(unsigned long))
1614               : "cc", "memory"
1615                 , "rax", "rbx", "rdi", "rsi"
1616                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1617         );
1618
1619         if (vmcs_readl(GUEST_IDTR_BASE) != idtr) {
1620                 printk("idt changed; old 0x%lx new 0x%lx\n", idtr, vmcs_readl(GUEST_IDTR_BASE));
1621                 idtr = vmcs_readl(GUEST_IDTR_BASE);
1622         }
1623         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1624         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1625         printd("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1626                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1627         /* FIXME: do we need to set up other flags? */
1628         // NO IDEA!
1629         vcpu->regs.tf_rflags = vmcs_readl(GUEST_RFLAGS); //& 0xFF) | X86_EFLAGS_IF | 0x2;
1630
1631         vcpu->regs.tf_cs = GD_UT;
1632         vcpu->regs.tf_ss = GD_UD;
1633
1634         vcpu->launched = 1;
1635
1636         if (vcpu->fail) {
1637                 printk("failure detected (err %x)\n",
1638                        vmcs_read32(VM_INSTRUCTION_ERROR));
1639                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1640         }
1641
1642         return vmcs_read32(VM_EXIT_REASON);
1643
1644 #if 0
1645         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1646         vmx_complete_atomic_exit(vmx);
1647         vmx_recover_nmi_blocking(vmx);
1648         vmx_complete_interrupts(vmx);
1649 #endif
1650 }
1651
1652 static void vmx_step_instruction(void) {
1653         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1654                     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1655 }
1656
1657 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu, struct vmctl *v) {
1658         unsigned long gva, gpa;
1659         int exit_qual, ret = -1;
1660         page_t *page;
1661
1662         vmx_get_cpu(vcpu);
1663         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1664         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1665         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1666         v->gpa = gpa;
1667         v->gva = gva;
1668         v->exit_qual = exit_qual;
1669         vmx_put_cpu(vcpu);
1670
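        /* The exit qualification tells us whether the faulting access was a
         * read, write, or instruction fetch; map that onto PROT_* bits and ask
         * the process's fault handler to back the guest-physical address. */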
1671         int prot = 0;
1672         prot |= exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
1673         prot |= exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
1674         prot |= exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
1675         ret = handle_page_fault(current, gpa, prot);
1676
1677         // Some of these get fixed in the vmm; be less chatty now.
1678         if (0 && ret) {
1679                 printk("EPT page fault failure %d, GPA: %p, GVA: %p\n", ret, gpa,
1680                        gva);
1681                 vmx_dump_cpu(vcpu);
1682         }
1683
1684         /* We let the vmm handle the failure cases, so return the VMX exit
1685          * reason (EPT violation), not whatever handle_page_fault returned.
1686          */
1687         return EXIT_REASON_EPT_VIOLATION;
1688 }
1689
1690 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu) {
1691         unsigned int eax, ebx, ecx, edx;
1692
1693         eax = vcpu->regs.tf_rax;
1694         ecx = vcpu->regs.tf_rcx;
1695         cpuid(eax, ecx, &eax, &ebx, &ecx, &edx);
1696         vcpu->regs.tf_rax = eax;
1697         vcpu->regs.tf_rbx = ebx;
1698         vcpu->regs.tf_rcx = ecx;
1699         vcpu->regs.tf_rdx = edx;
1700 }
1701
1702 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu) {
1703         uint32_t intr_info;
1704
1705         vmx_get_cpu(vcpu);
1706         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1707         vmx_put_cpu(vcpu);
1708
1709         printk("vmx (vcpu %p): got an exception\n", vcpu);
1710         printk("vmx (vcpu %p): pid %d\n", vcpu, vcpu->proc->pid);
1711         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1712                 return 0;
1713         }
1714
1715         printk("unhandled nmi, intr_info %x\n", intr_info);
1716         return -EIO;
1717 }
1718
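/* GUEST_INTR_STATUS packs two bytes: the low byte is RVI (the highest vector
 * requesting service) and the high byte is SVI (the vector currently in
 * service), per the virtual-interrupt delivery machinery in the SDM. */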
1719 static void vmx_hwapic_isr_update(struct vmctl *v, int isr)
1720 {
1721         uint16_t status;
1722         uint8_t old;
1723
1724         status = vmcs_read16(GUEST_INTR_STATUS);
1725         old = status >> 8;
1726         if (isr != old) {
1727                 status &= 0xff;
1728                 status |= isr << 8;
1729                 vmcs_write16(GUEST_INTR_STATUS, status);
1730         }
1731 }
1732
1733 static void vmx_set_rvi(int vector)
1734 {
1735         uint16_t status;
1736         uint8_t old;
1737
1738         status = vmcs_read16(GUEST_INTR_STATUS);
1739         printk("%s: Status is %04x", __func__, status);
1740         old = (uint8_t)status & 0xff;
1741         if ((uint8_t)vector != old) {
1742                 status &= ~0xff;
1743                 status |= (uint8_t)vector;
1744                 printk("%s: SET 0x%x\n", __func__, status);
1745
1746                 // Clear SVI
1747                 status &= 0xff;
1748                 vmcs_write16(GUEST_INTR_STATUS, status);
1749         }
1750         printk("%s: Status is %04x after RVI", __func__,
1751                         vmcs_read16(GUEST_INTR_STATUS));
1752 }
1753
1754 /*
1755 static void vmx_set_posted_interrupt(int vector)
1756 {
1757         unsigned long *bit_vec;
1758         unsigned long *pir = vmcs_readl(POSTED_INTR_DESC_ADDR_HIGH);
1759         pir = pir << 32;
1760         pir |= vmcs_readl(POSTED_INTR_DESC_ADDR);
1761
1762         // Move to the correct location to set our bit.
1763         bit_vec = pir + vector/32;
1764         test_and_set_bit(vector%32, bit_vec);
1765
1766         // Set outstanding notification bit
1767         bit_vec = pir + 8;
1768         test_and_set_bit(0, bit_vec);
1769 }
1770
1771 */
1772
1773 int vmx_interrupt_notify(struct vmctl *v) {
1774         int vm_core = v->core;
1775         send_ipi(vm_core, I_VMMCP_POSTED);
1776         if(debug) printk("Posting Interrupt\n");
1777         return 0;
1778 }
1779
1780 /**
1781  * vmx_launch - the main loop for a VMX Dune process
1782  * @conf: the launch configuration
1783  */
1784 int vmx_launch(struct vmctl *v) {
1785         int ret;
1786         struct vmx_vcpu *vcpu;
1787         int errors = 0;
1788         int advance;
1789         int interrupting = 0;
1790         uintptr_t pir_kva, vapic_kva, apic_kva;
1791         uint64_t pir_physical, vapic_physical, apic_physical;
1792         struct proc * current_proc = current;
1793
1794         /* TODO: dirty hack til we have VMM contexts */
1795         vcpu = current->vmm.guest_pcores[0];
1796         if (!vcpu) {
1797                 printk("Failed to get a CPU!\n");
1798                 return -ENOMEM;
1799         }
1800
1801         v->core = core_id();
1802         printd("Core Id: %d\n", v->core);
1803         /* We need to prep the host's autoload region for our current core.  Right
1804          * now, the only autoloaded MSR that varies at runtime (in this case, per
1805          * core) is MSR_KERNEL_GS_BASE. */
1806         rdmsrl(MSR_KERNEL_GS_BASE, vcpu->msr_autoload.host[0].value);
1807         /* if cr3 is set, means 'set everything', else means 'start where you left off' */
1808         vmx_get_cpu(vcpu);
1809         switch(v->command) {
1810         case REG_ALL:
1811                 printd("REG_ALL\n");
1813                 vcpu->regs = v->regs;
1814                 vmcs_writel(GUEST_RSP, v->regs.tf_rsp);
1815                 vmcs_writel(GUEST_RIP, v->regs.tf_rip);
1816                 break;
1817         case REG_RSP_RIP_CR3:
1818                 printd("REG_RSP_RIP_CR3\n");
1819                 vmcs_writel(GUEST_RSP, v->regs.tf_rsp);
1820                 vmcs_writel(GUEST_CR3, v->cr3);
1821                 vcpu->regs = v->regs;
1822
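                /* The VMM hands us user virtual addresses for the posted-interrupt
                 * descriptor and the virtual-APIC page; translate them to kernel VAs
                 * and then to physical addresses for the VMCS.  Both must be
                 * page-aligned, hence the low-12-bit checks below. */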
1823                 pir_kva = uva2kva(current_proc, (void *)v->pir);
1824                 pir_physical = (uint64_t)PADDR(pir_kva);
1825
1826                 vmcs_writel(POSTED_INTR_DESC_ADDR, pir_physical);
1827                 vmcs_writel(POSTED_INTR_DESC_ADDR_HIGH, pir_physical>>32);
1828                 printk("POSTED_INTR_DESC_ADDR_HIGH %ld\n", vmcs_readl(POSTED_INTR_DESC_ADDR_HIGH));
1829                 if (pir_physical & 0xfff) {
1830                         printk("Low order 12 bits of pir address is not 0, value: %p\n", pir_physical);
1831                 }
1832
1833                 vapic_kva = uva2kva(current_proc, (void *)v->vapic);
1834                 vapic_physical = (uint64_t)PADDR(vapic_kva);
1835
1836                 vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, vapic_physical);
1837                 vmcs_writel(VIRTUAL_APIC_PAGE_ADDR_HIGH, vapic_physical>>32);
1838                 if (vapic_physical & 0xfff) {
1839                         printk("Low order 12 bits of vapic address is not 0, value: %p\n", vapic_physical);
1840                 }
1841
1842                 printk("VAPIC PHYSICAL ADDRESS: %p\n", vapic_physical);
1843
1844                 apic_kva = uva2kva(current_proc, (void *)0xfee00000);
1845                 apic_physical = (uint64_t)PADDR(apic_kva);
1846
1847                 vmcs_writel(APIC_ACCESS_ADDR, apic_physical);
1848                 vmcs_writel(APIC_ACCESS_ADDR_HIGH, apic_physical>>32);
1849
1850                 // Clear the EOI exit bitmap(Gan)
1851                 vmcs_writel(EOI_EXIT_BITMAP0, 0);
1852                 vmcs_writel(EOI_EXIT_BITMAP0_HIGH, 0);
1853                 vmcs_writel(EOI_EXIT_BITMAP1, 0);
1854                 vmcs_writel(EOI_EXIT_BITMAP1_HIGH, 0);
1855                 vmcs_writel(EOI_EXIT_BITMAP2, 0);
1856                 vmcs_writel(EOI_EXIT_BITMAP2_HIGH, 0);
1857                 vmcs_writel(EOI_EXIT_BITMAP3, 0);
1858                 vmcs_writel(EOI_EXIT_BITMAP3_HIGH, 0);
1859
1860                 printk("v->apic %p v->pir %p\n", (void *)v->vapic, (void *)v->pir);
1861
1862                 // Initialize vmexits counter
1863                 for (int i = 0; i < 65; i++)
1864                         current_proc->vmm.vmexits[i] = 0;
1865                 // fallthrough
1866         case REG_RIP:
1867                 printd("REG_RIP %p\n", v->regs.tf_rip);
1868                 vmcs_writel(GUEST_RIP, v->regs.tf_rip);
1869                 break;
1870         case RESUME:
1871                 /* If v->interrupt is non-zero, set it in the vmcs and
1872                  * zero it in the vmctl. Else set RIP.
1873                  * We used to check RFLAGS.IF and such here but we'll let the VMM
1874                  * do it. If the VMM screws up we can always fix it. Note to people
1875                  * who know about security: could this be an issue?
1876                  * I don't see how: it will mainly just break your guest vm AFAICT.
1877                  */
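                /* The vmctl interrupt word is written verbatim into the VM-entry
                 * interruption-information field: per the SDM, bits 7:0 are the
                 * vector, bits 10:8 the type, and bit 31 the valid bit. */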
1878                 if (v->interrupt) {
1879                         if(debug) printk("Set VM_ENTRY_INFTR_INFO_FIELD to 0x%x\n", v->interrupt);
1880                         vmcs_writel(VM_ENTRY_INTR_INFO_FIELD, v->interrupt);
1881
1882                         v->interrupt = 0;
1883                         interrupting = 1;
1884                 }
1885                 printd("RESUME\n");
1886                 break;
1887         default:
1888                 error(EINVAL, "Bad command in vmx_launch");
1889         }
1890         vcpu->shutdown = 0;
1891         vmx_put_cpu(vcpu);
1892         if (interrupting) {
1893                 if(debug) printk("BEFORE INTERRUPT: ");
1894                 if(debug) vmx_dump_cpu(vcpu);
1895         }
1896         vcpu->ret_code = -1;
1897
1898         while (1) {
1899                 advance = 0;
1900                 vmx_get_cpu(vcpu);
1901
1902                 // TODO: manage the fpu when we restart.
1903
1904                 // TODO: see if we need to exit before we go much further.
1905                 disable_irq();
1906                 //dumpmsrs();
1907                 ret = vmx_run_vcpu(vcpu);
1908
1909                 //dumpmsrs();
1910                 enable_irq();
1911
1912                 // Update the core the vm is running on in case it has changed.
1913                 v->core = core_id();
1914                 current_proc->vmm.vmexits[ret] += 1;
1915
1916                 v->intrinfo1 = vmcs_readl(GUEST_INTERRUPTIBILITY_INFO);
1917                 v->intrinfo2 = vmcs_readl(VM_EXIT_INTR_INFO);
1918                 vmx_put_cpu(vcpu);
1919
1920                 if (interrupting) {
1921                         if(debug) printk("POST INTERRUPT: \n");
1922                         unsigned long cr8val;
1923                         asm volatile("mov %%cr8,%0" : "=r" (cr8val));
1924                         if(debug) printk("CR8 Value: 0x%08x", cr8val);
1925
1926                         if(debug) printk("%s: Status is %04x\n", __func__,
1927                                         vmcs_read16(GUEST_INTR_STATUS));
1928                         if(debug) vmx_dump_cpu(vcpu);
1929                 }
1930
1931                 if (ret == EXIT_REASON_VMCALL) {
1932                         if (current->vmm.flags & VMM_VMCALL_PRINTF) {
1933                                 uint8_t byte = vcpu->regs.tf_rdi;
1934                                 printd("System call\n");
1935 #ifdef DEBUG
1936                                 vmx_dump_cpu(vcpu);
1937 #endif
1938                                 advance = 3;
1939                                 printk("%c", byte);
1940                                 // adjust the RIP
1941                         } else {
1942                                 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1943 #ifdef DEBUG
1944                                 vmx_dump_cpu(vcpu);
1945                                 printd("system call! WTF\n");
1946 #endif
1947                         }
1948                 } else if (ret == EXIT_REASON_CR_ACCESS) {
1949                         show_cr_access(vmcs_read32(EXIT_QUALIFICATION));
1950                         vmx_dump_cpu(vcpu);
1951                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1952                 } else if (ret == EXIT_REASON_CPUID) {
1953                         printk("CPUID EXIT RIP: %p\n", vcpu->regs.tf_rip);
1954                         vmx_handle_cpuid(vcpu);
1955                         vmx_get_cpu(vcpu);
1956                         vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + 2);
1957                         vmx_put_cpu(vcpu);
1958                 } else if (ret == EXIT_REASON_EPT_VIOLATION) {
1959                         if (vmx_handle_ept_violation(vcpu, v))
1960                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1961                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1962                         if (vmx_handle_nmi_exception(vcpu))
1963                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1964                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1965                         printk("External interrupt\n");
1966                         vmx_dump_cpu(vcpu);
1967                         printk("GUEST_INTERRUPTIBILITY_INFO: 0x%08x,",  v->intrinfo1);
1968                         printk("VM_EXIT_INFO_FIELD 0x%08x,", v->intrinfo2);
1969                         printk("rflags 0x%x\n", vcpu->regs.tf_rflags);
1970                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1971                 } else if (ret == EXIT_REASON_MSR_READ) {
1972                         printd("msr read\n");
1973                         vmx_dump_cpu(vcpu);
1974                         vcpu->shutdown =
1975                                 msrio(vcpu, ret, vmcs_read32(EXIT_QUALIFICATION));
1976                         advance = 2;
1977                 } else if (ret == EXIT_REASON_MSR_WRITE) {
1978                         printd("msr write\n");
1979                         vmx_dump_cpu(vcpu);
1980                         vcpu->shutdown =
1981                                 msrio(vcpu, ret, vmcs_read32(EXIT_QUALIFICATION));
1982                         advance = 2;
1983                 } else if (ret == EXIT_REASON_IO_INSTRUCTION) {
1984                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1985                 } else if (ret == EXIT_REASON_APIC_WRITE) {
1986                         printk("BEGIN APIC WRITE EXIT DUMP\n");
1987                         vmx_dump_cpu(vcpu);
1988                         printk("END APIC WRITE EXIT DUMP\n");
1989                 //} else if (ret == EXIT_REASON_APIC_ACCESS) {
1990                         //vmx_dump_cpu(vcpu);
1991                 } else {
1992                         printk("unhandled exit: reason 0x%x, exit qualification 0x%x\n",
1993                                ret, vmcs_read32(EXIT_QUALIFICATION));
1994                         if (ret & 0x80000000) {
1995                                 printk("entry failed.\n");
1996                                 vmx_dump_cpu(vcpu);
1997                         }
1998                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1999                 }
2000
2001                 interrupting = 0;
2002                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
2003                  * similar to how proc_restartcore/smp_idle only restart the pcpui
2004                  * cur_ctx, we need to do the same, via the VMCS resume business. */
2005                 if (vcpu->shutdown)
2006                         break;
2007
2008                 if (advance) {
2009                         vmx_get_cpu(vcpu);
2010                         vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + advance);
2011                         vmx_put_cpu(vcpu);
2012                 }
2013         }
2014
2015         printd("RETURN. ip %016lx sp %016lx, shutdown 0x%lx ret 0x%lx\n",
2016                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->shutdown, vcpu->shutdown);
2017         v->regs = vcpu->regs;
2018         v->shutdown = vcpu->shutdown;
2019         v->ret_code = ret;
2020 //  hexdump((void *)vcpu->regs.tf_rsp, 128 * 8);
2021         /*
2022          * Return both the reason for the shutdown and a status value.
2023          * The exit() and exit_group() system calls only need 8 bits for
2024          * the status but we allow 16 bits in case we might want to
2025          * return more information for one of the other shutdown reasons.
2026          */
2027         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
2028
2029         return ret;
2030 }
2031
2032 /**
2033  * __vmx_enable - low-level enable of VMX mode on the current CPU
2034  * @vmxon_buf: an opaque buffer for use as the VMXON region
2035  */
2036 static int __vmx_enable(struct vmcs *vmxon_buf) {
2037         uint64_t phys_addr = PADDR(vmxon_buf);
2038         uint64_t old, test_bits;
2039
2040         if (rcr4() & X86_CR4_VMXE) {
2041                 panic("Should never have this happen");
2042                 return -EBUSY;
2043         }
2044
2045         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2046
2047         test_bits = FEATURE_CONTROL_LOCKED;
2048         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2049
2050         if (0)  // tboot_enabled())
2051                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2052
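        /* VMXON requires the feature control MSR to have the lock bit and the
         * VMXON-outside-SMX bit set.  If the BIOS locked the MSR without
         * enabling VMX, we can't fix it up and have to bail below. */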
2053         if ((old & test_bits) != test_bits) {
2054                 /* If it's locked, then trying to set it will cause a GPF.
2055                  * No Dune for you!
2056                  */
2057                 if (old & FEATURE_CONTROL_LOCKED) {
2058                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
2059                         return -1;
2060                 }
2061
2062                 /* enable and lock */
2063                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2064         }
2065         lcr4(rcr4() | X86_CR4_VMXE);
2066
2067         __vmxon(phys_addr);
2068         vpid_sync_vcpu_global();        /* good idea, even if we aren't using vpids */
2069         ept_sync_global();
2070
2071         return 0;
2072 }
2073
2074 /**
2075  * vmx_enable - enables VMX mode on the current CPU
2076  *
2077  * Sets up the necessary state for enabling VMX (e.g., a scratchpad for VMXON).
2079  */
2080 static void vmx_enable(void) {
2081         struct vmcs *vmxon_buf = currentcpu->vmxarea;
2082         int ret;
2083
2084         ret = __vmx_enable(vmxon_buf);
2085         if (ret)
2086                 goto failed;
2087
2088         currentcpu->vmx_enabled = 1;
2089         // TODO: do we need this?
2090         store_gdt(&currentcpu->host_gdt);
2091
2092         printk("VMX enabled on CPU %d\n", core_id());
2093         return;
2094
2095 failed:
2096         printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
2097 }
2098
2099 /**
2100  * vmx_disable - disables VMX mode on the current CPU
2101  */
2102 static void vmx_disable(void *unused) {
2103         if (currentcpu->vmx_enabled) {
2104                 __vmxoff();
2105                 lcr4(rcr4() & ~X86_CR4_VMXE);
2106                 currentcpu->vmx_enabled = 0;
2107         }
2108 }
2109
2110 /* Probe the cpu to see whether it can do vmx.
2111  * Returns TRUE if VT-x is supported, FALSE otherwise.
2112  */
2113 static bool probe_cpu_vmx(void) {
2114         /* The best way to test this code is:
2115          * wrmsr -p <cpu> 0x3a 1
2116          * This will lock vmx off; then modprobe dune.
2117          * Frequently, however, systems have all 0x3a registers set to 5,
2118          * meaning testing is impossible, as vmx can not be disabled.
2119          * We have to simulate it being unavailable in most cases.
2120          * The 'test' variable provides an easy way to simulate
2121          * unavailability of vmx on some, none, or all cpus.
2122          */
2123         if (!cpu_has_vmx()) {
2124                 printk("Machine does not support VT-x\n");
2125                 return FALSE;
2126         } else {
2127                 printk("Machine supports VT-x\n");
2128                 return TRUE;
2129         }
2130 }
2131
2132 static void setup_vmxarea(void) {
2133         struct vmcs *vmxon_buf;
2134         printd("Set up vmxarea for cpu %d\n", core_id());
2135         vmxon_buf = __vmx_alloc_vmcs(core_id());
2136         if (!vmxon_buf) {
2137                 printk("setup_vmxarea failed on node %d\n", core_id());
2138                 return;
2139         }
2140         currentcpu->vmxarea = vmxon_buf;
2141 }
2142
2143 static int ept_init(void) {
2144         if (!cpu_has_vmx_ept()) {
2145                 printk("VMX doesn't support EPT!\n");
2146                 return -1;
2147         }
2148         if (!cpu_has_vmx_eptp_writeback()) {
2149                 printk("VMX EPT doesn't support WB memory!\n");
2150                 return -1;
2151         }
2152         if (!cpu_has_vmx_ept_4levels()) {
2153                 printk("VMX EPT doesn't support 4 level walks!\n");
2154                 return -1;
2155         }
2156         switch (arch_max_jumbo_page_shift()) {
2157         case PML3_SHIFT:
2158                 if (!cpu_has_vmx_ept_1g_page()) {
2159                         printk("VMX EPT doesn't support 1 GB pages!\n");
2160                         return -1;
2161                 }
2162                 break;
2163         case PML2_SHIFT:
2164                 if (!cpu_has_vmx_ept_2m_page()) {
2165                         printk("VMX EPT doesn't support 2 MB pages!\n");
2166                         return -1;
2167                 }
2168                 break;
2169         default:
2170                 printk("Unexpected jumbo page size %d\n",
2171                        arch_max_jumbo_page_shift());
2172                 return -1;
2173         }
2174         if (!cpu_has_vmx_ept_ad_bits()) {
2175                 printk("VMX EPT doesn't support accessed/dirty!\n");
2176                 x86_ept_pte_fix_ups |= EPTE_A | EPTE_D;
2177         }
2178         if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
2179                 printk("VMX EPT can't invalidate PTEs/TLBs!\n");
2180                 return -1;
2181         }
2182
2183         return 0;
2184 }
2185
2186 /**
2187  * intel_vmm_init sets up the physical core data areas that are required to run a vm
2188  * at all.  These data areas are not connected to a specific user process in any way.
2189  * Instead, they are in some sense externalizing what would otherwise be a very large
2190  * ball of state that would be inside the CPU.
2191  */
2192 int intel_vmm_init(void) {
2193         int r, cpu, ret;
2194
2195         if (!probe_cpu_vmx()) {
2196                 return -EOPNOTSUPP;
2197         }
2198
2199         setup_vmcs_config(&ret);
2200
2201         if (ret) {
2202                 printk("setup_vmcs_config failed: %d\n", ret);
2203                 return ret;
2204         }
2205
2206         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
2207         if (!msr_bitmap) {
2208                 printk("Could not allocate msr_bitmap\n");
2209                 return -ENOMEM;
2210         }
2211         io_bitmap = (unsigned long *)get_cont_pages(VMX_IO_BITMAP_ORDER,
2212                                                     KMALLOC_WAIT);
2213         if (!io_bitmap) {
2214                 printk("Could not allocate msr_bitmap\n");
2215                 kfree(msr_bitmap);
2216                 return -ENOMEM;
2217         }
2218         /* FIXME: do we need APIC virtualization (flexpriority?) */
2219
2220         memset(msr_bitmap, 0xff, PAGE_SIZE);
2221         memset(io_bitmap, 0xff, VMX_IO_BITMAP_SZ);
2222
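        /* A set bit in the MSR/IO bitmaps makes the corresponding access vmexit;
         * a clear bit lets the guest touch it directly.  Starting from all ones,
         * the calls below punch holes for the few things we pass through. */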
2223         /* These are the only MSRs that are not autoloaded and not intercepted */
2224         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
2225         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
2226         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_EFER);
2227
2228         /* TODO: this might be dangerous, since they can do more than just read the
2229          * CMOS */
2230         __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_IDX);
2231         __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_DATA);
2232
2233         if ((ret = ept_init())) {
2234                 printk("EPT init failed, %d\n", ret);
2235                 return ret;
2236         }
2237         printk("VMX setup succeeded\n");
2238         return 0;
2239 }
2240
2241 int intel_vmm_pcpu_init(void) {
2242         setup_vmxarea();
2243         vmx_enable();
2244         return 0;
2245 }
2246
2247
2248 void vapic_status_dump_kernel(void *vapic)
2249 {
2250         uint32_t *p = (uint32_t *)vapic;
2251         int i;
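        /* Walk the virtual-APIC page: ISR lives at offsets 0x100-0x170 and IRR
         * at 0x200-0x270, one 32-bit register every 16 bytes (hence i += 4 with
         * 4-byte elements); 0x0B0 is the EOI register. */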
2252         printk("-- BEGIN KERNEL APIC STATUS DUMP --\n");
2253         for (i = 0x100/sizeof(*p); i < 0x180/sizeof(*p); i+=4) {
2254                 printk("VISR : 0x%x: 0x%08x\n", i, p[i]);
2255         }
2256         for (i = 0x200/sizeof(*p); i < 0x280/sizeof(*p); i+=4) {
2257                 printk("VIRR : 0x%x: 0x%08x\n", i, p[i]);
2258         }
2259         i = 0x0B0/sizeof(*p);
2260         printk("EOI FIELD : 0x%x, 0x%08x\n", i, p[i]);
2261
2262         printk("-- END KERNEL APIC STATUS DUMP --\n");
2263 }