[akaros.git] / kern / arch / x86 / vmm / intel / vmx.c
1 //#define DEBUG
2 /**
3  *  vmx.c - The Intel VT-x driver for Dune
4  *
5  * This file is derived from Linux KVM VT-x support.
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8  *
9  * Original Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This modified version is simpler because it avoids the following
14  * features that are not requirements for Dune:
15  *  * Real-mode emulation
16  *  * Nested VT-x support
17  *  * I/O hardware emulation
18  *  * Any of the more esoteric X86 features and registers
19  *  * KVM-specific functionality
20  *
21  * In essence we provide only the minimum functionality needed to run
22  * a process in vmx non-root mode rather than the full hardware emulation
23  * needed to support an entire OS.
24  *
25  * This driver is a research prototype and as such has the following
26  * limitations:
27  *
28  * FIXME: Backward compatibility is currently a non-goal, and only recent
29  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
30  * driver.
31  *
32  * FIXME: Eventually we should handle concurrent users of VT-x more
33  * gracefully instead of requiring exclusive access. This would allow
34  * Dune to interoperate with KVM and other HV solutions.
35  *
36  * FIXME: We need to support hotplugged physical CPUs.
37  *
38  * Authors:
39  *   Adam Belay   <abelay@stanford.edu>
40  */
41
42 /* Basic flow.
43  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
44  * You're left with the feeling that they got part way through and realized they had to have one for
45  *
46  * 1) your CPU is going to be capable of running VMs, and you need state for that.
47  *
48  * 2) you're about to start a guest, and you need state for that.
49  *
50  * So there is 'get this cpu set up to be able to run VMs' stuff, and 'now
51  * let's start a guest' stuff.  In Akaros, CPUs will always be set up
52  * to run a VM if that is possible. Processes can flip themselves into
53  * a VM and that will require another VMCS.
54  *
55  * So: at kernel startup time, the SMP boot stuff calls
56  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
57  * in the case of this file is intel_vmm_init. That does some code
58  * that sets up stuff for ALL sockets, based on the capabilities of
59  * the socket it runs on. If any cpu supports vmx, it assumes they all
60  * do. That's a realistic assumption. So the call_function_all is kind
61  * of stupid, really; it could just see what's on the current cpu and
62  * assume it's on all. HOWEVER: there are systems in the wild that
63  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
64  * might as well allow for the chance that we'll only allow VMMCPs on a
65  * subset (not implemented yet, however).  So: probe all CPUs, get a
66  * count of how many support VMX and, for now, assume they all do
67  * anyway.
68  *
69  * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
70  * which contains all the naughty bits settings for all the cpus that can run a VM.
71  * Realistically, all VMX-capable cpus in a system will have identical configurations.
72  * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
73  *
74  * configure the msr_bitmap. This is the bitmap of MSRs which the
75  * guest can manipulate.  Currently, we only allow GS and FS base.
76  *
77  * Reserve bit 0 in the vpid bitmap, as guests can not use it.
78  *
79  * Set up what we call the vmxarea. The vmxarea is per-cpu, not
80  * per-guest. Once set up, it is left alone.  The ONLY thing we set in
81  * there is the revision id. The vmxarea is page-sized per cpu and
82  * page-aligned. Note that it could be smaller, but why bother? We know
83  * the max size and alignment, and it's convenient.
84  *
85  * Now that it is set up, enable vmx on all cpus. This involves
86  * testing VMXE in cr4, to see if we've been here before (TODO: delete
87  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
88  * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
89  * instruction), and syncing vpid's and ept's.  Now the CPU is ready
90  * to host guests.
91  *
92  * Setting up a guest.
93  * We divide this into two things: vmm_proc_init and vm_run.
94  * Currently, on Intel, vmm_proc_init does nothing.
95  *
96  * vm_run is really complicated. It is called with a coreid and a
97  * vmctl struct. On Intel, it calls vmx_launch. vmx_launch is set
98  * up for a few test cases. If rip is 1, it sets the guest rip to
99  * a function which will deref 0 and should exit with failure 2. If rip is 0,
100  * it calls an infinite loop in the guest.
101  *
102  * The sequence of operations:
103  * create a vcpu
104  * while (1) {
105  * get a vcpu
106  * disable irqs (required or you can't enter the VM)
107  * vmx_run_vcpu()
108  * enable irqs
109  * manage the vm exit
110  * }
111  *
112  * get a vcpu
113  * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
114  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
115  *
116  * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
117  * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
118  * attached to this cpu. Then vmcs_load the vmcs for the vcpu on this cpu,
119  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
120  *
121  * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
122  * of inline assembly with embedded CPP crap. I suspect we'll want to
123  * un-inline it someday, but maybe not.  It's called with a vcpu
124  * struct from which it loads guest state, and to which it stores
125  * non-virtualized host state. It issues a vmlaunch or vmresume
126  * instruction as appropriate, and on return, it checks whether the
127  * launch/resume instruction itself had an error. Note this is NOT the
128  * same as an error while in the virtual machine; this is an error in
129  * startup due to misconfiguration. Depending on what is returned, it's
130  * either a failed vm startup or an exit for one of many reasons.
131  *
132  */
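/* To make the flow above concrete, here is a minimal sketch of the run loop
 * that the comment describes.  This is illustrative only and is not compiled:
 * the real logic lives in vmx_launch() and the vmm glue, the signatures are
 * simplified, and handle_exit() is a hypothetical stand-in for the exit-reason
 * handling. */
#if 0
static void example_run_loop(struct proc *p)
{
	struct vmx_vcpu *vcpu = vmx_create_vcpu(p);
	int exit_reason;

	while (1) {
		vmx_get_cpu(vcpu);	/* VMPTRLD our VMCS on this core */
		disable_irq();		/* can't enter the VM with irqs enabled */
		exit_reason = vmx_run_vcpu(vcpu);	/* vmlaunch/vmresume */
		enable_irq();
		vmx_put_cpu(vcpu);	/* VMCLEAR and detach from this core */
		if (handle_exit(vcpu, exit_reason))	/* hypothetical helper */
			break;
	}
}
#endif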
133
134 /* basically: only rename those globals that might conflict
135  * with existing names. Leave all else the same.
136  * This code is more modern than the other code, yet still
137  * well encapsulated, it seems.
138  */
139 #include <kmalloc.h>
140 #include <string.h>
141 #include <stdio.h>
142 #include <assert.h>
143 #include <error.h>
144 #include <pmap.h>
145 #include <sys/queue.h>
146 #include <smp.h>
147 #include <kref.h>
148 #include <atomic.h>
149 #include <alarm.h>
150 #include <event.h>
151 #include <umem.h>
152 #include <bitops.h>
153 #include <arch/types.h>
154 #include <syscall.h>
155 #include <arch/io.h>
156
157 #include <ros/vmm.h>
158 #include "vmx.h"
159 #include "../vmm.h"
160
161 #include "cpufeature.h"
162
163 #include <trap.h>
164
165 #include <smp.h>
166
167 #define currentcpu (&per_cpu_info[core_id()])
168
169 /* debug stuff -- remove later. It's not even multi-vm safe. */
170 uint64_t idtr;
171
172 // END debug
173 static unsigned long *msr_bitmap;
174 #define VMX_IO_BITMAP_ORDER             4       /* 64 KB */
175 #define VMX_IO_BITMAP_SZ                (1 << (VMX_IO_BITMAP_ORDER + PGSHIFT))
176 static unsigned long *io_bitmap;
177
178 int x86_ept_pte_fix_ups = 0;
179
180 struct vmx_capability vmx_capability;
181 struct vmcs_config vmcs_config;
182
183 static int autoloaded_msrs[] = {
184         MSR_KERNEL_GS_BASE,
185         MSR_LSTAR,
186         MSR_STAR,
187         MSR_SFMASK,
188 };
189
190 static char *cr_access_type[] = {
191         "move to cr",
192         "move from cr",
193         "clts",
194         "lmsw"
195 };
196
197 static char *cr_gpr[] = {
198         "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
199         "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
200 };
201
202 static int guest_cr_num[16] = {
203         GUEST_CR0,
204         -1,
205         -1,
206         GUEST_CR3,
207         GUEST_CR4,
208         -1,
209         -1,
210         -1,
211         -1,     /* 8? */
212         -1, -1, -1, -1, -1, -1, -1
213 };
214
215 __always_inline unsigned long vmcs_readl(unsigned long field);
216 /* See section 24-3 of The Good Book */
217 void
218 show_cr_access(uint64_t val)
219 {
220         int crnr = val & 0xf;
221         int type = (val >> 4) & 3;
222         int reg = (val >> 11) & 0xf;
223         printk("%s: %d: ", cr_access_type[type], crnr);
224         if (type < 2) {
225                 printk("%s", cr_gpr[reg]);
226                 if (guest_cr_num[crnr] > -1) {
227                         printk(": 0x%x", vmcs_readl(guest_cr_num[crnr]));
228                 }
229         }
230         printk("\n");
231 }
232
233 void
234 ept_flush(uint64_t eptp)
235 {
236         ept_sync_context(eptp);
237 }
238
239 static void
240 vmcs_clear(struct vmcs *vmcs)
241 {
242         uint64_t phys_addr = PADDR(vmcs);
243         uint8_t error;
244
245         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0":"=qm"(error):"a"(&phys_addr),
246                                   "m"(phys_addr)
247                                   :"cc", "memory");
248         if (error)
249                 printk("vmclear fail: %p/%llx\n", vmcs, phys_addr);
250 }
251
252 static void
253 vmcs_load(struct vmcs *vmcs)
254 {
255         uint64_t phys_addr = PADDR(vmcs);
256         uint8_t error;
257
258         asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0":"=qm"(error):"a"(&phys_addr),
259                                   "m"(phys_addr)
260                                   :"cc", "memory");
261         if (error)
262                 printk("vmptrld %p/%llx failed\n", vmcs, phys_addr);
263 }
264
265 /* Returns the paddr of the current CPU's VMCS region, or -1 if none is loaded. */
266 static physaddr_t
267 vmcs_get_current(void)
268 {
269         physaddr_t vmcs_paddr;
270         /* RAX contains the addr of the location to store the VMCS pointer.  The
271          * compiler doesn't know the ASM will deref that pointer, hence the =m */
272         asm volatile (ASM_VMX_VMPTRST_RAX:"=m"(vmcs_paddr):"a"(&vmcs_paddr));
273         return vmcs_paddr;
274 }
275
276 __always_inline unsigned long
277 vmcs_readl(unsigned long field)
278 {
279         unsigned long value;
280
281         asm volatile (ASM_VMX_VMREAD_RDX_RAX:"=a"(value):"d"(field):"cc");
282         return value;
283 }
284
285 __always_inline uint16_t
286 vmcs_read16(unsigned long field)
287 {
288         return vmcs_readl(field);
289 }
290
291 static __always_inline uint32_t
292 vmcs_read32(unsigned long field)
293 {
294         return vmcs_readl(field);
295 }
296
297 static __always_inline uint64_t
298 vmcs_read64(unsigned long field)
299 {
300         return vmcs_readl(field);
301 }
302
303 void
304 vmwrite_error(unsigned long field, unsigned long value)
305 {
306         printk("vmwrite error: reg %lx value %lx (err %d)\n",
307                    field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
308 }
309
310 void
311 vmcs_writel(unsigned long field, unsigned long value)
312 {
313         uint8_t error;
314
315         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0":"=q"(error):"a"(value),
316                                   "d"(field):"cc");
317         if (error)
318                 vmwrite_error(field, value);
319 }
320
321 static void
322 vmcs_write16(unsigned long field, uint16_t value)
323 {
324         vmcs_writel(field, value);
325 }
326
327 static void
328 vmcs_write32(unsigned long field, uint32_t value)
329 {
330         vmcs_writel(field, value);
331 }
332
333 static void
334 vmcs_write64(unsigned long field, uint64_t value)
335 {
336         vmcs_writel(field, value);
337 }
338
339 void vapic_status_dump_kernel(void *vapic);
340
341 /*
342  * A note on Things You Can't Make Up.
343  * or
344  * "George, you can type this shit, but you can't say it" -- Harrison Ford
345  *
346  * There are 5 VMCS 32-bit words that control guest permissions. If
347  * you set these correctly, you've got a guest that will behave. If
348  * you get even one bit wrong, you've got a guest that will chew your
349  * leg off. Some bits must be 1, some must be 0, and some can be set
350  * either way. To add to the fun, the docs are sort of a docudrama or,
351  * as the quote goes, "interesting if true."
352  *
353  * To determine what bit can be set in what VMCS 32-bit control word,
354  * there are 5 corresponding 64-bit MSRs.  And, to make it even more
355  * fun, the standard set of MSRs have errors in them, i.e. report
356  * incorrect values, for legacy reasons, and so you are supposed to
357  * "look around" to another set, which have correct bits in
358  * them. There are four such 'correct' registers, and they have _TRUE_
359  * in the names as you can see below. We test for the value of VMCS
360  * control bits in the _TRUE_ registers if possible. The fifth
361  * register, CPU Secondary Exec Controls, which came later, needs no
362  * _TRUE_ variant.
363  *
364  * For each MSR, the high 32 bits tell you what bits can be "1" by a
365  * "1" in that position; the low 32 bits tell you what bit can be "0"
366  * by a "0" in that position. So, for each of 32 bits in a given VMCS
367  * control word, there is a pair of bits in an MSR that tells you what
368  * values it can take. The two bits, of which there are *four*
369  * combinations, describe the *three* possible operations on a
370  * bit. The two bits, taken together, form an untruth table: There are
371  * three possibilities: The VMCS bit can be set to 0 or 1, or it can
372  * only be 0, or only 1. The fourth combination is not supposed to
373  * happen.
374  *
375  * So: there is the 1 bit from the upper 32 bits of the msr.
376  * If this bit is set, then the bit can be 1. If clear, it can not be 1.
377  *
378  * Then there is the 0 bit, from low 32 bits. If clear, the VMCS bit
379  * can be 0. If 1, the VMCS bit can not be 0.
380  *
381  * SO, let's call the 1 bit R1, and the 0 bit R0, we have:
382  *  R1 R0
383  *  0 0 -> must be 0
384  *  1 0 -> can be 1, can be 0
385  *  0 1 -> can not be 1, can not be 0. --> JACKPOT! Not seen yet.
386  *  1 1 -> must be one.
387  *
388  * It's also pretty hard to know what you can and can't set, and
389  * that's led to inadvertent opening of permissions at times.  Because
390  * of this complexity we've decided on the following: the driver must
391  * define EVERY bit, UNIQUELY, for each of the 5 registers, that it wants
392  * set. Further, for any bit that's settable, the driver must specify
393  * a setting; for any bit that's reserved, the driver settings must
394  * match that bit. If there are reserved bits we don't specify, that's
395  * ok; we'll take them as is.
396  *
397  * We use a set-means-set, and set-means-clear model, i.e. we use a
398  * 32-bit word to contain the bits we want to be 1, indicated by one;
399  * and another 32-bit word in which a bit we want to be 0 is indicated
400  * by a 1. This allows us to easily create masks of all bits we're
401  * going to set, for example.
402  *
403  * We have two 32-bit numbers for each 32-bit VMCS field: bits we want
404  * set and bits we want clear.  If you read the MSR for that field,
405  * compute the reserved 0 and 1 settings, and | them together, they
406  * need to result in 0xffffffff. You can see that we can create other
407  * tests for conflicts (i.e. overlap).
408  *
409  * At this point, I've tested check_vmxec_controls in every way
410  * possible, because I kept screwing the bitfields up. You'll get a nice
411  * error and it won't work at all, which is what we want: a
412  * failure-prone setup, where even errors that might result in correct
413  * values are caught -- "right answer, wrong method, zero credit." If there's
414  * weirdness in the bits, we don't want to run.
415  * The try_set stuff adds particular ugliness but we have to have it.
416  */
417
418 static bool
419 check_vmxec_controls(struct vmxec const *v, bool have_true_msr,
420                                          uint32_t * result)
421 {
422         bool err = false;
423         uint32_t vmx_msr_low, vmx_msr_high;
424         uint32_t reserved_0, reserved_1, changeable_bits, try1;
425
426         if (have_true_msr)
427                 rdmsr(v->truemsr, vmx_msr_low, vmx_msr_high);
428         else
429                 rdmsr(v->msr, vmx_msr_low, vmx_msr_high);
430
431         if (vmx_msr_low & ~vmx_msr_high)
432                 warn("JACKPOT: Conflicting VMX ec ctls for %s, high 0x%08x low 0x%08x",
433                          v->name, vmx_msr_high, vmx_msr_low);
434
435         reserved_0 = (~vmx_msr_low) & (~vmx_msr_high);
436         reserved_1 = vmx_msr_low & vmx_msr_high;
437         changeable_bits = ~(reserved_0 | reserved_1);
438
439         /*
440          * this is very much as follows:
441          * accept the things I cannot change,
442          * change the things I can,
443          * know the difference.
444          */
445
446         /* Conflict. Don't try to both set and reset bits. */
447         if ((v->must_be_1 & (v->must_be_0 | v->try_set_1 | v->try_set_0)) ||
448             (v->must_be_0 & (v->try_set_1 | v->try_set_0)) ||
449             (v->try_set_1 & v->try_set_0)) {
450                 printk("%s: must 0 (0x%x) and must be 1 (0x%x) and try_set_0 (0x%x) and try_set_1 (0x%x) overlap\n",
451                        v->name, v->must_be_0, v->must_be_1, v->try_set_0, v->try_set_1);
452                 err = true;
453         }
454
455         /* coverage */
456         if (((v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1) & changeable_bits) != changeable_bits) {
457                 printk("%s: Need to cover 0x%x and have 0x%x,0x%x\n",
458                        v->name, changeable_bits, v->must_be_0, v->must_be_1, v->try_set_0, v->try_set_1);
459                 err = true;
460         }
461
462         if ((v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1 | reserved_0 | reserved_1) != 0xffffffff) {
463                 printk("%s: incomplete coverage: have 0x%x, want 0x%x\n",
464                        v->name, v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1 |
465                        reserved_0 | reserved_1, 0xffffffff);
466                 err = true;
467         }
468
469         /* Don't try to change bits that can't be changed. */
470         if ((v->must_be_0 & (reserved_0 | changeable_bits)) != v->must_be_0) {
471                 printk("%s: set to 0 (0x%x) can't be done\n", v->name, v->must_be_0);
472                 err = true;
473         }
474
475         if ((v->must_be_1 & (reserved_1 | changeable_bits)) != v->must_be_1) {
476                 printk("%s: set to 1 (0x%x) can't be done\n", v->name, v->must_be_1);
477                 err = true;
478         }
479         // Note we don't REQUIRE that try_set_0 or try_set_1 be possible. We just want to try it.
480
481         // Clear bits in try_set that can't be set.
482         try1 = v->try_set_1 & (reserved_1 | changeable_bits);
483
484         /* If there's been any error at all, spill our guts and return. */
485         if (err) {
486                 printk("%s: vmx_msr_high 0x%x, vmx_msr_low 0x%x, ",
487                            v->name, vmx_msr_high, vmx_msr_low);
488                 printk("must_be_0 0x%x, try_set_0 0x%x,reserved_0 0x%x",
489                            v->must_be_0, v->try_set_0, reserved_0);
490                 printk("must_be_1 0x%x, try_set_1 0x%x,reserved_1 0x%x",
491                            v->must_be_1, v->try_set_1, reserved_1);
492                 printk(" reserved_0 0x%x", reserved_0);
493                 printk(" changeable_bits 0x%x\n", changeable_bits);
494                 return false;
495         }
496
497         *result = v->must_be_1 | try1 | reserved_1;
498
499         printk("%s: check_vmxec_controls succeeds with result 0x%x\n",
500                    v->name, *result);
501         return true;
502 }
503
504 /*
505  * We're trying to make this as readable as possible. Realistically, it will
506  * rarely if ever change, if the past is any guide.
507  */
508 static const struct vmxec pbec = {
509         .name = "Pin Based Execution Controls",
510         .msr = MSR_IA32_VMX_PINBASED_CTLS,
511         .truemsr = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
512
513         .must_be_1 = (PIN_BASED_EXT_INTR_MASK |
514                      PIN_BASED_NMI_EXITING |
515                      PIN_BASED_VIRTUAL_NMIS |
516                      PIN_BASED_POSTED_INTR),
517
518         .must_be_0 = (PIN_BASED_VMX_PREEMPTION_TIMER),
519 };
520
521 static const struct vmxec cbec = {
522         .name = "CPU Based Execution Controls",
523         .msr = MSR_IA32_VMX_PROCBASED_CTLS,
524         .truemsr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
525
526         .must_be_1 = (//CPU_BASED_MWAIT_EXITING |
527                         CPU_BASED_HLT_EXITING |
528                      CPU_BASED_TPR_SHADOW |
529                      CPU_BASED_RDPMC_EXITING |
530                      CPU_BASED_CR8_LOAD_EXITING |
531                      CPU_BASED_CR8_STORE_EXITING |
532                      CPU_BASED_USE_MSR_BITMAPS |
533                      CPU_BASED_USE_IO_BITMAPS |
534                      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS),
535
536         .must_be_0 = (
537                         CPU_BASED_MWAIT_EXITING |
538                         CPU_BASED_VIRTUAL_INTR_PENDING |
539                      CPU_BASED_INVLPG_EXITING |
540                      CPU_BASED_USE_TSC_OFFSETING |
541                      CPU_BASED_RDTSC_EXITING |
542                      CPU_BASED_CR3_LOAD_EXITING |
543                      CPU_BASED_CR3_STORE_EXITING |
544                      CPU_BASED_MOV_DR_EXITING |
545                      CPU_BASED_VIRTUAL_NMI_PENDING |
546                      CPU_BASED_MONITOR_TRAP |
547                      CPU_BASED_PAUSE_EXITING |
548                      CPU_BASED_UNCOND_IO_EXITING),
549
550         .try_set_0 = (CPU_BASED_MONITOR_EXITING)
551 };
552
553 static const struct vmxec cb2ec = {
554         .name = "CPU Based 2nd Execution Controls",
555         .msr = MSR_IA32_VMX_PROCBASED_CTLS2,
556         .truemsr = MSR_IA32_VMX_PROCBASED_CTLS2,
557
558         .must_be_1 = (SECONDARY_EXEC_ENABLE_EPT |
559                      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
560                      SECONDARY_EXEC_APIC_REGISTER_VIRT |
561                      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
562                      SECONDARY_EXEC_WBINVD_EXITING),
563
564         .must_be_0 = (
565                      //SECONDARY_EXEC_APIC_REGISTER_VIRT |
566                      //SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
567                      SECONDARY_EXEC_DESCRIPTOR_EXITING |
568                      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
569                      SECONDARY_EXEC_ENABLE_VPID |
570                      SECONDARY_EXEC_UNRESTRICTED_GUEST |
571                      SECONDARY_EXEC_PAUSE_LOOP_EXITING |
572                      SECONDARY_EXEC_RDRAND_EXITING |
573                      SECONDARY_EXEC_ENABLE_INVPCID |
574                      SECONDARY_EXEC_ENABLE_VMFUNC |
575                      SECONDARY_EXEC_SHADOW_VMCS |
576                      SECONDARY_EXEC_RDSEED_EXITING |
577                      SECONDARY_EPT_VE |
578                      SECONDARY_ENABLE_XSAV_RESTORE),
579
580         .try_set_1 = SECONDARY_EXEC_RDTSCP,
581
582         // mystery bit.
583         .try_set_0 = 0x2000000
584
585 };
586
587 static const struct vmxec vmentry = {
588         .name = "VMENTRY controls",
589         .msr = MSR_IA32_VMX_ENTRY_CTLS,
590         .truemsr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
591         /* exact order from vmx.h; only these are enabled. */
592
593         .must_be_1 =  (VM_ENTRY_LOAD_DEBUG_CONTROLS | /* can't set to 0 */
594                       VM_ENTRY_LOAD_IA32_EFER |
595                       VM_ENTRY_IA32E_MODE),
596
597         .must_be_0 = (VM_ENTRY_SMM |
598                      VM_ENTRY_DEACT_DUAL_MONITOR |
599                      VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
600                      VM_ENTRY_LOAD_IA32_PAT),
601 };
602
603 static const struct vmxec vmexit = {
604         .name = "VMEXIT controls",
605         .msr = MSR_IA32_VMX_EXIT_CTLS,
606         .truemsr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
607
608         .must_be_1 = (VM_EXIT_SAVE_DEBUG_CONTROLS |     /* can't set to 0 */
609                                  VM_EXIT_ACK_INTR_ON_EXIT |
610                                  VM_EXIT_SAVE_IA32_EFER | 
611                                 VM_EXIT_LOAD_IA32_EFER | 
612                                 VM_EXIT_HOST_ADDR_SPACE_SIZE),  /* 64 bit */
613
614         .must_be_0 = (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
615                                 // VM_EXIT_ACK_INTR_ON_EXIT |
616                                  VM_EXIT_SAVE_IA32_PAT |
617                                  VM_EXIT_LOAD_IA32_PAT | 
618                                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER),
619 };
620
621 static void
622 setup_vmcs_config(void *p)
623 {
624         int *ret = p;
625         struct vmcs_config *vmcs_conf = &vmcs_config;
626         uint32_t vmx_msr_high;
627         uint64_t vmx_msr;
628         bool have_true_msrs = false;
629         bool ok;
630
631         *ret = -EIO;
632
633         vmx_msr = read_msr(MSR_IA32_VMX_BASIC);
634         vmx_msr_high = vmx_msr >> 32;
635
636         /*
637          * If bit 55 (VMX_BASIC_TRUE_CTLS) is set, then we
638          * can go for the true MSRs.  Else, we ask you to get a better CPU.
639          */
640         if (vmx_msr & VMX_BASIC_TRUE_CTLS) {
641                 have_true_msrs = true;
642                 printd("Running with TRUE MSRs\n");
643         } else {
644                 printk("Running with non-TRUE MSRs, this is old hardware\n");
645         }
646
647         /*
648          * Don't worry that one or more of these might fail and leave
649          * the VMCS in some kind of incomplete state. If one of these
650          * fails, the caller is going to discard the VMCS.
651          * It is written this way to ensure we get results of all tests and avoid
652          * BMAFR behavior.
653          */
654         ok = check_vmxec_controls(&pbec, have_true_msrs,
655                                   &vmcs_conf->pin_based_exec_ctrl);
656         ok = check_vmxec_controls(&cbec, have_true_msrs,
657                                   &vmcs_conf->cpu_based_exec_ctrl) && ok;
658         /* Only check cb2ec if we're still ok, o/w we may GPF */
659         ok = ok && check_vmxec_controls(&cb2ec, have_true_msrs,
660                                         &vmcs_conf->cpu_based_2nd_exec_ctrl);
661         ok = check_vmxec_controls(&vmentry, have_true_msrs,
662                                   &vmcs_conf->vmentry_ctrl) && ok;
663         ok = check_vmxec_controls(&vmexit, have_true_msrs,
664                                   &vmcs_conf->vmexit_ctrl) && ok;
665         if (! ok) {
666                 printk("vmxexec controls is no good.\n");
667                 return;
668         }
669
670         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
671         if ((vmx_msr_high & 0x1fff) > PGSIZE) {
672                 printk("vmx_msr_high & 0x1fff) is 0x%x, > PAGE_SIZE 0x%x\n",
673                            vmx_msr_high & 0x1fff, PGSIZE);
674                 return;
675         }
676
677         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
678         if (vmx_msr & VMX_BASIC_64) {
679                 printk("VMX doesn't support 64 bit width!\n");
680                 return;
681         }
682
683         if (((vmx_msr & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT)
684                 != VMX_BASIC_MEM_TYPE_WB) {
685                 printk("VMX doesn't support WB memory for VMCS accesses!\n");
686                 return;
687         }
688
689         vmcs_conf->size = vmx_msr_high & 0x1fff;
690         vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
691         vmcs_conf->revision_id = (uint32_t) vmx_msr;
692
693         /* Read in the caps for runtime checks.  This MSR is only available if
694          * secondary controls and ept or vpid is on, which we check earlier */
695         rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid);
696
697         *ret = 0;
698 }
699
700 static struct vmcs *
701 __vmx_alloc_vmcs(int node)
702 {
703         struct vmcs *vmcs;
704
705         vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
706         if (!vmcs)
707                 return 0;
708         memset(vmcs, 0, vmcs_config.size);
709         vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
710         printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
711         return vmcs;
712 }
713
714 /**
715  * vmx_alloc_vmcs - allocates a VMCS region
716  *
717  * NOTE: Assumes the new region will be used by the current CPU.
718  *
719  * Returns a valid VMCS region.
720  */
721 static struct vmcs *
722 vmx_alloc_vmcs(void)
723 {
724         return __vmx_alloc_vmcs(numa_id());
725 }
726
727 /**
728  * vmx_free_vmcs - frees a VMCS region
729  */
730 static void
731 vmx_free_vmcs(struct vmcs *vmcs)
732 {
733         //free_pages((unsigned long)vmcs, vmcs_config.order);
734 }
735
736 /*
737  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
738  * will not change in the lifetime of the guest.
739  * Note that host-state that does change is set elsewhere. E.g., host-state
740  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
741  */
742 static void
743 vmx_setup_constant_host_state(void)
744 {
745         uint32_t low32, high32;
746         unsigned long tmpl;
747         pseudodesc_t dt;
748
749         vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);    /* 22.2.3 */
750         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
751         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
752
753         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
754         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
755         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
756         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
757         vmcs_write16(HOST_TR_SELECTOR, GD_TSS); /* 22.2.4 */
758
759         native_store_idt(&dt);
760         vmcs_writel(HOST_IDTR_BASE, dt.pd_base);        /* 22.2.4 */
761
762         asm("mov $.Lkvm_vmx_return, %0":"=r"(tmpl));
763         vmcs_writel(HOST_RIP, tmpl);    /* 22.2.5 */
764
765         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
766         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
767         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
768         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);      /* 22.2.3 */
769
770         rdmsr(MSR_EFER, low32, high32);
771         vmcs_write32(HOST_IA32_EFER, low32);
772
773         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
774                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
775                 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
776         }
777
778         vmcs_write16(HOST_FS_SELECTOR, 0);      /* 22.2.4 */
779         vmcs_write16(HOST_GS_SELECTOR, 0);      /* 22.2.4 */
780
781         /* TODO: This (at least gs) is per cpu */
782         rdmsrl(MSR_FS_BASE, tmpl);
783         vmcs_writel(HOST_FS_BASE, tmpl);        /* 22.2.4 */
784         rdmsrl(MSR_GS_BASE, tmpl);
785         vmcs_writel(HOST_GS_BASE, tmpl);        /* 22.2.4 */
786 }
787
788 static inline uint16_t
789 vmx_read_ldt(void)
790 {
791         uint16_t ldt;
792         asm("sldt %0":"=g"(ldt));
793         return ldt;
794 }
795
796 static unsigned long
797 segment_base(uint16_t selector)
798 {
799         pseudodesc_t *gdt = &currentcpu->host_gdt;
800         struct desc_struct *d;
801         unsigned long table_base;
802         unsigned long v;
803
804         if (!(selector & ~3)) {
805                 return 0;
806         }
807
808         table_base = gdt->pd_base;
809
810         if (selector & 4) {     /* from ldt */
811                 uint16_t ldt_selector = vmx_read_ldt();
812
813                 if (!(ldt_selector & ~3)) {
814                         return 0;
815                 }
816
817                 table_base = segment_base(ldt_selector);
818         }
819         d = (struct desc_struct *)(table_base + (selector & ~7));
820         v = get_desc_base(d);
821         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
822                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
823         return v;
824 }
825
826 static inline unsigned long
827 vmx_read_tr_base(void)
828 {
829         uint16_t tr;
830         asm("str %0":"=g"(tr));
831         return segment_base(tr);
832 }
833
834 static void
835 __vmx_setup_cpu(void)
836 {
837         pseudodesc_t *gdt = &currentcpu->host_gdt;
838         unsigned long sysenter_esp;
839         unsigned long tmpl;
840
841         /*
842          * Linux uses per-cpu TSS and GDT, so set these when switching
843          * processors.
844          */
845         vmcs_writel(HOST_TR_BASE, vmx_read_tr_base());  /* 22.2.4 */
846         vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);      /* 22.2.4 */
847
848         rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
849         vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
850
851         rdmsrl(MSR_FS_BASE, tmpl);
852         vmcs_writel(HOST_FS_BASE, tmpl);        /* 22.2.4 */
853         rdmsrl(MSR_GS_BASE, tmpl);
854         vmcs_writel(HOST_GS_BASE, tmpl);        /* 22.2.4 */
855 }
856
857 /**
858  * vmx_get_cpu - called before using a cpu
859  * @vcpu: VCPU that will be loaded.
860  *
861  * Disables preemption. Call vmx_put_cpu() when finished.
862  */
863 static void
864 vmx_get_cpu(struct vmx_vcpu *vcpu)
865 {
866         int cur_cpu = core_id();
867         handler_wrapper_t *w;
868
869         if (currentcpu->local_vcpu)
870                 panic("get_cpu: currentcpu->localvcpu was non-NULL");
871         if (currentcpu->local_vcpu != vcpu) {
872                 currentcpu->local_vcpu = vcpu;
873
874                 if (vcpu->cpu != cur_cpu) {
875                         if (vcpu->cpu >= 0) {
876                                 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
877                         } else
878                                 vmcs_clear(vcpu->vmcs);
879
880                         ept_sync_context(vcpu_get_eptp(vcpu));
881
882                         vcpu->launched = 0;
883                         vmcs_load(vcpu->vmcs);
884                         __vmx_setup_cpu();
885                         vcpu->cpu = cur_cpu;
886                 } else {
887                         vmcs_load(vcpu->vmcs);
888                 }
889         }
890 }
891
892 /**
893  * vmx_put_cpu - called after using a cpu
894  * @vcpu: VCPU that was loaded.
895  */
896 static void
897 vmx_put_cpu(struct vmx_vcpu *vcpu)
898 {
899         if (core_id() != vcpu->cpu)
900                 panic("%s: core_id() %d != vcpu->cpu %d\n",
901                           __func__, core_id(), vcpu->cpu);
902
903         if (currentcpu->local_vcpu != vcpu)
904                 panic("vmx_put_cpu: asked to clear something not ours");
905
906         ept_sync_context(vcpu_get_eptp(vcpu));
907         vmcs_clear(vcpu->vmcs);
908         vcpu->cpu = -1;
909         currentcpu->local_vcpu = NULL;
910         //put_cpu();
911 }
912
913 /**
914  * vmx_dump_cpu - prints the CPU state
915  * @vcpu: VCPU to print
916  */
917 static void
918 vmx_dump_cpu(struct vmx_vcpu *vcpu)
919 {
920
921         unsigned long flags;
922
923         vmx_get_cpu(vcpu);
924         printk("GUEST_INTERRUPTIBILITY_INFO: 0x%08x\n",  vmcs_readl(GUEST_INTERRUPTIBILITY_INFO));
925         printk("VM_ENTRY_INTR_INFO_FIELD 0x%08x\n", vmcs_readl(VM_ENTRY_INTR_INFO_FIELD));
926         printk("EXIT_QUALIFICATION 0x%08x\n", vmcs_read32(EXIT_QUALIFICATION));
927         printk("VM_EXIT_REASON 0x%08x\n", vmcs_read32(VM_EXIT_REASON));
928         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
929         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
930         flags = vmcs_readl(GUEST_RFLAGS);
931         vmx_put_cpu(vcpu);
932
933         printk("--- Begin VCPU Dump ---\n");
934         printk("CPU %d VPID %d\n", vcpu->cpu, 0);
935         printk("RIP 0x%016lx RFLAGS 0x%08lx\n", vcpu->regs.tf_rip, flags);
936         printk("RAX 0x%016lx RCX 0x%016lx\n", vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
937         printk("RDX 0x%016lx RBX 0x%016lx\n", vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
938         printk("RSP 0x%016lx RBP 0x%016lx\n", vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
939         printk("RSI 0x%016lx RDI 0x%016lx\n", vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
940         printk("R8  0x%016lx R9  0x%016lx\n", vcpu->regs.tf_r8, vcpu->regs.tf_r9);
941         printk("R10 0x%016lx R11 0x%016lx\n", vcpu->regs.tf_r10, vcpu->regs.tf_r11);
942         printk("R12 0x%016lx R13 0x%016lx\n", vcpu->regs.tf_r12, vcpu->regs.tf_r13);
943         printk("R14 0x%016lx R15 0x%016lx\n", vcpu->regs.tf_r14, vcpu->regs.tf_r15);
944         printk("--- End VCPU Dump ---\n");
945
946 }
947
948 uint64_t
949 construct_eptp(physaddr_t root_hpa)
950 {
951         uint64_t eptp;
952
953         /* set WB memory and 4 levels of walk.  we checked these in ept_init */
954         eptp = VMX_EPT_MEM_TYPE_WB | (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
955         if (cpu_has_vmx_ept_ad_bits())
956                 eptp |= VMX_EPT_AD_ENABLE_BIT;
957         eptp |= (root_hpa & PAGE_MASK);
958
959         return eptp;
960 }
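/* For a sense of scale: assuming the usual encodings (VMX_EPT_MEM_TYPE_WB == 6,
 * VMX_EPT_GAW_4_LVL == 3 with VMX_EPT_GAW_EPTP_SHIFT == 3, and
 * VMX_EPT_AD_ENABLE_BIT == (1 << 6)), a root_hpa of 0x12345000 on a CPU with
 * EPT A/D support yields eptp = 0x12345000 | 0x6 | 0x18 | 0x40 = 0x1234505e.
 * Worked example only; the real values come from the macros above. */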
961
962 /**
963  * vmx_setup_initial_guest_state - configures the initial state of guest registers
964  */
965 static void
966 vmx_setup_initial_guest_state(void)
967 {
968         unsigned long tmpl;
969         unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
970                 X86_CR4_PGE | X86_CR4_OSFXSR;
971         uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
972 #if 0
973         /* do we need it? */
974         if (boot_cpu_has(X86_FEATURE_PCID))
975                 cr4 |= X86_CR4_PCIDE;
976         if (boot_cpu_has(X86_FEATURE_OSXSAVE))
977                 cr4 |= X86_CR4_OSXSAVE;
978 #endif
979         /* we almost certainly have this */
980         /* we'll go sour if we don't. */
981         if (1)  //boot_cpu_has(X86_FEATURE_FSGSBASE))
982                 cr4 |= X86_CR4_RDWRGSFS;
983
984         /* configure control and data registers */
985         vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
986                                 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
987         vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
988                                 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
989         vmcs_writel(GUEST_CR3, rcr3());
990         vmcs_writel(GUEST_CR4, cr4);
991         vmcs_writel(CR4_READ_SHADOW, cr4);
992         vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
993                                 EFER_SCE /*| EFER_FFXSR */ );
994         vmcs_writel(GUEST_GDTR_BASE, 0);
995         vmcs_writel(GUEST_GDTR_LIMIT, 0);
996         vmcs_writel(GUEST_IDTR_BASE, 0);
997         vmcs_writel(GUEST_IDTR_LIMIT, 0);
998         vmcs_writel(GUEST_RIP, 0xdeadbeef);
999         vmcs_writel(GUEST_RSP, 0xdeadbeef);
1000         vmcs_writel(GUEST_RFLAGS, 0x02);
1001         vmcs_writel(GUEST_DR7, 0);
1002
1003         /* guest segment bases */
1004         vmcs_writel(GUEST_CS_BASE, 0);
1005         vmcs_writel(GUEST_DS_BASE, 0);
1006         vmcs_writel(GUEST_ES_BASE, 0);
1007         vmcs_writel(GUEST_GS_BASE, 0);
1008         vmcs_writel(GUEST_SS_BASE, 0);
1009         rdmsrl(MSR_FS_BASE, tmpl);
1010         vmcs_writel(GUEST_FS_BASE, tmpl);
1011
1012         /* guest segment access rights */
1013         vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
1014         vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
1015         vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
1016         vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
1017         vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
1018         vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
1019
1020         /* guest segment limits */
1021         vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
1022         vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
1023         vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
1024         vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
1025         vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
1026         vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
1027
1028         /* configure segment selectors */
1029         vmcs_write16(GUEST_CS_SELECTOR, 0);
1030         vmcs_write16(GUEST_DS_SELECTOR, 0);
1031         vmcs_write16(GUEST_ES_SELECTOR, 0);
1032         vmcs_write16(GUEST_FS_SELECTOR, 0);
1033         vmcs_write16(GUEST_GS_SELECTOR, 0);
1034         vmcs_write16(GUEST_SS_SELECTOR, 0);
1035         vmcs_write16(GUEST_TR_SELECTOR, 0);
1036
1037         /* guest LDTR */
1038         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1039         vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1040         vmcs_writel(GUEST_LDTR_BASE, 0);
1041         vmcs_writel(GUEST_LDTR_LIMIT, 0);
1042
1043         /* guest TSS */
1044         vmcs_writel(GUEST_TR_BASE, 0);
1045         vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1046         vmcs_writel(GUEST_TR_LIMIT, 0xff);
1047
1048         /* initialize sysenter */
1049         vmcs_write32(GUEST_SYSENTER_CS, 0);
1050         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1051         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1052
1053         /* other random initialization */
1054         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1055         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1056         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1057         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1058         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1059
1060         /* Initialize posted interrupt notification vector */
1061         vmcs_write16(POSTED_NOTIFICATION_VEC, I_VMMCP_POSTED);
1062 }
1063
1064 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1065                                             uint32_t msr) {
1066         int f = sizeof(unsigned long);
1067         /*
1068          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1069          * have the write-low and read-high bitmap offsets the wrong way round.
1070          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1071          */
1072         if (msr <= 0x1fff) {
1073                 __clear_bit(msr, msr_bitmap + 0x000 / f);       /* read-low */
1074                 __clear_bit(msr, msr_bitmap + 0x800 / f);       /* write-low */
1075         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1076                 msr &= 0x1fff;
1077                 __clear_bit(msr, msr_bitmap + 0x400 / f);       /* read-high */
1078                 __clear_bit(msr, msr_bitmap + 0xc00 / f);       /* write-high */
1079         }
1080 }
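/* Usage example: __vmx_disable_intercept_for_msr(msr_bitmap, MSR_LSTAR) clears
 * both the read and write intercept bits for LSTAR, so guest rdmsr/wrmsr of it
 * no longer exits; setup_msr() below does exactly this for each autoloaded MSR. */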
1081
1082 /* note the io_bitmap is big enough for the 64K port space. */
1083 static void __vmx_disable_intercept_for_io(unsigned long *io_bitmap,
1084                                            uint16_t port) {
1085         __clear_bit(port, io_bitmap);
1086 }
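/* Similarly, __vmx_disable_intercept_for_io(io_bitmap, 0x3f8) would let the
 * guest touch the COM1 data port without a vmexit (one bit per port, so one
 * call per port you want to expose).  Illustrative only. */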
1087
1088 static void vcpu_print_autoloads(struct vmx_vcpu *vcpu) {
1089         struct vmx_msr_entry *e;
1090         int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1091         printk("Host Autoloads:\n-------------------\n");
1092         for (int i = 0; i < sz; i++) {
1093                 e = &vcpu->msr_autoload.host[i];
1094                 printk("\tMSR 0x%08x: %p\n", e->index, e->value);
1095         }
1096         printk("Guest Autoloads:\n-------------------\n");
1097         for (int i = 0; i < sz; i++) {
1098                 e = &vcpu->msr_autoload.guest[i];
1099                 printk("\tMSR 0x%08x %p\n", e->index, e->value);
1100         }
1101 }
1102
1103 static void dumpmsrs(void) {
1104         int i;
1105         int set[] = {
1106                 MSR_LSTAR,
1107                 MSR_FS_BASE,
1108                 MSR_GS_BASE,
1109                 MSR_KERNEL_GS_BASE,
1110                 MSR_SFMASK,
1111                 MSR_IA32_PEBS_ENABLE
1112         };
1113         for (i = 0; i < ARRAY_SIZE(set); i++) {
1114                 printk("%p: %p\n", set[i], read_msr(set[i]));
1115         }
1116         printk("core id %d\n", core_id());
1117 }
1118
1119 /* emulated msr. For now, an msr value and a pointer to a helper that
1120  * performs the requested operation.
1121  */
1122 struct emmsr {
1123         uint32_t reg;
1124         char *name;
1125         int (*f) (struct vmx_vcpu * vcpu, struct emmsr *, uint32_t, uint32_t);
1126         bool written;
1127         uint32_t edx, eax;
1128 };
1129
1130 int emsr_miscenable(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1131                     uint32_t);
1132 int emsr_mustmatch(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1133                    uint32_t);
1134 int emsr_readonly(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1135                   uint32_t);
1136 int emsr_readzero(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1137                   uint32_t);
1138 int emsr_fakewrite(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1139                    uint32_t);
1140 int emsr_ok(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t, uint32_t);
1141
1142 int emsr_fake_apicbase(struct vmx_vcpu *vcpu, struct emmsr *msr,
1143                    uint32_t opcode, uint32_t qual);
1144
1145 struct emmsr emmsrs[] = {
1146         {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
1147         {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
1148         {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
1149         {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
1150         {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
1151         {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
1152         {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
1153         {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
1154          emsr_fakewrite},
1155         {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
1156          emsr_fakewrite},
1157         {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
1158          emsr_fakewrite},
1159         {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
1160          emsr_fakewrite},
1161         {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
1162          emsr_fakewrite},
1163         {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
1164          emsr_fakewrite},
1165         {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
1166         {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
1167         {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
1168         {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
1169         {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
1170         {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},
1171
1172         // grumble.
1173         {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
1174         {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
1175         // louder.
1176         {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
1177         // aaaaaahhhhhhhhhhhhhhhhhhhhh
1178         {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
1179         {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
1180         {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_ok},
1181         // unsafe.
1182         {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase},
1183
1184         // mostly harmless.
1185         {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
1186         {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},
1187
1188         // TBD
1189         {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite},
1190 };
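/* Extending this table is the whole mechanism: msrio() below walks emmsrs[]
 * and dispatches on the MSR number in tf_rcx.  To emulate another register you
 * would add a single line, e.g. a hypothetical entry like
 *      {MSR_IA32_PLATFORM_ID, "MSR_IA32_PLATFORM_ID", emsr_readonly},
 * which would allow reads but reject writes.  That entry is an example, not
 * something this driver currently installs. */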
1191
1192 static uint64_t set_low32(uint64_t hi, uint32_t lo)
1193 {
1194         return (hi & 0xffffffff00000000ULL) | lo;
1195 }
1196
1197 static uint64_t set_low16(uint64_t hi, uint16_t lo)
1198 {
1199         return (hi & 0xffffffffffff0000ULL) | lo;
1200 }
1201
1202 static uint64_t set_low8(uint64_t hi, uint8_t lo)
1203 {
1204         return (hi & 0xffffffffffffff00ULL) | lo;
1205 }
1206
1207 /* this may be the only register that needs special handling.
1208  * If there are others then we might want to extend the emmsr struct.
1209  */
1210 int emsr_miscenable(struct vmx_vcpu *vcpu, struct emmsr *msr,
1211                     uint32_t opcode, uint32_t qual) {
1212         uint32_t eax, edx;
1213         rdmsr(msr->reg, eax, edx);
1214         /* we just let them read the misc msr for now. */
1215         if (opcode == EXIT_REASON_MSR_READ) {
1216                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1217                 vcpu->regs.tf_rax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1218                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1219                 return 0;
1220         } else {
1221                 /* if they are writing what is already written, that's ok. */
1222                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1223                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1224                         return 0;
1225         }
1226         printk
1227                 ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
1228                  msr->name, (uint32_t) vcpu->regs.tf_rdx,
1229                  (uint32_t) vcpu->regs.tf_rax, edx, eax);
1230         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1231 }
1232
1233 int emsr_mustmatch(struct vmx_vcpu *vcpu, struct emmsr *msr,
1234                    uint32_t opcode, uint32_t qual) {
1235         uint32_t eax, edx;
1236         rdmsr(msr->reg, eax, edx);
1237         /* reads pass through; writes only succeed if they match the real value. */
1238         if (opcode == EXIT_REASON_MSR_READ) {
1239                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1240                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1241                 return 0;
1242         } else {
1243                 /* if they are writing what is already written, that's ok. */
1244                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1245                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1246                         return 0;
1247         }
1248         printk
1249                 ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
1250                  msr->name, (uint32_t) vcpu->regs.tf_rdx,
1251                  (uint32_t) vcpu->regs.tf_rax, edx, eax);
1252         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1253 }
1254
1255 int emsr_ok(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1256             uint32_t qual) {
1257         if (opcode == EXIT_REASON_MSR_READ) {
1258                 rdmsr(msr->reg, vcpu->regs.tf_rdx, vcpu->regs.tf_rax);
1259         } else {
1260                 uint64_t val =
1261                         (uint64_t) vcpu->regs.tf_rdx << 32 | vcpu->regs.tf_rax;
1262                 write_msr(msr->reg, val);
1263         }
1264         return 0;
1265 }
1266
1267 int emsr_readonly(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1268                   uint32_t qual) {
1269         uint32_t eax, edx;
1270         rdmsr((uint32_t) vcpu->regs.tf_rcx, eax, edx);
1271         /* reads are allowed; any write is an error. */
1272         if (opcode == EXIT_REASON_MSR_READ) {
1273                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1274                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1275                 return 0;
1276         }
1277
1278         printk("%s: Tried to write a readonly register\n", msr->name);
1279         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1280 }
1281
1282 int emsr_readzero(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1283                   uint32_t qual) {
1284         if (opcode == EXIT_REASON_MSR_READ) {
1285                 vcpu->regs.tf_rax = 0;
1286                 vcpu->regs.tf_rdx = 0;
1287                 return 0;
1288         }
1289
1290         printk("%s: Tried to write a readonly register\n", msr->name);
1291         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1292 }
1293
1294 /* pretend to write it, but don't write it. */
1295 int emsr_fakewrite(struct vmx_vcpu *vcpu, struct emmsr *msr,
1296                    uint32_t opcode, uint32_t qual) {
1297         uint32_t eax, edx;
1298         if (!msr->written) {
1299                 rdmsr(msr->reg, eax, edx);
1300         } else {
1301                 edx = msr->edx;
1302                 eax = msr->eax;
1303         }
1304         /* reads return the real value until a fake write has stored one. */
1305         if (opcode == EXIT_REASON_MSR_READ) {
1306                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1307                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1308                 return 0;
1309         } else {
1310                 /* if they are writing what is already written, that's ok. */
1311                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1312                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1313                         return 0;
1314                 msr->edx = vcpu->regs.tf_rdx;
1315                 msr->eax = vcpu->regs.tf_rax;
1316                 msr->written = true;
1317         }
1318         return 0;
1319 }
1320
1321 /* pretend to write it, but don't write it. */
1322 int emsr_fake_apicbase(struct vmx_vcpu *vcpu, struct emmsr *msr,
1323                    uint32_t opcode, uint32_t qual) {
1324         uint32_t eax, edx;
1325         if (!msr->written) {
1326                 //rdmsr(msr->reg, eax, edx);
1327                 /* TODO: tightly coupled to the addr in vmrunkernel.  We want this func
1328                  * to return the val that vmrunkernel put into the VMCS. */
1329                 eax = 0xfee00900;
1330                 edx = 0;
1331         } else {
1332                 edx = msr->edx;
1333                 eax = msr->eax;
1334         }
1335         /* reads return our canned value until a fake write has stored one. */
1336         if (opcode == EXIT_REASON_MSR_READ) {
1337                 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1338                 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1339                 return 0;
1340         } else {
1341                 /* if they are writing what is already written, that's ok. */
1342                 if (((uint32_t) vcpu->regs.tf_rax == eax)
1343                     && ((uint32_t) vcpu->regs.tf_rdx == edx))
1344                         return 0;
1345                 msr->edx = vcpu->regs.tf_rdx;
1346                 msr->eax = vcpu->regs.tf_rax;
1347                 msr->written = true;
1348         }
1349         return 0;
1350 }
1351
1352
1353 static int
1354 msrio(struct vmx_vcpu *vcpu, uint32_t opcode, uint32_t qual) {
1355         int i;
1356         for (i = 0; i < ARRAY_SIZE(emmsrs); i++) {
1357                 if (emmsrs[i].reg != vcpu->regs.tf_rcx)
1358                         continue;
1359                 return emmsrs[i].f(vcpu, &emmsrs[i], opcode, qual);
1360         }
1361         printk("msrio for 0x%lx failed\n", vcpu->regs.tf_rcx);
1362         return SHUTDOWN_UNHANDLED_EXIT_REASON;
1363 }
1364
1365 /* Notes on autoloading.  We can't autoload FS_BASE or GS_BASE, according to the
1366  * manual, but that's because they are automatically saved and restored when all
1367  * of the other architectural registers are saved and restored, such as cs, ds,
1368  * es, and other fun things. (See 24.4.1).  We need to make sure we don't
1369  * accidentally intercept them too, since they are magically autoloaded.
1370  *
1371  * We'll need to be careful of any MSR we neither autoload nor intercept
1372  * whenever we vmenter/vmexit, and we intercept by default.
1373  *
1374  * Other MSRs, such as MSR_IA32_PEBS_ENABLE, only work on certain
1375  * architectures. */
1376 static void setup_msr(struct vmx_vcpu *vcpu) {
1377         struct vmx_msr_entry *e;
1378         int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1379         int i;
1380
1381         static_assert((sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs)) <=
1382                       NR_AUTOLOAD_MSRS);
1383
1384         vcpu->msr_autoload.nr = sz;
1385
1386         /* Since PADDR(msr_bitmap) is non-zero, and the bitmap is all 0xff, we now
1387          * intercept all MSRs */
1388         vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1389
1390         vmcs_write64(IO_BITMAP_A, PADDR(io_bitmap));
1391         vmcs_write64(IO_BITMAP_B, PADDR((uintptr_t)io_bitmap +
1392                                         (VMX_IO_BITMAP_SZ / 2)));
1393
1394         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1395         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1396         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1397
1398         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1399         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1400         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1401
1402         for (i = 0; i < sz; i++) {
1403                 uint64_t val;
1404
1405                 e = &vcpu->msr_autoload.host[i];
1406                 e->index = autoloaded_msrs[i];
1407                 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1408                 rdmsrl(e->index, val);
1409                 e->value = val;
1410                 printk("host index %p val %p\n", e->index, e->value);
1411
1412                 e = &vcpu->msr_autoload.guest[i];
1413                 e->index = autoloaded_msrs[i];
1414                 e->value = 0xDEADBEEF;
1415                 printk("guest index %p val %p\n", e->index, e->value);
1416         }
1417 }
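
/* For reference, a minimal sketch of how one MSR's intercept bits can be
 * cleared in the 4 KB msr_bitmap set up above.  This is an illustrative,
 * hypothetical helper (the real one used in this file is
 * __vmx_disable_intercept_for_msr, defined elsewhere); it assumes the SDM
 * layout: 1 KB of read bits for MSRs 0x0-0x1fff, 1 KB of read bits for
 * 0xc0000000-0xc0001fff, then the two matching 1 KB write regions.  A clear
 * bit means the access no longer causes a vmexit. */
static inline void example_msr_intercept_off(uint8_t *bitmap, uint32_t msr)
{
        uint32_t byte;

        if (msr <= 0x1fff)
                byte = msr / 8;                         /* read-low region */
        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
                byte = 1024 + (msr - 0xc0000000) / 8;   /* read-high region */
        else
                return;                                 /* leave it intercepted */
        bitmap[byte] &= ~(1 << (msr & 7));              /* reads no longer exit */
        bitmap[byte + 2048] &= ~(1 << (msr & 7));       /* writes no longer exit */
}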
1418
1419 /**
1420  *  vmx_setup_vmcs - configures the vmcs with starting parameters
1421  */
1422 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu) {
1423         vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
1424         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1425
1426         /* Control */
1427         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1428                      vmcs_config.pin_based_exec_ctrl);
1429
1430         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1431                      vmcs_config.cpu_based_exec_ctrl);
1432
1433         if (cpu_has_secondary_exec_ctrls()) {
1434                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1435                              vmcs_config.cpu_based_2nd_exec_ctrl);
1436         }
1437
1438         vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));
1439
1440         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1441         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1442         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1443
1444         setup_msr(vcpu);
1445
1446         vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1447
1448         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1449         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1450
1451         vmcs_writel(CR0_GUEST_HOST_MASK, 0);    // ~0ul);
1452         vmcs_writel(CR4_GUEST_HOST_MASK, 0);    // ~0ul);
1453
1454         //kvm_write_tsc(&vmx->vcpu, 0);
1455         vmcs_writel(TSC_OFFSET, 0);
1456
1457         vmx_setup_constant_host_state();
1458 }
1459
1460 /**
1461  * vmx_create_vcpu - allocates and initializes a new virtual cpu
1462  *
1463  * Returns: A new VCPU structure
1464  */
1465 struct vmx_vcpu *vmx_create_vcpu(struct proc *p) {
1466         struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1467         if (!vcpu) {
1468                 return NULL;
1469         }
1470
1471         memset(vcpu, 0, sizeof(*vcpu));
1472
1473         vcpu->proc = p; /* uncounted (weak) reference */
1474         vcpu->vmcs = vmx_alloc_vmcs();
1475         printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1476         if (!vcpu->vmcs)
1477                 goto fail_vmcs;
1478
1479         vcpu->cpu = -1;
1480
1481         vmx_get_cpu(vcpu);
1482         vmx_setup_vmcs(vcpu);
1483         vmx_setup_initial_guest_state();
1484         vmx_put_cpu(vcpu);
1485
1486         return vcpu;
1487
1488 fail_vmcs:
1489         kfree(vcpu);
1490         return NULL;
1491 }
1492
1493 /**
1494  * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1495  * @vcpu: the VCPU to destroy
1496  */
1497 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu) {
1498         vmx_free_vmcs(vcpu->vmcs);
1499         kfree(vcpu);
1500 }
1501
1502 /**
1503  * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1504  *
1505  * In the contexts where this is used the vcpu pointer should never be NULL.
1506  */
1507 static inline struct vmx_vcpu *vmx_current_vcpu(void) {
1508         struct vmx_vcpu *vcpu = currentcpu->local_vcpu;
1509         if (!vcpu)
1510                 panic("Core has no vcpu!");
1511         return vcpu;
1512 }
1513
1514 /**
1515  * vmx_run_vcpu - launches the CPU into non-root mode
1516  * We ONLY support 64-bit guests.
1517  * @vcpu: the vmx instance to launch
1518  */
1519 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1520 {
1521         asm(
1522                 /* Store host registers */
1523                 "push %%rdx; push %%rbp;"
1524                 "push %%rcx \n\t" /* placeholder for guest rcx */
1525                 "push %%rcx \n\t"
1526                 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1527                 "je 1f \n\t"
1528                 "mov %%rsp, %c[host_rsp](%0) \n\t"
1529                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1530                 "1: \n\t"
1531                 /* Reload cr2 if changed */
1532                 "mov %c[cr2](%0), %%rax \n\t"
1533                 "mov %%cr2, %%rdx \n\t"
1534                 "cmp %%rax, %%rdx \n\t"
1535                 "je 2f \n\t"
1536                 "mov %%rax, %%cr2 \n\t"
1537                 "2: \n\t"
1538                 /* Check if vmlaunch or vmresume is needed */
1539                 "cmpl $0, %c[launched](%0) \n\t"
1540                 /* Load guest registers.  Don't clobber flags. */
1541                 "mov %c[rax](%0), %%rax \n\t"
1542                 "mov %c[rbx](%0), %%rbx \n\t"
1543                 "mov %c[rdx](%0), %%rdx \n\t"
1544                 "mov %c[rsi](%0), %%rsi \n\t"
1545                 "mov %c[rdi](%0), %%rdi \n\t"
1546                 "mov %c[rbp](%0), %%rbp \n\t"
1547                 "mov %c[r8](%0),  %%r8  \n\t"
1548                 "mov %c[r9](%0),  %%r9  \n\t"
1549                 "mov %c[r10](%0), %%r10 \n\t"
1550                 "mov %c[r11](%0), %%r11 \n\t"
1551                 "mov %c[r12](%0), %%r12 \n\t"
1552                 "mov %c[r13](%0), %%r13 \n\t"
1553                 "mov %c[r14](%0), %%r14 \n\t"
1554                 "mov %c[r15](%0), %%r15 \n\t"
1555                 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1556
1557                 /* Enter guest mode */
1558                 "jne .Llaunched \n\t"
1559                 ASM_VMX_VMLAUNCH "\n\t"
1560                 "jmp .Lkvm_vmx_return \n\t"
1561                 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1562                 ".Lkvm_vmx_return: "
1563                 /* Save guest registers, load host registers, keep flags */
1564                 "mov %0, %c[wordsize](%%rsp) \n\t"
1565                 "pop %0 \n\t"
1566                 "mov %%rax, %c[rax](%0) \n\t"
1567                 "mov %%rbx, %c[rbx](%0) \n\t"
1568                 "popq %c[rcx](%0) \n\t"
1569                 "mov %%rdx, %c[rdx](%0) \n\t"
1570                 "mov %%rsi, %c[rsi](%0) \n\t"
1571                 "mov %%rdi, %c[rdi](%0) \n\t"
1572                 "mov %%rbp, %c[rbp](%0) \n\t"
1573                 "mov %%r8,  %c[r8](%0) \n\t"
1574                 "mov %%r9,  %c[r9](%0) \n\t"
1575                 "mov %%r10, %c[r10](%0) \n\t"
1576                 "mov %%r11, %c[r11](%0) \n\t"
1577                 "mov %%r12, %c[r12](%0) \n\t"
1578                 "mov %%r13, %c[r13](%0) \n\t"
1579                 "mov %%r14, %c[r14](%0) \n\t"
1580                 "mov %%r15, %c[r15](%0) \n\t"
1581                 "mov %%rax, %%r10 \n\t"
1582                 "mov %%rdx, %%r11 \n\t"
1583
1584                 "mov %%cr2, %%rax   \n\t"
1585                 "mov %%rax, %c[cr2](%0) \n\t"
1586
1587                 "pop  %%rbp; pop  %%rdx \n\t"
1588                 "setbe %c[fail](%0) \n\t"
1589                 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1590                 "mov %%rax, %%ds \n\t"
1591                 "mov %%rax, %%es \n\t"
1592               : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1593                 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1594                 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1595                 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1596                 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1597                 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1598                 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1599                 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1600                 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1601                 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1602                 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1603                 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1604                 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1605                 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1606                 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1607                 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1608                 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1609                 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1610                 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1611                 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1612                 [wordsize]"i"(sizeof(unsigned long))
1613               : "cc", "memory"
1614                 , "rax", "rbx", "rdi", "rsi"
1615                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1616         );
1617
1618         if (vmcs_readl(GUEST_IDTR_BASE) != idtr){
1619                 printk("idt changed; old 0x%lx new 0x%lx\n", idtr, vmcs_read64(GUEST_IDTR_BASE));
1620                 idtr = vmcs_read64(GUEST_IDTR_BASE);
1621         }
1622         vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1623         vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1624         printd("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1625                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1626         /* FIXME: do we need to set up other flags? */
1627         // NO IDEA!
1628         vcpu->regs.tf_rflags = vmcs_readl(GUEST_RFLAGS); //& 0xFF) | X86_EFLAGS_IF | 0x2;
1629
1630         vcpu->regs.tf_cs = GD_UT;
1631         vcpu->regs.tf_ss = GD_UD;
1632
1633         vcpu->launched = 1;
1634
1635         if (vcpu->fail) {
1636                 printk("failure detected (err %x)\n",
1637                        vmcs_read32(VM_INSTRUCTION_ERROR));
1638                 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1639         }
1640
1641         return vmcs_read32(VM_EXIT_REASON);
1642
1643 #if 0
1644         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1645         vmx_complete_atomic_exit(vmx);
1646         vmx_recover_nmi_blocking(vmx);
1647         vmx_complete_interrupts(vmx);
1648 #endif
1649 }
1650
1651 static void vmx_step_instruction(void) {
1652         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1653                     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1654 }
1655
1656 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu, struct vmctl *v) {
1657         unsigned long gva, gpa;
1658         int exit_qual, ret = -1;
1659         page_t *page;
1660
1661         vmx_get_cpu(vcpu);
1662         exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1663         gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1664         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1665         v->gpa = gpa;
1666         v->gva = gva;
1667         v->exit_qual = exit_qual;
1668         vmx_put_cpu(vcpu);
1669
1670         int prot = 0;
1671         prot |= exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
1672         prot |= exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
1673         prot |= exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
1674         ret = handle_page_fault(current, gpa, prot);
1675
1676         // Some of these get fixed in the vmm; be less chatty now.
1677         if (0 && ret) {
1678                 printk("EPT page fault failure %d, GPA: %p, GVA: %p\n", ret, gpa,
1679                        gva);
1680                 vmx_dump_cpu(vcpu);
1681         }
1682
1683         /* We let the vmm handle the failure cases, so return the EPT
1684          * violation exit reason, not what handle_page_fault returned.
1685          */
1686         return EXIT_REASON_EPT_VIOLATION;
1687 }
1688
1689 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu) {
1690         unsigned int eax, ebx, ecx, edx;
1691
1692         eax = vcpu->regs.tf_rax;
1693         ecx = vcpu->regs.tf_rcx;
1694         cpuid(eax, ecx, &eax, &ebx, &ecx, &edx);
1695         vcpu->regs.tf_rax = eax;
1696         vcpu->regs.tf_rbx = ebx;
1697         vcpu->regs.tf_rcx = ecx;
1698         vcpu->regs.tf_rdx = edx;
1699 }
1700
1701 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu) {
1702         uint32_t intr_info;
1703
1704         vmx_get_cpu(vcpu);
1705         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1706         vmx_put_cpu(vcpu);
1707
1708         printk("vmx (vcpu %p): got an exception\n", vcpu);
1709         printk("vmx (vcpu %p): pid %d\n", vcpu, vcpu->proc->pid);
1710         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1711                 return 0;
1712         }
1713
1714         printk("unhandled nmi, intr_info %x\n", intr_info);
1715         return -EIO;
1716 }
1717
1718 static void vmx_hwapic_isr_update(struct vmctl *v, int isr)
1719 {
1720         uint16_t status;
1721         uint8_t old;
1722
1723         status = vmcs_read16(GUEST_INTR_STATUS);
1724         old = status >> 8;
1725         if (isr != old) {
1726                 status &= 0xff;
1727                 status |= isr << 8;
1728                 vmcs_write16(GUEST_INTR_STATUS, status);
1729         }
1730 }
1731
1732 static void vmx_set_rvi(int vector)
1733 {
1734         uint16_t status;
1735         uint8_t old;
1736
1737         status = vmcs_read16(GUEST_INTR_STATUS);
1738         printk("%s: Status is %04x\n", __func__, status);
1739         old = (uint8_t)status & 0xff;
1740         if ((uint8_t)vector != old) {
1741                 status &= ~0xff;
1742                 status |= (uint8_t)vector;
1743                 printk("%s: SET 0x%x\n", __func__, status);
1744
1745                 // Clear SVI
1746                 status &= 0xff;
1747                 vmcs_write16(GUEST_INTR_STATUS, status);
1748         }
1749         printk("%s: Status is %04x after RVI\n", __func__,
1750                         vmcs_read16(GUEST_INTR_STATUS));
1751 }
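
/* For clarity on the two helpers above: the 16-bit guest interrupt status
 * packs RVI (the requesting vector) in the low byte and SVI (the in-service
 * vector) in the high byte.  A hypothetical constructor, just to illustrate
 * that encoding: */
static inline uint16_t example_guest_intr_status(uint8_t rvi, uint8_t svi)
{
        return ((uint16_t)svi << 8) | rvi;
}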
1752
1753 /*
1754 static void vmx_set_posted_interrupt(int vector)
1755 {
1756         unsigned long *bit_vec;
1757         unsigned long *pir = vmcs_readl(POSTED_INTR_DESC_ADDR_HIGH);
1758         pir = pir << 32;
1759         pir |= vmcs_readl(POSTED_INTR_DESC_ADDR);
1760
1761         // Move to the correct location to set our bit.
1762         bit_vec = pir + vector/32;
1763         test_and_set_bit(vector%32, bit_vec);
1764
1765         // Set outstanding notification bit
1766         bit_vec = pir + 8;
1767         test_and_set_bit(0, bit_vec);
1768 }
1769
1770 */
1771
1772 /**
1773  * vmx_launch - the main loop for a VMX Dune process
1774  * @conf: the launch configuration
1775  */
1776 int vmx_launch(struct vmctl *v) {
1777         int ret;
1778         struct vmx_vcpu *vcpu;
1779         int errors = 0;
1780         int advance;
1781         int interrupting = 0;
1782         uintptr_t pir_kva, vapic_kva, apic_kva;
1783         uint64_t pir_physical, vapic_physical, apic_physical;
1784         struct proc * current_proc = current;
1785
1786         /* TODO: dirty hack til we have VMM contexts */
1787         vcpu = current->vmm.guest_pcores[0];
1788         if (!vcpu) {
1789                 printk("Failed to get a CPU!\n");
1790                 return -ENOMEM;
1791         }
1792
1793         /* We need to prep the host's autoload region for our current core.  Right
1794          * now, the only autoloaded MSR that varies at runtime (in this case per
1795          * core) is KERN_GS_BASE. */
1796         rdmsrl(MSR_KERNEL_GS_BASE, vcpu->msr_autoload.host[0].value);
1797         /* if cr3 is set, means 'set everything', else means 'start where you left off' */
1798         vmx_get_cpu(vcpu);
1799         switch(v->command) {
1800         case REG_ALL:
1801                 printd("REG_ALL\n");
1802                 // fallthrough
1803                 vcpu->regs = v->regs;
1804                 vmcs_writel(GUEST_RSP, v->regs.tf_rsp);
1805                 vmcs_writel(GUEST_RIP, v->regs.tf_rip);
1806                 break;
1807         case REG_RSP_RIP_CR3:
1808                 printd("REG_RSP_RIP_CR3\n");
1809                 vmcs_writel(GUEST_RSP, v->regs.tf_rsp);
1810                 vmcs_writel(GUEST_CR3, v->cr3);
1811
1812                 pir_kva = uva2kva(current_proc, (void *)v->pir);
1813                 pir_physical = (uint64_t)PADDR(pir_kva);
1814
1815                 vmcs_writel(POSTED_INTR_DESC_ADDR, pir_physical);
1816                 vmcs_writel(POSTED_INTR_DESC_ADDR_HIGH, pir_physical>>32);
1817                 printk("POSTED_INTR_DESC_ADDR_HIGH %ld\n", vmcs_readl(POSTED_INTR_DESC_ADDR_HIGH));
1818                 if (pir_physical & 0xfff) {
1819                         printk("Low order 12 bits of pir address are not 0, value: %p\n", pir_physical);
1820                 }
1821
1822                 vapic_kva = uva2kva(current_proc, (void *)v->vapic);
1823                 vapic_physical = (uint64_t)PADDR(vapic_kva);
1824
1825                 vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, vapic_physical);
1826                 vmcs_writel(VIRTUAL_APIC_PAGE_ADDR_HIGH, vapic_physical>>32);
1827                 if (vapic_physical & 0xfff) {
1828                         printk("Low order 12 bits of vapic address are not 0, value: %p\n", vapic_physical);
1829                 }
1830
1831                 printk("VAPIC PHYSICAL ADDRESS: %p\n", vapic_physical);
1832
1833                 apic_kva = uva2kva(current_proc, (void *)0xfee00000);
1834                 apic_physical = (uint64_t)PADDR(apic_kva);
1835
1836                 vmcs_writel(APIC_ACCESS_ADDR, apic_physical);
1837                 vmcs_writel(APIC_ACCESS_ADDR_HIGH, apic_physical>>32);
1838
1839                 // Clear the EOI exit bitmap(Gan)
1840                 vmcs_writel(EOI_EXIT_BITMAP0, 0);
1841                 vmcs_writel(EOI_EXIT_BITMAP0_HIGH, 0);
1842                 vmcs_writel(EOI_EXIT_BITMAP1, 0);
1843                 vmcs_writel(EOI_EXIT_BITMAP1_HIGH, 0);
1844                 vmcs_writel(EOI_EXIT_BITMAP2, 0);
1845                 vmcs_writel(EOI_EXIT_BITMAP2_HIGH, 0);
1846                 vmcs_writel(EOI_EXIT_BITMAP3, 0);
1847                 vmcs_writel(EOI_EXIT_BITMAP3_HIGH, 0);
1848
1849                 printk("v->apic %p v->pir %p\n", (void *)v->vapic, (void *)v->pir);
1850                 // fallthrough
1851         case REG_RIP:
1852                 printd("REG_RIP %p\n", v->regs.tf_rip);
1853                 vmcs_writel(GUEST_RIP, v->regs.tf_rip);
1854                 break;
1855         case RESUME:
1856                 /* If v->interrupt is non-zero, set it in the vmcs and
1857                  * zero it in the vmctl. Else set RIP.
1858                  * We used to check IF and such here but we'll let the VMM do it. If the VMM screws up
1859                  * we can always fix it. Note to people who know about security: could this be an issue?
1860                  * I don't see how: it will mainly just break your guest vm AFAICT.
1861                  */
1862                 if (v->interrupt) {
1863                         printk("Set VM_ENTRY_INTR_INFO_FIELD to 0x%x\n", v->interrupt);
1864                         //vmcs_writel(VM_ENTRY_INTR_INFO_FIELD, v->interrupt);
1865                         //vapic_status_dump_kernel((void *)v->vapic);
1866                         
1867                         //Not using this because we still need a VMExit.
1868                         //vmx_set_rvi(v->interrupt);
1869
1870                         //vapic_status_dump_kernel((void *)v->vapic);
1871                         v->interrupt = 0;
1872                         interrupting = 1;
1873                 }
1874                 printd("RESUME\n");
1875                 break;
1876         default: 
1877                 error(EINVAL, "Bad command in vmx_launch");
1878         }
1879         vcpu->shutdown = 0;
1880         vmx_put_cpu(vcpu);
1881         if (interrupting) {
1882                 printk("BEFORE INTERRUPT: ");
1883                 vmx_dump_cpu(vcpu);
1884         }
1885         vcpu->ret_code = -1;
1886
1887         while (1) {
1888                 advance = 0;
1889                 vmx_get_cpu(vcpu);
1890
1891                 // TODO: manage the fpu when we restart.
1892
1893                 // TODO: see if we need to exit before we go much further.
1894                 disable_irq();
1895                 //dumpmsrs();
1896                 ret = vmx_run_vcpu(vcpu);
1897                 
1898                 //dumpmsrs();
1899                 enable_irq();
1900                 v->intrinfo1 = vmcs_readl(GUEST_INTERRUPTIBILITY_INFO);
1901                 v->intrinfo2 = vmcs_readl(VM_EXIT_INTR_INFO);
1902                 vmx_put_cpu(vcpu);
1903
1904                 if (interrupting) {
1905                         printk("POST INTERRUPT: \n");
1906                         unsigned long cr8val;
1907                         asm volatile("mov %%cr8,%0" : "=r" (cr8val));
1908                         printk("CR8 Value: 0x%08lx\n", cr8val);
1909                         
1910                         printk("%s: Status is %04x\n", __func__,
1911                                         vmcs_read16(GUEST_INTR_STATUS));
1912                         vmx_dump_cpu(vcpu);
1913                 }
1914
1915                 if (ret == EXIT_REASON_VMCALL) {
1916                         if (current->vmm.flags & VMM_VMCALL_PRINTF) {
1917                                 uint8_t byte = vcpu->regs.tf_rdi;
1918                                 printd("System call\n");
1919 #ifdef DEBUG
1920                                 vmx_dump_cpu(vcpu);
1921 #endif
1922                                 advance = 3;
1923                                 printk("%c", byte);
1924                                 // adjust the RIP
1925                         } else {
1926                                 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1927 #ifdef DEBUG
1928                                 vmx_dump_cpu(vcpu);
1929                                 printd("system call! WTF\n");
1930 #endif
1931                         }
1932                 } else if (ret == EXIT_REASON_CR_ACCESS) {
1933                         show_cr_access(vmcs_read32(EXIT_QUALIFICATION));
1934                         vmx_dump_cpu(vcpu);
1935                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1936                 } else if (ret == EXIT_REASON_CPUID) {
1937                         vmx_handle_cpuid(vcpu);
1938                         vmx_get_cpu(vcpu);
1939                         vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + 2);
1940                         vmx_put_cpu(vcpu);
1941                 } else if (ret == EXIT_REASON_EPT_VIOLATION) {
1942                         if (vmx_handle_ept_violation(vcpu, v))
1943                                 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1944                 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1945                         if (vmx_handle_nmi_exception(vcpu))
1946                                 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1947                 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1948                         printk("External interrupt\n");
1949                         vmx_dump_cpu(vcpu);
1950                         printk("GUEST_INTERRUPTIBILITY_INFO: 0x%08x,",  v->intrinfo1);
1951                         printk("VM_EXIT_INTR_INFO 0x%08x,", v->intrinfo2);
1952                         printk("rflags 0x%lx\n", vcpu->regs.tf_rflags);
1953                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1954                 } else if (ret == EXIT_REASON_MSR_READ) {
1955                         printd("msr read\n");
1956                         vmx_dump_cpu(vcpu);
1957                         vcpu->shutdown =
1958                                 msrio(vcpu, ret, vmcs_read32(EXIT_QUALIFICATION));
1959                         advance = 2;
1960                 } else if (ret == EXIT_REASON_MSR_WRITE) {
1961                         printd("msr write\n");
1962                         vmx_dump_cpu(vcpu);
1963                         vcpu->shutdown =
1964                                 msrio(vcpu, ret, vmcs_read32(EXIT_QUALIFICATION));
1965                         advance = 2;
1966                 } else if (ret == EXIT_REASON_IO_INSTRUCTION) {
1967                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1968                 } else if (ret == EXIT_REASON_APIC_WRITE) {
1969                         printk("BEGIN APIC WRITE EXIT DUMP\n");
1970                         vmx_dump_cpu(vcpu);
1971                         printk("END APIC WRITE EXIT DUMP\n");
1972                 //} else if (ret == EXIT_REASON_APIC_ACCESS) {
1973                         //vmx_dump_cpu(vcpu);
1974                 } else {
1975                         printk("unhandled exit: reason 0x%x, exit qualification 0x%x\n",
1976                                ret, vmcs_read32(EXIT_QUALIFICATION));
1977                         if (ret & 0x80000000) {
1978                                 printk("entry failed.\n");
1979                                 vmx_dump_cpu(vcpu);
1980                         }
1981                         vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1982                 }
1983
1984                 interrupting = 0;
1985                 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1986                  * similar to how proc_restartcore/smp_idle only restart the pcpui
1987                  * cur_ctx, we need to do the same, via the VMCS resume business. */
1988                 if (vcpu->shutdown)
1989                         break;
1990
1991                 if (advance) {
1992                         vmx_get_cpu(vcpu);
1993                         vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + advance);
1994                         vmx_put_cpu(vcpu);
1995                 }
1996         }
1997
1998         printd("RETURN. ip %016lx sp %016lx, shutdown 0x%x ret 0x%x\n",
1999                vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->shutdown, ret);
2000         v->regs = vcpu->regs;
2001         v->shutdown = vcpu->shutdown;
2002         v->ret_code = ret;
2003 //  hexdump((void *)vcpu->regs.tf_rsp, 128 * 8);
2004         /*
2005          * Return both the reason for the shutdown and a status value.
2006          * The exit() and exit_group() system calls only need 8 bits for
2007          * the status but we allow 16 bits in case we might want to
2008          * return more information for one of the other shutdown reasons.
2009          */
2010         ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
2011
2012         return ret;
2013 }
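
/* To illustrate the encoding above (hypothetical helper, not used anywhere in
 * this file): a caller can recover the shutdown reason and the 16-bit status
 * from the value vmx_launch() returns like so. */
static inline void example_unpack_launch_ret(int ret, int *shutdown, int *status)
{
        *shutdown = (ret >> 16) & 0xffff;       /* SHUTDOWN_* reason */
        *status = ret & 0xffff;                 /* low 16 bits of ret_code */
}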
2014
2015 /**
2016  * __vmx_enable - low-level enable of VMX mode on the current CPU
2017  * @vmxon_buf: an opaque buffer for use as the VMXON region
2018  */
2019 static int __vmx_enable(struct vmcs *vmxon_buf) {
2020         uint64_t phys_addr = PADDR(vmxon_buf);
2021         uint64_t old, test_bits;
2022
2023         if (rcr4() & X86_CR4_VMXE) {
2024                 panic("Should never have this happen");
2025                 return -EBUSY;
2026         }
2027
2028         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2029
2030         test_bits = FEATURE_CONTROL_LOCKED;
2031         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2032
2033         if (0)  // tboot_enabled())
2034                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2035
2036         if ((old & test_bits) != test_bits) {
2037                 /* If it's locked, then trying to set it will cause a GPF.
2038                  * No Dune for you!
2039                  */
2040                 if (old & FEATURE_CONTROL_LOCKED) {
2041                         printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
2042                         return -1;
2043                 }
2044
2045                 /* enable and lock */
2046                 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2047         }
2048         lcr4(rcr4() | X86_CR4_VMXE);
2049
2050         __vmxon(phys_addr);
2051         vpid_sync_vcpu_global();        /* good idea, even if we aren't using vpids */
2052         ept_sync_global();
2053
2054         return 0;
2055 }
2056
2057 /**
2058  * vmx_enable - enables VMX mode on the current CPU
2059  * @unused: not used (required for on_each_cpu())
2060  *
2061  * Sets up necessary state for enable (e.g. a scratchpad for VMXON.)
2062  */
2063 static void vmx_enable(void) {
2064         struct vmcs *vmxon_buf = currentcpu->vmxarea;
2065         int ret;
2066
2067         ret = __vmx_enable(vmxon_buf);
2068         if (ret)
2069                 goto failed;
2070
2071         currentcpu->vmx_enabled = 1;
2072         // TODO: do we need this?
2073         store_gdt(&currentcpu->host_gdt);
2074
2075         printk("VMX enabled on CPU %d\n", core_id());
2076         return;
2077
2078 failed:
2079         printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
2080 }
2081
2082 /**
2083  * vmx_disable - disables VMX mode on the current CPU
2084  */
2085 static void vmx_disable(void *unused) {
2086         if (currentcpu->vmx_enabled) {
2087                 __vmxoff();
2088                 lcr4(rcr4() & ~X86_CR4_VMXE);
2089                 currentcpu->vmx_enabled = 0;
2090         }
2091 }
2092
2093 /* Probe the cpus to see which ones can do vmx.
2094  * Return FALSE if the cpu does not support VT-x, TRUE if it does.
2095  */
2096 static bool probe_cpu_vmx(void) {
2097         /* The best way to test this code is:
2098          * wrmsr -p <cpu> 0x3a 1
2099          * This will lock vmx off; then modprobe dune.
2100          * Frequently, however, systems have all 0x3a registers set to 5,
2101          * meaning testing is impossible, as vmx can not be disabled.
2102          * We have to simulate it being unavailable in most cases.
2103          * The 'test' variable provides an easy way to simulate
2104          * unavailability of vmx on some, none, or all cpus.
2105          */
2106         if (!cpu_has_vmx()) {
2107                 printk("Machine does not support VT-x\n");
2108                 return FALSE;
2109         } else {
2110                 printk("Machine supports VT-x\n");
2111                 return TRUE;
2112         }
2113 }
2114
2115 static void setup_vmxarea(void) {
2116         struct vmcs *vmxon_buf;
2117         printd("Set up vmxarea for cpu %d\n", core_id());
2118         vmxon_buf = __vmx_alloc_vmcs(core_id());
2119         if (!vmxon_buf) {
2120                 printk("setup_vmxarea failed on node %d\n", core_id());
2121                 return;
2122         }
2123         currentcpu->vmxarea = vmxon_buf;
2124 }
2125
2126 static int ept_init(void) {
2127         if (!cpu_has_vmx_ept()) {
2128                 printk("VMX doesn't support EPT!\n");
2129                 return -1;
2130         }
2131         if (!cpu_has_vmx_eptp_writeback()) {
2132                 printk("VMX EPT doesn't support WB memory!\n");
2133                 return -1;
2134         }
2135         if (!cpu_has_vmx_ept_4levels()) {
2136                 printk("VMX EPT doesn't support 4 level walks!\n");
2137                 return -1;
2138         }
2139         switch (arch_max_jumbo_page_shift()) {
2140         case PML3_SHIFT:
2141                 if (!cpu_has_vmx_ept_1g_page()) {
2142                         printk("VMX EPT doesn't support 1 GB pages!\n");
2143                         return -1;
2144                 }
2145                 break;
2146         case PML2_SHIFT:
2147                 if (!cpu_has_vmx_ept_2m_page()) {
2148                         printk("VMX EPT doesn't support 2 MB pages!\n");
2149                         return -1;
2150                 }
2151                 break;
2152         default:
2153                 printk("Unexpected jumbo page size %d\n",
2154                        arch_max_jumbo_page_shift());
2155                 return -1;
2156         }
2157         if (!cpu_has_vmx_ept_ad_bits()) {
2158                 printk("VMX EPT doesn't support accessed/dirty!\n");
2159                 x86_ept_pte_fix_ups |= EPTE_A | EPTE_D;
2160         }
2161         if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
2162                 printk("VMX EPT can't invalidate PTEs/TLBs!\n");
2163                 return -1;
2164         }
2165
2166         return 0;
2167 }
2168
2169 /**
2170  * intel_vmm_init sets up the physical core data areas required to run a VM at all.
2171  * These data areas are not connected to a specific user process in any way. Instead,
2172  * they externalize what would otherwise be a very large ball of state kept inside
2173  * the CPU.
2174  */
2175 int intel_vmm_init(void) {
2176         int r, cpu, ret;
2177
2178         if (!probe_cpu_vmx()) {
2179                 return -EOPNOTSUPP;
2180         }
2181
2182         setup_vmcs_config(&ret);
2183
2184         if (ret) {
2185                 printk("setup_vmcs_config failed: %d\n", ret);
2186                 return ret;
2187         }
2188
2189         msr_bitmap = (unsigned long *)kpage_zalloc_addr();
2190         if (!msr_bitmap) {
2191                 printk("Could not allocate msr_bitmap\n");
2192                 return -ENOMEM;
2193         }
2194         io_bitmap = (unsigned long *)get_cont_pages(VMX_IO_BITMAP_ORDER,
2195                                                     KMALLOC_WAIT);
2196         if (!io_bitmap) {
2197                 printk("Could not allocate io_bitmap\n");
2198                 kfree(msr_bitmap);
2199                 return -ENOMEM;
2200         }
2201         /* FIXME: do we need APIC virtualization (flexpriority?) */
2202
2203         memset(msr_bitmap, 0xff, PAGE_SIZE);
2204         memset(io_bitmap, 0xff, VMX_IO_BITMAP_SZ);
2205
2206         /* These are the only MSRs that are not autoloaded and not intercepted */
2207         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
2208         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
2209         __vmx_disable_intercept_for_msr(msr_bitmap, MSR_EFER);
2210
2211         /* TODO: this might be dangerous, since they can do more than just read the
2212          * CMOS */
2213         __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_IDX);
2214         __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_DATA);
2215
2216         if ((ret = ept_init())) {
2217                 printk("EPT init failed, %d\n", ret);
2218                 return ret;
2219         }
2220         printk("VMX setup succeeded\n");
2221         return 0;
2222 }
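
/* Similarly to the MSR bitmap, a minimal sketch of clearing one I/O port's
 * intercept bit in the bitmaps allocated above.  The real helper is
 * __vmx_disable_intercept_for_io (defined elsewhere); this hypothetical
 * version assumes the SDM layout, with bitmap A (the first 4 KB page)
 * covering ports 0x0000-0x7fff and bitmap B (the second page) covering
 * 0x8000-0xffff, one bit per port, where a clear bit means no vmexit. */
static inline void example_io_intercept_off(uint8_t *io_bitmap, uint16_t port)
{
        uint8_t *map = io_bitmap + (port < 0x8000 ? 0 : 4096);
        uint16_t p = port & 0x7fff;

        map[p / 8] &= ~(1 << (p % 8));
}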
2223
2224 int intel_vmm_pcpu_init(void) {
2225         setup_vmxarea();
2226         vmx_enable();
2227         return 0;
2228 }
2229
2230
2231 void vapic_status_dump_kernel(void *vapic)
2232 {
2233         uint32_t *p = (uint32_t *)vapic;
2234         int i;
2235         printk("-- BEGIN KERNEL APIC STATUS DUMP --\n");
2236         for (i = 0x100/sizeof(*p); i < 0x180/sizeof(*p); i+=4) {
2237                 printk("VISR : 0x%x: 0x%08x\n", i, p[i]);
2238         }
2239         for (i = 0x200/sizeof(*p); i < 0x280/sizeof(*p); i+=4) {
2240                 printk("VIRR : 0x%x: 0x%08x\n", i, p[i]);
2241         }
2242         i = 0x0B0/sizeof(*p);
2243         printk("EOI FIELD : 0x%x, 0x%08x\n", i, p[i]);
2244
2245         printk("-- END KERNEL APIC STATUS DUMP --\n");
2246 }