3 * vmx.c - The Intel VT-x driver for Dune
5 * This file is derived from Linux KVM VT-x support.
6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
13 * This modified version is simpler because it avoids the following
14 * features that are not requirements for Dune:
15 * * Real-mode emulation
16 * * Nested VT-x support
17 * * I/O hardware emulation
18 * * Any of the more esoteric X86 features and registers
19 * * KVM-specific functionality
21 * In essence we provide only the minimum functionality needed to run
22 * a process in vmx non-root mode rather than the full hardware emulation
23 * needed to support an entire OS.
25 * This driver is a research prototype and as such has the following
28 * FIXME: Backward compatibility is currently a non-goal, and only recent
29 * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
32 * FIXME: Eventually we should handle concurrent users of VT-x more
33 * gracefully instead of requiring exclusive access. This would allow
34 * Dune to interoperate with KVM and other HV solutions.
36 * FIXME: We need to support hotplugged physical CPUs.
39 * Adam Belay <abelay@stanford.edu>
43 * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
44 * You're left with the feeling that they got part way through and realized they had to have one for
46 * 1) your CPU is going to be capable of running VMs, and you need state for that.
48 * 2) you're about to start a guest, and you need state for that.
50 * So there is "get the cpu set up to be able to run VMs" stuff, and
51 * "now let's start a guest" stuff. In Akaros, CPUs will always be set up
52 * to run a VM if that is possible. Processes can flip themselves into
53 * a VM and that will require another VMCS.
55 * So: at kernel startup time, the SMP boot stuff calls
56 * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
57 * in the case of this file is intel_vmm_init. That does some code
58 * that sets up stuff for ALL sockets, based on the capabilities of
59 * the socket it runs on. If any cpu supports vmx, it assumes they all
60 * do. That's a realistic assumption. So the call_function_all is kind
61 * of stupid, really; it could just see what's on the current cpu and
62 * assume it's on all. HOWEVER: there are systems in the wild that
63 * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
64 * might as well allow for the chance that we'll only run VMMCPs on a
65 * subset (not implemented yet however). So: probe all CPUs, get a
66 * count of how many support VMX and, for now, assume they all do
69 * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
70 * which contains all the naughty bits settings for all the cpus that can run a VM.
71 * Realistically, all VMX-capable cpus in a system will have identical configurations.
72 * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
74 * configure the msr_bitmap. This is the bitmap of MSRs which the
75 * guest can manipulate. Currently, we only allow GS and FS base.
77 * Reserve bit 0 in the vpid bitmap as guests can not use that
79 * Set up what we call the vmxarea. The vmxarea is per-cpu, not
80 * per-guest. Once set up, it is left alone. The ONLY thing we set in
81 * there is the revision id. The vmxarea is page-sized per cpu and
82 * page-aligned. Note that it can be smaller, but why bother? We know
83 * the max size and alignment, and it's convenient.
85 * Now that it is set up, enable vmx on all cpus. This involves
86 * testing VMXE in cr4, to see if we've been here before (TODO: delete
87 * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
88 * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
89 * instruction), and syncing vpids and epts. Now the CPU is ready
93 * We divide this into two things: vmm_proc_init and vm_run.
94 * Currently, on Intel, vmm_proc_init does nothing.
96 * vm_run is really complicated. It is called with a coreid, rip, rsp,
97 * cr3, and flags. On intel, it calls vmx_launch. vmx_launch is set
98 * up for a few test cases. If rip is 1, it sets the guest rip to
99 * a function which will deref 0 and should exit with failure 2. If rip is 0,
100 * it calls an infinite loop in the guest.
102 * The sequence of operations:
106 * disable irqs (required or you can't enter the VM)
113 * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
114 * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
116 * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
117 * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
118 * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
119 * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
121 * vmx_run_vcpu this one gets messy, mainly because it's a giant wad
122 * of inline assembly with embedded CPP crap. I suspect we'll want to
123 * un-inline it someday, but maybe not. It's called with a vcpu
124 * struct from which it loads guest state, and to which it stores
125 * non-virtualized host state. It issues a vmlaunch or vmresume
126 * instruction depending, and on return, it evaluates whether the
127 * launch/resume had an error in that operation. Note this is NOT the
128 * same as an error while in the virtual machine; this is an error in
129 * startup due to misconfiguration. Depending on what is returned, it's
130 * either a failed vm startup or an exit for one of many reasons.
134 /* basically: only rename those globals that might conflict
135 * with existing names. Leave all else the same.
136 * this code is more modern than the other code, yet still
137 * well encapsulated, it seems.
145 #include <sys/queue.h>
153 #include <arch/types.h>
161 #include "cpufeature.h"
163 #define currentcpu (&per_cpu_info[core_id()])
165 static unsigned long *msr_bitmap;
166 #define VMX_IO_BITMAP_ORDER 4 /* 64 KB */
167 #define VMX_IO_BITMAP_SZ (1 << (VMX_IO_BITMAP_ORDER + PGSHIFT))
168 static unsigned long *io_bitmap;
170 int x86_ept_pte_fix_ups = 0;
172 struct vmx_capability vmx_capability;
173 struct vmcs_config vmcs_config;
175 static int autoloaded_msrs[] = {
182 static char *cr_access_type[] = {
189 static char *cr_gpr[] = {
190 "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
191 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
194 static int guest_cr_num[16] = {
204 -1, -1, -1, -1, -1, -1, -1
207 __always_inline unsigned long vmcs_readl(unsigned long field);
208 /* See section 24-3 of The Good Book */
210 show_cr_access(uint64_t val)
212 int crnr = val & 0xf;
213 int type = (val >> 4) & 3;
214 int reg = (val >> 11) & 0xf;
215 printk("%s: %d: ", cr_access_type[type], crnr);
217 printk("%s", cr_gpr[reg]);
218 if (guest_cr_num[crnr] > -1) {
219 printk(": 0x%x", vmcs_readl(guest_cr_num[crnr]));
226 ept_flush(uint64_t eptp)
228 ept_sync_context(eptp);
232 vmcs_clear(struct vmcs *vmcs)
234 uint64_t phys_addr = PADDR(vmcs);
237 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0":"=qm"(error):"a"(&phys_addr),
241 printk("vmclear fail: %p/%llx\n", vmcs, phys_addr);
245 vmcs_load(struct vmcs *vmcs)
247 uint64_t phys_addr = PADDR(vmcs);
250 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0":"=qm"(error):"a"(&phys_addr),
254 printk("vmptrld %p/%llx failed\n", vmcs, phys_addr);
257 /* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */
259 vmcs_get_current(void)
261 physaddr_t vmcs_paddr;
262 /* RAX contains the addr of the location to store the VMCS pointer. The
263 * compiler doesn't know the ASM will deref that pointer, hence the =m */
264 asm volatile (ASM_VMX_VMPTRST_RAX:"=m"(vmcs_paddr):"a"(&vmcs_paddr));
268 __always_inline unsigned long
269 vmcs_readl(unsigned long field)
273 asm volatile (ASM_VMX_VMREAD_RDX_RAX:"=a"(value):"d"(field):"cc");
277 __always_inline uint16_t
278 vmcs_read16(unsigned long field)
280 return vmcs_readl(field);
283 static __always_inline uint32_t
284 vmcs_read32(unsigned long field)
286 return vmcs_readl(field);
289 static __always_inline uint64_t
290 vmcs_read64(unsigned long field)
292 return vmcs_readl(field);
296 vmwrite_error(unsigned long field, unsigned long value)
298 printk("vmwrite error: reg %lx value %lx (err %d)\n",
299 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
303 vmcs_writel(unsigned long field, unsigned long value)
307 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0":"=q"(error):"a"(value),
310 vmwrite_error(field, value);
314 vmcs_write16(unsigned long field, uint16_t value)
316 vmcs_writel(field, value);
320 vmcs_write32(unsigned long field, uint32_t value)
322 vmcs_writel(field, value);
326 vmcs_write64(unsigned long field, uint64_t value)
328 vmcs_writel(field, value);
332 * A note on Things You Can't Make Up.
334 * "George, you can type this shit, but you can't say it" -- Harrison Ford
336 * There are 5 VMCS 32-bit words that control guest permissions. If
337 * you set these correctly, you've got a guest that will behave. If
338 * you get even one bit wrong, you've got a guest that will chew your
339 * leg off. Some bits must be 1, some must be 0, and some can be set
340 * either way. To add to the fun, the docs are sort of a docudrama or,
341 * as the quote goes, "interesting if true."
343 * To determine what bit can be set in what VMCS 32-bit control word,
344 * there are 5 corresponding 64-bit MSRs. And, to make it even more
345 * fun, the standard set of MSRs have errors in them, i.e. report
346 * incorrect values, for legacy reasons, and so you are supposed to
347 * "look around" to another set, which have correct bits in
348 * them. There are four such 'correct' registers, and they have _TRUE_
349 * in the names as you can see below. We test for the value of VMCS
350 * control bits in the _TRUE_ registers if possible. The fifth
351 * register, CPU Secondary Exec Controls, which came later, needs no
354 * For each MSR, the high 32 bits tell you what bits can be "1" by a
355 * "1" in that position; the low 32 bits tell you what bit can be "0"
356 * by a "0" in that position. So, for each of 32 bits in a given VMCS
357 * control word, there is a pair of bits in an MSR that tells you what
358 * values it can take. The two bits, of which there are *four*
359 * combinations, describe the *three* possible constraints on a
360 * bit. Taken together, they form an untruth table. The three
361 * possibilities: the VMCS bit can be set to 0 or 1, or it can
362 * only be 0, or only 1. The fourth combination is not supposed to
365 * So: there is the 1 bit from the upper 32 bits of the msr.
366 * If this bit is set, then the bit can be 1. If clear, it can not be 1.
368 * Then there is the 0 bit, from low 32 bits. If clear, the VMCS bit
369 * can be 0. If 1, the VMCS bit can not be 0.
371 * SO, let's call the 1 bit R1, and the 0 bit R0, we have:
374 * 1 0 -> can be 1, can be 0
375 * 0 1 -> can not be 1, can not be 0. --> JACKPOT! Not seen yet.
376 * 1 1 -> must be one.
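 *
 * A tiny worked example (these MSR halves are made up, not read from real
 * hardware): suppose rdmsr gives low = 0x00000016 and high = 0x0000401e.
 *
 *   reserved_1      = low & high                  = 0x00000016  (must be 1)
 *   reserved_0      = ~low & ~high                = 0xffffbfe1  (must be 0)
 *   changeable_bits = ~(reserved_0 | reserved_1)  = 0x00004008  (driver's choice)
 *
 * check_vmxec_controls below does exactly this arithmetic, then demands that
 * set_to_0/set_to_1 cover every changeable bit and that
 * set_to_0 | set_to_1 | reserved_0 | reserved_1 == 0xffffffff.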
378 * It's also pretty hard to know what you can and can't set, and
379 * that's led to inadvertent opening of permissions at times. Because
380 * of this complexity we've decided on the following: the driver must
381 * define EVERY bit, UNIQUELY, for each of the 5 registers, that it wants
382 * set. Further, for any bit that's settable, the driver must specify
383 * a setting; for any bit that's reserved, the driver settings must
384 * match that bit. If there are reserved bits we don't specify, that's
385 * ok; we'll take them as is.
387 * We use a set-means-set, and set-means-clear model, i.e. we use a
388 * 32-bit word to contain the bits we want to be 1, indicated by one;
389 * and another 32-bit word in which a bit we want to be 0 is indicated
390 * by a 1. This allows us to easily create masks of all bits we're
391 * going to set, for example.
393 * We have two 32-bit numbers for each 32-bit VMCS field: bits we want
394 * set and bits we want clear. If you read the MSR for that field,
395 * compute the reserved 0 and 1 settings, and | them together, they
396 * need to result in 0xffffffff. You can see that we can create other
397 * tests for conflicts (i.e. overlap).
399 * At this point, I've tested check_vmxec_controls in every way
400 * possible, because I kept screwing the bitfields up. You'll get a nice
401 * error and it won't work at all, which is what we want: a
402 * failure-prone setup, where even errors that might result in correct
403 * values are caught -- "right answer, wrong method, zero credit." If there's
404 * weirdness in the bits, we don't want to run.
408 check_vmxec_controls(struct vmxec const *v, bool have_true_msr,
412 uint32_t vmx_msr_low, vmx_msr_high;
413 uint32_t reserved_0, reserved_1, changeable_bits;
416 rdmsr(v->truemsr, vmx_msr_low, vmx_msr_high);
418 rdmsr(v->msr, vmx_msr_low, vmx_msr_high);
420 if (vmx_msr_low & ~vmx_msr_high)
421 warn("JACKPOT: Conflicting VMX ec ctls for %s, high 0x%08x low 0x%08x",
422 v->name, vmx_msr_high, vmx_msr_low);
424 reserved_0 = (~vmx_msr_low) & (~vmx_msr_high);
425 reserved_1 = vmx_msr_low & vmx_msr_high;
426 changeable_bits = ~(reserved_0 | reserved_1);
429 * this is very much as follows:
430 * accept the things I cannot change,
431 * change the things I can,
432 * know the difference.
435 /* Conflict. Don't try to both set and reset bits. */
436 if (v->set_to_0 & v->set_to_1) {
437 printk("%s: set to 0 (0x%x) and set to 1 (0x%x) overlap: 0x%x\n",
438 v->name, v->set_to_0, v->set_to_1, v->set_to_0 & v->set_to_1);
443 if (((v->set_to_0 | v->set_to_1) & changeable_bits) != changeable_bits) {
444 printk("%s: Need to cover 0x%x and have 0x%x,0x%x\n",
445 v->name, changeable_bits, v->set_to_0, v->set_to_1);
449 if ((v->set_to_0 | v->set_to_1 | reserved_0 | reserved_1) != 0xffffffff) {
450 printk("%s: incomplete coverage: have 0x%x, want 0x%x\n",
451 v->name, v->set_to_0 | v->set_to_1 |
452 reserved_0 | reserved_1, 0xffffffff);
456 /* Don't try to change bits that can't be changed. */
457 if ((v->set_to_0 & (reserved_0 | changeable_bits)) != v->set_to_0) {
458 printk("%s: set to 0 (0x%x) can't be done\n", v->name, v->set_to_0);
462 if ((v->set_to_1 & (reserved_1 | changeable_bits)) != v->set_to_1) {
463 printk("%s: set to 1 (0x%x) can't be done\n", v->name, v->set_to_1);
467 /* If there's been any error at all, spill our guts and return. */
469 printk("%s: vmx_msr_high 0x%x, vmx_msr_low 0x%x, ",
470 v->name, vmx_msr_high, vmx_msr_low);
471 printk("set_to_1 0x%x,set_to_0 0x%x,reserved_1 0x%x",
472 v->set_to_1, v->set_to_0, reserved_1);
473 printk(" reserved_0 0x%x", reserved_0);
474 printk(" changeable_bits 0x%x\n", changeable_bits);
478 *result = v->set_to_1 | reserved_1;
480 printd("%s: check_vmxec_controls succeeds with result 0x%x\n",
486 * We're trying to make this as readable as possible. Realistically, it will
487 * rarely if ever change, if the past is any guide.
489 static const struct vmxec pbec = {
490 .name = "Pin Based Execution Controls",
491 .msr = MSR_IA32_VMX_PINBASED_CTLS,
492 .truemsr = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
494 .set_to_1 = (PIN_BASED_EXT_INTR_MASK |
495 PIN_BASED_NMI_EXITING |
496 PIN_BASED_VIRTUAL_NMIS),
498 .set_to_0 = (PIN_BASED_VMX_PREEMPTION_TIMER |
499 PIN_BASED_POSTED_INTR),
502 static const struct vmxec cbec = {
503 .name = "CPU Based Execution Controls",
504 .msr = MSR_IA32_VMX_PROCBASED_CTLS,
505 .truemsr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
507 .set_to_1 = (CPU_BASED_HLT_EXITING |
508 CPU_BASED_MWAIT_EXITING |
509 CPU_BASED_RDPMC_EXITING |
510 CPU_BASED_CR8_LOAD_EXITING |
511 CPU_BASED_CR8_STORE_EXITING |
512 CPU_BASED_USE_MSR_BITMAPS |
513 CPU_BASED_MONITOR_EXITING |
514 CPU_BASED_USE_IO_BITMAPS |
515 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS),
517 .set_to_0 = (CPU_BASED_VIRTUAL_INTR_PENDING |
518 CPU_BASED_INVLPG_EXITING |
519 CPU_BASED_USE_TSC_OFFSETING |
520 CPU_BASED_RDTSC_EXITING |
521 CPU_BASED_CR3_LOAD_EXITING |
522 CPU_BASED_CR3_STORE_EXITING |
523 CPU_BASED_TPR_SHADOW |
524 CPU_BASED_MOV_DR_EXITING |
525 CPU_BASED_VIRTUAL_NMI_PENDING |
526 CPU_BASED_MONITOR_TRAP |
527 CPU_BASED_PAUSE_EXITING |
528 CPU_BASED_UNCOND_IO_EXITING),
531 static const struct vmxec cb2ec = {
532 .name = "CPU Based 2nd Execution Controls",
533 .msr = MSR_IA32_VMX_PROCBASED_CTLS2,
534 .truemsr = MSR_IA32_VMX_PROCBASED_CTLS2,
536 .set_to_1 = (SECONDARY_EXEC_ENABLE_EPT |
537 SECONDARY_EXEC_WBINVD_EXITING),
539 .set_to_0 = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
540 SECONDARY_EXEC_DESCRIPTOR_EXITING |
541 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
542 SECONDARY_EXEC_ENABLE_VPID |
543 SECONDARY_EXEC_UNRESTRICTED_GUEST |
544 SECONDARY_EXEC_APIC_REGISTER_VIRT |
545 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
546 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
547 SECONDARY_EXEC_RDRAND_EXITING |
548 SECONDARY_EXEC_ENABLE_INVPCID |
549 SECONDARY_EXEC_ENABLE_VMFUNC |
550 SECONDARY_EXEC_SHADOW_VMCS |
551 SECONDARY_EXEC_RDSEED_EXITING |
553 /* TODO: re enable this via a "Want" struct
554 member at some point */
555 SECONDARY_EXEC_RDTSCP |
556 SECONDARY_ENABLE_XSAV_RESTORE)
559 static const struct vmxec vmentry = {
560 .name = "VMENTRY controls",
561 .msr = MSR_IA32_VMX_ENTRY_CTLS,
562 .truemsr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
563 /* exact order from vmx.h; only the first two are enabled. */
565 .set_to_1 = (VM_ENTRY_LOAD_DEBUG_CONTROLS | /* can't set to 0 */
566 VM_ENTRY_LOAD_IA32_EFER |
567 VM_ENTRY_IA32E_MODE),
569 .set_to_0 = (VM_ENTRY_SMM |
570 VM_ENTRY_DEACT_DUAL_MONITOR |
571 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
572 VM_ENTRY_LOAD_IA32_PAT),
575 static const struct vmxec vmexit = {
576 .name = "VMEXIT controls",
577 .msr = MSR_IA32_VMX_EXIT_CTLS,
578 .truemsr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
580 .set_to_1 = (VM_EXIT_SAVE_DEBUG_CONTROLS | /* can't set to 0 */
581 VM_EXIT_SAVE_IA32_EFER | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_HOST_ADDR_SPACE_SIZE), /* 64 bit */
583 .set_to_0 = (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
584 VM_EXIT_ACK_INTR_ON_EXIT |
585 VM_EXIT_SAVE_IA32_PAT |
586 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER),
590 setup_vmcs_config(void *p)
593 struct vmcs_config *vmcs_conf = &vmcs_config;
594 uint32_t vmx_msr_high;
596 bool have_true_msrs = false;
601 vmx_msr = read_msr(MSR_IA32_VMX_BASIC);
602 vmx_msr_high = vmx_msr >> 32;
605 * If bit 55 (VMX_BASIC_TRUE_CTLS) is set, then we
606 * can go for the true MSRs. Else, we ask you to get a better CPU.
608 if (vmx_msr & VMX_BASIC_TRUE_CTLS) {
609 have_true_msrs = true;
610 printd("Running with TRUE MSRs\n");
612 printk("Running with non-TRUE MSRs, this is old hardware\n");
616 * Don't worry that one or more of these might fail and leave
617 * the VMCS in some kind of incomplete state. If one of these
618 * fails, the caller is going to discard the VMCS.
619 * It is written this way to ensure we get results of all tests and avoid
622 ok = check_vmxec_controls(&pbec, have_true_msrs,
623 &vmcs_conf->pin_based_exec_ctrl);
624 ok = check_vmxec_controls(&cbec, have_true_msrs,
625 &vmcs_conf->cpu_based_exec_ctrl) && ok;
626 /* Only check cb2ec if we're still ok, o/w we may GPF */
627 ok = ok && check_vmxec_controls(&cb2ec, have_true_msrs,
628 &vmcs_conf->cpu_based_2nd_exec_ctrl);
629 ok = check_vmxec_controls(&vmentry, have_true_msrs,
630 &vmcs_conf->vmentry_ctrl) && ok;
631 ok = check_vmxec_controls(&vmexit, have_true_msrs,
632 &vmcs_conf->vmexit_ctrl) && ok;
634 printk("vmxexec controls is no good.\n");
638 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
639 if ((vmx_msr_high & 0x1fff) > PGSIZE) {
640 printk("vmx_msr_high & 0x1fff) is 0x%x, > PAGE_SIZE 0x%x\n",
641 vmx_msr_high & 0x1fff, PGSIZE);
645 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
646 if (vmx_msr & VMX_BASIC_64) {
647 printk("VMX doesn't support 64 bit width!\n");
651 if (((vmx_msr & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT)
652 != VMX_BASIC_MEM_TYPE_WB) {
653 printk("VMX doesn't support WB memory for VMCS accesses!\n");
657 vmcs_conf->size = vmx_msr_high & 0x1fff;
658 vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
659 vmcs_conf->revision_id = (uint32_t) vmx_msr;
661 /* Read in the caps for runtime checks. This MSR is only available if
662 * secondary controls and ept or vpid is on, which we check earlier */
663 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid);
669 __vmx_alloc_vmcs(int node)
673 vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
676 memset(vmcs, 0, vmcs_config.size);
677 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
678 printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
683 * vmx_alloc_vmcs - allocates a VMCS region
685 * NOTE: Assumes the new region will be used by the current CPU.
687 * Returns a valid VMCS region.
692 return __vmx_alloc_vmcs(numa_id());
696 * vmx_free_vmcs - frees a VMCS region
699 vmx_free_vmcs(struct vmcs *vmcs)
701 //free_pages((unsigned long)vmcs, vmcs_config.order);
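/* NOTE: with the free commented out above, this is currently a no-op, so
 * VMCS pages are leaked until freeing is re-enabled. */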
705 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
706 * will not change in the lifetime of the guest.
707 * Note that host-state that does change is set elsewhere. E.g., host-state
708 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
711 vmx_setup_constant_host_state(void)
713 uint32_t low32, high32;
717 vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS); /* 22.2.3 */
718 vmcs_writel(HOST_CR4, rcr4()); /* 22.2.3, 22.2.5 */
719 vmcs_writel(HOST_CR3, rcr3()); /* 22.2.3 */
721 vmcs_write16(HOST_CS_SELECTOR, GD_KT); /* 22.2.4 */
722 vmcs_write16(HOST_DS_SELECTOR, GD_KD); /* 22.2.4 */
723 vmcs_write16(HOST_ES_SELECTOR, GD_KD); /* 22.2.4 */
724 vmcs_write16(HOST_SS_SELECTOR, GD_KD); /* 22.2.4 */
725 vmcs_write16(HOST_TR_SELECTOR, GD_TSS); /* 22.2.4 */
727 native_store_idt(&dt);
728 vmcs_writel(HOST_IDTR_BASE, dt.pd_base); /* 22.2.4 */
730 asm("mov $.Lkvm_vmx_return, %0":"=r"(tmpl));
731 vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
733 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
734 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
735 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
736 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
738 rdmsr(MSR_EFER, low32, high32);
739 vmcs_write32(HOST_IA32_EFER, low32);
741 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
742 rdmsr(MSR_IA32_CR_PAT, low32, high32);
743 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
746 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
747 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
749 /* TODO: This (at least gs) is per cpu */
750 rdmsrl(MSR_FS_BASE, tmpl);
751 vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
752 rdmsrl(MSR_GS_BASE, tmpl);
753 vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
756 static inline uint16_t
760 asm("sldt %0":"=g"(ldt));
765 segment_base(uint16_t selector)
767 pseudodesc_t *gdt = &currentcpu->host_gdt;
768 struct desc_struct *d;
769 unsigned long table_base;
772 if (!(selector & ~3)) {
776 table_base = gdt->pd_base;
778 if (selector & 4) { /* from ldt */
779 uint16_t ldt_selector = vmx_read_ldt();
781 if (!(ldt_selector & ~3)) {
785 table_base = segment_base(ldt_selector);
787 d = (struct desc_struct *)(table_base + (selector & ~7));
788 v = get_desc_base(d);
789 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
790 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
794 static inline unsigned long
795 vmx_read_tr_base(void)
798 asm("str %0":"=g"(tr));
799 return segment_base(tr);
803 __vmx_setup_cpu(void)
805 pseudodesc_t *gdt = &currentcpu->host_gdt;
806 unsigned long sysenter_esp;
810 * Linux uses per-cpu TSS and GDT, so set these when switching
813 vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
814 vmcs_writel(HOST_GDTR_BASE, gdt->pd_base); /* 22.2.4 */
816 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
817 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
819 rdmsrl(MSR_FS_BASE, tmpl);
820 vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
821 rdmsrl(MSR_GS_BASE, tmpl);
822 vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
826 * vmx_get_cpu - called before using a cpu
827 * @vcpu: VCPU that will be loaded.
829 * Disables preemption. Call vmx_put_cpu() when finished.
832 vmx_get_cpu(struct vmx_vcpu *vcpu)
834 int cur_cpu = core_id();
835 handler_wrapper_t *w;
837 if (currentcpu->local_vcpu)
838 panic("get_cpu: currentcpu->localvcpu was non-NULL");
839 if (currentcpu->local_vcpu != vcpu) {
840 currentcpu->local_vcpu = vcpu;
842 if (vcpu->cpu != cur_cpu) {
843 if (vcpu->cpu >= 0) {
844 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
846 vmcs_clear(vcpu->vmcs);
848 ept_sync_context(vcpu_get_eptp(vcpu));
851 vmcs_load(vcpu->vmcs);
855 vmcs_load(vcpu->vmcs);
861 * vmx_put_cpu - called after using a cpu
862 * @vcpu: VCPU that was loaded.
865 vmx_put_cpu(struct vmx_vcpu *vcpu)
867 if (core_id() != vcpu->cpu)
868 panic("%s: core_id() %d != vcpu->cpu %d\n",
869 __func__, core_id(), vcpu->cpu);
871 if (currentcpu->local_vcpu != vcpu)
872 panic("vmx_put_cpu: asked to clear something not ours");
874 ept_sync_context(vcpu_get_eptp(vcpu));
875 vmcs_clear(vcpu->vmcs);
877 currentcpu->local_vcpu = NULL;
882 * vmx_dump_cpu - prints the CPU state
883 * @vcpu: VCPU to print
886 vmx_dump_cpu(struct vmx_vcpu *vcpu)
892 vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
893 vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
894 flags = vmcs_readl(GUEST_RFLAGS);
897 printk("--- Begin VCPU Dump ---\n");
898 printk("CPU %d VPID %d\n", vcpu->cpu, 0);
899 printk("RIP 0x%016lx RFLAGS 0x%08lx\n", vcpu->regs.tf_rip, flags);
900 printk("RAX 0x%016lx RCX 0x%016lx\n", vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
901 printk("RDX 0x%016lx RBX 0x%016lx\n", vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
902 printk("RSP 0x%016lx RBP 0x%016lx\n", vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
903 printk("RSI 0x%016lx RDI 0x%016lx\n", vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
904 printk("R8 0x%016lx R9 0x%016lx\n", vcpu->regs.tf_r8, vcpu->regs.tf_r9);
905 printk("R10 0x%016lx R11 0x%016lx\n", vcpu->regs.tf_r10, vcpu->regs.tf_r11);
906 printk("R12 0x%016lx R13 0x%016lx\n", vcpu->regs.tf_r12, vcpu->regs.tf_r13);
907 printk("R14 0x%016lx R15 0x%016lx\n", vcpu->regs.tf_r14, vcpu->regs.tf_r15);
908 printk("--- End VCPU Dump ---\n");
913 construct_eptp(physaddr_t root_hpa)
917 /* set WB memory and 4 levels of walk. we checked these in ept_init */
918 eptp = VMX_EPT_MEM_TYPE_WB | (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
919 if (cpu_has_vmx_ept_ad_bits())
920 eptp |= VMX_EPT_AD_ENABLE_BIT;
921 eptp |= (root_hpa & PAGE_MASK);
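/* Sketch of the result, assuming the usual SDM encodings behind these macros
 * (WB memory type = 6 in bits 2:0, 4-level walk = 3 in bits 5:3, A/D enable =
 * bit 6): for root_hpa 0x12345000 on a CPU with A/D support, eptp ends up as
 * 0x12345000 | 0x6 | 0x18 | 0x40 = 0x1234505e. */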
927 * vmx_setup_initial_guest_state - configures the initial state of guest registers
930 vmx_setup_initial_guest_state(void)
933 unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
934 X86_CR4_PGE | X86_CR4_OSFXSR;
935 uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
938 we need it if (boot_cpu_has(X86_FEATURE_PCID))
939 cr4 |= X86_CR4_PCIDE;
940 if (boot_cpu_has(X86_FEATURE_OSXSAVE))
941 cr4 |= X86_CR4_OSXSAVE;
943 /* we almost certainly have this */
944 /* we'll go sour if we don't. */
945 if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
946 cr4 |= X86_CR4_RDWRGSFS;
948 /* configure control and data registers */
949 vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
950 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
951 vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
952 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
953 vmcs_writel(GUEST_CR3, rcr3());
954 vmcs_writel(GUEST_CR4, cr4);
955 vmcs_writel(CR4_READ_SHADOW, cr4);
956 vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
957 EFER_SCE /*| EFER_FFXSR */ );
958 vmcs_writel(GUEST_GDTR_BASE, 0);
959 vmcs_writel(GUEST_GDTR_LIMIT, 0);
960 vmcs_writel(GUEST_IDTR_BASE, 0);
961 vmcs_writel(GUEST_IDTR_LIMIT, 0);
962 vmcs_writel(GUEST_RIP, 0xdeadbeef);
963 vmcs_writel(GUEST_RSP, 0xdeadbeef);
964 vmcs_writel(GUEST_RFLAGS, 0x02);
965 vmcs_writel(GUEST_DR7, 0);
967 /* guest segment bases */
968 vmcs_writel(GUEST_CS_BASE, 0);
969 vmcs_writel(GUEST_DS_BASE, 0);
970 vmcs_writel(GUEST_ES_BASE, 0);
971 vmcs_writel(GUEST_GS_BASE, 0);
972 vmcs_writel(GUEST_SS_BASE, 0);
973 rdmsrl(MSR_FS_BASE, tmpl);
974 vmcs_writel(GUEST_FS_BASE, tmpl);
976 /* guest segment access rights */
977 vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
978 vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
979 vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
980 vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
981 vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
982 vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
984 /* guest segment limits */
985 vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
986 vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
987 vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
988 vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
989 vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
990 vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
992 /* configure segment selectors */
993 vmcs_write16(GUEST_CS_SELECTOR, 0);
994 vmcs_write16(GUEST_DS_SELECTOR, 0);
995 vmcs_write16(GUEST_ES_SELECTOR, 0);
996 vmcs_write16(GUEST_FS_SELECTOR, 0);
997 vmcs_write16(GUEST_GS_SELECTOR, 0);
998 vmcs_write16(GUEST_SS_SELECTOR, 0);
999 vmcs_write16(GUEST_TR_SELECTOR, 0);
1002 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1003 vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1004 vmcs_writel(GUEST_LDTR_BASE, 0);
1005 vmcs_writel(GUEST_LDTR_LIMIT, 0);
1008 vmcs_writel(GUEST_TR_BASE, 0);
1009 vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1010 vmcs_writel(GUEST_TR_LIMIT, 0xff);
1012 /* initialize sysenter */
1013 vmcs_write32(GUEST_SYSENTER_CS, 0);
1014 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1015 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1017 /* other random initialization */
1018 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1019 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1020 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1021 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1022 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1025 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1027 int f = sizeof(unsigned long);
1029 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1030 * have the write-low and read-high bitmap offsets the wrong way round.
1031 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1033 if (msr <= 0x1fff) {
1034 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
1035 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
1036 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1038 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
1039 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
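/* Worked example (illustrative): MSR_GS_BASE is 0xc0000101, which falls in the
 * high range; masked to its low 13 bits it is bit 0x101.  Clearing that bit at
 * byte offsets 0x400 (read-high) and 0xc00 (write-high) means a guest
 * rdmsr/wrmsr of GS_BASE no longer vmexits (given CPU_BASED_USE_MSR_BITMAPS
 * is set). */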
1043 /* note the io_bitmap is big enough for the 64K port space. */
1044 static void __vmx_disable_intercept_for_io(unsigned long *io_bitmap,
1046 __clear_bit(port, io_bitmap);
1049 static void vcpu_print_autoloads(struct vmx_vcpu *vcpu) {
1050 struct vmx_msr_entry *e;
1051 int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1052 printk("Host Autoloads:\n-------------------\n");
1053 for (int i = 0; i < sz; i++) {
1054 e = &vcpu->msr_autoload.host[i];
1055 printk("\tMSR 0x%08x: %p\n", e->index, e->value);
1057 printk("Guest Autoloads:\n-------------------\n");
1058 for (int i = 0; i < sz; i++) {
1059 e = &vcpu->msr_autoload.guest[i];
1060 printk("\tMSR 0x%08x %p\n", e->index, e->value);
1064 static void dumpmsrs(void) {
1072 MSR_IA32_PEBS_ENABLE
1074 for (i = 0; i < ARRAY_SIZE(set); i++) {
1075 printk("%p: %p\n", set[i], read_msr(set[i]));
1077 printk("core id %d\n", core_id());
1080 /* emulated msr. For now, an msr value and a pointer to a helper that
1081 * performs the requested operation.
1086 int (*f) (struct vmx_vcpu * vcpu, struct emmsr *, uint32_t, uint32_t);
1091 int emsr_miscenable(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1093 int emsr_mustmatch(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1095 int emsr_readonly(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1097 int emsr_readzero(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1099 int emsr_fakewrite(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t,
1101 int emsr_ok(struct vmx_vcpu *vcpu, struct emmsr *, uint32_t, uint32_t);
1103 struct emmsr emmsrs[] = {
1104 {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
1105 {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
1106 {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
1107 {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
1108 {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
1109 {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
1110 {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
1111 {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
1113 {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
1115 {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
1117 {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
1119 {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
1121 {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
1123 {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
1124 {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
1125 {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
1126 {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
1127 {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
1128 {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},
1131 {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
1132 {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
1134 {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
1135 // aaaaaahhhhhhhhhhhhhhhhhhhhh
1136 {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
1137 {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
1139 {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fakewrite},
1142 {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
1143 {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},
1146 static uint64_t set_low32(uint64_t hi, uint32_t lo)
1148 return (hi & 0xffffffff00000000ULL) | lo;
1151 static uint64_t set_low16(uint64_t hi, uint16_t lo)
1153 return (hi & 0xffffffffffff0000ULL) | lo;
1156 static uint64_t set_low8(uint64_t hi, uint8_t lo)
1158 return (hi & 0xffffffffffffff00ULL) | lo;
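/* Example of the helpers above (values chosen only for illustration):
 *   set_low32(0x1122334455667788ULL, 0xdeadbeef) == 0x11223344deadbeef
 *   set_low16(0x1122334455667788ULL, 0xbeef)     == 0x112233445566beef
 *   set_low8 (0x1122334455667788ULL, 0xef)       == 0x11223344556677ef
 * They splice a guest's 32/16/8-bit result into the low bits of a 64-bit
 * register image without disturbing the upper bits. */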
1161 /* this may be the only register that needs special handling.
1162 * If there are others, then we might want to extend the emmsr struct.
1164 int emsr_miscenable(struct vmx_vcpu *vcpu, struct emmsr *msr,
1165 uint32_t opcode, uint32_t qual) {
1167 rdmsr(msr->reg, eax, edx);
1168 /* we just let them read the misc msr for now. */
1169 if (opcode == EXIT_REASON_MSR_READ) {
1170 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1171 vcpu->regs.tf_rax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1172 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1175 /* if they are writing what is already written, that's ok. */
1176 if (((uint32_t) vcpu->regs.tf_rax == eax)
1177 && ((uint32_t) vcpu->regs.tf_rdx == edx))
1181 ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
1182 msr->name, (uint32_t) vcpu->regs.tf_rdx,
1183 (uint32_t) vcpu->regs.tf_rax, edx, eax);
1184 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1187 int emsr_mustmatch(struct vmx_vcpu *vcpu, struct emmsr *msr,
1188 uint32_t opcode, uint32_t qual) {
1190 rdmsr(msr->reg, eax, edx);
1191 /* we just let them read the misc msr for now. */
1192 if (opcode == EXIT_REASON_MSR_READ) {
1193 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1194 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1197 /* if they are writing what is already written, that's ok. */
1198 if (((uint32_t) vcpu->regs.tf_rax == eax)
1199 && ((uint32_t) vcpu->regs.tf_rdx == edx))
1203 ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
1204 msr->name, (uint32_t) vcpu->regs.tf_rdx,
1205 (uint32_t) vcpu->regs.tf_rax, edx, eax);
1206 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1209 int emsr_ok(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1211 if (opcode == EXIT_REASON_MSR_READ) {
1212 rdmsr(msr->reg, vcpu->regs.tf_rdx, vcpu->regs.tf_rax);
1215 (uint64_t) vcpu->regs.tf_rdx << 32 | vcpu->regs.tf_rax;
1216 write_msr(msr->reg, val);
1221 int emsr_readonly(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1224 rdmsr((uint32_t) vcpu->regs.tf_rcx, eax, edx);
1225 /* we just let them read the misc msr for now. */
1226 if (opcode == EXIT_REASON_MSR_READ) {
1227 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1228 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1232 printk("%s: Tried to write a readonly register\n", msr->name);
1233 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1236 int emsr_readzero(struct vmx_vcpu *vcpu, struct emmsr *msr, uint32_t opcode,
1238 if (opcode == EXIT_REASON_MSR_READ) {
1239 vcpu->regs.tf_rax = 0;
1240 vcpu->regs.tf_rdx = 0;
1244 printk("%s: Tried to write a readonly register\n", msr->name);
1245 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1248 /* pretend to write it, but don't write it. */
1249 int emsr_fakewrite(struct vmx_vcpu *vcpu, struct emmsr *msr,
1250 uint32_t opcode, uint32_t qual) {
1252 if (!msr->written) {
1253 rdmsr(msr->reg, eax, edx);
1258 /* we just let them read the misc msr for now. */
1259 if (opcode == EXIT_REASON_MSR_READ) {
1260 vcpu->regs.tf_rax = set_low32(vcpu->regs.tf_rax, eax);
1261 vcpu->regs.tf_rdx = set_low32(vcpu->regs.tf_rdx, edx);
1264 /* if they are writing what is already written, that's ok. */
1265 if (((uint32_t) vcpu->regs.tf_rax == eax)
1266 && ((uint32_t) vcpu->regs.tf_rdx == edx))
1268 msr->edx = vcpu->regs.tf_rdx;
1269 msr->eax = vcpu->regs.tf_rax;
1270 msr->written = true;
1276 msrio(struct vmx_vcpu *vcpu, uint32_t opcode, uint32_t qual) {
1278 for (i = 0; i < ARRAY_SIZE(emmsrs); i++) {
1279 if (emmsrs[i].reg != vcpu->regs.tf_rcx)
1281 return emmsrs[i].f(vcpu, &emmsrs[i], opcode, qual);
1283 printk("msrio for 0x%lx failed\n", vcpu->regs.tf_rcx);
1284 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1287 /* crude PCI bus. Just enough to get virtio working. I would rather not add to this. */
1289 uint32_t registers[256];
1292 /* just index by devfn, i.e. 8 bits */
1293 struct pciconfig pcibus[] = {
1294 /* linux requires that devfn 0 be a bridge.
1295 * 00:00.0 Host bridge: Intel Corporation 440BX/ZX/DX - 82443BX/ZX/DX Host bridge (rev 01)
1298 {0x71908086, 0x02000006, 0x06000001},
1301 /* cf8 is a single-threaded resource. */
1302 static uint32_t cf8;
1303 static uint32_t allones = (uint32_t)-1;
1305 /* Return a pointer to the 32-bit "register" in the "pcibus" given an address. Uses cf8.
1306 * Only for readonly access.
1307 * This will fail if we ever want to do writes, but we don't.
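 * Worked example (a made-up access, not from a real trace): an out to 0xcf8 of
 * 0x80000010 selects bus 0, devfn 0, config offset 0x10, so a following in
 * from 0xcfc is served from pcibus[0].registers[(0x10 >> 2) & 0x3f], i.e.
 * registers[4] -- the BAR0 slot of the fake host bridge above.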
1309 void regp(uint32_t **reg)
1312 int devfn = (cf8>>8) & 0xff;
1313 printk("devfn %d\n", devfn);
1314 if (devfn < ARRAY_SIZE(pcibus))
1315 *reg = &pcibus[devfn].registers[(cf8>>2)&0x3f];
1316 printk("-->regp *reg 0x%lx\n", **reg);
1319 static uint32_t configaddr(uint32_t val)
1321 printk("%s 0x%lx\n", __func__, val);
1326 static uint32_t configread32(uint32_t edx, uint64_t *reg)
1330 *reg = set_low32(*reg, *r);
1331 printk("%s: 0x%lx 0x%lx, 0x%lx 0x%lx\n", __func__, cf8, edx, r, *reg);
1335 static uint32_t configread16(uint32_t edx, uint64_t *reg)
1338 int which = ((edx&2)>>1) * 16;
1339 configread32(edx, &val);
1341 *reg = set_low16(*reg, val);
1342 printk("%s: 0x%lx, 0x%lx 0x%lx\n", __func__, edx, val, *reg);
1346 static uint32_t configread8(uint32_t edx, uint64_t *reg)
1349 int which = (edx&3) * 8;
1350 configread32(edx, &val);
1352 *reg = set_low8(*reg, val);
1353 printk("%s: 0x%lx, 0x%lx 0x%lx\n", __func__, edx, val, *reg);
1357 static int configwrite32(uint32_t addr, uint32_t val)
1359 printk("%s 0x%lx 0x%lx\n", __func__, addr, val);
1363 static int configwrite16(uint32_t addr, uint16_t val)
1365 printk("%s 0x%lx 0x%lx\n", __func__, addr, val);
1369 static int configwrite8(uint32_t addr, uint8_t val)
1371 printk("%s 0x%lx 0x%lx\n", __func__, addr, val);
1375 /* this is very minimal. It needs to move to vmm/io.c but we don't
1376 * know if this minimal approach will even be workable. It only (for
1377 * now) handles pci config space. We'd like to hope that's all we will
1379 * It would have been nice had intel encoded the IO exit info as nicely as they
1380 * encoded some of the other exits.
1382 static int io(struct vmx_vcpu *vcpu, int *advance)
1385 /* Get a pointer to the memory at %rip. This is quite messy and part of the
1386 * reason we don't want to do this at all. It sucks. Would have been nice
1387 * had linux had an option to ONLY do mmio config space access, but no such
1390 uint8_t *ip8 = NULL;
1394 /* for now, we're going to be a bit crude. In kernel, p is about v, so we just blow away
1395 * the upper 34 bits and take the rest as our address
1397 ip = vcpu->regs.tf_rip & 0x3fffffff;
1398 edx = vcpu->regs.tf_rdx;
1401 printk("io: ip16 %p\n", *ip16, edx);
1408 return configaddr(vcpu->regs.tf_rax);
1410 printk("unhandled IO address dx @%p is 0x%x\n", ip8, edx);
1411 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1417 if (edx == 0xcfb) { // special!
1418 printk("Just ignore the damned cfb write\n");
1421 if ((edx&~3) == 0xcfc) {
1422 printk("configwrite8 ");
1423 return configwrite8(edx, vcpu->regs.tf_rax);
1425 printk("unhandled IO address dx @%p is 0x%x\n", ip8, edx);
1426 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1430 printk("configread8 ");
1431 return configread8(edx, &vcpu->regs.tf_rax);
1436 printk("read cf8 0x%lx\n", vcpu->regs.tf_rax);
1437 vcpu->regs.tf_rax = cf8;
1440 printk("configread32 ");
1441 return configread32(edx, &vcpu->regs.tf_rax);
1443 if (*ip16 == 0xed66) {
1445 printk("configread16 ");
1446 return configread16(edx, &vcpu->regs.tf_rax);
1448 printk("unknown IO %p %x %x\n", ip8, *ip8, *ip16);
1449 return SHUTDOWN_UNHANDLED_EXIT_REASON;
1452 /* Notes on autoloading. We can't autoload FS_BASE or GS_BASE, according to the
1453 * manual, but that's because they are automatically saved and restored when all
1454 * of the other architectural registers are saved and restored, such as cs, ds,
1455 * es, and other fun things. (See 24.4.1). We need to make sure we don't
1456 * accidentally intercept them too, since they are magically autoloaded.
1458 * We'll need to be careful of any MSR we neither autoload nor intercept
1459 * whenever we vmenter/vmexit, and we intercept by default.
1461 * Other MSRs, such as MSR_IA32_PEBS_ENABLE, only work on certain
1462 * architectures. */
1463 static void setup_msr(struct vmx_vcpu *vcpu) {
1464 struct vmx_msr_entry *e;
1465 int sz = sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs);
1468 static_assert((sizeof(autoloaded_msrs) / sizeof(*autoloaded_msrs)) <=
1471 vcpu->msr_autoload.nr = sz;
1473 /* Since PADDR(msr_bitmap) is non-zero, and the bitmap is all 0xff, we now
1474 * intercept all MSRs */
1475 vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1477 vmcs_write64(IO_BITMAP_A, PADDR(io_bitmap));
1478 vmcs_write64(IO_BITMAP_B, PADDR((uintptr_t)io_bitmap +
1479 (VMX_IO_BITMAP_SZ / 2)));
1481 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1482 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1483 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1485 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1486 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1487 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1489 for (i = 0; i < sz; i++) {
1492 e = &vcpu->msr_autoload.host[i];
1493 e->index = autoloaded_msrs[i];
1494 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1495 rdmsrl(e->index, val);
1497 printk("host index %p val %p\n", e->index, e->value);
1499 e = &vcpu->msr_autoload.guest[i];
1500 e->index = autoloaded_msrs[i];
1501 e->value = 0xDEADBEEF;
1502 printk("guest index %p val %p\n", e->index, e->value);
1507 * vmx_setup_vmcs - configures the vmcs with starting parameters
1509 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu) {
1510 vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
1511 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1514 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1515 vmcs_config.pin_based_exec_ctrl);
1517 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1518 vmcs_config.cpu_based_exec_ctrl);
1520 if (cpu_has_secondary_exec_ctrls()) {
1521 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1522 vmcs_config.cpu_based_2nd_exec_ctrl);
1525 vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));
1527 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1528 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1529 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1533 vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1535 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1536 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1538 vmcs_writel(CR0_GUEST_HOST_MASK, 0); // ~0ul);
1539 vmcs_writel(CR4_GUEST_HOST_MASK, 0); // ~0ul);
1541 //kvm_write_tsc(&vmx->vcpu, 0);
1542 vmcs_writel(TSC_OFFSET, 0);
1544 vmx_setup_constant_host_state();
1548 * vmx_create_vcpu - allocates and initializes a new virtual cpu
1550 * Returns: A new VCPU structure
1552 struct vmx_vcpu *vmx_create_vcpu(struct proc *p) {
1553 struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1558 memset(vcpu, 0, sizeof(*vcpu));
1560 vcpu->proc = p; /* uncounted (weak) reference */
1561 vcpu->vmcs = vmx_alloc_vmcs();
1562 printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1569 vmx_setup_vmcs(vcpu);
1570 vmx_setup_initial_guest_state();
1581 * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1582 * @vcpu: the VCPU to destroy
1584 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu) {
1585 vmx_free_vmcs(vcpu->vmcs);
1590 * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1592 * In the contexts where this is used the vcpu pointer should never be NULL.
1594 static inline struct vmx_vcpu *vmx_current_vcpu(void) {
1595 struct vmx_vcpu *vcpu = currentcpu->local_vcpu;
1597 panic("Core has no vcpu!");
1602 * vmx_run_vcpu - launches the CPU into non-root mode
1603 * We ONLY support 64-bit guests.
1604 * @vcpu: the vmx instance to launch
1606 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1609 /* Store host registers */
1610 "push %%rdx; push %%rbp;"
1611 "push %%rcx \n\t" /* placeholder for guest rcx */
1613 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1615 "mov %%rsp, %c[host_rsp](%0) \n\t"
1616 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1618 /* Reload cr2 if changed */
1619 "mov %c[cr2](%0), %%rax \n\t"
1620 "mov %%cr2, %%rdx \n\t"
1621 "cmp %%rax, %%rdx \n\t"
1623 "mov %%rax, %%cr2 \n\t"
1625 /* Check if vmlaunch or vmresume is needed */
1626 "cmpl $0, %c[launched](%0) \n\t"
1627 /* Load guest registers. Don't clobber flags. */
1628 "mov %c[rax](%0), %%rax \n\t"
1629 "mov %c[rbx](%0), %%rbx \n\t"
1630 "mov %c[rdx](%0), %%rdx \n\t"
1631 "mov %c[rsi](%0), %%rsi \n\t"
1632 "mov %c[rdi](%0), %%rdi \n\t"
1633 "mov %c[rbp](%0), %%rbp \n\t"
1634 "mov %c[r8](%0), %%r8 \n\t"
1635 "mov %c[r9](%0), %%r9 \n\t"
1636 "mov %c[r10](%0), %%r10 \n\t"
1637 "mov %c[r11](%0), %%r11 \n\t"
1638 "mov %c[r12](%0), %%r12 \n\t"
1639 "mov %c[r13](%0), %%r13 \n\t"
1640 "mov %c[r14](%0), %%r14 \n\t"
1641 "mov %c[r15](%0), %%r15 \n\t"
1642 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1644 /* Enter guest mode */
1645 "jne .Llaunched \n\t"
1646 ASM_VMX_VMLAUNCH "\n\t"
1647 "jmp .Lkvm_vmx_return \n\t"
1648 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1649 ".Lkvm_vmx_return: "
1650 /* Save guest registers, load host registers, keep flags */
1651 "mov %0, %c[wordsize](%%rsp) \n\t"
1653 "mov %%rax, %c[rax](%0) \n\t"
1654 "mov %%rbx, %c[rbx](%0) \n\t"
1655 "popq %c[rcx](%0) \n\t"
1656 "mov %%rdx, %c[rdx](%0) \n\t"
1657 "mov %%rsi, %c[rsi](%0) \n\t"
1658 "mov %%rdi, %c[rdi](%0) \n\t"
1659 "mov %%rbp, %c[rbp](%0) \n\t"
1660 "mov %%r8, %c[r8](%0) \n\t"
1661 "mov %%r9, %c[r9](%0) \n\t"
1662 "mov %%r10, %c[r10](%0) \n\t"
1663 "mov %%r11, %c[r11](%0) \n\t"
1664 "mov %%r12, %c[r12](%0) \n\t"
1665 "mov %%r13, %c[r13](%0) \n\t"
1666 "mov %%r14, %c[r14](%0) \n\t"
1667 "mov %%r15, %c[r15](%0) \n\t"
1668 "mov %%rax, %%r10 \n\t"
1669 "mov %%rdx, %%r11 \n\t"
1671 "mov %%cr2, %%rax \n\t"
1672 "mov %%rax, %c[cr2](%0) \n\t"
1674 "pop %%rbp; pop %%rdx \n\t"
1675 "setbe %c[fail](%0) \n\t"
1676 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1677 "mov %%rax, %%ds \n\t"
1678 "mov %%rax, %%es \n\t"
1679 : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1680 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1681 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1682 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1683 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1684 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1685 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1686 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1687 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1688 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1689 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1690 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1691 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1692 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1693 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1694 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1695 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1696 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1697 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1698 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1699 [wordsize]"i"(sizeof(unsigned long))
1701 , "rax", "rbx", "rdi", "rsi"
1702 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1705 vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1706 vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1707 printd("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1708 vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1709 /* FIXME: do we need to set up other flags? */
1710 vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1711 X86_EFLAGS_IF | 0x2;
1713 vcpu->regs.tf_cs = GD_UT;
1714 vcpu->regs.tf_ss = GD_UD;
1719 printk("failure detected (err %x)\n",
1720 vmcs_read32(VM_INSTRUCTION_ERROR));
1721 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1724 return vmcs_read32(VM_EXIT_REASON);
1727 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1728 vmx_complete_atomic_exit(vmx);
1729 vmx_recover_nmi_blocking(vmx);
1730 vmx_complete_interrupts(vmx);
1734 static void vmx_step_instruction(void) {
1735 vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1736 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1739 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu) {
1740 unsigned long gva, gpa;
1741 int exit_qual, ret = -1;
1745 exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1746 gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1747 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1752 prot |= exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
1753 prot |= exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
1754 prot |= exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
1755 ret = handle_page_fault(current, gpa, prot);
1758 printk("EPT page fault failure %d, GPA: %p, GVA: %p\n", ret, gpa,
1766 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu) {
1767 unsigned int eax, ebx, ecx, edx;
1769 eax = vcpu->regs.tf_rax;
1770 ecx = vcpu->regs.tf_rcx;
1771 cpuid(eax, ecx, &eax, &ebx, &ecx, &edx);
1772 vcpu->regs.tf_rax = eax;
1773 vcpu->regs.tf_rbx = ebx;
1774 vcpu->regs.tf_rcx = ecx;
1775 vcpu->regs.tf_rdx = edx;
1778 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu) {
1782 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1785 printk("vmx (vcpu %p): got an exception\n", vcpu);
1786 printk("vmx (vcpu %p): pid %d\n", vcpu, vcpu->proc->pid);
1787 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1791 printk("unhandled nmi, intr_info %x\n", intr_info);
1796 * vmx_launch - the main loop for a VMX Dune process
1797 * @conf: the launch configuration
1799 int vmx_launch(uint64_t rip, uint64_t rsp, uint64_t cr3) {
1801 struct vmx_vcpu *vcpu;
1805 printd("RUNNING: %s: rip %p rsp %p cr3 %p \n", __func__, rip, rsp, cr3);
1806 /* TODO: dirty hack til we have VMM contexts */
1807 vcpu = current->vmm.guest_pcores[0];
1809 printk("Failed to get a CPU!\n");
1813 /* We need to prep the host's autoload region for our current core. Right
1814 * now, the only autoloaded MSR that varies at runtime (in this case per
1815 * core) is KERN_GS_BASE. */
1816 rdmsrl(MSR_KERNEL_GS_BASE, vcpu->msr_autoload.host[0].value);
1817 /* if cr3 is set, means 'set everything', else means 'start where you left off' */
1820 vmcs_writel(GUEST_RIP, rip);
1821 vmcs_writel(GUEST_RSP, rsp);
1822 vmcs_writel(GUEST_CR3, cr3);
1826 vcpu->ret_code = -1;
1832 // TODO: manage the fpu when we restart.
1834 // TODO: see if we need to exit before we go much further.
1837 ret = vmx_run_vcpu(vcpu);
1842 if (ret == EXIT_REASON_VMCALL) {
1843 if (current->vmm.flags & VMM_VMCALL_PRINTF) {
1844 uint8_t byte = vcpu->regs.tf_rdi;
1845 printd("System call\n");
1853 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1854 uint8_t byte = vcpu->regs.tf_rdi;
1855 printk("%p %c\n", byte, vcpu->regs.tf_rdi);
1857 printd("system call! WTF\n");
1859 } else if (ret == EXIT_REASON_CR_ACCESS) {
1860 show_cr_access(vmcs_read32(EXIT_QUALIFICATION));
1862 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1863 } else if (ret == EXIT_REASON_CPUID) {
1864 vmx_handle_cpuid(vcpu);
1866 vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + 2);
1868 } else if (ret == EXIT_REASON_EPT_VIOLATION) {
1869 if (vmx_handle_ept_violation(vcpu))
1870 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1871 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1872 if (vmx_handle_nmi_exception(vcpu))
1873 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1874 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1875 printd("External interrupt\n");
1876 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1877 } else if (ret == EXIT_REASON_MSR_READ) {
1878 printd("msr read\n");
1881 msrio(vcpu, ret, vmcs_read32(EXIT_QUALIFICATION));
1883 } else if (ret == EXIT_REASON_MSR_WRITE) {
1884 printd("msr write\n");
1887 msrio(vcpu, ret, vmcs_read32(EXIT_QUALIFICATION));
1889 } else if (ret == EXIT_REASON_IO_INSTRUCTION) {
1890 /* we never wanted to do this. But virtio
1891 * requires pci config space emulation. */
1892 vcpu->shutdown = io(vcpu, &advance);
1894 printk("unhandled exit: reason 0x%x, exit qualification 0x%x\n",
1895 ret, vmcs_read32(EXIT_QUALIFICATION));
1897 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1900 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1901 * similar to how proc_restartcore/smp_idle only restart the pcpui
1902 * cur_ctx, we need to do the same, via the VMCS resume business. */
1908 vmcs_writel(GUEST_RIP, vcpu->regs.tf_rip + advance);
1913 printd("RETURN. ip %016lx sp %016lx\n",
1914 vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1915 // hexdump((void *)vcpu->regs.tf_rsp, 128 * 8);
1917 * Return both the reason for the shutdown and a status value.
1918 * The exit() and exit_group() system calls only need 8 bits for
1919 * the status but we allow 16 bits in case we might want to
1920 * return more information for one of the other shutdown reasons.
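 * For example (illustrative numbers only), a shutdown reason of 2 with a
 * ret_code of 0 packs to 0x00020000; a caller can split the halves back out
 * with (ret >> 16) and (ret & 0xffff).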
1922 ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
1928 * __vmx_enable - low-level enable of VMX mode on the current CPU
1929 * @vmxon_buf: an opaque buffer for use as the VMXON region
1931 static int __vmx_enable(struct vmcs *vmxon_buf) {
1932 uint64_t phys_addr = PADDR(vmxon_buf);
1933 uint64_t old, test_bits;
1935 if (rcr4() & X86_CR4_VMXE) {
1936 panic("Should never have this happen");
1940 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1942 test_bits = FEATURE_CONTROL_LOCKED;
1943 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1945 if (0) // tboot_enabled())
1946 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1948 if ((old & test_bits) != test_bits) {
1949 /* If it's locked, then trying to set it will cause a GPF.
1952 if (old & FEATURE_CONTROL_LOCKED) {
1953 printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1957 /* enable and lock */
1958 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1960 lcr4(rcr4() | X86_CR4_VMXE);
1963 vpid_sync_vcpu_global(); /* good idea, even if we aren't using vpids */
1970 * vmx_enable - enables VMX mode on the current CPU
1971 * @unused: not used (required for on_each_cpu())
1973 * Sets up necessary state for enable (e.g. a scratchpad for VMXON.)
1975 static void vmx_enable(void) {
1976 struct vmcs *vmxon_buf = currentcpu->vmxarea;
1979 ret = __vmx_enable(vmxon_buf);
1983 currentcpu->vmx_enabled = 1;
1984 // TODO: do we need this?
1985 store_gdt(&currentcpu->host_gdt);
1987 printk("VMX enabled on CPU %d\n", core_id());
1991 printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1995 * vmx_disable - disables VMX mode on the current CPU
1997 static void vmx_disable(void *unused) {
1998 if (currentcpu->vmx_enabled) {
2000 lcr4(rcr4() & ~X86_CR4_VMXE);
2001 currentcpu->vmx_enabled = 0;
2005 /* Probe the cpus to see which ones can do vmx.
2006 * Return FALSE if it fails, TRUE if it succeeds.
2008 static bool probe_cpu_vmx(void) {
2009 /* The best way to test this code is:
2010 * wrmsr -p <cpu> 0x3a 1
2011 * This will lock vmx off; then modprobe dune.
2012 * Frequently, however, systems have all 0x3a registers set to 5,
2013 * meaning testing is impossible, as vmx can not be disabled.
2014 * We have to simulate it being unavailable in most cases.
2015 * The 'test' variable provides an easy way to simulate
2016 * unavailability of vmx on some, none, or all cpus.
2018 if (!cpu_has_vmx()) {
2019 printk("Machine does not support VT-x\n");
2022 printk("Machine supports VT-x\n");
2027 static void setup_vmxarea(void) {
2028 struct vmcs *vmxon_buf;
2029 printd("Set up vmxarea for cpu %d\n", core_id());
2030 vmxon_buf = __vmx_alloc_vmcs(core_id());
2032 printk("setup_vmxarea failed on node %d\n", core_id());
2035 currentcpu->vmxarea = vmxon_buf;
2038 static int ept_init(void) {
2039 if (!cpu_has_vmx_ept()) {
2040 printk("VMX doesn't support EPT!\n");
2043 if (!cpu_has_vmx_eptp_writeback()) {
2044 printk("VMX EPT doesn't support WB memory!\n");
2047 if (!cpu_has_vmx_ept_4levels()) {
2048 printk("VMX EPT doesn't support 4 level walks!\n");
2051 switch (arch_max_jumbo_page_shift()) {
2053 if (!cpu_has_vmx_ept_1g_page()) {
2054 printk("VMX EPT doesn't support 1 GB pages!\n");
2059 if (!cpu_has_vmx_ept_2m_page()) {
2060 printk("VMX EPT doesn't support 2 MB pages!\n");
2065 printk("Unexpected jumbo page size %d\n",
2066 arch_max_jumbo_page_shift());
2069 if (!cpu_has_vmx_ept_ad_bits()) {
2070 printk("VMX EPT doesn't support accessed/dirty!\n");
2071 x86_ept_pte_fix_ups |= EPTE_A | EPTE_D;
2073 if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
2074 printk("VMX EPT can't invalidate PTEs/TLBs!\n");
2082 * intel_vmm_init sets up physical core data areas that are required to run a vm at all.
2083 * These data areas are not connected to a specific user process in any way. Instead,
2084 * they are in some sense externalizing what would otherwise be a very large ball of
2085 * state that would be inside the CPU.
2087 int intel_vmm_init(void) {
2090 if (!probe_cpu_vmx()) {
2094 setup_vmcs_config(&ret);
2097 printk("setup_vmcs_config failed: %d\n", ret);
2101 msr_bitmap = (unsigned long *)kpage_zalloc_addr();
2103 printk("Could not allocate msr_bitmap\n");
2106 io_bitmap = (unsigned long *)get_cont_pages(VMX_IO_BITMAP_ORDER,
2109 printk("Could not allocate msr_bitmap\n");
2113 /* FIXME: do we need APIC virtualization (flexpriority?) */
2115 memset(msr_bitmap, 0xff, PAGE_SIZE);
2116 memset(io_bitmap, 0xff, VMX_IO_BITMAP_SZ);
2118 /* These are the only MSRs that are not autoloaded and not intercepted */
2119 __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
2120 __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
2121 __vmx_disable_intercept_for_msr(msr_bitmap, MSR_EFER);
2123 /* TODO: this might be dangerous, since they can do more than just read the
2125 __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_IDX);
2126 __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_DATA);
2128 if ((ret = ept_init())) {
2129 printk("EPT init failed, %d\n", ret);
2132 printk("VMX setup succeeded\n");
2136 int intel_vmm_pcpu_init(void) {