2 * vmx.c - The Intel VT-x driver for Dune
4 * This file is derived from Linux KVM VT-x support.
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com>
12 * This modified version is simpler because it avoids the following
13 * features that are not requirements for Dune:
14 * * Real-mode emulation
15 * * Nested VT-x support
16 * * I/O hardware emulation
17 * * Any of the more esoteric X86 features and registers
18 * * KVM-specific functionality
20 * In essence we provide only the minimum functionality needed to run
21 * a process in vmx non-root mode rather than the full hardware emulation
22 * needed to support an entire OS.
24 * This driver is a research prototype and as such has the following
27 * FIXME: Backward compatibility is currently a non-goal, and only recent
28 * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
31 * FIXME: Eventually we should handle concurrent users of VT-x more
32 * gracefully instead of requiring exclusive access. This would allow
33 * Dune to interoperate with KVM and other HV solutions.
35 * FIXME: We need to support hotplugged physical CPUs.
38 * Adam Belay <abelay@stanford.edu>
42 * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
43 * You're left with the feeling that they got part way through and realized they had to have one for
45 * 1) your CPU is going to be capable of running VMs, and you need state for that.
47 * 2) you're about to start a guest, and you need state for that.
49 * So there is "get this cpu set up to be able to run VMs" stuff, and
50 * "now let's start a guest" stuff. In Akaros, CPUs will always be set up
51 * to run a VM if that is possible. Processes can flip themselves into
52 * a VM and that will require another VMCS.
54 * So: at kernel startup time, the SMP boot stuff calls
55 * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
56 * in the case of this file is intel_vmm_init. That runs code which
57 * sets up state for ALL sockets, based on the capabilities of
58 * the socket it runs on. If any cpu supports vmx, it assumes they all
59 * do. That's a realistic assumption. So the call_function_all is kind
60 * of stupid, really; it could just see what's on the current cpu and
61 * assume it's on all. HOWEVER: there are systems in the wild that
62 * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
63 * might as well allow for the chance that we'll only allow VMMCPs on a
64 * subset (not implemented yet however). So: probe all CPUs, get a
65 * count of how many support VMX and, for now, assume they all do
68 * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
69 * which contains all the naughty bits settings for all the cpus that can run a VM.
70 * Realistically, all VMX-capable cpus in a system will have identical configurations.
71 * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
73 * configure the msr_bitmap. This is the bitmap of MSRs which the
74 * guest can manipulate. Currently, we only allow GS and FS base.
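 * For example, MSR_GS_BASE (0xc0000101) lands in the high MSR range, so
 * disabling its intercept boils down to (as in __vmx_disable_intercept_for_msr()
 * below, where f == sizeof(unsigned long)):
 *
 *	__clear_bit(0xc0000101 & 0x1fff, msr_bitmap + 0x400 / f);	// read-high
 *	__clear_bit(0xc0000101 & 0x1fff, msr_bitmap + 0xc00 / f);	// write-high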
76 * Reserve bit 0 in the vpid bitmap as guests can not use that
78 * Set up what we call the vmxarea. The vmxarea is per-cpu, not
79 * per-guest. Once set up, it is left alone. The ONLY thing we set in
80 * there is the revision id. The vmxarea is page-sized per cpu and
81 * page-aligned. Note that it can be smaller, but why bother? We know
82 * the max size and alignment, and it's convenient.
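 * Roughly, the per-cpu setup looks like this (a condensed sketch of
 * setup_vmxarea() and __vmx_alloc_vmcs() later in this file; error handling
 * elided):
 *
 *	struct vmcs *buf = __vmx_alloc_vmcs(node_id());	// page-aligned, zeroed
 *	// __vmx_alloc_vmcs() also stamps vmcs_config.revision_id into the region
 *	currentcpu->vmxarea = buf;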
84 * Now that it is set up, enable vmx on all cpus. This involves
85 * testing VMXE in cr4, to see if we've been here before (TODO: delete
86 * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
87 * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
88 * instruction), and syncing vpids and epts. Now the CPU is ready
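 * As a condensed sketch of __vmx_enable()/vmx_enable() below (error paths
 * and the tboot case elided):
 *
 *	if (rcr4() & X86_CR4_VMXE)
 *		panic("VMXE already set?");
 *	old = read_msr(MSR_IA32_FEATURE_CONTROL);
 *	need = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 *	if ((old & need) != need) {
 *		if (old & FEATURE_CONTROL_LOCKED)
 *			return -1;	// BIOS locked VMX off; nothing we can do
 *		write_msr(MSR_IA32_FEATURE_CONTROL, old | need);
 *	}
 *	lcr4(rcr4() | X86_CR4_VMXE);
 *	__vmxon(PADDR(currentcpu->vmxarea));
 *	vpid_sync_vcpu_global();
 *	ept_sync_global();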
92 * We divide this into two things: vmm_proc_init and vm_run.
93 * Currently, on Intel, vmm_proc_init does nothing.
95 * vm_run is really complicated. It is called with a coreid, rip, rsp,
96 * cr3, and flags. On intel, it calls vmx_launch. vmx_launch is set
97 * up for a few test cases. If rip is 1, it sets the guest rip to
98 * a function which will deref 0 and should exit with failure 2. If rip is 0,
99 * it calls an infinite loop in the guest.
101 * The sequence of operations:
105 * disable irqs (required or you can't enter the VM)
112 * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
113 * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
115 * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
116 * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
117 * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
118 * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
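 * Very roughly (names as in vmx_get_cpu() below; w is a handler_wrapper_t as
 * used by the other cross-calls in this file, and the wait on the cross-call
 * is hand-waved):
 *
 *	if (currentcpu->local_vcpu == vcpu) {
 *		vmcs_load(vcpu->vmcs);		// VMPTRLD, fast path
 *	} else {
 *		if (vcpu->cpu >= 0 && vcpu->cpu != core_id())
 *			smp_call_function_single(vcpu->cpu,
 *						 __vmx_get_cpu_helper, vcpu, &w);
 *		else
 *			vmcs_clear(vcpu->vmcs);	// VMCLEAR stale state
 *		vmcs_load(vcpu->vmcs);		// VMPTRLD on this cpu
 *		__vmx_setup_cpu();
 *		vcpu->cpu = core_id();
 *	}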
120 * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
121 * of inline assembly with embedded CPP crap. I suspect we'll want to
122 * un-inline it someday, but maybe not. It's called with a vcpu
123 * struct from which it loads guest state, and to which it stores
124 * non-virtualized host state. It issues a vmlaunch or vmresume
125 * instruction depending, and on return, it checks whether the
126 * launch/resume operation itself had an error. Note this is NOT the
127 * same as an error while in the virtual machine; this is an error in
128 * startup due to misconfiguration. Depending on what is returned it's
129 * either a failed vm startup or an exit for any of many reasons.
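 * The caller then branches on that result, roughly as vmx_launch() does
 * below:
 *
 *	ret = vmx_run_vcpu(vcpu);
 *	if (ret == VMX_EXIT_REASONS_FAILED_VMENTRY) {
 *		// vmlaunch/vmresume itself failed; VM_INSTRUCTION_ERROR says why
 *	} else {
 *		// ret is the VM_EXIT_REASON: EXIT_REASON_EPT_VIOLATION,
 *		// EXIT_REASON_CPUID, EXIT_REASON_VMCALL, ...
 *	}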
133 /* basically: only rename those globals that might conflict
134 * with existing names. Leave all else the same.
135 * this code is more modern than the other code, yet still
136 * well encapsulated, it seems.
144 #include <sys/queue.h>
152 #include <arch/types.h>
159 #include "cpufeature.h"
161 #define currentcpu (&per_cpu_info[core_id()])
164 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
165 * away by decrementing the array size.
167 static const uint32_t vmx_msr_index[] = {
169 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
171 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
173 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
175 static unsigned long *msr_bitmap;
177 static struct vmcs_config {
180 uint32_t revision_id;
181 uint32_t pin_based_exec_ctrl;
182 uint32_t cpu_based_exec_ctrl;
183 uint32_t cpu_based_2nd_exec_ctrl;
184 uint32_t vmexit_ctrl;
185 uint32_t vmentry_ctrl;
188 struct vmx_capability vmx_capability;
190 static inline bool cpu_has_secondary_exec_ctrls(void)
192 return vmcs_config.cpu_based_exec_ctrl &
193 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
196 static inline bool cpu_has_vmx_vpid(void)
198 return vmcs_config.cpu_based_2nd_exec_ctrl &
199 SECONDARY_EXEC_ENABLE_VPID;
202 static inline bool cpu_has_vmx_invpcid(void)
204 return vmcs_config.cpu_based_2nd_exec_ctrl &
205 SECONDARY_EXEC_ENABLE_INVPCID;
208 static inline bool cpu_has_vmx_invvpid_single(void)
210 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
213 static inline bool cpu_has_vmx_invvpid_global(void)
215 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
218 static inline bool cpu_has_vmx_ept(void)
220 return vmcs_config.cpu_based_2nd_exec_ctrl &
221 SECONDARY_EXEC_ENABLE_EPT;
224 static inline bool cpu_has_vmx_invept(void)
226 return vmx_capability.ept & VMX_EPT_INVEPT_BIT;
229 /* the SDM (2015-01) doesn't mention this ability (still?) */
230 static inline bool cpu_has_vmx_invept_individual_addr(void)
232 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
235 static inline bool cpu_has_vmx_invept_context(void)
237 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
240 static inline bool cpu_has_vmx_invept_global(void)
242 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
245 static inline bool cpu_has_vmx_ept_ad_bits(void)
247 return vmx_capability.ept & VMX_EPT_AD_BIT;
250 static inline bool cpu_has_vmx_ept_execute_only(void)
252 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
255 static inline bool cpu_has_vmx_eptp_uncacheable(void)
257 return vmx_capability.ept & VMX_EPTP_UC_BIT;
260 static inline bool cpu_has_vmx_eptp_writeback(void)
262 return vmx_capability.ept & VMX_EPTP_WB_BIT;
265 static inline bool cpu_has_vmx_ept_2m_page(void)
267 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
270 static inline bool cpu_has_vmx_ept_1g_page(void)
272 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
275 static inline bool cpu_has_vmx_ept_4levels(void)
277 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
280 static inline void __invept(int ext, uint64_t eptp, gpa_t gpa)
284 } operand = {eptp, gpa};
286 asm volatile (ASM_VMX_INVEPT
287 /* CF==1 or ZF==1 --> rc = -1 */
288 "; ja 1f ; ud2 ; 1:\n"
289 : : "a" (&operand), "c" (ext) : "cc", "memory");
292 /* We assert support for the global flush during ept_init() */
293 static inline void ept_sync_global(void)
295 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
298 static inline void ept_sync_context(uint64_t eptp)
300 if (cpu_has_vmx_invept_context())
301 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
306 void ept_flush(uint64_t eptp)
308 ept_sync_context(eptp);
311 static inline void ept_sync_individual_addr(uint64_t eptp, gpa_t gpa)
313 if (cpu_has_vmx_invept_individual_addr())
314 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
317 ept_sync_context(eptp);
320 static inline void __vmxon(uint64_t addr)
322 asm volatile (ASM_VMX_VMXON_RAX
323 : : "a"(&addr), "m"(addr)
327 static inline void __vmxoff(void)
329 asm volatile (ASM_VMX_VMXOFF : : : "cc");
332 static inline void __invvpid(int ext, uint16_t vpid, gva_t gva)
338 } operand = { vpid, 0, gva };
340 asm volatile (ASM_VMX_INVVPID
341 /* CF==1 or ZF==1 --> rc = -1 */
343 : : "a"(&operand), "c"(ext) : "cc", "memory");
346 static inline void vpid_sync_vcpu_single(uint16_t vpid)
352 if (cpu_has_vmx_invvpid_single())
353 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
356 static inline void vpid_sync_vcpu_global(void)
358 if (cpu_has_vmx_invvpid_global())
359 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
362 static inline void vpid_sync_context(uint16_t vpid)
364 if (cpu_has_vmx_invvpid_single())
365 vpid_sync_vcpu_single(vpid);
367 vpid_sync_vcpu_global();
370 static inline uint64_t vcpu_get_eptp(struct vmx_vcpu *vcpu)
372 return vcpu->proc->env_pgdir.eptp;
375 static void vmcs_clear(struct vmcs *vmcs)
377 uint64_t phys_addr = PADDR(vmcs);
380 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
381 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
384 printk("vmclear fail: %p/%llx\n",
388 static void vmcs_load(struct vmcs *vmcs)
390 uint64_t phys_addr = PADDR(vmcs);
393 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
394 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
397 printk("vmptrld %p/%llx failed\n",
401 /* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */
402 static physaddr_t vmcs_get_current(void)
404 physaddr_t vmcs_paddr;
405 /* RAX contains the addr of the location to store the VMCS pointer. The
406 * compiler doesn't know the ASM will deref that pointer, hence the =m */
407 asm volatile (ASM_VMX_VMPTRST_RAX : "=m"(vmcs_paddr) : "a"(&vmcs_paddr));
411 __always_inline unsigned long vmcs_readl(unsigned long field)
415 asm volatile (ASM_VMX_VMREAD_RDX_RAX
416 : "=a"(value) : "d"(field) : "cc");
420 __always_inline uint16_t vmcs_read16(unsigned long field)
422 return vmcs_readl(field);
425 static __always_inline uint32_t vmcs_read32(unsigned long field)
427 return vmcs_readl(field);
430 static __always_inline uint64_t vmcs_read64(unsigned long field)
433 return vmcs_readl(field);
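	/* Fallback when a 64-bit vmread isn't available: the high half of a
	 * 64-bit field has its own encoding at field + 1, so read both halves
	 * and combine them. */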
435 return vmcs_readl(field) | ((uint64_t)vmcs_readl(field+1) << 32);
439 void vmwrite_error(unsigned long field, unsigned long value)
441 printk("vmwrite error: reg %lx value %lx (err %d)\n",
442 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
445 void vmcs_writel(unsigned long field, unsigned long value)
449 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
450 : "=q"(error) : "a"(value), "d"(field) : "cc");
452 vmwrite_error(field, value);
455 static void vmcs_write16(unsigned long field, uint16_t value)
457 vmcs_writel(field, value);
460 static void vmcs_write32(unsigned long field, uint32_t value)
462 vmcs_writel(field, value);
465 static void vmcs_write64(unsigned long field, uint64_t value)
467 vmcs_writel(field, value);
470 static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt,
471 uint32_t msr, uint32_t *result)
473 uint32_t vmx_msr_low, vmx_msr_high;
474 uint32_t ctl = ctl_min | ctl_opt;
475 uint64_t vmx_msr = read_msr(msr);
476 vmx_msr_low = vmx_msr;
477 vmx_msr_high = vmx_msr>>32;
479 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
480 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
482 /* Ensure minimum (required) set of control bits are supported. */
483 if (ctl_min & ~ctl) {
491 static bool allow_1_setting(uint32_t msr, uint32_t ctl)
493 uint32_t vmx_msr_low, vmx_msr_high;
495 rdmsr(msr, vmx_msr_low, vmx_msr_high);
496 return vmx_msr_high & ctl;
499 static void setup_vmcs_config(void *p)
502 struct vmcs_config *vmcs_conf = &vmcs_config;
503 uint32_t vmx_msr_low, vmx_msr_high;
504 uint32_t min, opt, min2, opt2;
505 uint32_t _pin_based_exec_control = 0;
506 uint32_t _cpu_based_exec_control = 0;
507 uint32_t _cpu_based_2nd_exec_control = 0;
508 uint32_t _vmexit_control = 0;
509 uint32_t _vmentry_control = 0;
512 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
513 opt = PIN_BASED_VIRTUAL_NMIS;
514 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
515 &_pin_based_exec_control) < 0) {
520 CPU_BASED_CR8_LOAD_EXITING |
521 CPU_BASED_CR8_STORE_EXITING |
522 CPU_BASED_CR3_LOAD_EXITING |
523 CPU_BASED_CR3_STORE_EXITING |
524 CPU_BASED_MOV_DR_EXITING |
525 CPU_BASED_USE_TSC_OFFSETING |
526 CPU_BASED_MWAIT_EXITING |
527 CPU_BASED_MONITOR_EXITING |
528 CPU_BASED_INVLPG_EXITING;
530 min |= CPU_BASED_HLT_EXITING;
532 opt = CPU_BASED_TPR_SHADOW |
533 CPU_BASED_USE_MSR_BITMAPS |
534 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
535 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
536 &_cpu_based_exec_control) < 0) {
540 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
541 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
542 ~CPU_BASED_CR8_STORE_EXITING;
544 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
546 SECONDARY_EXEC_ENABLE_EPT |
547 SECONDARY_EXEC_UNRESTRICTED_GUEST;
548 opt2 = SECONDARY_EXEC_WBINVD_EXITING |
549 SECONDARY_EXEC_RDTSCP |
550 SECONDARY_EXEC_ENABLE_INVPCID;
551 if (adjust_vmx_controls(min2, opt2,
552 MSR_IA32_VMX_PROCBASED_CTLS2,
553 &_cpu_based_2nd_exec_control) < 0) {
558 if (!(_cpu_based_2nd_exec_control &
559 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
560 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
562 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
563 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
565 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
566 CPU_BASED_CR3_STORE_EXITING |
567 CPU_BASED_INVLPG_EXITING);
568 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
569 vmx_capability.ept, vmx_capability.vpid);
574 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
576 // opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
578 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
579 &_vmexit_control) < 0) {
584 // opt = VM_ENTRY_LOAD_IA32_PAT;
586 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
587 &_vmentry_control) < 0) {
591 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
593 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
594 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) {
598 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
599 if (vmx_msr_high & (1u<<16)) {
600 printk("64-bit CPUs always have VMX_BASIC_MSR[48]==0. FAILS!\n");
604 /* Require Write-Back (WB) memory type for VMCS accesses. */
605 if (((vmx_msr_high >> 18) & 15) != 6) {
610 vmcs_conf->size = vmx_msr_high & 0x1fff;
611 vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
612 vmcs_conf->revision_id = vmx_msr_low;
613 printk("vmcs_conf size %d order %d rev %d\n",
614 vmcs_conf->size, vmcs_conf->order,
615 vmcs_conf->revision_id);
617 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
618 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
619 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
620 vmcs_conf->vmexit_ctrl = _vmexit_control;
621 vmcs_conf->vmentry_ctrl = _vmentry_control;
623 vmx_capability.has_load_efer =
624 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
625 VM_ENTRY_LOAD_IA32_EFER)
626 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
627 VM_EXIT_LOAD_IA32_EFER);
629 /* Now that we've done all the setup we can do, verify
630 * that we have all the capabilities we need. These tests
631 * are done last presumably because all the work done above
632 * affects some of them.
635 if (!vmx_capability.has_load_efer) {
636 printk("CPU lacks ability to load EFER register\n");
643 static struct vmcs *__vmx_alloc_vmcs(int node)
647 vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
650 memset(vmcs, 0, vmcs_config.size);
651 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
652 printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
657 * vmx_alloc_vmcs - allocates a VMCS region
659 * NOTE: Assumes the new region will be used by the current CPU.
661 * Returns a valid VMCS region.
663 static struct vmcs *vmx_alloc_vmcs(void)
665 return __vmx_alloc_vmcs(node_id());
669 * vmx_free_vmcs - frees a VMCS region
671 static void vmx_free_vmcs(struct vmcs *vmcs)
673 //free_pages((unsigned long)vmcs, vmcs_config.order);
677 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
678 * will not change in the lifetime of the guest.
679 * Note that host-state that does change is set elsewhere. E.g., host-state
680 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
682 static void vmx_setup_constant_host_state(void)
684 uint32_t low32, high32;
688 vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS); /* 22.2.3 */
689 vmcs_writel(HOST_CR4, rcr4()); /* 22.2.3, 22.2.5 */
690 vmcs_writel(HOST_CR3, rcr3()); /* 22.2.3 */
692 vmcs_write16(HOST_CS_SELECTOR, GD_KT); /* 22.2.4 */
693 vmcs_write16(HOST_DS_SELECTOR, GD_KD); /* 22.2.4 */
694 vmcs_write16(HOST_ES_SELECTOR, GD_KD); /* 22.2.4 */
695 vmcs_write16(HOST_SS_SELECTOR, GD_KD); /* 22.2.4 */
696 vmcs_write16(HOST_TR_SELECTOR, GD_TSS); /* 22.2.4 */
698 native_store_idt(&dt);
699 vmcs_writel(HOST_IDTR_BASE, dt.pd_base); /* 22.2.4 */
701 asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
702 vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
704 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
705 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
706 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
707 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
709 rdmsr(MSR_EFER, low32, high32);
710 vmcs_write32(HOST_IA32_EFER, low32);
712 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
713 rdmsr(MSR_IA32_CR_PAT, low32, high32);
714 vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
717 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
718 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
720 /* TODO: This (at least gs) is per cpu */
721 rdmsrl(MSR_FS_BASE, tmpl);
722 vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
723 rdmsrl(MSR_GS_BASE, tmpl);
724 vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
727 static inline uint16_t vmx_read_ldt(void)
730 asm("sldt %0" : "=g"(ldt));
734 static unsigned long segment_base(uint16_t selector)
736	pseudodesc_t *gdt = &currentcpu->host_gdt;
737 struct desc_struct *d;
738 unsigned long table_base;
741 if (!(selector & ~3)) {
745 table_base = gdt->pd_base;
747 if (selector & 4) { /* from ldt */
748 uint16_t ldt_selector = vmx_read_ldt();
750 if (!(ldt_selector & ~3)) {
754 table_base = segment_base(ldt_selector);
756 d = (struct desc_struct *)(table_base + (selector & ~7));
757 v = get_desc_base(d);
759 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
760 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
765 static inline unsigned long vmx_read_tr_base(void)
768 asm("str %0" : "=g"(tr));
769 return segment_base(tr);
772 static void __vmx_setup_cpu(void)
774	pseudodesc_t *gdt = &currentcpu->host_gdt;
775 unsigned long sysenter_esp;
779 * Linux uses per-cpu TSS and GDT, so set these when switching
782 vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
783 vmcs_writel(HOST_GDTR_BASE, gdt->pd_base); /* 22.2.4 */
785 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
786 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
788 rdmsrl(MSR_FS_BASE, tmpl);
789 vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
790 rdmsrl(MSR_GS_BASE, tmpl);
791 vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
795 * vmx_get_cpu - called before using a cpu
796 * @vcpu: VCPU that will be loaded.
798 * Disables preemption. Call vmx_put_cpu() when finished.
800 static void vmx_get_cpu(struct vmx_vcpu *vcpu)
802 int cur_cpu = core_id();
803 handler_wrapper_t *w;
805 if (currentcpu->local_vcpu)
806	panic("get_cpu: currentcpu->local_vcpu was non-NULL");
807 if (currentcpu->local_vcpu != vcpu) {
808 currentcpu->local_vcpu = vcpu;
810 if (vcpu->cpu != cur_cpu) {
811 if (vcpu->cpu >= 0) {
812 panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
814 vmcs_clear(vcpu->vmcs);
816 ept_sync_context(vcpu_get_eptp(vcpu));
819 vmcs_load(vcpu->vmcs);
823 vmcs_load(vcpu->vmcs);
829 * vmx_put_cpu - called after using a cpu
830 * @vcpu: VCPU that was loaded.
832 static void vmx_put_cpu(struct vmx_vcpu *vcpu)
834 if (core_id() != vcpu->cpu)
835 panic("%s: core_id() %d != vcpu->cpu %d\n",
836 __func__, core_id(), vcpu->cpu);
838 if (currentcpu->local_vcpu != vcpu)
839 panic("vmx_put_cpu: asked to clear something not ours");
841 ept_sync_context(vcpu_get_eptp(vcpu));
842 vmcs_clear(vcpu->vmcs);
844 currentcpu->local_vcpu = NULL;
848 static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr)
850 struct vmx_vcpu *vcpu = ptr;
852 ept_sync_context(vcpu_get_eptp(vcpu));
855 struct sync_addr_args {
856 struct vmx_vcpu *vcpu;
860 static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr)
862 struct sync_addr_args *args = ptr;
864 // ept_sync_individual_addr(
869 * vmx_ept_sync_vcpu - used to evict everything in the EPT
872 void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu)
874 handler_wrapper_t *w;
876 smp_call_function_single(vcpu->cpu,
877 __vmx_sync_helper, (void *) vcpu, &w);
879 if (smp_call_wait(w)) {
880 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
887 * vmx_ept_sync_individual_addr - used to evict an individual address
889 * @gpa: the guest-physical address
891 void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa)
893 struct sync_addr_args args;
897 handler_wrapper_t *w;
900 smp_call_function_single(vcpu->cpu,
901 __vmx_sync_individual_addr_helper, (void *) &args, &w);
903 if (smp_call_wait(w)) {
904 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
910 * vmx_dump_cpu - prints the CPU state
911 * @vcpu: VCPU to print
913 static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
919 vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
920 vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
921 flags = vmcs_readl(GUEST_RFLAGS);
924 printk("--- Begin VCPU Dump ---\n");
925 printk("CPU %d VPID %d\n", vcpu->cpu, 0);
926 printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
927 vcpu->regs.tf_rip, flags);
928 printk("RAX 0x%016lx RCX 0x%016lx\n",
929 vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
930 printk("RDX 0x%016lx RBX 0x%016lx\n",
931 vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
932 printk("RSP 0x%016lx RBP 0x%016lx\n",
933 vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
934 printk("RSI 0x%016lx RDI 0x%016lx\n",
935 vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
936 printk("R8 0x%016lx R9 0x%016lx\n",
937 vcpu->regs.tf_r8, vcpu->regs.tf_r9);
938 printk("R10 0x%016lx R11 0x%016lx\n",
939 vcpu->regs.tf_r10, vcpu->regs.tf_r11);
940 printk("R12 0x%016lx R13 0x%016lx\n",
941 vcpu->regs.tf_r12, vcpu->regs.tf_r13);
942 printk("R14 0x%016lx R15 0x%016lx\n",
943 vcpu->regs.tf_r14, vcpu->regs.tf_r15);
944 printk("--- End VCPU Dump ---\n");
948 uint64_t construct_eptp(physaddr_t root_hpa)
952 /* set WB memory and 4 levels of walk. we checked these in ept_init */
953 eptp = VMX_EPT_MEM_TYPE_WB |
954 (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
955 if (cpu_has_vmx_ept_ad_bits())
956 eptp |= VMX_EPT_AD_ENABLE_BIT;
957 eptp |= (root_hpa & PAGE_MASK);
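	/* Resulting EPTP layout (per the SDM): bits 2:0 memory type, bits 5:3
	 * page-walk length minus one, bit 6 enables accessed/dirty flags, and
	 * bits 63:12 hold the PML4 physical address. */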
963 * vmx_setup_initial_guest_state - configures the initial state of guest registers
965 static void vmx_setup_initial_guest_state(void)
968 unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
969 X86_CR4_PGE | X86_CR4_OSFXSR;
970 uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
973 if (boot_cpu_has(X86_FEATURE_PCID))
974 cr4 |= X86_CR4_PCIDE;
975 if (boot_cpu_has(X86_FEATURE_OSXSAVE))
976 cr4 |= X86_CR4_OSXSAVE;
978 /* we almost certainly have this */
979 /* we'll go sour if we don't. */
980 if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
981 cr4 |= X86_CR4_RDWRGSFS;
983 /* configure control and data registers */
984 vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
985 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
986 vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
987 X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
988 vmcs_writel(GUEST_CR3, rcr3());
989 vmcs_writel(GUEST_CR4, cr4);
990 vmcs_writel(CR4_READ_SHADOW, cr4);
991 vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
992 EFER_SCE | EFER_FFXSR);
993 vmcs_writel(GUEST_GDTR_BASE, 0);
994 vmcs_writel(GUEST_GDTR_LIMIT, 0);
995 vmcs_writel(GUEST_IDTR_BASE, 0);
996 vmcs_writel(GUEST_IDTR_LIMIT, 0);
997 vmcs_writel(GUEST_RIP, 0xdeadbeef);
998 vmcs_writel(GUEST_RSP, 0xdeadbeef);
999 vmcs_writel(GUEST_RFLAGS, 0x02);
1000 vmcs_writel(GUEST_DR7, 0);
1002 /* guest segment bases */
1003 vmcs_writel(GUEST_CS_BASE, 0);
1004 vmcs_writel(GUEST_DS_BASE, 0);
1005 vmcs_writel(GUEST_ES_BASE, 0);
1006 vmcs_writel(GUEST_GS_BASE, 0);
1007 vmcs_writel(GUEST_SS_BASE, 0);
1008 rdmsrl(MSR_FS_BASE, tmpl);
1009 vmcs_writel(GUEST_FS_BASE, tmpl);
1011 /* guest segment access rights */
1012 vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
1013 vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
1014 vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
1015 vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
1016 vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
1017 vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
1019 /* guest segment limits */
1020 vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
1021 vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
1022 vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
1023 vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
1024 vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
1025 vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
1027 /* configure segment selectors */
1028 vmcs_write16(GUEST_CS_SELECTOR, 0);
1029 vmcs_write16(GUEST_DS_SELECTOR, 0);
1030 vmcs_write16(GUEST_ES_SELECTOR, 0);
1031 vmcs_write16(GUEST_FS_SELECTOR, 0);
1032 vmcs_write16(GUEST_GS_SELECTOR, 0);
1033 vmcs_write16(GUEST_SS_SELECTOR, 0);
1034 vmcs_write16(GUEST_TR_SELECTOR, 0);
1037 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1038 vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
1039 vmcs_writel(GUEST_LDTR_BASE, 0);
1040 vmcs_writel(GUEST_LDTR_LIMIT, 0);
1043 vmcs_writel(GUEST_TR_BASE, 0);
1044 vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
1045 vmcs_writel(GUEST_TR_LIMIT, 0xff);
1047 /* initialize sysenter */
1048 vmcs_write32(GUEST_SYSENTER_CS, 0);
1049 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1050 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1052 /* other random initialization */
1053 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1054 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1055 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1056 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1057 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1060 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
1062 int f = sizeof(unsigned long);
1064 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1065 * have the write-low and read-high bitmap offsets the wrong way round.
1066 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1068 if (msr <= 0x1fff) {
1069 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
1070 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
1071 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1073 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
1074 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
1078 static void setup_msr(struct vmx_vcpu *vcpu)
1080 int set[] = { MSR_LSTAR };
1081 struct vmx_msr_entry *e;
1082 int sz = sizeof(set) / sizeof(*set);
1085 //BUILD_BUG_ON(sz > NR_AUTOLOAD_MSRS);
1087 vcpu->msr_autoload.nr = sz;
1089 /* XXX enable only MSRs in set */
1090 vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1092 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
1093 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1094 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
1096 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
1097 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
1098 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
1100 for (i = 0; i < sz; i++) {
1103 e = &vcpu->msr_autoload.host[i];
1105 __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
1106 rdmsrl(e->index, val);
1109 e = &vcpu->msr_autoload.guest[i];
1111 e->value = 0xDEADBEEF;
1116 * vmx_setup_vmcs - configures the vmcs with starting parameters
1118 static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
1120 vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
1121 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1124 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1125 vmcs_config.pin_based_exec_ctrl);
1127 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1128 vmcs_config.cpu_based_exec_ctrl);
1130 if (cpu_has_secondary_exec_ctrls()) {
1131 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
1132 vmcs_config.cpu_based_2nd_exec_ctrl);
1135 vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));
1137 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1138 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1139 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1143 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1144 uint32_t msr_low, msr_high;
1146 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
1147 host_pat = msr_low | ((uint64_t) msr_high << 32);
1148 /* Write the default value, following the host PAT */
1149 vmcs_write64(GUEST_IA32_PAT, host_pat);
1150 /* Keep arch.pat in sync with GUEST_IA32_PAT */
1151 vmx->vcpu.arch.pat = host_pat;
1155 for (int i = 0; i < NR_VMX_MSR; ++i) {
1156 uint32_t index = vmx_msr_index[i];
1157 uint32_t data_low, data_high;
1159 // TODO we should have read/writemsr_safe
1161 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1163 if (wrmsr_safe(index, data_low, data_high) < 0)
1166 vmx->guest_msrs[j].index = i;
1167 vmx->guest_msrs[j].data = 0;
1168 vmx->guest_msrs[j].mask = -1ull;
1173 vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1175 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1176 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1178 vmcs_writel(CR0_GUEST_HOST_MASK, ~0ul);
1179 vmcs_writel(CR4_GUEST_HOST_MASK, ~0ul);
1181 //kvm_write_tsc(&vmx->vcpu, 0);
1182 vmcs_writel(TSC_OFFSET, 0);
1184 vmx_setup_constant_host_state();
1188 * vmx_create_vcpu - allocates and initializes a new virtual cpu
1190 * Returns: A new VCPU structure
1192 struct vmx_vcpu *vmx_create_vcpu(struct proc *p)
1194 struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
1199 memset(vcpu, 0, sizeof(*vcpu));
1201 vcpu->proc = p; /* uncounted (weak) reference */
1202 vcpu->vmcs = vmx_alloc_vmcs();
1203 printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
1210 vmx_setup_vmcs(vcpu);
1211 vmx_setup_initial_guest_state();
1222 * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
1223 * @vcpu: the VCPU to destroy
1225 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
1227 vmx_free_vmcs(vcpu->vmcs);
1232 * vmx_task_vcpu - returns a pointer to the task's vcpu or NULL.
1235 static inline struct vmx_vcpu *vmx_task_vcpu(struct proc *p)
1237 struct dune_struct *dune = current->virtinfo;
1238 return dune ? dune->vcpu : NULL;
1242 * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
1244 * In the contexts where this is used the vcpu pointer should never be NULL.
1246 static inline struct vmx_vcpu *vmx_current_vcpu(void)
1248 struct vmx_vcpu *vcpu = vmx_task_vcpu(current);
1250 panic("%s: core_id %d: no vcpu", __func__, core_id());
1256 * vmx_run_vcpu - launches the CPU into non-root mode
1257 * We ONLY support 64-bit guests.
1258 * @vcpu: the vmx instance to launch
1260 static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
1263 /* Store host registers */
1264 "push %%rdx; push %%rbp;"
1265 "push %%rcx \n\t" /* placeholder for guest rcx */
1267 "cmp %%rsp, %c[host_rsp](%0) \n\t"
1269 "mov %%rsp, %c[host_rsp](%0) \n\t"
1270 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1272 /* Reload cr2 if changed */
1273 "mov %c[cr2](%0), %%rax \n\t"
1274 "mov %%cr2, %%rdx \n\t"
1275 "cmp %%rax, %%rdx \n\t"
1277 "mov %%rax, %%cr2 \n\t"
1279 /* Check if vmlaunch or vmresume is needed */
1280 "cmpl $0, %c[launched](%0) \n\t"
1281 /* Load guest registers. Don't clobber flags. */
1282 "mov %c[rax](%0), %%rax \n\t"
1283 "mov %c[rbx](%0), %%rbx \n\t"
1284 "mov %c[rdx](%0), %%rdx \n\t"
1285 "mov %c[rsi](%0), %%rsi \n\t"
1286 "mov %c[rdi](%0), %%rdi \n\t"
1287 "mov %c[rbp](%0), %%rbp \n\t"
1288 "mov %c[r8](%0), %%r8 \n\t"
1289 "mov %c[r9](%0), %%r9 \n\t"
1290 "mov %c[r10](%0), %%r10 \n\t"
1291 "mov %c[r11](%0), %%r11 \n\t"
1292 "mov %c[r12](%0), %%r12 \n\t"
1293 "mov %c[r13](%0), %%r13 \n\t"
1294 "mov %c[r14](%0), %%r14 \n\t"
1295 "mov %c[r15](%0), %%r15 \n\t"
1296 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
1298 /* Enter guest mode */
1299 "jne .Llaunched \n\t"
1300 ASM_VMX_VMLAUNCH "\n\t"
1301 "jmp .Lkvm_vmx_return \n\t"
1302 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1303 ".Lkvm_vmx_return: "
1304 /* Save guest registers, load host registers, keep flags */
1305 "mov %0, %c[wordsize](%%rsp) \n\t"
1307 "mov %%rax, %c[rax](%0) \n\t"
1308 "mov %%rbx, %c[rbx](%0) \n\t"
1309 "popq %c[rcx](%0) \n\t"
1310 "mov %%rdx, %c[rdx](%0) \n\t"
1311 "mov %%rsi, %c[rsi](%0) \n\t"
1312 "mov %%rdi, %c[rdi](%0) \n\t"
1313 "mov %%rbp, %c[rbp](%0) \n\t"
1314 "mov %%r8, %c[r8](%0) \n\t"
1315 "mov %%r9, %c[r9](%0) \n\t"
1316 "mov %%r10, %c[r10](%0) \n\t"
1317 "mov %%r11, %c[r11](%0) \n\t"
1318 "mov %%r12, %c[r12](%0) \n\t"
1319 "mov %%r13, %c[r13](%0) \n\t"
1320 "mov %%r14, %c[r14](%0) \n\t"
1321 "mov %%r15, %c[r15](%0) \n\t"
1322 "mov %%rax, %%r10 \n\t"
1323 "mov %%rdx, %%r11 \n\t"
1325 "mov %%cr2, %%rax \n\t"
1326 "mov %%rax, %c[cr2](%0) \n\t"
1328 "pop %%rbp; pop %%rdx \n\t"
1329 "setbe %c[fail](%0) \n\t"
1330 "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
1331 "mov %%rax, %%ds \n\t"
1332 "mov %%rax, %%es \n\t"
1333 : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
1334 [launched]"i"(offsetof(struct vmx_vcpu, launched)),
1335 [fail]"i"(offsetof(struct vmx_vcpu, fail)),
1336 [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
1337 [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
1338 [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
1339 [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
1340 [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
1341 [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
1342 [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
1343 [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
1344 [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
1345 [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
1346 [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
1347 [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
1348 [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
1349 [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
1350 [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
1351 [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
1352 [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
1353 [wordsize]"i"(sizeof(unsigned long))
1355 , "rax", "rbx", "rdi", "rsi"
1356 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
1359 vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
1360 vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
1361 printk("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
1362 vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
1363 /* FIXME: do we need to set up other flags? */
1364 vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
1365 X86_EFLAGS_IF | 0x2;
1367 vcpu->regs.tf_cs = GD_UT;
1368 vcpu->regs.tf_ss = GD_UD;
1373 printk("failure detected (err %x)\n",
1374 vmcs_read32(VM_INSTRUCTION_ERROR));
1375 return VMX_EXIT_REASONS_FAILED_VMENTRY;
1378 return vmcs_read32(VM_EXIT_REASON);
1381 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1382 vmx_complete_atomic_exit(vmx);
1383 vmx_recover_nmi_blocking(vmx);
1384 vmx_complete_interrupts(vmx);
1388 static void vmx_step_instruction(void)
1390 vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1391 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1394 static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
1396 unsigned long gva, gpa;
1397 int exit_qual, ret = -1;
1401 exit_qual = vmcs_read32(EXIT_QUALIFICATION);
1402 gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
1403 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1404 printk("ept: gva %016lx, gpa %016lx\n", gva, gpa);
1408 // this is a total hack, for testing things.
1409 // note that we only care about the gpa, and the
1410 // gpa is our process virtual address.
1412 page = page_lookup(current->env_pgdir, (void *)gpa, NULL);
1413 printk("Lookup %p returns %p\n", gpa, page);
1415 uint64_t hpa = page2pa(page);
1416 printk("hpa for %p is %p\n", gpa, hpa);
1417 ret = vmx_do_ept_fault(vcpu->proc->env_pgdir.epte, gpa, hpa, exit_qual);
1418 printk("vmx_do_ept_fault returns %d\n", ret);
1422 printk("page fault failure "
1423 "GPA: 0x%lx, GVA: 0x%lx\n",
1431 static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
1433 unsigned int eax, ebx, ecx, edx;
1435 eax = vcpu->regs.tf_rax;
1436 ecx = vcpu->regs.tf_rcx;
1437 cpuid(0, 2, &eax, &ebx, &ecx, &edx);
1438 vcpu->regs.tf_rax = eax;
1439 vcpu->regs.tf_rbx = ebx;
1440 vcpu->regs.tf_rcx = ecx;
1441 vcpu->regs.tf_rdx = edx;
1444 static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
1449 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1452 printk("vmx (vcpu %p): got an exception\n", vcpu);
1453 printk("vmx (vcpu %p): pid %d\n", vcpu, vcpu->proc->pid);
1454 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
1458 printk("unhandled nmi, intr_info %x\n", intr_info);
1463 static void noop(void) {
1464 __asm__ __volatile__ ("1: jmp 1b");
1467 static void fail(void) {
1468 __asm__ __volatile__ ("movq $0xdeadbeef, %rbx; movq 0, %rax");
1471 static unsigned long stack[512];
1473 * vmx_launch - the main loop for a VMX Dune process
1474 * @conf: the launch configuration
1476 int vmx_launch(struct dune_config *conf)
1479 struct dune_struct dune;
1480 struct vmx_vcpu *vcpu;
1482 unsigned long rip = conf->rip;
1483 unsigned long rsp = conf->rsp;
1484 unsigned long cr3 = conf->cr3;
1487 if (conf->rip < 4096 ) {
1491 rip = (uint64_t)noop + 4;
1494 rip = (uint64_t)fail + 4;
1499 if (conf->cr3 == 0) {
1503 /* sanity checking. -- later
1504 ret = ept_check_page(ept, rip);
1506 printk("0x%x is not mapped in the ept!\n", rip);
1509 ret = ept_check_page(ept, rsp);
1511 printk("0x%x is not mapped in the ept!\n", rsp);
1520 printk("RUNNING: %s: rip %p rsp %p cr3 %p \n",
1521 __func__, rip, rsp, cr3);
1522 /* TODO: dirty hack til we have VMM contexts */
1523 vcpu = current->vmm.guest_pcores[0];
1525 printk("Failed to get a CPU!\n");
1530 vmcs_writel(GUEST_RIP, rip);
1531 vmcs_writel(GUEST_RSP, rsp);
1532 vmcs_writel(GUEST_CR3, cr3);
1535 vcpu->ret_code = -1;
1537 if (current->virtinfo)
1538 printk("vmx_launch: current->virtinfo is NOT NULL (%p)\n", current->virtinfo);
1539 //WARN_ON(current->virtinfo != NULL);
1542 current->virtinfo = &dune;
1547 // TODO: manage the fpu when we restart.
1549 // TODO: see if we need to exit before we go much further.
1551 ret = vmx_run_vcpu(vcpu);
1555 if (ret == EXIT_REASON_VMCALL) {
1556 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1557 printk("system call! WTF\n");
1558 } else if (ret == EXIT_REASON_CPUID)
1559 vmx_handle_cpuid(vcpu);
1560 else if (ret == EXIT_REASON_EPT_VIOLATION) {
1561 if (vmx_handle_ept_violation(vcpu))
1562 vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
1563 } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
1564 if (vmx_handle_nmi_exception(vcpu))
1565 vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
1566 } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
1567 printk("External interrupt\n");
1569 printk("unhandled exit: reason %x, exit qualification %x\n",
1570 ret, vmcs_read32(EXIT_QUALIFICATION));
1572 vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
1575 /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
1576 * similar to how proc_restartcore/smp_idle only restart the pcpui
1577 * cur_ctx, we need to do the same, via the VMCS resume business. */
1583 printk("RETURN. ip %016lx sp %016lx\n",
1584 vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
1585 current->virtinfo = NULL;
1588 * Return both the reason for the shutdown and a status value.
1589 * The exit() and exit_group() system calls only need 8 bits for
1590 * the status but we allow 16 bits in case we might want to
1591 * return more information for one of the other shutdown reasons.
1593 ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
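	/* i.e. callers can split this back out as:
	 *	shutdown = ret >> 16;  status = ret & 0xffff;  */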
1599 * __vmx_enable - low-level enable of VMX mode on the current CPU
1600 * @vmxon_buf: an opaque buffer for use as the VMXON region
1602 static int __vmx_enable(struct vmcs *vmxon_buf)
1604 uint64_t phys_addr = PADDR(vmxon_buf);
1605 uint64_t old, test_bits;
1607 if (rcr4() & X86_CR4_VMXE) {
1608 panic("Should never have this happen");
1612 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1614 test_bits = FEATURE_CONTROL_LOCKED;
1615 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1617 if (0) // tboot_enabled())
1618 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1620 if ((old & test_bits) != test_bits) {
1621 /* If it's locked, then trying to set it will cause a GPF.
1624 if (old & FEATURE_CONTROL_LOCKED) {
1625 printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1629 /* enable and lock */
1630 write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1632 lcr4(rcr4() | X86_CR4_VMXE);
1635 vpid_sync_vcpu_global(); /* good idea, even if we aren't using vpids */
1642 * vmx_enable - enables VMX mode on the current CPU
1643 * @unused: not used (required for on_each_cpu())
1645 * Sets up necessary state for enable (e.g. a scratchpad for VMXON.)
1647 static void vmx_enable(void)
1649 struct vmcs *vmxon_buf = currentcpu->vmxarea;
1652 ret = __vmx_enable(vmxon_buf);
1656 currentcpu->vmx_enabled = 1;
1657 // TODO: do we need this?
1658 store_gdt(&currentcpu->host_gdt);
1660 printk("VMX enabled on CPU %d\n", core_id());
1664 printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1668 * vmx_disable - disables VMX mode on the current CPU
1670 static void vmx_disable(void *unused)
1672 if (currentcpu->vmx_enabled) {
1674 lcr4(rcr4() & ~X86_CR4_VMXE);
1675 currentcpu->vmx_enabled = 0;
1679 /* Probe the cpus to see which ones can do vmx.
1680 * Return -errno if it fails, and 1 if it succeeds.
1682 static bool probe_cpu_vmx(void)
1684 /* The best way to test this code is:
1685 * wrmsr -p <cpu> 0x3a 1
1686 * This will lock vmx off; then modprobe dune.
1687 * Frequently, however, systems have all 0x3a registers set to 5,
1688 * meaning testing is impossible, as vmx can not be disabled.
1689 * We have to simulate it being unavailable in most cases.
1690 * The 'test' variable provides an easy way to simulate
1691 * unavailability of vmx on some, none, or all cpus.
1693 if (!cpu_has_vmx()) {
1694 printk("Machine does not support VT-x\n");
1697 printk("Machine supports VT-x\n");
1702 static void setup_vmxarea(void)
1704 struct vmcs *vmxon_buf;
1705 printd("Set up vmxarea for cpu %d\n", core_id());
1706 vmxon_buf = __vmx_alloc_vmcs(node_id());
1708 printk("setup_vmxarea failed on core %d\n", core_id());
1711 currentcpu->vmxarea = vmxon_buf;
1714 static int ept_init(void)
1716 if (!cpu_has_vmx_ept()) {
1717 printk("VMX doesn't support EPT!\n");
1720 if (!cpu_has_vmx_eptp_writeback()) {
1721 printk("VMX EPT doesn't support WB memory!\n");
1724 if (!cpu_has_vmx_ept_4levels()) {
1725 printk("VMX EPT doesn't support 4 level walks!\n");
1728 switch (arch_max_jumbo_page_shift()) {
1730 if (!cpu_has_vmx_ept_1g_page()) {
1731 printk("VMX EPT doesn't support 1 GB pages!\n");
1736 if (!cpu_has_vmx_ept_2m_page()) {
1737 printk("VMX EPT doesn't support 2 MB pages!\n");
1742 printk("Unexpected jumbo page size %d\n",
1743 arch_max_jumbo_page_shift());
1746 if (!cpu_has_vmx_ept_ad_bits()) {
1747 printk("VMX EPT doesn't support accessed/dirty!\n");
1748 /* TODO: set the pmap_ops accordingly */
1750 if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
1751 printk("VMX EPT can't invalidate PTEs/TLBs!\n");
1759 * intel_vmm_init sets up physical core data areas that are required to run a vm at all.
1760 * These data areas are not connected to a specific user process in any way. Instead,
1761 * they are in some sense externalizing what would otherwise be a very large ball of
1762 * state that would be inside the CPU.
1764 int intel_vmm_init(void)
1768 if (! probe_cpu_vmx()) {
1772 setup_vmcs_config(&ret);
1775 printk("setup_vmcs_config failed: %d\n", ret);
1779 msr_bitmap = (unsigned long *)kpage_zalloc_addr();
1781 printk("Could not allocate msr_bitmap\n");
1784 /* FIXME: do we need APIC virtualization (flexpriority?) */
1786 memset(msr_bitmap, 0xff, PAGE_SIZE);
1787 __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
1788 __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
1790 if ((ret = ept_init())) {
1791 printk("EPT init failed, %d\n", ret);
1794 printk("VMX setup succeeded\n");
1798 int intel_vmm_pcpu_init(void)