Try out Dune vmm stuff.
author Ron Minnich <rminnich@gmail.com>
Sat, 7 Mar 2015 03:03:49 +0000 (19:03 -0800)
committer Barret Rhoden <brho@cs.berkeley.edu>
Tue, 17 Mar 2015 14:55:51 +0000 (10:55 -0400)
Finding a reasonably compact yet not awful bit of code to use for akaros has
been frustrating. The candidates are either gigantic, featureful with stuff we
don't want, buggy as can be, or all three.

The Dune stuff seems a reasonably pared-down subset of kvm, and it's actually
pretty clean. Also, unlike every other bit of vmx code we've tried lately,
it actually seems to work, which is a bonus. And it will never run as a 1978 8086.

We actually start a VM at this point. And, it fails, and exits, *but akaros is not hurt*.
We just keep going.

This was imported, heavily edited, and retroactively committed.
(git-fu'd by brho)

kern/arch/x86/Kbuild
kern/arch/x86/vmm/intel/Kbuild [new file with mode: 0644]
kern/arch/x86/vmm/intel/compat.h [new file with mode: 0644]
kern/arch/x86/vmm/intel/cpufeature.h [new file with mode: 0644]
kern/arch/x86/vmm/intel/ept.c [new file with mode: 0644]
kern/arch/x86/vmm/intel/vmx.c [new file with mode: 0644]
kern/arch/x86/vmm/intel/vmx.h
kern/arch/x86/vmm/vmm.c
kern/arch/x86/vmm/vmm.h

index 676ff2c..f847663 100644 (file)
@@ -30,4 +30,4 @@ obj-y                                         += time.o
 obj-y                                          += trap.o trap64.o
 obj-y                                          += trapentry64.o
 obj-y                                          += usb.o
-
+obj-y                                          += vmm/
diff --git a/kern/arch/x86/vmm/intel/Kbuild b/kern/arch/x86/vmm/intel/Kbuild
new file mode 100644 (file)
index 0000000..a0f377a
--- /dev/null
@@ -0,0 +1,2 @@
+obj-y                                          += vmx.o
+obj-y                                          += ept.o
diff --git a/kern/arch/x86/vmm/intel/compat.h b/kern/arch/x86/vmm/intel/compat.h
new file mode 100644 (file)
index 0000000..6f5699f
--- /dev/null
@@ -0,0 +1,46 @@
+#ifndef __DUNE_COMPAT_H_
+#define __DUNE_COMPAT_H_
+
+#if !defined(VMX_EPT_AD_BIT)
+#define VMX_EPT_AD_BIT          (1ull << 21)
+#define VMX_EPT_AD_ENABLE_BIT   (1ull << 6)
+#endif
+
+#ifndef VMX_EPT_EXTENT_INDIVIDUAL_BIT
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT           (1ull << 24)
+#endif
+
+#ifndef X86_CR4_PCIDE
+#define X86_CR4_PCIDE          0x00020000 /* enable PCID support */
+#endif
+
+#ifndef SECONDARY_EXEC_ENABLE_INVPCID
+#define SECONDARY_EXEC_ENABLE_INVPCID  0x00001000
+#endif
+
+// put this somewhere, someday.
+
+struct dune_config {
+       uint64_t rip;
+       uint64_t rsp;
+       uint64_t cr3;
+       uint64_t flags;
+};
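+
+/* Illustrative use (hypothetical names, not part of the original source): a
+ * caller fills this with the guest's initial register state before asking the
+ * VMM to launch, e.g.
+ *   struct dune_config cfg = { .rip = guest_entry, .rsp = guest_stack_top,
+ *                              .cr3 = guest_cr3_pa, .flags = 0 };
+ */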
+
+/*
+ * shutdown reasons
+ */
+enum shutdown_reason {
+       SHUTDOWN_SYS_EXIT = 1,
+       SHUTDOWN_SYS_EXIT_GROUP,
+       SHUTDOWN_SYS_EXECVE,
+       SHUTDOWN_FATAL_SIGNAL,
+       SHUTDOWN_EPT_VIOLATION,
+       SHUTDOWN_NMI_EXCEPTION,
+       SHUTDOWN_UNHANDLED_EXIT_REASON,
+};
+
+#define SHUTDOWN_REASON(r)     ((r) >> 16)
+#define SHUTDOWN_STATUS(r)     ((r) & 0xffff)
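+
+/* Illustrative example: the combined value is assumed to be packed as
+ * ((reason << 16) | status), so for SHUTDOWN_SYS_EXIT with status 2 the value
+ * is 0x10002; SHUTDOWN_REASON() yields 1 and SHUTDOWN_STATUS() yields 2.
+ */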
+
+#endif /* __DUNE_COMPAT_H_ */
diff --git a/kern/arch/x86/vmm/intel/cpufeature.h b/kern/arch/x86/vmm/intel/cpufeature.h
new file mode 100644 (file)
index 0000000..efea147
--- /dev/null
@@ -0,0 +1,364 @@
+/*
+ * Defines x86 CPU feature bits
+ */
+#ifndef _ASM_X86_CPUFEATURE_H
+#define _ASM_X86_CPUFEATURE_H
+
+#define NCAPINTS       10      /* N 32-bit words worth of info */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name.  If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+ */
+
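+/* Each feature below is encoded as (word * 32 + bit).  For example
+ * (illustrative), X86_FEATURE_VMX is (4*32 + 5): bit 5 of capability word 4,
+ * which corresponds to CPUID leaf 0x00000001, ECX bit 5. */
+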
+/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
+#define X86_FEATURE_FPU                (0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME                (0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE         (0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE                (0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC                (0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR                (0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE                (0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE                (0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8                (0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC       (0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP                (0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR       (0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE                (0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA                (0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV       (0*32+15) /* CMOV instructions */
+                                         /* (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT                (0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36      (0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN         (0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLSH     (0*32+19) /* "clflush" CLFLUSH instruction */
+#define X86_FEATURE_DS         (0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI       (0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX                (0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR       (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM                (0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2       (0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP  (0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT         (0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC                (0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64       (0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE                (0*32+31) /* Pending Break Enable */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL    (1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP         (1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX         (1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT     (1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT   (1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES    (1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP     (1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM         (1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT   (1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW      (1*32+31) /* 3DNow! */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY   (2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN    (2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI       (2*32+ 3) /* LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX      (3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR    (3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR  (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR        (3*32+ 3) /* Centaur MCRs (= MTRRs) */
+/* cpu types for specific tunings: */
+#define X86_FEATURE_K8         (3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7         (3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3         (3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4         (3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP         (3*32+ 9) /* smp kernel running on up */
+#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS       (3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS                (3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32  (3*32+14) /* "" syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
+#define X86_FEATURE_REP_GOOD   (3*32+16) /* rep microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
+#define X86_FEATURE_11AP       (3*32+19) /* "" Bad local APIC aka 11AP */
+#define X86_FEATURE_NOPL       (3*32+20) /* The NOPL (0F 1F) instructions */
+                                         /* 21 available, was AMD_C1E */
+#define X86_FEATURE_XTOPOLOGY  (3*32+22) /* cpu topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC        (3*32+24) /* TSC does not stop in C states */
+#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID        (3*32+26) /* has extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM     (3*32+27) /* multi-node processor */
+#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
+#define X86_FEATURE_EAGER_FPU  (3*32+29) /* "eagerfpu" Non lazy FPU restore */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_XMM3       (4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ  (4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64     (4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT      (4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL      (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+#define X86_FEATURE_VMX                (4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX                (4*32+ 6) /* Safer mode */
+#define X86_FEATURE_EST                (4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2                (4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3      (4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID                (4*32+10) /* Context ID */
+#define X86_FEATURE_FMA                (4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16       (4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR       (4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM       (4*32+15) /* Performance Capabilities */
+#define X86_FEATURE_PCID       (4*32+17) /* Process Context Identifiers */
+#define X86_FEATURE_DCA                (4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1     (4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2     (4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC     (4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE      (4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */
+#define X86_FEATURE_AES                (4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE      (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE    (4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX                (4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C       (4*32+29) /* 16-bit fp conversions */
+#define X86_FEATURE_RDRAND     (4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE     (5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN  (5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT     (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN  (5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2       (5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN    (5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE                (5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN     (5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM                (5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN     (5*32+13) /* PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
+#define X86_FEATURE_LAHF_LM    (6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM                (6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC    (6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM                (6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A      (6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW       (6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS                (6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP                (6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT     (6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT                (6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP                (6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4       (6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE                (6*32+17) /* translation cache extension */
+#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM                (6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT    (6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
+
+/*
+ * Auxiliary flags: Linux defined - For features scattered in various
+ * CPUID levels like 0x6, 0xA etc, word 7
+ */
+#define X86_FEATURE_IDA                (7*32+ 0) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT       (7*32+ 1) /* Always Running APIC Timer */
+#define X86_FEATURE_CPB                (7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB                (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_XSAVEOPT   (7*32+ 4) /* Optimized Xsave */
+#define X86_FEATURE_PLN                (7*32+ 5) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS                (7*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_DTHERM     (7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_HW_PSTATE  (7*32+ 8) /* AMD HW-PState */
+
+/* Virtualization flags: Linux defined, word 8 */
+#define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI        (8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT         (8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID        (8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_NPT                (8*32+ 5) /* AMD Nested Page Table support */
+#define X86_FEATURE_LBRV       (8*32+ 6) /* AMD LBR Virtualization support */
+#define X86_FEATURE_SVML       (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
+#define X86_FEATURE_NRIPS      (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR  (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN   (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+#define X86_FEATURE_FSGSBASE   (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */
+#define X86_FEATURE_BMI1       (9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE                (9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2       (9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP       (9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2       (9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS       (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
+#define X86_FEATURE_INVPCID    (9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM                (9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_RDSEED     (9*32+18) /* The RDSEED instruction */
+#define X86_FEATURE_ADX                (9*32+19) /* The ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP       (9*32+20) /* Supervisor Mode Access Prevention */
+
+#include <bitops.h>
+
+extern const char * const x86_cap_flags[NCAPINTS*32];
+extern const char * const x86_power_flags[32];
+
+#define test_cpu_cap(c, bit)                                           \
+        test_bit(bit, (unsigned long *)((c)->x86_capability))
+
+#define REQUIRED_MASK_BIT_SET(bit)                                     \
+        ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) ||     \
+          (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) ||     \
+          (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) ||     \
+          (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) ||     \
+          (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) ||     \
+          (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) ||     \
+          (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) ||     \
+          (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ||     \
+          (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) ||     \
+          (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+
+#define cpu_has(c, bit)                                                        \
+       (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
+        test_cpu_cap(c, bit))
+
+#define this_cpu_has(bit)                                              \
+       (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
+        x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+
+#define boot_cpu_has(bit)      cpu_has(&boot_cpu_data, bit)
+
+#define set_cpu_cap(c, bit)    set_bit(bit, (unsigned long *)((c)->x86_capability))
+#define clear_cpu_cap(c, bit)  clear_bit(bit, (unsigned long *)((c)->x86_capability))
+#define setup_clear_cpu_cap(bit) do { \
+       clear_cpu_cap(&boot_cpu_data, bit);     \
+       set_bit(bit, (unsigned long *)cpu_caps_cleared); \
+} while (0)
+#define setup_force_cpu_cap(bit) do { \
+       set_cpu_cap(&boot_cpu_data, bit);       \
+       set_bit(bit, (unsigned long *)cpu_caps_set);    \
+} while (0)
+
+#define cpu_has_fpu            boot_cpu_has(X86_FEATURE_FPU)
+#define cpu_has_vme            boot_cpu_has(X86_FEATURE_VME)
+#define cpu_has_de             boot_cpu_has(X86_FEATURE_DE)
+#define cpu_has_pse            boot_cpu_has(X86_FEATURE_PSE)
+#define cpu_has_tsc            boot_cpu_has(X86_FEATURE_TSC)
+#define cpu_has_pae            boot_cpu_has(X86_FEATURE_PAE)
+#define cpu_has_pge            boot_cpu_has(X86_FEATURE_PGE)
+#define cpu_has_apic           boot_cpu_has(X86_FEATURE_APIC)
+#define cpu_has_sep            boot_cpu_has(X86_FEATURE_SEP)
+#define cpu_has_mtrr           boot_cpu_has(X86_FEATURE_MTRR)
+#define cpu_has_mmx            boot_cpu_has(X86_FEATURE_MMX)
+#define cpu_has_fxsr           boot_cpu_has(X86_FEATURE_FXSR)
+#define cpu_has_xmm            boot_cpu_has(X86_FEATURE_XMM)
+#define cpu_has_xmm2           boot_cpu_has(X86_FEATURE_XMM2)
+#define cpu_has_xmm3           boot_cpu_has(X86_FEATURE_XMM3)
+#define cpu_has_ssse3          boot_cpu_has(X86_FEATURE_SSSE3)
+#define cpu_has_aes            boot_cpu_has(X86_FEATURE_AES)
+#define cpu_has_avx            boot_cpu_has(X86_FEATURE_AVX)
+#define cpu_has_ht             boot_cpu_has(X86_FEATURE_HT)
+#define cpu_has_mp             boot_cpu_has(X86_FEATURE_MP)
+#define cpu_has_nx             boot_cpu_has(X86_FEATURE_NX)
+#define cpu_has_k6_mtrr                boot_cpu_has(X86_FEATURE_K6_MTRR)
+#define cpu_has_cyrix_arr      boot_cpu_has(X86_FEATURE_CYRIX_ARR)
+#define cpu_has_centaur_mcr    boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
+#define cpu_has_xstore         boot_cpu_has(X86_FEATURE_XSTORE)
+#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
+#define cpu_has_xcrypt         boot_cpu_has(X86_FEATURE_XCRYPT)
+#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
+#define cpu_has_ace2           boot_cpu_has(X86_FEATURE_ACE2)
+#define cpu_has_ace2_enabled   boot_cpu_has(X86_FEATURE_ACE2_EN)
+#define cpu_has_phe            boot_cpu_has(X86_FEATURE_PHE)
+#define cpu_has_phe_enabled    boot_cpu_has(X86_FEATURE_PHE_EN)
+#define cpu_has_pmm            boot_cpu_has(X86_FEATURE_PMM)
+#define cpu_has_pmm_enabled    boot_cpu_has(X86_FEATURE_PMM_EN)
+#define cpu_has_ds             boot_cpu_has(X86_FEATURE_DS)
+#define cpu_has_pebs           boot_cpu_has(X86_FEATURE_PEBS)
+#define cpu_has_clflush                boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_bts            boot_cpu_has(X86_FEATURE_BTS)
+#define cpu_has_gbpages                boot_cpu_has(X86_FEATURE_GBPAGES)
+#define cpu_has_arch_perfmon   boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
+#define cpu_has_pat            boot_cpu_has(X86_FEATURE_PAT)
+#define cpu_has_xmm4_1         boot_cpu_has(X86_FEATURE_XMM4_1)
+#define cpu_has_xmm4_2         boot_cpu_has(X86_FEATURE_XMM4_2)
+#define cpu_has_x2apic         boot_cpu_has(X86_FEATURE_X2APIC)
+#define cpu_has_xsave          boot_cpu_has(X86_FEATURE_XSAVE)
+#define cpu_has_xsaveopt       boot_cpu_has(X86_FEATURE_XSAVEOPT)
+#define cpu_has_osxsave                boot_cpu_has(X86_FEATURE_OSXSAVE)
+#define cpu_has_hypervisor     boot_cpu_has(X86_FEATURE_HYPERVISOR)
+#define cpu_has_pclmulqdq      boot_cpu_has(X86_FEATURE_PCLMULQDQ)
+#define cpu_has_perfctr_core   boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_cx8            boot_cpu_has(X86_FEATURE_CX8)
+#define cpu_has_cx16           boot_cpu_has(X86_FEATURE_CX16)
+#define cpu_has_eager_fpu      boot_cpu_has(X86_FEATURE_EAGER_FPU)
+#define cpu_has_topoext                boot_cpu_has(X86_FEATURE_TOPOEXT)
+
+#undef  cpu_has_vme
+#define cpu_has_vme            0
+
+#undef  cpu_has_pae
+#define cpu_has_pae            ___BUG___
+
+#undef  cpu_has_mp
+#define cpu_has_mp             1
+
+#undef  cpu_has_k6_mtrr
+#define cpu_has_k6_mtrr                0
+
+#undef  cpu_has_cyrix_arr
+#define cpu_has_cyrix_arr      0
+
+#undef  cpu_has_centaur_mcr
+#define cpu_has_centaur_mcr    0
+
+/*
+ * Static testing of CPU features.  Used the same as boot_cpu_has().
+ * These are only valid after alternatives have run, but will statically
+ * patch the target code for additional performance.
+ *
+ */
+static inline bool __static_cpu_has(uint16_t bit)
+{
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+               asm goto("1: jmp %l[t_no]\n"
+                        "2:\n"
+                        ".section .altinstructions,\"a\"\n"
+                        " .long 1b - .\n"
+                        " .long 0\n"           /* no replacement */
+                        " .word %P0\n"         /* feature bit */
+                        " .byte 2b - 1b\n"     /* source len */
+                        " .byte 0\n"           /* replacement len */
+                        ".previous\n"
+                        /* skipping size check since replacement size = 0 */
+                        : : "i" (bit) : : t_no);
+               return true;
+       t_no:
+               return false;
+#else
+#error "Here's a nickel, kid. Go get yourself a real compiler"
+#endif
+}
+
+#define static_cpu_has(bit)                                    \
+(                                                              \
+       __builtin_constant_p(boot_cpu_has(bit)) ?               \
+               boot_cpu_has(bit) :                             \
+       __builtin_constant_p(bit) ?                             \
+               __static_cpu_has(bit) :                         \
+               boot_cpu_has(bit)                               \
+)
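+
+/* Illustrative usage (hypothetical caller, not part of the original source):
+ *
+ *     if (static_cpu_has(X86_FEATURE_VMX))
+ *             printk("VMX supported\n");
+ */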
+#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/kern/arch/x86/vmm/intel/ept.c b/kern/arch/x86/vmm/intel/ept.c
new file mode 100644 (file)
index 0000000..8105c59
--- /dev/null
@@ -0,0 +1,366 @@
+/**
+ * ept.c - Support for Intel's Extended Page Tables
+ *
+ * Authors:
+ *   Adam Belay <abelay@stanford.edu>
+ *
+ * Right now we support EPT by making a sort of 'shadow' copy of the Linux
+ * process page table. In the future, a more invasive architecture port
+ * to VMX x86 could provide better performance by eliminating the need for
+ * two copies of each page table entry, relying instead on only the EPT
+ * format.
+ * 
+ * This code is only a prototype and could benefit from a more comprehensive
+ * review in terms of performance and correctness. Also, the implications
+ * of threaded processes haven't been fully considered.
+ *
+ * Some of the low-level EPT functions are based on KVM.
+ * Original Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ */
+
+#include <kmalloc.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <error.h>
+#include <pmap.h>
+#include <sys/queue.h>
+#include <smp.h>
+#include <kref.h>
+#include <atomic.h>
+#include <alarm.h>
+#include <event.h>
+#include <umem.h>
+#include <bitops.h>
+#include <arch/types.h>
+#include <syscall.h>
+#include <monitor.h>
+
+#include "vmx.h"
+#include "../vmm.h"
+
+#include "compat.h"
+#include "cpufeature.h"
+
+#define EPT_LEVELS     4       /* 0 through 3 */
+#define HUGE_PAGE_SIZE 2097152
+#define PageHuge(x) (0)
+
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+       return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
+}
+
+static inline bool cpu_has_vmx_eptp_uncacheable(void)
+{
+       return vmx_capability.ept & VMX_EPTP_UC_BIT;
+}
+
+static inline bool cpu_has_vmx_eptp_writeback(void)
+{
+       return vmx_capability.ept & VMX_EPTP_WB_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+       return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+       return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+       return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
+#define VMX_EPT_FAULT_READ     0x01
+#define VMX_EPT_FAULT_WRITE    0x02
+#define VMX_EPT_FAULT_INS      0x04
+
+typedef unsigned long epte_t;
+
+#define __EPTE_READ    0x01
+#define __EPTE_WRITE   0x02
+#define __EPTE_EXEC    0x04
+#define __EPTE_IPAT    0x40
+#define __EPTE_SZ      0x80
+#define __EPTE_TYPE(n) (((n) & 0x7) << 3)
+
+enum {
+       EPTE_TYPE_UC = 0, /* uncachable */
+       EPTE_TYPE_WC = 1, /* write combining */
+       EPTE_TYPE_WT = 4, /* write through */
+       EPTE_TYPE_WP = 5, /* write protected */
+       EPTE_TYPE_WB = 6, /* write back */
+};
+
+#define __EPTE_NONE    0
+#define __EPTE_FULL    (__EPTE_READ | __EPTE_WRITE | __EPTE_EXEC)
+
+#define EPTE_ADDR      (~(PAGE_SIZE - 1))
+#define EPTE_FLAGS     (PAGE_SIZE - 1)
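+
+/* Worked example (illustrative): ept_set_epte() below builds a leaf entry as
+ * epte_addr(hpa) | __EPTE_READ | __EPTE_WRITE | __EPTE_EXEC |
+ * __EPTE_TYPE(EPTE_TYPE_WB) | __EPTE_IPAT, so for hpa 0x200000 the entry is
+ * 0x200000 | 0x07 | 0x30 | 0x40 = 0x200077. */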
+
+static inline uintptr_t epte_addr(epte_t epte)
+{
+       return (epte & EPTE_ADDR);
+}
+
+static inline uintptr_t epte_page_vaddr(epte_t epte)
+{
+       return (uintptr_t) KADDR(epte_addr(epte));
+}
+
+static inline epte_t epte_flags(epte_t epte)
+{
+       return (epte & EPTE_FLAGS);
+}
+
+static inline int epte_present(epte_t epte)
+{
+       return (epte & __EPTE_FULL) > 0;
+}
+
+static inline int epte_big(epte_t epte)
+{
+       return (epte & __EPTE_SZ) > 0;
+}
+
+#define ADDR_TO_IDX(la, n) \
+       ((((unsigned long) (la)) >> (12 + 9 * (n))) & ((1 << 9) - 1))
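+
+/* Worked example (illustrative): for gpa 0x40201000, ADDR_TO_IDX(gpa, 3) = 0,
+ * ADDR_TO_IDX(gpa, 2) = 1, ADDR_TO_IDX(gpa, 1) = 1 and ADDR_TO_IDX(gpa, 0) = 1,
+ * i.e. the 9-bit table index at each of the four EPT levels, top down. */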
+
+/* for now we assume in 'current' */
+static int
+ept_lookup_gpa(epte_t *dir, void *gpa, int level, int create, epte_t **epte_out)
+{
+       int i;
+
+       for (i = EPT_LEVELS - 1; i > level; i--) {
+               int idx = ADDR_TO_IDX(gpa, i);
+               printk("%d: gpa %p, idx %p\n", i, gpa, idx);
+               if (!epte_present(dir[idx])) {
+                       printk("not present\n");
+                       void *page;
+
+                       if (!create)
+                               return -ENOENT;
+
+                       page = (void *) kpage_zalloc_addr();
+                       if (!page)
+                               return -ENOMEM;
+                       printk("page %p\n", page);
+                       dir[idx] = epte_addr(PADDR(page)) |
+                                  __EPTE_FULL;
+                       printk("Set %p[%p] to %p\n", dir, idx, dir[idx]);
+               }
+
+               if (epte_big(dir[idx])) {
+                       if (i != 1)
+                               return -EINVAL;
+                       level = i;
+                       break;
+               }
+
+               dir = (epte_t *) epte_page_vaddr(dir[idx]);
+               printk("Dir for next pass: %p\n", dir);
+       }
+
+       *epte_out = &dir[ADDR_TO_IDX(gpa, level)];
+       printk("Final ept is %p\n", *epte_out);
+       return 0;
+}
+
+static void free_ept_page(epte_t epte)
+{
+       // TODO: clean this up. 
+       void *page = KADDR(epte & ~0xfff);
+       //struct page *page = pfn_to_page(epte_addr(epte) >> PAGE_SHIFT);
+
+       kfree(page);
+}
+
+static void vmx_free_ept(unsigned long ept_root)
+{
+       epte_t *pgd = (epte_t *) KADDR(ept_root);
+       int i, j, k, l;
+
+       // TODO: change all instances of 512 to something.
+       for (i = 0; i < 512; i++) {
+               epte_t *pud = (epte_t *) epte_page_vaddr(pgd[i]);
+               if (!epte_present(pgd[i]))
+                       continue;
+
+               for (j = 0; j < 512; j++) {
+                       epte_t *pmd = (epte_t *) epte_page_vaddr(pud[j]);
+                       if (!epte_present(pud[j]))
+                               continue;
+                       if (epte_flags(pud[j]) & __EPTE_SZ)
+                               continue;
+
+                       for (k = 0; k < 512; k++) {
+                               epte_t *pte = (epte_t *) epte_page_vaddr(pmd[k]);
+                               if (!epte_present(pmd[k]))
+                                       continue;
+                               if (epte_flags(pmd[k]) & __EPTE_SZ) {
+                                       free_ept_page(pmd[k]);
+                                       continue;
+                               }
+
+                               for (l = 0; l < 512; l++) {
+                                       if (!epte_present(pte[l]))
+                                               continue;
+
+                                       free_ept_page(pte[l]);
+                               }
+
+                               kfree(pte);
+                       }
+
+                       kfree(pmd);
+               }
+
+               kfree(pud);
+       }
+
+       kfree(pgd);
+}
+
+static int ept_clear_epte(epte_t *epte)
+{
+       if (*epte == __EPTE_NONE)
+               return 0;
+
+       free_ept_page(*epte);
+       *epte = __EPTE_NONE;
+
+       return 1;
+}
+
+/* We're given a guest physical and a host physical. */
+static int ept_set_epte(epte_t *dir, int make_write, unsigned long gpa, unsigned long hpa)
+{
+       int ret = -1;
+       epte_t *epte, flags;
+       struct page *page = NULL;
+
+       // We're going to assume locking is done by this point.
+       // TODO: PageHuge
+
+       ret = ept_lookup_gpa(dir, (void *) gpa, PageHuge(page) ? 1 : 0, 1, &epte);
+       if (ret) {
+               printk("ept: failed to lookup EPT entry\n");
+               return ret;
+       }
+
+       printk("=====================> epte %p is %p\n", epte, *epte);
+       if (epte_present(*epte) && (epte_big(*epte) || !PageHuge(page))) {
+               printk("PRESENT? WTF? OK ...\n");
+               monitor(NULL);
+               //ept_clear_epte(epte);
+       } else {
+               flags = __EPTE_READ | __EPTE_EXEC | __EPTE_WRITE |
+                       __EPTE_TYPE(EPTE_TYPE_WB) | __EPTE_IPAT;
+               if (make_write)
+                       flags |= __EPTE_WRITE;
+               
+               /* TODO: fix this huge page shit. */
+               if (PageHuge(page)) {
+                       flags |= __EPTE_SZ;
+                       if (epte_present(*epte) && !epte_big(*epte)){
+                               panic("free huge page?");
+                               //free_page(epte_page_vaddr(*epte));
+                       }
+                       /* FIXME: free L0 entries too */
+                       *epte = epte_addr(PADDR(page) & ~((1 << 21) - 1)) |
+                               flags;
+               } else {
+                       *epte = epte_addr(hpa) | flags;
+                       printk("Set epte to %p\n", *epte);
+               }
+       }
+       return 0;
+}
+
+// TODO: kill this? 
+// NOTE: guest physical is 1:1 mapped to host virtual. This is NOT 
+// like dune at all.
+int vmx_do_ept_fault(void *dir, unsigned long gpa, unsigned long hpa, int fault_flags)
+{
+       int ret;
+       int make_write = (fault_flags & VMX_EPT_FAULT_WRITE) ? 1 : 0;
+
+       printk("ept: GPA: 0x%lx, HPA: 0x%lx, flags: %x\n",
+                gpa, hpa, fault_flags);
+
+       ret = ept_set_epte((epte_t *)dir, make_write, gpa, hpa);
+
+       return ret;
+}
+
+/*
+ * ept_fault_pages pre-faults pages in the range start..end
+ */
+int ept_fault_pages(void *dir, uint32_t start, uint32_t end)
+{
+       uint64_t i;
+       int ret;
+       for(i = start; i < end; i++) {
+               uint64_t addr = i << 12;
+               ret = vmx_do_ept_fault((epte_t*)dir, i, i, VMX_EPT_FAULT_WRITE);
+               if (ret) {
+                       return ret;
+               }
+       }
+       return 0;
+}
+/**
+ * ept_invalidate_page - removes a page from the EPT
+ * @dir: the EPT root
+ * @addr: the address of the page
+ * 
+ * Returns 1 if the page was removed, 0 otherwise
+ */
+static int ept_invalidate_page(epte_t *dir, unsigned long addr)
+{
+       int ret;
+       epte_t *epte;
+       void *gpa = (void *) addr;
+
+       ret = ept_lookup_gpa(dir, (void *) gpa, 0, 0, &epte);
+       if (ret) {
+               return 0;
+       }
+
+       ret = ept_clear_epte(epte);
+
+       /* TODO: sync individual?
+       if (ret)
+               vmx_ept_sync_individual_addr(vcpu, (gpa_t) gpa);
+       */
+
+       return ret;
+}
+
+/**
+ * ept_check_page - determines if a page is mapped in the ept
+ * @dir: the EPT root
+ * @addr: the address of the page
+ *
+ * Returns 0 if the page's EPT entry could be looked up, a negative errno
+ * otherwise
+ */
+int ept_check_page(void *dir, unsigned long addr)
+{
+       int ret;
+       epte_t *epte;
+       void *gpa = (void *) addr;
+
+       ret = ept_lookup_gpa((epte_t *)dir, gpa, 0, 0, &epte);
+
+       return ret;
+}
diff --git a/kern/arch/x86/vmm/intel/vmx.c b/kern/arch/x86/vmm/intel/vmx.c
new file mode 100644 (file)
index 0000000..643d7af
--- /dev/null
@@ -0,0 +1,1798 @@
+/**
+ *  vmx.c - The Intel VT-x driver for Dune
+ *
+ * This file is derived from Linux KVM VT-x support.
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Original Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ * This modified version is simpler because it avoids the following
+ * features that are not requirements for Dune:
+ *  * Real-mode emulation
+ *  * Nested VT-x support
+ *  * I/O hardware emulation
+ *  * Any of the more esoteric X86 features and registers
+ *  * KVM-specific functionality
+ *
+ * In essence we provide only the minimum functionality needed to run
+ * a process in vmx non-root mode rather than the full hardware emulation
+ * needed to support an entire OS.
+ *
+ * This driver is a research prototype and as such has the following
+ * limitations:
+ *
+ * FIXME: Backward compatibility is currently a non-goal, and only recent
+ * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
+ * driver.
+ *
+ * FIXME: Eventually we should handle concurrent users of VT-x more
+ * gracefully instead of requiring exclusive access. This would allow
+ * Dune to interoperate with KVM and other HV solutions.
+ *
+ * FIXME: We need to support hotplugged physical CPUs.
+ *
+ * Authors:
+ *   Adam Belay   <abelay@stanford.edu>
+ */
+
+/* Basic flow.
+ * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
+ * You're left with the feeling that they got part way through and realized they had to have one for
+ *
+ * 1) your CPU is going to be capable of running VMs, and you need state for that.
+ *
+ * 2) you're about to start a guest, and you need state for that.
+ *
+ * So there is "get the cpu set up to be able to run VMs" stuff, and
+ * "now let's start a guest" stuff.  In Akaros, CPUs will always be set up
+ * to run a VM if that is possible. Processes can flip themselves into
+ * a VM and that will require another VMCS.
+ *
+ * So: at kernel startup time, the SMP boot stuff calls
+ * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
+ * in the case of this file is intel_vmm_init. That does some code
+ * that sets up stuff for ALL sockets, based on the capabilities of
+ * the socket it runs on. If any cpu supports vmx, it assumes they all
+ * do. That's a realistic assumption. So the call_function_all is kind
+ * of stupid, really; it could just see what's on the current cpu and
+ * assume it's on all. HOWEVER: there are systems in the wilde that
+ * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
+ * might as well allow for the chance that wel'll only all VMMCPs on a
+ * subset (not implemented yet however).  So: probe all CPUs, get a
+ * count of how many support VMX and, for now, assume they all do
+ * anyway.
+ *
+ * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
+ * which contains all the naughty bits settings for all the cpus that can run a VM.
+ * Realistically, all VMX-capable cpus in a system will have identical configurations.
+ * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
+ *
+ * configure the msr_bitmap. This is the bitmap of MSRs which the
+ * guest can manipulate.  Currently, we only allow GS and FS base.
+ *
+ * Reserve bit 0 in the vpid bitmap as guests can not use that
+ *
+ * Set up what we call the vmxarea. The vmxarea is per-cpu, not
+ * per-guest. Once set up, it is left alone.  The ONLY thing we set in
+ * there is the revision id. The vmxarea is page-sized per cpu and
+ * page-aligned. Note that it can be smaller, but why bother? We know
+ * the max size and alignment, and it's convenient.
+ *
+ * Now that it is set up, enable vmx on all cpus. This involves
+ * testing VMXE in cr4, to see if we've been here before (TODO: delete
+ * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
+ * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
+ * instruction), and syncing vpid's and ept's.  Now the CPU is ready
+ * to host guests.
+ *
+ * Setting up a guest.
+ * We divide this into two things: vmm_proc_init and vm_run.
+ * Currently, on Intel, vmm_proc_init does nothing.
+ *
+ * vm_run is really complicated. It is called with a coreid, rip, rsp,
+ * cr3, and flags.  On intel, it calls vmx_launch. vmx_launch is set
+ * up for a few test cases. If rip is 1, it sets the guest rip to
+ * a function which will deref 0 and should exit with failure 2. If rip is 0,
+ * it calls an infinite loop in the guest.
+ *
+ * The sequence of operations:
+ * create a vcpu
+ * while (1) {
+ * get a vcpu
+ * disable irqs (required or you can't enter the VM)
+ * vmx_run_vcpu()
+ * enable irqs
+ * manage the vm exit
+ * }
+ *
+ * get a vcpu
+ * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
+ * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
+ *
+ * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
+ * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
+ * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
+ * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
+ *
+ * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
+ * of inline assembly with embedded CPP crap. I suspect we'll want to
+ * un-inline it someday, but maybe not.  It's called with a vcpu
+ * struct from which it loads guest state, and to which it stores
+ * non-virtualized host state. It issues a vmlaunch or vmresume
+ * instruction as appropriate, and on return, it evaluates whether the
+ * launch/resume itself had an error. Note this is NOT the
+ * same as an error while in the virtual machine; this is an error in
+ * startup due to misconfiguration. Depending on what is returned, it's
+ * either a failed vm startup or an exit for any of many reasons.
+ *
+ */
+void monitor(void *);
+/* basically: only rename those globals that might conflict
+ * with existing names. Leave all else the same.
+ * this code is more modern than the other code, yet still
+ * well encapsulated, it seems.
+ */
+#include <kmalloc.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <error.h>
+#include <pmap.h>
+#include <sys/queue.h>
+#include <smp.h>
+#include <kref.h>
+#include <atomic.h>
+#include <alarm.h>
+#include <event.h>
+#include <umem.h>
+#include <bitops.h>
+#include <arch/types.h>
+#include <syscall.h>
+
+#include "vmx.h"
+#include "../vmm.h"
+
+#include "compat.h"
+#include "cpufeature.h"
+
+#define currentcpu (&per_cpu_info[core_id()])
+
+/* this is always 1, and only ever incremented. If it's more than 1,
+ * then you failed.
+ */
+static bool has_vmx = FALSE;
+
+/* TEMPORARY TEST HACK EPT */
+void *ept;
+uint64_t eptp;
+/* END HACKQUE */
+
+static DECLARE_BITMAP(vmx_vpid_bitmap, /*VMX_NR_VPIDS*/ 65536);
+static spinlock_t vmx_vpid_lock;
+
+static unsigned long *msr_bitmap;
+
+static struct vmcs_config {
+       int size;
+       int order;
+       uint32_t revision_id;
+       uint32_t pin_based_exec_ctrl;
+       uint32_t cpu_based_exec_ctrl;
+       uint32_t cpu_based_2nd_exec_ctrl;
+       uint32_t vmexit_ctrl;
+       uint32_t vmentry_ctrl;
+} vmcs_config;
+
+struct vmx_capability vmx_capability;
+
+static inline bool cpu_has_secondary_exec_ctrls(void)
+{
+       return vmcs_config.cpu_based_exec_ctrl &
+               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool cpu_has_vmx_vpid(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline bool cpu_has_vmx_invpcid(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+       return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+       return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_ept(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline bool cpu_has_vmx_invept_individual_addr(void)
+{
+       return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_context(void)
+{
+       return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_global(void)
+{
+       return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+       return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
+static inline void __invept(int ext, uint64_t eptp, gpa_t gpa)
+{
+       struct {
+               uint64_t eptp, gpa;
+       } operand = {eptp, gpa};
+
+       asm volatile (ASM_VMX_INVEPT
+                       /* CF==1 or ZF==1 --> rc = -1 */
+                       "; ja 1f ; ud2 ; 1:\n"
+                       : : "a" (&operand), "c" (ext) : "cc", "memory");
+}
+
+static inline void ept_sync_global(void)
+{
+       if (cpu_has_vmx_invept_global())
+               __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(uint64_t eptp)
+{
+       if (cpu_has_vmx_invept_context())
+               __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+       else
+               ept_sync_global();
+}
+
+static inline void ept_sync_individual_addr(uint64_t eptp, gpa_t gpa)
+{
+       if (cpu_has_vmx_invept_individual_addr())
+               __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
+                               eptp, gpa);
+       else
+               ept_sync_context(eptp);
+}
+
+static inline void __vmxon(uint64_t addr)
+{
+       asm volatile (ASM_VMX_VMXON_RAX
+                       : : "a"(&addr), "m"(addr)
+                       : "memory", "cc");
+}
+
+static inline void __vmxoff(void)
+{
+       asm volatile (ASM_VMX_VMXOFF : : : "cc");
+}
+
+static inline void __invvpid(int ext, uint16_t vpid, gva_t gva)
+{
+    struct {
+       uint64_t vpid : 16;
+       uint64_t rsvd : 48;
+       uint64_t gva;
+    } operand = { vpid, 0, gva };
+
+    asm volatile (ASM_VMX_INVVPID
+                 /* CF==1 or ZF==1 --> rc = -1 */
+                 "; ja 1f ; ud2 ; 1:"
+                 : : "a"(&operand), "c"(ext) : "cc", "memory");
+}
+
+static inline void vpid_sync_vcpu_single(uint16_t vpid)
+{
+       if (vpid == 0) {
+               return;
+       }
+
+       if (cpu_has_vmx_invvpid_single())
+               __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+       if (cpu_has_vmx_invvpid_global())
+               __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(uint16_t vpid)
+{
+       if (cpu_has_vmx_invvpid_single())
+               vpid_sync_vcpu_single(vpid);
+       else
+               vpid_sync_vcpu_global();
+}
+
+static void vmcs_clear(struct vmcs *vmcs)
+{
+       uint64_t phys_addr = PADDR(vmcs);
+       uint8_t error;
+
+       asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
+                     : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
+                     : "cc", "memory");
+       if (error)
+               printk("vmclear fail: %p/%llx\n",
+                      vmcs, phys_addr);
+}
+
+static void vmcs_load(struct vmcs *vmcs)
+{
+       uint64_t phys_addr = PADDR(vmcs);
+       uint8_t error;
+
+       asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
+                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
+                       : "cc", "memory");
+       if (error)
+               printk("vmptrld %p/%llx failed\n",
+                      vmcs, phys_addr);
+}
+
+
+__always_inline unsigned long vmcs_readl(unsigned long field)
+{
+       unsigned long value;
+
+       asm volatile (ASM_VMX_VMREAD_RDX_RAX
+                     : "=a"(value) : "d"(field) : "cc");
+       return value;
+}
+
+__always_inline uint16_t vmcs_read16(unsigned long field)
+{
+       return vmcs_readl(field);
+}
+
+static __always_inline uint32_t vmcs_read32(unsigned long field)
+{
+       return vmcs_readl(field);
+}
+
+static __always_inline uint64_t vmcs_read64(unsigned long field)
+{
+#ifdef CONFIG_X86_64
+       return vmcs_readl(field);
+#else
+       return vmcs_readl(field) | ((uint64_t)vmcs_readl(field+1) << 32);
+#endif
+}
+
+void vmwrite_error(unsigned long field, unsigned long value)
+{
+       printk("vmwrite error: reg %lx value %lx (err %d)\n",
+              field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+       /* not available so ...
+       dump_stack();
+       */
+       monitor(NULL);
+}
+
+void vmcs_writel(unsigned long field, unsigned long value)
+{
+       uint8_t error;
+
+       asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
+                      : "=q"(error) : "a"(value), "d"(field) : "cc");
+       if (error)
+               vmwrite_error(field, value);
+}
+
+static void vmcs_write16(unsigned long field, uint16_t value)
+{
+       vmcs_writel(field, value);
+}
+
+static void vmcs_write32(unsigned long field, uint32_t value)
+{
+       vmcs_writel(field, value);
+}
+
+static void vmcs_write64(unsigned long field, uint64_t value)
+{
+       vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
+       asm volatile ("");
+       vmcs_writel(field+1, value >> 32);
+#endif
+}
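+
+/* Illustrative usage (field names assumed to be defined in vmx.h, as in the
+ * KVM-derived headers this file is based on):
+ *
+ *     vmcs_writel(GUEST_RIP, rip);
+ *     exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+ */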
+
+
+static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt,
+                                     uint32_t msr, uint32_t *result)
+{
+       uint32_t vmx_msr_low, vmx_msr_high;
+       uint32_t ctl = ctl_min | ctl_opt;
+       uint64_t vmx_msr = read_msr(msr);
+       vmx_msr_low = vmx_msr;
+       vmx_msr_high = vmx_msr>>32;
+
+       ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+       ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
+
+       /* Ensure minimum (required) set of control bits are supported. */
+       if (ctl_min & ~ctl) {
+               return -EIO;
+       }
+
+       *result = ctl;
+       return 0;
+}
+
+static  bool allow_1_setting(uint32_t msr, uint32_t ctl)
+{
+       uint32_t vmx_msr_low, vmx_msr_high;
+
+       rdmsr(msr, vmx_msr_low, vmx_msr_high);
+       return vmx_msr_high & ctl;
+}
+
+static  void setup_vmcs_config(void *p)
+{
+       int *ret = p;
+       struct vmcs_config *vmcs_conf = &vmcs_config;
+       uint32_t vmx_msr_low, vmx_msr_high;
+       uint32_t min, opt, min2, opt2;
+       uint32_t _pin_based_exec_control = 0;
+       uint32_t _cpu_based_exec_control = 0;
+       uint32_t _cpu_based_2nd_exec_control = 0;
+       uint32_t _vmexit_control = 0;
+       uint32_t _vmentry_control = 0;
+
+       *ret = -EIO;
+       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+       opt = PIN_BASED_VIRTUAL_NMIS;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+                               &_pin_based_exec_control) < 0) {
+               return;
+       }
+
+       min =
+             CPU_BASED_CR8_LOAD_EXITING |
+             CPU_BASED_CR8_STORE_EXITING |
+             CPU_BASED_CR3_LOAD_EXITING |
+             CPU_BASED_CR3_STORE_EXITING |
+             CPU_BASED_MOV_DR_EXITING |
+             CPU_BASED_USE_TSC_OFFSETING |
+             CPU_BASED_MWAIT_EXITING |
+             CPU_BASED_MONITOR_EXITING |
+             CPU_BASED_INVLPG_EXITING;
+
+       min |= CPU_BASED_HLT_EXITING;
+
+       opt = CPU_BASED_TPR_SHADOW |
+             CPU_BASED_USE_MSR_BITMAPS |
+             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
+                               &_cpu_based_exec_control) < 0) {
+               return;
+       }
+
+       if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+               _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
+                                          ~CPU_BASED_CR8_STORE_EXITING;
+
+       if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+               min2 = 
+                       SECONDARY_EXEC_ENABLE_VPID |
+                       SECONDARY_EXEC_ENABLE_EPT |
+                       SECONDARY_EXEC_UNRESTRICTED_GUEST;
+               opt2 =  SECONDARY_EXEC_WBINVD_EXITING |
+                       SECONDARY_EXEC_RDTSCP |
+                       SECONDARY_EXEC_ENABLE_INVPCID;
+               if (adjust_vmx_controls(min2, opt2,
+                                       MSR_IA32_VMX_PROCBASED_CTLS2,
+                                       &_cpu_based_2nd_exec_control) < 0) {
+                                               return;
+                                       }
+       }
+
+       if (!(_cpu_based_2nd_exec_control &
+                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+               _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+
+       if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
+               /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+                  enabled */
+               _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+                                            CPU_BASED_CR3_STORE_EXITING |
+                                            CPU_BASED_INVLPG_EXITING);
+               rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
+                     vmx_capability.ept, vmx_capability.vpid);
+       }
+
+       min = 0;
+
+       min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+
+//     opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
+       opt = 0;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
+                               &_vmexit_control) < 0) {
+               return;
+       }
+
+       min = 0;
+//     opt = VM_ENTRY_LOAD_IA32_PAT;
+       opt = 0;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
+                               &_vmentry_control) < 0) {
+               return;
+       }
+
+       rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+
+       /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+       if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) {
+               return;
+       }
+
+       /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
+       if (vmx_msr_high & (1u<<16)) {
+               printk("64-bit CPUs always have VMX_BASIC_MSR[48]==0. FAILS!\n");
+               return;
+       }
+
+       /* Require Write-Back (WB) memory type for VMCS accesses. */
+       if (((vmx_msr_high >> 18) & 15) != 6) {
+               printk("NO WB!\n");
+               return;
+       }
+
+       vmcs_conf->size = vmx_msr_high & 0x1fff;
+       vmcs_conf->order = LOG2_UP(vmcs_config.size>> PAGE_SHIFT);
+       vmcs_conf->revision_id = vmx_msr_low;
+       printk("vmcs_conf size %d order %d rev %d\n",
+              vmcs_conf->size, vmcs_conf->order,
+              vmcs_conf->revision_id);
+
+       vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
+       vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+       vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+       vmcs_conf->vmexit_ctrl         = _vmexit_control;
+       vmcs_conf->vmentry_ctrl        = _vmentry_control;
+
+       vmx_capability.has_load_efer =
+               allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+                               VM_ENTRY_LOAD_IA32_EFER)
+               && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+                                  VM_EXIT_LOAD_IA32_EFER);
+
+       /* Now that we've done all the setup we can do, verify
+        * that we have all the capabilities we need. These tests
+        * are done last presumably because all the work done above
+        * affects some of them.
+        */
+
+       if (!vmx_capability.has_load_efer) {
+               printk("CPU lacks ability to load EFER register\n");
+               return;
+       }
+
+       printk("CPU has all needed capabilities\n");
+       *ret = 0;
+}
+
+static struct vmcs *__vmx_alloc_vmcs(int node)
+{
+       struct vmcs *vmcs;
+
+       vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
+       if (!vmcs)
+               return 0;
+       memset(vmcs, 0, vmcs_config.size);
+       vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
+       printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
+       return vmcs;
+}
+
+/**
+ * vmx_alloc_vmcs - allocates a VMCS region
+ *
+ * NOTE: Assumes the new region will be used by the current CPU.
+ *
+ * Returns a valid VMCS region.
+ */
+static struct vmcs *vmx_alloc_vmcs(void)
+{
+       return __vmx_alloc_vmcs(node_id());
+}
+
+/**
+ * vmx_free_vmcs - frees a VMCS region
+ */
+static void vmx_free_vmcs(struct vmcs *vmcs)
+{
+  //free_pages((unsigned long)vmcs, vmcs_config.order);
+}
+
+/*
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
+ */
+static void vmx_setup_constant_host_state(void)
+{
+       uint32_t low32, high32;
+       unsigned long tmpl;
+       pseudodesc_t dt;
+
+       vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);  /* 22.2.3 */
+       vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
+       vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
+
+       vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
+       vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
+       vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
+       vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
+       vmcs_write16(HOST_TR_SELECTOR, GD_TSS*8);  /* 22.2.4 */
+
+       native_store_idt(&dt);
+       vmcs_writel(HOST_IDTR_BASE, dt.pd_base);   /* 22.2.4 */
+
+       asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
+       vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
+
+       rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+       vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+       rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+       vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
+
+       rdmsr(MSR_EFER, low32, high32);
+       vmcs_write32(HOST_IA32_EFER, low32);
+
+       if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+               rdmsr(MSR_IA32_CR_PAT, low32, high32);
+               vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
+       }
+
+       vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
+       vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
+
+#ifdef CONFIG_X86_64
+       rdmsrl(MSR_FS_BASE, tmpl);
+       vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
+       rdmsrl(MSR_GS_BASE, tmpl);
+       vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
+#else
+       vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+       vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+#endif
+}
+
+static inline uint16_t vmx_read_ldt(void)
+{
+       uint16_t ldt;
+       asm("sldt %0" : "=g"(ldt));
+       return ldt;
+}
+
+static unsigned long segment_base(uint16_t selector)
+{
+       pseudodesc_t *gdt = &currentcpu->host_gdt;
+       struct desc_struct *d;
+       unsigned long table_base;
+       unsigned long v;
+
+       if (!(selector & ~3)) {
+               return 0;
+       }
+
+       table_base = gdt->pd_base;
+
+       if (selector & 4) {           /* from ldt */
+               uint16_t ldt_selector = vmx_read_ldt();
+
+               if (!(ldt_selector & ~3)) {
+                       return 0;
+               }
+
+               table_base = segment_base(ldt_selector);
+       }
+       d = (struct desc_struct *)(table_base + (selector & ~7));
+       v = get_desc_base(d);
+#ifdef CONFIG_X86_64
+       if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+               v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
+#endif
+       return v;
+}
+
+static inline unsigned long vmx_read_tr_base(void)
+{
+       uint16_t tr;
+       asm("str %0" : "=g"(tr));
+       return segment_base(tr);
+}
+
+static void __vmx_setup_cpu(void)
+{
+       pseudodesc_t *gdt = &currentcpu->host_gdt;
+       unsigned long sysenter_esp;
+       unsigned long tmpl;
+
+       /*
+        * Linux uses per-cpu TSS and GDT, so set these when switching
+        * processors.
+        */
+       vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
+       vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);   /* 22.2.4 */
+
+       rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
+       vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+       rdmsrl(MSR_FS_BASE, tmpl);
+       vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
+       rdmsrl(MSR_GS_BASE, tmpl);
+       vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
+}
+
+static void __vmx_get_cpu_helper(struct hw_trapframe *hw_tf, void *ptr)
+{
+       struct vmx_vcpu *vcpu = ptr;
+
+       if (core_id() != vcpu->cpu)
+               panic("%s: core_id() %d != vcpu->cpu %d\n",
+                     __func__, core_id(), vcpu->cpu);
+
+       vmcs_clear(vcpu->vmcs);
+       if (currentcpu->local_vcpu == vcpu)
+               currentcpu->local_vcpu = NULL;
+}
+
+/**
+ * vmx_get_cpu - called before using a cpu
+ * @vcpu: VCPU that will be loaded.
+ *
+ * Disables preemption. Call vmx_put_cpu() when finished.
+ */
+static void vmx_get_cpu(struct vmx_vcpu *vcpu)
+{
+       int cur_cpu = core_id();
+       handler_wrapper_t *w;
+
+       //printk("currentcpu->local_vcpu %p vcpu %p\n",
+               //currentcpu->local_vcpu, vcpu);
+       if (currentcpu->local_vcpu != vcpu) {
+               currentcpu->local_vcpu = vcpu;
+
+               if (vcpu->cpu != cur_cpu) {
+                       if (vcpu->cpu >= 0) {
+                               smp_call_function_single(vcpu->cpu,
+                                                        __vmx_get_cpu_helper, (void *) vcpu, &w);
+                               if (smp_call_wait(w))
+                                       printk("%s: smp_call_wait failed. Expect a panic.\n", __func__);
+                       } else
+                               vmcs_clear(vcpu->vmcs);
+
+//                     vpid_sync_context(vcpu->vpid);
+//                     ept_sync_context(current->vmm->
+
+                       vcpu->launched = 0;
+                       vmcs_load(vcpu->vmcs);
+                       __vmx_setup_cpu();
+                       vcpu->cpu = cur_cpu;
+               } else {
+                       vmcs_load(vcpu->vmcs);
+               }
+       }
+}
+
+/**
+ * vmx_put_cpu - called after using a cpu
+ * @vcpu: VCPU that was loaded.
+ */
+static void vmx_put_cpu(struct vmx_vcpu *vcpu)
+{
+       //put_cpu();
+}
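+
+/* Editor's illustration (not part of the original Dune port): vmx_get_cpu() and
+ * vmx_put_cpu() are meant to bracket every VMCS access, which is the pattern
+ * vmx_dump_cpu() and vmx_launch() follow below:
+ *
+ *     vmx_get_cpu(vcpu);
+ *     vmcs_writel(GUEST_RIP, rip);
+ *     vmcs_writel(GUEST_RSP, rsp);
+ *     vmx_put_cpu(vcpu);
+ *
+ * vmx_put_cpu() is a no-op for now, but keeping the pairing lets a real
+ * preemption disable/enable be dropped in later without touching callers. */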
+
+static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr)
+{
+       struct vmx_vcpu *vcpu = ptr;
+
+//     ept_sync_context(current);
+}
+
+struct sync_addr_args {
+       struct vmx_vcpu *vcpu;
+       gpa_t gpa;
+};
+
+static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr)
+{
+       struct sync_addr_args *args = ptr;
+
+//     ept_sync_individual_addr(
+
+}
+
+/**
+ * vmx_ept_sync_vcpu - used to evict everything in the EPT
+ * @vcpu: the vcpu
+ */
+void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu)
+{
+       handler_wrapper_t *w;
+
+       smp_call_function_single(vcpu->cpu,
+               __vmx_sync_helper, (void *) vcpu, &w);
+
+       if (smp_call_wait(w)) {
+               printk("%s: smp_call_wait failed. Expect a panic.\n", __func__);
+       }
+}
+
+/**
+ * vmx_ept_sync_individual_addr - used to evict an individual address
+ * @vcpu: the vcpu
+ * @gpa: the guest-physical address
+ */
+void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa)
+{
+       struct sync_addr_args args;
+       handler_wrapper_t *w;
+
+       args.vcpu = vcpu;
+       args.gpa = gpa;
+
+       smp_call_function_single(vcpu->cpu,
+                                __vmx_sync_individual_addr_helper, (void *) &args, &w);
+
+       if (smp_call_wait(w)) {
+               printk("%s: smp_call_wait failed. Expect a panic.\n", __func__);
+       }
+}
+
+/**
+ * vmx_dump_cpu - prints the CPU state
+ * @vcpu: VCPU to print
+ */
+static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
+{
+
+       unsigned long flags;
+
+       vmx_get_cpu(vcpu);
+       vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
+       vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
+       flags = vmcs_readl(GUEST_RFLAGS);
+       vmx_put_cpu(vcpu);
+
+       printk("--- Begin VCPU Dump ---\n");
+       printk("CPU %d VPID %d\n", vcpu->cpu, vcpu->vpid);
+       printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
+              vcpu->regs.tf_rip, flags);
+       printk("RAX 0x%016lx RCX 0x%016lx\n",
+               vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
+       printk("RDX 0x%016lx RBX 0x%016lx\n",
+               vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
+       printk("RSP 0x%016lx RBP 0x%016lx\n",
+               vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
+       printk("RSI 0x%016lx RDI 0x%016lx\n",
+               vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
+       printk("R8  0x%016lx R9  0x%016lx\n",
+               vcpu->regs.tf_r8, vcpu->regs.tf_r9);
+       printk("R10 0x%016lx R11 0x%016lx\n",
+               vcpu->regs.tf_r10, vcpu->regs.tf_r11);
+       printk("R12 0x%016lx R13 0x%016lx\n",
+               vcpu->regs.tf_r12, vcpu->regs.tf_r13);
+       printk("R14 0x%016lx R15 0x%016lx\n",
+               vcpu->regs.tf_r14, vcpu->regs.tf_r15);
+       printk("--- End VCPU Dump ---\n");
+
+}
+
+uint64_t construct_eptp(unsigned long root_hpa)
+{
+       uint64_t eptp;
+
+       /* TODO: derive this from the EPT capability MSR rather than hard-coding it. */
+       eptp = VMX_EPT_DEFAULT_MT |
+               VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+       if (cpu_has_vmx_ept_ad_bits())
+               eptp |= VMX_EPT_AD_ENABLE_BIT;
+       eptp |= (root_hpa & PAGE_MASK);
+
+       return eptp;
+}
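+
+/* Editor's sketch of intended usage: the EPTP packs the default memory type,
+ * the guest address width, optionally the A/D-enable bit, and the root HPA of
+ * the EPT tables, and is then handed to the VMCS.  Here ept_root stands for
+ * whatever page holds the top-level EPT table:
+ *
+ *     uint64_t eptp = construct_eptp(PADDR(ept_root));
+ *     vmcs_write64(EPT_POINTER, eptp);
+ *
+ * intel_vmm_init() below does exactly this with its temporary test EPT. */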
+
+/**
+ * vmx_setup_initial_guest_state - configures the initial state of guest registers
+ */
+static void vmx_setup_initial_guest_state(void)
+{
+       unsigned long tmpl;
+       unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
+                           X86_CR4_PGE | X86_CR4_OSFXSR;
+       uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
+#if 0
+       /* do we need these? */
+       if (boot_cpu_has(X86_FEATURE_PCID))
+               cr4 |= X86_CR4_PCIDE;
+       if (boot_cpu_has(X86_FEATURE_OSXSAVE))
+               cr4 |= X86_CR4_OSXSAVE;
+#endif
+       /* We almost certainly have FSGSBASE; things will go sour if we don't. */
+       if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
+               cr4 |= X86_CR4_RDWRGSFS;
+
+       /* configure control and data registers */
+       vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
+                              X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
+       vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
+                                    X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
+       vmcs_writel(GUEST_CR3, rcr3());
+       vmcs_writel(GUEST_CR4, cr4);
+       vmcs_writel(CR4_READ_SHADOW, cr4);
+       vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
+                                    EFER_SCE | EFER_FFXSR);
+       vmcs_writel(GUEST_GDTR_BASE, 0);
+       vmcs_writel(GUEST_GDTR_LIMIT, 0);
+       vmcs_writel(GUEST_IDTR_BASE, 0);
+       vmcs_writel(GUEST_IDTR_LIMIT, 0);
+       vmcs_writel(GUEST_RIP, 0xdeadbeef);
+       vmcs_writel(GUEST_RSP, 0xdeadbeef);
+       vmcs_writel(GUEST_RFLAGS, 0x02);
+       vmcs_writel(GUEST_DR7, 0);
+
+       /* guest segment bases */
+       vmcs_writel(GUEST_CS_BASE, 0);
+       vmcs_writel(GUEST_DS_BASE, 0);
+       vmcs_writel(GUEST_ES_BASE, 0);
+       vmcs_writel(GUEST_GS_BASE, 0);
+       vmcs_writel(GUEST_SS_BASE, 0);
+       rdmsrl(MSR_FS_BASE, tmpl);
+       vmcs_writel(GUEST_FS_BASE, tmpl);
+
+       /* guest segment access rights */
+       vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
+       vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
+       vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
+       vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
+       vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
+       vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
+
+       /* guest segment limits */
+       vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
+       vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
+       vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
+       vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
+       vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
+       vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
+
+       /* configure segment selectors */
+       vmcs_write16(GUEST_CS_SELECTOR, 0);
+       vmcs_write16(GUEST_DS_SELECTOR, 0);
+       vmcs_write16(GUEST_ES_SELECTOR, 0);
+       vmcs_write16(GUEST_FS_SELECTOR, 0);
+       vmcs_write16(GUEST_GS_SELECTOR, 0);
+       vmcs_write16(GUEST_SS_SELECTOR, 0);
+       vmcs_write16(GUEST_TR_SELECTOR, 0);
+
+       /* guest LDTR */
+       vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+       vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
+       vmcs_writel(GUEST_LDTR_BASE, 0);
+       vmcs_writel(GUEST_LDTR_LIMIT, 0);
+
+       /* guest TSS */
+       vmcs_writel(GUEST_TR_BASE, 0);
+       vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
+       vmcs_writel(GUEST_TR_LIMIT, 0xff);
+
+       /* initialize sysenter */
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
+
+       /* other random initialization */
+       vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+       vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
+}
+
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
+{
+       int f = sizeof(unsigned long);
+       /*
+        * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+        * have the write-low and read-high bitmap offsets the wrong way round.
+        * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+        */
+       if (msr <= 0x1fff) {
+               __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
+               __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+               msr &= 0x1fff;
+               __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
+               __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+       }
+}
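+
+/* Editor's note on the layout this relies on: the MSR bitmap is one page split
+ * into four 1KB regions (read-low at 0x000, read-high at 0x400, write-low at
+ * 0x800, write-high at 0xc00), one bit per MSR, with the 0xc0000000 range
+ * re-based to bit 0.  For example, allowing the guest direct access to
+ * MSR_LSTAR (0xc0000082), as setup_msr() does below, clears bit 0x82 in the
+ * read-high and write-high regions:
+ *
+ *     __vmx_disable_intercept_for_msr(msr_bitmap, MSR_LSTAR);
+ */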
+
+static void setup_msr(struct vmx_vcpu *vcpu)
+{
+       int set[] = { MSR_LSTAR };
+       struct vmx_msr_entry *e;
+       int sz = sizeof(set) / sizeof(*set);
+       int i;
+
+       //BUILD_BUG_ON(sz > NR_AUTOLOAD_MSRS);
+
+       vcpu->msr_autoload.nr = sz;
+
+       /* XXX enable only MSRs in set */
+       vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
+
+       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
+
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
+       vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));
+
+       for (i = 0; i < sz; i++) {
+               uint64_t val;
+
+               e = &vcpu->msr_autoload.host[i];
+               e->index = set[i];
+               __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
+               rdmsrl(e->index, val);
+               e->value = val;
+
+               e = &vcpu->msr_autoload.guest[i];
+               e->index = set[i];
+               e->value = 0xDEADBEEF;
+       }
+}
+
+/**
+ *  vmx_setup_vmcs - configures the vmcs with starting parameters
+ */
+static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
+{
+       vmcs_write16(VIRTUAL_PROCESSOR_ID, vcpu->vpid);
+       vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+       /* Control */
+       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+               vmcs_config.pin_based_exec_ctrl);
+
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+               vmcs_config.cpu_based_exec_ctrl);
+
+       if (cpu_has_secondary_exec_ctrls()) {
+               vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+                            vmcs_config.cpu_based_2nd_exec_ctrl);
+       }
+
+       vmcs_write64(EPT_POINTER, eptp);
+
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+       vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+
+       setup_msr(vcpu);
+#if 0
+       if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+               uint32_t msr_low, msr_high;
+               uint64_t host_pat;
+               rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+               host_pat = msr_low | ((uint64_t) msr_high << 32);
+               /* Write the default value follow host pat */
+               vmcs_write64(GUEST_IA32_PAT, host_pat);
+               /* Keep arch.pat sync with GUEST_IA32_PAT */
+               vmx->vcpu.arch.pat = host_pat;
+       }
+
+       for (i = 0; i < NR_VMX_MSR; ++i) {
+               uint32_t index = vmx_msr_index[i];
+               uint32_t data_low, data_high;
+               int j = vmx->nmsrs;
+
+               if (rdmsr_safe(index, &data_low, &data_high) < 0)
+                       continue;
+               if (wrmsr_safe(index, data_low, data_high) < 0)
+                       continue;
+               vmx->guest_msrs[j].index = i;
+               vmx->guest_msrs[j].data = 0;
+               vmx->guest_msrs[j].mask = -1ull;
+               ++vmx->nmsrs;
+       }
+#endif
+
+       vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
+
+       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+       vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+
+       vmcs_writel(CR0_GUEST_HOST_MASK, ~0ul);
+       vmcs_writel(CR4_GUEST_HOST_MASK, ~0ul);
+
+       //kvm_write_tsc(&vmx->vcpu, 0);
+       vmcs_writel(TSC_OFFSET, 0);
+
+       vmx_setup_constant_host_state();
+}
+
+/**
+ * vmx_allocate_vpid - reserves a vpid and sets it in the VCPU
+ * @vmx: the VCPU
+ */
+static int vmx_allocate_vpid(struct vmx_vcpu *vmx)
+{
+       int vpid;
+
+       vmx->vpid = 0;
+
+       spin_lock(&vmx_vpid_lock);
+       vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+       if (vpid < VMX_NR_VPIDS) {
+               vmx->vpid = vpid;
+               __set_bit(vpid, vmx_vpid_bitmap);
+       }
+       spin_unlock(&vmx_vpid_lock);
+
+       return vpid >= VMX_NR_VPIDS;
+}
+
+/**
+ * vmx_free_vpid - frees a vpid
+ * @vmx: the VCPU
+ */
+static void vmx_free_vpid(struct vmx_vcpu *vmx)
+{
+       spin_lock(&vmx_vpid_lock);
+       if (vmx->vpid != 0)
+               __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+       spin_unlock(&vmx_vpid_lock);
+}
+
+/**
+ * vmx_create_vcpu - allocates and initializes a new virtual cpu
+ *
+ * Returns: A new VCPU structure
+ */
+struct vmx_vcpu *vmx_create_vcpu(void)
+{
+       struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
+       if (!vcpu) {
+               return NULL;
+       }
+
+       memset(vcpu, 0, sizeof(*vcpu));
+
+       vcpu->vmcs = vmx_alloc_vmcs();
+       printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
+       if (!vcpu->vmcs)
+               goto fail_vmcs;
+
+       if (vmx_allocate_vpid(vcpu))
+               goto fail_vpid;
+
+       printd("%d: vmx_create_vcpu: vpid %d\n", core_id(), vcpu->vpid);
+       vcpu->cpu = -1;
+
+       vmx_get_cpu(vcpu);
+       vmx_setup_vmcs(vcpu);
+       vmx_setup_initial_guest_state();
+       vmx_put_cpu(vcpu);
+
+#if 0
+       if (cpu_has_vmx_ept_ad_bits()) {
+               vcpu->ept_ad_enabled = true;
+               printk("vmx: enabled EPT A/D bits");
+       }
+       if (vmx_create_ept(vcpu->gv))
+               goto fail_ept;
+#endif
+
+       return vcpu;
+
+fail_ept:
+       vmx_free_vpid(vcpu);
+fail_vpid:
+       vmx_free_vmcs(vcpu->vmcs);
+fail_vmcs:
+       kfree(vcpu);
+       return NULL;
+}
+
+/**
+ * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
+ * @vcpu: the VCPU to destroy
+ */
+void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
+{
+       // needs to be done when we tear down the gv. vmx_destroy_ept(vcpu->gv);
+       vmx_get_cpu(vcpu);
+//     ept_sync_context
+       vmcs_clear(vcpu->vmcs);
+       currentcpu->local_vcpu = NULL;
+       vmx_put_cpu(vcpu);
+       vmx_free_vpid(vcpu);
+       vmx_free_vmcs(vcpu->vmcs);
+       kfree(vcpu);
+}
+
+/**
+ * vmx_task_vcpu - returns a pointer to the proc's vcpu or NULL.
+ * @p: the process
+ */
+static inline struct vmx_vcpu *vmx_task_vcpu(struct proc *p)
+{
+       struct dune_struct *dune = p->virtinfo;
+       return dune ? dune->vcpu : NULL;
+}
+
+/**
+ * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
+ *
+ * In the contexts where this is used the vcpu pointer should never be NULL.
+ */
+static inline struct vmx_vcpu *vmx_current_vcpu(void)
+{
+       struct vmx_vcpu *vcpu = vmx_task_vcpu(current);
+       if (!vcpu)
+               panic("%s: core_id %d: no vcpu", __func__, core_id());
+       return vcpu;
+}
+
+
+/**
+ * vmx_run_vcpu - launches the CPU into non-root mode
+ * We ONLY support 64-bit guests.
+ * @vcpu: the vmx instance to launch
+ */
+static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
+{
+       asm(
+               /* Store host registers */
+               "push %%rdx; push %%rbp;"
+               "push %%rcx \n\t" /* placeholder for guest rcx */
+               "push %%rcx \n\t"
+               "cmp %%rsp, %c[host_rsp](%0) \n\t"
+               "je 1f \n\t"
+               "mov %%rsp, %c[host_rsp](%0) \n\t"
+               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
+               "1: \n\t"
+               /* Reload cr2 if changed */
+               "mov %c[cr2](%0), %%rax \n\t"
+               "mov %%cr2, %%rdx \n\t"
+               "cmp %%rax, %%rdx \n\t"
+               "je 2f \n\t"
+               "mov %%rax, %%cr2 \n\t"
+               "2: \n\t"
+               /* Check if vmlaunch or vmresume is needed */
+               "cmpl $0, %c[launched](%0) \n\t"
+               /* Load guest registers.  Don't clobber flags. */
+               "mov %c[rax](%0), %%rax \n\t"
+               "mov %c[rbx](%0), %%rbx \n\t"
+               "mov %c[rdx](%0), %%rdx \n\t"
+               "mov %c[rsi](%0), %%rsi \n\t"
+               "mov %c[rdi](%0), %%rdi \n\t"
+               "mov %c[rbp](%0), %%rbp \n\t"
+               "mov %c[r8](%0),  %%r8  \n\t"
+               "mov %c[r9](%0),  %%r9  \n\t"
+               "mov %c[r10](%0), %%r10 \n\t"
+               "mov %c[r11](%0), %%r11 \n\t"
+               "mov %c[r12](%0), %%r12 \n\t"
+               "mov %c[r13](%0), %%r13 \n\t"
+               "mov %c[r14](%0), %%r14 \n\t"
+               "mov %c[r15](%0), %%r15 \n\t"
+               "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */
+
+               /* Enter guest mode */
+               "jne .Llaunched \n\t"
+               ASM_VMX_VMLAUNCH "\n\t"
+               "jmp .Lkvm_vmx_return \n\t"
+               ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
+               ".Lkvm_vmx_return: "
+               /* Save guest registers, load host registers, keep flags */
+               "mov %0, %c[wordsize](%%rsp) \n\t"
+               "pop %0 \n\t"
+               "mov %%rax, %c[rax](%0) \n\t"
+               "mov %%rbx, %c[rbx](%0) \n\t"
+               "popq %c[rcx](%0) \n\t"
+               "mov %%rdx, %c[rdx](%0) \n\t"
+               "mov %%rsi, %c[rsi](%0) \n\t"
+               "mov %%rdi, %c[rdi](%0) \n\t"
+               "mov %%rbp, %c[rbp](%0) \n\t"
+               "mov %%r8,  %c[r8](%0) \n\t"
+               "mov %%r9,  %c[r9](%0) \n\t"
+               "mov %%r10, %c[r10](%0) \n\t"
+               "mov %%r11, %c[r11](%0) \n\t"
+               "mov %%r12, %c[r12](%0) \n\t"
+               "mov %%r13, %c[r13](%0) \n\t"
+               "mov %%r14, %c[r14](%0) \n\t"
+               "mov %%r15, %c[r15](%0) \n\t"
+               "mov %%rax, %%r10 \n\t"
+               "mov %%rdx, %%r11 \n\t"
+
+               "mov %%cr2, %%rax   \n\t"
+               "mov %%rax, %c[cr2](%0) \n\t"
+
+               "pop  %%rbp; pop  %%rdx \n\t"
+               "setbe %c[fail](%0) \n\t"
+
+               "mov $" /*__stringify(GD_UD) */"16"", %%rax \n\t"
+               "mov %%rax, %%ds \n\t"
+               "mov %%rax, %%es \n\t"
+             : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
+               [launched]"i"(offsetof(struct vmx_vcpu, launched)),
+               [fail]"i"(offsetof(struct vmx_vcpu, fail)),
+               [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
+               [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
+               [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
+               [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
+               [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
+               [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
+               [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
+               [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
+               [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
+               [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
+               [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
+               [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
+               [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
+               [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
+               [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
+               [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
+               [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
+               [wordsize]"i"(sizeof(unsigned long))
+             : "cc", "memory"
+               , "rax", "rbx", "rdi", "rsi"
+               , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+       );
+
+       vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
+       vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
+       printk("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
+              vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
+       /* FIXME: do we need to set up other flags? */
+       vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
+                     X86_EFLAGS_IF | 0x2;
+       //monitor(NULL);
+
+       vcpu->regs.tf_cs = GD_UT;
+       vcpu->regs.tf_ss = GD_UD;
+
+       vcpu->launched = 1;
+
+       if (vcpu->fail) {
+               printk("failure detected (err %x)\n",
+                      vmcs_read32(VM_INSTRUCTION_ERROR));
+               return VMX_EXIT_REASONS_FAILED_VMENTRY;
+       }
+
+       return vmcs_read32(VM_EXIT_REASON);
+
+#if 0
+       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       vmx_complete_atomic_exit(vmx);
+       vmx_recover_nmi_blocking(vmx);
+       vmx_complete_interrupts(vmx);
+#endif
+}
+
+static void vmx_step_instruction(void)
+{
+       vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
+                              vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+}
+
+static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
+{
+       unsigned long gva, gpa;
+       int exit_qual, ret = -1;
+       page_t *page;
+
+       vmx_get_cpu(vcpu);
+       exit_qual = vmcs_read32(EXIT_QUALIFICATION);
+       gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
+       gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+       printk("ept: gva %016lx, gpa %016lx\n", gva, gpa);
+
+       vmx_put_cpu(vcpu);
+
+       // this is a total hack, for testing things.
+       // note that we only care about the gpa, and the
+       // gpa is our process virtual address. 
+       // Confused yet?
+       page = page_lookup(current->env_pgdir, (void *)gpa, NULL);
+       printk("Lookup %p returns %p\n", gpa, page);
+       if (page) {
+               uint64_t hpa = page2pa(page);
+               printk("hpa for %p is %p\n", gpa, hpa);
+               ret = vmx_do_ept_fault(ept, gpa, hpa, exit_qual);
+               printk("vmx_do_ept_fault returns %d\n", ret);
+       }
+
+       if (ret) {
+               printk("page fault failure "
+                      "GPA: 0x%lx, GVA: 0x%lx\n",
+                      gpa, gva);
+               vmx_dump_cpu(vcpu);
+       }
+
+       return ret;
+}
+
+static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
+{
+       unsigned int eax, ebx, ecx, edx;
+
+       eax = vcpu->regs.tf_rax;
+       ecx = vcpu->regs.tf_rcx;
+       /* Pass the guest's requested leaf/subleaf through to the host CPUID. */
+       cpuid(eax, ecx, &eax, &ebx, &ecx, &edx);
+       vcpu->regs.tf_rax = eax;
+       vcpu->regs.tf_rbx = ebx;
+       vcpu->regs.tf_rcx = ecx;
+       vcpu->regs.tf_rdx = edx;
+}
+
+static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
+{
+       uint32_t intr_info;
+
+       vmx_get_cpu(vcpu);
+       intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       vmx_put_cpu(vcpu);
+
+       printk("vmx (VPID %d): got an exception\n", vcpu->vpid);
+       printk("vmx (VPID %d): pid %d\n", vcpu->vpid,
+                        current->pid);
+       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
+               return 0;
+       }
+
+       printk("unhandled nmi, intr_info %x\n", intr_info);
+       return -EIO;
+}
+
+
+static void noop(void) {
+       __asm__ __volatile__ ("1: jmp 1b");
+}
+
+static void fail(void) {
+       __asm__ __volatile__ ("movq $0xdeadbeef, %rbx; movq 0, %rax");
+}
+
+static unsigned long stack[512];
+/**
+ * vmx_launch - the main loop for a VMX Dune process
+ * @conf: the launch configuration
+ */
+int vmx_launch(struct dune_config *conf)
+{
+       int ret;
+       struct dune_struct dune;
+       struct vmx_vcpu *vcpu;
+       int i = 0;
+       unsigned long rip = conf->rip;
+       unsigned long rsp = conf->rsp;
+       unsigned long cr3 = conf->cr3;
+       int errors = 0;
+
+       if (conf->rip < 4096 ) {
+               // testing.
+               switch(conf->rip) {
+               default:
+                       rip = (uint64_t)noop + 4;
+                       break;
+               case 1:
+                       rip = (uint64_t)fail + 4;
+                       break;
+               }
+       }
+
+       if (conf->cr3 == 0) {
+               cr3 = rcr3();
+       }
+
+       /* sanity checking.  -- later
+       ret = ept_check_page(ept, rip);
+       if (ret) {
+               printk("0x%x is not mapped in the ept!\n", rip);
+               errors++;
+       }
+       ret = ept_check_page(ept, rsp);
+       if (ret) {
+               printk("0x%x is not mapped in the ept!\n", rsp);
+               errors++;
+       }
+       */
+       if (errors) {
+               return -EINVAL;
+       }
+
+
+       printk("RUNNING: %s: rip %p rsp %p cr3 %p \n",
+              __func__, rip, rsp, cr3);
+       vcpu = vmx_create_vcpu();
+       if (!vcpu) {
+               return -ENOMEM;
+       }
+
+       vmx_get_cpu(vcpu);
+       vmcs_writel(GUEST_RIP, rip);
+       vmcs_writel(GUEST_RSP, rsp);
+       vmcs_writel(GUEST_CR3, cr3);
+       vmx_put_cpu(vcpu);
+
+       printk("created VCPU (VPID %d): pid %d\n",
+              vcpu->vpid, current->pid);
+
+       vcpu->ret_code = -1;
+
+       if (current->virtinfo)
+               printk("vmx_launch: current->virtinfo is NOT NULL (%p)\n", current->virtinfo);
+       //WARN_ON(current->virtinfo != NULL);
+       dune.vcpu = vcpu;
+
+       current->virtinfo = &dune;
+
+       while (1) {
+               vmx_get_cpu(vcpu);
+
+               // TODO: manage the fpu when we restart.
+
+               // TODO: see if we need to exit before we go much further.
+               disable_irq();
+               ret = vmx_run_vcpu(vcpu);
+               enable_irq();
+
+               if (ret == EXIT_REASON_VMCALL ||
+                   ret == EXIT_REASON_CPUID) {
+                       vmx_step_instruction();
+               }
+
+               vmx_put_cpu(vcpu);
+
+               if (ret == EXIT_REASON_VMCALL) {
+                       printk("system call! WTF\n");
+               } else if (ret == EXIT_REASON_CPUID)
+                       vmx_handle_cpuid(vcpu);
+               else if (ret == EXIT_REASON_EPT_VIOLATION) {
+                       if (vmx_handle_ept_violation(vcpu))
+                               vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
+               } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
+                       if (vmx_handle_nmi_exception(vcpu))
+                               vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
+               } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
+                       printk("External interrupt\n");
+               } else {
+                       printk("unhandled exit: reason %x, exit qualification %x\n",
+                              ret, vmcs_read32(EXIT_QUALIFICATION));
+                       vmx_dump_cpu(vcpu);
+                       vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
+               }
+
+               /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
+                * similar to how proc_restartcore/smp_idle only restart the pcpui
+                * cur_ctx, we need to do the same, via the VMCS resume business. */
+
+               if (vcpu->shutdown)
+                       break;
+       }
+
+       printk("RETURN. ip %016lx sp %016lx\n",
+               vcpu->regs.tf_rip, vcpu->regs.tf_rsp);
+       monitor(NULL);
+       current->virtinfo = NULL;
+
+       /*
+        * Return both the reason for the shutdown and a status value.
+        * The exit() and exit_group() system calls only need 8 bits for
+        * the status but we allow 16 bits in case we might want to
+        * return more information for one of the other shutdown reasons.
+        */
+       ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);
+
+       printk("destroying VCPU (VPID %d): pid %d\n",
+                       vcpu->vpid, current->pid);
+
+       vmx_destroy_vcpu(vcpu);
+
+       return ret;
+}
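+
+/* Editor's sketch of how a caller might unpack the value vmx_launch() returns,
+ * given the encoding above (shutdown reason in the high 16 bits, status in the
+ * low 16 bits):
+ *
+ *     int r = vmx_launch(conf);
+ *     int reason = r >> 16;     // e.g. SHUTDOWN_EPT_VIOLATION
+ *     int status = r & 0xffff;  // e.g. the guest's exit status
+ */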
+
+/**
+ * __vmx_enable - low-level enable of VMX mode on the current CPU
+ * @vmxon_buf: an opaque buffer for use as the VMXON region
+ */
+static int __vmx_enable(struct vmcs *vmxon_buf)
+{
+       uint64_t phys_addr = PADDR(vmxon_buf);
+       uint64_t old, test_bits;
+
+       if (rcr4() & X86_CR4_VMXE) {
+               panic("Should never have this happen");
+               return -EBUSY;
+       }
+
+       rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+
+       test_bits = FEATURE_CONTROL_LOCKED;
+       test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+
+       if (0) // tboot_enabled())
+               test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+       if ((old & test_bits) != test_bits) {
+               /* If it's locked, then trying to set it will cause a GPF.
+                * No Dune for you!
+                */
+               if (old & FEATURE_CONTROL_LOCKED) {
+                       printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
+                       return -1;
+               }
+
+               /* enable and lock */
+               write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+       }
+       lcr4(rcr4() | X86_CR4_VMXE);
+
+       __vmxon(phys_addr);
+       vpid_sync_vcpu_global();
+       ept_sync_global();
+
+       return 0;
+}
+
+/**
+ * vmx_enable - enables VMX mode on the current CPU
+ *
+ * Sets up the state needed to enable VMX (e.g. a scratchpad region for VMXON).
+ */
+static void vmx_enable(void)
+{
+       struct vmcs *vmxon_buf = currentcpu->vmxarea;
+       int ret;
+
+       ret = __vmx_enable(vmxon_buf);
+       if (ret)
+               goto failed;
+
+       currentcpu->vmx_enabled = 1;
+       // TODO: do we need this?
+       store_gdt(&currentcpu->host_gdt);
+
+       printk("VMX enabled on CPU %d\n", core_id());
+       return;
+
+failed:
+       has_vmx = FALSE;
+       printk("failed to enable VMX on core %d, err = %d\n", core_id(), ret);
+}
+
+/**
+ * vmx_disable - disables VMX mode on the current CPU
+ */
+static void vmx_disable(void *unused)
+{
+       if (currentcpu->vmx_enabled) {
+               __vmxoff();
+               lcr4(rcr4() & ~X86_CR4_VMXE);
+               currentcpu->vmx_enabled = 0;
+       }
+}
+
+/* Probe the CPU to see whether it supports vmx.
+ * Returns TRUE if it does, FALSE if it does not.
+ */
+static bool probe_cpu_vmx(void)
+{
+       /* The best way to test this code is:
+        * wrmsr -p <cpu> 0x3a 1
+        * This will lock vmx off; then modprobe dune.
+        * Frequently, however, systems have all 0x3a registers set to 5,
+        * meaning testing is impossible, as vmx cannot be disabled.
+        * We have to simulate it being unavailable in most cases.
+        * The 'test' variable provides an easy way to simulate
+        * unavailability of vmx on some, none, or all cpus.
+        */
+       if (!cpu_has_vmx()) {
+               printk("Machine does not support VT-x\n");
+               return FALSE;
+       } else {
+               printk("Machine has vmx\n");
+               return TRUE;
+       }
+}
+
+static void setup_vmxarea(void)
+{
+       struct vmcs *vmxon_buf;
+
+       printd("Set up vmxarea for cpu %d\n", core_id());
+       vmxon_buf = __vmx_alloc_vmcs(node_id());
+       if (!vmxon_buf) {
+               printk("setup_vmxarea failed on core %d\n", core_id());
+               return;
+       }
+       currentcpu->vmxarea = vmxon_buf;
+}
+
+/**
+ * intel_vmm_init sets up the physical-core data areas that are required to run a
+ * VM at all. These data areas are not connected to a specific user process in any
+ * way. Instead, they are in some sense externalizing what would otherwise be a
+ * very large ball of state kept inside the CPU.
+ */
+int intel_vmm_init(void)
+{
+       int ret;
+
+       if (!probe_cpu_vmx()) {
+               printk("CPU does not have VMX\n");
+               return -EOPNOTSUPP;
+       }
+
+       setup_vmcs_config(&ret);
+
+       if (ret) {
+               printk("setup_vmcs_config failed: %d\n", ret);
+               return ret;
+       }
+
+       msr_bitmap = (unsigned long *)kpage_zalloc_addr();
+       if (!msr_bitmap) {
+               printk("Could not allocate msr_bitmap\n");
+               return -ENOMEM;
+       }
+       /* FIXME: do we need APIC virtualization (flexpriority?) */
+
+       memset(msr_bitmap, 0xff, PAGE_SIZE);
+       __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
+       __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
+
+       set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
+       /* TEMPORARY hack so we can do some basic VM testing:
+        * create an EPT and look for faults on it. */
+       ept = kpage_zalloc_addr();
+       eptp = construct_eptp(PADDR(ept));
+       printk("ept is %p and eptp is %p\n", ept, eptp);
+       return ret;
+}
+
+int intel_vmm_pcpu_init(void)
+{
+       setup_vmxarea();
+       vmx_enable();
+       return 0;
+}
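+
+/* Editor's sketch of the assumed bring-up order, based on the two entry points
+ * above: intel_vmm_init() runs once to probe VMX and build the vmcs_config, the
+ * MSR bitmap, and the test EPT; intel_vmm_pcpu_init() then runs on each core to
+ * allocate its VMXON region and enter VMX root mode:
+ *
+ *     if (intel_vmm_init() == 0) {
+ *             // on every core, from per-cpu init:
+ *             intel_vmm_pcpu_init();
+ *     }
+ */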
index a87fbe6..536098d 100644 (file)
-/*-
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
+#ifndef VMX_H
+#define VMX_H
+
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
  *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * A few random additions are:
+ * Copyright (C) 2006 Qumranet
+ *    Avi Kivity <avi@qumranet.com>
+ *    Yaniv Kamay <yaniv@qumranet.com>
  *
- * $FreeBSD$
  */
 
-#ifndef _VMX_H_
-#define        _VMX_H_
-
-struct vmxctx {
-       uint64_t guest_rdi;             /* Guest state */
-       uint64_t guest_rsi;
-       uint64_t guest_rdx;
-       uint64_t guest_rcx;
-       uint64_t guest_r8;
-       uint64_t guest_r9;
-       uint64_t guest_rax;
-       uint64_t guest_rbx;
-       uint64_t guest_rbp;
-       uint64_t guest_r10;
-       uint64_t guest_r11;
-       uint64_t guest_r12;
-       uint64_t guest_r13;
-       uint64_t guest_r14;
-       uint64_t guest_r15;
-       uint64_t guest_cr2;
-
-       uint64_t host_r15;              /* Host state */
-       uint64_t host_r14;
-       uint64_t host_r13;
-       uint64_t host_r12;
-       uint64_t host_rbp;
-       uint64_t host_rsp;
-       uint64_t host_rbx;
-       /*
-        * XXX todo debug registers and fpu state
-        */
-
-       int inst_fail_status;
-
-       /*
-        * The pmap needs to be deactivated in vmx_enter_guest()
-        * so keep a copy of the 'pmap' in each vmxctx.
-       struct pmap *pmap;
-        */
-       // For Akaros. The pmap did not apply directly, but struct proc * is right.
-       struct proc *p;
+#define CPU_BASED_VIRTUAL_INTR_PENDING  0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING     0x00000008
+#define CPU_BASED_HLT_EXITING           0x00000080
+#define CPU_BASED_INVDPG_EXITING        0x00000200
+#define CPU_BASED_MWAIT_EXITING         0x00000400
+#define CPU_BASED_RDPMC_EXITING         0x00000800
+#define CPU_BASED_RDTSC_EXITING         0x00001000
+#define CPU_BASED_CR8_LOAD_EXITING      0x00080000
+#define CPU_BASED_CR8_STORE_EXITING     0x00100000
+#define CPU_BASED_TPR_SHADOW            0x00200000
+#define CPU_BASED_MOV_DR_EXITING        0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING     0x01000000
+#define CPU_BASED_ACTIVATE_IO_BITMAP    0x02000000
+#define CPU_BASED_MSR_BITMAPS           0x10000000
+#define CPU_BASED_MONITOR_EXITING       0x20000000
+#define CPU_BASED_PAUSE_EXITING         0x40000000
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
+#define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING             0x00000008
+#define CPU_BASED_HLT_EXITING                   0x00000080
+#define CPU_BASED_INVLPG_EXITING                0x00000200
+#define CPU_BASED_MWAIT_EXITING                 0x00000400
+#define CPU_BASED_RDPMC_EXITING                 0x00000800
+#define CPU_BASED_RDTSC_EXITING                 0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING             0x00008000
+#define CPU_BASED_CR3_STORE_EXITING            0x00010000
+#define CPU_BASED_CR8_LOAD_EXITING              0x00080000
+#define CPU_BASED_CR8_STORE_EXITING             0x00100000
+#define CPU_BASED_TPR_SHADOW                    0x00200000
+#define CPU_BASED_VIRTUAL_NMI_PENDING          0x00400000
+#define CPU_BASED_MOV_DR_EXITING                0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING             0x01000000
+#define CPU_BASED_USE_IO_BITMAPS                0x02000000
+#define CPU_BASED_USE_MSR_BITMAPS               0x10000000
+#define CPU_BASED_MONITOR_EXITING               0x20000000
+#define CPU_BASED_PAUSE_EXITING                 0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT               0x00000002
+#define SECONDARY_EXEC_RDTSCP                  0x00000008
+#define SECONDARY_EXEC_ENABLE_VPID              0x00000020
+#define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST      0x00000080
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING      0x00000400
+#define SECONDARY_EXEC_ENABLE_INVPCID          0x00001000
+
+
+#define PIN_BASED_EXT_INTR_MASK                 0x00000001
+#define PIN_BASED_NMI_EXITING                   0x00000008
+#define PIN_BASED_VIRTUAL_NMIS                  0x00000020
+
+#define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL      0x00001000
+#define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
+#define VM_EXIT_SAVE_IA32_PAT                  0x00040000
+#define VM_EXIT_LOAD_IA32_PAT                  0x00080000
+#define VM_EXIT_SAVE_IA32_EFER                  0x00100000
+#define VM_EXIT_LOAD_IA32_EFER                  0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000
+
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS            0x00000002
+#define VM_ENTRY_IA32E_MODE                     0x00000200
+#define VM_ENTRY_SMM                            0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL     0x00002000
+#define VM_ENTRY_LOAD_IA32_PAT                 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER                 0x00008000
+
+/* VMCS Encodings */
+enum vmcs_field {
+       VIRTUAL_PROCESSOR_ID            = 0x00000000,
+       GUEST_ES_SELECTOR               = 0x00000800,
+       GUEST_CS_SELECTOR               = 0x00000802,
+       GUEST_SS_SELECTOR               = 0x00000804,
+       GUEST_DS_SELECTOR               = 0x00000806,
+       GUEST_FS_SELECTOR               = 0x00000808,
+       GUEST_GS_SELECTOR               = 0x0000080a,
+       GUEST_LDTR_SELECTOR             = 0x0000080c,
+       GUEST_TR_SELECTOR               = 0x0000080e,
+       HOST_ES_SELECTOR                = 0x00000c00,
+       HOST_CS_SELECTOR                = 0x00000c02,
+       HOST_SS_SELECTOR                = 0x00000c04,
+       HOST_DS_SELECTOR                = 0x00000c06,
+       HOST_FS_SELECTOR                = 0x00000c08,
+       HOST_GS_SELECTOR                = 0x00000c0a,
+       HOST_TR_SELECTOR                = 0x00000c0c,
+       IO_BITMAP_A                     = 0x00002000,
+       IO_BITMAP_A_HIGH                = 0x00002001,
+       IO_BITMAP_B                     = 0x00002002,
+       IO_BITMAP_B_HIGH                = 0x00002003,
+       MSR_BITMAP                      = 0x00002004,
+       MSR_BITMAP_HIGH                 = 0x00002005,
+       VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
+       VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
+       VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
+       VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
+       VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
+       VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
+       TSC_OFFSET                      = 0x00002010,
+       TSC_OFFSET_HIGH                 = 0x00002011,
+       VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
+       VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
+       APIC_ACCESS_ADDR                = 0x00002014,
+       APIC_ACCESS_ADDR_HIGH           = 0x00002015,
+       EPT_POINTER                     = 0x0000201a,
+       EPT_POINTER_HIGH                = 0x0000201b,
+       GUEST_PHYSICAL_ADDRESS          = 0x00002400,
+       GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
+       VMCS_LINK_POINTER               = 0x00002800,
+       VMCS_LINK_POINTER_HIGH          = 0x00002801,
+       GUEST_IA32_DEBUGCTL             = 0x00002802,
+       GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+       GUEST_IA32_PAT                  = 0x00002804,
+       GUEST_IA32_PAT_HIGH             = 0x00002805,
+       GUEST_IA32_EFER                 = 0x00002806,
+       GUEST_IA32_EFER_HIGH            = 0x00002807,
+       GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
+       GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
+       GUEST_PDPTR0                    = 0x0000280a,
+       GUEST_PDPTR0_HIGH               = 0x0000280b,
+       GUEST_PDPTR1                    = 0x0000280c,
+       GUEST_PDPTR1_HIGH               = 0x0000280d,
+       GUEST_PDPTR2                    = 0x0000280e,
+       GUEST_PDPTR2_HIGH               = 0x0000280f,
+       GUEST_PDPTR3                    = 0x00002810,
+       GUEST_PDPTR3_HIGH               = 0x00002811,
+       HOST_IA32_PAT                   = 0x00002c00,
+       HOST_IA32_PAT_HIGH              = 0x00002c01,
+       HOST_IA32_EFER                  = 0x00002c02,
+       HOST_IA32_EFER_HIGH             = 0x00002c03,
+       HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
+       HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
+       PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
+       CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
+       EXCEPTION_BITMAP                = 0x00004004,
+       PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
+       PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
+       CR3_TARGET_COUNT                = 0x0000400a,
+       VM_EXIT_CONTROLS                = 0x0000400c,
+       VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
+       VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
+       VM_ENTRY_CONTROLS               = 0x00004012,
+       VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
+       VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
+       VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
+       VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
+       TPR_THRESHOLD                   = 0x0000401c,
+       SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
+       PLE_GAP                         = 0x00004020,
+       PLE_WINDOW                      = 0x00004022,
+       VM_INSTRUCTION_ERROR            = 0x00004400,
+       VM_EXIT_REASON                  = 0x00004402,
+       VM_EXIT_INTR_INFO               = 0x00004404,
+       VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
+       IDT_VECTORING_INFO_FIELD        = 0x00004408,
+       IDT_VECTORING_ERROR_CODE        = 0x0000440a,
+       VM_EXIT_INSTRUCTION_LEN         = 0x0000440c,
+       VMX_INSTRUCTION_INFO            = 0x0000440e,
+       GUEST_ES_LIMIT                  = 0x00004800,
+       GUEST_CS_LIMIT                  = 0x00004802,
+       GUEST_SS_LIMIT                  = 0x00004804,
+       GUEST_DS_LIMIT                  = 0x00004806,
+       GUEST_FS_LIMIT                  = 0x00004808,
+       GUEST_GS_LIMIT                  = 0x0000480a,
+       GUEST_LDTR_LIMIT                = 0x0000480c,
+       GUEST_TR_LIMIT                  = 0x0000480e,
+       GUEST_GDTR_LIMIT                = 0x00004810,
+       GUEST_IDTR_LIMIT                = 0x00004812,
+       GUEST_ES_AR_BYTES               = 0x00004814,
+       GUEST_CS_AR_BYTES               = 0x00004816,
+       GUEST_SS_AR_BYTES               = 0x00004818,
+       GUEST_DS_AR_BYTES               = 0x0000481a,
+       GUEST_FS_AR_BYTES               = 0x0000481c,
+       GUEST_GS_AR_BYTES               = 0x0000481e,
+       GUEST_LDTR_AR_BYTES             = 0x00004820,
+       GUEST_TR_AR_BYTES               = 0x00004822,
+       GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
+       GUEST_ACTIVITY_STATE            = 0x00004826,
+       GUEST_SYSENTER_CS               = 0x0000482A,
+       HOST_IA32_SYSENTER_CS           = 0x00004c00,
+       CR0_GUEST_HOST_MASK             = 0x00006000,
+       CR4_GUEST_HOST_MASK             = 0x00006002,
+       CR0_READ_SHADOW                 = 0x00006004,
+       CR4_READ_SHADOW                 = 0x00006006,
+       CR3_TARGET_VALUE0               = 0x00006008,
+       CR3_TARGET_VALUE1               = 0x0000600a,
+       CR3_TARGET_VALUE2               = 0x0000600c,
+       CR3_TARGET_VALUE3               = 0x0000600e,
+       EXIT_QUALIFICATION              = 0x00006400,
+       GUEST_LINEAR_ADDRESS            = 0x0000640a,
+       GUEST_CR0                       = 0x00006800,
+       GUEST_CR3                       = 0x00006802,
+       GUEST_CR4                       = 0x00006804,
+       GUEST_ES_BASE                   = 0x00006806,
+       GUEST_CS_BASE                   = 0x00006808,
+       GUEST_SS_BASE                   = 0x0000680a,
+       GUEST_DS_BASE                   = 0x0000680c,
+       GUEST_FS_BASE                   = 0x0000680e,
+       GUEST_GS_BASE                   = 0x00006810,
+       GUEST_LDTR_BASE                 = 0x00006812,
+       GUEST_TR_BASE                   = 0x00006814,
+       GUEST_GDTR_BASE                 = 0x00006816,
+       GUEST_IDTR_BASE                 = 0x00006818,
+       GUEST_DR7                       = 0x0000681a,
+       GUEST_RSP                       = 0x0000681c,
+       GUEST_RIP                       = 0x0000681e,
+       GUEST_RFLAGS                    = 0x00006820,
+       GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
+       GUEST_SYSENTER_ESP              = 0x00006824,
+       GUEST_SYSENTER_EIP              = 0x00006826,
+       HOST_CR0                        = 0x00006c00,
+       HOST_CR3                        = 0x00006c02,
+       HOST_CR4                        = 0x00006c04,
+       HOST_FS_BASE                    = 0x00006c06,
+       HOST_GS_BASE                    = 0x00006c08,
+       HOST_TR_BASE                    = 0x00006c0a,
+       HOST_GDTR_BASE                  = 0x00006c0c,
+       HOST_IDTR_BASE                  = 0x00006c0e,
+       HOST_IA32_SYSENTER_ESP          = 0x00006c10,
+       HOST_IA32_SYSENTER_EIP          = 0x00006c12,
+       HOST_RSP                        = 0x00006c14,
+       HOST_RIP                        = 0x00006c16,
 };
 
-struct vmxcap {
-       int set;
-       uint32_t proc_ctls;
-       uint32_t proc_ctls2;
+#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+
+#define EXIT_REASON_EXCEPTION_NMI       0
+#define EXIT_REASON_EXTERNAL_INTERRUPT  1
+#define EXIT_REASON_TRIPLE_FAULT        2
+
+#define EXIT_REASON_PENDING_INTERRUPT   7
+#define EXIT_REASON_NMI_WINDOW          8
+#define EXIT_REASON_TASK_SWITCH         9
+#define EXIT_REASON_CPUID               10
+#define EXIT_REASON_HLT                 12
+#define EXIT_REASON_INVD                13
+#define EXIT_REASON_INVLPG              14
+#define EXIT_REASON_RDPMC               15
+#define EXIT_REASON_RDTSC               16
+#define EXIT_REASON_VMCALL              18
+#define EXIT_REASON_VMCLEAR             19
+#define EXIT_REASON_VMLAUNCH            20
+#define EXIT_REASON_VMPTRLD             21
+#define EXIT_REASON_VMPTRST             22
+#define EXIT_REASON_VMREAD              23
+#define EXIT_REASON_VMRESUME            24
+#define EXIT_REASON_VMWRITE             25
+#define EXIT_REASON_VMOFF               26
+#define EXIT_REASON_VMON                27
+#define EXIT_REASON_CR_ACCESS           28
+#define EXIT_REASON_DR_ACCESS           29
+#define EXIT_REASON_IO_INSTRUCTION      30
+#define EXIT_REASON_MSR_READ            31
+#define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_INVALID_STATE       33
+#define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION   40
+#define EXIT_REASON_MCE_DURING_VMENTRY  41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_EPT_VIOLATION       48
+#define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_WBINVD              54
+#define EXIT_REASON_XSETBV              55
+#define EXIT_REASON_INVPCID             58
+
+#define VMX_EXIT_REASONS \
+       { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
+       { EXIT_REASON_EXTERNAL_INTERRUPT,    "EXTERNAL_INTERRUPT" }, \
+       { EXIT_REASON_TRIPLE_FAULT,          "TRIPLE_FAULT" }, \
+       { EXIT_REASON_PENDING_INTERRUPT,     "PENDING_INTERRUPT" }, \
+       { EXIT_REASON_NMI_WINDOW,            "NMI_WINDOW" }, \
+       { EXIT_REASON_TASK_SWITCH,           "TASK_SWITCH" }, \
+       { EXIT_REASON_CPUID,                 "CPUID" }, \
+       { EXIT_REASON_HLT,                   "HLT" }, \
+       { EXIT_REASON_INVLPG,                "INVLPG" }, \
+       { EXIT_REASON_RDPMC,                 "RDPMC" }, \
+       { EXIT_REASON_RDTSC,                 "RDTSC" }, \
+       { EXIT_REASON_VMCALL,                "VMCALL" }, \
+       { EXIT_REASON_VMCLEAR,               "VMCLEAR" }, \
+       { EXIT_REASON_VMLAUNCH,              "VMLAUNCH" }, \
+       { EXIT_REASON_VMPTRLD,               "VMPTRLD" }, \
+       { EXIT_REASON_VMPTRST,               "VMPTRST" }, \
+       { EXIT_REASON_VMREAD,                "VMREAD" }, \
+       { EXIT_REASON_VMRESUME,              "VMRESUME" }, \
+       { EXIT_REASON_VMWRITE,               "VMWRITE" }, \
+       { EXIT_REASON_VMOFF,                 "VMOFF" }, \
+       { EXIT_REASON_VMON,                  "VMON" }, \
+       { EXIT_REASON_CR_ACCESS,             "CR_ACCESS" }, \
+       { EXIT_REASON_DR_ACCESS,             "DR_ACCESS" }, \
+       { EXIT_REASON_IO_INSTRUCTION,        "IO_INSTRUCTION" }, \
+       { EXIT_REASON_MSR_READ,              "MSR_READ" }, \
+       { EXIT_REASON_MSR_WRITE,             "MSR_WRITE" }, \
+       { EXIT_REASON_MWAIT_INSTRUCTION,     "MWAIT_INSTRUCTION" }, \
+       { EXIT_REASON_MONITOR_INSTRUCTION,   "MONITOR_INSTRUCTION" }, \
+       { EXIT_REASON_PAUSE_INSTRUCTION,     "PAUSE_INSTRUCTION" }, \
+       { EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
+       { EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
+       { EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
+       { EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
+       { EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
+       { EXIT_REASON_WBINVD,                "WBINVD" }
+
+/*
+ * Interruption-information format
+ */
+#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
+#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
+#define INTR_INFO_DELIVER_CODE_MASK     0x800           /* 11 */
+#define INTR_INFO_UNBLOCK_NMI          0x1000          /* 12 */
+#define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
+#define INTR_INFO_RESVD_BITS_MASK       0x7ffff000
+
+#define VECTORING_INFO_VECTOR_MASK             INTR_INFO_VECTOR_MASK
+#define VECTORING_INFO_TYPE_MASK               INTR_INFO_INTR_TYPE_MASK
+#define VECTORING_INFO_DELIVER_CODE_MASK       INTR_INFO_DELIVER_CODE_MASK
+#define VECTORING_INFO_VALID_MASK              INTR_INFO_VALID_MASK
+
+#define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
+#define INTR_TYPE_NMI_INTR             (2 << 8) /* NMI */
+#define INTR_TYPE_HARD_EXCEPTION       (3 << 8) /* processor exception */
+#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
+#define INTR_TYPE_SOFT_EXCEPTION       (6 << 8) /* software exception */
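(Not part of the patch: a minimal sketch of how the interruption-information word decodes with the masks above; the helper names are illustrative.)

static inline int intr_info_vector(uint32_t intr_info)
{
        /* bits 7:0 hold the vector of the interrupt or exception */
        return intr_info & INTR_INFO_VECTOR_MASK;
}

static inline int intr_info_is_nmi(uint32_t intr_info)
{
        if (!(intr_info & INTR_INFO_VALID_MASK))
                return 0;
        return (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR;
}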
+
+/* GUEST_INTERRUPTIBILITY_INFO flags. */
+#define GUEST_INTR_STATE_STI           0x00000001
+#define GUEST_INTR_STATE_MOV_SS                0x00000002
+#define GUEST_INTR_STATE_SMI           0x00000004
+#define GUEST_INTR_STATE_NMI           0x00000008
+
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE          0
+#define GUEST_ACTIVITY_HLT             1
+#define GUEST_ACTIVITY_SHUTDOWN                2
+#define GUEST_ACTIVITY_WAIT_SIPI       3
+
+/*
+ * Exit Qualifications for MOV for Control Register Access
+ */
+#define CONTROL_REG_ACCESS_NUM          0x7    /* 2:0, number of control register */
+#define CONTROL_REG_ACCESS_TYPE         0x30   /* 5:4, access type */
+#define CONTROL_REG_ACCESS_REG          0xf00  /* 11:8, general purpose register */
+#define LMSW_SOURCE_DATA_SHIFT 16
+#define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT)   /* 16:31 lmsw source */
+#define REG_EAX                         (0 << 8)
+#define REG_ECX                         (1 << 8)
+#define REG_EDX                         (2 << 8)
+#define REG_EBX                         (3 << 8)
+#define REG_ESP                         (4 << 8)
+#define REG_EBP                         (5 << 8)
+#define REG_ESI                         (6 << 8)
+#define REG_EDI                         (7 << 8)
+#define REG_R8                         (8 << 8)
+#define REG_R9                         (9 << 8)
+#define REG_R10                        (10 << 8)
+#define REG_R11                        (11 << 8)
+#define REG_R12                        (12 << 8)
+#define REG_R13                        (13 << 8)
+#define REG_R14                        (14 << 8)
+#define REG_R15                        (15 << 8)
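(Sketch, not in the patch: decoding a MOV-CR exit qualification with the masks above. The CR number sits in the low bits, and the GPR index lines up with the REG_* encodings once shifted down; the helper names are illustrative.)

static inline int cr_exit_cr_num(unsigned long qual)
{
        return qual & CONTROL_REG_ACCESS_NUM;          /* which CRn was touched */
}

static inline int cr_exit_access_type(unsigned long qual)
{
        /* 0 = mov to CR, 1 = mov from CR, 2 = CLTS, 3 = LMSW */
        return (qual & CONTROL_REG_ACCESS_TYPE) >> 4;
}

static inline int cr_exit_gpr(unsigned long qual)
{
        return (qual & CONTROL_REG_ACCESS_REG) >> 8;   /* index of the GPR operand */
}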
+
+/*
+ * Exit Qualifications for MOV for Debug Register Access
+ */
+#define DEBUG_REG_ACCESS_NUM            0x7    /* 2:0, number of debug register */
+#define DEBUG_REG_ACCESS_TYPE           0x10   /* 4, direction of access */
+#define TYPE_MOV_TO_DR                  (0 << 4)
+#define TYPE_MOV_FROM_DR                (1 << 4)
+#define DEBUG_REG_ACCESS_REG(eq)        (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
+
+
+/*
+ * Exit Qualifications for APIC-Access
+ */
+#define APIC_ACCESS_OFFSET              0xfff   /* 11:0, offset within the APIC page */
+#define APIC_ACCESS_TYPE                0xf000  /* 15:12, access type */
+#define TYPE_LINEAR_APIC_INST_READ      (0 << 12)
+#define TYPE_LINEAR_APIC_INST_WRITE     (1 << 12)
+#define TYPE_LINEAR_APIC_INST_FETCH     (2 << 12)
+#define TYPE_LINEAR_APIC_EVENT          (3 << 12)
+#define TYPE_PHYSICAL_APIC_EVENT        (10 << 12)
+#define TYPE_PHYSICAL_APIC_INST         (15 << 12)
+
+/* segment AR */
+#define SEGMENT_AR_L_MASK (1 << 13)
+
+/* entry controls */
+#define VM_ENTRY_CONTROLS_IA32E_MASK (1 << 9)
+
+#define AR_TYPE_ACCESSES_MASK 1
+#define AR_TYPE_READABLE_MASK (1 << 1)
+#define AR_TYPE_WRITEABLE_MASK (1 << 2)
+#define AR_TYPE_CODE_MASK (1 << 3)
+#define AR_TYPE_MASK 0x0f
+#define AR_TYPE_BUSY_64_TSS 11
+#define AR_TYPE_BUSY_32_TSS 11
+#define AR_TYPE_BUSY_16_TSS 3
+#define AR_TYPE_LDT 2
+
+#define AR_UNUSABLE_MASK (1 << 16)
+#define AR_S_MASK (1 << 4)
+#define AR_P_MASK (1 << 7)
+#define AR_L_MASK (1 << 13)
+#define AR_DB_MASK (1 << 14)
+#define AR_G_MASK (1 << 15)
+#define AR_DPL_SHIFT 5
+#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
+
+#define AR_RESERVD_MASK 0xfffe0f00
+
+#define TSS_PRIVATE_MEMSLOT                    (KVM_MEMORY_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       (KVM_MEMORY_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT     (KVM_MEMORY_SLOTS + 2)
+
+#define VMX_NR_VPIDS                           (1 << 16)
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT         1
+#define VMX_VPID_EXTENT_ALL_CONTEXT            2
+
+#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR         0
+#define VMX_EPT_EXTENT_CONTEXT                 1
+#define VMX_EPT_EXTENT_GLOBAL                  2
+
+#define VMX_EPT_EXECUTE_ONLY_BIT               (1ull)
+#define VMX_EPT_PAGE_WALK_4_BIT                        (1ull << 6)
+#define VMX_EPTP_UC_BIT                                (1ull << 8)
+#define VMX_EPTP_WB_BIT                                (1ull << 14)
+#define VMX_EPT_2MB_PAGE_BIT                   (1ull << 16)
+#define VMX_EPT_1GB_PAGE_BIT                   (1ull << 17)
+#define VMX_EPT_AD_BIT                             (1ull << 21)
+#define VMX_EPT_EXTENT_CONTEXT_BIT             (1ull << 25)
+#define VMX_EPT_EXTENT_GLOBAL_BIT              (1ull << 26)
+
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT      (1ull << 10) /* (42 - 32) */
+
+#define VMX_EPT_DEFAULT_GAW                    3
+#define VMX_EPT_MAX_GAW                                0x4
+#define VMX_EPT_MT_EPTE_SHIFT                  3
+#define VMX_EPT_GAW_EPTP_SHIFT                 3
+#define VMX_EPT_AD_ENABLE_BIT                  (1ull << 6)
+#define VMX_EPT_DEFAULT_MT                     0x6ull
+#define VMX_EPT_READABLE_MASK                  0x1ull
+#define VMX_EPT_WRITABLE_MASK                  0x2ull
+#define VMX_EPT_EXECUTABLE_MASK                        0x4ull
+#define VMX_EPT_IPAT_BIT                       (1ull << 6)
+#define VMX_EPT_ACCESS_BIT                             (1ull << 8)
+#define VMX_EPT_DIRTY_BIT                              (1ull << 9)
+
+#define VMX_EPT_IDENTITY_PAGETABLE_ADDR                0xfffbc000ul
+
+
+#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT           ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID                  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
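The ASM_VMX_* strings are raw byte encodings of the VMX instructions, useful when the assembler is too old to know them. Not part of the patch, but for illustration, the usual VMCS accessors built on top of them look roughly like this (error checking via RFLAGS omitted):

static inline unsigned long vmcs_readl(unsigned long field)
{
        unsigned long value;

        /* vmread: field encoding in %rdx, result in %rax */
        asm volatile (ASM_VMX_VMREAD_RDX_RAX
                      : "=a" (value) : "d" (field) : "cc");
        return value;
}

static inline void vmcs_writel(unsigned long field, unsigned long value)
{
        /* vmwrite: value in %rax, field encoding in %rdx */
        asm volatile (ASM_VMX_VMWRITE_RAX_RDX
                      : : "a" (value), "d" (field) : "cc");
}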
+
+struct vmx_msr_entry {
+       uint32_t index;
+       uint32_t reserved;
+       uint64_t value;
+} __attribute__((aligned(16)));
+
+/*
+ * Exit Qualifications for entry failure during or after loading guest state
+ */
+#define ENTRY_FAIL_DEFAULT             0
+#define ENTRY_FAIL_PDPTE               2
+#define ENTRY_FAIL_NMI                 3
+#define ENTRY_FAIL_VMCS_LINK_PTR       4
+
+/*
+ * VM-instruction error numbers
+ */
+enum vm_instruction_error_number {
+       VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
+       VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
+       VMXERR_VMCLEAR_VMXON_POINTER = 3,
+       VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
+       VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
+       VMXERR_VMRESUME_AFTER_VMXOFF = 6,
+       VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
+       VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
+       VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
+       VMXERR_VMPTRLD_VMXON_POINTER = 10,
+       VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
+       VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
+       VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
+       VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
+       VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
+       VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
+       VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
+       VMXERR_VMCALL_NONCLEAR_VMCS = 19,
+       VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
+       VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
+       VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
+       VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
+       VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
+       VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
+       VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
 };
 
-struct vmxstate {
-       uint64_t nextrip;                       /* next instruction to be executed by guest */
-       int lastcpu;                            /* host cpu that this 'vcpu' last ran on */
-       uint16_t vpid;
+#define MSR_IA32_VMX_BASIC_MSR                 0x480
+#define MSR_IA32_VMX_PINBASED_CTLS_MSR         0x481
+#define MSR_IA32_VMX_PROCBASED_CTLS_MSR                0x482
+#define MSR_IA32_VMX_EXIT_CTLS_MSR             0x483
+#define MSR_IA32_VMX_ENTRY_CTLS_MSR            0x484
+
+int intel_vmm_init(void);
+int intel_vmm_pcpu_init(void);
+
+/* Additional bits for VMMCPs, originally from the Dune version of kvm. */
+/*
+ * vmx.h - header file for USM VMX driver.
+ */
+
+/* This is per-guest, per-core; the implementation-specific area should be
+ * assumed to have hidden fields.
+ */
+struct vmcs {
+       uint32_t revision_id;
+       uint32_t abort_code;
+       char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
 };
 
-// TODO: akaros: merge all our various apic structs. 
-struct apic_page {
-       uint32_t reg[PAGE_SIZE / 4];
+typedef uint64_t gpa_t;
+typedef uint64_t gva_t;
+#define rdmsrl(msr, val) (val) = read_msr((msr))
+#define rdmsr(msr, low, high) do { uint64_t m = read_msr(msr); (low) = m; (high) = m >> 32; } while (0)
+
+struct vmx_capability {
+       uint32_t ept;
+       uint32_t vpid;
+       int has_load_efer:1;
 };
 
-/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */
-struct pir_desc {
-       atomic_t pir[4];
-       atomic_t pending;
-       uint64_t unused[3];
-} __attribute__((aligned(64)));
-
-/* Index into the 'guest_msrs[]' array */
-enum {
-       IDX_MSR_LSTAR,
-       IDX_MSR_CSTAR,
-       IDX_MSR_STAR,
-       IDX_MSR_SYSCALL_MASK,
-       IDX_MSR_KERNEL_GS_BASE,
-       GUEST_MSR_NUM                           /* must be the last enumeration */
+extern struct vmx_capability vmx_capability;
+
+#define NR_AUTOLOAD_MSRS 8
+
+/* the horror. */
+struct desc_struct {
+        union {
+                struct {
+                        unsigned int a;
+                        unsigned int b;
+                };
+                struct {
+                        uint16_t limit0;
+                        uint16_t base0;
+                        unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+                        unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
+                };
+        };
+} __attribute__((packed));
+
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct ldttss_desc64 {
+       uint16_t limit0;
+       uint16_t base0;
+       unsigned base1 : 8, type : 5, dpl : 2, p : 1;
+       unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+       uint32_t base3;
+       uint32_t zero1;
+} __attribute__((packed));
+
+struct dune_struct {
+        struct vmx_vcpu *vcpu;
 };
 
-struct msr_bitmap {
-       char bitmap[PAGE_SIZE]; 
-} __attribute__ ((aligned(PAGE_SIZE)));
-/* virtual machine softc */
-// TODO: this has to go somewhere is we make VMs a flavor of an MCP, as we hope to do.
-struct vmx {
-       struct vmcs vmcs[MAX_NUM_CPUS]; /* one vmcs per virtual cpu */
-       struct apic_page apic_page[MAX_NUM_CPUS];       /* one apic page per vcpu */
-       struct msr_bitmap msr_bitmap;
-       struct pir_desc pir_desc[MAX_NUM_CPUS];
-       uint64_t guest_msrs[MAX_NUM_CPUS][GUEST_MSR_NUM];
-       struct vmxctx ctx[MAX_NUM_CPUS];
-       struct vmxcap cap[MAX_NUM_CPUS];
-       struct vmxstate state[MAX_NUM_CPUS];
-       uint64_t eptp;
-       struct vm *vm;
-       long eptgen[MAX_NUM_CPUS];      /* cached pmap->pm_eptgen */
+struct vmx_vcpu {
+
+       int cpu;
+       int vpid;
+       int launched;
+       struct hw_trapframe regs;
+       uint8_t  fail;
+       uint64_t exit_reason;
+       uint64_t host_rsp;
+
+       uint64_t cr2;
+
+       int shutdown;
+       int ret_code;
+       struct dune_guest *guest;
+
+       struct msr_autoload {
+               unsigned nr;
+               struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
+               struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+       } msr_autoload;
+
+       struct vmcs *vmcs;
 };
 
-#define        VMX_GUEST_VMEXIT        0
-#define        VMX_VMRESUME_ERROR      1
-#define        VMX_VMLAUNCH_ERROR      2
-#define        VMX_INVEPT_ERROR        3
-
-// This is here solely to make all the static asserts work. Hack. But those
-// are very useful functions. 
-// TODO: there HAS to be a better way ...
-static void __1(void) {
-       static_assert((offsetof(struct vmx, pir_desc[0]) & 63) == 0);
-       // should not fail  but does ... TODO Akaros
-       //static_assert((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
-       static_assert((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
-       static_assert(sizeof(struct pir_desc) == 64);
-       static_assert(sizeof(struct apic_page) == PAGE_SIZE);
+extern int vmx_init(void);
+extern void vmx_exit(void);
+int ept_fault_pages(void *dir, uint32_t start, uint32_t end);
+int ept_check_page(void *dir, unsigned long addr);
+int vmx_do_ept_fault(void *dir, unsigned long gpa, unsigned long gva, int fault_flags);
+/* no way to get around some of this stuff. */
+/* we will do the bare minimum required. */
+static inline void native_store_idt(pseudodesc_t *dtr)
+{
+       asm volatile("sidt %0":"=m" (*dtr));
 }
 
-int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched);
-void vmx_call_isr(uintptr_t entry);
-
-unsigned long vmx_fix_cr0(unsigned long cr0);
-unsigned long vmx_fix_cr4(unsigned long cr4);
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+       return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+}
 
-extern char vmx_exit_guest[];
+#define store_gdt(dtr)                          native_store_gdt(dtr)
+static inline void native_store_gdt(pseudodesc_t *dtr)
+{
+        asm volatile("sgdt %0":"=m" (*dtr));
+}
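These sidt/sgdt wrappers exist so host state can be captured for the VMCS host-state area. A sketch of the intended use, assuming a vmcs_writel()-style accessor and the pd_base field name of pseudodesc_t (both are assumptions, not shown in this hunk):

static void vmx_record_host_idtr(void)
{
        pseudodesc_t dt;

        native_store_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.pd_base);       /* host IDT base used on VM exit */
}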
 
 #endif
index 41f5eb0..e0ddfce 100644 (file)
  */
 #include <assert.h>
 #include <pmap.h>
+#include <smp.h>
+#include <kmalloc.h>
 
-// NO . FILES HERE INCLUDE .h
-// That forces us to make the includes visible.
-#include "intel/vmx_cpufunc.h"
-#include "intel/vmcs.h"
 #include "intel/vmx.h"
-#include "x86.h"
+#include "intel/compat.h"
 #include "vmm.h"
-#include "func.h"
 
-/* this will be the init function for vmm. For now, it just ensures we
-   don't break things. */
+/* Figure out what kind of CPU we are on and whether it supports any
+ * reasonable virtualization. For now, if it's not a reasonably recent Intel
+ * part, don't bother. This applies to all cores. Note that we make these
+ * decisions at runtime to avoid the problems that compile-time decisions
+ * can cause. At this point, of course, it's still all Intel.
+ */
+void vmm_init(void)
+{
+       int ret;
+       /* Check for Intel capabilities first. That means two back-to-back
+        * implementation-dependent checks, which is fine: it's all MSR-based.
+        */
+       ret = intel_vmm_init();
+       if (!ret) {
+               printd("intel_vmm_init worked\n");
+               return;
+       }
+
+       /* TODO: AMD. Will we ever care? It's not clear. */
+       printk("vmm_init failed, ret %d\n", ret);
+       return;
+}
+
+void vmm_pcpu_init(void)
+{
+       if (!intel_vmm_pcpu_init()) {
+               printd("vmm_pcpu_init worked\n");
+               return;
+       }
+       /* TODO: AMD. Will we ever care? It's not clear. */
+       printk("vmm_pcpu_init failed\n");
+}
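(Illustrative only; the real call sites live in the arch bring-up paths and are not part of this patch. The intent is vmm_init() once from the boot core and vmm_pcpu_init() on every core as it comes online.)

static void vmm_bringup_this_core(bool is_boot_core)
{
        if (is_boot_core)
                vmm_init();
        vmm_pcpu_init();
}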
+
+int vm_run(uint64_t rip, uint64_t rsp, uint64_t cr3)
+{
+       /* vmx_launch() is the Intel entry point; declared here for now. */
+       int vmx_launch(struct dune_config *conf);
+       struct dune_config d = {rip, rsp, cr3};
+
+       if (current->vmm.amd)
+               return -1;
+       return vmx_launch(&d);
+}
+
+/* Initializes a process to run virtual machine contexts, returning the
+ * number of guest pcores initialized and setting errno on failure. */
+int vmm_struct_init(struct vmm *vmm, unsigned int nr_guest_pcores)
+{
+       unsigned int i;
+       qlock(&vmm->qlock);
+       if (vmm->vmmcp) {
+               set_errno(EINVAL);
+               qunlock(&vmm->qlock);
+               return 0;
+       }
+       nr_guest_pcores = MIN(nr_guest_pcores, num_cpus);
+       vmm->amd = 0;
+       vmm->guest_pcores = kzmalloc(sizeof(void*) * nr_guest_pcores, KMALLOC_WAIT);
+       for (i = 0; i < nr_guest_pcores; i++) {
+               vmm->guest_pcores[i] = vmx_create_vcpu();
+               /* If we failed, we'll clean it up when the process dies */
+               if (!vmm->guest_pcores[i]) {
+                       set_errno(ENOMEM);
+                       break;
+               }
+       }
+       vmm->nr_guest_pcores = i;
+       vmm->vmmcp = TRUE;
+       qunlock(&vmm->qlock);
+       return i;
+}
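An illustrative caller (hypothetical; the syscall plumbing that would invoke this is not part of the patch):

static int setup_vmmcp(struct proc *p, unsigned int nr_wanted)
{
        int nr = vmm_struct_init(&p->vmm, nr_wanted);

        if (nr < nr_wanted)
                printk("vmm: only %d of %u guest pcores initialized\n",
                       nr, nr_wanted);
        return nr;
}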
+
+void vmm_struct_cleanup(struct vmm *vmm)
+{
+       qlock(&vmm->qlock);
+       if (!vmm->vmmcp) {
+               qunlock(&vmm->qlock);
+               return;
+       }
+       for (int i = 0; i < vmm->nr_guest_pcores; i++) {
+               if (vmm->guest_pcores[i])
+                       vmx_destroy_vcpu(vmm->guest_pcores[i]);
+       }
+       kfree(vmm->guest_pcores);
+       vmm->vmmcp = FALSE;
+       qunlock(&vmm->qlock);
+}
index dba60be..27f248b 100644 (file)
-/*-
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
 #ifndef _VMM_H_
 #define        _VMM_H_
 
-enum vm_suspend_how {
-       VM_SUSPEND_NONE,
-       VM_SUSPEND_RESET,
-       VM_SUSPEND_POWEROFF,
-       VM_SUSPEND_HALT,
-       VM_SUSPEND_TRIPLEFAULT,
-       VM_SUSPEND_LAST
-};
-
-/*
- * Identifiers for architecturally defined registers.
- */
-enum vm_reg_name {
-       VM_REG_GUEST_RAX,
-       VM_REG_GUEST_RBX,
-       VM_REG_GUEST_RCX,
-       VM_REG_GUEST_RDX,
-       VM_REG_GUEST_RSI,
-       VM_REG_GUEST_RDI,
-       VM_REG_GUEST_RBP,
-       VM_REG_GUEST_R8,
-       VM_REG_GUEST_R9,
-       VM_REG_GUEST_R10,
-       VM_REG_GUEST_R11,
-       VM_REG_GUEST_R12,
-       VM_REG_GUEST_R13,
-       VM_REG_GUEST_R14,
-       VM_REG_GUEST_R15,
-       VM_REG_GUEST_CR0,
-       VM_REG_GUEST_CR3,
-       VM_REG_GUEST_CR4,
-       VM_REG_GUEST_DR7,
-       VM_REG_GUEST_RSP,
-       VM_REG_GUEST_RIP,
-       VM_REG_GUEST_RFLAGS,
-       VM_REG_GUEST_ES,
-       VM_REG_GUEST_CS,
-       VM_REG_GUEST_SS,
-       VM_REG_GUEST_DS,
-       VM_REG_GUEST_FS,
-       VM_REG_GUEST_GS,
-       VM_REG_GUEST_LDTR,
-       VM_REG_GUEST_TR,
-       VM_REG_GUEST_IDTR,
-       VM_REG_GUEST_GDTR,
-       VM_REG_GUEST_EFER,
-       VM_REG_GUEST_CR2,
-       VM_REG_GUEST_PDPTE0,
-       VM_REG_GUEST_PDPTE1,
-       VM_REG_GUEST_PDPTE2,
-       VM_REG_GUEST_PDPTE3,
-       VM_REG_GUEST_INTR_SHADOW,
-       VM_REG_LAST
-};
-
-enum x2apic_state {
-       X2APIC_DISABLED,
-       X2APIC_ENABLED,
-       X2APIC_STATE_LAST
-};
-
-#define        VM_INTINFO_VECTOR(info) ((info) & 0xff)
-#define        VM_INTINFO_DEL_ERRCODE  0x800
-#define        VM_INTINFO_RSVD         0x7ffff000
-#define        VM_INTINFO_VALID        0x80000000
-#define        VM_INTINFO_TYPE         0x700
-#define        VM_INTINFO_HWINTR       (0 << 8)
-#define        VM_INTINFO_NMI          (2 << 8)
-#define        VM_INTINFO_HWEXCEPTION  (3 << 8)
-#define        VM_INTINFO_SWINTR       (4 << 8)
-
-enum vcpu_state {
-       VCPU_IDLE,
-       VCPU_FROZEN,
-       VCPU_RUNNING,
-       VCPU_SLEEPING,
-};
-
-/*
- * Identifiers for optional vmm capabilities
- */
-enum vm_cap_type {
-       VM_CAP_HALT_EXIT,
-       VM_CAP_MTRAP_EXIT,
-       VM_CAP_PAUSE_EXIT,
-       VM_CAP_UNRESTRICTED_GUEST,
-       VM_CAP_ENABLE_INVPCID,
-       VM_CAP_MAX
-};
-
-enum vm_intr_trigger {
-       EDGE_TRIGGER,
-       LEVEL_TRIGGER
-};
-       
-/*
- * The 'access' field has the format specified in Table 21-2 of the Intel
- * Architecture Manual vol 3b.
- *
- * XXX The contents of the 'access' field are architecturally defined except
- * bit 16 - Segment Unusable.
- */
-struct seg_desc {
-       uint64_t        base;
-       uint32_t        limit;
-       uint32_t        access;
-};
-#define        SEG_DESC_TYPE(access)           ((access) & 0x001f)
-#define        SEG_DESC_DPL(access)            (((access) >> 5) & 0x3)
-#define        SEG_DESC_PRESENT(access)        (((access) & 0x0080) ? 1 : 0)
-#define        SEG_DESC_DEF32(access)          (((access) & 0x4000) ? 1 : 0)
-#define        SEG_DESC_GRANULARITY(access)    (((access) & 0x8000) ? 1 : 0)
-#define        SEG_DESC_UNUSABLE(access)       (((access) & 0x10000) ? 1 : 0)
-
-enum vm_cpu_mode {
-       CPU_MODE_REAL,
-       CPU_MODE_PROTECTED,
-       CPU_MODE_COMPATIBILITY,         /* IA-32E mode (CS.L = 0) */
-       CPU_MODE_64BIT,                 /* IA-32E mode (CS.L = 1) */
-};
-
-enum vm_paging_mode {
-       PAGING_MODE_FLAT,
-       PAGING_MODE_32,
-       PAGING_MODE_PAE,
-       PAGING_MODE_64,
-};
-
-struct vm_guest_paging {
-       uint64_t        cr3;
-       int             cpl;
-       enum vm_cpu_mode cpu_mode;
-       enum vm_paging_mode paging_mode;
-};
-
-/*
- * The data structures 'vie' and 'vie_op' are meant to be opaque to the
- * consumers of instruction decoding. The only reason why their contents
- * need to be exposed is because they are part of the 'vm_exit' structure.
- */
-struct vie_op {
-       uint8_t         op_byte;        /* actual opcode byte */
-       uint8_t         op_type;        /* type of operation (e.g. MOV) */
-       uint16_t        op_flags;
-};
-
-#define        VIE_INST_SIZE   15
-struct vie {
-       uint8_t         inst[VIE_INST_SIZE];    /* instruction bytes */
-       uint8_t         num_valid;              /* size of the instruction */
-       uint8_t         num_processed;
-
-       uint8_t         addrsize:4, opsize:4;   /* address and operand sizes */
-       uint8_t         rex_w:1,                /* REX prefix */
-                       rex_r:1,
-                       rex_x:1,
-                       rex_b:1,
-                       rex_present:1,
-                       opsize_override:1,      /* Operand size override */
-                       addrsize_override:1;    /* Address size override */
+static inline int cpu_has_vmx(void)
+{
+       unsigned long ecx = cpuid_ecx(1);
+       return ecx & (1<<5); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
 
-       uint8_t         mod:2,                  /* ModRM byte */
-                       reg:4,
-                       rm:4;
-
-       uint8_t         ss:2,                   /* SIB byte */
-                       index:4,
-                       base:4;
-
-       uint8_t         disp_bytes;
-       uint8_t         imm_bytes;
-
-       uint8_t         scale;
-       int             base_register;          /* VM_REG_GUEST_xyz */
-       int             index_register;         /* VM_REG_GUEST_xyz */
-
-       int64_t         displacement;           /* optional addr displacement */
-       int64_t         immediate;              /* optional immediate operand */
-
-       uint8_t         decoded;        /* set to 1 if successfully decoded */
-
-       struct vie_op   op;                     /* opcode description */
-};
-
-enum vm_exitcode {
-       VM_EXITCODE_INOUT,
-       VM_EXITCODE_VMX,
-       VM_EXITCODE_BOGUS,
-       VM_EXITCODE_RDMSR,
-       VM_EXITCODE_WRMSR,
-       VM_EXITCODE_HLT,
-       VM_EXITCODE_MTRAP,
-       VM_EXITCODE_PAUSE,
-       VM_EXITCODE_PAGING,
-       VM_EXITCODE_INST_EMUL,
-       VM_EXITCODE_SPINUP_AP,
-       VM_EXITCODE_DEPRECATED1,        /* used to be SPINDOWN_CPU */
-       VM_EXITCODE_RENDEZVOUS,
-       VM_EXITCODE_IOAPIC_EOI,
-       VM_EXITCODE_SUSPENDED,
-       VM_EXITCODE_INOUT_STR,
-       VM_EXITCODE_TASK_SWITCH,
-       VM_EXITCODE_MONITOR,
-       VM_EXITCODE_MWAIT,
-       VM_EXITCODE_SVM,
-       VM_EXITCODE_MAX
-};
-
-struct vm_inout {
-       uint16_t        bytes:3;        /* 1 or 2 or 4 */
-       uint16_t        in:1;
-       uint16_t        string:1;
-       uint16_t        rep:1;
-       uint16_t        port;
-       uint32_t        eax;            /* valid for out */
-};
-
-struct vm_inout_str {
-       struct vm_inout inout;          /* must be the first element */
-       struct vm_guest_paging paging;
-       uint64_t        rflags;
-       uint64_t        cr0;
-       uint64_t        index;
-       uint64_t        count;          /* rep=1 (%rcx), rep=0 (1) */
-       int             addrsize;
-       enum vm_reg_name seg_name;
-       struct seg_desc seg_desc;
-};
-
-enum task_switch_reason {
-       TSR_CALL,
-       TSR_IRET,
-       TSR_JMP,
-       TSR_IDT_GATE,   /* task gate in IDT */
-};
-
-struct vm_task_switch {
-       uint16_t        tsssel;         /* new TSS selector */
-       int             ext;            /* task switch due to external event */
-       uint32_t        errcode;
-       int             errcode_valid;  /* push 'errcode' on the new stack */
-       enum task_switch_reason reason;
-       struct vm_guest_paging paging;
-};
-
-struct vm_exit {
-       enum vm_exitcode        exitcode;
-       int                     inst_length;    /* 0 means unknown */
-       uint64_t                rip;
-       union {
-               struct vm_inout inout;
-               struct vm_inout_str inout_str;
-               struct {
-                       uint64_t        gpa;
-                       int             fault_type;
-               } paging;
-               struct {
-                       uint64_t        gpa;
-                       uint64_t        gla;
-                       int             cs_d;           /* CS.D */
-                       struct vm_guest_paging paging;
-                       struct vie      vie;
-               } inst_emul;
-               /*
-                * VMX specific payload. Used when there is no "better"
-                * exitcode to represent the VM-exit.
-                */
-               struct {
-                       int             status;         /* vmx inst status */
-                       /*
-                        * 'exit_reason' and 'exit_qualification' are valid
-                        * only if 'status' is zero.
-                        */
-                       uint32_t        exit_reason;
-                       uint64_t        exit_qualification;
-                       /*
-                        * 'inst_error' and 'inst_type' are valid
-                        * only if 'status' is non-zero.
-                        */
-                       int             inst_type;
-                       int             inst_error;
-               } vmx;
-               /*
-                * SVM specific payload.
-                */
-               struct {
-                       uint64_t        exitcode;
-                       uint64_t        exitinfo1;
-                       uint64_t        exitinfo2;
-               } svm;
-               struct {
-                       uint32_t        code;           /* ecx value */
-                       uint64_t        wval;
-               } msr;
-               struct {
-                       int             vcpu;
-                       uint64_t        rip;
-               } spinup_ap;
-               struct {
-                       uint64_t        rflags;
-               } hlt;
-               struct {
-                       int             vector;
-               } ioapic_eoi;
-               struct {
-                       enum vm_suspend_how how;
-               } suspended;
-               struct vm_task_switch task_switch;
-       } u;
-};
+/* maybe someday, not today. */
+static inline int cpu_has_svm(const char **msg)
+{
+       return 0;
+}
 
 struct vmm {
+       qlock_t qlock;
+       // always false.
+       int amd;
        // true if this is a VMMCP.
        bool vmmcp;
 
        // Number of cores in this VMMCP.
-       int ncores;
-       // The EPT entries are incompatible in just a few bit
-       // positions. Very few. You could *almost* use the same page
-       // tables for EPT and process page tables, but ... not quite.
-       // Really: you put X in bit two of the EPT and in bit 63 of
-       // the standard one.  Setting WB (6 in bits 5:3) in the EPT
-       // versions disables caching (CD is bit 4) in the native
-       // versions.  WTF?
-       //
-       // As a result we have to keep these two in sync, IFF
-       // we have a VMMCP. N.B. only for the sie of the EPT
-       // address space, which is limited to much less than
-       // the virtual address space.
-       physaddr_t eptpt;
+       int nr_guest_pcores;
 
        // The VMCS is intel-specific. But, maybe, someday, AMD will
        // be back.  Just make this an anon union and we'll work it
@@ -369,8 +32,22 @@ struct vmm {
        // you could install a kernel from the ISO, but the kernel it
        // installed would GPF on a K7.
        union {
-               struct vmcs *vmcs;
+               void *svm;
+               struct vmx_vcpu **guest_pcores;
        };
 };
 
+void vmm_init(void);
+void vmm_pcpu_init(void);
+
+int vmm_struct_init(struct vmm *vmm, unsigned int nr_guest_pcores);
+void vmm_struct_cleanup(struct vmm *vmm);
+
+int vm_run(uint64_t, uint64_t, uint64_t);
+int intel_vmx_start(int id);
+int intel_vmx_setup(int nvmcs);
+
+struct vmx_vcpu *vmx_create_vcpu(void);
+void vmx_destroy_vcpu(struct vmx_vcpu *vcpu);
+
 #endif /* _VMM_H_ */