Placeholder for vm support
author     Ronald G. Minnich <rminnich@google.com>
           Sat, 4 Jan 2014 00:07:16 +0000 (16:07 -0800)
committer  Barret Rhoden <brho@cs.berkeley.edu>
           Fri, 17 Jan 2014 22:35:29 +0000 (14:35 -0800)
vm.c gets lots of errors right now, but I think they're going to
prove pretty simple to fix.

One thing we can do cleanly in akaros, since we know we want
a vm and we know we're only going to go with one, is to put the
per-cpu stuff in the actual akaros per-cpu structs. That gets
rid of some of the fiddling around; a rough sketch is below.
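
A rough sketch of the shape this could take (struct per_cpu_info and
core_id() are assumptions about the akaros side here, not part of this
patch):

    /* Per-core VMX state, meant to live inside the existing per-cpu
     * struct instead of Linux-style DEFINE_PER_CPU variables. */
    struct vmcs;                        /* opaque here; defined by vm.c */

    struct vmx_pcpu_state {
            struct vmcs *vmxarea;       /* VMXON region for this core */
            struct vmcs *current_vmcs;  /* VMCS loaded on this core */
    };

    /* With that embedded in struct per_cpu_info, the ported code's
     * per_cpu(current_vmcs, cpu) becomes a plain field access, e.g.
     * per_cpu_info[core_id()].vmx.current_vmcs. */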

We should figure out whether msr-index.h should just go into x86.h.
Personally, I vote yes; I don't like lots of little includes, and
cpp is fast enough for it not to matter.
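
If we do fold it in, the contents could just ride along with x86.h,
e.g. (sketch only):

    /* kern/arch/x86/x86.h */
    #include "msr-index.h"

so anything that already pulls in x86.h gets the MSR names for free.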

Signed-off-by: Ronald G. Minnich <rminnich@google.com>
kern/arch/x86/Kbuild
kern/arch/x86/litevm.h
kern/arch/x86/msr-index.h [new file with mode: 0644]
kern/arch/x86/trap.h
kern/arch/x86/types.h
kern/arch/x86/vm.c [new file with mode: 0644]

index 60a7e38..f715a05 100644
@@ -22,3 +22,4 @@ obj-y                                         += smp_boot.o
 obj-y                                          += smp_entry$(BITS).o
 obj-y                                          += trap.o trap$(BITS).o
 obj-y                                          += trapentry$(BITS).o
+#obj-y                                         += vm.o
index a938da1..7e9b43e 100644
@@ -386,4 +386,192 @@ static inline struct litevm_mmu_page *page_header(hpa_t shadow_page)
 
 #endif
 
+/* Just to get things to build ... include this stuff for now.
+ * At some point, this interface gets nuked and made more Plan 9
+ * like.
+ */
+
+/* for LITEVM_CREATE_MEMORY_REGION */
+struct litevm_memory_region {
+       uint32_t slot;
+       uint32_t flags;
+       uint64_t guest_phys_addr;
+       uint64_t memory_size; /* bytes */
+};
+
+/* for litevm_memory_region::flags */
+#define LITEVM_MEM_LOG_DIRTY_PAGES  1UL
+
+
+#define LITEVM_EXIT_TYPE_FAIL_ENTRY 1
+#define LITEVM_EXIT_TYPE_VM_EXIT    2
+
+enum litevm_exit_reason {
+       LITEVM_EXIT_UNKNOWN,
+       LITEVM_EXIT_EXCEPTION,
+       LITEVM_EXIT_IO,
+       LITEVM_EXIT_CPUID,
+       LITEVM_EXIT_DEBUG,
+       LITEVM_EXIT_HLT,
+       LITEVM_EXIT_MMIO,
+};
+
+/* for LITEVM_RUN */
+struct litevm_run {
+       /* in */
+       uint32_t vcpu;
+       uint32_t emulated;  /* skip current instruction */
+       uint32_t mmio_completed; /* mmio request completed */
+
+       /* out */
+       uint32_t exit_type;
+       uint32_t exit_reason;
+       uint32_t instruction_length;
+       union {
+               /* LITEVM_EXIT_UNKNOWN */
+               struct {
+                       uint32_t hardware_exit_reason;
+               } hw;
+               /* LITEVM_EXIT_EXCEPTION */
+               struct {
+                       uint32_t exception;
+                       uint32_t error_code;
+               } ex;
+               /* LITEVM_EXIT_IO */
+               struct {
+#define LITEVM_EXIT_IO_IN  0
+#define LITEVM_EXIT_IO_OUT 1
+                       uint8_t direction;
+                       uint8_t size; /* bytes */
+                       uint8_t string;
+                       uint8_t string_down;
+                       uint8_t rep;
+                       uint8_t pad;
+                       uint16_t port;
+                       uint64_t count;
+                       union {
+                               uint64_t address;
+                               uint32_t value;
+                       };
+               } io;
+               struct {
+               } debug;
+               /* LITEVM_EXIT_MMIO */
+               struct {
+                       uint64_t phys_addr;
+                       uint8_t  data[8];
+                       uint32_t len;
+                       uint8_t  is_write;
+               } mmio;
+       };
+};
+
+/* for LITEVM_GET_REGS and LITEVM_SET_REGS */
+struct litevm_regs {
+       /* in */
+       uint32_t vcpu;
+       uint32_t padding;
+
+       /* out (LITEVM_GET_REGS) / in (LITEVM_SET_REGS) */
+       uint64_t rax, rbx, rcx, rdx;
+       uint64_t rsi, rdi, rsp, rbp;
+       uint64_t r8,  r9,  r10, r11;
+       uint64_t r12, r13, r14, r15;
+       uint64_t rip, rflags;
+};
+
+struct litevm_segment {
+       uint64_t base;
+       uint32_t limit;
+       uint16_t selector;
+       uint8_t  type;
+       uint8_t  present, dpl, db, s, l, g, avl;
+       uint8_t  unusable;
+       uint8_t  padding;
+};
+
+struct litevm_dtable {
+       uint64_t base;
+       uint16_t limit;
+       uint16_t padding[3];
+};
+
+/* for LITEVM_GET_SREGS and LITEVM_SET_SREGS */
+struct litevm_sregs {
+       /* in */
+       uint32_t vcpu;
+       uint32_t padding;
+
+       /* out (LITEVM_GET_SREGS) / in (LITEVM_SET_SREGS) */
+       struct litevm_segment cs, ds, es, fs, gs, ss;
+       struct litevm_segment tr, ldt;
+       struct litevm_dtable gdt, idt;
+       uint64_t cr0, cr2, cr3, cr4, cr8;
+       uint64_t efer;
+       uint64_t apic_base;
+
+       /* out (LITEVM_GET_SREGS) */
+       uint32_t pending_int;
+       uint32_t padding2;
+};
+
+/* for LITEVM_TRANSLATE */
+struct litevm_translation {
+       /* in */
+       uint64_t linear_address;
+       uint32_t vcpu;
+       uint32_t padding;
+
+       /* out */
+       uint64_t physical_address;
+       uint8_t  valid;
+       uint8_t  writeable;
+       uint8_t  usermode;
+};
+
+/* for LITEVM_INTERRUPT */
+struct litevm_interrupt {
+       /* in */
+       uint32_t vcpu;
+       uint32_t irq;
+};
+
+struct litevm_breakpoint {
+       uint32_t enabled;
+       uint32_t padding;
+       uint64_t address;
+};
+
+/* for LITEVM_DEBUG_GUEST */
+struct litevm_debug_guest {
+       /* in */
+       uint32_t vcpu;
+       uint32_t enabled;
+       struct litevm_breakpoint breakpoints[4];
+       uint32_t singlestep;
+};
+
+/* for LITEVM_GET_DIRTY_LOG */
+struct litevm_dirty_log {
+       uint32_t slot;
+       uint32_t padding;
+       union {
+               void *dirty_bitmap; /* one bit per page */
+               uint64_t paddingw;
+       };
+};
+
+enum {
+       LITEVM_RUN = 1,
+       LITEVM_GET_REGS,
+       LITEVM_SET_REGS,
+       LITEVM_GET_SREGS,
+       LITEVM_SET_SREGS,
+       LITEVM_TRANSLATE,
+       LITEVM_INTERRUPT,
+       LITEVM_DEBUG_GUEST,
+       LITEVM_SET_MEMORY_REGION,
+       LITEVM_CREATE_VCPU,
+       LITEVM_GET_DIRTY_LOG,
+};
 #endif
diff --git a/kern/arch/x86/msr-index.h b/kern/arch/x86/msr-index.h
new file mode 100644
index 0000000..12d5e6f
--- /dev/null
@@ -0,0 +1,528 @@
+#ifndef _ASM_X86_MSR_INDEX_H
+#define _ASM_X86_MSR_INDEX_H
+
+/* CPU model specific register (MSR) numbers */
+
+/* x86-64 specific MSRs */
+#define MSR_EFER               0xc0000080 /* extended feature register */
+#define MSR_STAR               0xc0000081 /* legacy mode SYSCALL target */
+#define MSR_LSTAR              0xc0000082 /* long mode SYSCALL target */
+#define MSR_CSTAR              0xc0000083 /* compat mode SYSCALL target */
+#define MSR_SYSCALL_MASK       0xc0000084 /* EFLAGS mask for syscall */
+#define MSR_FS_BASE            0xc0000100 /* 64bit FS base */
+#define MSR_GS_BASE            0xc0000101 /* 64bit GS base */
+#define MSR_KERNEL_GS_BASE     0xc0000102 /* SwapGS GS shadow */
+#define MSR_TSC_AUX            0xc0000103 /* Auxiliary TSC */
+
+/* EFER bits: */
+#define _EFER_SCE              0  /* SYSCALL/SYSRET */
+#define _EFER_LME              8  /* Long mode enable */
+#define _EFER_LMA              10 /* Long mode active (read-only) */
+#define _EFER_NX               11 /* No execute enable */
+#define _EFER_SVME             12 /* Enable virtualization */
+#define _EFER_LMSLE            13 /* Long Mode Segment Limit Enable */
+#define _EFER_FFXSR            14 /* Enable Fast FXSAVE/FXRSTOR */
+
+#define EFER_SCE               (1<<_EFER_SCE)
+#define EFER_LME               (1<<_EFER_LME)
+#define EFER_LMA               (1<<_EFER_LMA)
+#define EFER_NX                        (1<<_EFER_NX)
+#define EFER_SVME              (1<<_EFER_SVME)
+#define EFER_LMSLE             (1<<_EFER_LMSLE)
+#define EFER_FFXSR             (1<<_EFER_FFXSR)
+
+/* Intel MSRs. Some also available on other CPUs */
+#define MSR_IA32_PERFCTR0              0x000000c1
+#define MSR_IA32_PERFCTR1              0x000000c2
+#define MSR_FSB_FREQ                   0x000000cd
+#define MSR_NHM_PLATFORM_INFO          0x000000ce
+
+#define MSR_NHM_SNB_PKG_CST_CFG_CTL    0x000000e2
+#define NHM_C3_AUTO_DEMOTE             (1UL << 25)
+#define NHM_C1_AUTO_DEMOTE             (1UL << 26)
+#define ATM_LNC_C6_AUTO_DEMOTE         (1UL << 25)
+#define SNB_C1_AUTO_UNDEMOTE           (1UL << 27)
+#define SNB_C3_AUTO_UNDEMOTE           (1UL << 28)
+
+#define MSR_MTRRcap                    0x000000fe
+#define MSR_IA32_BBL_CR_CTL            0x00000119
+#define MSR_IA32_BBL_CR_CTL3           0x0000011e
+
+#define MSR_IA32_SYSENTER_CS           0x00000174
+#define MSR_IA32_SYSENTER_ESP          0x00000175
+#define MSR_IA32_SYSENTER_EIP          0x00000176
+
+#define MSR_IA32_MCG_CAP               0x00000179
+#define MSR_IA32_MCG_STATUS            0x0000017a
+#define MSR_IA32_MCG_CTL               0x0000017b
+
+#define MSR_OFFCORE_RSP_0              0x000001a6
+#define MSR_OFFCORE_RSP_1              0x000001a7
+#define MSR_NHM_TURBO_RATIO_LIMIT      0x000001ad
+#define MSR_IVT_TURBO_RATIO_LIMIT      0x000001ae
+
+#define MSR_LBR_SELECT                 0x000001c8
+#define MSR_LBR_TOS                    0x000001c9
+#define MSR_LBR_NHM_FROM               0x00000680
+#define MSR_LBR_NHM_TO                 0x000006c0
+#define MSR_LBR_CORE_FROM              0x00000040
+#define MSR_LBR_CORE_TO                        0x00000060
+
+#define MSR_IA32_PEBS_ENABLE           0x000003f1
+#define MSR_IA32_DS_AREA               0x00000600
+#define MSR_IA32_PERF_CAPABILITIES     0x00000345
+
+#define MSR_MTRRfix64K_00000           0x00000250
+#define MSR_MTRRfix16K_80000           0x00000258
+#define MSR_MTRRfix16K_A0000           0x00000259
+#define MSR_MTRRfix4K_C0000            0x00000268
+#define MSR_MTRRfix4K_C8000            0x00000269
+#define MSR_MTRRfix4K_D0000            0x0000026a
+#define MSR_MTRRfix4K_D8000            0x0000026b
+#define MSR_MTRRfix4K_E0000            0x0000026c
+#define MSR_MTRRfix4K_E8000            0x0000026d
+#define MSR_MTRRfix4K_F0000            0x0000026e
+#define MSR_MTRRfix4K_F8000            0x0000026f
+#define MSR_MTRRdefType                        0x000002ff
+
+#define MSR_IA32_CR_PAT                        0x00000277
+
+#define MSR_IA32_DEBUGCTLMSR           0x000001d9
+#define MSR_IA32_LASTBRANCHFROMIP      0x000001db
+#define MSR_IA32_LASTBRANCHTOIP                0x000001dc
+#define MSR_IA32_LASTINTFROMIP         0x000001dd
+#define MSR_IA32_LASTINTTOIP           0x000001de
+
+/* DEBUGCTLMSR bits (others vary by model): */
+#define DEBUGCTLMSR_LBR                        (1UL <<  0) /* last branch recording */
+#define DEBUGCTLMSR_BTF                        (1UL <<  1) /* single-step on branches */
+#define DEBUGCTLMSR_TR                 (1UL <<  6)
+#define DEBUGCTLMSR_BTS                        (1UL <<  7)
+#define DEBUGCTLMSR_BTINT              (1UL <<  8)
+#define DEBUGCTLMSR_BTS_OFF_OS         (1UL <<  9)
+#define DEBUGCTLMSR_BTS_OFF_USR                (1UL << 10)
+#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+
+#define MSR_IA32_MC0_CTL               0x00000400
+#define MSR_IA32_MC0_STATUS            0x00000401
+#define MSR_IA32_MC0_ADDR              0x00000402
+#define MSR_IA32_MC0_MISC              0x00000403
+
+/* C-state Residency Counters */
+#define MSR_PKG_C3_RESIDENCY           0x000003f8
+#define MSR_PKG_C6_RESIDENCY           0x000003f9
+#define MSR_PKG_C7_RESIDENCY           0x000003fa
+#define MSR_CORE_C3_RESIDENCY          0x000003fc
+#define MSR_CORE_C6_RESIDENCY          0x000003fd
+#define MSR_CORE_C7_RESIDENCY          0x000003fe
+#define MSR_PKG_C2_RESIDENCY           0x0000060d
+#define MSR_PKG_C8_RESIDENCY           0x00000630      /* HSW-ULT only */
+#define MSR_PKG_C9_RESIDENCY           0x00000631      /* HSW-ULT only */
+#define MSR_PKG_C10_RESIDENCY          0x00000632      /* HSW-ULT only */
+
+/* Run Time Average Power Limiting (RAPL) Interface */
+
+#define MSR_RAPL_POWER_UNIT            0x00000606
+
+#define MSR_PKG_POWER_LIMIT            0x00000610
+#define MSR_PKG_ENERGY_STATUS          0x00000611
+#define MSR_PKG_PERF_STATUS            0x00000613
+#define MSR_PKG_POWER_INFO             0x00000614
+
+#define MSR_DRAM_POWER_LIMIT           0x00000618
+#define MSR_DRAM_ENERGY_STATUS         0x00000619
+#define MSR_DRAM_PERF_STATUS           0x0000061b
+#define MSR_DRAM_POWER_INFO            0x0000061c
+
+#define MSR_PP0_POWER_LIMIT            0x00000638
+#define MSR_PP0_ENERGY_STATUS          0x00000639
+#define MSR_PP0_POLICY                 0x0000063a
+#define MSR_PP0_PERF_STATUS            0x0000063b
+
+#define MSR_PP1_POWER_LIMIT            0x00000640
+#define MSR_PP1_ENERGY_STATUS          0x00000641
+#define MSR_PP1_POLICY                 0x00000642
+
+#define MSR_AMD64_MC0_MASK             0xc0010044
+
+#define MSR_IA32_MCx_CTL(x)            (MSR_IA32_MC0_CTL + 4*(x))
+#define MSR_IA32_MCx_STATUS(x)         (MSR_IA32_MC0_STATUS + 4*(x))
+#define MSR_IA32_MCx_ADDR(x)           (MSR_IA32_MC0_ADDR + 4*(x))
+#define MSR_IA32_MCx_MISC(x)           (MSR_IA32_MC0_MISC + 4*(x))
+
+#define MSR_AMD64_MCx_MASK(x)          (MSR_AMD64_MC0_MASK + (x))
+
+/* These are consecutive and not in the normal block of four MCE bank registers */
+#define MSR_IA32_MC0_CTL2              0x00000280
+#define MSR_IA32_MCx_CTL2(x)           (MSR_IA32_MC0_CTL2 + (x))
+
+#define MSR_P6_PERFCTR0                        0x000000c1
+#define MSR_P6_PERFCTR1                        0x000000c2
+#define MSR_P6_EVNTSEL0                        0x00000186
+#define MSR_P6_EVNTSEL1                        0x00000187
+
+#define MSR_KNC_PERFCTR0               0x00000020
+#define MSR_KNC_PERFCTR1               0x00000021
+#define MSR_KNC_EVNTSEL0               0x00000028
+#define MSR_KNC_EVNTSEL1               0x00000029
+
+/* AMD64 MSRs. Not complete. See the architecture manual for a more
+   complete list. */
+
+#define MSR_AMD64_PATCH_LEVEL          0x0000008b
+#define MSR_AMD64_TSC_RATIO            0xc0000104
+#define MSR_AMD64_NB_CFG               0xc001001f
+#define MSR_AMD64_PATCH_LOADER         0xc0010020
+#define MSR_AMD64_OSVW_ID_LENGTH       0xc0010140
+#define MSR_AMD64_OSVW_STATUS          0xc0010141
+#define MSR_AMD64_DC_CFG               0xc0011022
+#define MSR_AMD64_IBSFETCHCTL          0xc0011030
+#define MSR_AMD64_IBSFETCHLINAD                0xc0011031
+#define MSR_AMD64_IBSFETCHPHYSAD       0xc0011032
+#define MSR_AMD64_IBSFETCH_REG_COUNT   3
+#define MSR_AMD64_IBSFETCH_REG_MASK    ((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1)
+#define MSR_AMD64_IBSOPCTL             0xc0011033
+#define MSR_AMD64_IBSOPRIP             0xc0011034
+#define MSR_AMD64_IBSOPDATA            0xc0011035
+#define MSR_AMD64_IBSOPDATA2           0xc0011036
+#define MSR_AMD64_IBSOPDATA3           0xc0011037
+#define MSR_AMD64_IBSDCLINAD           0xc0011038
+#define MSR_AMD64_IBSDCPHYSAD          0xc0011039
+#define MSR_AMD64_IBSOP_REG_COUNT      7
+#define MSR_AMD64_IBSOP_REG_MASK       ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1)
+#define MSR_AMD64_IBSCTL               0xc001103a
+#define MSR_AMD64_IBSBRTARGET          0xc001103b
+#define MSR_AMD64_IBS_REG_COUNT_MAX    8 /* includes MSR_AMD64_IBSBRTARGET */
+
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL              0xc0010200
+#define MSR_F15H_PERF_CTR              0xc0010201
+
+/* Fam 10h MSRs */
+#define MSR_FAM10H_MMIO_CONF_BASE      0xc0010058
+#define FAM10H_MMIO_CONF_ENABLE                (1<<0)
+#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
+#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
+#define FAM10H_MMIO_CONF_BASE_MASK     0xfffffffULL
+#define FAM10H_MMIO_CONF_BASE_SHIFT    20
+#define MSR_FAM10H_NODE_ID             0xc001100c
+
+/* K8 MSRs */
+#define MSR_K8_TOP_MEM1                        0xc001001a
+#define MSR_K8_TOP_MEM2                        0xc001001d
+#define MSR_K8_SYSCFG                  0xc0010010
+#define MSR_K8_INT_PENDING_MSG         0xc0010055
+/* C1E active bits in int pending message */
+#define K8_INTP_C1E_ACTIVE_MASK                0x18000000
+#define MSR_K8_TSEG_ADDR               0xc0010112
+#define K8_MTRRFIXRANGE_DRAM_ENABLE    0x00040000 /* MtrrFixDramEn bit    */
+#define K8_MTRRFIXRANGE_DRAM_MODIFY    0x00080000 /* MtrrFixDramModEn bit */
+#define K8_MTRR_RDMEM_WRMEM_MASK       0x18181818 /* Mask: RdMem|WrMem    */
+
+/* K7 MSRs */
+#define MSR_K7_EVNTSEL0                        0xc0010000
+#define MSR_K7_PERFCTR0                        0xc0010004
+#define MSR_K7_EVNTSEL1                        0xc0010001
+#define MSR_K7_PERFCTR1                        0xc0010005
+#define MSR_K7_EVNTSEL2                        0xc0010002
+#define MSR_K7_PERFCTR2                        0xc0010006
+#define MSR_K7_EVNTSEL3                        0xc0010003
+#define MSR_K7_PERFCTR3                        0xc0010007
+#define MSR_K7_CLK_CTL                 0xc001001b
+#define MSR_K7_HWCR                    0xc0010015
+#define MSR_K7_FID_VID_CTL             0xc0010041
+#define MSR_K7_FID_VID_STATUS          0xc0010042
+
+/* K6 MSRs */
+#define MSR_K6_WHCR                    0xc0000082
+#define MSR_K6_UWCCR                   0xc0000085
+#define MSR_K6_EPMR                    0xc0000086
+#define MSR_K6_PSOR                    0xc0000087
+#define MSR_K6_PFIR                    0xc0000088
+
+/* Centaur-Hauls/IDT defined MSRs. */
+#define MSR_IDT_FCR1                   0x00000107
+#define MSR_IDT_FCR2                   0x00000108
+#define MSR_IDT_FCR3                   0x00000109
+#define MSR_IDT_FCR4                   0x0000010a
+
+#define MSR_IDT_MCR0                   0x00000110
+#define MSR_IDT_MCR1                   0x00000111
+#define MSR_IDT_MCR2                   0x00000112
+#define MSR_IDT_MCR3                   0x00000113
+#define MSR_IDT_MCR4                   0x00000114
+#define MSR_IDT_MCR5                   0x00000115
+#define MSR_IDT_MCR6                   0x00000116
+#define MSR_IDT_MCR7                   0x00000117
+#define MSR_IDT_MCR_CTRL               0x00000120
+
+/* VIA Cyrix defined MSRs*/
+#define MSR_VIA_FCR                    0x00001107
+#define MSR_VIA_LONGHAUL               0x0000110a
+#define MSR_VIA_RNG                    0x0000110b
+#define MSR_VIA_BCR2                   0x00001147
+
+/* Transmeta defined MSRs */
+#define MSR_TMTA_LONGRUN_CTRL          0x80868010
+#define MSR_TMTA_LONGRUN_FLAGS         0x80868011
+#define MSR_TMTA_LRTI_READOUT          0x80868018
+#define MSR_TMTA_LRTI_VOLT_MHZ         0x8086801a
+
+/* Intel defined MSRs. */
+#define MSR_IA32_P5_MC_ADDR            0x00000000
+#define MSR_IA32_P5_MC_TYPE            0x00000001
+#define MSR_IA32_TSC                   0x00000010
+#define MSR_IA32_PLATFORM_ID           0x00000017
+#define MSR_IA32_EBL_CR_POWERON                0x0000002a
+#define MSR_EBC_FREQUENCY_ID           0x0000002c
+#define MSR_IA32_FEATURE_CONTROL        0x0000003a
+#define MSR_IA32_TSC_ADJUST             0x0000003b
+
+#define FEATURE_CONTROL_LOCKED                         (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX       (1<<1)
+#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX      (1<<2)
+
+#define MSR_IA32_APICBASE              0x0000001b
+#define MSR_IA32_APICBASE_BSP          (1<<8)
+#define MSR_IA32_APICBASE_ENABLE       (1<<11)
+#define MSR_IA32_APICBASE_BASE         (0xfffff<<12)
+
+#define MSR_IA32_TSCDEADLINE           0x000006e0
+
+#define MSR_IA32_UCODE_WRITE           0x00000079
+#define MSR_IA32_UCODE_REV             0x0000008b
+
+#define MSR_IA32_PERF_STATUS           0x00000198
+#define MSR_IA32_PERF_CTL              0x00000199
+#define MSR_AMD_PSTATE_DEF_BASE                0xc0010064
+#define MSR_AMD_PERF_STATUS            0xc0010063
+#define MSR_AMD_PERF_CTL               0xc0010062
+
+#define MSR_IA32_MPERF                 0x000000e7
+#define MSR_IA32_APERF                 0x000000e8
+
+#define MSR_IA32_THERM_CONTROL         0x0000019a
+#define MSR_IA32_THERM_INTERRUPT       0x0000019b
+
+#define THERM_INT_HIGH_ENABLE          (1 << 0)
+#define THERM_INT_LOW_ENABLE           (1 << 1)
+#define THERM_INT_PLN_ENABLE           (1 << 24)
+
+#define MSR_IA32_THERM_STATUS          0x0000019c
+
+#define THERM_STATUS_PROCHOT           (1 << 0)
+#define THERM_STATUS_POWER_LIMIT       (1 << 10)
+
+#define MSR_THERM2_CTL                 0x0000019d
+
+#define MSR_THERM2_CTL_TM_SELECT       (1ULL << 16)
+
+#define MSR_IA32_MISC_ENABLE           0x000001a0
+
+#define MSR_IA32_TEMPERATURE_TARGET    0x000001a2
+
+#define MSR_IA32_ENERGY_PERF_BIAS      0x000001b0
+#define ENERGY_PERF_BIAS_PERFORMANCE   0
+#define ENERGY_PERF_BIAS_NORMAL                6
+#define ENERGY_PERF_BIAS_POWERSAVE     15
+
+#define MSR_IA32_PACKAGE_THERM_STATUS          0x000001b1
+
+#define PACKAGE_THERM_STATUS_PROCHOT           (1 << 0)
+#define PACKAGE_THERM_STATUS_POWER_LIMIT       (1 << 10)
+
+#define MSR_IA32_PACKAGE_THERM_INTERRUPT       0x000001b2
+
+#define PACKAGE_THERM_INT_HIGH_ENABLE          (1 << 0)
+#define PACKAGE_THERM_INT_LOW_ENABLE           (1 << 1)
+#define PACKAGE_THERM_INT_PLN_ENABLE           (1 << 24)
+
+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE    (1 << 15)
+#define THERM_SHIFT_THRESHOLD0        8
+#define THERM_MASK_THRESHOLD0          (0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE    (1 << 23)
+#define THERM_SHIFT_THRESHOLD1        16
+#define THERM_MASK_THRESHOLD1          (0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0        (1 << 6)
+#define THERM_LOG_THRESHOLD0           (1 << 7)
+#define THERM_STATUS_THRESHOLD1        (1 << 8)
+#define THERM_LOG_THRESHOLD1           (1 << 9)
+
+/* MISC_ENABLE bits: architectural */
+#define MSR_IA32_MISC_ENABLE_FAST_STRING       (1ULL << 0)
+#define MSR_IA32_MISC_ENABLE_TCC               (1ULL << 1)
+#define MSR_IA32_MISC_ENABLE_EMON              (1ULL << 7)
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL       (1ULL << 11)
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL      (1ULL << 12)
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP        (1ULL << 16)
+#define MSR_IA32_MISC_ENABLE_MWAIT             (1ULL << 18)
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID       (1ULL << 22)
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE      (1ULL << 23)
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE                (1ULL << 34)
+
+/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT                (1ULL << 2)
+#define MSR_IA32_MISC_ENABLE_TM1               (1ULL << 3)
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE        (1ULL << 4)
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE   (1ULL << 6)
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK     (1ULL << 8)
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE  (1ULL << 9)
+#define MSR_IA32_MISC_ENABLE_FERR              (1ULL << 10)
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX    (1ULL << 10)
+#define MSR_IA32_MISC_ENABLE_TM2               (1ULL << 13)
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE  (1ULL << 19)
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK    (1ULL << 20)
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT       (1ULL << 24)
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE  (1ULL << 37)
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE     (1ULL << 38)
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE   (1ULL << 39)
+
+#define MSR_IA32_TSC_DEADLINE          0x000006E0
+
+/* P4/Xeon+ specific */
+#define MSR_IA32_MCG_EAX               0x00000180
+#define MSR_IA32_MCG_EBX               0x00000181
+#define MSR_IA32_MCG_ECX               0x00000182
+#define MSR_IA32_MCG_EDX               0x00000183
+#define MSR_IA32_MCG_ESI               0x00000184
+#define MSR_IA32_MCG_EDI               0x00000185
+#define MSR_IA32_MCG_EBP               0x00000186
+#define MSR_IA32_MCG_ESP               0x00000187
+#define MSR_IA32_MCG_EFLAGS            0x00000188
+#define MSR_IA32_MCG_EIP               0x00000189
+#define MSR_IA32_MCG_RESERVED          0x0000018a
+
+/* Pentium IV performance counter MSRs */
+#define MSR_P4_BPU_PERFCTR0            0x00000300
+#define MSR_P4_BPU_PERFCTR1            0x00000301
+#define MSR_P4_BPU_PERFCTR2            0x00000302
+#define MSR_P4_BPU_PERFCTR3            0x00000303
+#define MSR_P4_MS_PERFCTR0             0x00000304
+#define MSR_P4_MS_PERFCTR1             0x00000305
+#define MSR_P4_MS_PERFCTR2             0x00000306
+#define MSR_P4_MS_PERFCTR3             0x00000307
+#define MSR_P4_FLAME_PERFCTR0          0x00000308
+#define MSR_P4_FLAME_PERFCTR1          0x00000309
+#define MSR_P4_FLAME_PERFCTR2          0x0000030a
+#define MSR_P4_FLAME_PERFCTR3          0x0000030b
+#define MSR_P4_IQ_PERFCTR0             0x0000030c
+#define MSR_P4_IQ_PERFCTR1             0x0000030d
+#define MSR_P4_IQ_PERFCTR2             0x0000030e
+#define MSR_P4_IQ_PERFCTR3             0x0000030f
+#define MSR_P4_IQ_PERFCTR4             0x00000310
+#define MSR_P4_IQ_PERFCTR5             0x00000311
+#define MSR_P4_BPU_CCCR0               0x00000360
+#define MSR_P4_BPU_CCCR1               0x00000361
+#define MSR_P4_BPU_CCCR2               0x00000362
+#define MSR_P4_BPU_CCCR3               0x00000363
+#define MSR_P4_MS_CCCR0                        0x00000364
+#define MSR_P4_MS_CCCR1                        0x00000365
+#define MSR_P4_MS_CCCR2                        0x00000366
+#define MSR_P4_MS_CCCR3                        0x00000367
+#define MSR_P4_FLAME_CCCR0             0x00000368
+#define MSR_P4_FLAME_CCCR1             0x00000369
+#define MSR_P4_FLAME_CCCR2             0x0000036a
+#define MSR_P4_FLAME_CCCR3             0x0000036b
+#define MSR_P4_IQ_CCCR0                        0x0000036c
+#define MSR_P4_IQ_CCCR1                        0x0000036d
+#define MSR_P4_IQ_CCCR2                        0x0000036e
+#define MSR_P4_IQ_CCCR3                        0x0000036f
+#define MSR_P4_IQ_CCCR4                        0x00000370
+#define MSR_P4_IQ_CCCR5                        0x00000371
+#define MSR_P4_ALF_ESCR0               0x000003ca
+#define MSR_P4_ALF_ESCR1               0x000003cb
+#define MSR_P4_BPU_ESCR0               0x000003b2
+#define MSR_P4_BPU_ESCR1               0x000003b3
+#define MSR_P4_BSU_ESCR0               0x000003a0
+#define MSR_P4_BSU_ESCR1               0x000003a1
+#define MSR_P4_CRU_ESCR0               0x000003b8
+#define MSR_P4_CRU_ESCR1               0x000003b9
+#define MSR_P4_CRU_ESCR2               0x000003cc
+#define MSR_P4_CRU_ESCR3               0x000003cd
+#define MSR_P4_CRU_ESCR4               0x000003e0
+#define MSR_P4_CRU_ESCR5               0x000003e1
+#define MSR_P4_DAC_ESCR0               0x000003a8
+#define MSR_P4_DAC_ESCR1               0x000003a9
+#define MSR_P4_FIRM_ESCR0              0x000003a4
+#define MSR_P4_FIRM_ESCR1              0x000003a5
+#define MSR_P4_FLAME_ESCR0             0x000003a6
+#define MSR_P4_FLAME_ESCR1             0x000003a7
+#define MSR_P4_FSB_ESCR0               0x000003a2
+#define MSR_P4_FSB_ESCR1               0x000003a3
+#define MSR_P4_IQ_ESCR0                        0x000003ba
+#define MSR_P4_IQ_ESCR1                        0x000003bb
+#define MSR_P4_IS_ESCR0                        0x000003b4
+#define MSR_P4_IS_ESCR1                        0x000003b5
+#define MSR_P4_ITLB_ESCR0              0x000003b6
+#define MSR_P4_ITLB_ESCR1              0x000003b7
+#define MSR_P4_IX_ESCR0                        0x000003c8
+#define MSR_P4_IX_ESCR1                        0x000003c9
+#define MSR_P4_MOB_ESCR0               0x000003aa
+#define MSR_P4_MOB_ESCR1               0x000003ab
+#define MSR_P4_MS_ESCR0                        0x000003c0
+#define MSR_P4_MS_ESCR1                        0x000003c1
+#define MSR_P4_PMH_ESCR0               0x000003ac
+#define MSR_P4_PMH_ESCR1               0x000003ad
+#define MSR_P4_RAT_ESCR0               0x000003bc
+#define MSR_P4_RAT_ESCR1               0x000003bd
+#define MSR_P4_SAAT_ESCR0              0x000003ae
+#define MSR_P4_SAAT_ESCR1              0x000003af
+#define MSR_P4_SSU_ESCR0               0x000003be
+#define MSR_P4_SSU_ESCR1               0x000003bf /* guess: not in manual */
+
+#define MSR_P4_TBPU_ESCR0              0x000003c2
+#define MSR_P4_TBPU_ESCR1              0x000003c3
+#define MSR_P4_TC_ESCR0                        0x000003c4
+#define MSR_P4_TC_ESCR1                        0x000003c5
+#define MSR_P4_U2L_ESCR0               0x000003b0
+#define MSR_P4_U2L_ESCR1               0x000003b1
+
+#define MSR_P4_PEBS_MATRIX_VERT                0x000003f2
+
+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0       0x00000309
+#define MSR_CORE_PERF_FIXED_CTR1       0x0000030a
+#define MSR_CORE_PERF_FIXED_CTR2       0x0000030b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL   0x0000038d
+#define MSR_CORE_PERF_GLOBAL_STATUS    0x0000038e
+#define MSR_CORE_PERF_GLOBAL_CTRL      0x0000038f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL  0x00000390
+
+/* Geode defined MSRs */
+#define MSR_GEODE_BUSCONT_CONF0                0x00001900
+
+/* Intel VT MSRs */
+#define MSR_IA32_VMX_BASIC              0x00000480
+#define MSR_IA32_VMX_PINBASED_CTLS      0x00000481
+#define MSR_IA32_VMX_PROCBASED_CTLS     0x00000482
+#define MSR_IA32_VMX_EXIT_CTLS          0x00000483
+#define MSR_IA32_VMX_ENTRY_CTLS         0x00000484
+#define MSR_IA32_VMX_MISC               0x00000485
+#define MSR_IA32_VMX_CR0_FIXED0         0x00000486
+#define MSR_IA32_VMX_CR0_FIXED1         0x00000487
+#define MSR_IA32_VMX_CR4_FIXED0         0x00000488
+#define MSR_IA32_VMX_CR4_FIXED1         0x00000489
+#define MSR_IA32_VMX_VMCS_ENUM          0x0000048a
+#define MSR_IA32_VMX_PROCBASED_CTLS2    0x0000048b
+#define MSR_IA32_VMX_EPT_VPID_CAP       0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS  0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS      0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS     0x00000490
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT      32
+#define VMX_BASIC_64           0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT       50
+#define VMX_BASIC_MEM_TYPE_MASK        0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB  6LLU
+#define VMX_BASIC_INOUT                0x0040000000000000LLU
+
+/* AMD-V MSRs */
+
+#define MSR_VM_CR                       0xc0010114
+#define MSR_VM_IGNNE                    0xc0010115
+#define MSR_VM_HSAVE_PA                 0xc0010117
+
+#endif /* _ASM_X86_MSR_INDEX_H */
index 4ba0239..01e1a20 100644
@@ -1,9 +1,7 @@
 #ifndef ROS_KERN_ARCH_TRAP_H
 #define ROS_KERN_ARCH_TRAP_H
 
-#define MSR_IA32_SYSENTER_CS 0x174
-#define MSR_IA32_SYSENTER_ESP 0x175
-#define MSR_IA32_SYSENTER_EIP 0x176
+#include "msr-index.h"
 
 #define NUM_IRQS                                       256
 #define KERNEL_IRQ_OFFSET                      32
index 50181ff..19700b9 100644
@@ -33,7 +33,8 @@ typedef int gid_t;
 #define MAX_VADDR     ((uint64_t)(~0) >> (64-NUM_ADDR_BITS))
 typedef uint64_t uintptr_t;
 #define PAGE_SHIFT 12
-
+#define PAGE_SIZE (1<<PAGE_SHIFT)
+#define PAGE_MASK 0xFFFFFFFFfffff000
 #else /* 32 bit */
 
 #define NUM_ADDR_BITS 32
@@ -42,6 +43,8 @@ typedef uint64_t uintptr_t;
 typedef uint32_t uintptr_t;
 
 #define PAGE_SHIFT 12
+#define PAGE_SIZE (1<<PAGE_SHIFT)
+#define PAGE_MASK 0xFFFFF000
 #endif /* 64bit / 32bit */
 
 
diff --git a/kern/arch/x86/vm.c b/kern/arch/x86/vm.c
new file mode 100644
index 0000000..053a5f6
--- /dev/null
@@ -0,0 +1,3362 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ */
+
+#define DEBUG
+#include <kmalloc.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <error.h>
+#include <pmap.h>
+#include <sys/queue.h>
+#include <smp.h>
+#include <kref.h>
+#include <atomic.h>
+#include <alarm.h>
+#include <event.h>
+#include <umem.h>
+#include <devalarm.h>
+#include <arch/types.h>
+#include <arch/litevm.h>
+#include <arch/emulate.h>
+#include <arch/vmdebug.h>
+#include <arch/msr-index.h>
+
+struct litevm_stat litevm_stat;
+
+static struct litevm_stats_debugfs_item {
+       const char *name;
+       uint32_t *data;
+} debugfs_entries[] = {
+       { "pf_fixed", &litevm_stat.pf_fixed },
+       { "pf_guest", &litevm_stat.pf_guest },
+       { "tlb_flush", &litevm_stat.tlb_flush },
+       { "invlpg", &litevm_stat.invlpg },
+       { "exits", &litevm_stat.exits },
+       { "io_exits", &litevm_stat.io_exits },
+       { "mmio_exits", &litevm_stat.mmio_exits },
+       { "signal_exits", &litevm_stat.signal_exits },
+       { "irq_exits", &litevm_stat.irq_exits },
+       { 0, 0 }
+};
+
+static struct dentry *debugfs_dir;
+
+static const uint32_t vmx_msr_index[] = {
+#ifdef __x86_64__
+       MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+#endif
+       MSR_EFER, // wtf? MSR_K6_STAR,
+};
+#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
+
+#ifdef __x86_64__
+/*
+ * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
+ * mechanism (cpu bug AA24)
+ */
+#define NR_BAD_MSRS 2
+#else
+#define NR_BAD_MSRS 0
+#endif
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+
+#define MSR_IA32_VMX_BASIC_MSR                 0x480
+#define MSR_IA32_VMX_PINBASED_CTLS_MSR         0x481
+#define MSR_IA32_VMX_PROCBASED_CTLS_MSR                0x482
+#define MSR_IA32_VMX_EXIT_CTLS_MSR             0x483
+#define MSR_IA32_VMX_ENTRY_CTLS_MSR            0x484
+
+#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
+#define LMSW_GUEST_MASK 0x0eULL
+#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
+//#define CR4_VMXE 0x2000
+#define CR8_RESEVED_BITS (~0x0fULL)
+#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+
+#ifdef __x86_64__
+#define HOST_IS_64 1
+#else
+#define HOST_IS_64 0
+#endif
+
+static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu, uint32_t msr)
+{
+       int i;
+
+       for (i = 0; i < vcpu->nmsrs; ++i)
+               if (vcpu->guest_msrs[i].index == msr)
+                       return &vcpu->guest_msrs[i];
+       return 0;
+}
+
+struct descriptor_table {
+       uint16_t limit;
+       unsigned long base;
+} __attribute__((packed));
+
+static void get_gdt(struct descriptor_table *table)
+{
+       asm ("sgdt %0" : "=m"(*table));
+}
+
+static void get_idt(struct descriptor_table *table)
+{
+       asm ("sidt %0" : "=m"(*table));
+}
+
+static uint16_t read_fs(void)
+{
+       uint16_t seg;
+       asm ("mov %%fs, %0" : "=g"(seg));
+       return seg;
+}
+
+static uint16_t read_gs(void)
+{
+       uint16_t seg;
+       asm ("mov %%gs, %0" : "=g"(seg));
+       return seg;
+}
+
+static uint16_t read_ldt(void)
+{
+       uint16_t ldt;
+       asm ("sldt %0" : "=g"(ldt));
+       return ldt;
+}
+
+static void load_fs(uint16_t sel)
+{
+       asm ("mov %0, %%fs" : : "g"(sel));
+}
+
+static void load_gs(uint16_t sel)
+{
+       asm ("mov %0, %%gs" : : "g"(sel));
+}
+
+#ifndef load_ldt
+static void load_ldt(uint16_t sel)
+{
+       asm ("lldt %0" : : "g"(sel));
+}
+#endif
+
+static void fx_save(void *image)
+{
+       asm ("fxsave (%0)":: "r" (image));
+}
+
+static void fx_restore(void *image)
+{
+       asm ("fxrstor (%0)":: "r" (image));
+}
+
+static void fpu_init(void)
+{
+       asm ("finit");
+}
+
+struct segment_descriptor {
+       uint16_t limit_low;
+       uint16_t base_low;
+       uint8_t  base_mid;
+       uint8_t  type : 4;
+       uint8_t  system : 1;
+       uint8_t  dpl : 2;
+       uint8_t  present : 1;
+       uint8_t  limit_high : 4;
+       uint8_t  avl : 1;
+       uint8_t  long_mode : 1;
+       uint8_t  default_op : 1;
+       uint8_t  granularity : 1;
+       uint8_t  base_high;
+} __attribute__((packed));
+
+#ifdef __x86_64__
+// LDT or TSS descriptor in the GDT. 16 bytes.
+struct segment_descriptor_64 {
+       struct segment_descriptor s;
+       uint32_t base_higher;
+       uint32_t pad_zero;
+};
+
+#endif
+
+static unsigned long segment_base(uint16_t selector)
+{
+       struct descriptor_table gdt;
+       struct segment_descriptor *d;
+       unsigned long table_base;
+       typedef unsigned long ul;
+       unsigned long v;
+
+       asm ("sgdt %0" : "=m"(gdt));
+       table_base = gdt.base;
+
+       if (selector & 4) {           /* from ldt */
+               uint16_t ldt_selector;
+
+               asm ("sldt %0" : "=g"(ldt_selector));
+               table_base = segment_base(ldt_selector);
+       }
+       d = (struct segment_descriptor *)(table_base + (selector & ~7));
+       v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
+#ifdef __x86_64__
+       if (d->system == 0
+           && (d->type == 2 || d->type == 9 || d->type == 11))
+               v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
+#endif
+       return v;
+}
+
+static unsigned long read_tr_base(void)
+{
+       uint16_t tr;
+       asm ("str %0" : "=g"(tr));
+       return segment_base(tr);
+}
+
+static void reload_tss(void)
+{
+#ifndef __x86_64__
+
+       /*
+        * VT restores TR but not its size.  Useless.
+        */
+       struct descriptor_table gdt;
+       struct segment_descriptor *descs;
+
+       get_gdt(&gdt);
+       descs = (void *)gdt.base;
+       descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
+       load_TR_desc();
+#endif
+}
+
+#warning "DEFINE PER CPU "
+//static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+//static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+
+static struct vmcs_descriptor {
+       int size;
+       int order;
+       uint32_t revision_id;
+} vmcs_descriptor;
+
+#if 0
+#ifdef __x86_64__
+static unsigned long read_msr(unsigned long msr)
+{
+       uint64_t value;
+
+       rdmsrl(msr, value);
+       return value;
+}
+#endif
+#endif
+static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
+{
+       struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
+       return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
+}
+
+
+
+int litevm_read_guest(struct litevm_vcpu *vcpu,
+                            gva_t addr,
+                            unsigned long size,
+                            void *dest)
+{
+       unsigned char *host_buf = dest;
+       unsigned long req_size = size;
+
+       while (size) {
+               hpa_t paddr;
+               unsigned now;
+               unsigned offset;
+               hva_t guest_buf;
+
+               paddr = gva_to_hpa(vcpu, addr);
+
+               if (is_error_hpa(paddr))
+                       break;
+#warning "kmap_atomic"
+               guest_buf = NULL; //(hva_t)kmap_atomic(
+               //      pfn_to_page(paddr >> PAGE_SHIFT));
+               offset = addr & ~PAGE_MASK;
+               guest_buf |= offset;
+               now = MIN(size, PAGE_SIZE - offset);
+               memcpy(host_buf, (void*)guest_buf, now);
+               host_buf += now;
+               addr += now;
+               size -= now;
+#warning "kunmap_atomic"
+//             kunmap_atomic((void *)(guest_buf & PAGE_MASK));
+       }
+       return req_size - size;
+}
+
+int litevm_write_guest(struct litevm_vcpu *vcpu,
+                            gva_t addr,
+                            unsigned long size,
+                            void *data)
+{
+       unsigned char *host_buf = data;
+       unsigned long req_size = size;
+
+       while (size) {
+               hpa_t paddr;
+               unsigned now;
+               unsigned offset;
+               hva_t guest_buf;
+
+               paddr = gva_to_hpa(vcpu, addr);
+
+               if (is_error_hpa(paddr))
+                       break;
+
+               guest_buf = (hva_t)kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT));
+               offset = addr & ~PAGE_MASK;
+               guest_buf |= offset;
+               now = MIN(size, PAGE_SIZE - offset);
+               memcpy((void*)guest_buf, host_buf, now);
+               host_buf += now;
+               addr += now;
+               size -= now;
+               //kunmap_atomic((void *)(guest_buf & PAGE_MASK));
+       }
+       return req_size - size;
+}
+
+static void setup_vmcs_descriptor(void)
+{
+       uint64_t msr;
+
+       msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
+       vmcs_descriptor.size = (msr>>32) & 0x1fff;
+       vmcs_descriptor.order = get_order(vmcs_descriptor.size);
+       vmcs_descriptor.revision_id = (uint32_t)msr;
+};
+
+static void vmcs_clear(struct vmcs *vmcs)
+{
+#warning "__pa"
+       uint64_t phys_addr = 0; // __pa(vmcs);
+       uint8_t error;
+
+       asm volatile ("vmclear %1; setna %0"
+                      : "=m"(error) : "m"(phys_addr) : "cc", "memory" );
+       if (error)
+               printk("litevm: vmclear fail: %p/%llx\n",
+                      vmcs, phys_addr);
+}
+
+static void __vcpu_clear(void *arg)
+{
+       struct litevm_vcpu *vcpu = arg;
+#warning "smp_processor_id"
+       int cpu = 0 ; //smp_processor_id();
+
+       if (vcpu->cpu == cpu)
+               vmcs_clear(vcpu->vmcs);
+#warning "per cpu"
+/*
+       if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
+       per_cpu(current_vmcs, cpu) = 0;*/
+}
+
+static int vcpu_slot(struct litevm_vcpu *vcpu)
+{
+       return vcpu - vcpu->litevm->vcpus;
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
+{
+#warning "__pa"
+       uint64_t phys_addr = 0; //__pa(vcpu->vmcs);
+       int cpu;
+#warning "get_cpu"
+       cpu = 0; //get_cpu();
+
+       if (vcpu->cpu != cpu) {
+#warning "smp_call_function"
+//             smp_call_function(__vcpu_clear, vcpu, 1);
+               vcpu->launched = 0;
+       }
+#warning "per cpu"
+       if (0){//per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
+               uint8_t error;
+
+               per_cpu(current_vmcs, cpu) = vcpu->vmcs;
+               asm volatile ("vmptrld %1; setna %0"
+                              : "=m"(error) : "m"(phys_addr) : "cc" );
+               if (error)
+                       printk("litevm: vmptrld %p/%llx fail\n",
+                              vcpu->vmcs, phys_addr);
+       }
+
+       if (vcpu->cpu != cpu) {
+               struct descriptor_table dt;
+               unsigned long sysenter_esp;
+
+               vcpu->cpu = cpu;
+               /*
+                * Linux uses per-cpu TSS and GDT, so set these when switching
+                * processors.
+                */
+               vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
+               get_gdt(&dt);
+               vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
+
+               rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
+               vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+       }
+       return vcpu;
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put()
+ */
+static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
+{
+       struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
+
+       mutex_lock(&vcpu->mutex);
+       if (unlikely(!vcpu->vmcs)) {
+               mutex_unlock(&vcpu->mutex);
+               return 0;
+       }
+       return __vcpu_load(vcpu);
+}
+
+static void vcpu_put(struct litevm_vcpu *vcpu)
+{
+       put_cpu();
+       mutex_unlock(&vcpu->mutex);
+}
+
+
+static struct vmcs *alloc_vmcs_cpu(int cpu)
+{
+       int node = cpu_to_node(cpu);
+       struct page *pages;
+       struct vmcs *vmcs;
+
+       pages = alloc_pages_node(node, KMALLOC_WAIT, vmcs_descriptor.order);
+       if (!pages)
+               return 0;
+       vmcs = page_address(pages);
+       memset(vmcs, 0, vmcs_descriptor.size);
+       vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
+       return vmcs;
+}
+
+static struct vmcs *alloc_vmcs(void)
+{
+       return alloc_vmcs_cpu(smp_processor_id());
+}
+
+static void free_vmcs(struct vmcs *vmcs)
+{
+       free_pages((unsigned long)vmcs, vmcs_descriptor.order);
+}
+
+static __init int cpu_has_litevm_support(void)
+{
+       unsigned long ecx = cpuid_ecx(1);
+       return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+static void free_litevm_area(void)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               free_vmcs(per_cpu(vmxarea, cpu));
+}
+
+static __init int alloc_litevm_area(void)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               struct vmcs *vmcs;
+
+               vmcs = alloc_vmcs_cpu(cpu);
+               if (!vmcs) {
+                       free_litevm_area();
+                       return -ENOMEM;
+               }
+
+               per_cpu(vmxarea, cpu) = vmcs;
+       }
+       return 0;
+}
+
+static __init int vmx_disabled_by_bios(void)
+{
+       uint64_t msr;
+
+       rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+       return (msr & 5) == 1; /* locked but not enabled */
+}
+
+static __init void litevm_enable(void *garbage)
+{
+       int cpu = raw_smp_processor_id();
+       uint64_t phys_addr = __pa(per_cpu(vmxarea, cpu));
+       uint64_t old;
+
+       rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+       if ((old & 5) == 0)
+               /* enable and lock */
+               wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5);
+       write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
+       asm volatile ("vmxon %0" : : "m"(phys_addr) : "memory", "cc");
+}
+
+static void litevm_disable(void *garbage)
+{
+       asm volatile ("vmxoff" : : : "cc");
+}
+
+static int litevm_dev_open(struct inode *inode, struct file *filp)
+{
+       struct litevm *litevm = kzalloc(sizeof(struct litevm), KMALLOC_WAIT);
+       int i;
+
+       if (!litevm)
+               return -ENOMEM;
+
+       spin_lock_init(&litevm->lock);
+       INIT_LIST_HEAD(&litevm->active_mmu_pages);
+       for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
+               struct litevm_vcpu *vcpu = &litevm->vcpus[i];
+
+               mutex_init(&vcpu->mutex);
+               vcpu->mmu.root_hpa = INVALID_PAGE;
+               INIT_LIST_HEAD(&vcpu->free_pages);
+       }
+       filp->private_data = litevm;
+       return 0;
+}
+
+/*
+ * Free any memory in @free but not in @dont.
+ */
+static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
+                                 struct litevm_memory_slot *dont)
+{
+       int i;
+
+       if (!dont || free->phys_mem != dont->phys_mem)
+               if (free->phys_mem) {
+                       for (i = 0; i < free->npages; ++i)
+                               __free_page(free->phys_mem[i]);
+                       vfree(free->phys_mem);
+               }
+
+       if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
+               vfree(free->dirty_bitmap);
+
+       free->phys_mem = 0;
+       free->npages = 0;
+       free->dirty_bitmap = 0;
+}
+
+static void litevm_free_physmem(struct litevm *litevm)
+{
+       int i;
+
+       for (i = 0; i < litevm->nmemslots; ++i)
+               litevm_free_physmem_slot(&litevm->memslots[i], 0);
+}
+
+static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
+{
+       if (vcpu->vmcs) {
+               on_each_cpu(__vcpu_clear, vcpu, 1);
+               free_vmcs(vcpu->vmcs);
+               vcpu->vmcs = 0;
+       }
+}
+
+static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
+{
+       litevm_free_vmcs(vcpu);
+       litevm_mmu_destroy(vcpu);
+}
+
+static void litevm_free_vcpus(struct litevm *litevm)
+{
+       unsigned int i;
+
+       for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
+               litevm_free_vcpu(&litevm->vcpus[i]);
+}
+
+static int litevm_dev_release(struct inode *inode, struct file *filp)
+{
+       struct litevm *litevm = filp->private_data;
+
+       litevm_free_vcpus(litevm);
+       litevm_free_physmem(litevm);
+       kfree(litevm);
+       return 0;
+}
+
+unsigned long vmcs_readl(unsigned long field)
+{
+       unsigned long value;
+
+       asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
+       return value;
+}
+
+void vmcs_writel(unsigned long field, unsigned long value)
+{
+       uint8_t error;
+
+       asm volatile ("vmwrite %1, %2; setna %0"
+                      : "=g"(error) : "r"(value), "r"(field) : "cc" );
+       if (error)
+               printk("vmwrite error: reg %lx value %lx (err %d)\n",
+                      field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+static void vmcs_write16(unsigned long field, uint16_t value)
+{
+       vmcs_writel(field, value);
+}
+
+static void vmcs_write64(unsigned long field, uint64_t value)
+{
+#ifdef __x86_64__
+       vmcs_writel(field, value);
+#else
+       vmcs_writel(field, value);
+       asm volatile ("");
+       vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static void inject_gp(struct litevm_vcpu *vcpu)
+{
+       printd("inject_general_protection: rip 0x%lx\n",
+              vmcs_readl(GUEST_RIP));
+       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                    GP_VECTOR |
+                    INTR_TYPE_EXCEPTION |
+                    INTR_INFO_DELIEVER_CODE_MASK |
+                    INTR_INFO_VALID_MASK);
+}
+
+static void update_exception_bitmap(struct litevm_vcpu *vcpu)
+{
+       if (vcpu->rmode.active)
+               vmcs_write32(EXCEPTION_BITMAP, ~0);
+       else
+               vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
+}
+
+static void enter_pmode(struct litevm_vcpu *vcpu)
+{
+       unsigned long flags;
+
+       vcpu->rmode.active = 0;
+
+       vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
+       vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
+       vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
+
+       flags = vmcs_readl(GUEST_RFLAGS);
+       flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
+       flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
+       vmcs_writel(GUEST_RFLAGS, flags);
+
+       vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
+                       (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK) );
+
+       update_exception_bitmap(vcpu);
+
+       #define FIX_PMODE_DATASEG(seg, save) {                          \
+                       vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
+                       vmcs_writel(GUEST_##seg##_BASE, 0);             \
+                       vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
+                       vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
+       }
+
+       FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
+       FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
+       FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
+       FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
+       FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
+
+       vmcs_write16(GUEST_CS_SELECTOR,
+                    vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
+       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+}
+
+static int rmode_tss_base(struct litevm* litevm)
+{
+       gfn_t base_gfn = litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
+       return base_gfn << PAGE_SHIFT;
+}
+
+static void enter_rmode(struct litevm_vcpu *vcpu)
+{
+       unsigned long flags;
+
+       vcpu->rmode.active = 1;
+
+       vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+       vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
+
+       vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+       vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+
+       vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+       flags = vmcs_readl(GUEST_RFLAGS);
+       vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+
+       flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+
+       vmcs_writel(GUEST_RFLAGS, flags);
+       vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
+       update_exception_bitmap(vcpu);
+
+       #define FIX_RMODE_SEG(seg, save) {                                 \
+               vmcs_write16(GUEST_##seg##_SELECTOR,                       \
+                                       vmcs_readl(GUEST_##seg##_BASE) >> 4); \
+               vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
+               vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
+       }
+
+       vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+       vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+
+       FIX_RMODE_SEG(ES, vcpu->rmode.es);
+       FIX_RMODE_SEG(DS, vcpu->rmode.ds);
+       FIX_RMODE_SEG(SS, vcpu->rmode.ss);
+       FIX_RMODE_SEG(GS, vcpu->rmode.gs);
+       FIX_RMODE_SEG(FS, vcpu->rmode.fs);
+}
+
+static int init_rmode_tss(struct litevm* litevm)
+{
+       struct page *p1, *p2, *p3;
+       gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
+       char *page;
+
+       p1 = _gfn_to_page(litevm, fn++);
+       p2 = _gfn_to_page(litevm, fn++);
+       p3 = _gfn_to_page(litevm, fn);
+
+       if (!p1 || !p2 || !p3) {
+               litevm_printf(litevm,"%s: gfn_to_page failed\n", __FUNCTION__);
+               return 0;
+       }
+
+       page = kmap_atomic(p1);
+       memset(page, 0, PAGE_SIZE);
+       *(uint16_t*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+       kunmap_atomic(page);
+
+       page = kmap_atomic(p2);
+       memset(page, 0, PAGE_SIZE);
+       kunmap_atomic(page);
+
+       page = kmap_atomic(p3);
+       memset(page, 0, PAGE_SIZE);
+       *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
+       kunmap_atomic(page);
+
+       return 1;
+}
+
+#ifdef __x86_64__
+
+static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
+{
+       struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
+
+       vcpu->shadow_efer = efer;
+       if (efer & EFER_LMA) {
+               vmcs_write32(VM_ENTRY_CONTROLS,
+                                    vmcs_read32(VM_ENTRY_CONTROLS) |
+                                    VM_ENTRY_CONTROLS_IA32E_MASK);
+               msr->data = efer;
+
+       } else {
+               vmcs_write32(VM_ENTRY_CONTROLS,
+                                    vmcs_read32(VM_ENTRY_CONTROLS) &
+                                    ~VM_ENTRY_CONTROLS_IA32E_MASK);
+
+               msr->data = efer & ~EFER_LME;
+       }
+}
+
+static void enter_lmode(struct litevm_vcpu *vcpu)
+{
+       uint32_t guest_tr_ar;
+
+       guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+       if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
+               printd("%s: tss fixup for long mode. \n",
+                      __FUNCTION__);
+               vmcs_write32(GUEST_TR_AR_BYTES,
+                            (guest_tr_ar & ~AR_TYPE_MASK)
+                            | AR_TYPE_BUSY_64_TSS);
+       }
+
+       vcpu->shadow_efer |= EFER_LMA;
+
+       find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
+       vmcs_write32(VM_ENTRY_CONTROLS,
+                    vmcs_read32(VM_ENTRY_CONTROLS)
+                    | VM_ENTRY_CONTROLS_IA32E_MASK);
+}
+
+static void exit_lmode(struct litevm_vcpu *vcpu)
+{
+       vcpu->shadow_efer &= ~EFER_LMA;
+
+       vmcs_write32(VM_ENTRY_CONTROLS,
+                    vmcs_read32(VM_ENTRY_CONTROLS)
+                    & ~VM_ENTRY_CONTROLS_IA32E_MASK);
+}
+
+#endif
+
+static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
+{
+       if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
+               enter_pmode(vcpu);
+
+       if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
+               enter_rmode(vcpu);
+
+#ifdef __x86_64__
+       if (vcpu->shadow_efer & EFER_LME) {
+               if (!is_paging() && (cr0 & CR0_PG_MASK))
+                       enter_lmode(vcpu);
+               if (is_paging() && !(cr0 & CR0_PG_MASK))
+                       exit_lmode(vcpu);
+       }
+#endif
+
+       vmcs_writel(CR0_READ_SHADOW, cr0);
+       vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
+}
+
+static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
+                                        unsigned long cr3)
+{
+       gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+       unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
+       int i;
+       uint64_t pdpte;
+       uint64_t *pdpt;
+       struct litevm_memory_slot *memslot;
+
+       spin_lock(&vcpu->litevm->lock);
+       memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
+       /* FIXME: !memslot - emulate? 0xff? */
+       pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn));
+
+       for (i = 0; i < 4; ++i) {
+               pdpte = pdpt[offset + i];
+               if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
+                       break;
+       }
+
+       kunmap_atomic(pdpt);
+       spin_unlock(&vcpu->litevm->lock);
+
+       return i != 4;
+}
+
+static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
+{
+       if (cr0 & CR0_RESEVED_BITS) {
+               printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
+                      cr0, guest_cr0());
+               inject_gp(vcpu);
+               return;
+       }
+
+       if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
+               printd("set_cr0: #GP, CD == 0 && NW == 1\n");
+               inject_gp(vcpu);
+               return;
+       }
+
+       if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
+               printd("set_cr0: #GP, set PG flag "
+                      "and a clear PE flag\n");
+               inject_gp(vcpu);
+               return;
+       }
+
+       if (!is_paging() && (cr0 & CR0_PG_MASK)) {
+#ifdef __x86_64__
+               if ((vcpu->shadow_efer & EFER_LME)) {
+                       uint32_t guest_cs_ar;
+                       if (!is_pae()) {
+                               printd("set_cr0: #GP, start paging "
+                                      "in long mode while PAE is disabled\n");
+                               inject_gp(vcpu);
+                               return;
+                       }
+                       guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
+                       if (guest_cs_ar & SEGMENT_AR_L_MASK) {
+                               printd("set_cr0: #GP, start paging "
+                                      "in long mode while CS.L == 1\n");
+                               inject_gp(vcpu);
+                               return;
+
+                       }
+               } else
+#endif
+               if (is_pae() &&
+                           pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+                       printd("set_cr0: #GP, pdptrs "
+                              "reserved bits\n");
+                       inject_gp(vcpu);
+                       return;
+               }
+
+       }
+
+       __set_cr0(vcpu, cr0);
+       litevm_mmu_reset_context(vcpu);
+       return;
+}
+
+static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
+{
+       unsigned long cr0 = guest_cr0();
+
+       if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
+               enter_pmode(vcpu);
+               vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
+
+       } else
+               printd("lmsw: unexpected\n");
+
+       vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
+                               | (msw & LMSW_GUEST_MASK));
+}
+
+static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
+{
+       vmcs_writel(CR4_READ_SHADOW, cr4);
+       vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
+                   LITEVM_RMODE_VM_CR4_ALWAYS_ON : LITEVM_PMODE_VM_CR4_ALWAYS_ON));
+}
+
+static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
+{
+       if (cr4 & CR4_RESEVED_BITS) {
+               printd("set_cr4: #GP, reserved bits\n");
+               inject_gp(vcpu);
+               return;
+       }
+
+       if (is_long_mode()) {
+               if (!(cr4 & CR4_PAE_MASK)) {
+                       printd("set_cr4: #GP, clearing PAE while "
+                              "in long mode\n");
+                       inject_gp(vcpu);
+                       return;
+               }
+       } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
+                  && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+               printd("set_cr4: #GP, pdptrs reserved bits\n");
+               inject_gp(vcpu);
+       }
+
+       if (cr4 & CR4_VMXE_MASK) {
+               printd("set_cr4: #GP, setting VMXE\n");
+               inject_gp(vcpu);
+               return;
+       }
+       __set_cr4(vcpu, cr4);
+       spin_lock(&vcpu->litevm->lock);
+       litevm_mmu_reset_context(vcpu);
+       spin_unlock(&vcpu->litevm->lock);
+}
+
+static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
+{
+       if (is_long_mode()) {
+               if (cr3 & CR3_L_MODE_RESEVED_BITS) {
+                       printd("set_cr3: #GP, reserved bits\n");
+                       inject_gp(vcpu);
+                       return;
+               }
+       } else {
+               if (cr3 & CR3_RESEVED_BITS) {
+                       printd("set_cr3: #GP, reserved bits\n");
+                       inject_gp(vcpu);
+                       return;
+               }
+               if (is_paging() && is_pae() &&
+                   pdptrs_have_reserved_bits_set(vcpu, cr3)) {
+                       printd("set_cr3: #GP, pdptrs "
+                              "reserved bits\n");
+                       inject_gp(vcpu);
+                       return;
+               }
+       }
+
+       vcpu->cr3 = cr3;
+       spin_lock(&vcpu->litevm->lock);
+       vcpu->mmu.new_cr3(vcpu);
+       spin_unlock(&vcpu->litevm->lock);
+}
+
+static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
+{
+       if (cr8 & CR8_RESEVED_BITS) {
+               printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
+               inject_gp(vcpu);
+               return;
+       }
+       vcpu->cr8 = cr8;
+}
+
+static uint32_t get_rdx_init_val(void)
+{
+       uint32_t val;
+
+       asm ("movl $1, %%eax \n\t"
+            "movl %%eax, %0 \n\t" : "=g"(val) : : "eax");
+       return val;
+}
+
+static void fx_init(struct litevm_vcpu *vcpu)
+{
+       struct __attribute__ ((__packed__)) fx_image_s {
+               uint16_t control; //fcw
+               uint16_t status; //fsw
+               uint16_t tag; // ftw
+               uint16_t opcode; //fop
+               uint64_t ip; // fpu ip
+               uint64_t operand;// fpu dp
+               uint32_t mxcsr;
+               uint32_t mxcsr_mask;
+
+       } *fx_image;
+
+       fx_save(vcpu->host_fx_image);
+       fpu_init();
+       fx_save(vcpu->guest_fx_image);
+       fx_restore(vcpu->host_fx_image);
+
+       fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
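+       /* 0x1f80 is the architectural reset value of MXCSR: all SIMD
+        * exceptions masked, round-to-nearest, status flags clear. */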
+       fx_image->mxcsr = 0x1f80;
+       memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
+              0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
+}
+
+static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field, uint32_t val)
+{
+       uint32_t msr_high, msr_low;
+
+       rdmsr(msr, msr_low, msr_high);
+
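+       /* VMX capability MSR convention (Intel SDM vol. 3, appendix A): the
+        * low 32 bits are the control bits that must be 1 and the high 32
+        * bits are the bits that are allowed to be 1, so the masking below
+        * turns on required bits and clears bits the CPU does not support. */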
+       val &= msr_high;
+       val |= msr_low;
+       vmcs_write32(vmcs_field, val);
+}
+
+/*
+ * Sets up the vmcs for emulated real mode.
+ */
+static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
+{
+       extern asmlinkage void litevm_vmx_return(void);
+       uint32_t host_sysenter_cs;
+       uint32_t junk;
+       unsigned long a;
+       struct descriptor_table dt;
+       int i;
+       int ret;
+       uint64_t tsc;
+       int nr_good_msrs;
+
+
+       if (!init_rmode_tss(vcpu->litevm)) {
+               ret = 0;
+               goto out;
+       }
+
+       memset(vcpu->regs, 0, sizeof(vcpu->regs));
+       vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
+       vcpu->cr8 = 0;
+       vcpu->apic_base = 0xfee00000 |
+                       /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
+                       MSR_IA32_APICBASE_ENABLE;
+
+       fx_init(vcpu);
+
+#define SEG_SETUP(seg) do {                                    \
+               vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
+               vmcs_writel(GUEST_##seg##_BASE, 0);             \
+               vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
+               vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
+       } while (0)
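+       /* For example, SEG_SETUP(DS) expands to the real-mode reset state for
+        * a data segment:
+        *
+        *      vmcs_write16(GUEST_DS_SELECTOR, 0);
+        *      vmcs_writel(GUEST_DS_BASE, 0);
+        *      vmcs_write32(GUEST_DS_LIMIT, 0xffff);
+        *      vmcs_write32(GUEST_DS_AR_BYTES, 0x93);
+        */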
+
+       /*
+        * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
+        * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
+        */
+       vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+       vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+
+       SEG_SETUP(DS);
+       SEG_SETUP(ES);
+       SEG_SETUP(FS);
+       SEG_SETUP(GS);
+       SEG_SETUP(SS);
+
+       vmcs_write16(GUEST_TR_SELECTOR, 0);
+       vmcs_writel(GUEST_TR_BASE, 0);
+       vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+       vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+       vmcs_writel(GUEST_LDTR_BASE, 0);
+       vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+       vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
+
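+       /* Architectural reset state: bit 1 of RFLAGS always reads as 1, and
+        * execution starts at offset 0xfff0 in the reset code segment. */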
+       vmcs_writel(GUEST_RFLAGS, 0x02);
+       vmcs_writel(GUEST_RIP, 0xfff0);
+       vmcs_writel(GUEST_RSP, 0);
+
+       vmcs_writel(GUEST_CR3, 0);
+
+       //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
+       vmcs_writel(GUEST_DR7, 0x400);
+
+       vmcs_writel(GUEST_GDTR_BASE, 0);
+       vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+       vmcs_writel(GUEST_IDTR_BASE, 0);
+       vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+       vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+
+       /* I/O */
+       vmcs_write64(IO_BITMAP_A, 0);
+       vmcs_write64(IO_BITMAP_B, 0);
+
+       rdtscll(tsc);
+       vmcs_write64(TSC_OFFSET, -tsc);
+
+       vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+       /* Special registers */
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+       /* Control */
+       vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
+                              PIN_BASED_VM_EXEC_CONTROL,
+                              PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
+                              | PIN_BASED_NMI_EXITING   /* 20.6.1 */
+                       );
+       vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
+                              CPU_BASED_VM_EXEC_CONTROL,
+                              CPU_BASED_HLT_EXITING         /* 20.6.2 */
+                              | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
+                              | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
+                              | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
+                              | CPU_BASED_INVDPG_EXITING
+                              | CPU_BASED_MOV_DR_EXITING
+                              | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
+                       );
+
+       vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+       vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+
+       vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
+       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
+       vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
+
+       vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+       vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+       vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+       vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
+       vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
+       vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+#ifdef __x86_64__
+       rdmsrl(MSR_FS_BASE, a);
+       vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
+       rdmsrl(MSR_GS_BASE, a);
+       vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
+#else
+       vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+       vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+#endif
+
+       vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
+
+       get_idt(&dt);
+       vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
+
+
+       vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */
+
+       rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
+       vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
+       rdmsrl(MSR_IA32_SYSENTER_ESP, a);
+       vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
+       rdmsrl(MSR_IA32_SYSENTER_EIP, a);
+       vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
+
+       ret = -ENOMEM;
+       vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
+       if (!vcpu->guest_msrs)
+               goto out;
+       vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
+       if (!vcpu->host_msrs)
+               goto out_free_guest_msrs;
+
+       for (i = 0; i < NR_VMX_MSR; ++i) {
+               uint32_t index = vmx_msr_index[i];
+               uint32_t data_low, data_high;
+               uint64_t data;
+               int j = vcpu->nmsrs;
+
+               if (rdmsr_safe(index, &data_low, &data_high) < 0)
+                       continue;
+               data = data_low | ((uint64_t)data_high << 32);
+               vcpu->host_msrs[j].index = index;
+               vcpu->host_msrs[j].reserved = 0;
+               vcpu->host_msrs[j].data = data;
+               vcpu->guest_msrs[j] = vcpu->host_msrs[j];
+               ++vcpu->nmsrs;
+       }
+       printk("msrs: %d\n", vcpu->nmsrs);
+
+       nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
+       vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
+                   virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
+       vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
+                   virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
+       vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
+                   virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS));
+       vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
+                              (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */
+       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
+
+
+       /* 22.2.1, 20.8.1 */
+       vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
+                               VM_ENTRY_CONTROLS, 0);
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
+
+       vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
+       vmcs_writel(TPR_THRESHOLD, 0);
+
+       vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
+       vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
+
+       __set_cr0(vcpu, 0x60000010); // enter rmode
+       __set_cr4(vcpu, 0);
+#ifdef __x86_64__
+       __set_efer(vcpu, 0);
+#endif
+
+       ret = litevm_mmu_init(vcpu);
+
+       return ret;
+
+out_free_guest_msrs:
+       kfree(vcpu->guest_msrs);
+out:
+       return ret;
+}
+
+/*
+ * Sync the rsp and rip registers into the vcpu structure.  This allows
+ * registers to be accessed by indexing vcpu->regs.
+ */
+static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
+{
+       vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+       vcpu->rip = vmcs_readl(GUEST_RIP);
+}
+
+/*
+ * Syncs rsp and rip back into the vmcs.  Should be called after possible
+ * modification.
+ */
+static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
+{
+       vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
+       vmcs_writel(GUEST_RIP, vcpu->rip);
+}
+
+/*
+ * Creates some virtual cpus.  Good luck creating more than one.
+ */
+static int litevm_dev_ioctl_create_vcpu(struct litevm *litevm, int n)
+{
+       int r;
+       struct litevm_vcpu *vcpu;
+       struct vmcs *vmcs;
+
+       r = -EINVAL;
+       if (n < 0 || n >= LITEVM_MAX_VCPUS)
+               goto out;
+
+       vcpu = &litevm->vcpus[n];
+
+       mutex_lock(&vcpu->mutex);
+
+       if (vcpu->vmcs) {
+               mutex_unlock(&vcpu->mutex);
+               return -EEXIST;
+       }
+
+       vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
+                                          FX_IMAGE_ALIGN);
+       vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
+
+       vcpu->cpu = -1;  /* First load will set up TR */
+       vcpu->litevm = litevm;
+       vmcs = alloc_vmcs();
+       if (!vmcs) {
+               mutex_unlock(&vcpu->mutex);
+               goto out_free_vcpus;
+       }
+       vmcs_clear(vmcs);
+       vcpu->vmcs = vmcs;
+       vcpu->launched = 0;
+
+       __vcpu_load(vcpu);
+
+       r = litevm_vcpu_setup(vcpu);
+
+       vcpu_put(vcpu);
+
+       if (r < 0)
+               goto out_free_vcpus;
+
+       return 0;
+
+out_free_vcpus:
+       litevm_free_vcpu(vcpu);
+out:
+       return r;
+}
+
+/*
+ * Allocate some memory and give it an address in the guest physical address
+ * space.
+ *
+ * Discontiguous memory is allowed, mostly for framebuffers.
+ */
+static int litevm_dev_ioctl_set_memory_region(struct litevm *litevm,
+                                          struct litevm_memory_region *mem)
+{
+       int r;
+       gfn_t base_gfn;
+       unsigned long npages;
+       unsigned long i;
+       struct litevm_memory_slot *memslot;
+       struct litevm_memory_slot old, new;
+       int memory_config_version;
+
+       r = -EINVAL;
+       /* General sanity checks */
+       if (mem->memory_size & (PAGE_SIZE - 1))
+               goto out;
+       if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+               goto out;
+       if (mem->slot >= LITEVM_MEMORY_SLOTS)
+               goto out;
+       if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+               goto out;
+
+       memslot = &litevm->memslots[mem->slot];
+       base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+       npages = mem->memory_size >> PAGE_SHIFT;
+
+       if (!npages)
+               mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
+
+raced:
+       spin_lock(&litevm->lock);
+
+       memory_config_version = litevm->memory_config_version;
+       new = old = *memslot;
+
+       new.base_gfn = base_gfn;
+       new.npages = npages;
+       new.flags = mem->flags;
+
+       /* Disallow changing a memory slot's size. */
+       r = -EINVAL;
+       if (npages && old.npages && npages != old.npages)
+               goto out_unlock;
+
+       /* Check for overlaps */
+       r = -EEXIST;
+       for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
+               struct litevm_memory_slot *s = &litevm->memslots[i];
+
+               if (s == memslot)
+                       continue;
+               if (!((base_gfn + npages <= s->base_gfn) ||
+                     (base_gfn >= s->base_gfn + s->npages)))
+                       goto out_unlock;
+       }
+       /*
+        * Do memory allocations outside lock.  memory_config_version will
+        * detect any races.
+        */
+       spin_unlock(&litevm->lock);
+
+       /* Deallocate if slot is being removed */
+       if (!npages)
+               new.phys_mem = 0;
+
+       /* Free page dirty bitmap if unneeded */
+       if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
+               new.dirty_bitmap = 0;
+
+       r = -ENOMEM;
+
+       /* Allocate if a slot is being created */
+       if (npages && !new.phys_mem) {
+               new.phys_mem = vmalloc(npages * sizeof(struct page *));
+
+               if (!new.phys_mem)
+                       goto out_free;
+
+               memset(new.phys_mem, 0, npages * sizeof(struct page *));
+               for (i = 0; i < npages; ++i) {
+                       new.phys_mem[i] = alloc_page(GFP_HIGHUSER);
+                       if (!new.phys_mem[i])
+                               goto out_free;
+               }
+       }
+
+       /* Allocate page dirty bitmap if needed */
+       if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
+               unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
+
+               new.dirty_bitmap = vmalloc(dirty_bytes);
+               if (!new.dirty_bitmap)
+                       goto out_free;
+               memset(new.dirty_bitmap, 0, dirty_bytes);
+       }
+
+       spin_lock(&litevm->lock);
+
+       if (memory_config_version != litevm->memory_config_version) {
+               spin_unlock(&litevm->lock);
+               litevm_free_physmem_slot(&new, &old);
+               goto raced;
+       }
+
+       r = -EAGAIN;
+       if (litevm->busy)
+               goto out_unlock;
+
+       if (mem->slot >= litevm->nmemslots)
+               litevm->nmemslots = mem->slot + 1;
+
+       *memslot = new;
+       ++litevm->memory_config_version;
+
+       spin_unlock(&litevm->lock);
+
+       for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
+               struct litevm_vcpu *vcpu;
+
+               vcpu = vcpu_load(litevm, i);
+               if (!vcpu)
+                       continue;
+               litevm_mmu_reset_context(vcpu);
+               vcpu_put(vcpu);
+       }
+
+       litevm_free_physmem_slot(&old, &new);
+       return 0;
+
+out_unlock:
+       spin_unlock(&litevm->lock);
+out_free:
+       litevm_free_physmem_slot(&new, &old);
+out:
+       return r;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
+                                      struct litevm_dirty_log *log)
+{
+       struct litevm_memory_slot *memslot;
+       int r, i;
+       int n;
+       unsigned long any = 0;
+
+       spin_lock(&litevm->lock);
+
+       /*
+        * Prevent changes to guest memory configuration even while the lock
+        * is not taken.
+        */
+       ++litevm->busy;
+       spin_unlock(&litevm->lock);
+       r = -EINVAL;
+       if (log->slot >= LITEVM_MEMORY_SLOTS)
+               goto out;
+
+       memslot = &litevm->memslots[log->slot];
+       r = -ENOENT;
+       if (!memslot->dirty_bitmap)
+               goto out;
+
+       n = ALIGN(memslot->npages, 8) / 8;
+
+       for (i = 0; !any && i < n; ++i)
+               any = memslot->dirty_bitmap[i];
+
+       r = -EFAULT;
+       if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+               goto out;
+
+
+       if (any) {
+               spin_lock(&litevm->lock);
+               litevm_mmu_slot_remove_write_access(litevm, log->slot);
+               spin_unlock(&litevm->lock);
+               memset(memslot->dirty_bitmap, 0, n);
+               for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
+                       struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
+
+                       if (!vcpu)
+                               continue;
+                       flush_guest_tlb(vcpu);
+                       vcpu_put(vcpu);
+               }
+       }
+
+       r = 0;
+
+out:
+       spin_lock(&litevm->lock);
+       --litevm->busy;
+       spin_unlock(&litevm->lock);
+       return r;
+}
+
+struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
+{
+       int i;
+
+       for (i = 0; i < litevm->nmemslots; ++i) {
+               struct litevm_memory_slot *memslot = &litevm->memslots[i];
+
+               if (gfn >= memslot->base_gfn
+                   && gfn < memslot->base_gfn + memslot->npages)
+                       return memslot;
+       }
+       return 0;
+}
+
+void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
+{
+       int i;
+       struct litevm_memory_slot *memslot = 0;
+       unsigned long rel_gfn;
+
+       for (i = 0; i < litevm->nmemslots; ++i) {
+               memslot = &litevm->memslots[i];
+
+               if (gfn >= memslot->base_gfn
+                   && gfn < memslot->base_gfn + memslot->npages) {
+
+                       if (!memslot || !memslot->dirty_bitmap)
+                               return;
+
+                       rel_gfn = gfn - memslot->base_gfn;
+
+                       /* avoid RMW */
+                       if (!test_bit(rel_gfn, memslot->dirty_bitmap))
+                               set_bit(rel_gfn, memslot->dirty_bitmap);
+                       return;
+               }
+       }
+}
+
+static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
+{
+       unsigned long rip;
+       uint32_t interruptibility;
+
+       rip = vmcs_readl(GUEST_RIP);
+       rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+       vmcs_writel(GUEST_RIP, rip);
+
+       /*
+        * We emulated an instruction, so temporary interrupt blocking
+        * should be removed, if set.
+        */
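+       /* Bit 0 of the interruptibility field is "blocking by STI" and bit 1
+        * is "blocking by MOV SS"; those are the two bits cleared below. */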
+       interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+       if (interruptibility & 3)
+               vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+                            interruptibility & ~3);
+}
+
+static int emulator_read_std(unsigned long addr,
+                            unsigned long *val,
+                            unsigned int bytes,
+                            struct x86_emulate_ctxt *ctxt)
+{
+       struct litevm_vcpu *vcpu = ctxt->vcpu;
+       void *data = val;
+
+       while (bytes) {
+               gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+               unsigned offset = addr & (PAGE_SIZE-1);
+               unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
+               unsigned long pfn;
+               struct litevm_memory_slot *memslot;
+               void *page;
+
+               if (gpa == UNMAPPED_GVA)
+                       return X86EMUL_PROPAGATE_FAULT;
+               pfn = gpa >> PAGE_SHIFT;
+               memslot = gfn_to_memslot(vcpu->litevm, pfn);
+               if (!memslot)
+                       return X86EMUL_UNHANDLEABLE;
+               page = kmap_atomic(gfn_to_page(memslot, pfn));
+
+               memcpy(data, page + offset, tocopy);
+
+               kunmap_atomic(page);
+
+               bytes -= tocopy;
+               data += tocopy;
+               addr += tocopy;
+       }
+
+       return X86EMUL_CONTINUE;
+}
+
+static int emulator_write_std(unsigned long addr,
+                             unsigned long val,
+                             unsigned int bytes,
+                             struct x86_emulate_ctxt *ctxt)
+{
+       printk("emulator_write_std: addr %lx n %d\n",
+              addr, bytes);
+       return X86EMUL_UNHANDLEABLE;
+}
+
+static int emulator_read_emulated(unsigned long addr,
+                                 unsigned long *val,
+                                 unsigned int bytes,
+                                 struct x86_emulate_ctxt *ctxt)
+{
+       struct litevm_vcpu *vcpu = ctxt->vcpu;
+
+       if (vcpu->mmio_read_completed) {
+               memcpy(val, vcpu->mmio_data, bytes);
+               vcpu->mmio_read_completed = 0;
+               return X86EMUL_CONTINUE;
+       } else if (emulator_read_std(addr, val, bytes, ctxt)
+                  == X86EMUL_CONTINUE)
+               return X86EMUL_CONTINUE;
+       else {
+               gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+               if (gpa == UNMAPPED_GVA) {
+                       vcpu_printf(vcpu, "not present\n");
+                       return X86EMUL_PROPAGATE_FAULT;
+               }
+               vcpu->mmio_needed = 1;
+               vcpu->mmio_phys_addr = gpa;
+               vcpu->mmio_size = bytes;
+               vcpu->mmio_is_write = 0;
+
+               return X86EMUL_UNHANDLEABLE;
+       }
+}
+
+static int emulator_write_emulated(unsigned long addr,
+                                  unsigned long val,
+                                  unsigned int bytes,
+                                  struct x86_emulate_ctxt *ctxt)
+{
+       struct litevm_vcpu *vcpu = ctxt->vcpu;
+       gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+
+       if (gpa == UNMAPPED_GVA)
+               return X86EMUL_PROPAGATE_FAULT;
+
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_phys_addr = gpa;
+       vcpu->mmio_size = bytes;
+       vcpu->mmio_is_write = 1;
+       memcpy(vcpu->mmio_data, &val, bytes);
+
+       return X86EMUL_CONTINUE;
+}
+
+static int emulator_cmpxchg_emulated(unsigned long addr,
+                                    unsigned long old,
+                                    unsigned long new,
+                                    unsigned int bytes,
+                                    struct x86_emulate_ctxt *ctxt)
+{
+       static int reported;
+
+       if (!reported) {
+               reported = 1;
+               printk(KERN_WARNING "litevm: emulating exchange as write\n");
+       }
+       return emulator_write_emulated(addr, new, bytes, ctxt);
+}
+
+static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
+{
+       static int reported;
+       uint8_t opcodes[4];
+       unsigned long rip = vmcs_readl(GUEST_RIP);
+       unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
+
+       if (reported)
+               return;
+
+       emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
+
+       printk("emulation failed but !mmio_needed?"
+              " rip %lx %02x %02x %02x %02x\n",
+              rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+       reported = 1;
+}
+
+struct x86_emulate_ops emulate_ops = {
+       .read_std            = emulator_read_std,
+       .write_std           = emulator_write_std,
+       .read_emulated       = emulator_read_emulated,
+       .write_emulated      = emulator_write_emulated,
+       .cmpxchg_emulated    = emulator_cmpxchg_emulated,
+};
+
+enum emulation_result {
+       EMULATE_DONE,       /* no further processing */
+       EMULATE_DO_MMIO,      /* litevm_run filled with mmio request */
+       EMULATE_FAIL,         /* can't emulate this instruction */
+};
+
+static int emulate_instruction(struct litevm_vcpu *vcpu,
+                              struct litevm_run *run,
+                              unsigned long cr2,
+                              uint16_t error_code)
+{
+       struct x86_emulate_ctxt emulate_ctxt;
+       int r;
+       uint32_t cs_ar;
+
+       vcpu_load_rsp_rip(vcpu);
+
+       cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
+
+       emulate_ctxt.vcpu = vcpu;
+       emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
+       emulate_ctxt.cr2 = cr2;
+       emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
+               ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
+               ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
+               ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+
+       if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
+               emulate_ctxt.cs_base = 0;
+               emulate_ctxt.ds_base = 0;
+               emulate_ctxt.es_base = 0;
+               emulate_ctxt.ss_base = 0;
+               emulate_ctxt.gs_base = 0;
+               emulate_ctxt.fs_base = 0;
+       } else {
+               emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
+               emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
+               emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
+               emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
+               emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
+               emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
+       }
+
+       vcpu->mmio_is_write = 0;
+       r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
+
+       if ((r || vcpu->mmio_is_write) && run) {
+               run->mmio.phys_addr = vcpu->mmio_phys_addr;
+               memcpy(run->mmio.data, vcpu->mmio_data, 8);
+               run->mmio.len = vcpu->mmio_size;
+               run->mmio.is_write = vcpu->mmio_is_write;
+       }
+
+       if (r) {
+               if (!vcpu->mmio_needed) {
+                       report_emulation_failure(&emulate_ctxt);
+                       return EMULATE_FAIL;
+               }
+               return EMULATE_DO_MMIO;
+       }
+
+       vcpu_put_rsp_rip(vcpu);
+       vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
+
+       if (vcpu->mmio_is_write)
+               return EMULATE_DO_MMIO;
+
+       return EMULATE_DONE;
+}
+
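+/* mk_cr_64() keeps the upper 32 bits of the current register value and
+ * replaces the lower 32 bits, e.g.
+ * mk_cr_64(0xffffffff00000010, 0x80000011) == 0xffffffff80000011. */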
+static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
+{
+       return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
+{
+       vmcs_writel(GUEST_GDTR_BASE, base);
+       vmcs_write32(GUEST_GDTR_LIMIT, limit);
+}
+
+void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
+{
+       vmcs_writel(GUEST_IDTR_BASE, base);
+       vmcs_write32(GUEST_IDTR_LIMIT, limit);
+}
+
+void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
+                  unsigned long *rflags)
+{
+       lmsw(vcpu, msw);
+       *rflags = vmcs_readl(GUEST_RFLAGS);
+}
+
+unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
+{
+       switch (cr) {
+       case 0:
+               return guest_cr0();
+       case 2:
+               return vcpu->cr2;
+       case 3:
+               return vcpu->cr3;
+       case 4:
+               return guest_cr4();
+       default:
+               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+               return 0;
+       }
+}
+
+void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
+                    unsigned long *rflags)
+{
+       switch (cr) {
+       case 0:
+               set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
+               *rflags = vmcs_readl(GUEST_RFLAGS);
+               break;
+       case 2:
+               vcpu->cr2 = val;
+               break;
+       case 3:
+               set_cr3(vcpu, val);
+               break;
+       case 4:
+               set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
+               break;
+       default:
+               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+       }
+}
+
+static int handle_rmode_exception(struct litevm_vcpu *vcpu,
+                                 int vec, uint32_t err_code)
+{
+       if (!vcpu->rmode.active)
+               return 0;
+
+       if (vec == GP_VECTOR && err_code == 0)
+               if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
+                       return 1;
+       return 0;
+}
+
+static int handle_exception(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       uint32_t intr_info, error_code;
+       unsigned long cr2, rip;
+       uint32_t vect_info;
+       enum emulation_result er;
+
+       vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+       if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+                                               !is_page_fault(intr_info)) {
+               printk("%s: unexpected, vectoring info 0x%x "
+                      "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
+       }
+
+       if (is_external_interrupt(vect_info)) {
+               int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
+               set_bit(irq, vcpu->irq_pending);
+               set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+       }
+
+       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
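+               /* Reflect the NMI back into the host by raising vector 2. */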
+               asm ("int $2");
+               return 1;
+       }
+       error_code = 0;
+       rip = vmcs_readl(GUEST_RIP);
+       if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
+               error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+       if (is_page_fault(intr_info)) {
+               cr2 = vmcs_readl(EXIT_QUALIFICATION);
+
+               spin_lock(&vcpu->litevm->lock);
+               if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
+                       spin_unlock(&vcpu->litevm->lock);
+                       return 1;
+               }
+
+               er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
+               spin_unlock(&vcpu->litevm->lock);
+
+               switch (er) {
+               case EMULATE_DONE:
+                       return 1;
+               case EMULATE_DO_MMIO:
+                       ++litevm_stat.mmio_exits;
+                       litevm_run->exit_reason = LITEVM_EXIT_MMIO;
+                       return 0;
+               case EMULATE_FAIL:
+                       vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
+                       break;
+               default:
+                       BUG();
+               }
+       }
+
+       if (vcpu->rmode.active &&
+           handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
+                                                               error_code))
+               return 1;
+
+       if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
+               litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
+               return 0;
+       }
+       litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
+       litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
+       litevm_run->ex.error_code = error_code;
+       return 0;
+}
+
+static int handle_external_interrupt(struct litevm_vcpu *vcpu,
+                                    struct litevm_run *litevm_run)
+{
+       ++litevm_stat.irq_exits;
+       return 1;
+}
+
+
+static int get_io_count(struct litevm_vcpu *vcpu, uint64_t *count)
+{
+       uint64_t inst;
+       gva_t rip;
+       int countr_size;
+       int i, n;
+
+       if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
+               countr_size = 2;
+       } else {
+               uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
+
+               countr_size = (cs_ar & AR_L_MASK) ? 8:
+                             (cs_ar & AR_DB_MASK) ? 4: 2;
+       }
+
+       rip =  vmcs_readl(GUEST_RIP);
+       if (countr_size != 8)
+               rip += vmcs_readl(GUEST_CS_BASE);
+
+       n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
+
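+       /* Skip legacy instruction prefixes: LOCK (0xf0), REPNE/REP (0xf2/0xf3),
+        * segment overrides (0x2e, 0x36, 0x3e, 0x26, 0x64, 0x65) and the
+        * operand-size prefix (0x66).  The address-size prefix (0x67) changes
+        * which width of RCX holds the repeat count. */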
+       for (i = 0; i < n; i++) {
+               switch (((uint8_t*)&inst)[i]) {
+               case 0xf0:
+               case 0xf2:
+               case 0xf3:
+               case 0x2e:
+               case 0x36:
+               case 0x3e:
+               case 0x26:
+               case 0x64:
+               case 0x65:
+               case 0x66:
+                       break;
+               case 0x67:
+                       countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
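+                       /* fall through: 0x67 also ends the prefix scan */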
+               default:
+                       goto done;
+               }
+       }
+       return 0;
+done:
+       countr_size *= 8;
+       *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
+       return 1;
+}
+
+static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       uint64_t exit_qualification;
+
+       ++litevm_stat.io_exits;
+       exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+       litevm_run->exit_reason = LITEVM_EXIT_IO;
+       if (exit_qualification & 8)
+               litevm_run->io.direction = LITEVM_EXIT_IO_IN;
+       else
+               litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
+       litevm_run->io.size = (exit_qualification & 7) + 1;
+       litevm_run->io.string = (exit_qualification & 16) != 0;
+       litevm_run->io.string_down
+               = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
+       litevm_run->io.rep = (exit_qualification & 32) != 0;
+       litevm_run->io.port = exit_qualification >> 16;
+       if (litevm_run->io.string) {
+               if (!get_io_count(vcpu, &litevm_run->io.count))
+                       return 1;
+               litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+       } else
+               litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
+       return 0;
+}
+
+static int handle_invlpg(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
+       int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+       spin_lock(&vcpu->litevm->lock);
+       vcpu->mmu.inval_page(vcpu, address);
+       spin_unlock(&vcpu->litevm->lock);
+       vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
+       return 1;
+}
+
+static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       uint64_t exit_qualification;
+       int cr;
+       int reg;
+
+#ifdef LITEVM_DEBUG
+       if (guest_cpl() != 0) {
+               vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
+               inject_gp(vcpu);
+               return 1;
+       }
+#endif
+
+       exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+       cr = exit_qualification & 15;
+       reg = (exit_qualification >> 8) & 15;
+       switch ((exit_qualification >> 4) & 3) {
+       case 0: /* mov to cr */
+               switch (cr) {
+               case 0:
+                       vcpu_load_rsp_rip(vcpu);
+                       set_cr0(vcpu, vcpu->regs[reg]);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               case 3:
+                       vcpu_load_rsp_rip(vcpu);
+                       set_cr3(vcpu, vcpu->regs[reg]);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               case 4:
+                       vcpu_load_rsp_rip(vcpu);
+                       set_cr4(vcpu, vcpu->regs[reg]);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               case 8:
+                       vcpu_load_rsp_rip(vcpu);
+                       set_cr8(vcpu, vcpu->regs[reg]);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+               break;
+       case 1: /*mov from cr*/
+               switch (cr) {
+               case 3:
+                       vcpu_load_rsp_rip(vcpu);
+                       vcpu->regs[reg] = vcpu->cr3;
+                       vcpu_put_rsp_rip(vcpu);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               case 8:
+                       printd("handle_cr: read CR8 "
+                              "cpu erratum AA15\n");
+                       vcpu_load_rsp_rip(vcpu);
+                       vcpu->regs[reg] = vcpu->cr8;
+                       vcpu_put_rsp_rip(vcpu);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+               break;
+       case 3: /* lmsw */
+               lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+
+               skip_emulated_instruction(vcpu);
+               return 1;
+       default:
+               break;
+       }
+       litevm_run->exit_reason = 0;
+       printk("litevm: unhandled control register: op %d cr %d\n",
+              (int)(exit_qualification >> 4) & 3, cr);
+       return 0;
+}
+
+static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       uint64_t exit_qualification;
+       unsigned long val;
+       int dr, reg;
+
+       /*
+        * FIXME: this code assumes the host is debugging the guest.
+        *        need to deal with guest debugging itself too.
+        */
+       exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+       dr = exit_qualification & 7;
+       reg = (exit_qualification >> 8) & 15;
+       vcpu_load_rsp_rip(vcpu);
+       if (exit_qualification & 16) {
+               /* mov from dr */
+               switch (dr) {
+               case 6:
+                       val = 0xffff0ff0;
+                       break;
+               case 7:
+                       val = 0x400;
+                       break;
+               default:
+                       val = 0;
+               }
+               vcpu->regs[reg] = val;
+       } else {
+               /* mov to dr */
+       }
+       vcpu_put_rsp_rip(vcpu);
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       litevm_run->exit_reason = LITEVM_EXIT_CPUID;
+       return 0;
+}
+
+static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
+       struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
+       uint64_t data;
+
+#ifdef LITEVM_DEBUG
+       if (guest_cpl() != 0) {
+               vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
+               inject_gp(vcpu);
+               return 1;
+       }
+#endif
+
+       switch (ecx) {
+#ifdef __x86_64__
+       case MSR_FS_BASE:
+               data = vmcs_readl(GUEST_FS_BASE);
+               break;
+       case MSR_GS_BASE:
+               data = vmcs_readl(GUEST_GS_BASE);
+               break;
+#endif
+       case MSR_IA32_SYSENTER_CS:
+               data = vmcs_read32(GUEST_SYSENTER_CS);
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+               data = vmcs_read32(GUEST_SYSENTER_EIP);
+               break;
+       case MSR_IA32_SYSENTER_ESP:
+               data = vmcs_read32(GUEST_SYSENTER_ESP);
+               break;
+       case MSR_IA32_MC0_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MCG_CAP:
+       case MSR_IA32_MC0_MISC:
+       case MSR_IA32_MC0_MISC+4:
+       case MSR_IA32_MC0_MISC+8:
+       case MSR_IA32_MC0_MISC+12:
+       case MSR_IA32_MC0_MISC+16:
+       case MSR_IA32_UCODE_REV:
+               /* MTRR registers */
+       case 0xfe:
+       case 0x200 ... 0x2ff:
+               data = 0;
+               break;
+       case MSR_IA32_APICBASE:
+               data = vcpu->apic_base;
+               break;
+       default:
+               if (msr) {
+                       data = msr->data;
+                       break;
+               }
+               printk("litevm: unhandled rdmsr: %x\n", ecx);
+               inject_gp(vcpu);
+               return 1;
+       }
+
+       /* FIXME: handling of bits 32:63 of rax, rdx */
+       vcpu->regs[VCPU_REGS_RAX] = data & -1u;
+       vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+#ifdef __x86_64__
+
+static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
+{
+       struct vmx_msr_entry *msr;
+
+       if (efer & EFER_RESERVED_BITS) {
+               printd("set_efer: 0x%llx #GP, reserved bits\n",
+                      efer);
+               inject_gp(vcpu);
+               return;
+       }
+
+       if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+               printd("set_efer: #GP, change LME while paging\n");
+               inject_gp(vcpu);
+               return;
+       }
+
+       efer &= ~EFER_LMA;
+       efer |= vcpu->shadow_efer & EFER_LMA;
+
+       vcpu->shadow_efer = efer;
+
+       msr = find_msr_entry(vcpu, MSR_EFER);
+
+       if (!(efer & EFER_LMA))
+           efer &= ~EFER_LME;
+       msr->data = efer;
+       skip_emulated_instruction(vcpu);
+}
+
+#endif
+
+#define MSR_IA32_TIME_STAMP_COUNTER 0x10
+
+static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
+       struct vmx_msr_entry *msr;
+       uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
+               | ((uint64_t)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
+
+#ifdef LITEVM_DEBUG
+       if (guest_cpl() != 0) {
+               vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
+               inject_gp(vcpu);
+               return 1;
+       }
+#endif
+
+       switch (ecx) {
+#ifdef __x86_64__
+       case MSR_FS_BASE:
+               vmcs_writel(GUEST_FS_BASE, data);
+               break;
+       case MSR_GS_BASE:
+               vmcs_writel(GUEST_GS_BASE, data);
+               break;
+#endif
+       case MSR_IA32_SYSENTER_CS:
+               vmcs_write32(GUEST_SYSENTER_CS, data);
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+               vmcs_write32(GUEST_SYSENTER_EIP, data);
+               break;
+       case MSR_IA32_SYSENTER_ESP:
+               vmcs_write32(GUEST_SYSENTER_ESP, data);
+               break;
+#ifdef __x86_64__
+       case MSR_EFER:
+               set_efer(vcpu, data);
+               return 1;
+       case MSR_IA32_MC0_STATUS:
+               printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
+                           , __FUNCTION__, data);
+               break;
+#endif
+       case MSR_IA32_TIME_STAMP_COUNTER: {
+               uint64_t tsc;
+
+               rdtscll(tsc);
+               vmcs_write64(TSC_OFFSET, data - tsc);
+               break;
+       }
+       case MSR_IA32_UCODE_REV:
+       case MSR_IA32_UCODE_WRITE:
+       case 0x200 ... 0x2ff: /* MTRRs */
+               break;
+       case MSR_IA32_APICBASE:
+               vcpu->apic_base = data;
+               break;
+       default:
+               msr = find_msr_entry(vcpu, ecx);
+               if (msr) {
+                       msr->data = data;
+                       break;
+               }
+               printk("litevm: unhandled wrmsr: %x\n", ecx);
+               inject_gp(vcpu);
+               return 1;
+       }
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+static int handle_interrupt_window(struct litevm_vcpu *vcpu,
+                                  struct litevm_run *litevm_run)
+{
+       /* Turn off interrupt window reporting. */
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+                    vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
+                    & ~CPU_BASED_VIRTUAL_INTR_PENDING);
+       return 1;
+}
+
+static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
+{
+       skip_emulated_instruction(vcpu);
+       if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
+               return 1;
+
+       litevm_run->exit_reason = LITEVM_EXIT_HLT;
+       return 0;
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ */
+static int (*litevm_vmx_exit_handlers[])(struct litevm_vcpu *vcpu,
+                                     struct litevm_run *litevm_run) = {
+       [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
+       [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
+       [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
+       [EXIT_REASON_INVLPG]                  = handle_invlpg,
+       [EXIT_REASON_CR_ACCESS]               = handle_cr,
+       [EXIT_REASON_DR_ACCESS]               = handle_dr,
+       [EXIT_REASON_CPUID]                   = handle_cpuid,
+       [EXIT_REASON_MSR_READ]                = handle_rdmsr,
+       [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
+       [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
+       [EXIT_REASON_HLT]                     = handle_halt,
+};
+
+static const int litevm_vmx_max_exit_handlers =
+       sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
+
+/*
+ * The guest has exited.  See if we can fix it or if we need userspace
+ * assistance.
+ */
+static int litevm_handle_exit(struct litevm_run *litevm_run, struct litevm_vcpu *vcpu)
+{
+       uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+       if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+                               exit_reason != EXIT_REASON_EXCEPTION_NMI)
+               printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
+                      "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
+       litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+       if (exit_reason < litevm_vmx_max_exit_handlers
+           && litevm_vmx_exit_handlers[exit_reason])
+               return litevm_vmx_exit_handlers[exit_reason](vcpu, litevm_run);
+       else {
+               litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
+               litevm_run->hw.hardware_exit_reason = exit_reason;
+       }
+       return 0;
+}
+
+static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
+{
+       uint16_t ent[2];
+       uint16_t cs;
+       uint16_t ip;
+       unsigned long flags;
+       unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
+       uint16_t sp =  vmcs_readl(GUEST_RSP);
+       uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+
+       if (sp > ss_limit || sp - 6 > sp) {
+               vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
+                           __FUNCTION__,
+                           vmcs_readl(GUEST_RSP),
+                           vmcs_readl(GUEST_SS_BASE),
+                           vmcs_read32(GUEST_SS_LIMIT));
+               return;
+       }
+
+       if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
+                                                               sizeof(ent)) {
+               vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
+               return;
+       }
+
+       flags =  vmcs_readl(GUEST_RFLAGS);
+       cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
+       ip =  vmcs_readl(GUEST_RIP);
+
+
+       if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
+           litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
+           litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
+               vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
+               return;
+       }
+
+       vmcs_writel(GUEST_RFLAGS, flags &
+                   ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
+       vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
+       vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
+       vmcs_writel(GUEST_RIP, ent[0]);
+       vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
+}
+
+static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
+{
+       int word_index = __ffs(vcpu->irq_summary);
+       int bit_index = __ffs(vcpu->irq_pending[word_index]);
+       int irq = word_index * BITS_PER_LONG + bit_index;
+
+       clear_bit(bit_index, &vcpu->irq_pending[word_index]);
+       if (!vcpu->irq_pending[word_index])
+               clear_bit(word_index, &vcpu->irq_summary);
+
+       if (vcpu->rmode.active) {
+               inject_rmode_irq(vcpu, irq);
+               return;
+       }
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                       irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+}
+
+static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
+{
+       if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
+           && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
+               /*
+                * Interrupts enabled, and not blocked by sti or mov ss. Good.
+                */
+               litevm_do_inject_irq(vcpu);
+       else
+               /*
+                * Interrupts blocked.  Wait for unblock.
+                */
+               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+                            vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
+                            | CPU_BASED_VIRTUAL_INTR_PENDING);
+}
+
+static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
+{
+       struct litevm_guest_debug *dbg = &vcpu->guest_debug;
+
+       set_debugreg(dbg->bp[0], 0);
+       set_debugreg(dbg->bp[1], 1);
+       set_debugreg(dbg->bp[2], 2);
+       set_debugreg(dbg->bp[3], 3);
+
+       if (dbg->singlestep) {
+               unsigned long flags;
+
+               flags = vmcs_readl(GUEST_RFLAGS);
+               flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+               vmcs_writel(GUEST_RFLAGS, flags);
+       }
+}
+
+static void load_msrs(struct vmx_msr_entry *e, int n)
+{
+       int i;
+
+       for (i = 0; i < n; ++i)
+               wrmsrl(e[i].index, e[i].data);
+}
+
+static void save_msrs(struct vmx_msr_entry *e, int n)
+{
+       int i;
+
+       for (i = 0; i < n; ++i)
+               rdmsrl(e[i].index, e[i].data);
+}
+
+static int litevm_dev_ioctl_run(struct litevm *litevm, struct litevm_run *litevm_run)
+{
+       struct litevm_vcpu *vcpu;
+       uint8_t fail;
+       uint16_t fs_sel, gs_sel, ldt_sel;
+       int fs_gs_ldt_reload_needed;
+
+       if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
+               return -EINVAL;
+
+       vcpu = vcpu_load(litevm, litevm_run->vcpu);
+       if (!vcpu)
+               return -ENOENT;
+
+       if (litevm_run->emulated) {
+               skip_emulated_instruction(vcpu);
+               litevm_run->emulated = 0;
+       }
+
+       if (litevm_run->mmio_completed) {
+               memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
+               vcpu->mmio_read_completed = 1;
+       }
+
+       vcpu->mmio_needed = 0;
+
+again:
+       /*
+        * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
+        * allow segment selectors with cpl > 0 or ti == 1.
+        */
+       fs_sel = read_fs();
+       gs_sel = read_gs();
+       ldt_sel = read_ldt();
+       fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
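+       /* Bits 0-1 of a selector are the RPL and bit 2 is the TI (LDT) bit;
+        * VMX host state requires RPL == 0 and TI == 0, so such selectors
+        * (and any non-null LDT) must be reloaded by hand after the exit. */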
+       if (!fs_gs_ldt_reload_needed) {
+               vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+               vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+       } else {
+               vmcs_write16(HOST_FS_SELECTOR, 0);
+               vmcs_write16(HOST_GS_SELECTOR, 0);
+       }
+
+#ifdef __x86_64__
+       vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
+       vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+#endif
+
+       if (vcpu->irq_summary &&
+           !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
+               litevm_try_inject_irq(vcpu);
+
+       if (vcpu->guest_debug.enabled)
+               litevm_guest_debug_pre(vcpu);
+
+       fx_save(vcpu->host_fx_image);
+       fx_restore(vcpu->guest_fx_image);
+
+       save_msrs(vcpu->host_msrs, vcpu->nmsrs);
+       load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
+
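+       /*
+        * Hand-rolled VM entry.  Operand map: %0 receives the entry-failure
+        * flag (setbe below), %1 is vcpu->launched, %2 is the HOST_RSP VMCS
+        * field encoding, %3 is the vcpu pointer pinned in rcx/ecx.  Host
+        * GPRs are pushed on the stack and HOST_RSP is written to the VMCS;
+        * guest GPRs and cr2 are loaded through the %c[...] offsets, then
+        * vmlaunch (first entry) or vmresume runs the guest until the next
+        * exit lands at litevm_vmx_return, where the guest registers are
+        * saved back and the host registers restored.
+        */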
+       asm (
+               /* Store host registers */
+               "pushf \n\t"
+#ifdef __x86_64__
+               "push %%rax; push %%rbx; push %%rdx;"
+               "push %%rsi; push %%rdi; push %%rbp;"
+               "push %%r8;  push %%r9;  push %%r10; push %%r11;"
+               "push %%r12; push %%r13; push %%r14; push %%r15;"
+               "push %%rcx \n\t"
+               "vmwrite %%rsp, %2 \n\t"
+#else
+               "pusha; push %%ecx \n\t"
+               "vmwrite %%esp, %2 \n\t"
+#endif
+               /* Check if vmlaunch or vmresume is needed */
+               "cmp $0, %1 \n\t"
+               /* Load guest registers.  Don't clobber flags. */
+#ifdef __x86_64__
+               "mov %c[cr2](%3), %%rax \n\t"
+               "mov %%rax, %%cr2 \n\t"
+               "mov %c[rax](%3), %%rax \n\t"
+               "mov %c[rbx](%3), %%rbx \n\t"
+               "mov %c[rdx](%3), %%rdx \n\t"
+               "mov %c[rsi](%3), %%rsi \n\t"
+               "mov %c[rdi](%3), %%rdi \n\t"
+               "mov %c[rbp](%3), %%rbp \n\t"
+               "mov %c[r8](%3),  %%r8  \n\t"
+               "mov %c[r9](%3),  %%r9  \n\t"
+               "mov %c[r10](%3), %%r10 \n\t"
+               "mov %c[r11](%3), %%r11 \n\t"
+               "mov %c[r12](%3), %%r12 \n\t"
+               "mov %c[r13](%3), %%r13 \n\t"
+               "mov %c[r14](%3), %%r14 \n\t"
+               "mov %c[r15](%3), %%r15 \n\t"
+               "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
+#else
+               "mov %c[cr2](%3), %%eax \n\t"
+               "mov %%eax,   %%cr2 \n\t"
+               "mov %c[rax](%3), %%eax \n\t"
+               "mov %c[rbx](%3), %%ebx \n\t"
+               "mov %c[rdx](%3), %%edx \n\t"
+               "mov %c[rsi](%3), %%esi \n\t"
+               "mov %c[rdi](%3), %%edi \n\t"
+               "mov %c[rbp](%3), %%ebp \n\t"
+               "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
+#endif
+               /* Enter guest mode */
+               "jne launched \n\t"
+               "vmlaunch \n\t"
+               "jmp litevm_vmx_return \n\t"
+               "launched: vmresume \n\t"
+               ".globl litevm_vmx_return \n\t"
+               "litevm_vmx_return: "
+               /* Save guest registers, load host registers, keep flags */
+#ifdef __x86_64__
+               "xchg %3,     0(%%rsp) \n\t"
+               "mov %%rax, %c[rax](%3) \n\t"
+               "mov %%rbx, %c[rbx](%3) \n\t"
+               "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
+               "mov %%rdx, %c[rdx](%3) \n\t"
+               "mov %%rsi, %c[rsi](%3) \n\t"
+               "mov %%rdi, %c[rdi](%3) \n\t"
+               "mov %%rbp, %c[rbp](%3) \n\t"
+               "mov %%r8,  %c[r8](%3) \n\t"
+               "mov %%r9,  %c[r9](%3) \n\t"
+               "mov %%r10, %c[r10](%3) \n\t"
+               "mov %%r11, %c[r11](%3) \n\t"
+               "mov %%r12, %c[r12](%3) \n\t"
+               "mov %%r13, %c[r13](%3) \n\t"
+               "mov %%r14, %c[r14](%3) \n\t"
+               "mov %%r15, %c[r15](%3) \n\t"
+               "mov %%cr2, %%rax   \n\t"
+               "mov %%rax, %c[cr2](%3) \n\t"
+               "mov 0(%%rsp), %3 \n\t"
+
+               "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
+               "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
+               "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
+               "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
+#else
+               "xchg %3, 0(%%esp) \n\t"
+               "mov %%eax, %c[rax](%3) \n\t"
+               "mov %%ebx, %c[rbx](%3) \n\t"
+               "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
+               "mov %%edx, %c[rdx](%3) \n\t"
+               "mov %%esi, %c[rsi](%3) \n\t"
+               "mov %%edi, %c[rdi](%3) \n\t"
+               "mov %%ebp, %c[rbp](%3) \n\t"
+               "mov %%cr2, %%eax  \n\t"
+               "mov %%eax, %c[cr2](%3) \n\t"
+               "mov 0(%%esp), %3 \n\t"
+
+               "pop %%ecx; popa \n\t"
+#endif
+               "setbe %0 \n\t"
+               "popf \n\t"
+             : "=g" (fail)
+             : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
+               "c"(vcpu),
+               [rax]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
+               [rbx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
+               [rcx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
+               [rdx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
+               [rsi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
+               [rdi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
+               [rbp]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
+#ifdef __x86_64__
+               [r8 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8 ])),
+               [r9 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9 ])),
+               [r10]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
+               [r11]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
+               [r12]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
+               [r13]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
+               [r14]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
+               [r15]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
+#endif
+               [cr2]"i"(offsetof(struct litevm_vcpu, cr2))
+             : "cc", "memory" );
+
+       ++litevm_stat.exits;
+
+       save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
+       load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
+
+       fx_save(vcpu->guest_fx_image);
+       fx_restore(vcpu->host_fx_image);
+
+#ifndef __x86_64__
+       asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+#endif
+
+       litevm_run->exit_type = 0;
+       if (fail) {
+               litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
+               litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
+       } else {
+               if (fs_gs_ldt_reload_needed) {
+                       load_ldt(ldt_sel);
+                       load_fs(fs_sel);
+                       /*
+                        * If we have to reload gs, we must take care to
+                        * preserve our gs base.
+                        */
+                       local_irq_disable();
+                       load_gs(gs_sel);
+#ifdef __x86_64__
+                       wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
+#endif
+                       local_irq_enable();
+
+                       reload_tss();
+               }
+               vcpu->launched = 1;
+               litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
+               if (litevm_handle_exit(litevm_run, vcpu)) {
+                       /* Give the scheduler a chance to reschedule. */
+                       vcpu_put(vcpu);
+                       if (signal_pending(current)) {
+                               ++litevm_stat.signal_exits;
+                               return -EINTR;
+                       }
+                       cond_resched();
+                       /* Cannot fail - no vcpu unplug yet. */
+                       vcpu_load(litevm, vcpu_slot(vcpu));
+                       goto again;
+               }
+       }
+
+       vcpu_put(vcpu);
+       return 0;
+}
+
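+/* LITEVM_GET_REGS: general purpose registers come from the cached
+ * vcpu->regs array, while rsp, rip and rflags are read back from the
+ * VMCS, since the hardware keeps those in guest state directly.
+ */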
+static int litevm_dev_ioctl_get_regs(struct litevm *litevm, struct litevm_regs *regs)
+{
+       struct litevm_vcpu *vcpu;
+
+       if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
+               return -EINVAL;
+
+       vcpu = vcpu_load(litevm, regs->vcpu);
+       if (!vcpu)
+               return -ENOENT;
+
+       regs->rax = vcpu->regs[VCPU_REGS_RAX];
+       regs->rbx = vcpu->regs[VCPU_REGS_RBX];
+       regs->rcx = vcpu->regs[VCPU_REGS_RCX];
+       regs->rdx = vcpu->regs[VCPU_REGS_RDX];
+       regs->rsi = vcpu->regs[VCPU_REGS_RSI];
+       regs->rdi = vcpu->regs[VCPU_REGS_RDI];
+       regs->rsp = vmcs_readl(GUEST_RSP);
+       regs->rbp = vcpu->regs[VCPU_REGS_RBP];
+#ifdef __x86_64__
+       regs->r8 = vcpu->regs[VCPU_REGS_R8];
+       regs->r9 = vcpu->regs[VCPU_REGS_R9];
+       regs->r10 = vcpu->regs[VCPU_REGS_R10];
+       regs->r11 = vcpu->regs[VCPU_REGS_R11];
+       regs->r12 = vcpu->regs[VCPU_REGS_R12];
+       regs->r13 = vcpu->regs[VCPU_REGS_R13];
+       regs->r14 = vcpu->regs[VCPU_REGS_R14];
+       regs->r15 = vcpu->regs[VCPU_REGS_R15];
+#endif
+
+       regs->rip = vmcs_readl(GUEST_RIP);
+       regs->rflags = vmcs_readl(GUEST_RFLAGS);
+
+       /*
+        * Don't leak debug flags in case they were set for guest debugging
+        */
+       if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
+               regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
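+/* LITEVM_SET_REGS: the inverse of GET_REGS - GPRs are staged in
+ * vcpu->regs for the next entry, while rsp, rip and rflags are written
+ * straight into the VMCS guest-state fields.
+ */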
+static int litevm_dev_ioctl_set_regs(struct litevm *litevm, struct litevm_regs *regs)
+{
+       struct litevm_vcpu *vcpu;
+
+       if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
+               return -EINVAL;
+
+       vcpu = vcpu_load(litevm, regs->vcpu);
+       if (!vcpu)
+               return -ENOENT;
+
+       vcpu->regs[VCPU_REGS_RAX] = regs->rax;
+       vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
+       vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
+       vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
+       vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
+       vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
+       vmcs_writel(GUEST_RSP, regs->rsp);
+       vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
+#ifdef __x86_64__
+       vcpu->regs[VCPU_REGS_R8] = regs->r8;
+       vcpu->regs[VCPU_REGS_R9] = regs->r9;
+       vcpu->regs[VCPU_REGS_R10] = regs->r10;
+       vcpu->regs[VCPU_REGS_R11] = regs->r11;
+       vcpu->regs[VCPU_REGS_R12] = regs->r12;
+       vcpu->regs[VCPU_REGS_R13] = regs->r13;
+       vcpu->regs[VCPU_REGS_R14] = regs->r14;
+       vcpu->regs[VCPU_REGS_R15] = regs->r15;
+#endif
+
+       vmcs_writel(GUEST_RIP, regs->rip);
+       vmcs_writel(GUEST_RFLAGS, regs->rflags);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
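+/* LITEVM_GET_SREGS: segment and descriptor-table state is read out of
+ * the VMCS; the packed access-rights word is unpacked into the
+ * type/s/dpl/present/avl/l/db/g fields, with an unusable segment cleared
+ * to zero before unpacking.
+ */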
+static int litevm_dev_ioctl_get_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
+{
+       struct litevm_vcpu *vcpu;
+
+       if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
+               return -EINVAL;
+       vcpu = vcpu_load(litevm, sregs->vcpu);
+       if (!vcpu)
+               return -ENOENT;
+
+#define get_segment(var, seg) \
+       do { \
+               uint32_t ar; \
+               \
+               sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
+               sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
+               sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
+               ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
+               if (ar & AR_UNUSABLE_MASK) ar = 0; \
+               sregs->var.type = ar & 15; \
+               sregs->var.s = (ar >> 4) & 1; \
+               sregs->var.dpl = (ar >> 5) & 3; \
+               sregs->var.present = (ar >> 7) & 1; \
+               sregs->var.avl = (ar >> 12) & 1; \
+               sregs->var.l = (ar >> 13) & 1; \
+               sregs->var.db = (ar >> 14) & 1; \
+               sregs->var.g = (ar >> 15) & 1; \
+               sregs->var.unusable = (ar >> 16) & 1; \
+       } while (0)
+
+       get_segment(cs, CS);
+       get_segment(ds, DS);
+       get_segment(es, ES);
+       get_segment(fs, FS);
+       get_segment(gs, GS);
+       get_segment(ss, SS);
+
+       get_segment(tr, TR);
+       get_segment(ldt, LDTR);
+#undef get_segment
+
+#define get_dtable(var, table) \
+       sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
+               sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
+
+       get_dtable(idt, IDTR);
+       get_dtable(gdt, GDTR);
+#undef get_dtable
+
+       sregs->cr0 = guest_cr0();
+       sregs->cr2 = vcpu->cr2;
+       sregs->cr3 = vcpu->cr3;
+       sregs->cr4 = guest_cr4();
+       sregs->cr8 = vcpu->cr8;
+       sregs->efer = vcpu->shadow_efer;
+       sregs->apic_base = vcpu->apic_base;
+
+       sregs->pending_int = vcpu->irq_summary != 0;
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
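+/* LITEVM_SET_SREGS: repack the access-rights word, write the segment,
+ * descriptor-table and control-register state back, and rebuild the
+ * shadow MMU context whenever cr0, cr3, cr4 or efer changed.
+ */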
+static int litevm_dev_ioctl_set_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
+{
+       struct litevm_vcpu *vcpu;
+       int mmu_reset_needed = 0;
+
+       if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
+               return -EINVAL;
+       vcpu = vcpu_load(litevm, sregs->vcpu);
+       if (!vcpu)
+               return -ENOENT;
+
+#define set_segment(var, seg) \
+       do { \
+               uint32_t ar; \
+               \
+               vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
+               vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
+               vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
+               if (sregs->var.unusable) { \
+                       ar = (1 << 16); \
+               } else { \
+                       ar = (sregs->var.type & 15); \
+                       ar |= (sregs->var.s & 1) << 4; \
+                       ar |= (sregs->var.dpl & 3) << 5; \
+                       ar |= (sregs->var.present & 1) << 7; \
+                       ar |= (sregs->var.avl & 1) << 12; \
+                       ar |= (sregs->var.l & 1) << 13; \
+                       ar |= (sregs->var.db & 1) << 14; \
+                       ar |= (sregs->var.g & 1) << 15; \
+               } \
+               vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
+       } while (0)
+
+       set_segment(cs, CS);
+       set_segment(ds, DS);
+       set_segment(es, ES);
+       set_segment(fs, FS);
+       set_segment(gs, GS);
+       set_segment(ss, SS);
+
+       set_segment(tr, TR);
+
+       set_segment(ldt, LDTR);
+#undef set_segment
+
+#define set_dtable(var, table) \
+       vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
+       vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
+
+       set_dtable(idt, IDTR);
+       set_dtable(gdt, GDTR);
+#undef set_dtable
+
+       vcpu->cr2 = sregs->cr2;
+       mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
+       vcpu->cr3 = sregs->cr3;
+
+       vcpu->cr8 = sregs->cr8;
+
+       mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
+#ifdef __x86_64__
+       __set_efer(vcpu, sregs->efer);
+#endif
+       vcpu->apic_base = sregs->apic_base;
+
+       mmu_reset_needed |= guest_cr0() != sregs->cr0;
+       vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
+       update_exception_bitmap(vcpu);
+       vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
+       vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
+
+       mmu_reset_needed |=  guest_cr4() != sregs->cr4;
+       __set_cr4(vcpu, sregs->cr4);
+
+       if (mmu_reset_needed)
+               litevm_mmu_reset_context(vcpu);
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+static int litevm_dev_ioctl_translate(struct litevm *litevm, struct litevm_translation *tr)
+{
+       unsigned long vaddr = tr->linear_address;
+       struct litevm_vcpu *vcpu;
+       gpa_t gpa;
+
+       vcpu = vcpu_load(litevm, tr->vcpu);
+       if (!vcpu)
+               return -ENOENT;
+       spin_lock(&litevm->lock);
+       gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
+       tr->physical_address = gpa;
+       tr->valid = gpa != UNMAPPED_GVA;
+       tr->writeable = 1;
+       tr->usermode = 0;
+       spin_unlock(&litevm->lock);
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
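+/* LITEVM_INTERRUPT: queue an external interrupt by setting its bit in
+ * irq_pending and the matching word bit in irq_summary; the actual
+ * injection happens on the next VM entry via litevm_try_inject_irq.
+ */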
+static int litevm_dev_ioctl_interrupt(struct litevm *litevm, struct litevm_interrupt *irq)
+{
+       struct litevm_vcpu *vcpu;
+
+       if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
+               return -EINVAL;
+       if (irq->irq < 0 || irq->irq >= 256)
+               return -EINVAL;
+       vcpu = vcpu_load(litevm, irq->vcpu);
+       if (!vcpu)
+               return -ENOENT;
+
+       set_bit(irq->irq, vcpu->irq_pending);
+       set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
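+/* LITEVM_DEBUG_GUEST: build DR7 (bit 9 = exact, bits 2*i = global enable,
+ * the length/type fields at 16 + 4*i left zero for execution breakpoints)
+ * and trap #DB (vector 1) in the exception bitmap while guest debugging
+ * is enabled; TF/RF are cleared when single-stepping is switched off.
+ */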
+static int litevm_dev_ioctl_debug_guest(struct litevm *litevm,
+                                    struct litevm_debug_guest *dbg)
+{
+       struct litevm_vcpu *vcpu;
+       unsigned long dr7 = 0x400;
+       uint32_t exception_bitmap;
+       int old_singlestep;
+
+       if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS)
+               return -EINVAL;
+       vcpu = vcpu_load(litevm, dbg->vcpu);
+       if (!vcpu)
+               return -ENOENT;
+
+       exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
+       old_singlestep = vcpu->guest_debug.singlestep;
+
+       vcpu->guest_debug.enabled = dbg->enabled;
+       if (vcpu->guest_debug.enabled) {
+               int i;
+
+               dr7 |= 0x200;  /* exact */
+               for (i = 0; i < 4; ++i) {
+                       if (!dbg->breakpoints[i].enabled)
+                               continue;
+                       vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
+                       dr7 |= 2 << (i*2);    /* global enable */
+                       dr7 |= 0 << (i*4+16); /* execution breakpoint */
+               }
+
+               exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
+
+               vcpu->guest_debug.singlestep = dbg->singlestep;
+       } else {
+               exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
+               vcpu->guest_debug.singlestep = 0;
+       }
+
+       if (old_singlestep && !vcpu->guest_debug.singlestep) {
+               unsigned long flags;
+
+               flags = vmcs_readl(GUEST_RFLAGS);
+               flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+               vmcs_writel(GUEST_RFLAGS, flags);
+       }
+
+       vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
+       vmcs_writel(GUEST_DR7, dr7);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
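+/* Top-level ioctl dispatcher.  Most commands copy an argument struct in
+ * from userspace, call the handler above, and copy the struct back out
+ * when the command returns data; LITEVM_CREATE_VCPU just passes the vcpu
+ * slot number in arg.
+ */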
+static long litevm_dev_ioctl(struct file *filp,
+                         unsigned int ioctl, unsigned long arg)
+{
+       struct litevm *litevm = filp->private_data;
+       int r = -EINVAL;
+
+       switch (ioctl) {
+       case LITEVM_CREATE_VCPU: {
+               r = litevm_dev_ioctl_create_vcpu(litevm, arg);
+               if (r)
+                       goto out;
+               break;
+       }
+       case LITEVM_RUN: {
+               struct litevm_run litevm_run;
+
+               r = -EFAULT;
+               if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run))
+                       goto out;
+               r = litevm_dev_ioctl_run(litevm, &litevm_run);
+               if (r < 0)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case LITEVM_GET_REGS: {
+               struct litevm_regs litevm_regs;
+
+               r = -EFAULT;
+               if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
+                       goto out;
+               r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case LITEVM_SET_REGS: {
+               struct litevm_regs litevm_regs;
+
+               r = -EFAULT;
+               if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
+                       goto out;
+               r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case LITEVM_GET_SREGS: {
+               struct litevm_sregs litevm_sregs;
+
+               r = -EFAULT;
+               if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
+                       goto out;
+               r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user((void *)arg, &litevm_sregs, sizeof litevm_sregs))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case LITEVM_SET_SREGS: {
+               struct litevm_sregs litevm_sregs;
+
+               r = -EFAULT;
+               if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
+                       goto out;
+               r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case LITEVM_TRANSLATE: {
+               struct litevm_translation tr;
+
+               r = -EFAULT;
+               if (copy_from_user(&tr, (void *)arg, sizeof tr))
+                       goto out;
+               r = litevm_dev_ioctl_translate(litevm, &tr);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user((void *)arg, &tr, sizeof tr))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case LITEVM_INTERRUPT: {
+               struct litevm_interrupt irq;
+
+               r = -EFAULT;
+               if (copy_from_user(&irq, (void *)arg, sizeof irq))
+                       goto out;
+               r = litevm_dev_ioctl_interrupt(litevm, &irq);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case LITEVM_DEBUG_GUEST: {
+               struct litevm_debug_guest dbg;
+
+               r = -EFAULT;
+               if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
+                       goto out;
+               r = litevm_dev_ioctl_debug_guest(litevm, &dbg);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case LITEVM_SET_MEMORY_REGION: {
+               struct litevm_memory_region litevm_mem;
+
+               r = -EFAULT;
+               if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem))
+                       goto out;
+               r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem);
+               if (r)
+                       goto out;
+               break;
+       }
+       case LITEVM_GET_DIRTY_LOG: {
+               struct litevm_dirty_log log;
+
+               r = -EFAULT;
+               if (copy_from_user(&log, (void *)arg, sizeof log))
+                       goto out;
+               r = litevm_dev_ioctl_get_dirty_log(litevm, &log);
+               if (r)
+                       goto out;
+               break;
+       }
+       default:
+               ;
+       }
+out:
+       return r;
+}
+
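+/*
+ * Sketch of the calling sequence this ioctl interface expects from
+ * userspace (illustration only: the device path below is an assumption,
+ * and how the litevm file descriptor itself is obtained is outside this
+ * hunk):
+ *
+ *     int fd = open("/dev/litevm", O_RDWR);     // hypothetical node name
+ *     struct litevm_memory_region mem = {
+ *             .slot = 0, .guest_phys_addr = 0, .memory_size = 16 << 20,
+ *     };
+ *     ioctl(fd, LITEVM_SET_MEMORY_REGION, &mem);
+ *     ioctl(fd, LITEVM_CREATE_VCPU, 0);         // arg is the vcpu slot
+ *     struct litevm_run run = { .vcpu = 0 };
+ *     for (;;) {
+ *             ioctl(fd, LITEVM_RUN, &run);
+ *             // handle run.exit_type / run.exit_reason, then loop
+ *     }
+ */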
+#if 0
+static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct litevm *litevm = vma->vm_file->private_data;
+       struct litevm_memory_slot *slot;
+       struct page *page;
+
+       slot = gfn_to_memslot(litevm, vmf->pgoff);
+       if (!slot)
+               return VM_FAULT_SIGBUS;
+       page = gfn_to_page(slot, vmf->pgoff);
+       if (!page)
+               return VM_FAULT_SIGBUS;
+
+       get_page(page);
+       vmf->page = page;
+       return 0;
+}
+#endif
+
+static int litevm_reboot(struct notifier_block *notifier, unsigned long val,
+                       void *v)
+{
+       panic("litevm_reboot");
+#if 0
+       if (val == SYS_RESTART) {
+               /*
+                * Some (well, at least mine) BIOSes hang on reboot if
+                * in vmx root mode.
+                */
+               printk("litevm: exiting vmx mode\n");
+               on_each_cpu(litevm_disable, 0, 1);
+       }
+       return NOTIFY_OK;
+#endif
+       return 0;
+}
+
+hpa_t bad_page_address;
+
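+/* Init/exit pair: set up the VMCS descriptor, allocate the litevm areas,
+ * and reserve one zeroed page whose host physical address is published as
+ * bad_page_address; litevm_exit releases the areas and the page.
+ */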
+static int litevm_init(void)
+{
+       static struct page *bad_page;
+       int r = 0;
+
+#if 0
+       if (!cpu_has_litevm_support()) {
+               printk("litevm: no hardware support\n");
+               return -EOPNOTSUPP;
+       }
+       if (vmx_disabled_by_bios()) {
+               printk("litevm: disabled by bios\n");
+               return -EOPNOTSUPP;
+       }
+#endif
+
+       setup_vmcs_descriptor();
+       r = alloc_litevm_area();
+       if (r)
+               goto out;
+#warning "on each cpu ..."
+//     on_each_cpu(litevm_enable, 0, 1);
+
+       if ((bad_page = alloc_page(KMALLOC_WAIT)) == NULL) {
+               r = -ENOMEM;
+               goto out_free;
+       }
+
+       bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
+       memset(__va(bad_page_address), 0, PAGE_SIZE);
+
+       return r;
+
+out_free:
+       free_litevm_area();
+out:
+       return r;
+}
+
+static void litevm_exit(void)
+{
+       free_litevm_area();
+       __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
+}
+
+