/* kern/arch/x86/vmx.c */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 */

#define DEBUG
#define LITEVM_DEBUG

#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <sys/queue.h>
#include <smp.h>
#include <kref.h>
#include <atomic.h>
#include <alarm.h>
#include <event.h>
#include <umem.h>
#include <devalarm.h>
#include <arch/types.h>
#include <arch/vm.h>
#include <arch/emulate.h>
#include <arch/vmdebug.h>
#include <arch/msr-index.h>

void monitor(void *);

#define currentcpu (&per_cpu_info[core_id()])
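/*
 * Debug wrappers around the qlock and spinlock primitives: each one only adds
 * printd tracing before and after the underlying operation, so lock-ordering
 * problems show up in the console log.
 */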
#define QLOCK_init(x) {printd("qlock_init %p\n", x); qlock_init(x); printd("%p lock_inited\n", x);}
#define QLOCK(x) {printd("qlock %p\n", x); qlock(x); printd("%p locked\n", x);}
#define QUNLOCK(x) {printd("qunlock %p\n", x); qunlock(x); printd("%p unlocked\n", x);}
#define SPLI_irqsave(x){printd("spin_lock_init %p:", x); spinlock_init(x); printd("inited\n");}
#define SPLL(x){printd("spin_lock %p\n", x); spin_lock_irqsave(x); printd("%p locked\n", x);}
#define SPLU(x){printd("spin_unlock %p\n", x); spin_unlock(x); printd("%p unlocked\n", x);}
struct litevm_stat litevm_stat;

static struct litevm_stats_debugfs_item {
	const char *name;
	uint32_t *data;
} debugfs_entries[] = {
	{"pf_fixed", &litevm_stat.pf_fixed},
	{"pf_guest", &litevm_stat.pf_guest},
	{"tlb_flush", &litevm_stat.tlb_flush},
	{"invlpg", &litevm_stat.invlpg},
	{"exits", &litevm_stat.exits},
	{"io_exits", &litevm_stat.io_exits},
	{"mmio_exits", &litevm_stat.mmio_exits},
	{"signal_exits", &litevm_stat.signal_exits},
	{"irq_exits", &litevm_stat.irq_exits},
	{0, 0}
};

static struct dentry *debugfs_dir;

static const uint32_t vmx_msr_index[] = {
#ifdef __x86_64__
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
#endif
	MSR_EFER,	// wtf? MSR_K6_STAR,
};

static const char* vmx_msr_name[] = {
#ifdef __x86_64__
	"MSR_SYSCALL_MASK", "MSR_LSTAR", "MSR_CSTAR", "MSR_KERNEL_GS_BASE",
#endif
	"MSR_EFER",	// wtf? MSR_K6_STAR,
};

#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))

#ifdef __x86_64__
/*
 * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
 * mechanism (cpu bug AA24)
 */
#define NR_BAD_MSRS 2
#else
#define NR_BAD_MSRS 0
#endif

#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
#define TSS_REDIRECTION_SIZE (256 / 8)
#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)

#define MSR_IA32_VMX_BASIC_MSR			0x480
#define MSR_IA32_VMX_PINBASED_CTLS_MSR		0x481
#define MSR_IA32_VMX_PROCBASED_CTLS_MSR		0x482
#define MSR_IA32_VMX_EXIT_CTLS_MSR		0x483
#define MSR_IA32_VMX_ENTRY_CTLS_MSR		0x484

#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
#define LMSW_GUEST_MASK 0x0eULL
#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
//#define CR4_VMXE 0x2000
#define CR8_RESEVED_BITS (~0x0fULL)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe

#ifdef __x86_64__
#define HOST_IS_64 1
#else
#define HOST_IS_64 0
#endif

int vm_set_memory_region(struct litevm *litevm,
			 struct litevm_memory_region *mem);

/* bit ops not yet widely used in akaros and we're not sure where to put them. */
/**
 * __ffs - find first set bit in word
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static inline unsigned long __ffs(unsigned long word)
{
	print_func_entry();
	asm("rep; bsf %1,%0" : "=r"(word) : "rm"(word));
	print_func_exit();
	return word;
}

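/*
 * Return the guest MSR save/load slot for 'msr', or 0 if this vcpu does not
 * track that MSR.
 */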
static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu,
					    uint32_t msr)
{
	print_func_entry();
	int i;

	for (i = 0; i < vcpu->nmsrs; ++i)
		if (vcpu->guest_msrs[i].index == msr) {
			print_func_exit();
			return &vcpu->guest_msrs[i];
		}
	print_func_exit();
	return 0;
}

struct descriptor_table {
	uint16_t limit;
	unsigned long base;
} __attribute__ ((packed));

static void get_gdt(struct descriptor_table *table)
{
	print_func_entry();
	asm("sgdt %0" : "=m"(*table));
	print_func_exit();
}

static void get_idt(struct descriptor_table *table)
{
	print_func_entry();
	asm("sidt %0" : "=m"(*table));
	print_func_exit();
}

static uint16_t read_fs(void)
{
	//print_func_entry();
	uint16_t seg;
	asm("mov %%fs, %0" : "=g"(seg));
	//print_func_exit();
	return seg;
}

static uint16_t read_gs(void)
{
	//print_func_entry();
	uint16_t seg;
	asm("mov %%gs, %0" : "=g"(seg));
	//print_func_exit();
	return seg;
}

static uint16_t read_ldt(void)
{
	//print_func_entry();
	uint16_t ldt;
	asm("sldt %0" : "=g"(ldt));
	//print_func_exit();
	return ldt;
}

static void load_fs(uint16_t sel)
{
	//print_func_entry();
	asm("mov %0, %%fs" : : "g"(sel));
	//print_func_exit();
}

static void load_gs(uint16_t sel)
{
	//print_func_entry();
	asm("mov %0, %%gs" : : "g"(sel));
	//print_func_exit();
}

#ifndef load_ldt
static void load_ldt(uint16_t sel)
{
	//print_func_entry();
	asm("lldt %0" : : "g"(sel));
	//print_func_exit();
}
#endif

static void fx_save(void *image)
{
	//print_func_entry();
	asm("fxsave (%0)" : : "r"(image));
	//print_func_exit();
}

static void fx_restore(void *image)
{
	//print_func_entry();
	asm("fxrstor (%0)" : : "r"(image));
	//print_func_exit();
}

static void fpu_init(void)
{
	print_func_entry();
	asm("finit");
	print_func_exit();
}

struct segment_descriptor {
	uint16_t limit_low;
	uint16_t base_low;
	uint8_t base_mid;
	uint8_t type:4;
	uint8_t system:1;
	uint8_t dpl:2;
	uint8_t present:1;
	uint8_t limit_high:4;
	uint8_t avl:1;
	uint8_t long_mode:1;
	uint8_t default_op:1;
	uint8_t granularity:1;
	uint8_t base_high;
} __attribute__ ((packed));

#ifdef __x86_64__
// LDT or TSS descriptor in the GDT. 16 bytes.
struct segment_descriptor_64 {
	struct segment_descriptor s;
	uint32_t base_higher;
	uint32_t pad_zero;
};

#endif

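/*
 * Compute the linear base address of the segment named by 'selector' by
 * walking the host GDT (or, if the selector's TI bit is set, the current
 * LDT).  On x86_64 the system descriptors (LDT/TSS) are 16 bytes, so the
 * upper base bits are folded in as well.
 */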
static unsigned long segment_base(uint16_t selector)
{
	print_func_entry();
	struct descriptor_table gdt;
	struct segment_descriptor *d;
	unsigned long table_base;
	typedef unsigned long ul;
	unsigned long v;

	asm("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {	/* from ldt */
		uint16_t ldt_selector;

		asm("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct segment_descriptor *)(table_base + (selector & ~7));
	v = d->base_low | ((ul) d->base_mid << 16) | ((ul) d->base_high << 24);
#ifdef __x86_64__
	if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((ul) ((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
	print_func_exit();
	return v;
}

static unsigned long read_tr_base(void)
{
	print_func_entry();
	uint16_t tr;
	asm("str %0" : "=g"(tr));
	print_func_exit();
	return segment_base(tr);
}

static void reload_tss(void)
{
	print_func_entry();
#ifndef __x86_64__

	/*
	 * VT restores TR but not its size.  Useless.
	 */
	struct descriptor_table gdt;
	struct segment_descriptor *descs;

	get_gdt(&gdt);
	descs = (void *)gdt.base;
	descs[GD_TSS].type = 9;	/* available TSS */
	load_TR_desc();
#endif
	print_func_exit();
}

static struct vmcs_descriptor {
	int size;
	int order;
	uint32_t revision_id;
} vmcs_descriptor;

static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
{
	print_func_entry();
	struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
	print_func_exit();
	return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
}

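/*
 * Copy up to 'size' bytes from guest virtual address 'addr' into 'dest', one
 * page at a time, translating through the guest page tables.  Stops at the
 * first address that does not translate and returns the number of bytes
 * actually copied.  litevm_write_guest() below is the mirror image.
 */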
int litevm_read_guest(struct litevm_vcpu *vcpu,
		      gva_t addr, unsigned long size, void *dest)
{
	print_func_entry();
	unsigned char *host_buf = dest;
	unsigned long req_size = size;

	while (size) {
		hpa_t paddr;
		unsigned now;
		unsigned offset;
		hva_t guest_buf;

		paddr = gva_to_hpa(vcpu, addr);

		if (is_error_hpa(paddr))
			break;
		guest_buf = (hva_t) KADDR(paddr);
		offset = addr & ~PAGE_MASK;
		guest_buf |= offset;
		now = MIN(size, PAGE_SIZE - offset);
		memcpy(host_buf, (void *)guest_buf, now);
		host_buf += now;
		addr += now;
		size -= now;
	}
	print_func_exit();
	return req_size - size;
}

int litevm_write_guest(struct litevm_vcpu *vcpu,
		       gva_t addr, unsigned long size, void *data)
{
	print_func_entry();
	unsigned char *host_buf = data;
	unsigned long req_size = size;

	while (size) {
		hpa_t paddr;
		unsigned now;
		unsigned offset;
		hva_t guest_buf;

		paddr = gva_to_hpa(vcpu, addr);

		if (is_error_hpa(paddr))
			break;

		guest_buf = (hva_t) KADDR(paddr);
		offset = addr & ~PAGE_MASK;
		guest_buf |= offset;
		now = MIN(size, PAGE_SIZE - offset);
		memcpy((void *)guest_buf, host_buf, now);
		host_buf += now;
		addr += now;
		size -= now;
	}
	print_func_exit();
	return req_size - size;
}

static void setup_vmcs_descriptor(void)
{
	print_func_entry();
	uint64_t msr;

	msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
	vmcs_descriptor.size = (msr >> 32) & 0x1fff;
	vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size >> PAGE_SHIFT);
	vmcs_descriptor.revision_id = (uint32_t) msr;
	printk("setup_vmcs_descriptor: msr 0x%llx, size 0x%x order 0x%x id 0x%x\n",
	       msr, vmcs_descriptor.size, vmcs_descriptor.order,
	       vmcs_descriptor.revision_id);
	print_func_exit();
}

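/*
 * VMCLEAR the given VMCS so it is written back to memory and marked clear;
 * after this it can safely be made active on another core.
 */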
static void vmcs_clear(struct vmcs *vmcs)
{
	print_func_entry();
	uint64_t phys_addr = PADDR(vmcs);
	uint8_t error;
	printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
	asm volatile ("vmclear %1; setna %0" : "=m"(error) : "m"(phys_addr)
		      : "cc", "memory");
	if (error)
		printk("litevm: vmclear fail: %p/%llx\n", vmcs, phys_addr);
	print_func_exit();
}

static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
{
	print_func_entry();
	struct litevm_vcpu *vcpu = arg;
	int cpu = core_id();
	printd
		("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n",
		 cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);

	if (vcpu->cpu == cpu)
		vmcs_clear(vcpu->vmcs);

	if (currentcpu->vmcs == vcpu->vmcs)
		currentcpu->vmcs = NULL;
	print_func_exit();
}

static int vcpu_slot(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	print_func_exit();
	return vcpu - vcpu->litevm->vcpus;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	uint64_t phys_addr = PADDR(vcpu->vmcs);
	int cpu;
	cpu = core_id();

	printk("__vcpu_load: vcpu->cpu %d cpu %d\n", vcpu->cpu, cpu);
	if ((vcpu->cpu != cpu) && (vcpu->cpu != -1)){
		handler_wrapper_t *w;
		smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
		smp_call_wait(w);
		vcpu->launched = 0;
	}

	printk("2 ..");
	if (currentcpu->vmcs != vcpu->vmcs) {
		uint8_t error;

		currentcpu->vmcs = vcpu->vmcs;
		asm volatile ("vmptrld %1; setna %0" : "=m"(error) : "m"(phys_addr)
			      : "cc");
		if (error) {
			printk("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
			error("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
		}
	}

	printk("3 ..");
	if (vcpu->cpu != cpu) {
		struct descriptor_table dt;
		unsigned long sysenter_esp;

		vcpu->cpu = cpu;
		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.
		 */
		vmcs_writel(HOST_TR_BASE, read_tr_base());	/* 22.2.4 */
		get_gdt(&dt);
		vmcs_writel(HOST_GDTR_BASE, dt.base);	/* 22.2.4 */

		sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);	/* 22.2.3 */
	}
	print_func_exit();
	return vcpu;
}

/*
 * Switches to the specified vcpu, until a matching vcpu_put(), and leaves
 * the vcpu mutex locked.
 */
static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
{
	struct litevm_vcpu *ret;
	print_func_entry();
	struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];

	printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);

	QLOCK(&vcpu->mutex);
	printk("Locked\n");
	if (!vcpu->vmcs) {
		QUNLOCK(&vcpu->mutex);
		printk("vcpu->vmcs for vcpu %p is NULL", vcpu);
		error("vcpu->vmcs is NULL");
	}
	ret = __vcpu_load(vcpu);
	print_func_exit();
	return ret;
}

static void vcpu_put(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	//put_cpu();
	QUNLOCK(&vcpu->mutex);
	print_func_exit();
}

static struct vmcs *alloc_vmcs_cpu(int cpu)
{
	print_func_entry();
	int node = node_id();
	struct vmcs *vmcs;

	vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
	if (!vmcs) {
		print_func_exit();
		printk("no memory for vcpus");
		error("no memory for vcpus");
	}
	memset(vmcs, 0, vmcs_descriptor.size);
	vmcs->revision_id = vmcs_descriptor.revision_id;	/* vmcs revision id */
	print_func_exit();
	return vmcs;
}

static struct vmcs *alloc_vmcs(void)
{
	struct vmcs *ret;
	print_func_entry();
	ret = alloc_vmcs_cpu(core_id());
	print_func_exit();
	return ret;
}

static int cpu_has_litevm_support(void)
{
	int ret;
	print_func_entry();
	/* sigh ... qemu. */
	char vid[16];
	if (vendor_id(vid) < 0)
		return 0;
	printk("vendor id is %s\n", vid);
	if (vid[0] == 'Q') /* qemu */
		return 0;
	if (vid[0] == 'A') /* AMD or qemu claiming to be AMD */
		return 0;
	uint32_t ecx = cpuid_ecx(1);
	ret = ecx & (1 << 5);	/* CPUID.1:ECX.VMX[bit 5] -> VT */
	printk("%s: CPUID.1:ECX.VMX[bit 5] -> VT is%s available\n", __func__, ret ? "" : " NOT");
	print_func_exit();
	return ret;
}

static int vmx_disabled_by_bios(void)
{
	print_func_entry();
	uint64_t msr;

	msr = read_msr(MSR_IA32_FEATURE_CONTROL);
	print_func_exit();
	return (msr & 5) == 1;	/* locked but not enabled */
}

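/*
 * Enable VMX on this core: allocate and initialize the per-core VMXON
 * region, set the lock/enable bits in MSR_IA32_FEATURE_CONTROL if the BIOS
 * left them clear, turn on CR4.VMXE, and execute VMXON.  The trapframe
 * argument suggests this is meant to run as an smp_call handler on each
 * core; the printks are debugging aids left in place.
 */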
static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
{
	print_func_entry();
	int cpu = hw_core_id();
	uint64_t phys_addr;
	uint64_t old;
	uint64_t status = 0;
	currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
						  KMALLOC_WAIT);
	if (!currentcpu->vmxarea)
		return;
	memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
	currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
	phys_addr = PADDR(currentcpu->vmxarea);
	printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
	       currentcpu->vmxarea, (void *)phys_addr);
	if (phys_addr & 0xfff) {
		printk("fix vmxarea alignment!");
	}
	printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
	old = read_msr(MSR_IA32_FEATURE_CONTROL);
	printk("%d: vm_enable, old is %d\n", core_id(), old);
	if ((old & 5) == 0) {
		/* enable and lock */
		write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
		old = read_msr(MSR_IA32_FEATURE_CONTROL);
		printk("%d:vm_enable, tried to set 5, old is %d\n", core_id(), old);
	}
	printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
	lcr4(rcr4() | CR4_VMXE);	/* FIXME: not cpu hotplug safe */
	printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
	printk("%d:cr0 is %x\n", core_id(), rcr0());
	lcr0(rcr0() | 0x20);
	printk("%d:cr0 is %x\n", core_id(), rcr0());
	printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
	outb(0x92, inb(0x92) | 2);
	printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
	asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:" : "=m"(status) : "m"
		      (phys_addr) : "memory", "cc");
	printk("%d:vmxon status is %d\n", core_id(), status);
	printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
	if (!status) {
		printk("%d:vm_enable: status says fail\n", core_id());
	}
	print_func_exit();
}

static void litevm_disable(void *garbage)
{
	print_func_entry();
	asm volatile ("vmxoff" : : : "cc");
	print_func_exit();
}

struct litevm *vmx_open(void)
{
	print_func_entry();
	struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
	int i;

	printk("vmx_open: litevm is %p\n", litevm);
	if (!litevm) {
		printk("NO LITEVM! MAKES NO SENSE!\n");
		error("litevm alloc failed");
		print_func_exit();
		return 0;
	}

	SPLI_irqsave(&litevm->lock);
	LIST_INIT(&litevm->link);
	for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
		struct litevm_vcpu *vcpu = &litevm->vcpus[i];
		printk("init vcpu %p\n", vcpu);

		QLOCK_init(&vcpu->mutex);
		vcpu->mmu.root_hpa = INVALID_PAGE;
		vcpu->litevm = litevm;
		LIST_INIT(&vcpu->link);
	}
	printk("vmx_open: busy %d\n", litevm->busy);
	printk("return %p\n", litevm);
	print_func_exit();
	return litevm;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
				     struct litevm_memory_slot *dont)
{
	print_func_entry();
	int i;

	if (!dont || free->phys_mem != dont->phys_mem)
		if (free->phys_mem) {
			for (i = 0; i < free->npages; ++i) {
				page_t *page = free->phys_mem[i];
				page_decref(page);
				assert(page_is_free(page2ppn(page)));
			}
			kfree(free->phys_mem);
		}

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		kfree(free->dirty_bitmap);

	free->phys_mem = 0;
	free->npages = 0;
	free->dirty_bitmap = 0;
	print_func_exit();
}

static void litevm_free_physmem(struct litevm *litevm)
{
	print_func_entry();
	int i;

	for (i = 0; i < litevm->nmemslots; ++i)
		litevm_free_physmem_slot(&litevm->memslots[i], 0);
	print_func_exit();
}

static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	if (vcpu->vmcs) {
		handler_wrapper_t *w;
		smp_call_function_all(__vcpu_clear, vcpu, &w);
		smp_call_wait(w);
		//free_vmcs(vcpu->vmcs);
		vcpu->vmcs = 0;
	}
	print_func_exit();
}

static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	litevm_free_vmcs(vcpu);
	litevm_mmu_destroy(vcpu);
	print_func_exit();
}

static void litevm_free_vcpus(struct litevm *litevm)
{
	print_func_entry();
	unsigned int i;

	for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
		litevm_free_vcpu(&litevm->vcpus[i]);
	print_func_exit();
}

static int litevm_dev_release(struct litevm *litevm)
{
	print_func_entry();

	litevm_free_vcpus(litevm);
	litevm_free_physmem(litevm);
	kfree(litevm);
	print_func_exit();
	return 0;
}

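/*
 * VMCS accessors.  vmcs_readl()/vmcs_writel() wrap the VMREAD/VMWRITE
 * instructions on the currently loaded VMCS; the 16-, 32- and 64-bit
 * variants below are built on top of them.
 */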
unsigned long vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
	return value;
}

void vmcs_writel(unsigned long field, unsigned long value)
{
	uint8_t error;

	asm volatile ("vmwrite %1, %2; setna %0" : "=g"(error) : "r"(value),
		      "r"(field) : "cc");
	if (error)
		printk("vmwrite error: reg %lx value %lx (err %d)\n",
		       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

static void vmcs_write16(unsigned long field, uint16_t value)
{
	vmcs_writel(field, value);
}

static void vmcs_write64(unsigned long field, uint64_t value)
{
	print_func_entry();
#ifdef __x86_64__
	vmcs_writel(field, value);
#else
	vmcs_writel(field, value);
	asm volatile ("");
	vmcs_writel(field + 1, value >> 32);
#endif
	print_func_exit();
}

static void inject_gp(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	printd("inject_general_protection: rip 0x%lx\n", vmcs_readl(GUEST_RIP));
	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
		     GP_VECTOR |
		     INTR_TYPE_EXCEPTION |
		     INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK);
	print_func_exit();
}

static void update_exception_bitmap(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	if (vcpu->rmode.active)
		vmcs_write32(EXCEPTION_BITMAP, ~0);
	else
		vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
	print_func_exit();
}

static void enter_pmode(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	unsigned long flags;

	vcpu->rmode.active = 0;

	vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
	vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);

	flags = vmcs_readl(GUEST_RFLAGS);
	flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
	flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
	vmcs_writel(GUEST_RFLAGS, flags);

	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
		    (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK));

	update_exception_bitmap(vcpu);

#define FIX_PMODE_DATASEG(seg, save) {				\
		vmcs_write16(GUEST_##seg##_SELECTOR, 0);	\
		vmcs_writel(GUEST_##seg##_BASE, 0);		\
		vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);	\
		vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);	\
	}

	FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
	FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
	FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
	FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
	FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);

	vmcs_write16(GUEST_CS_SELECTOR,
		     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
	print_func_exit();
}

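/*
 * The fake real-mode TSS lives in the last three pages of memory slot 0; see
 * init_rmode_tss() below.
 */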
static int rmode_tss_base(struct litevm *litevm)
{
	print_func_entry();
	gfn_t base_gfn =
		litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
	print_func_exit();
	return base_gfn << PAGE_SHIFT;
}

static void enter_rmode(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	unsigned long flags;

	vcpu->rmode.active = 1;

	vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));

	vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);

	vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	flags = vmcs_readl(GUEST_RFLAGS);
	vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;

	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

	printk("FLAGS 0x%x\n", flags);
	vmcs_writel(GUEST_RFLAGS, flags);
	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
	update_exception_bitmap(vcpu);

#define FIX_RMODE_SEG(seg, save) {				   \
		vmcs_write16(GUEST_##seg##_SELECTOR,		   \
			     vmcs_readl(GUEST_##seg##_BASE) >> 4); \
		vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);	   \
		vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);	   \
	}

	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);

	FIX_RMODE_SEG(ES, vcpu->rmode.es);
	FIX_RMODE_SEG(DS, vcpu->rmode.ds);
	FIX_RMODE_SEG(SS, vcpu->rmode.ss);
	FIX_RMODE_SEG(GS, vcpu->rmode.gs);
	FIX_RMODE_SEG(FS, vcpu->rmode.fs);
	print_func_exit();
}

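/*
 * Build the minimal TSS the guest uses while in emulated real mode: three
 * zeroed guest pages, an I/O bitmap base offset pointing just past the
 * interrupt redirection map, and a final terminator byte of all ones at the
 * very end of the structure.
 */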
static int init_rmode_tss(struct litevm *litevm)
{
	print_func_entry();
	struct page *p1, *p2, *p3;
	gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
	char *page;

	p1 = _gfn_to_page(litevm, fn++);
	p2 = _gfn_to_page(litevm, fn++);
	p3 = _gfn_to_page(litevm, fn);

	if (!p1 || !p2 || !p3) {
		printk("%s: gfn_to_page failed\n", __FUNCTION__);
		print_func_exit();
		return 0;
	}

	page = page2kva(p1);
	memset(page, 0, PAGE_SIZE);
	*(uint16_t *) (page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;

	page = page2kva(p2);
	memset(page, 0, PAGE_SIZE);

	page = page2kva(p3);
	memset(page, 0, PAGE_SIZE);
	*(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;

	print_func_exit();
	return 1;
}

#ifdef __x86_64__

static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
{
	print_func_entry();
	struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);

	vcpu->shadow_efer = efer;
	if (efer & EFER_LMA) {
		vmcs_write32(VM_ENTRY_CONTROLS,
			     vmcs_read32(VM_ENTRY_CONTROLS) |
			     VM_ENTRY_CONTROLS_IA32E_MASK);
		msr->value = efer;

	} else {
		vmcs_write32(VM_ENTRY_CONTROLS,
			     vmcs_read32(VM_ENTRY_CONTROLS) &
			     ~VM_ENTRY_CONTROLS_IA32E_MASK);

		msr->value = efer & ~EFER_LME;
	}
	print_func_exit();
}

static void enter_lmode(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	uint32_t guest_tr_ar;

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
		printd("%s: tss fixup for long mode. \n", __FUNCTION__);
		vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
			     | AR_TYPE_BUSY_64_TSS);
	}

	vcpu->shadow_efer |= EFER_LMA;

	find_msr_entry(vcpu, MSR_EFER)->value |= EFER_LMA | EFER_LME;
	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
		     | VM_ENTRY_CONTROLS_IA32E_MASK);
	print_func_exit();
}

static void exit_lmode(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	vcpu->shadow_efer &= ~EFER_LMA;

	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
		     & ~VM_ENTRY_CONTROLS_IA32E_MASK);
	print_func_exit();
}

#endif

static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
{
	print_func_entry();
	if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
		enter_pmode(vcpu);

	if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
		enter_rmode(vcpu);

#ifdef __x86_64__
	if (vcpu->shadow_efer & EFER_LME) {
		if (!is_paging() && (cr0 & CR0_PG_MASK))
			enter_lmode(vcpu);
		if (is_paging() && !(cr0 & CR0_PG_MASK))
			exit_lmode(vcpu);
	}
#endif

	vmcs_writel(CR0_READ_SHADOW, cr0);
	vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
	print_func_exit();
}

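/*
 * With PAE paging, CR3 points at four page-directory-pointer entries.  Check
 * each present PDPTE for reserved bits; returns nonzero if any are set, in
 * which case the caller injects #GP into the guest.
 */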
static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
					 unsigned long cr3)
{
	print_func_entry();
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = (cr3 & (PAGE_SIZE - 1)) >> 5;
	int i;
	uint64_t pdpte;
	uint64_t *pdpt;
	struct litevm_memory_slot *memslot;

	SPLL(&vcpu->litevm->lock);
	memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
	/* FIXME: !memslot - emulate? 0xff? */
	pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));

	for (i = 0; i < 4; ++i) {
		pdpte = pdpt[offset + i];
		if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
			break;
	}

	SPLU(&vcpu->litevm->lock);

	print_func_exit();
	return i != 4;
}

static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
{
	print_func_entry();
	if (cr0 & CR0_RESEVED_BITS) {
		printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, guest_cr0());
		inject_gp(vcpu);
		print_func_exit();
		return;
	}

	if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
		printd("set_cr0: #GP, CD == 0 && NW == 1\n");
		inject_gp(vcpu);
		print_func_exit();
		return;
	}

	if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
		printd("set_cr0: #GP, set PG flag and a clear PE flag\n");
		inject_gp(vcpu);
		print_func_exit();
		return;
	}

	if (!is_paging() && (cr0 & CR0_PG_MASK)) {
#ifdef __x86_64__
		if ((vcpu->shadow_efer & EFER_LME)) {
			uint32_t guest_cs_ar;
			if (!is_pae()) {
				printd("set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				inject_gp(vcpu);
				print_func_exit();
				return;
			}
			guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
			if (guest_cs_ar & SEGMENT_AR_L_MASK) {
				printd("set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				inject_gp(vcpu);
				print_func_exit();
				return;

			}
		} else
#endif
		if (is_pae() && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
			printd("set_cr0: #GP, pdptrs reserved bits\n");
			inject_gp(vcpu);
			print_func_exit();
			return;
		}

	}

	__set_cr0(vcpu, cr0);
	litevm_mmu_reset_context(vcpu);
	print_func_exit();
	return;
}

static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
{
	print_func_entry();
	unsigned long cr0 = guest_cr0();

	if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
		enter_pmode(vcpu);
		vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);

	} else
		printd("lmsw: unexpected\n");

	vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
		    | (msw & LMSW_GUEST_MASK));
	print_func_exit();
}

static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
{
	print_func_entry();
	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
				      LITEVM_RMODE_VM_CR4_ALWAYS_ON :
				      LITEVM_PMODE_VM_CR4_ALWAYS_ON));
	print_func_exit();
}

static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
{
	print_func_entry();
	if (cr4 & CR4_RESEVED_BITS) {
		printd("set_cr4: #GP, reserved bits\n");
		inject_gp(vcpu);
		print_func_exit();
		return;
	}

	if (is_long_mode()) {
		if (!(cr4 & CR4_PAE_MASK)) {
			printd("set_cr4: #GP, clearing PAE while in long mode\n");
			inject_gp(vcpu);
			print_func_exit();
			return;
		}
	} else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
		   && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
		printd("set_cr4: #GP, pdptrs reserved bits\n");
		inject_gp(vcpu);
	}

	if (cr4 & CR4_VMXE_MASK) {
		printd("set_cr4: #GP, setting VMXE\n");
		inject_gp(vcpu);
		print_func_exit();
		return;
	}
	__set_cr4(vcpu, cr4);
	SPLL(&vcpu->litevm->lock);
	litevm_mmu_reset_context(vcpu);
	SPLU(&vcpu->litevm->lock);
	print_func_exit();
}

static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
{
	print_func_entry();
	if (is_long_mode()) {
		if (cr3 & CR3_L_MODE_RESEVED_BITS) {
			printd("set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			print_func_exit();
			return;
		}
	} else {
		if (cr3 & CR3_RESEVED_BITS) {
			printd("set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			print_func_exit();
			return;
		}
		if (is_paging() && is_pae() && pdptrs_have_reserved_bits_set(vcpu, cr3)) {
			printd("set_cr3: #GP, pdptrs reserved bits\n");
			inject_gp(vcpu);
			print_func_exit();
			return;
		}
	}

	vcpu->cr3 = cr3;
	SPLL(&vcpu->litevm->lock);
	vcpu->mmu.new_cr3(vcpu);
	SPLU(&vcpu->litevm->lock);
	print_func_exit();
}

static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
{
	print_func_entry();
	if (cr8 & CR8_RESEVED_BITS) {
		printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		inject_gp(vcpu);
		print_func_exit();
		return;
	}
	vcpu->cr8 = cr8;
	print_func_exit();
}

static uint32_t get_rdx_init_val(void)
{
	print_func_entry();
	uint32_t val;

	asm("movl $1, %%eax \n\t" "movl %%eax, %0 \n\t" : "=g"(val) : : "eax");
	print_func_exit();
	return val;
}

static void fx_init(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	struct __attribute__ ((__packed__)) fx_image_s {
		uint16_t control;	//fcw
		uint16_t status;	//fsw
		uint16_t tag;		// ftw
		uint16_t opcode;	//fop
		uint64_t ip;		// fpu ip
		uint64_t operand;	// fpu dp
		uint32_t mxcsr;
		uint32_t mxcsr_mask;

	} *fx_image;

	fx_save(vcpu->host_fx_image);
	fpu_init();
	fx_save(vcpu->guest_fx_image);
	fx_restore(vcpu->host_fx_image);

	fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
	fx_image->mxcsr = 0x1f80;
	memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
	       0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
	print_func_exit();
}

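/*
 * Write a VMCS control field while respecting the fixed bits advertised by
 * the corresponding VMX capability MSR: the MSR's low 32 bits are the
 * must-be-one bits, the high 32 bits are the allowed-one bits.
 */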
static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field,
				   uint32_t val)
{
	uint32_t msr_high, msr_low;
	uint64_t msrval;

	msrval = read_msr(msr);
	msr_low = msrval;
	msr_high = (msrval >> 32);

	val &= msr_high;
	val |= msr_low;
	vmcs_write32(vmcs_field, val);
}

/*
 * Sets up the vmcs for emulated real mode.
 */
static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
{
	print_func_entry();

/* no op on x86_64 */
#define asmlinkage
	extern asmlinkage void litevm_vmx_return(void);
	uint32_t host_sysenter_cs;
	uint32_t junk;
	uint64_t a;
	struct descriptor_table dt;
	int i;
	int ret;
	uint64_t tsc;
	int nr_good_msrs;

	memset(vcpu->regs, 0, sizeof(vcpu->regs));
	vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
	vcpu->cr8 = 0;
	vcpu->apic_base = 0xfee00000 |
		/*for vcpu 0 */ MSR_IA32_APICBASE_BSP |
		MSR_IA32_APICBASE_ENABLE;

	fx_init(vcpu);

#define SEG_SETUP(seg) do {					\
		vmcs_write16(GUEST_##seg##_SELECTOR, 0);	\
		vmcs_writel(GUEST_##seg##_BASE, 0);		\
		vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);	\
		vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);	\
	} while (0)

	/*
	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
	 */
	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
	vmcs_writel(GUEST_CS_BASE, 0x000f0000);
	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);

	SEG_SETUP(DS);
	SEG_SETUP(ES);
	SEG_SETUP(FS);
	SEG_SETUP(GS);
	SEG_SETUP(SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	vmcs_write32(GUEST_SYSENTER_CS, 0);
	vmcs_writel(GUEST_SYSENTER_ESP, 0);
	vmcs_writel(GUEST_SYSENTER_EIP, 0);

	vmcs_writel(GUEST_RFLAGS, 0x02);
	vmcs_writel(GUEST_RIP, 0xfff0);
	vmcs_writel(GUEST_RSP, 0);

	vmcs_writel(GUEST_CR3, 0);

	//todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
	vmcs_writel(GUEST_DR7, 0x400);

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	vmcs_write32(GUEST_ACTIVITY_STATE, 0);
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);

	/* I/O */
	vmcs_write64(IO_BITMAP_A, 0);
	vmcs_write64(IO_BITMAP_B, 0);

	tsc = read_tsc();
	vmcs_write64(TSC_OFFSET, -tsc);

	vmcs_write64(VMCS_LINK_POINTER, -1ull);	/* 22.3.1.5 */

	/* Special registers */
	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

	/* Control */
	vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
			       PIN_BASED_VM_EXEC_CONTROL,
			       PIN_BASED_EXT_INTR_MASK	/* 20.6.1 */
			       | PIN_BASED_NMI_EXITING	/* 20.6.1 */
		);
	vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
			       CPU_BASED_VM_EXEC_CONTROL,
			       CPU_BASED_HLT_EXITING	/* 20.6.2 */
			       | CPU_BASED_CR8_LOAD_EXITING	/* 20.6.2 */
			       | CPU_BASED_CR8_STORE_EXITING	/* 20.6.2 */
			       | CPU_BASED_UNCOND_IO_EXITING	/* 20.6.2 */
			       | CPU_BASED_INVDPG_EXITING
			       | CPU_BASED_MOV_DR_EXITING
			       | CPU_BASED_USE_TSC_OFFSETING	/* 21.3 */
		);

	vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	vmcs_write32(CR3_TARGET_COUNT, 0);	/* 22.2.1 */

	vmcs_writel(HOST_CR0, rcr0());	/* 22.2.3 */
	vmcs_writel(HOST_CR4, rcr4());	/* 22.2.3, 22.2.5 */
	vmcs_writel(HOST_CR3, rcr3());	/* 22.2.3  FIXME: shadow tables */

	vmcs_write16(HOST_CS_SELECTOR, GD_KT);	/* 22.2.4 */
	vmcs_write16(HOST_DS_SELECTOR, GD_KD);	/* 22.2.4 */
	vmcs_write16(HOST_ES_SELECTOR, GD_KD);	/* 22.2.4 */
	vmcs_write16(HOST_FS_SELECTOR, read_fs());	/* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, read_gs());	/* 22.2.4 */
	vmcs_write16(HOST_SS_SELECTOR, GD_KD);	/* 22.2.4 */

#ifdef __x86_64__
	a = read_msr(MSR_FS_BASE);
	vmcs_writel(HOST_FS_BASE, a);	/* 22.2.4 */
	a = read_msr(MSR_GS_BASE);
	vmcs_writel(HOST_GS_BASE, a);	/* 22.2.4 */
#else
	vmcs_writel(HOST_FS_BASE, 0);	/* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0);	/* 22.2.4 */
#endif

	vmcs_write16(HOST_TR_SELECTOR, GD_TSS * 8);	/* 22.2.4 */

	get_idt(&dt);
	vmcs_writel(HOST_IDTR_BASE, dt.base);	/* 22.2.4 */

	vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return);	/* 22.2.5 */

	/* it's the HIGH 32 bits! */
	host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
	a = read_msr(MSR_IA32_SYSENTER_ESP);
	vmcs_writel(HOST_IA32_SYSENTER_ESP, a);	/* 22.2.3 */
	a = read_msr(MSR_IA32_SYSENTER_EIP);
	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);	/* 22.2.3 */

	ret = -ENOMEM;
	vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
	if (!vcpu->guest_msrs)
		error("guest_msrs kmalloc failed");
	vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
	if (!vcpu->host_msrs)
		error("vcpu->host_msrs kmalloc failed -- storage leaked");

	for (i = 0; i < NR_VMX_MSR; ++i) {
		uint32_t index = vmx_msr_index[i];
		uint32_t data_low, data_high;
		uint64_t data;
		int j = vcpu->nmsrs;

#warning "need readmsr_safe"
//	if (rdmsr_safe(index, &data_low, &data_high) < 0)
//	    continue;
		data = read_msr(index);
		vcpu->host_msrs[j].index = index;
		vcpu->host_msrs[j].reserved = 0;
		vcpu->host_msrs[j].value = data;
		vcpu->guest_msrs[j] = vcpu->host_msrs[j];
		++vcpu->nmsrs;
	}
	printk("msrs: %d\n", vcpu->nmsrs);

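	/*
	 * The automatic MSR save/load areas start NR_BAD_MSRS entries in;
	 * presumably the skipped MSRs (MSR_SYSCALL_MASK/MSR_LSTAR, see the cpu
	 * bug AA24 note above) are switched by hand on the vmentry/vmexit path.
	 */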
	nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
	vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
	vmcs_writel(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
	vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->host_msrs + NR_BAD_MSRS));
	vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
			       (HOST_IS_64 << 9));	/* 22.2.1, 20.7.1 */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs);	/* 22.2.2 */
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);	/* 22.2.2 */
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs);	/* 22.2.2 */

	/* 22.2.1, 20.8.1 */
	vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CONTROLS, 0);
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);	/* 22.2.1 */

	vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
	vmcs_writel(TPR_THRESHOLD, 0);

	vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
	vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);

	__set_cr0(vcpu, 0x60000010);	// enter rmode
	__set_cr4(vcpu, 0);
#ifdef __x86_64__
	__set_efer(vcpu, 0);
#endif

	ret = litevm_mmu_init(vcpu);

	print_func_exit();
	return ret;

out_free_guest_msrs:
	kfree(vcpu->guest_msrs);
out:
	return ret;
}

/*
 * Sync the rsp and rip registers into the vcpu structure.  This allows
 * registers to be accessed by indexing vcpu->regs.
 */
static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
	vcpu->rip = vmcs_readl(GUEST_RIP);
	print_func_exit();
}

/*
 * Syncs rsp and rip back into the vmcs.  Should be called after possible
 * modification.
 */
static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
{
	print_func_entry();
	vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
	vmcs_writel(GUEST_RIP, vcpu->rip);
	print_func_exit();
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
int vmx_create_vcpu(struct litevm *litevm, int n)
{
	print_func_entry();
	ERRSTACK(2);
	int r;
	struct litevm_vcpu *vcpu;
	struct vmcs *vmcs;
	char *errstring = NULL;

	if (n < 0 || n >= LITEVM_MAX_VCPUS) {
		printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
		       LITEVM_MAX_VCPUS);
		error("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
		      LITEVM_MAX_VCPUS);
	}
	printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
	vcpu = &litevm->vcpus[n];

	printk("vmx_create_vcpu: @%d, %p\n", n, vcpu);
	QLOCK(&vcpu->mutex);

	if (vcpu->vmcs) {
		QUNLOCK(&vcpu->mutex);
		printk("VM already exists\n");
		error("VM already exists");
	}
	printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
	/* I'm a bad person */
	//ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
	uint64_t a = (uint64_t) vcpu->fx_buf;
	a += FX_IMAGE_ALIGN - 1;
	a /= FX_IMAGE_ALIGN;
	a *= FX_IMAGE_ALIGN;

	vcpu->host_fx_image = (char *)a;
	vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;

	vcpu->cpu = -1;	/* First load will set up TR */
	vcpu->litevm = litevm;
	printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
	if (waserror()){
		printk("ERR 1 in %s, %s\n", __func__, current_errstr());
		QUNLOCK(&vcpu->mutex);
		litevm_free_vcpu(vcpu);
		nexterror();
	}
	printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
	vmcs = alloc_vmcs();
	vmcs_clear(vmcs);
	printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
	printk("after vmcs_clear\n");
	vcpu->vmcs = vmcs;
	printk("vcpu %p set vmcs to %p\n", vcpu, vmcs);
	vcpu->launched = 0;
	printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);

	__vcpu_load(vcpu);

	printk("PAST vcpu_load\n");
	if (waserror()) {
		/* we really need to fix waserror() */
		printk("vcpu_setup failed: %s\n", current_errstr());
		QUNLOCK(&vcpu->mutex);
		nexterror();
	}

	/* need memory for the rmode_tss. I have no idea how this happened
	 * originally in kvm.
	 */
	/* this sucks. */
	QUNLOCK(&vcpu->mutex);
	void *v;
	struct litevm_memory_region vmr;
	vmr.slot = 0;
	vmr.flags = 0;
	vmr.guest_phys_addr = /* guess. */ 0x1000000;
	vmr.memory_size = 0x10000;
	vmr.init_data = NULL;
	if (vm_set_memory_region(litevm, &vmr))
		printk("vm_set_memory_region failed");

	printk("set memory region done\n");

	if (!init_rmode_tss(litevm)) {
		error("vcpu_setup: init_rmode_tss failed");
	}

	QLOCK(&vcpu->mutex);
	r = litevm_vcpu_setup(vcpu);

	vcpu_put(vcpu);

	printk("r is %d\n", r);

	if (!r) {
		poperror();
		print_func_exit();
		return 0;
	}

1608         errstring = "vcpu setup failed";
1609
1610 out_free_vcpus:
1611 out:
1612         print_func_exit();
1613         return r;
1614 }
1615
1616 /*
1617  * Allocate some memory and give it an address in the guest physical address
1618  * space.
1619  *
1620  * Discontiguous memory is allowed, mostly for framebuffers.
1621  */
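/*
 * Caller's-eye sketch (mirroring the call in vmx_create_vcpu above; the
 * addresses are just examples, not requirements):
 *
 *      struct litevm_memory_region vmr = {
 *              .slot = 0,
 *              .flags = 0,
 *              .guest_phys_addr = 0x1000000,   // page-aligned
 *              .memory_size = 0x10000,         // page-aligned, in bytes
 *              .init_data = NULL,              // or a buffer to copy in
 *      };
 *      if (vm_set_memory_region(litevm, &vmr))
 *              printk("vm_set_memory_region failed\n");
 */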
1622 int vm_set_memory_region(struct litevm *litevm,
1623                                                  struct litevm_memory_region *mem)
1624 {
1625         print_func_entry();
1626         ERRSTACK(2);
1627         int r;
1628         gfn_t base_gfn;
1629         unsigned long npages;
1630         unsigned long i;
1631         struct litevm_memory_slot *memslot;
1632         struct litevm_memory_slot old, new;
1633         int memory_config_version;
1634         void *init_data = mem->init_data;
1635         int pass = 1;
1636         printk("%s: slot %d base %08x npages %d\n", 
1637                 __func__, 
1638                mem->slot, mem->guest_phys_addr, 
1639                mem->memory_size);
1640         /* should not happen but ... */
1641         if (!litevm)
1642                 error("NULL litevm in %s", __func__);
1643
1644         if (!mem)
1645                 error("NULL mem in %s", __func__);
1646         /* I don't care right now. *
1647         if (litevm->busy)
1648                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1649         */
1650         r = -EINVAL;
1651         /* General sanity checks */
1652         if (mem->memory_size & (PAGE_SIZE - 1))
1653                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1654         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1655                 error("guest_phys_addr 0x%llx is not page-aligned",
1656                           mem->guest_phys_addr);
1657         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1658                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1659         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1660                 error("0x%x + 0x%x is < 0x%x",
1661                           mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1662
1663         memslot = &litevm->memslots[mem->slot];
1664         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1665         npages = mem->memory_size >> PAGE_SHIFT;
1666
1667         if (!npages)
1668                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1669
1670         /* this is actually a very tricky for loop. The use of
1671          * error is a bit dangerous, so we don't use it much.
1672          * consider a rewrite. Would be nice if akaros could do the
1673          * allocation of a bunch of pages for us.
1674          */
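        /* A sketch of the retry protocol used from here down (optimistic
         * concurrency keyed off memory_config_version):
         *
         *   raced:
         *      lock; snapshot memory_config_version; sanity-check the slot;
         *      unlock;
         *      allocate phys_mem[] and the dirty bitmap (may block);
         *      lock; if the version changed, unlock, free, and goto raced;
         *      otherwise commit the new slot, bump the version, and unlock.
         */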
1675 raced:
1676         printk("raced: pass %d\n", pass);
1677         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1678         void monitor(void *);
1679         monitor(NULL);
1680         SPLL(&litevm->lock);
1681         printk("locked\n");
1682
1683         if (waserror()) {
1684                 printk("error in %s, %s\n", __func__, current_errstr());
1685                 SPLU(&litevm->lock);
1686                 nexterror();
1687         }
1688
1689         memory_config_version = litevm->memory_config_version;
1690         new = old = *memslot;
1691         printk("memory_config_version %d\n", memory_config_version);
1692
1693         new.base_gfn = base_gfn;
1694         new.npages = npages;
1695         new.flags = mem->flags;
1696
1697         /* Disallow changing a memory slot's size. */
1698         r = -EINVAL;
1699         if (npages && old.npages && npages != old.npages)
1700                 error("npages is %d, old.npages is %d, can't change",
1701                           npages, old.npages);
1702
1703         /* Check for overlaps */
1704         r = -EEXIST;
1705         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1706                 struct litevm_memory_slot *s = &litevm->memslots[i];
1707 printk("Region %d: base gfn 0x%x npages %d\n", s->base_gfn, s->npages);
1708                 if (s == memslot)
1709                         continue;
1710                 if (!((base_gfn + npages <= s->base_gfn) ||
1711                           (base_gfn >= s->base_gfn + s->npages)))
1712                         error("Overlap");
1713         }
1714         /*
1715          * Do memory allocations outside lock.  memory_config_version will
1716          * detect any races.
1717          */
1718         SPLU(&litevm->lock);
1719         printk("unlocked\n");
1720         poperror();
1721
1722         /* Deallocate if slot is being removed */
1723         if (!npages)
1724                 new.phys_mem = 0;
1725
1726         /* Free page dirty bitmap if unneeded */
1727         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1728                 new.dirty_bitmap = 0;
1729
1730         r = -ENOMEM;
1731
1732         /* Allocate if a slot is being created */
1733         if (npages && !new.phys_mem) {
1734                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1735
1736                 if (!new.phys_mem)
1737                         goto out_free;
1738
1739                 for (i = 0; i < npages; ++i) {
1740                         int ret;
1741                         ret = kpage_alloc(&new.phys_mem[i]);
1742                         printk("PAGEALLOC: va %p pa %p\n",page2kva(new.phys_mem[i]),page2pa(new.phys_mem[i]));
1743                         if (ret != ESUCCESS)
1744                                 goto out_free;
1745                         if (init_data) {
1746                                 printk("init data memcpy(%p,%p,4096);\n",
1747                                            page2kva(new.phys_mem[i]), init_data);
1748                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1749                                 init_data += PAGE_SIZE;
1750                         } else {
1751                                 int j;
1752                                 //memset(page2kva(new.phys_mem[i]), 0xf4 /* hlt */, PAGE_SIZE);
1753                                 uint8_t *cp = page2kva(new.phys_mem[i]);
1754                                 memset(cp, 0, PAGE_SIZE);
1755                                 if (base_gfn < 0x100000) {
1756                                         for (j = 0; j < PAGE_SIZE; j += 2) {
1757                                                 // XORL %EAX, %EAX
1758                                                 cp[j] = 0x31; cp[j+1] = 0xc0;
1759                                         }
1760                                         // 1: jmp 1b
1761                                         cp[4094] = 0xeb;
1762                                         cp[4095] = 0xfe;
1763                                 }
1764
1765                                 init_data += PAGE_SIZE;
1766                         }
1767                 }
1768         }
1769
1770         /* Allocate page dirty bitmap if needed */
1771         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1772                 unsigned dirty_bytes;   //ALIGN(npages, BITS_PER_LONG) / 8;
1773                 dirty_bytes =
1774                         (((npages + BITS_PER_LONG -
1775                            1) / BITS_PER_LONG) * BITS_PER_LONG) / 8;
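                /* Worked example: with npages = 70 and BITS_PER_LONG = 64,
                 * (70 + 63) / 64 = 2 longs -> 2 * 64 = 128 bits -> 16 bytes,
                 * i.e. the bitmap is rounded up to a whole number of longs. */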
1776
1777                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1778                 if (!new.dirty_bitmap) {
1779                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1780                         goto out_free;
1781                 }
1782         }
1783
1784         SPLL(&litevm->lock);
1785         printk("locked\n");
1786         if (memory_config_version != litevm->memory_config_version) {
1787                 SPLU(&litevm->lock);
1788                 printk("unlocked, try again\n");
1789                 litevm_free_physmem_slot(&new, &old);
1790                 goto raced;
1791         }
1792
1793         r = -EAGAIN;
1794         if (litevm->busy) {
1795                 printk("BUSY!\n");
1796                 goto out_unlock;
1797         }
1798
1799         if (mem->slot >= litevm->nmemslots)
1800                 litevm->nmemslots = mem->slot + 1;
1801
1802         *memslot = new;
1803         ++litevm->memory_config_version;
1804
1805         SPLU(&litevm->lock);
1806         printk("unlocked\n");
1807         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1808                 struct litevm_vcpu *vcpu;
1809
1810                 vcpu = vcpu_load(litevm, i);
1811                 if (!vcpu){
1812                         printk("%s: no cpu %d\n", __func__, i);
1813                         continue;
1814                 }
1815                 litevm_mmu_reset_context(vcpu);
1816                 vcpu_put(vcpu);
1817         }
1818
1819         litevm_free_physmem_slot(&old, &new);
1820         print_func_exit();
1821         return 0;
1822
1823 out_unlock:
1824         SPLU(&litevm->lock);
1825         printk("out_unlock\n");
1826 out_free:
1827         printk("out_free\n");
1828         litevm_free_physmem_slot(&new, &old);
1829 out:
1830         printk("vm_set_memory_region: return %d\n", r);
1831         print_func_exit();
1832         return r;
1833 }
1834
1835 #if 0
1836 /*
1837  * Get (and clear) the dirty memory log for a memory slot.
1838  */
1839 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1840                                                                                   struct litevm_dirty_log *log)
1841 {
1842         struct litevm_memory_slot *memslot;
1843         int r, i;
1844         int n;
1845         unsigned long any = 0;
1846
1847         SPLL(&litevm->lock);
1848
1849         /*
1850          * Prevent changes to guest memory configuration even while the lock
1851          * is not taken.
1852          */
1853         ++litevm->busy;
1854         SPLU(&litevm->lock);
1855         r = -EINVAL;
1856         if (log->slot >= LITEVM_MEMORY_SLOTS)
1857                 goto out;
1858
1859         memslot = &litevm->memslots[log->slot];
1860         r = -ENOENT;
1861         if (!memslot->dirty_bitmap)
1862                 goto out;
1863
1864         n = ALIGN(memslot->npages, 8) / 8;
1865
1866         for (i = 0; !any && i < n; ++i)
1867                 any = memslot->dirty_bitmap[i];
1868
1869         r = -EFAULT;
1870         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1871                 goto out;
1872
1873         if (any) {
1874                 SPLL(&litevm->lock);
1875                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1876                 SPLU(&litevm->lock);
1877                 memset(memslot->dirty_bitmap, 0, n);
1878                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1879                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1880
1881                         if (!vcpu)
1882                                 continue;
1883                         flush_guest_tlb(vcpu);
1884                         vcpu_put(vcpu);
1885                 }
1886         }
1887
1888         r = 0;
1889
1890 out:
1891         SPLL(&litevm->lock);
1892         --litevm->busy;
1893         SPLU(&litevm->lock);
1894         return r;
1895 }
1896 #endif
1897
1898 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1899 {
1900         print_func_entry();
1901         int i;
1902
1903         printk("%s: litevm %p gfn %d\n", litevm, gfn);
1904         for (i = 0; i < litevm->nmemslots; ++i) {
1905                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1906
1907                 printk("%s: slot %d gfn 0x%lx base_gfn %lx npages %x\n", 
1908                         __func__, i, gfn,memslot->base_gfn, memslot->npages);
1909                 if (gfn >= memslot->base_gfn
1910                         && gfn < memslot->base_gfn + memslot->npages) {
1911                         print_func_exit();
1912                         return memslot;
1913                 }
1914         }
1915         print_func_exit();
1916         return 0;
1917 }
1918
1919 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1920 {
1921         print_func_entry();
1922         int i;
1923         struct litevm_memory_slot *memslot = 0;
1924         unsigned long rel_gfn;
1925
1926         for (i = 0; i < litevm->nmemslots; ++i) {
1927                 memslot = &litevm->memslots[i];
1928
1929                 if (gfn >= memslot->base_gfn
1930                         && gfn < memslot->base_gfn + memslot->npages) {
1931
1932                         if (!memslot || !memslot->dirty_bitmap) {
1933                                 print_func_exit();
1934                                 return;
1935                         }
1936
1937                         rel_gfn = gfn - memslot->base_gfn;
1938
1939                         /* avoid RMW */
1940                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1941                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1942                         print_func_exit();
1943                         return;
1944                 }
1945         }
1946         print_func_exit();
1947 }
1948
1949 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1950 {
1951         print_func_entry();
1952         unsigned long rip;
1953         uint32_t interruptibility;
1954
1955         rip = vmcs_readl(GUEST_RIP);
1956         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1957         vmcs_writel(GUEST_RIP, rip);
1958
1959         /*
1960          * We emulated an instruction, so temporary interrupt blocking
1961          * should be removed, if set.
1962          */
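        /* Bits 0 and 1 of the interruptibility state are the blocking-by-STI
         * and blocking-by-MOV-SS flags, which is why the mask below is 3. */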
1963         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1964         if (interruptibility & 3)
1965                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3);
1966         print_func_exit();
1967 }
1968
1969 static int emulator_read_std(unsigned long addr,
1970                                                          unsigned long *val,
1971                                                          unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1972 {
1973         print_func_entry();
1974         struct litevm_vcpu *vcpu = ctxt->vcpu;
1975         void *data = val;
1976
1977         while (bytes) {
1978                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1979                 unsigned offset = addr & (PAGE_SIZE - 1);
1980                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
1981                         bytes : (unsigned)PAGE_SIZE - offset;
1982                 unsigned long pfn;
1983                 struct litevm_memory_slot *memslot;
1984                 void *page;
1985
1986                 if (gpa == UNMAPPED_GVA) {
1987                         print_func_exit();
1988                         return X86EMUL_PROPAGATE_FAULT;
1989                 }
1990                 pfn = gpa >> PAGE_SHIFT;
1991                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1992                 if (!memslot) {
1993                         print_func_exit();
1994                         return X86EMUL_UNHANDLEABLE;
1995                 }
1996                 page = page2kva(gfn_to_page(memslot, pfn));
1997
1998                 memcpy(data, page + offset, tocopy);
1999
2000                 bytes -= tocopy;
2001                 data += tocopy;
2002                 addr += tocopy;
2003         }
2004
2005         print_func_exit();
2006         return X86EMUL_CONTINUE;
2007 }
2008
2009 static int emulator_write_std(unsigned long addr,
2010                                                           unsigned long val,
2011                                                           unsigned int bytes, struct x86_emulate_ctxt *ctxt)
2012 {
2013         print_func_entry();
2014         printk("emulator_write_std: addr %lx n %d\n", addr, bytes);
2015         print_func_exit();
2016         return X86EMUL_UNHANDLEABLE;
2017 }
2018
2019 static int emulator_read_emulated(unsigned long addr,
2020                                                                   unsigned long *val,
2021                                                                   unsigned int bytes,
2022                                                                   struct x86_emulate_ctxt *ctxt)
2023 {
2024         print_func_entry();
2025         struct litevm_vcpu *vcpu = ctxt->vcpu;
2026
2027         if (vcpu->mmio_read_completed) {
2028                 memcpy(val, vcpu->mmio_data, bytes);
2029                 vcpu->mmio_read_completed = 0;
2030                 print_func_exit();
2031                 return X86EMUL_CONTINUE;
2032         } else if (emulator_read_std(addr, val, bytes, ctxt)
2033                            == X86EMUL_CONTINUE) {
2034                 print_func_exit();
2035                 return X86EMUL_CONTINUE;
2036         } else {
2037                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2038                 if (gpa == UNMAPPED_GVA) {
2039                         print_func_exit();
2040                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
2041                 }
2042                 vcpu->mmio_needed = 1;
2043                 vcpu->mmio_phys_addr = gpa;
2044                 vcpu->mmio_size = bytes;
2045                 vcpu->mmio_is_write = 0;
2046
2047                 print_func_exit();
2048                 return X86EMUL_UNHANDLEABLE;
2049         }
2050 }
2051
2052 static int emulator_write_emulated(unsigned long addr,
2053                                                                    unsigned long val,
2054                                                                    unsigned int bytes,
2055                                                                    struct x86_emulate_ctxt *ctxt)
2056 {
2057         print_func_entry();
2058         struct litevm_vcpu *vcpu = ctxt->vcpu;
2059         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2060
2061         if (gpa == UNMAPPED_GVA) {
2062                 print_func_exit();
2063                 return X86EMUL_PROPAGATE_FAULT;
2064         }
2065
2066         vcpu->mmio_needed = 1;
2067         vcpu->mmio_phys_addr = gpa;
2068         vcpu->mmio_size = bytes;
2069         vcpu->mmio_is_write = 1;
2070         memcpy(vcpu->mmio_data, &val, bytes);
2071
2072         print_func_exit();
2073         return X86EMUL_CONTINUE;
2074 }
2075
2076 static int emulator_cmpxchg_emulated(unsigned long addr,
2077                                                                          unsigned long old,
2078                                                                          unsigned long new,
2079                                                                          unsigned int bytes,
2080                                                                          struct x86_emulate_ctxt *ctxt)
2081 {
2082         print_func_entry();
2083         static int reported;
2084
2085         if (!reported) {
2086                 reported = 1;
2087                 printk("litevm: emulating exchange as write\n");
2088         }
2089         print_func_exit();
2090         return emulator_write_emulated(addr, new, bytes, ctxt);
2091 }
2092
2093 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2094 {
2095         print_func_entry();
2096         static int reported;
2097         uint8_t opcodes[4];
2098         unsigned long rip = vmcs_readl(GUEST_RIP);
2099         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2100
2101         if (reported) {
2102                 print_func_exit();
2103                 return;
2104         }
2105
2106         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2107
2108         printk("emulation failed but !mmio_needed?"
2109                    " rip %lx %02x %02x %02x %02x\n",
2110                    rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2111         reported = 1;
2112         print_func_exit();
2113 }
2114
2115 struct x86_emulate_ops emulate_ops = {
2116         .read_std = emulator_read_std,
2117         .write_std = emulator_write_std,
2118         .read_emulated = emulator_read_emulated,
2119         .write_emulated = emulator_write_emulated,
2120         .cmpxchg_emulated = emulator_cmpxchg_emulated,
2121 };
2122
2123 enum emulation_result {
2124         EMULATE_DONE,                           /* no further processing */
2125         EMULATE_DO_MMIO,                        /* litevm_run filled with mmio request */
2126         EMULATE_FAIL,                           /* can't emulate this instruction */
2127 };
2128
2129 static int emulate_instruction(struct litevm_vcpu *vcpu,
2130                                                            struct litevm_run *run,
2131                                                            unsigned long cr2, uint16_t error_code)
2132 {
2133         print_func_entry();
2134         struct x86_emulate_ctxt emulate_ctxt;
2135         int r;
2136         uint32_t cs_ar;
2137
2138         vcpu_load_rsp_rip(vcpu);
2139
2140         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2141
2142         emulate_ctxt.vcpu = vcpu;
2143         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2144         emulate_ctxt.cr2 = cr2;
2145         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2146                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2147                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2148                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2149
2150         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2151                 emulate_ctxt.cs_base = 0;
2152                 emulate_ctxt.ds_base = 0;
2153                 emulate_ctxt.es_base = 0;
2154                 emulate_ctxt.ss_base = 0;
2155                 emulate_ctxt.gs_base = 0;
2156                 emulate_ctxt.fs_base = 0;
2157         } else {
2158                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2159                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2160                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2161                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2162                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2163                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2164         }
2165
2166         vcpu->mmio_is_write = 0;
2167         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2168
2169         if ((r || vcpu->mmio_is_write) && run) {
2170                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2171                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2172                 run->mmio.len = vcpu->mmio_size;
2173                 run->mmio.is_write = vcpu->mmio_is_write;
2174         }
2175
2176         if (r) {
2177                 if (!vcpu->mmio_needed) {
2178                         report_emulation_failure(&emulate_ctxt);
2179                         print_func_exit();
2180                         return EMULATE_FAIL;
2181                 }
2182                 print_func_exit();
2183                 return EMULATE_DO_MMIO;
2184         }
2185
2186         vcpu_put_rsp_rip(vcpu);
2187         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2188
2189         if (vcpu->mmio_is_write) {
2190                 print_func_exit();
2191                 return EMULATE_DO_MMIO;
2192         }
2193
2194         print_func_exit();
2195         return EMULATE_DONE;
2196 }
2197
2198 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2199 {
2200         print_func_entry();
2201         print_func_exit();
2202         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2203 }
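/* mk_cr_64 keeps the upper 32 bits of the current value and replaces the low
 * 32 bits, e.g. (a sketch with made-up numbers)
 * mk_cr_64(0x123456789abcdef0, 0x80050033) == 0x1234567880050033. */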
2204
2205 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2206 {
2207         print_func_entry();
2208         vmcs_writel(GUEST_GDTR_BASE, base);
2209         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2210         print_func_exit();
2211 }
2212
2213 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2214 {
2215         print_func_entry();
2216         vmcs_writel(GUEST_IDTR_BASE, base);
2217         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2218         print_func_exit();
2219 }
2220
2221 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2222                                    unsigned long *rflags)
2223 {
2224         print_func_entry();
2225         lmsw(vcpu, msw);
2226         *rflags = vmcs_readl(GUEST_RFLAGS);
2227         print_func_exit();
2228 }
2229
2230 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2231 {
2232         print_func_entry();
2233         switch (cr) {
2234                 case 0:
2235                         print_func_exit();
2236                         return guest_cr0();
2237                 case 2:
2238                         print_func_exit();
2239                         return vcpu->cr2;
2240                 case 3:
2241                         print_func_exit();
2242                         return vcpu->cr3;
2243                 case 4:
2244                         print_func_exit();
2245                         return guest_cr4();
2246                 default:
2247                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2248                         print_func_exit();
2249                         return 0;
2250         }
2251 }
2252
2253 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2254                                          unsigned long *rflags)
2255 {
2256         print_func_entry();
2257         switch (cr) {
2258                 case 0:
2259                         set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2260                         *rflags = vmcs_readl(GUEST_RFLAGS);
2261                         break;
2262                 case 2:
2263                         vcpu->cr2 = val;
2264                         break;
2265                 case 3:
2266                         set_cr3(vcpu, val);
2267                         break;
2268                 case 4:
2269                         set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2270                         break;
2271                 default:
2272                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2273         }
2274         print_func_exit();
2275 }
2276
2277 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2278                                                                   int vec, uint32_t err_code)
2279 {
2280         print_func_entry();
2281         if (!vcpu->rmode.active) {
2282                 print_func_exit();
2283                 return 0;
2284         }
2285
2286         if (vec == GP_VECTOR && err_code == 0)
2287                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2288                         print_func_exit();
2289                         return 1;
2290                 }
2291         print_func_exit();
2292         return 0;
2293 }
2294
2295 static int handle_exception(struct litevm_vcpu *vcpu,
2296                                                         struct litevm_run *litevm_run)
2297 {
2298         print_func_entry();
2299         uint32_t intr_info, error_code;
2300         unsigned long cr2, rip;
2301         uint32_t vect_info;
2302         enum emulation_result er;
2303
2304         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2305         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2306 printk("vect_info %x intro_info %x\n", vect_info, intr_info);
2307 printk("page fault? %d\n", is_page_fault(intr_info));
2308
2309         if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) {
2310                 printk("%s: unexpected, vectoring info 0x%x "
2311                            "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2312         }
2313
2314         if (is_external_interrupt(vect_info)) {
2315 printk("extern interrupt\n");
2316                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2317                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_pending), irq);
2318                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_summary),
2319                                                            irq / BITS_PER_LONG);
2320         }
2321
2322         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) {  /* nmi */
2323 printk("nmi\n");
2324                 asm("int $2");
2325                 print_func_exit();
2326                 return 1;
2327         }
2328         error_code = 0;
2329         rip = vmcs_readl(GUEST_RIP);
2330 printk("GUEST_RIP %x\n", rip);
2331         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2332                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2333         if (is_page_fault(intr_info)) {
2334 printk("PAGE FAULT!\n");
2335                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2336
2337                 SPLL(&vcpu->litevm->lock);
2338                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2339                         SPLU(&vcpu->litevm->lock);
2340                         print_func_exit();
2341                         return 1;
2342                 }
2343
2344                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2345                 SPLU(&vcpu->litevm->lock);
2346
2347                 switch (er) {
2348                         case EMULATE_DONE:
2349                                 print_func_exit();
2350                                 return 1;
2351                         case EMULATE_DO_MMIO:
2352                                 ++litevm_stat.mmio_exits;
2353                                 litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2354                                 print_func_exit();
2355                                 return 0;
2356                         case EMULATE_FAIL:
2357                                 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2358                                 break;
2359                         default:
2360                                 assert(0);
2361                 }
2362         }
2363
2364         if (vcpu->rmode.active &&
2365                 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2366                                                            error_code)) {
2367             printk("RMODE EXCEPTION might have been handled\n");
2368                 print_func_exit();
2369                 return 1;
2370         }
2371
2372         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
2373                 (INTR_TYPE_EXCEPTION | 1)) {
2374                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2375                 print_func_exit();
2376                 return 0;
2377         }
2378         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2379         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2380         litevm_run->ex.error_code = error_code;
2381         print_func_exit();
2382         return 0;
2383 }
2384
2385 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2386                                                                          struct litevm_run *litevm_run)
2387 {
2388         //print_func_entry();
2389         ++litevm_stat.irq_exits;
2390         //print_func_exit();
2391         return 1;
2392 }
2393
2394 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t * count)
2395 {
2396         print_func_entry();
2397         uint64_t inst;
2398         gva_t rip;
2399         int countr_size;
2400         int i, n;
2401
2402         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2403                 countr_size = 2;
2404         } else {
2405                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2406
2407                 countr_size = (cs_ar & AR_L_MASK) ? 8 : (cs_ar & AR_DB_MASK) ? 4 : 2;
2408         }
2409
2410         rip = vmcs_readl(GUEST_RIP);
2411         if (countr_size != 8)
2412                 rip += vmcs_readl(GUEST_CS_BASE);
2413
2414         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2415
2416         for (i = 0; i < n; i++) {
2417                 switch (((uint8_t *) & inst)[i]) {
2418                         case 0xf0:
2419                         case 0xf2:
2420                         case 0xf3:
2421                         case 0x2e:
2422                         case 0x36:
2423                         case 0x3e:
2424                         case 0x26:
2425                         case 0x64:
2426                         case 0x65:
2427                         case 0x66:
2428                                 break;
2429                         case 0x67:
2430                                 countr_size = (countr_size == 2) ? 4 : (countr_size >> 1);
                                /* fall through */
2431                         default:
2432                                 goto done;
2433                 }
2434         }
2435         print_func_exit();
2436         return 0;
2437 done:
2438         countr_size *= 8;
2439         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2440         print_func_exit();
2441         return 1;
2442 }
2443
2444 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2445 {
2446         print_func_entry();
2447         uint64_t exit_qualification;
2448
2449         ++litevm_stat.io_exits;
2450         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
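        /* Layout of the I/O exit qualification, as decoded below: bits 2:0 are
         * the access size minus one, bit 3 is the direction (1 = IN), bit 4 is
         * set for string instructions, bit 5 for a REP prefix, and bits 31:16
         * hold the port number. */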
2451         litevm_run->exit_reason = LITEVM_EXIT_IO;
2452         if (exit_qualification & 8)
2453                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2454         else
2455                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2456         litevm_run->io.size = (exit_qualification & 7) + 1;
2457         litevm_run->io.string = (exit_qualification & 16) != 0;
2458         litevm_run->io.string_down
2459                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2460         litevm_run->io.rep = (exit_qualification & 32) != 0;
2461         litevm_run->io.port = exit_qualification >> 16;
2462         if (litevm_run->io.string) {
2463                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2464                         print_func_exit();
2465                         return 1;
2466                 }
2467                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2468         } else
2469                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX];       /* rax */
2470         print_func_exit();
2471         return 0;
2472 }
2473
2474 static int handle_invlpg(struct litevm_vcpu *vcpu,
2475                                                  struct litevm_run *litevm_run)
2476 {
2477         print_func_entry();
2478         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2479         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2480         SPLL(&vcpu->litevm->lock);
2481         vcpu->mmu.inval_page(vcpu, address);
2482         SPLU(&vcpu->litevm->lock);
2483         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2484         print_func_exit();
2485         return 1;
2486 }
2487
2488 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2489 {
2490         print_func_entry();
2491         uint64_t exit_qualification;
2492         int cr;
2493         int reg;
2494
2495 #ifdef LITEVM_DEBUG
2496         if (guest_cpl() != 0) {
2497                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2498                 inject_gp(vcpu);
2499                 print_func_exit();
2500                 return 1;
2501         }
2502 #endif
2503
2504         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2505         cr = exit_qualification & 15;
2506         reg = (exit_qualification >> 8) & 15;
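        /* For CR-access exits the qualification encodes: bits 3:0 the control
         * register number, bits 5:4 the access type (0 = mov to CR, 1 = mov
         * from CR, 2 = clts, 3 = lmsw), and bits 11:8 the general register. */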
2507         switch ((exit_qualification >> 4) & 3) {
2508                 case 0: /* mov to cr */
2509                         switch (cr) {
2510                                 case 0:
2511                                         vcpu_load_rsp_rip(vcpu);
2512                                         set_cr0(vcpu, vcpu->regs[reg]);
2513                                         skip_emulated_instruction(vcpu);
2514                                         print_func_exit();
2515                                         return 1;
2516                                 case 3:
2517                                         vcpu_load_rsp_rip(vcpu);
2518                                         set_cr3(vcpu, vcpu->regs[reg]);
2519                                         skip_emulated_instruction(vcpu);
2520                                         print_func_exit();
2521                                         return 1;
2522                                 case 4:
2523                                         vcpu_load_rsp_rip(vcpu);
2524                                         set_cr4(vcpu, vcpu->regs[reg]);
2525                                         skip_emulated_instruction(vcpu);
2526                                         print_func_exit();
2527                                         return 1;
2528                                 case 8:
2529                                         vcpu_load_rsp_rip(vcpu);
2530                                         set_cr8(vcpu, vcpu->regs[reg]);
2531                                         skip_emulated_instruction(vcpu);
2532                                         print_func_exit();
2533                                         return 1;
2534                         };
2535                         break;
2536                 case 1: /*mov from cr */
2537                         switch (cr) {
2538                                 case 3:
2539                                         vcpu_load_rsp_rip(vcpu);
2540                                         vcpu->regs[reg] = vcpu->cr3;
2541                                         vcpu_put_rsp_rip(vcpu);
2542                                         skip_emulated_instruction(vcpu);
2543                                         print_func_exit();
2544                                         return 1;
2545                                 case 8:
2546                                         printd("handle_cr: read CR8 " "cpu erratum AA15\n");
2547                                         vcpu_load_rsp_rip(vcpu);
2548                                         vcpu->regs[reg] = vcpu->cr8;
2549                                         vcpu_put_rsp_rip(vcpu);
2550                                         skip_emulated_instruction(vcpu);
2551                                         print_func_exit();
2552                                         return 1;
2553                         }
2554                         break;
2555                 case 3: /* lmsw */
2556                         lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2557
2558                         skip_emulated_instruction(vcpu);
2559                         print_func_exit();
2560                         return 1;
2561                 default:
2562                         break;
2563         }
2564         litevm_run->exit_reason = 0;
2565         printk("litevm: unhandled control register: op %d cr %d\n",
2566                    (int)(exit_qualification >> 4) & 3, cr);
2567         print_func_exit();
2568         return 0;
2569 }
2570
2571 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2572 {
2573         print_func_entry();
2574         uint64_t exit_qualification;
2575         unsigned long val;
2576         int dr, reg;
2577
2578         /*
2579          * FIXME: this code assumes the host is debugging the guest.
2580          *        need to deal with guest debugging itself too.
2581          */
2582         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2583         dr = exit_qualification & 7;
2584         reg = (exit_qualification >> 8) & 15;
2585         vcpu_load_rsp_rip(vcpu);
2586         if (exit_qualification & 16) {
2587                 /* mov from dr */
2588                 switch (dr) {
2589                         case 6:
2590                                 val = 0xffff0ff0;
2591                                 break;
2592                         case 7:
2593                                 val = 0x400;
2594                                 break;
2595                         default:
2596                                 val = 0;
2597                 }
2598                 vcpu->regs[reg] = val;
2599         } else {
2600                 /* mov to dr */
2601         }
2602         vcpu_put_rsp_rip(vcpu);
2603         skip_emulated_instruction(vcpu);
2604         print_func_exit();
2605         return 1;
2606 }
2607
2608 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2609 {
2610         print_func_entry();
2611         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2612         print_func_exit();
2613         return 0;
2614 }
2615
2616 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2617 {
2618         print_func_entry();
2619         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2620         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2621         uint64_t data;
2622
2623         if (guest_cpl() != 0) {
2624                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2625                 inject_gp(vcpu);
2626                 print_func_exit();
2627                 return 1;
2628         }
2629
2630         switch (ecx) {
2631                 case MSR_FS_BASE:
2632                         data = vmcs_readl(GUEST_FS_BASE);
2633                         break;
2634                 case MSR_GS_BASE:
2635                         data = vmcs_readl(GUEST_GS_BASE);
2636                         break;
2637                 case MSR_IA32_SYSENTER_CS:
2638                         data = vmcs_read32(GUEST_SYSENTER_CS);
2639                         break;
2640                 case MSR_IA32_SYSENTER_EIP:
2641                         data = vmcs_read32(GUEST_SYSENTER_EIP);
2642                         break;
2643                 case MSR_IA32_SYSENTER_ESP:
2644                         data = vmcs_read32(GUEST_SYSENTER_ESP);
2645                         break;
2646                 case MSR_IA32_MC0_CTL:
2647                 case MSR_IA32_MCG_STATUS:
2648                 case MSR_IA32_MCG_CAP:
2649                 case MSR_IA32_MC0_MISC:
2650                 case MSR_IA32_MC0_MISC + 4:
2651                 case MSR_IA32_MC0_MISC + 8:
2652                 case MSR_IA32_MC0_MISC + 12:
2653                 case MSR_IA32_MC0_MISC + 16:
2654                 case MSR_IA32_UCODE_REV:
2655                         /* MTRR registers */
2656                 case 0xfe:
2657                 case 0x200 ... 0x2ff:
2658                         data = 0;
2659                         break;
2660                 case MSR_IA32_APICBASE:
2661                         data = vcpu->apic_base;
2662                         break;
2663                 default:
2664                         if (msr) {
2665                                 data = msr->value;
2666                                 break;
2667                         }
2668                         printk("litevm: unhandled rdmsr: %x\n", ecx);
2669                         inject_gp(vcpu);
2670                         print_func_exit();
2671                         return 1;
2672         }
2673
2674         /* FIXME: handling of bits 32:63 of rax, rdx */
2675         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2676         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2677         skip_emulated_instruction(vcpu);
2678         print_func_exit();
2679         return 1;
2680 }
2681
2682 #ifdef __x86_64__
2683
2684 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2685 {
2686         print_func_entry();
2687         struct vmx_msr_entry *msr;
2688
2689         if (efer & EFER_RESERVED_BITS) {
2690                 printd("set_efer: 0x%llx #GP, reserved bits\n", efer);
2691                 inject_gp(vcpu);
2692                 print_func_exit();
2693                 return;
2694         }
2695
2696         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2697                 printd("set_efer: #GP, change LME while paging\n");
2698                 inject_gp(vcpu);
2699                 print_func_exit();
2700                 return;
2701         }
2702
2703         efer &= ~EFER_LMA;
2704         efer |= vcpu->shadow_efer & EFER_LMA;
2705
2706         vcpu->shadow_efer = efer;
2707
2708         msr = find_msr_entry(vcpu, MSR_EFER);
2709
2710         if (!(efer & EFER_LMA))
2711                 efer &= ~EFER_LME;
2712         msr->value = efer;
2713         skip_emulated_instruction(vcpu);
2714         print_func_exit();
2715 }
2716
2717 #endif
2718
2719 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2720
2721 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2722 {
2723         print_func_entry();
2724         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2725         struct vmx_msr_entry *msr;
2726         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2727                 | ((uint64_t) (vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2728
2729         if (guest_cpl() != 0) {
2730                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2731                 inject_gp(vcpu);
2732                 print_func_exit();
2733                 return 1;
2734         }
2735
2736         switch (ecx) {
2737                 case MSR_FS_BASE:
2738                         vmcs_writel(GUEST_FS_BASE, data);
2739                         break;
2740                 case MSR_GS_BASE:
2741                         vmcs_writel(GUEST_GS_BASE, data);
2742                         break;
2743                 case MSR_IA32_SYSENTER_CS:
2744                         vmcs_write32(GUEST_SYSENTER_CS, data);
2745                         break;
2746                 case MSR_IA32_SYSENTER_EIP:
2747                         vmcs_write32(GUEST_SYSENTER_EIP, data);
2748                         break;
2749                 case MSR_IA32_SYSENTER_ESP:
2750                         vmcs_write32(GUEST_SYSENTER_ESP, data);
2751                         break;
2752                 case MSR_EFER:
2753                         set_efer(vcpu, data);
2754                         print_func_exit();
2755                         return 1;
2756                 case MSR_IA32_MC0_STATUS:
2757                         printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data);
2758                         break;
2759                 case MSR_IA32_TIME_STAMP_COUNTER:{
2760                                 uint64_t tsc;
2761
2762                                 tsc = read_tsc();
2763                                 vmcs_write64(TSC_OFFSET, data - tsc);
2764                                 break;
2765                         }
2766                 case MSR_IA32_UCODE_REV:
2767                 case MSR_IA32_UCODE_WRITE:
2768                 case 0x200 ... 0x2ff:   /* MTRRs */
2769                         break;
2770                 case MSR_IA32_APICBASE:
2771                         vcpu->apic_base = data;
2772                         break;
2773                 default:
2774                         msr = find_msr_entry(vcpu, ecx);
2775                         if (msr) {
2776                                 msr->value = data;
2777                                 break;
2778                         }
2779                         printk("litevm: unhandled wrmsr: %x\n", ecx);
2780                         inject_gp(vcpu);
2781                         print_func_exit();
2782                         return 1;
2783         }
2784         skip_emulated_instruction(vcpu);
2785         print_func_exit();
2786         return 1;
2787 }
2788
2789 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2790                                                                    struct litevm_run *litevm_run)
2791 {
2792         print_func_entry();
2793         /* Turn off interrupt window reporting. */
2794         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2795                                  vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2796                                  & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2797         print_func_exit();
2798         return 1;
2799 }
2800
2801 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2802 {
2803         print_func_entry();
2804         skip_emulated_instruction(vcpu);
2805         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2806                 print_func_exit();
2807                 return 1;
2808         }
2809
2810         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2811         print_func_exit();
2812         return 0;
2813 }
2814
2815 /*
2816  * The exit handlers return 1 if the exit was handled fully and guest execution
2817  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2818  * to be done to userspace and return 0.
2819  */
2820 static int (*litevm_vmx_exit_handlers[]) (struct litevm_vcpu * vcpu,
2821                                                                                   struct litevm_run * litevm_run) = {
2822 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2823                 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2824                 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2825                 [EXIT_REASON_INVLPG] = handle_invlpg,
2826                 [EXIT_REASON_CR_ACCESS] = handle_cr,
2827                 [EXIT_REASON_DR_ACCESS] = handle_dr,
2828                 [EXIT_REASON_CPUID] = handle_cpuid,
2829                 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2830                 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2831                 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2832                 [EXIT_REASON_HLT] = handle_halt,};
2833
2834 static const int litevm_vmx_max_exit_handlers =
2835         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
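/*
 * A minimal handler that follows the contract above might look like this
 * (a sketch only; handle_example and LITEVM_EXIT_EXAMPLE are hypothetical):
 *
 *      static int handle_example(struct litevm_vcpu *vcpu,
 *                                struct litevm_run *litevm_run)
 *      {
 *              if (fixed_it_in_the_kernel)
 *                      return 1;                       // resume the guest
 *              litevm_run->exit_reason = LITEVM_EXIT_EXAMPLE;
 *              return 0;                               // hand off to userspace
 *      }
 */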
2836
2837 /*
2838  * The guest has exited.  See if we can fix it or if we need userspace
2839  * assistance.
2840  */
2841 static int litevm_handle_exit(struct litevm_run *litevm_run,
2842                                                           struct litevm_vcpu *vcpu)
2843 {
2844         //print_func_entry();
2845         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2846         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2847
2848 //printk("vectoring_info %08x exit_reason %x\n", vectoring_info, exit_reason);
2849         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2850                 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2851                 printk("%s: unexpected, valid vectoring info and "
2852                            "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2853         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2854         if (exit_reason < litevm_vmx_max_exit_handlers
2855                 && litevm_vmx_exit_handlers[exit_reason]) {
2856 //printk("reason is KNOWN\n");
2857                 //print_func_exit();
2858                 return litevm_vmx_exit_handlers[exit_reason] (vcpu, litevm_run);
2859         } else {
2860 printk("reason is UNKNOWN\n");
2861                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2862                 litevm_run->hw.hardware_exit_reason = exit_reason;
2863         }
2864         //print_func_exit();
2865         return 0;
2866 }
2867
2868 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2869 {
2870         print_func_entry();
2871         uint16_t ent[2];
2872         uint16_t cs;
2873         uint16_t ip;
2874         unsigned long flags;
2875         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2876         uint16_t sp = vmcs_readl(GUEST_RSP);
2877         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
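        /* Real-mode injection, as done below: fetch the 4-byte IVT entry for
         * this vector (offset then segment), push FLAGS, CS and IP on the
         * guest stack, clear IF/TF/AC, and point CS:IP at the handler. */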
2878
2879         /* This is the 'does it wrap' test. */
2880         /* This original test elicited complaints from the C compiler.
2881          * It's a bit too Klever for me: the intent is to catch wrap-around,
2882          * which happens exactly when sp < 6, hence the test below.
2883         if (sp > ss_limit || ((sp - 6) > sp)) {
2884         */
2884         if (sp > ss_limit || (sp < 6)) {
2885                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2886                                         __FUNCTION__,
2887                                         vmcs_readl(GUEST_RSP),
2888                                         vmcs_readl(GUEST_SS_BASE), vmcs_read32(GUEST_SS_LIMIT));
2889                 print_func_exit();
2890                 return;
2891         }
2892
2893         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2894                 sizeof(ent)) {
2895                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2896                 print_func_exit();
2897                 return;
2898         }
2899
2900         flags = vmcs_readl(GUEST_RFLAGS);
2901         cs = vmcs_readl(GUEST_CS_BASE) >> 4;
2902         ip = vmcs_readl(GUEST_RIP);
2903
2904         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2905                 litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2906                 litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2907                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2908                 print_func_exit();
2909                 return;
2910         }
2911
2912         vmcs_writel(GUEST_RFLAGS, flags &
2913                                 ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2914         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2915         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2916         vmcs_writel(GUEST_RIP, ent[0]);
2917         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2918         print_func_exit();
2919 }
2920
2921 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2922 {
2923         print_func_entry();
2924         int word_index = __ffs(vcpu->irq_summary);
2925         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2926         int irq = word_index * BITS_PER_LONG + bit_index;
2927
2928         /* don't have clear_bit and I'm not sure the akaros
2929          * bitops are really going to work.
2930          */
2931         vcpu->irq_pending[word_index] &= ~(1 << bit_index);
2932         if (!vcpu->irq_pending[word_index])
2933                 vcpu->irq_summary &= ~(1 << word_index);
2934
2935         if (vcpu->rmode.active) {
2936                 inject_rmode_irq(vcpu, irq);
2937                 print_func_exit();
2938                 return;
2939         }
2940         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2941                                  irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2942         print_func_exit();
2943 }
2944
2945 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2946 {
2947         print_func_entry();
2948         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2949                 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2950                 /*
2951                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2952                  */
2953                 litevm_do_inject_irq(vcpu);
2954         else
2955                 /*
2956                  * Interrupts blocked.  Wait for unblock.
2957                  */
2958                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2959                                          vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2960                                          | CPU_BASED_VIRTUAL_INTR_PENDING);
2961         print_func_exit();
2962 }
2963
2964 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2965 {
2966         print_func_entry();
2967         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2968
2969 /*
2970         set_debugreg(dbg->bp[0], 0);
2971         set_debugreg(dbg->bp[1], 1);
2972         set_debugreg(dbg->bp[2], 2);
2973         set_debugreg(dbg->bp[3], 3);
2974 */
2975
2976         if (dbg->singlestep) {
2977                 unsigned long flags;
2978
2979                 flags = vmcs_readl(GUEST_RFLAGS);
2980                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2981                 vmcs_writel(GUEST_RFLAGS, flags);
2982         }
2983         print_func_exit();
2984 }
2985
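/* load_msrs()/save_msrs() walk an array of vmx_msr_entry and write or read
 * each MSR directly.  vm_run() uses them around VM entry/exit to switch
 * between the host and guest MSR values. */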
2986 static void load_msrs(struct vmx_msr_entry *e, int n)
2987 {
2988         //print_func_entry();
2989         int i;
2990
2991         if (!e) {
2992                 printk("LOAD MSR WITH NULL POINTER?\n");
2993                 error("LOAD MSR WITH NULL POINTER?");
2994         }
2995         for (i = 0; i < n; ++i) {
2996                 //printk("Load MSR (%lx), with %lx\n", e[i].index, e[i].data);
2997                 write_msr(e[i].index, e[i].value);
2998                 //printk("Done\n");
2999         }
3000         //print_func_exit();
3001 }
3002
3003 static void save_msrs(struct vmx_msr_entry *e, int n)
3004 {
3005         //print_func_entry();
3006         int i;
3007
3008         for (i = 0; i < n; ++i)
3009                 e[i].value = read_msr(e[i].index);
3010         //print_func_exit();
3011 }
3012
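/* Run the guest until it exits.  The flow is: validate the vcpu slot and
 * load its VMCS, save the host segment selectors, MSRs, and FPU state,
 * enter the guest with vmlaunch (first run) or vmresume, and on exit
 * restore host state and hand the exit reason to litevm_handle_exit().
 * We loop back to 'again' while the exit could be handled in the kernel;
 * otherwise we return to the caller. */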
3013 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
3014 {
3015         print_func_entry();
3016         struct litevm_vcpu *vcpu;
3017         uint8_t fail;
3018         uint16_t fs_sel, gs_sel, ldt_sel;
3019         int fs_gs_ldt_reload_needed;
3020
3021         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
3022                 error("vcpu is %d but must be in the range 0..%d\n",
3023                           litevm_run->vcpu, LITEVM_MAX_VCPUS - 1);
3024
3025         vcpu = vcpu_load(litevm, litevm_run->vcpu);
3026         if (!vcpu)
3027                 error("vcpu_load failed");
3028         printk("Loaded\n");
3029
3030         if (litevm_run->emulated) {
3031                 skip_emulated_instruction(vcpu);
3032                 litevm_run->emulated = 0;
3033         }
3034         printk("Emulated\n");
3035
3036         if (litevm_run->mmio_completed) {
3037                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
3038                 vcpu->mmio_read_completed = 1;
3039         }
3040         printk("mmio completed\n");
3041
3042         vcpu->mmio_needed = 0;
3043
3044 again:
3045         /*
3046          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
3047          * allow segment selectors with cpl > 0 or ti == 1.
3048          */
3049         fs_sel = read_fs();
3050         //printk("fs_sel %x\n", fs_sel);
3051         gs_sel = read_gs();
3052         //printk("gs_sel %x\n", gs_sel);
3053         ldt_sel = read_ldt();
3054         //printk("ldt_sel %x\n", ldt_sel);
3055         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
3056         if (!fs_gs_ldt_reload_needed) {
3057                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
3058                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
3059         } else {
3060                 vmcs_write16(HOST_FS_SELECTOR, 0);
3061                 vmcs_write16(HOST_GS_SELECTOR, 0);
3062         }
3063         //printk("reloaded fs and gs\n");
3064
3065 #ifdef __x86_64__
3066         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
3067         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
3068         //printk("Set FS_BASE and GS_BASE");
3069 #endif
3070
3071         if (vcpu->irq_summary &&
3072                 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
3073                 litevm_try_inject_irq(vcpu);
3074
3075         if (vcpu->guest_debug.enabled)
3076                 litevm_guest_debug_pre(vcpu);
3077
3078         fx_save(vcpu->host_fx_image);
3079         fx_restore(vcpu->guest_fx_image);
3080
3081         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
3082         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3083
3084         printk("GO FOR IT! %08lx\n", vmcs_readl(GUEST_RIP));
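        /* The entry/exit sequence: save the host GPRs on the stack and record
         * the resulting stack pointer in the VMCS (HOST_RSP), load the guest
         * GPRs and CR2 from vcpu->regs, then vmlaunch or vmresume depending on
         * vcpu->launched.  The exit path lands at litevm_vmx_return, where the
         * guest GPRs and CR2 are saved back into the vcpu and the host
         * registers are restored.  'fail' is set (via setbe) only if the VM
         * entry instruction itself was rejected. */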
3085         asm(
3086                    /* Store host registers */
3087                    "pushf \n\t"
3088 #ifdef __x86_64__
3089                    "push %%rax; push %%rbx; push %%rdx;"
3090                    "push %%rsi; push %%rdi; push %%rbp;"
3091                    "push %%r8;  push %%r9;  push %%r10; push %%r11;"
3092                    "push %%r12; push %%r13; push %%r14; push %%r15;"
3093                    "push %%rcx \n\t" "vmwrite %%rsp, %2 \n\t"
3094 #else
3095                    "pusha; push %%ecx \n\t" "vmwrite %%esp, %2 \n\t"
3096 #endif
3097                    /* Check if vmlaunch or vmresume is needed */
3098                    "cmp $0, %1 \n\t"
3099                    /* Load guest registers.  Don't clobber flags. */
3100 #ifdef __x86_64__
3101                    "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%3), %%rax \n\t" "mov %c[rbx](%3), %%rbx \n\t" "mov %c[rdx](%3), %%rdx \n\t" "mov %c[rsi](%3), %%rsi \n\t" "mov %c[rdi](%3), %%rdi \n\t" "mov %c[rbp](%3), %%rbp \n\t" "mov %c[r8](%3),  %%r8  \n\t" "mov %c[r9](%3),  %%r9  \n\t" "mov %c[r10](%3), %%r10 \n\t" "mov %c[r11](%3), %%r11 \n\t" "mov %c[r12](%3), %%r12 \n\t" "mov %c[r13](%3), %%r13 \n\t" "mov %c[r14](%3), %%r14 \n\t" "mov %c[r15](%3), %%r15 \n\t" "mov %c[rcx](%3), %%rcx \n\t"      /* kills %3 (rcx) */
3102 #else
3103                    "mov %c[cr2](%3), %%eax \n\t" "mov %%eax,   %%cr2 \n\t" "mov %c[rax](%3), %%eax \n\t" "mov %c[rbx](%3), %%ebx \n\t" "mov %c[rdx](%3), %%edx \n\t" "mov %c[rsi](%3), %%esi \n\t" "mov %c[rdi](%3), %%edi \n\t" "mov %c[rbp](%3), %%ebp \n\t" "mov %c[rcx](%3), %%ecx \n\t"    /* kills %3 (ecx) */
3104 #endif
3105                    /* Enter guest mode */
3106                    "jne launched \n\t"
3107                    "vmlaunch \n\t"
3108                    "jmp litevm_vmx_return \n\t"
3109                    "launched: vmresume \n\t"
3110                    ".globl litevm_vmx_return \n\t" "litevm_vmx_return: "
3111                    /* Save guest registers, load host registers, keep flags */
3112 #ifdef __x86_64__
3113                    "xchg %3,     0(%%rsp) \n\t"
3114                    "mov %%rax, %c[rax](%3) \n\t"
3115                    "mov %%rbx, %c[rbx](%3) \n\t"
3116                    "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3117                    "mov %%rdx, %c[rdx](%3) \n\t"
3118                    "mov %%rsi, %c[rsi](%3) \n\t"
3119                    "mov %%rdi, %c[rdi](%3) \n\t"
3120                    "mov %%rbp, %c[rbp](%3) \n\t"
3121                    "mov %%r8,  %c[r8](%3) \n\t"
3122                    "mov %%r9,  %c[r9](%3) \n\t"
3123                    "mov %%r10, %c[r10](%3) \n\t"
3124                    "mov %%r11, %c[r11](%3) \n\t"
3125                    "mov %%r12, %c[r12](%3) \n\t"
3126                    "mov %%r13, %c[r13](%3) \n\t"
3127                    "mov %%r14, %c[r14](%3) \n\t"
3128                    "mov %%r15, %c[r15](%3) \n\t"
3129                    "mov %%cr2, %%rax   \n\t"
3130                    "mov %%rax, %c[cr2](%3) \n\t"
3131                    "mov 0(%%rsp), %3 \n\t"
3132                    "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3133                    "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3134                    "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3135                    "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3136 #else
3137                    "xchg %3, 0(%%esp) \n\t"
3138                    "mov %%eax, %c[rax](%3) \n\t"
3139                    "mov %%ebx, %c[rbx](%3) \n\t"
3140                    "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3141                    "mov %%edx, %c[rdx](%3) \n\t"
3142                    "mov %%esi, %c[rsi](%3) \n\t"
3143                    "mov %%edi, %c[rdi](%3) \n\t"
3144                    "mov %%ebp, %c[rbp](%3) \n\t"
3145                    "mov %%cr2, %%eax  \n\t"
3146                    "mov %%eax, %c[cr2](%3) \n\t"
3147                    "mov 0(%%esp), %3 \n\t" "pop %%ecx; popa \n\t"
3148 #endif
3149                    "setbe %0 \n\t" "popf \n\t":"=g"(fail)
3150                    :"r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3151                    "c"(vcpu),
3152                    [rax] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3153                    [rbx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3154                    [rcx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3155                    [rdx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3156                    [rsi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3157                    [rdi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3158                    [rbp] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3159 #ifdef __x86_64__
3160                    [r8] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8])),
3161                    [r9] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9])),
3162                    [r10] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3163                    [r11] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3164                    [r12] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3165                    [r13] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3166                    [r14] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3167                    [r15] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3168 #endif
3169                    [cr2] "i"(offsetof(struct litevm_vcpu, cr2))
3170                    :"cc", "memory");
3171
3172         ++litevm_stat.exits;
3173         printk("vm_run exits! %08lx flags %08lx\n", vmcs_readl(GUEST_RIP),
3174                 vmcs_readl(GUEST_RFLAGS));
3175         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3176         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3177
3178         fx_save(vcpu->guest_fx_image);
3179         fx_restore(vcpu->host_fx_image);
3180
3181 #ifndef __x86_64__
3182         asm("mov %0, %%ds; mov %0, %%es": :"r"(__USER_DS));
3183 #endif
3184
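        /* 'fail' is nonzero only when vmlaunch/vmresume itself was rejected;
         * VM_INSTRUCTION_ERROR then says why.  On a normal VM exit we reload
         * fs/gs/ldt and the TSS if they were clobbered, mark the vcpu as
         * launched (so the next entry uses vmresume), and decode the exit. */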
3185         litevm_run->exit_type = 0;
3186         if (fail) {
3187                 printk("FAIL\n");
3188                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3189                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3190                 printk("reason %d\n", litevm_run->exit_reason);
3191         } else {
3192                 printk("NOT FAIL\n");
3193                 if (fs_gs_ldt_reload_needed) {
3194                         load_ldt(ldt_sel);
3195                         load_fs(fs_sel);
3196                         /*
3197                          * If we have to reload gs, we must take care to
3198                          * preserve our gs base.
3199                          */
3200                         disable_irq();
3201                         load_gs(gs_sel);
3202 #ifdef __x86_64__
3203                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3204 #endif
3205                         enable_irq();
3206
3207                         reload_tss();
3208                 }
3209                 vcpu->launched = 1;
3210                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
3211                 //printk("Let's see why it exited\n");
3212                 if (litevm_handle_exit(litevm_run, vcpu)) {
3213                         /* Give scheduler a chance to reschedule. */
3214 #if 0
3215                         vcpu_put(vcpu);
3216 #warning "how to tell if signal is pending"
3217 /*
3218                         if (signal_pending(current)) {
3219                                 ++litevm_stat.signal_exits;
3220                                 return -EINTR;
3221                         }
3222 */
3223                         /* Consider getting rid of this for now.
3224                          * Maybe it is just breaking things. */
3225                         kthread_yield();
3226                         /* Cannot fail -  no vcpu unplug yet. */
3227                         vcpu_load(litevm, vcpu_slot(vcpu));
3228 #endif
3229                         monitor(NULL);
3230                         goto again;
3231                 }
3232         }
3233
3234
3235         printk("vm_run exits! %08lx flags %08lx\n", vmcs_readl(GUEST_RIP),
3236                 vmcs_readl(GUEST_RFLAGS));
3237         vcpu_put(vcpu);
3238         printk("vm_run returns\n");
3239         print_func_exit();
3240         return 0;
3241 }
3242
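/* ioctl backend: copy the guest's general-purpose register state out to the
 * caller.  Most GPRs live in vcpu->regs (saved by the exit path in vm_run);
 * RSP, RIP, and RFLAGS are read from the VMCS.  TF and RF are masked out if
 * we set them ourselves for single-step debugging. */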
3243 static int litevm_dev_ioctl_get_regs(struct litevm *litevm,
3244                                                                          struct litevm_regs *regs)
3245 {
3246         print_func_entry();
3247         struct litevm_vcpu *vcpu;
3248
3249         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3250                 print_func_exit();
3251                 return -EINVAL;
3252         }
3253
3254         vcpu = vcpu_load(litevm, regs->vcpu);
3255         if (!vcpu) {
3256                 print_func_exit();
3257                 return -ENOENT;
3258         }
3259
3260         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3261         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3262         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3263         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3264         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3265         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3266         regs->rsp = vmcs_readl(GUEST_RSP);
3267         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3268 #ifdef __x86_64__
3269         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3270         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3271         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3272         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3273         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3274         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3275         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3276         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3277 #endif
3278
3279         regs->rip = vmcs_readl(GUEST_RIP);
3280         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3281
3282         /*
3283          * Don't leak debug flags in case they were set for guest debugging
3284          */
3285         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3286                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3287
3288         vcpu_put(vcpu);
3289
3290         print_func_exit();
3291         return 0;
3292 }
3293
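/* ioctl backend: the inverse of litevm_dev_ioctl_get_regs().  GPRs are
 * written into vcpu->regs so the next VM entry loads them; RSP, RIP, and
 * RFLAGS go straight into the VMCS. */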
3294 static int litevm_dev_ioctl_set_regs(struct litevm *litevm,
3295                                                                          struct litevm_regs *regs)
3296 {
3297         print_func_entry();
3298         struct litevm_vcpu *vcpu;
3299
3300         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3301                 print_func_exit();
3302                 return -EINVAL;
3303         }
3304
3305         vcpu = vcpu_load(litevm, regs->vcpu);
3306         if (!vcpu) {
3307                 print_func_exit();
3308                 return -ENOENT;
3309         }
3310
3311         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3312         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3313         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3314         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3315         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3316         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3317         vmcs_writel(GUEST_RSP, regs->rsp);
3318         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3319 #ifdef __x86_64__
3320         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3321         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3322         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3323         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3324         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3325         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3326         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3327         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3328 #endif
3329
3330         vmcs_writel(GUEST_RIP, regs->rip);
3331         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3332
3333         vcpu_put(vcpu);
3334
3335         print_func_exit();
3336         return 0;
3337 }
3338
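/* ioctl backend: copy the guest's segment, descriptor-table, and control
 * register state out to the caller.  The get_segment() macro below unpacks
 * the VMX access-rights byte into the individual descriptor fields. */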
3339 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm,
3340                                                                           struct litevm_sregs *sregs)
3341 {
3342         print_func_entry();
3343         struct litevm_vcpu *vcpu;
3344
3345         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3346                 print_func_exit();
3347                 return -EINVAL;
3348         }
3349         vcpu = vcpu_load(litevm, sregs->vcpu);
3350         if (!vcpu) {
3351                 print_func_exit();
3352                 return -ENOENT;
3353         }
3354 #define get_segment(var, seg) \
3355         do { \
3356                 uint32_t ar; \
3357                 \
3358                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3359                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3360                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3361                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3362                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3363                 sregs->var.type = ar & 15; \
3364                 sregs->var.s = (ar >> 4) & 1; \
3365                 sregs->var.dpl = (ar >> 5) & 3; \
3366                 sregs->var.present = (ar >> 7) & 1; \
3367                 sregs->var.avl = (ar >> 12) & 1; \
3368                 sregs->var.l = (ar >> 13) & 1; \
3369                 sregs->var.db = (ar >> 14) & 1; \
3370                 sregs->var.g = (ar >> 15) & 1; \
3371                 sregs->var.unusable = (ar >> 16) & 1; \
3372         } while (0)
3373
3374         get_segment(cs, CS);
3375         get_segment(ds, DS);
3376         get_segment(es, ES);
3377         get_segment(fs, FS);
3378         get_segment(gs, GS);
3379         get_segment(ss, SS);
3380
3381         get_segment(tr, TR);
3382         get_segment(ldt, LDTR);
3383 #undef get_segment
3384
3385 #define get_dtable(var, table) \
3386         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3387                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3388
3389         get_dtable(idt, IDTR);
3390         get_dtable(gdt, GDTR);
3391 #undef get_dtable
3392
3393         sregs->cr0 = guest_cr0();
3394         sregs->cr2 = vcpu->cr2;