1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 #define currentcpu (&per_cpu_info[core_id()])
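/* Debug wrappers around the qlock and spinlock primitives: each one prints
 * before and after the operation so lock-ordering problems and hangs show up
 * in the console log. */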
39 #define QLOCK_init(x) {printk("qlock_init %p\n", x); qlock_init(x); printk("%p lock_inited\n", x);}
40 #define QLOCK(x) {printk("qlock %p\n", x); qlock(x); printk("%p locked\n", x);}
41 #define QUNLOCK(x) {printk("qunlock %p\n", x); qunlock(x); printk("%p unlocked\n", x);}
42 #define SPLI_irqsave(x){printk("spin_lock_init %p:", x); spinlock_init(x); printk("inited\n");}
43 #define SPLL(x){printk("spin_lock %p\n", x); spin_lock_irqsave(x); printk("%p locked\n", x);}
44 #define SPLU(x){printk("spin_unlock %p\n", x); spin_unlock(x); printk("%p unlocked\n", x);}
45 struct litevm_stat litevm_stat;
46
47 static struct litevm_stats_debugfs_item {
48         const char *name;
49         uint32_t *data;
50 } debugfs_entries[] = {
51         {
52         "pf_fixed", &litevm_stat.pf_fixed}, {
53         "pf_guest", &litevm_stat.pf_guest}, {
54         "tlb_flush", &litevm_stat.tlb_flush}, {
55         "invlpg", &litevm_stat.invlpg}, {
56         "exits", &litevm_stat.exits}, {
57         "io_exits", &litevm_stat.io_exits}, {
58         "mmio_exits", &litevm_stat.mmio_exits}, {
59         "signal_exits", &litevm_stat.signal_exits}, {
60         "irq_exits", &litevm_stat.irq_exits}, {
61         0, 0}
62 };
63
64 static struct dentry *debugfs_dir;
65
66 static const uint32_t vmx_msr_index[] = {
67 #ifdef __x86_64__
68         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
69 #endif
70         MSR_EFER,       // wtf? MSR_K6_STAR,
71 };
72
73 static const char* vmx_msr_name[] = {
74 #ifdef __x86_64__
75         "MSR_SYSCALL_MASK", "MSR_LSTAR", "MSR_CSTAR", "MSR_KERNEL_GS_BASE",
76 #endif
77         "MSR_EFER",     // wtf? MSR_K6_STAR,
78 };
79
80 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
81
82 #ifdef __x86_64__
83 /*
84  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR through the standard
85  * VT MSR-switch mechanism (CPU erratum AA24).
86  */
87 #define NR_BAD_MSRS 2
88 #else
89 #define NR_BAD_MSRS 0
90 #endif
91
92 #define TSS_IOPB_BASE_OFFSET 0x66
93 #define TSS_BASE_SIZE 0x68
94 #define TSS_IOPB_SIZE (65536 / 8)
95 #define TSS_REDIRECTION_SIZE (256 / 8)
96 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
97
98 #define MSR_IA32_VMX_BASIC_MSR                  0x480
99 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
100 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
101 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
102 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
103
104 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
105 #define LMSW_GUEST_MASK 0x0eULL
106 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
107 //#define CR4_VMXE 0x2000
108 #define CR8_RESEVED_BITS (~0x0fULL)
109 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
110
111 #ifdef __x86_64__
112 #define HOST_IS_64 1
113 #else
114 #define HOST_IS_64 0
115 #endif
116
117 /* bit ops not yet widely used in akaros and we're not sure where to put them. */
118 /**
119  * __ffs - find first set bit in word
120  * @word: The word to search
121  *
122  * Undefined if no bit exists, so code should check against 0 first.
123  */
124 static inline unsigned long __ffs(unsigned long word)
125 {
126         print_func_entry();
127 asm("rep; bsf %1,%0":"=r"(word)
128 :               "rm"(word));
129         print_func_exit();
130         return word;
131 }
132
133 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu,
134                                                                                         uint32_t msr)
135 {
136         print_func_entry();
137         int i;
138
139         for (i = 0; i < vcpu->nmsrs; ++i)
140                 if (vcpu->guest_msrs[i].index == msr) {
141                         print_func_exit();
142                         return &vcpu->guest_msrs[i];
143                 }
144         print_func_exit();
145         return 0;
146 }
147
148 struct descriptor_table {
149         uint16_t limit;
150         unsigned long base;
151 } __attribute__ ((packed));
152
153 static void get_gdt(struct descriptor_table *table)
154 {
155         print_func_entry();
156 asm("sgdt %0":"=m"(*table));
157         print_func_exit();
158 }
159
160 static void get_idt(struct descriptor_table *table)
161 {
162         print_func_entry();
163 asm("sidt %0":"=m"(*table));
164         print_func_exit();
165 }
166
167 static uint16_t read_fs(void)
168 {
169         print_func_entry();
170         uint16_t seg;
171 asm("mov %%fs, %0":"=g"(seg));
172         print_func_exit();
173         return seg;
174 }
175
176 static uint16_t read_gs(void)
177 {
178         print_func_entry();
179         uint16_t seg;
180 asm("mov %%gs, %0":"=g"(seg));
181         print_func_exit();
182         return seg;
183 }
184
185 static uint16_t read_ldt(void)
186 {
187         print_func_entry();
188         uint16_t ldt;
189 asm("sldt %0":"=g"(ldt));
190         print_func_exit();
191         return ldt;
192 }
193
194 static void load_fs(uint16_t sel)
195 {
196         print_func_entry();
197 asm("mov %0, %%fs": :"g"(sel));
198         print_func_exit();
199 }
200
201 static void load_gs(uint16_t sel)
202 {
203         print_func_entry();
204 asm("mov %0, %%gs": :"g"(sel));
205         print_func_exit();
206 }
207
208 #ifndef load_ldt
209 static void load_ldt(uint16_t sel)
210 {
211         print_func_entry();
212 asm("lldt %0": :"g"(sel));
213         print_func_exit();
214 }
215 #endif
216
217 static void fx_save(void *image)
218 {
219         print_func_entry();
220         asm("fxsave (%0)"::"r"(image));
221         print_func_exit();
222 }
223
224 static void fx_restore(void *image)
225 {
226         print_func_entry();
227         asm("fxrstor (%0)"::"r"(image));
228         print_func_exit();
229 }
230
231 static void fpu_init(void)
232 {
233         print_func_entry();
234         asm("finit");
235         print_func_exit();
236 }
237
238 struct segment_descriptor {
239         uint16_t limit_low;
240         uint16_t base_low;
241         uint8_t base_mid;
242         uint8_t type:4;
243         uint8_t system:1;
244         uint8_t dpl:2;
245         uint8_t present:1;
246         uint8_t limit_high:4;
247         uint8_t avl:1;
248         uint8_t long_mode:1;
249         uint8_t default_op:1;
250         uint8_t granularity:1;
251         uint8_t base_high;
252 } __attribute__ ((packed));
253
254 #ifdef __x86_64__
255 // LDT or TSS descriptor in the GDT. 16 bytes.
256 struct segment_descriptor_64 {
257         struct segment_descriptor s;
258         uint32_t base_higher;
259         uint32_t pad_zero;
260 };
261
262 #endif
263
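/* Walk the GDT (or, when the selector's TI bit is set, the current LDT) and
 * return the linear base address encoded in the descriptor for @selector.  On
 * x86_64, system descriptors (LDT/TSS) are 16 bytes, so the upper base bits
 * come from the extended half of the descriptor. */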
264 static unsigned long segment_base(uint16_t selector)
265 {
266         print_func_entry();
267         struct descriptor_table gdt;
268         struct segment_descriptor *d;
269         unsigned long table_base;
270         typedef unsigned long ul;
271         unsigned long v;
272
273 asm("sgdt %0":"=m"(gdt));
274         table_base = gdt.base;
275
276         if (selector & 4) {     /* from ldt */
277                 uint16_t ldt_selector;
278
279 asm("sldt %0":"=g"(ldt_selector));
280                 table_base = segment_base(ldt_selector);
281         }
282         d = (struct segment_descriptor *)(table_base + (selector & ~7));
283         v = d->base_low | ((ul) d->base_mid << 16) | ((ul) d->base_high << 24);
284 #ifdef __x86_64__
285         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
286                 v |= ((ul) ((struct segment_descriptor_64 *)d)->base_higher) << 32;
287 #endif
288         print_func_exit();
289         return v;
290 }
291
292 static unsigned long read_tr_base(void)
293 {
294         print_func_entry();
295         uint16_t tr;
296 asm("str %0":"=g"(tr));
297         print_func_exit();
298         return segment_base(tr);
299 }
300
301 static void reload_tss(void)
302 {
303         print_func_entry();
304 #ifndef __x86_64__
305
306         /*
307          * VT restores TR but not its size.  Useless.
308          */
309         struct descriptor_table gdt;
310         struct segment_descriptor *descs;
311
312         get_gdt(&gdt);
313         descs = (void *)gdt.base;
314         descs[GDT_ENTRY_TSS].type = 9;  /* available TSS */
315         load_TR_desc();
316 #endif
317         print_func_exit();
318 }
319
320 static struct vmcs_descriptor {
321         int size;
322         int order;
323         uint32_t revision_id;
324 } vmcs_descriptor;
325
326 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
327 {
328         print_func_entry();
329         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
330         print_func_exit();
331         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
332 }
333
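/* litevm_read_guest()/litevm_write_guest(): copy @size bytes between a
 * guest-virtual address and a kernel buffer, translating one page at a time
 * with gva_to_hpa().  The loop stops at the first unmapped page; the return
 * value is the number of bytes actually copied. */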
334 int litevm_read_guest(struct litevm_vcpu *vcpu,
335                                           gva_t addr, unsigned long size, void *dest)
336 {
337         print_func_entry();
338         unsigned char *host_buf = dest;
339         unsigned long req_size = size;
340
341         while (size) {
342                 hpa_t paddr;
343                 unsigned now;
344                 unsigned offset;
345                 hva_t guest_buf;
346
347                 paddr = gva_to_hpa(vcpu, addr);
348
349                 if (is_error_hpa(paddr))
350                         break;
351                 guest_buf = (hva_t) KADDR(paddr);
352                 offset = addr & ~PAGE_MASK;
353                 guest_buf |= offset;
354                 now = MIN(size, PAGE_SIZE - offset);
355                 memcpy(host_buf, (void *)guest_buf, now);
356                 host_buf += now;
357                 addr += now;
358                 size -= now;
359         }
360         print_func_exit();
361         return req_size - size;
362 }
363
364 int litevm_write_guest(struct litevm_vcpu *vcpu,
365                                            gva_t addr, unsigned long size, void *data)
366 {
367         print_func_entry();
368         unsigned char *host_buf = data;
369         unsigned long req_size = size;
370
371         while (size) {
372                 hpa_t paddr;
373                 unsigned now;
374                 unsigned offset;
375                 hva_t guest_buf;
376
377                 paddr = gva_to_hpa(vcpu, addr);
378
379                 if (is_error_hpa(paddr))
380                         break;
381
382                 guest_buf = (hva_t) KADDR(paddr);
383                 offset = addr & ~PAGE_MASK;
384                 guest_buf |= offset;
385                 now = MIN(size, PAGE_SIZE - offset);
386                 memcpy((void *)guest_buf, host_buf, now);
387                 host_buf += now;
388                 addr += now;
389                 size -= now;
390         }
391         print_func_exit();
392         return req_size - size;
393 }
394
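/* Decode the IA32_VMX_BASIC MSR: bits 44:32 give the size of the VMCS region
 * and the low 32 bits hold the revision identifier that must be written into
 * the first word of every VMCS/VMXON region. */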
395 static void setup_vmcs_descriptor(void)
396 {
397         print_func_entry();
398         uint64_t msr;
399
400         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
401         vmcs_descriptor.size = (msr >> 32) & 0x1fff;
402         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size >> PAGE_SHIFT);
403         vmcs_descriptor.revision_id = (uint32_t) msr;
404         printk("setup_vmcs_descriptor: msr 0x%llx, size 0x%x order 0x%x id 0x%x\n",
405                    msr, vmcs_descriptor.size, vmcs_descriptor.order,
406                    vmcs_descriptor.revision_id);
407         print_func_exit();
408 };
409
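/* VMCLEAR flushes any VMCS state cached on this core back to the in-memory
 * region and marks it "clear", so it can later be made current again (on any
 * core) with VMPTRLD. */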
410 static void vmcs_clear(struct vmcs *vmcs)
411 {
412         print_func_entry();
413         uint64_t phys_addr = PADDR(vmcs);
414         uint8_t error;
415         printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
416         asm volatile ("vmclear %1; setna %0":"=m" (error):"m"(phys_addr):"cc",
417                                   "memory");
418         if (error)
419                 printk("litevm: vmclear fail: %p/%llx\n", vmcs, phys_addr);
420         print_func_exit();
421 }
422
423 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
424 {
425         print_func_entry();
426         struct litevm_vcpu *vcpu = arg;
427         int cpu = core_id();
428         printd
429                 ("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n",
430                  cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
431
432         if (vcpu->cpu == cpu)
433                 vmcs_clear(vcpu->vmcs);
434
435         if (currentcpu->vmcs == vcpu->vmcs)
436                 currentcpu->vmcs = NULL;
437         print_func_exit();
438 }
439
440 static int vcpu_slot(struct litevm_vcpu *vcpu)
441 {
442         print_func_entry();
443         print_func_exit();
444         return vcpu - vcpu->litevm->vcpus;
445 }
446
447 /*
448  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
449  * vcpu mutex is already taken.
450  */
451 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
452 {
453         print_func_entry();
454         uint64_t phys_addr = PADDR(vcpu->vmcs);
455         int cpu;
456         cpu = core_id();
457
458         printk("__vcpu_load: vcpu->cpu %d cpu %d\n", vcpu->cpu, cpu);
459         if ((vcpu->cpu != cpu) && (vcpu->cpu != -1)){
460                 handler_wrapper_t *w;
461                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
462                 smp_call_wait(w);
463                 vcpu->launched = 0;
464         }
465
466         printk("2 ..");
467         if (currentcpu->vmcs != vcpu->vmcs) {
468                 uint8_t error;
469
470                 currentcpu->vmcs = vcpu->vmcs;
471                 asm volatile ("vmptrld %1; setna %0":"=m" (error):"m"(phys_addr):"cc");
472                 if (error) {
473                         printk("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
474                         error("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
475                 }
476         }
477
478         printk("3 ..");
479         if (vcpu->cpu != cpu) {
480                 struct descriptor_table dt;
481                 unsigned long sysenter_esp;
482
483                 vcpu->cpu = cpu;
484                 /*
485                  * Linux uses per-cpu TSS and GDT, so set these when switching
486                  * processors.
487                  */
488                 vmcs_writel(HOST_TR_BASE, read_tr_base());      /* 22.2.4 */
489                 get_gdt(&dt);
490                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
491
492                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
493                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
494         }
495         print_func_exit();
496         return vcpu;
497 }
498
499 /*
500  * Switches to specified vcpu, until a matching vcpu_put()
501  * And leaves it locked!
502  */
503 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
504 {
505         struct litevm_vcpu *ret;
506         print_func_entry();
507         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
508
509         printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
510
511         QLOCK(&vcpu->mutex);
512         printk("Locked\n");
513         if (!vcpu->vmcs) {
514                 QUNLOCK(&vcpu->mutex);
515                 printk("vcpu->vmcs for vcpu %p is NULL", vcpu);
516                 error("vcpu->vmcs is NULL");
517         }
518         ret = __vcpu_load(vcpu);
519         print_func_exit();
520         return ret;
521 }
522
523 static void vcpu_put(struct litevm_vcpu *vcpu)
524 {
525         print_func_entry();
526         //put_cpu();
527         QUNLOCK(&vcpu->mutex);
528         print_func_exit();
529 }
530
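/* Allocate a physically contiguous, zeroed VMCS region on the local node and
 * stamp it with the revision id reported by IA32_VMX_BASIC. */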
531 static struct vmcs *alloc_vmcs_cpu(int cpu)
532 {
533         print_func_entry();
534         int node = node_id();
535         struct vmcs *vmcs;
536
537         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
538         if (!vmcs) {
539                 print_func_exit();
540                 printk("no memory for vcpus");
541                 error("no memory for vcpus");
542         }
543         memset(vmcs, 0, vmcs_descriptor.size);
544         vmcs->revision_id = vmcs_descriptor.revision_id;        /* vmcs revision id */
545         print_func_exit();
546         return vmcs;
547 }
548
549 static struct vmcs *alloc_vmcs(void)
550 {
551         struct vmcs *ret;
552         print_func_entry();
553         ret = alloc_vmcs_cpu(core_id());
554         print_func_exit();
555         return ret;
556 }
557
558 static int cpu_has_litevm_support(void)
559 {
560         print_func_entry();
561         /* sigh ... qemu. */
562         char vid[16];
563         if (vendor_id(vid) < 0)
564                 return 0;
565         printk("vendor id is %s\n", vid);
566         if (vid[0] == 'Q') /* qemu */
567                 return 0;
568         if (vid[0] == 'A') /* AMD or qemu claiming to be AMD */
569                 return 0;
570         uint32_t ecx = cpuid_ecx(1);
571         print_func_exit();
572         return ecx & (1 << 5);  /* CPUID.1:ECX.VMX[bit 5] -> VT */
573 }
574
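/* IA32_FEATURE_CONTROL: bit 0 is the lock bit, bit 2 enables VMXON outside
 * SMX.  Locked-but-not-enabled means the BIOS has disabled VT. */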
575 static int vmx_disabled_by_bios(void)
576 {
577         print_func_entry();
578         uint64_t msr;
579
580         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
581         print_func_exit();
582         return (msr & 5) == 1;  /* locked but not enabled */
583 }
584
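/* Per-core VMX enable: allocate and tag a VMXON region, set (and lock) the
 * feature-control MSR if the BIOS left it unlocked, turn on CR4.VMXE and
 * CR0.NE, make sure the A20 gate is open, then execute VMXON. */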
585 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
586 {
587         print_func_entry();
588         int cpu = hw_core_id();
589         uint64_t phys_addr;
590         uint64_t old;
591         uint64_t status = 0;
592         currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
593                                                                                           KMALLOC_WAIT);
594         if (!currentcpu->vmxarea)
595                 return;
596         memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
597         currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
598         phys_addr = PADDR(currentcpu->vmxarea);
599         printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
600                    currentcpu->vmxarea, (void *)phys_addr);
601         if (phys_addr & 0xfff) {
602                 printk("fix vmxarea alignment!");
603         }
604         printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
605         old = read_msr(MSR_IA32_FEATURE_CONTROL);
606         printk("%d: vm_enable, old is 0x%llx\n", core_id(), old);
607         if ((old & 5) == 0) {
608                 /* enable and lock */
609                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
610                 old = read_msr(MSR_IA32_FEATURE_CONTROL);
611                 printk("%d:vm_enable, tried to set 5, old is 0x%llx\n", core_id(), old);
612         }
613         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
614         lcr4(rcr4() | CR4_VMXE);        /* FIXME: not cpu hotplug safe */
615         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
616         printk("%d:cr0 is %x\n", core_id(), rcr0());
617         lcr0(rcr0() | 0x20);
618         printk("%d:cr0 is %x\n", core_id(), rcr0());
619         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
620         outb(0x92, inb(0x92) | 2);
621         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
622         asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:":"=m" (status):"m"
623                                   (phys_addr):"memory", "cc");
624         printk("%d:vmxon status is %llu\n", core_id(), status);
625         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
626         if (!status) {
627                 printk("%d:vm_enable: status says fail\n", core_id());
628         }
629         print_func_exit();
630 }
631
632 static void litevm_disable(void *garbage)
633 {
634         print_func_entry();
635         asm volatile ("vmxoff":::"cc");
636         print_func_exit();
637 }
638
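/* Create a new, empty VM: allocate the litevm structure and initialize the
 * per-vcpu locks and MMU roots.  vmx_create_vcpu() and vm_set_memory_region()
 * are then used to populate it with vcpus and guest physical memory. */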
639 struct litevm *vmx_open(void)
640 {
641         print_func_entry();
642         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
643         int i;
644
645         printk("vmx_open: litevm is %p\n", litevm);
646         if (!litevm) {
647                 printk("NO LITEVM! MAKES NO SENSE!\n");
648                 error("litevm alloc failed");
649                 print_func_exit();
650                 return 0;
651         }
652
653         SPLI_irqsave(&litevm->lock);
654         LIST_INIT(&litevm->link);
655         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
656                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
657                 printk("init vcpu %p\n", vcpu);
658
659                 QLOCK_init(&vcpu->mutex);
660                 vcpu->mmu.root_hpa = INVALID_PAGE;
661                 vcpu->litevm = litevm;
662                 LIST_INIT(&vcpu->link);
663         }
664         printk("vmx_open: busy %d\n", litevm->busy);
665         printk("return %p\n", litevm);
666         print_func_exit();
667         return litevm;
668 }
669
670 /*
671  * Free any memory in @free but not in @dont.
672  */
673 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
674                                                                          struct litevm_memory_slot *dont)
675 {
676         print_func_entry();
677         int i;
678
679         if (!dont || free->phys_mem != dont->phys_mem)
680                 if (free->phys_mem) {
681                         for (i = 0; i < free->npages; ++i) {
682                                 page_t *page = free->phys_mem[i];
683                                 page_decref(page);
684                                 assert(page_is_free(page2ppn(page)));
685                         }
686                         kfree(free->phys_mem);
687                 }
688
689         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
690                 kfree(free->dirty_bitmap);
691
692         free->phys_mem = 0;
693         free->npages = 0;
694         free->dirty_bitmap = 0;
695         print_func_exit();
696 }
697
698 static void litevm_free_physmem(struct litevm *litevm)
699 {
700         print_func_entry();
701         int i;
702
703         for (i = 0; i < litevm->nmemslots; ++i)
704                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
705         print_func_exit();
706 }
707
708 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
709 {
710         print_func_entry();
711         if (vcpu->vmcs) {
712                 handler_wrapper_t *w;
713                 smp_call_function_all(__vcpu_clear, vcpu, &w);
714                 smp_call_wait(w);
715                 //free_vmcs(vcpu->vmcs);
716                 vcpu->vmcs = 0;
717         }
718         print_func_exit();
719 }
720
721 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
722 {
723         print_func_entry();
724         litevm_free_vmcs(vcpu);
725         litevm_mmu_destroy(vcpu);
726         print_func_exit();
727 }
728
729 static void litevm_free_vcpus(struct litevm *litevm)
730 {
731         print_func_entry();
732         unsigned int i;
733
734         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
735                 litevm_free_vcpu(&litevm->vcpus[i]);
736         print_func_exit();
737 }
738
739 static int litevm_dev_release(struct litevm *litevm)
740 {
741         print_func_entry();
742
743         litevm_free_vcpus(litevm);
744         litevm_free_physmem(litevm);
745         kfree(litevm);
746         print_func_exit();
747         return 0;
748 }
749
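/* VMCS field accessors.  VMREAD/VMWRITE operate on whichever VMCS is current
 * on this core (the one loaded with VMPTRLD); the 16- and 64-bit write
 * helpers below are layered on the natural-width vmcs_writel(). */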
750 unsigned long vmcs_readl(unsigned long field)
751 {
752         print_func_entry();
753         unsigned long value;
754
755         asm volatile ("vmread %1, %0":"=g" (value):"r"(field):"cc");
756         print_func_exit();
757         return value;
758 }
759
760 void vmcs_writel(unsigned long field, unsigned long value)
761 {
762         print_func_entry();
763         uint8_t error;
764
765         asm volatile ("vmwrite %1, %2; setna %0":"=g" (error):"r"(value),
766                                   "r"(field):"cc");
767         if (error)
768                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
769                            field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
770         print_func_exit();
771 }
772
773 static void vmcs_write16(unsigned long field, uint16_t value)
774 {
775         print_func_entry();
776         vmcs_writel(field, value);
777         print_func_exit();
778 }
779
780 static void vmcs_write64(unsigned long field, uint64_t value)
781 {
782         print_func_entry();
783 #ifdef __x86_64__
784         vmcs_writel(field, value);
785 #else
786         vmcs_writel(field, value);
787         asm volatile ("");
788         vmcs_writel(field + 1, value >> 32);
789 #endif
790         print_func_exit();
791 }
792
793 static void inject_gp(struct litevm_vcpu *vcpu)
794 {
795         print_func_entry();
796         printd("inject_general_protection: rip 0x%lx\n", vmcs_readl(GUEST_RIP));
797         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
798         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
799                                  GP_VECTOR |
800                                  INTR_TYPE_EXCEPTION |
801                                  INTR_INFO_DELIEVER_CODE_MASK | INTR_INFO_VALID_MASK);
802         print_func_exit();
803 }
804
805 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
806 {
807         print_func_entry();
808         if (vcpu->rmode.active)
809                 vmcs_write32(EXCEPTION_BITMAP, ~0);
810         else
811                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
812         print_func_exit();
813 }
814
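/* VT (without unrestricted-guest support) cannot run a guest in real mode
 * directly, so real mode is emulated with virtual-8086 mode: enter_rmode()
 * saves the protected-mode segment state and loads vm86-compatible values,
 * and enter_pmode() restores them when the guest turns CR0.PE back on. */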
815 static void enter_pmode(struct litevm_vcpu *vcpu)
816 {
817         print_func_entry();
818         unsigned long flags;
819
820         vcpu->rmode.active = 0;
821
822         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
823         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
824         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
825
826         flags = vmcs_readl(GUEST_RFLAGS);
827         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
828         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
829         vmcs_writel(GUEST_RFLAGS, flags);
830
831         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
832                                 (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK));
833
834         update_exception_bitmap(vcpu);
835
836 #define FIX_PMODE_DATASEG(seg, save) {                          \
837                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
838                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
839                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
840                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
841         }
842
843         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
844         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
845         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
846         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
847         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
848
849         vmcs_write16(GUEST_CS_SELECTOR,
850                                  vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
851         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
852         print_func_exit();
853 }
854
855 static int rmode_tss_base(struct litevm *litevm)
856 {
857         print_func_entry();
858         gfn_t base_gfn =
859                 litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
860         print_func_exit();
861         return base_gfn << PAGE_SHIFT;
862 }
863
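/* Fake real mode: save the protected-mode TR and IOPL, point TR at the vm86
 * TSS built by init_rmode_tss(), force IOPL=3 and EFLAGS.VM, and give CS and
 * the data segments vm86-style selectors, limits and access rights. */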
864 static void enter_rmode(struct litevm_vcpu *vcpu)
865 {
866         print_func_entry();
867         unsigned long flags;
868
869         vcpu->rmode.active = 1;
870
871         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
872         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
873
874         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
875         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
876
877         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
878         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
879
880         flags = vmcs_readl(GUEST_RFLAGS);
881         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
882
883         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
884
885         vmcs_writel(GUEST_RFLAGS, flags);
886         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
887         update_exception_bitmap(vcpu);
888
889 #define FIX_RMODE_SEG(seg, save) {                                 \
890                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
891                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
892                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
893                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
894         }
895
896         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
897         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
898
899         FIX_RMODE_SEG(ES, vcpu->rmode.es);
900         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
901         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
902         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
903         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
904         print_func_exit();
905 }
906
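/* Build the real-mode TSS used while the guest runs in vm86 mode: three guest
 * pages at the top of memory slot 0 holding the TSS, the interrupt redirection
 * bitmap and an I/O permission bitmap whose final byte is the required 0xff
 * terminator. */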
907 static int init_rmode_tss(struct litevm *litevm)
908 {
909         print_func_entry();
910         struct page *p1, *p2, *p3;
911         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
912         char *page;
913
914         p1 = _gfn_to_page(litevm, fn++);
915         p2 = _gfn_to_page(litevm, fn++);
916         p3 = _gfn_to_page(litevm, fn);
917
918         if (!p1 || !p2 || !p3) {
919                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
920                 print_func_exit();
921                 return 0;
922         }
923
924         page = page2kva(p1);
925         memset(page, 0, PAGE_SIZE);
926         *(uint16_t *) (page + TSS_IOPB_BASE_OFFSET) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
927
928         page = page2kva(p2);
929         memset(page, 0, PAGE_SIZE);
930
931         page = page2kva(p3);
932         memset(page, 0, PAGE_SIZE);
933         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
934
935         print_func_exit();
936         return 1;
937 }
938
939 #ifdef __x86_64__
940
941 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
942 {
943         print_func_entry();
944         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
945
946         vcpu->shadow_efer = efer;
947         if (efer & EFER_LMA) {
948                 vmcs_write32(VM_ENTRY_CONTROLS,
949                                          vmcs_read32(VM_ENTRY_CONTROLS) |
950                                          VM_ENTRY_CONTROLS_IA32E_MASK);
951                 msr->data = efer;
952
953         } else {
954                 vmcs_write32(VM_ENTRY_CONTROLS,
955                                          vmcs_read32(VM_ENTRY_CONTROLS) &
956                                          ~VM_ENTRY_CONTROLS_IA32E_MASK);
957
958                 msr->data = efer & ~EFER_LME;
959         }
960         print_func_exit();
961 }
962
963 static void enter_lmode(struct litevm_vcpu *vcpu)
964 {
965         print_func_entry();
966         uint32_t guest_tr_ar;
967
968         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
969         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
970                 printd("%s: tss fixup for long mode. \n", __FUNCTION__);
971                 vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
972                                          | AR_TYPE_BUSY_64_TSS);
973         }
974
975         vcpu->shadow_efer |= EFER_LMA;
976
977         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
978         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
979                                  | VM_ENTRY_CONTROLS_IA32E_MASK);
980         print_func_exit();
981 }
982
983 static void exit_lmode(struct litevm_vcpu *vcpu)
984 {
985         print_func_entry();
986         vcpu->shadow_efer &= ~EFER_LMA;
987
988         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
989                                  & ~VM_ENTRY_CONTROLS_IA32E_MASK);
990         print_func_exit();
991 }
992
993 #endif
994
995 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
996 {
997         print_func_entry();
998         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
999                 enter_pmode(vcpu);
1000
1001         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
1002                 enter_rmode(vcpu);
1003
1004 #ifdef __x86_64__
1005         if (vcpu->shadow_efer & EFER_LME) {
1006                 if (!is_paging() && (cr0 & CR0_PG_MASK))
1007                         enter_lmode(vcpu);
1008                 if (is_paging() && !(cr0 & CR0_PG_MASK))
1009                         exit_lmode(vcpu);
1010         }
1011 #endif
1012
1013         vmcs_writel(CR0_READ_SHADOW, cr0);
1014         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
1015         print_func_exit();
1016 }
1017
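/* With PAE paging, CR3 points at four 8-byte PDPTEs; an entry is bad if it is
 * present and has any reserved bit set.  Returns nonzero if any of the four
 * entries is bad, in which case the caller injects #GP. */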
1018 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
1019                                                                                  unsigned long cr3)
1020 {
1021         print_func_entry();
1022         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
1023         unsigned offset = (cr3 & (PAGE_SIZE - 1)) >> 5;
1024         int i;
1025         uint64_t pdpte;
1026         uint64_t *pdpt;
1027         struct litevm_memory_slot *memslot;
1028
1029         SPLL(&vcpu->litevm->lock);
1030         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
1031         /* FIXME: !memslot - emulate? 0xff? */
1032         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
1033
1034         for (i = 0; i < 4; ++i) {
1035                 pdpte = pdpt[offset + i];
1036                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
1037                         break;
1038         }
1039
1040         SPLU(&vcpu->litevm->lock);
1041
1042         print_func_exit();
1043         return i != 4;
1044 }
1045
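/* Guest writes to CR0 are intercepted.  Validate the new value (reserved
 * bits, CD/NW and PG/PE consistency, the long-mode activation rules) and
 * inject #GP on any violation; otherwise commit it and rebuild the shadow MMU
 * context. */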
1046 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1047 {
1048         print_func_entry();
1049         if (cr0 & CR0_RESEVED_BITS) {
1050                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, guest_cr0());
1051                 inject_gp(vcpu);
1052                 print_func_exit();
1053                 return;
1054         }
1055
1056         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
1057                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
1058                 inject_gp(vcpu);
1059                 print_func_exit();
1060                 return;
1061         }
1062
1063         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
1064                 printd("set_cr0: #GP, set PG flag " "and a clear PE flag\n");
1065                 inject_gp(vcpu);
1066                 print_func_exit();
1067                 return;
1068         }
1069
1070         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
1071 #ifdef __x86_64__
1072                 if ((vcpu->shadow_efer & EFER_LME)) {
1073                         uint32_t guest_cs_ar;
1074                         if (!is_pae()) {
1075                                 printd("set_cr0: #GP, start paging "
1076                                            "in long mode while PAE is disabled\n");
1077                                 inject_gp(vcpu);
1078                                 print_func_exit();
1079                                 return;
1080                         }
1081                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1082                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
1083                                 printd("set_cr0: #GP, start paging "
1084                                            "in long mode while CS.L == 1\n");
1085                                 inject_gp(vcpu);
1086                                 print_func_exit();
1087                                 return;
1088
1089                         }
1090                 } else
1091 #endif
1092                 if (is_pae() && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1093                         printd("set_cr0: #GP, pdptrs " "reserved bits\n");
1094                         inject_gp(vcpu);
1095                         print_func_exit();
1096                         return;
1097                 }
1098
1099         }
1100
1101         __set_cr0(vcpu, cr0);
1102         litevm_mmu_reset_context(vcpu);
1103         print_func_exit();
1104         return;
1105 }
1106
1107 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
1108 {
1109         print_func_entry();
1110         unsigned long cr0 = guest_cr0();
1111
1112         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
1113                 enter_pmode(vcpu);
1114                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
1115
1116         } else
1117                 printd("lmsw: unexpected\n");
1118
1119         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
1120                                 | (msw & LMSW_GUEST_MASK));
1121         print_func_exit();
1122 }
1123
1124 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1125 {
1126         print_func_entry();
1127         vmcs_writel(CR4_READ_SHADOW, cr4);
1128         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1129                                                                   LITEVM_RMODE_VM_CR4_ALWAYS_ON :
1130                                                                   LITEVM_PMODE_VM_CR4_ALWAYS_ON));
1131         print_func_exit();
1132 }
1133
1134 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1135 {
1136         print_func_entry();
1137         if (cr4 & CR4_RESEVED_BITS) {
1138                 printd("set_cr4: #GP, reserved bits\n");
1139                 inject_gp(vcpu);
1140                 print_func_exit();
1141                 return;
1142         }
1143
1144         if (is_long_mode()) {
1145                 if (!(cr4 & CR4_PAE_MASK)) {
1146                         printd("set_cr4: #GP, clearing PAE while " "in long mode\n");
1147                         inject_gp(vcpu);
1148                         print_func_exit();
1149                         return;
1150                 }
1151         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
1152                            && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1153                 printd("set_cr4: #GP, pdptrs reserved bits\n");
1154                 inject_gp(vcpu);
1155         }
1156
1157         if (cr4 & CR4_VMXE_MASK) {
1158                 printd("set_cr4: #GP, setting VMXE\n");
1159                 inject_gp(vcpu);
1160                 print_func_exit();
1161                 return;
1162         }
1163         __set_cr4(vcpu, cr4);
1164         SPLL(&vcpu->litevm->lock);
1165         litevm_mmu_reset_context(vcpu);
1166         SPLU(&vcpu->litevm->lock);
1167         print_func_exit();
1168 }
1169
1170 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1171 {
1172         print_func_entry();
1173         if (is_long_mode()) {
1174                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
1175                         printd("set_cr3: #GP, reserved bits\n");
1176                         inject_gp(vcpu);
1177                         print_func_exit();
1178                         return;
1179                 }
1180         } else {
1181                 if (cr3 & CR3_RESEVED_BITS) {
1182                         printd("set_cr3: #GP, reserved bits\n");
1183                         inject_gp(vcpu);
1184                         print_func_exit();
1185                         return;
1186                 }
1187                 if (is_paging() && is_pae() && pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1188                         printd("set_cr3: #GP, pdptrs " "reserved bits\n");
1189                         inject_gp(vcpu);
1190                         print_func_exit();
1191                         return;
1192                 }
1193         }
1194
1195         vcpu->cr3 = cr3;
1196         SPLL(&vcpu->litevm->lock);
1197         vcpu->mmu.new_cr3(vcpu);
1198         SPLU(&vcpu->litevm->lock);
1199         print_func_exit();
1200 }
1201
1202 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1203 {
1204         print_func_entry();
1205         if (cr8 & CR8_RESEVED_BITS) {
1206                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1207                 inject_gp(vcpu);
1208                 print_func_exit();
1209                 return;
1210         }
1211         vcpu->cr8 = cr8;
1212         print_func_exit();
1213 }
1214
1215 static uint32_t get_rdx_init_val(void)
1216 {
1217         print_func_entry();
1218         uint32_t val;
1219
1220 asm("movl $1, %%eax \n\t" "movl %%eax, %0 \n\t":"=g"(val));
1221         print_func_exit();
1222         return val;
1223
1224 }
1225
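/* Initialize the guest FPU image: save the host state, run FINIT to get a
 * clean state, capture that as the guest image, restore the host state, and
 * finally force the guest MXCSR to its reset value (0x1f80). */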
1226 static void fx_init(struct litevm_vcpu *vcpu)
1227 {
1228         print_func_entry();
1229         struct __attribute__ ((__packed__)) fx_image_s {
1230                 uint16_t control;               //fcw
1231                 uint16_t status;                //fsw
1232                 uint16_t tag;                   // ftw
1233                 uint16_t opcode;                //fop
1234                 uint64_t ip;                    // fpu ip
1235                 uint64_t operand;               // fpu dp
1236                 uint32_t mxcsr;
1237                 uint32_t mxcsr_mask;
1238
1239         } *fx_image;
1240
1241         fx_save(vcpu->host_fx_image);
1242         fpu_init();
1243         fx_save(vcpu->guest_fx_image);
1244         fx_restore(vcpu->host_fx_image);
1245
1246         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1247         fx_image->mxcsr = 0x1f80;
1248         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1249                    0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1250         print_func_exit();
1251 }
1252
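/* Several VMX control fields have bits with mandated settings.  For each
 * control there is an MSR whose low 32 bits give the bits that must be 1 and
 * whose high 32 bits give the bits that are allowed to be 1; mask and OR the
 * requested value accordingly before writing the VMCS field. */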
1253 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field,
1254                                                                    uint32_t val)
1255 {
1256         print_func_entry();
1257         uint32_t msr_high, msr_low;
1258         uint64_t msrval;
1259
1260         msrval = read_msr(msr);
1261         msr_low = msrval;
1262         msr_high = (msrval >> 32);
1263
1264         val &= msr_high;
1265         val |= msr_low;
1266         vmcs_write32(vmcs_field, val);
1267         print_func_exit();
1268 }
1269
1270 /*
1271  * Sets up the vmcs for emulated real mode.
1272  */
1273 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1274 {
1275         print_func_entry();
1276
1277 /* no op on x86_64 */
1278 #define asmlinkage
1279         extern asmlinkage void litevm_vmx_return(void);
1280         uint32_t host_sysenter_cs;
1281         uint32_t junk;
1282         uint64_t a;
1283         struct descriptor_table dt;
1284         int i;
1285         int ret;
1286         uint64_t tsc;
1287         int nr_good_msrs;
1288
1289         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1290         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1291         vcpu->cr8 = 0;
1292         vcpu->apic_base = 0xfee00000 |
1293                 /*for vcpu 0 */ MSR_IA32_APICBASE_BSP |
1294                 MSR_IA32_APICBASE_ENABLE;
1295
1296         fx_init(vcpu);
1297
1298 #define SEG_SETUP(seg) do {                                     \
1299                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1300                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1301                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1302                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1303         } while (0)
1304
1305         /*
1306          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1307          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1308          */
1309         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1310         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1311         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1312         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1313
1314         SEG_SETUP(DS);
1315         SEG_SETUP(ES);
1316         SEG_SETUP(FS);
1317         SEG_SETUP(GS);
1318         SEG_SETUP(SS);
1319
1320         vmcs_write16(GUEST_TR_SELECTOR, 0);
1321         vmcs_writel(GUEST_TR_BASE, 0);
1322         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1323         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1324
1325         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1326         vmcs_writel(GUEST_LDTR_BASE, 0);
1327         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1328         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1329
1330         vmcs_write32(GUEST_SYSENTER_CS, 0);
1331         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1332         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1333
1334         vmcs_writel(GUEST_RFLAGS, 0x02);
1335         vmcs_writel(GUEST_RIP, 0xfff0);
1336         vmcs_writel(GUEST_RSP, 0);
1337
1338         vmcs_writel(GUEST_CR3, 0);
1339
1340         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1341         vmcs_writel(GUEST_DR7, 0x400);
1342
1343         vmcs_writel(GUEST_GDTR_BASE, 0);
1344         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1345
1346         vmcs_writel(GUEST_IDTR_BASE, 0);
1347         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1348
1349         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1350         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1351         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1352
1353         /* I/O */
1354         vmcs_write64(IO_BITMAP_A, 0);
1355         vmcs_write64(IO_BITMAP_B, 0);
1356
1357         tsc = read_tsc();
1358         vmcs_write64(TSC_OFFSET, -tsc);
1359
1360         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1361
1362         /* Special registers */
1363         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1364
1365         /* Control */
1366         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_EXT_INTR_MASK       /* 20.6.1 */
1367                                                    | PIN_BASED_NMI_EXITING      /* 20.6.1 */
1368                 );
1369         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_HLT_EXITING        /* 20.6.2 */
1370                                                    | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1371                                                    | CPU_BASED_CR8_STORE_EXITING        /* 20.6.2 */
1372                                                    | CPU_BASED_UNCOND_IO_EXITING        /* 20.6.2 */
1373                                                    | CPU_BASED_INVDPG_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING  /* 21.3 */
1374                 );
1375
1376         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1377         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1378         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1379         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1380
1381         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1382         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1383         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1384
1385 #warning "not setting selectors; do we need them?"
1386 #if 0
1387         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);    /* 22.2.4 */
1388         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1389         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1390 #endif
1391         vmcs_write16(HOST_FS_SELECTOR, read_fs());      /* 22.2.4 */
1392         vmcs_write16(HOST_GS_SELECTOR, read_gs());      /* 22.2.4 */
1393 #if 0
1394         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1395 #endif
1396 #ifdef __x86_64__
1397         a = read_msr(MSR_FS_BASE);
1398         vmcs_writel(HOST_FS_BASE, a);   /* 22.2.4 */
1399         a = read_msr(MSR_GS_BASE);
1400         vmcs_writel(HOST_GS_BASE, a);   /* 22.2.4 */
1401 #else
1402         vmcs_writel(HOST_FS_BASE, 0);   /* 22.2.4 */
1403         vmcs_writel(HOST_GS_BASE, 0);   /* 22.2.4 */
1404 #endif
1405
1406 #warning "Not setting HOST_TR_SELECTOR"
1407 #if 0
1408         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS * 8);      /* 22.2.4 */
1409 #endif
1410
1411         get_idt(&dt);
1412         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1413
1414         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return);        /* 22.2.5 */
1415
1416         /* it's the HIGH 32 bits! */
1417         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1418         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1419         a = read_msr(MSR_IA32_SYSENTER_ESP);
1420         vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1421         a = read_msr(MSR_IA32_SYSENTER_EIP);
1422         vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1423
1424         ret = -ENOMEM;
1425         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1426         if (!vcpu->guest_msrs)
1427                 error("guest_msrs kmalloc failed");
1428         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1429         if (!vcpu->host_msrs)
1430                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1431
1432         for (i = 0; i < NR_VMX_MSR; ++i) {
1433                 uint32_t index = vmx_msr_index[i];
1434                 uint32_t data_low, data_high;
1435                 uint64_t data;
1436                 int j = vcpu->nmsrs;
1437
1438 #warning "need readmsr_safe"
1439 //      if (rdmsr_safe(index, &data_low, &data_high) < 0)
1440 //          continue;
1441                 data = read_msr(index);
1442                 vcpu->host_msrs[j].index = index;
1443                 vcpu->host_msrs[j].reserved = 0;
1444                 vcpu->host_msrs[j].data = data;
1445                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1446                 ++vcpu->nmsrs;
1447         }
1448         printk("msrs: %d\n", vcpu->nmsrs);
1449
1450         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1451         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1452         vmcs_writel(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1453         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1454         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, (HOST_IS_64 << 9));        /* 22.2.1, 20.7.1 */
1455         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs);    /* 22.2.2 */
1456         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);     /* 22.2.2 */
1457         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs);    /* 22.2.2 */
1458
1459         /* 22.2.1, 20.8.1 */
1460         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CONTROLS, 0);
1461         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1462
1463         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1464         vmcs_writel(TPR_THRESHOLD, 0);
1465
1466         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1467         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1468
1469         __set_cr0(vcpu, 0x60000010);    // enter rmode
1470         __set_cr4(vcpu, 0);
1471 #ifdef __x86_64__
1472         __set_efer(vcpu, 0);
1473 #endif
1474
1475         ret = litevm_mmu_init(vcpu);
1476
1477         print_func_exit();
1478         return ret;
1479
1480 out_free_guest_msrs:
1481         kfree(vcpu->guest_msrs);
1482 out:
1483         return ret;
1484 }
1485
1486 /*
1487  * Sync the rsp and rip registers into the vcpu structure.  This allows
1488  * registers to be accessed by indexing vcpu->regs.
1489  */
1490 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1491 {
1492         print_func_entry();
1493         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1494         vcpu->rip = vmcs_readl(GUEST_RIP);
1495         print_func_exit();
1496 }
1497
1498 /*
1499  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1500  * modification.
1501  */
1502 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1503 {
1504         print_func_entry();
1505         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1506         vmcs_writel(GUEST_RIP, vcpu->rip);
1507         print_func_exit();
1508 }
1509
1510 /*
1511  * Creates some virtual cpus.  Good luck creating more than one.
1512  */
1513 int vmx_create_vcpu(struct litevm *litevm, int n)
1514 {
1515         print_func_entry();
1516         ERRSTACK(2);
1517         int r;
1518         struct litevm_vcpu *vcpu;
1519         struct vmcs *vmcs;
1520         char *errstring = NULL;
1521
1522         if (n < 0 || n >= LITEVM_MAX_VCPUS) {
1523                 printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1524                            LITEVM_MAX_VCPUS);
1525                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1526                           LITEVM_MAX_VCPUS);
1527         }
1528         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1529         vcpu = &litevm->vcpus[n];
1530
1531         printk("vmx_create_vcpu: @%d, %p\n", n, vcpu);
1532         QLOCK(&vcpu->mutex);
1533
1534         if (vcpu->vmcs) {
1535                 QUNLOCK(&vcpu->mutex);
1536                 printk("VM already exists\n");
1537                 error("VM already exists");
1538         }
1539         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1540         /* I'm a bad person */
1541         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1542         uint64_t a = (uint64_t) vcpu->fx_buf;
1543         a += FX_IMAGE_ALIGN - 1;
1544         a /= FX_IMAGE_ALIGN;
1545         a *= FX_IMAGE_ALIGN;
1546
1547         vcpu->host_fx_image = (char *)a;
1548         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1549
1550         vcpu->cpu = -1; /* First load will set up TR */
1551         vcpu->litevm = litevm;
1552         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1553         if (waserror()){
1554                 printk("ERR 1 in %s, %s\n", __func__, current_errstr());
1555                 QUNLOCK(&vcpu->mutex);
1556                 litevm_free_vcpu(vcpu);
1557                 nexterror();
1558         }
1559         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1560         vmcs = alloc_vmcs();
1561         vmcs_clear(vmcs);
1562         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1563         printk("after vmcs_clear\n");
1564         vcpu->vmcs = vmcs;
1565         printk("vcpu %p set vmcs to %p\n", vcpu, vmcs);
1566         vcpu->launched = 0;
1567         printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
1568
1569         __vcpu_load(vcpu);
1570
1571         printk("PAST vcpu_load\n");
1572         if (waserror()) {
1573                 /* we really need to fix waserror() */
1574                 printk("vcpu_setup failed: %s\n", current_errstr());
1575                 QUNLOCK(&vcpu->mutex);
1576                 nexterror();
1577         }
1578
1579         /* need memory for the rmode_tss. I have no idea how this happened
1580          * originally in kvm.
1581          */
1582         /* this sucks. */
1583         QUNLOCK(&vcpu->mutex);
1584         void *v;
1585         struct litevm_memory_region vmr = { 0 };       /* zeroes init_data too */
1586         vmr.slot = 0;
1587         vmr.flags = 0;
1588         vmr.guest_phys_addr = /* guess. */ 0x1000000;
1589         vmr.memory_size = 0x10000;
1590         if (vm_set_memory_region(litevm, &vmr))
1591                 printk("vm_set_memory_region failed");
1592
1593         printk("set memory region done\n");
1594
1595         if (!init_rmode_tss(litevm)) {
1596                 error("vcpu_setup: init_rmode_tss failed");
1597         }
1598
1599
1600         QLOCK(&vcpu->mutex);
1601         r = litevm_vcpu_setup(vcpu);
1602
1603         vcpu_put(vcpu);
1604
1605         printk("r is %d\n", r);
1606
1607         if (!r) {
1608                 poperror();
1609                 print_func_exit();
1610                 return 0;
1611         }
1612
1613         errstring = "vcpu setup failed";
1614
1615 out_free_vcpus:
1616 out:
1617         print_func_exit();
1618         return r;
1619 }
1620
1621 /*
1622  * Allocate some memory and give it an address in the guest physical address
1623  * space.
1624  *
1625  * Discontiguous memory is allowed, mostly for framebuffers.
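 *
 * A rough usage sketch (mirroring the call in vmx_create_vcpu() above; the
 * slot, address, and size values are illustrative, not required):
 *
 *	struct litevm_memory_region vmr;
 *
 *	vmr.slot = 0;
 *	vmr.flags = 0;
 *	vmr.guest_phys_addr = 0x1000000;
 *	vmr.memory_size = 0x10000;
 *	vmr.init_data = NULL;
 *	if (vm_set_memory_region(litevm, &vmr))
 *		printk("vm_set_memory_region failed\n");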
1626  */
1627 int vm_set_memory_region(struct litevm *litevm,
1628                                                  struct litevm_memory_region *mem)
1629 {
1630         print_func_entry();
1631         ERRSTACK(2);
1632         int r;
1633         gfn_t base_gfn;
1634         unsigned long npages;
1635         unsigned long i;
1636         struct litevm_memory_slot *memslot;
1637         struct litevm_memory_slot old, new;
1638         int memory_config_version;
1639         void *init_data = mem->init_data;
1640         int pass = 1;
1641
1642         printk("litevm %p\n", litevm);
1643         /* should not happen but ... */
1644         if (!litevm)
1645                 error("NULL litevm in %s", __func__);
1646
1647         if (!mem)
1648                 error("NULL mem in %s", __func__);
1649         /* I don't care right now. *
1650         if (litevm->busy)
1651                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1652         */
1653         r = -EINVAL;
1654         /* General sanity checks */
1655         if (mem->memory_size & (PAGE_SIZE - 1))
1656                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1657         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1658                 error("guest_phys_addr 0x%llx is not page-aligned",
1659                           mem->guest_phys_addr);
1660         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1661                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1662         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1663                 error("0x%llx + 0x%llx is < 0x%llx",
1664                           mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1665
1666         memslot = &litevm->memslots[mem->slot];
1667         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1668         npages = mem->memory_size >> PAGE_SHIFT;
1669
1670         if (!npages)
1671                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1672
1673         /* this is actually a very tricky for loop. The use of
1674          * error is a bit dangerous, so we don't use it much.
1675          * consider a rewrite. Would be nice if akaros could do the
1676          * allocation of a bunch of pages for us.
1677          */
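        /* Retry protocol: snapshot memory_config_version under the lock, drop
         * the lock to do the allocations, then re-take it and compare.  If the
         * version changed, someone else touched the slots; free what we
         * allocated and start over at raced:. */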
1678 raced:
1679         printk("raced: pass %d\n", pass);
1680         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1681         monitor(NULL);
1682         SPLL(&litevm->lock);
1683         printk("locked\n");
1684
1685         if (waserror()) {
1686                 printk("error in %s, %s\n", __func__, current_errstr());
1687                 SPLU(&litevm->lock);
1688                 nexterror();
1689         }
1690
1691         memory_config_version = litevm->memory_config_version;
1692         new = old = *memslot;
1693         printk("memory_config_version %d\n", memory_config_version);
1694
1695         new.base_gfn = base_gfn;
1696         new.npages = npages;
1697         new.flags = mem->flags;
1698
1699         /* Disallow changing a memory slot's size. */
1700         r = -EINVAL;
1701         if (npages && old.npages && npages != old.npages)
1702                 error("npages is %lu, old.npages is %lu, can't change",
1703                           npages, old.npages);
1704
1705         /* Check for overlaps */
1706         r = -EEXIST;
1707         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1708                 struct litevm_memory_slot *s = &litevm->memslots[i];
1709
1710                 if (s == memslot)
1711                         continue;
1712                 if (!((base_gfn + npages <= s->base_gfn) ||
1713                           (base_gfn >= s->base_gfn + s->npages)))
1714                         error("Overlap");
1715         }
1716         /*
1717          * Do memory allocations outside lock.  memory_config_version will
1718          * detect any races.
1719          */
1720         SPLU(&litevm->lock);
1721         printk("unlocked\n");
1722         poperror();
1723
1724         /* Deallocate if slot is being removed */
1725         if (!npages)
1726                 new.phys_mem = 0;
1727
1728         /* Free page dirty bitmap if unneeded */
1729         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1730                 new.dirty_bitmap = 0;
1731
1732         r = -ENOMEM;
1733
1734         /* Allocate if a slot is being created */
1735         if (npages && !new.phys_mem) {
1736                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1737
1738                 if (!new.phys_mem)
1739                         goto out_free;
1740
1741                 for (i = 0; i < npages; ++i) {
1742                         int ret;
1743                         ret = kpage_alloc(&new.phys_mem[i]);
1744                         if (ret != ESUCCESS)
1745                                 goto out_free;
1746                         if (init_data) {
1747                                 printk("init data memcpy(%p,%p,4096);\n",
1748                                            page2kva(new.phys_mem[i]), init_data);
1749                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1750                                 init_data += PAGE_SIZE;
1751                         }
1752                 }
1753         }
1754
1755         /* Allocate page dirty bitmap if needed */
1756         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1757                 unsigned dirty_bytes;   //ALIGN(npages, BITS_PER_LONG) / 8;
1758                 dirty_bytes =
1759                         (((npages + BITS_PER_LONG -
1760                            1) / BITS_PER_LONG) * BITS_PER_LONG) / 8;
1761
1762                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1763                 if (!new.dirty_bitmap) {
1764                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1765                         goto out_free;
1766                 }
1767         }
1768
1769         SPLL(&litevm->lock);
1770         printk("locked\n");
1771         if (memory_config_version != litevm->memory_config_version) {
1772                 SPLU(&litevm->lock);
1773                 printk("unlocked, try again\n");
1774                 litevm_free_physmem_slot(&new, &old);
1775                 goto raced;
1776         }
1777
1778         r = -EAGAIN;
1779         if (litevm->busy) {
1780                 printk("BUSY!\n");
1781                 goto out_unlock;
1782         }
1783
1784         if (mem->slot >= litevm->nmemslots)
1785                 litevm->nmemslots = mem->slot + 1;
1786
1787         *memslot = new;
1788         ++litevm->memory_config_version;
1789
1790         SPLU(&litevm->lock);
1791         printk("unlocked\n");
1792         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1793                 struct litevm_vcpu *vcpu;
1794
1795                 vcpu = vcpu_load(litevm, i);
1796                 if (!vcpu){
1797                         printk("%s: no cpu %d\n", __func__, i);
1798                         continue;
1799                 }
1800                 litevm_mmu_reset_context(vcpu);
1801                 vcpu_put(vcpu);
1802         }
1803
1804         litevm_free_physmem_slot(&old, &new);
1805         print_func_exit();
1806         return 0;
1807
1808 out_unlock:
1809         SPLU(&litevm->lock);
1810         printk("out_unlock\n");
1811 out_free:
1812         printk("out_free\n");
1813         litevm_free_physmem_slot(&new, &old);
1814 out:
1815         printk("vm_set_memory_region: return %d\n", r);
1816         print_func_exit();
1817         return r;
1818 }
1819
1820 #if 0
1821 /*
1822  * Get (and clear) the dirty memory log for a memory slot.
1823  */
1824 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1825                                                                                   struct litevm_dirty_log *log)
1826 {
1827         struct litevm_memory_slot *memslot;
1828         int r, i;
1829         int n;
1830         unsigned long any = 0;
1831
1832         SPLL(&litevm->lock);
1833
1834         /*
1835          * Prevent changes to guest memory configuration even while the lock
1836          * is not taken.
1837          */
1838         ++litevm->busy;
1839         SPLU(&litevm->lock);
1840         r = -EINVAL;
1841         if (log->slot >= LITEVM_MEMORY_SLOTS)
1842                 goto out;
1843
1844         memslot = &litevm->memslots[log->slot];
1845         r = -ENOENT;
1846         if (!memslot->dirty_bitmap)
1847                 goto out;
1848
1849         n = ALIGN(memslot->npages, 8) / 8;
1850
1851         for (i = 0; !any && i < n; ++i)
1852                 any = memslot->dirty_bitmap[i];
1853
1854         r = -EFAULT;
1855         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1856                 goto out;
1857
1858         if (any) {
1859                 SPLL(&litevm->lock);
1860                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1861                 SPLU(&litevm->lock);
1862                 memset(memslot->dirty_bitmap, 0, n);
1863                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1864                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1865
1866                         if (!vcpu)
1867                                 continue;
1868                         flush_guest_tlb(vcpu);
1869                         vcpu_put(vcpu);
1870                 }
1871         }
1872
1873         r = 0;
1874
1875 out:
1876         SPLL(&litevm->lock);
1877         --litevm->busy;
1878         SPLU(&litevm->lock);
1879         return r;
1880 }
1881 #endif
1882
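/* Return the memory slot that contains @gfn, or 0 if no slot covers it. */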
1883 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1884 {
1885         print_func_entry();
1886         int i;
1887
1888         printk("%s: litevm %p gfn %d\n", __func__, litevm, gfn);
1889         for (i = 0; i < litevm->nmemslots; ++i) {
1890                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1891
1892                 if (gfn >= memslot->base_gfn
1893                         && gfn < memslot->base_gfn + memslot->npages) {
1894                         print_func_exit();
1895                         return memslot;
1896                 }
1897         }
1898         print_func_exit();
1899         return 0;
1900 }
1901
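/* Set the dirty-bitmap bit for @gfn in whichever slot contains it, if that
 * slot is logging dirty pages; otherwise silently do nothing. */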
1902 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1903 {
1904         print_func_entry();
1905         int i;
1906         struct litevm_memory_slot *memslot = 0;
1907         unsigned long rel_gfn;
1908
1909         for (i = 0; i < litevm->nmemslots; ++i) {
1910                 memslot = &litevm->memslots[i];
1911
1912                 if (gfn >= memslot->base_gfn
1913                         && gfn < memslot->base_gfn + memslot->npages) {
1914
1915                         if (!memslot || !memslot->dirty_bitmap) {
1916                                 print_func_exit();
1917                                 return;
1918                         }
1919
1920                         rel_gfn = gfn - memslot->base_gfn;
1921
1922                         /* avoid RMW */
1923                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1924                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1925                         print_func_exit();
1926                         return;
1927                 }
1928         }
1929         print_func_exit();
1930 }
1931
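/* Advance the guest RIP past the just-exited instruction and drop any
 * STI/MOV-SS interrupt shadow, since we emulated the instruction ourselves. */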
1932 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1933 {
1934         print_func_entry();
1935         unsigned long rip;
1936         uint32_t interruptibility;
1937
1938         rip = vmcs_readl(GUEST_RIP);
1939         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1940         vmcs_writel(GUEST_RIP, rip);
1941
1942         /*
1943          * We emulated an instruction, so temporary interrupt blocking
1944          * should be removed, if set.
1945          */
1946         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1947         if (interruptibility & 3)
1948                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3);
1949         print_func_exit();
1950 }
1951
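/* Emulator callback: read guest-virtual memory by translating through the
 * guest MMU (gva_to_gpa) one page at a time and copying out of the backing
 * pages. */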
1952 static int emulator_read_std(unsigned long addr,
1953                                                          unsigned long *val,
1954                                                          unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1955 {
1956         print_func_entry();
1957         struct litevm_vcpu *vcpu = ctxt->vcpu;
1958         void *data = val;
1959
1960         while (bytes) {
1961                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1962                 unsigned offset = addr & (PAGE_SIZE - 1);
1963                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
1964                         bytes : (unsigned)PAGE_SIZE - offset;
1965                 unsigned long pfn;
1966                 struct litevm_memory_slot *memslot;
1967                 void *page;
1968
1969                 if (gpa == UNMAPPED_GVA) {
1970                         print_func_exit();
1971                         return X86EMUL_PROPAGATE_FAULT;
1972                 }
1973                 pfn = gpa >> PAGE_SHIFT;
1974                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1975                 if (!memslot) {
1976                         print_func_exit();
1977                         return X86EMUL_UNHANDLEABLE;
1978                 }
1979                 page = page2kva(gfn_to_page(memslot, pfn));
1980
1981                 memcpy(data, page + offset, tocopy);
1982
1983                 bytes -= tocopy;
1984                 data += tocopy;
1985                 addr += tocopy;
1986         }
1987
1988         print_func_exit();
1989         return X86EMUL_CONTINUE;
1990 }
1991
1992 static int emulator_write_std(unsigned long addr,
1993                                                           unsigned long val,
1994                                                           unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1995 {
1996         print_func_entry();
1997         printk("emulator_write_std: addr %lx n %d\n", addr, bytes);
1998         print_func_exit();
1999         return X86EMUL_UNHANDLEABLE;
2000 }
2001
2002 static int emulator_read_emulated(unsigned long addr,
2003                                                                   unsigned long *val,
2004                                                                   unsigned int bytes,
2005                                                                   struct x86_emulate_ctxt *ctxt)
2006 {
2007         print_func_entry();
2008         struct litevm_vcpu *vcpu = ctxt->vcpu;
2009
2010         if (vcpu->mmio_read_completed) {
2011                 memcpy(val, vcpu->mmio_data, bytes);
2012                 vcpu->mmio_read_completed = 0;
2013                 print_func_exit();
2014                 return X86EMUL_CONTINUE;
2015         } else if (emulator_read_std(addr, val, bytes, ctxt)
2016                            == X86EMUL_CONTINUE) {
2017                 print_func_exit();
2018                 return X86EMUL_CONTINUE;
2019         } else {
2020                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2021                 if (gpa == UNMAPPED_GVA) {
2022                         print_func_exit();
2023                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
2024                 }
2025                 vcpu->mmio_needed = 1;
2026                 vcpu->mmio_phys_addr = gpa;
2027                 vcpu->mmio_size = bytes;
2028                 vcpu->mmio_is_write = 0;
2029
2030                 print_func_exit();
2031                 return X86EMUL_UNHANDLEABLE;
2032         }
2033 }
2034
2035 static int emulator_write_emulated(unsigned long addr,
2036                                                                    unsigned long val,
2037                                                                    unsigned int bytes,
2038                                                                    struct x86_emulate_ctxt *ctxt)
2039 {
2040         print_func_entry();
2041         struct litevm_vcpu *vcpu = ctxt->vcpu;
2042         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2043
2044         if (gpa == UNMAPPED_GVA) {
2045                 print_func_exit();
2046                 return X86EMUL_PROPAGATE_FAULT;
2047         }
2048
2049         vcpu->mmio_needed = 1;
2050         vcpu->mmio_phys_addr = gpa;
2051         vcpu->mmio_size = bytes;
2052         vcpu->mmio_is_write = 1;
2053         memcpy(vcpu->mmio_data, &val, bytes);
2054
2055         print_func_exit();
2056         return X86EMUL_CONTINUE;
2057 }
2058
2059 static int emulator_cmpxchg_emulated(unsigned long addr,
2060                                                                          unsigned long old,
2061                                                                          unsigned long new,
2062                                                                          unsigned int bytes,
2063                                                                          struct x86_emulate_ctxt *ctxt)
2064 {
2065         print_func_entry();
2066         static int reported;
2067
2068         if (!reported) {
2069                 reported = 1;
2070                 printk("litevm: emulating exchange as write\n");
2071         }
2072         print_func_exit();
2073         return emulator_write_emulated(addr, new, bytes, ctxt);
2074 }
2075
2076 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2077 {
2078         print_func_entry();
2079         static int reported;
2080         uint8_t opcodes[4];
2081         unsigned long rip = vmcs_readl(GUEST_RIP);
2082         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2083
2084         if (reported) {
2085                 print_func_exit();
2086                 return;
2087         }
2088
2089         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2090
2091         printk("emulation failed but !mmio_needed?"
2092                    " rip %lx %02x %02x %02x %02x\n",
2093                    rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2094         reported = 1;
2095         print_func_exit();
2096 }
2097
2098 struct x86_emulate_ops emulate_ops = {
2099         .read_std = emulator_read_std,
2100         .write_std = emulator_write_std,
2101         .read_emulated = emulator_read_emulated,
2102         .write_emulated = emulator_write_emulated,
2103         .cmpxchg_emulated = emulator_cmpxchg_emulated,
2104 };
2105
2106 enum emulation_result {
2107         EMULATE_DONE,                           /* no further processing */
2108         EMULATE_DO_MMIO,                        /* litevm_run filled with mmio request */
2109         EMULATE_FAIL,                           /* can't emulate this instruction */
2110 };
2111
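/* Run the x86 emulator on the instruction at the current guest RIP.  MMIO
 * accesses are recorded in vcpu->mmio_* and reflected into litevm_run so
 * userspace can complete them. */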
2112 static int emulate_instruction(struct litevm_vcpu *vcpu,
2113                                                            struct litevm_run *run,
2114                                                            unsigned long cr2, uint16_t error_code)
2115 {
2116         print_func_entry();
2117         struct x86_emulate_ctxt emulate_ctxt;
2118         int r;
2119         uint32_t cs_ar;
2120
2121         vcpu_load_rsp_rip(vcpu);
2122
2123         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2124
2125         emulate_ctxt.vcpu = vcpu;
2126         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2127         emulate_ctxt.cr2 = cr2;
2128         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2129                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2130                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2131                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2132
2133         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2134                 emulate_ctxt.cs_base = 0;
2135                 emulate_ctxt.ds_base = 0;
2136                 emulate_ctxt.es_base = 0;
2137                 emulate_ctxt.ss_base = 0;
2138                 emulate_ctxt.gs_base = 0;
2139                 emulate_ctxt.fs_base = 0;
2140         } else {
2141                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2142                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2143                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2144                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2145                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2146                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2147         }
2148
2149         vcpu->mmio_is_write = 0;
2150         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2151
2152         if ((r || vcpu->mmio_is_write) && run) {
2153                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2154                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2155                 run->mmio.len = vcpu->mmio_size;
2156                 run->mmio.is_write = vcpu->mmio_is_write;
2157         }
2158
2159         if (r) {
2160                 if (!vcpu->mmio_needed) {
2161                         report_emulation_failure(&emulate_ctxt);
2162                         print_func_exit();
2163                         return EMULATE_FAIL;
2164                 }
2165                 print_func_exit();
2166                 return EMULATE_DO_MMIO;
2167         }
2168
2169         vcpu_put_rsp_rip(vcpu);
2170         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2171
2172         if (vcpu->mmio_is_write) {
2173                 print_func_exit();
2174                 return EMULATE_DO_MMIO;
2175         }
2176
2177         print_func_exit();
2178         return EMULATE_DONE;
2179 }
2180
2181 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2182 {
2183         print_func_entry();
2184         print_func_exit();
2185         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2186 }
2187
2188 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2189 {
2190         print_func_entry();
2191         vmcs_writel(GUEST_GDTR_BASE, base);
2192         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2193         print_func_exit();
2194 }
2195
2196 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2197 {
2198         print_func_entry();
2199         vmcs_writel(GUEST_IDTR_BASE, base);
2200         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2201         print_func_exit();
2202 }
2203
2204 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2205                                    unsigned long *rflags)
2206 {
2207         print_func_entry();
2208         lmsw(vcpu, msw);
2209         *rflags = vmcs_readl(GUEST_RFLAGS);
2210         print_func_exit();
2211 }
2212
2213 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2214 {
2215         print_func_entry();
2216         switch (cr) {
2217                 case 0:
2218                         print_func_exit();
2219                         return guest_cr0();
2220                 case 2:
2221                         print_func_exit();
2222                         return vcpu->cr2;
2223                 case 3:
2224                         print_func_exit();
2225                         return vcpu->cr3;
2226                 case 4:
2227                         print_func_exit();
2228                         return guest_cr4();
2229                 default:
2230                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2231                         print_func_exit();
2232                         return 0;
2233         }
2234 }
2235
2236 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2237                                          unsigned long *rflags)
2238 {
2239         print_func_entry();
2240         switch (cr) {
2241                 case 0:
2242                         set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2243                         *rflags = vmcs_readl(GUEST_RFLAGS);
2244                         break;
2245                 case 2:
2246                         vcpu->cr2 = val;
2247                         break;
2248                 case 3:
2249                         set_cr3(vcpu, val);
2250                         break;
2251                 case 4:
2252                         set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2253                         break;
2254                 default:
2255                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2256         }
2257         print_func_exit();
2258 }
2259
2260 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2261                                                                   int vec, uint32_t err_code)
2262 {
2263         print_func_entry();
2264         if (!vcpu->rmode.active) {
2265                 print_func_exit();
2266                 return 0;
2267         }
2268
2269         if (vec == GP_VECTOR && err_code == 0)
2270                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2271                         print_func_exit();
2272                         return 1;
2273                 }
2274         print_func_exit();
2275         return 0;
2276 }
2277
2278 static int handle_exception(struct litevm_vcpu *vcpu,
2279                                                         struct litevm_run *litevm_run)
2280 {
2281         print_func_entry();
2282         uint32_t intr_info, error_code;
2283         unsigned long cr2, rip;
2284         uint32_t vect_info;
2285         enum emulation_result er;
2286
2287         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2288         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2289
2290         if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) {
2291                 printk("%s: unexpected, vectoring info 0x%x "
2292                            "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2293         }
2294
2295         if (is_external_interrupt(vect_info)) {
2296                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2297                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_pending), irq);
2298                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_summary),
2299                                                            irq / BITS_PER_LONG);
2300         }
2301
2302         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) {  /* nmi */
2303                 asm("int $2");
2304                 print_func_exit();
2305                 return 1;
2306         }
2307         error_code = 0;
2308         rip = vmcs_readl(GUEST_RIP);
2309         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2310                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2311         if (is_page_fault(intr_info)) {
2312                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2313
2314                 SPLL(&vcpu->litevm->lock);
2315                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2316                         SPLU(&vcpu->litevm->lock);
2317                         print_func_exit();
2318                         return 1;
2319                 }
2320
2321                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2322                 SPLU(&vcpu->litevm->lock);
2323
2324                 switch (er) {
2325                         case EMULATE_DONE:
2326                                 print_func_exit();
2327                                 return 1;
2328                         case EMULATE_DO_MMIO:
2329                                 ++litevm_stat.mmio_exits;
2330                                 litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2331                                 print_func_exit();
2332                                 return 0;
2333                         case EMULATE_FAIL:
2334                                 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2335                                 break;
2336                         default:
2337                                 assert(0);
2338                 }
2339         }
2340
2341         if (vcpu->rmode.active &&
2342                 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2343                                                            error_code)) {
2344                 print_func_exit();
2345                 return 1;
2346         }
2347
2348         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
2349                 (INTR_TYPE_EXCEPTION | 1)) {
2350                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2351                 print_func_exit();
2352                 return 0;
2353         }
2354         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2355         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2356         litevm_run->ex.error_code = error_code;
2357         print_func_exit();
2358         return 0;
2359 }
2360
2361 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2362                                                                          struct litevm_run *litevm_run)
2363 {
2364         print_func_entry();
2365         ++litevm_stat.irq_exits;
2366         print_func_exit();
2367         return 1;
2368 }
2369
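/* For string I/O, derive the repeat count from guest RCX, masked to the
 * address size implied by the code segment and any 0x67 prefix. */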
2370 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t * count)
2371 {
2372         print_func_entry();
2373         uint64_t inst;
2374         gva_t rip;
2375         int countr_size;
2376         int i, n;
2377
2378         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2379                 countr_size = 2;
2380         } else {
2381                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2382
2383                 countr_size = (cs_ar & AR_L_MASK) ? 8 : (cs_ar & AR_DB_MASK) ? 4 : 2;
2384         }
2385
2386         rip = vmcs_readl(GUEST_RIP);
2387         if (countr_size != 8)
2388                 rip += vmcs_readl(GUEST_CS_BASE);
2389
2390         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2391
2392         for (i = 0; i < n; i++) {
2393                 switch (((uint8_t *) & inst)[i]) {
2394                         case 0xf0:
2395                         case 0xf2:
2396                         case 0xf3:
2397                         case 0x2e:
2398                         case 0x36:
2399                         case 0x3e:
2400                         case 0x26:
2401                         case 0x64:
2402                         case 0x65:
2403                         case 0x66:
2404                                 break;
2405                         case 0x67:
2406                                 countr_size = (countr_size == 2) ? 4 : (countr_size >> 1);
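                                /* fall through */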
2407                         default:
2408                                 goto done;
2409                 }
2410         }
2411         print_func_exit();
2412         return 0;
2413 done:
2414         countr_size *= 8;
2415         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2416         print_func_exit();
2417         return 1;
2418 }
2419
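/* Decode an I/O-instruction exit (direction, size, port, string/rep) from the
 * exit qualification into litevm_run->io for userspace to handle. */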
2420 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2421 {
2422         print_func_entry();
2423         uint64_t exit_qualification;
2424
2425         ++litevm_stat.io_exits;
2426         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2427         litevm_run->exit_reason = LITEVM_EXIT_IO;
2428         if (exit_qualification & 8)
2429                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2430         else
2431                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2432         litevm_run->io.size = (exit_qualification & 7) + 1;
2433         litevm_run->io.string = (exit_qualification & 16) != 0;
2434         litevm_run->io.string_down
2435                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2436         litevm_run->io.rep = (exit_qualification & 32) != 0;
2437         litevm_run->io.port = exit_qualification >> 16;
2438         if (litevm_run->io.string) {
2439                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2440                         print_func_exit();
2441                         return 1;
2442                 }
2443                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2444         } else
2445                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX];       /* rax */
2446         print_func_exit();
2447         return 0;
2448 }
2449
2450 static int handle_invlpg(struct litevm_vcpu *vcpu,
2451                                                  struct litevm_run *litevm_run)
2452 {
2453         print_func_entry();
2454         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2455         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2456         SPLL(&vcpu->litevm->lock);
2457         vcpu->mmu.inval_page(vcpu, address);
2458         SPLU(&vcpu->litevm->lock);
2459         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2460         print_func_exit();
2461         return 1;
2462 }
2463
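/* Handle mov to CR0/CR3/CR4/CR8, mov from CR3/CR8, and lmsw exits by updating
 * the shadowed control registers and skipping the instruction. */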
2464 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2465 {
2466         print_func_entry();
2467         uint64_t exit_qualification;
2468         int cr;
2469         int reg;
2470
2471 #ifdef LITEVM_DEBUG
2472         if (guest_cpl() != 0) {
2473                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2474                 inject_gp(vcpu);
2475                 print_func_exit();
2476                 return 1;
2477         }
2478 #endif
2479
2480         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2481         cr = exit_qualification & 15;
2482         reg = (exit_qualification >> 8) & 15;
2483         switch ((exit_qualification >> 4) & 3) {
2484                 case 0: /* mov to cr */
2485                         switch (cr) {
2486                                 case 0:
2487                                         vcpu_load_rsp_rip(vcpu);
2488                                         set_cr0(vcpu, vcpu->regs[reg]);
2489                                         skip_emulated_instruction(vcpu);
2490                                         print_func_exit();
2491                                         return 1;
2492                                 case 3:
2493                                         vcpu_load_rsp_rip(vcpu);
2494                                         set_cr3(vcpu, vcpu->regs[reg]);
2495                                         skip_emulated_instruction(vcpu);
2496                                         print_func_exit();
2497                                         return 1;
2498                                 case 4:
2499                                         vcpu_load_rsp_rip(vcpu);
2500                                         set_cr4(vcpu, vcpu->regs[reg]);
2501                                         skip_emulated_instruction(vcpu);
2502                                         print_func_exit();
2503                                         return 1;
2504                                 case 8:
2505                                         vcpu_load_rsp_rip(vcpu);
2506                                         set_cr8(vcpu, vcpu->regs[reg]);
2507                                         skip_emulated_instruction(vcpu);
2508                                         print_func_exit();
2509                                         return 1;
2510                         };
2511                         break;
2512                 case 1: /*mov from cr */
2513                         switch (cr) {
2514                                 case 3:
2515                                         vcpu_load_rsp_rip(vcpu);
2516                                         vcpu->regs[reg] = vcpu->cr3;
2517                                         vcpu_put_rsp_rip(vcpu);
2518                                         skip_emulated_instruction(vcpu);
2519                                         print_func_exit();
2520                                         return 1;
2521                                 case 8:
2522                                         printd("handle_cr: read CR8 " "cpu erratum AA15\n");
2523                                         vcpu_load_rsp_rip(vcpu);
2524                                         vcpu->regs[reg] = vcpu->cr8;
2525                                         vcpu_put_rsp_rip(vcpu);
2526                                         skip_emulated_instruction(vcpu);
2527                                         print_func_exit();
2528                                         return 1;
2529                         }
2530                         break;
2531                 case 3: /* lmsw */
2532                         lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2533
2534                         skip_emulated_instruction(vcpu);
2535                         print_func_exit();
2536                         return 1;
2537                 default:
2538                         break;
2539         }
2540         litevm_run->exit_reason = 0;
2541         printk("litevm: unhandled control register: op %d cr %d\n",
2542                    (int)(exit_qualification >> 4) & 3, cr);
2543         print_func_exit();
2544         return 0;
2545 }
2546
2547 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2548 {
2549         print_func_entry();
2550         uint64_t exit_qualification;
2551         unsigned long val;
2552         int dr, reg;
2553
2554         /*
2555          * FIXME: this code assumes the host is debugging the guest.
2556          *        need to deal with guest debugging itself too.
2557          */
2558         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2559         dr = exit_qualification & 7;
2560         reg = (exit_qualification >> 8) & 15;
2561         vcpu_load_rsp_rip(vcpu);
2562         if (exit_qualification & 16) {
2563                 /* mov from dr */
2564                 switch (dr) {
2565                         case 6:
2566                                 val = 0xffff0ff0;
2567                                 break;
2568                         case 7:
2569                                 val = 0x400;
2570                                 break;
2571                         default:
2572                                 val = 0;
2573                 }
2574                 vcpu->regs[reg] = val;
2575         } else {
2576                 /* mov to dr */
2577         }
2578         vcpu_put_rsp_rip(vcpu);
2579         skip_emulated_instruction(vcpu);
2580         print_func_exit();
2581         return 1;
2582 }
2583
2584 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2585 {
2586         print_func_entry();
2587         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2588         print_func_exit();
2589         return 0;
2590 }
2591
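/* Emulate rdmsr: satisfy reads from VMCS fields or the vcpu's saved MSR
 * array, and inject #GP for anything we don't recognize. */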
2592 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2593 {
2594         print_func_entry();
2595         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2596         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2597         uint64_t data;
2598
2599         if (guest_cpl() != 0) {
2600                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2601                 inject_gp(vcpu);
2602                 print_func_exit();
2603                 return 1;
2604         }
2605
2606         switch (ecx) {
2607                 case MSR_FS_BASE:
2608                         data = vmcs_readl(GUEST_FS_BASE);
2609                         break;
2610                 case MSR_GS_BASE:
2611                         data = vmcs_readl(GUEST_GS_BASE);
2612                         break;
2613                 case MSR_IA32_SYSENTER_CS:
2614                         data = vmcs_read32(GUEST_SYSENTER_CS);
2615                         break;
2616                 case MSR_IA32_SYSENTER_EIP:
2617                         data = vmcs_read32(GUEST_SYSENTER_EIP);
2618                         break;
2619                 case MSR_IA32_SYSENTER_ESP:
2620                         data = vmcs_read32(GUEST_SYSENTER_ESP);
2621                         break;
2622                 case MSR_IA32_MC0_CTL:
2623                 case MSR_IA32_MCG_STATUS:
2624                 case MSR_IA32_MCG_CAP:
2625                 case MSR_IA32_MC0_MISC:
2626                 case MSR_IA32_MC0_MISC + 4:
2627                 case MSR_IA32_MC0_MISC + 8:
2628                 case MSR_IA32_MC0_MISC + 12:
2629                 case MSR_IA32_MC0_MISC + 16:
2630                 case MSR_IA32_UCODE_REV:
2631                         /* MTRR registers */
2632                 case 0xfe:
2633                 case 0x200 ... 0x2ff:
2634                         data = 0;
2635                         break;
2636                 case MSR_IA32_APICBASE:
2637                         data = vcpu->apic_base;
2638                         break;
2639                 default:
2640                         if (msr) {
2641                                 data = msr->data;
2642                                 break;
2643                         }
2644                         printk("litevm: unhandled rdmsr: %x\n", ecx);
2645                         inject_gp(vcpu);
2646                         print_func_exit();
2647                         return 1;
2648         }
2649
2650         /* FIXME: handling of bits 32:63 of rax, rdx */
2651         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2652         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2653         skip_emulated_instruction(vcpu);
2654         print_func_exit();
2655         return 1;
2656 }
2657
2658 #ifdef __x86_64__
2659
2660 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2661 {
2662         print_func_entry();
2663         struct vmx_msr_entry *msr;
2664
2665         if (efer & EFER_RESERVED_BITS) {
2666                 printd("set_efer: 0x%llx #GP, reserved bits\n", efer);
2667                 inject_gp(vcpu);
2668                 print_func_exit();
2669                 return;
2670         }
2671
2672         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2673                 printd("set_efer: #GP, change LME while paging\n");
2674                 inject_gp(vcpu);
2675                 print_func_exit();
2676                 return;
2677         }
2678
2679         efer &= ~EFER_LMA;
2680         efer |= vcpu->shadow_efer & EFER_LMA;
2681
2682         vcpu->shadow_efer = efer;
2683
2684         msr = find_msr_entry(vcpu, MSR_EFER);
2685
2686         if (!(efer & EFER_LMA))
2687                 efer &= ~EFER_LME;
2688         msr->data = efer;
2689         skip_emulated_instruction(vcpu);
2690         print_func_exit();
2691 }
2692
2693 #endif
2694
2695 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2696
2697 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2698 {
2699         print_func_entry();
2700         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2701         struct vmx_msr_entry *msr;
2702         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2703                 | ((uint64_t) (vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2704
2705         if (guest_cpl() != 0) {
2706                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2707                 inject_gp(vcpu);
2708                 print_func_exit();
2709                 return 1;
2710         }
2711
2712         switch (ecx) {
2713                 case MSR_FS_BASE:
2714                         vmcs_writel(GUEST_FS_BASE, data);
2715                         break;
2716                 case MSR_GS_BASE:
2717                         vmcs_writel(GUEST_GS_BASE, data);
2718                         break;
2719                 case MSR_IA32_SYSENTER_CS:
2720                         vmcs_write32(GUEST_SYSENTER_CS, data);
2721                         break;
2722                 case MSR_IA32_SYSENTER_EIP:
2723                         vmcs_write32(GUEST_SYSENTER_EIP, data);
2724                         break;
2725                 case MSR_IA32_SYSENTER_ESP:
2726                         vmcs_write32(GUEST_SYSENTER_ESP, data);
2727                         break;
2728                 case MSR_EFER:
2729                         set_efer(vcpu, data);
2730                         print_func_exit();
2731                         return 1;
2732                 case MSR_IA32_MC0_STATUS:
2733                         printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data);
2734                         break;
2735                 case MSR_IA32_TIME_STAMP_COUNTER:{
2736                                 uint64_t tsc;
2737
2738                                 tsc = read_tsc();
2739                                 vmcs_write64(TSC_OFFSET, data - tsc);
2740                                 break;
2741                         }
2742                 case MSR_IA32_UCODE_REV:
2743                 case MSR_IA32_UCODE_WRITE:
2744                 case 0x200 ... 0x2ff:   /* MTRRs */
2745                         break;
2746                 case MSR_IA32_APICBASE:
2747                         vcpu->apic_base = data;
2748                         break;
2749                 default:
2750                         msr = find_msr_entry(vcpu, ecx);
2751                         if (msr) {
2752                                 msr->data = data;
2753                                 break;
2754                         }
2755                         printk("litevm: unhandled wrmsr: %x\n", ecx);
2756                         inject_gp(vcpu);
2757                         print_func_exit();
2758                         return 1;
2759         }
2760         skip_emulated_instruction(vcpu);
2761         print_func_exit();
2762         return 1;
2763 }
2764
2765 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2766                                                                    struct litevm_run *litevm_run)
2767 {
2768         print_func_entry();
2769         /* Turn off interrupt window reporting. */
2770         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2771                                  vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2772                                  & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2773         print_func_exit();
2774         return 1;
2775 }
2776
2777 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2778 {
2779         print_func_entry();
2780         skip_emulated_instruction(vcpu);
2781         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2782                 print_func_exit();
2783                 return 1;
2784         }
2785
2786         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2787         print_func_exit();
2788         return 0;
2789 }
2790
2791 /*
2792  * The exit handlers return 1 if the exit was handled fully and guest execution
2793  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2794  * to be done to userspace and return 0.
2795  */
2796 static int (*litevm_vmx_exit_handlers[]) (struct litevm_vcpu * vcpu,
2797                                                                                   struct litevm_run * litevm_run) = {
2798 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2799                 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2800                 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2801                 [EXIT_REASON_INVLPG] = handle_invlpg,
2802                 [EXIT_REASON_CR_ACCESS] = handle_cr,
2803                 [EXIT_REASON_DR_ACCESS] = handle_dr,
2804                 [EXIT_REASON_CPUID] = handle_cpuid,
2805                 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2806                 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2807                 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2808                 [EXIT_REASON_HLT] = handle_halt,};
2809
2810 static const int litevm_vmx_max_exit_handlers =
2811         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2812
2813 /*
2814  * The guest has exited.  See if we can fix it or if we need userspace
2815  * assistance.
2816  */
2817 static int litevm_handle_exit(struct litevm_run *litevm_run,
2818                                                           struct litevm_vcpu *vcpu)
2819 {
2820         print_func_entry();
2821         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2822         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2823
2824         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2825                 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2826                 printk("%s: unexpected, valid vectoring info and "
2827                            "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2828         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2829         if (exit_reason < litevm_vmx_max_exit_handlers
2830                 && litevm_vmx_exit_handlers[exit_reason]) {
2831                 print_func_exit();
2832                 return litevm_vmx_exit_handlers[exit_reason] (vcpu, litevm_run);
2833         } else {
2834                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2835                 litevm_run->hw.hardware_exit_reason = exit_reason;
2836         }
2837         print_func_exit();
2838         return 0;
2839 }
2840
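/* Inject an interrupt while the guest is in (emulated) real mode: push FLAGS,
 * CS, and IP on the guest stack and vector through the IVT entry at irq * 4. */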
2841 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2842 {
2843         print_func_entry();
2844         uint16_t ent[2];
2845         uint16_t cs;
2846         uint16_t ip;
2847         unsigned long flags;
2848         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2849         uint16_t sp = vmcs_readl(GUEST_RSP);
2850         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2851
2852         if (sp > ss_limit || ((sp - 6) > sp)) {
2853                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2854                                         __FUNCTION__,
2855                                         vmcs_readl(GUEST_RSP),
2856                                         vmcs_readl(GUEST_SS_BASE), vmcs_read32(GUEST_SS_LIMIT));
2857                 print_func_exit();
2858                 return;
2859         }
2860
2861         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2862                 sizeof(ent)) {
2863                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2864                 print_func_exit();
2865                 return;
2866         }
2867
2868         flags = vmcs_readl(GUEST_RFLAGS);
2869         cs = vmcs_readl(GUEST_CS_BASE) >> 4;
2870         ip = vmcs_readl(GUEST_RIP);
2871
2872         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2873                 litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2874                 litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2875                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2876                 print_func_exit();
2877                 return;
2878         }
2879
2880         vmcs_writel(GUEST_RFLAGS, flags &
2881                                 ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2882         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2883         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2884         vmcs_writel(GUEST_RIP, ent[0]);
2885         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2886         print_func_exit();
2887 }
2888
2889 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2890 {
2891         print_func_entry();
2892         int word_index = __ffs(vcpu->irq_summary);
2893         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2894         int irq = word_index * BITS_PER_LONG + bit_index;
2895
2896         /* don't have clear_bit and I'm not sure the akaros
2897          * bitops are really going to work.
2898          */
2899         vcpu->irq_pending[word_index] &= ~(1 << bit_index);
2900         if (!vcpu->irq_pending[word_index])
2901                 vcpu->irq_summary &= ~(1 << word_index);
2902
2903         if (vcpu->rmode.active) {
2904                 inject_rmode_irq(vcpu, irq);
2905                 print_func_exit();
2906                 return;
2907         }
2908         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2909                                  irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2910         print_func_exit();
2911 }
2912
2913 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2914 {
2915         print_func_entry();
2916         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2917                 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2918                 /*
2919                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2920                  */
2921                 litevm_do_inject_irq(vcpu);
2922         else
2923                 /*
2924                  * Interrupts blocked.  Wait for unblock.
2925                  */
2926                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2927                                          vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2928                                          | CPU_BASED_VIRTUAL_INTR_PENDING);
2929         print_func_exit();
2930 }
2931
2932 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2933 {
2934         print_func_entry();
2935         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2936
2937 #warning "no debugging guests yet"
2938         assert(0);
2939 /*
2940         set_debugreg(dbg->bp[0], 0);
2941         set_debugreg(dbg->bp[1], 1);
2942         set_debugreg(dbg->bp[2], 2);
2943         set_debugreg(dbg->bp[3], 3);
2944 */
2945         if (dbg->singlestep) {
2946                 unsigned long flags;
2947
2948                 flags = vmcs_readl(GUEST_RFLAGS);
2949                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2950                 vmcs_writel(GUEST_RFLAGS, flags);
2951         }
2952         print_func_exit();
2953 }
2954
2955 static void load_msrs(struct vmx_msr_entry *e, int n)
2956 {
2957         print_func_entry();
2958         int i;
2959
2960         if (!e) {
2961                 printk("LOAD MSR WITH NULL POINTER?\n");
2962                 error("LOAD MSR WITH NULL POINTER?");
2963         }
2964         for (i = 0; i < n; ++i) {
2965                 printk("Load MSR (%lx), with %lx\n", e[i].index, e[i].data);
2966                 write_msr(e[i].index, e[i].data);
2967                 printk("Done\n");
2968         }
2969         print_func_exit();
2970 }
2971
2972 static void save_msrs(struct vmx_msr_entry *e, int n)
2973 {
2974         print_func_entry();
2975         int i;
2976
2977         for (i = 0; i < n; ++i)
2978                 e[i].data = read_msr(e[i].index);
2979         print_func_exit();
2980 }
2981
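/* Enter the guest and run it until an exit that needs userspace attention:
 * save host segment and MSR state, vmlaunch/vmresume, then hand the exit
 * reason to the exit handlers above. */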
2982 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2983 {
2984         print_func_entry();
2985         struct litevm_vcpu *vcpu;
2986         uint8_t fail;
2987         uint16_t fs_sel, gs_sel, ldt_sel;
2988         int fs_gs_ldt_reload_needed;
2989
2990         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2991                 error("vcpu is %d but must be in the range 0..%d\n",
2992                           litevm_run->vcpu, LITEVM_MAX_VCPUS - 1);
2993
2994         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2995         if (!vcpu)
2996                 error("vcpu_load failed");
2997         printk("Loaded\n");
2998
2999         if (litevm_run->emulated) {
3000                 skip_emulated_instruction(vcpu);
3001                 litevm_run->emulated = 0;
3002         }
3003         printk("Emulated\n");
3004
3005         if (litevm_run->mmio_completed) {
3006                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
3007                 vcpu->mmio_read_completed = 1;
3008         }
3009         printk("mmio completed\n");
3010
3011         vcpu->mmio_needed = 0;
3012
3013 again:
3014         /*
3015          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
3016          * allow segment selectors with cpl > 0 or ti == 1.
3017          */
3018         fs_sel = read_fs();
3019         printk("fs_sel %x\n", fs_sel);
3020         gs_sel = read_gs();
3021         printk("gs_sel %x\n", gs_sel);
3022         ldt_sel = read_ldt();
3023         printk("ldt_sel %x\n", ldt_sel);
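        /*
         * The low three selector bits are TI and RPL; if any are set (or an
         * LDT selector is in use), the host fs/gs selectors cannot be
         * restored by the VM exit itself and must be reloaded by hand
         * afterwards.
         */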
3024         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
3025         if (!fs_gs_ldt_reload_needed) {
3026                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
3027                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
3028         } else {
3029                 vmcs_write16(HOST_FS_SELECTOR, 0);
3030                 vmcs_write16(HOST_GS_SELECTOR, 0);
3031         }
3032         printk("reloaded gs and gs\n");
3033
3034 #ifdef __x86_64__
3035         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
3036         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
3037         printk("Set FS_BASE and GS_BASE");
3038 #endif
3039
3040         printk("skipping IRQs for now\n");
3041         if (0)
3042         if (vcpu->irq_summary &&
3043                 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
3044                 litevm_try_inject_irq(vcpu);
3045
3046         printk("no debugging for now\n");
3047         if (0)
3048         if (vcpu->guest_debug.enabled)
3049                 litevm_guest_debug_pre(vcpu);
3050
3051         fx_save(vcpu->host_fx_image);
3052         fx_restore(vcpu->guest_fx_image);
3053
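        /*
         * All host MSRs of interest are snapshotted, but only the first
         * NR_BAD_MSRS guest MSRs are loaded by hand here; the remainder are
         * presumably switched via the VMCS MSR-load/store areas.
         */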
3054         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
3055         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3056
3057         printk("GO FOR IT!\n");
3058         asm(
3059                    /* Store host registers */
3060                    "pushf \n\t"
3061 #ifdef __x86_64__
3062                    "push %%rax; push %%rbx; push %%rdx;"
3063                    "push %%rsi; push %%rdi; push %%rbp;"
3064                    "push %%r8;  push %%r9;  push %%r10; push %%r11;"
3065                    "push %%r12; push %%r13; push %%r14; push %%r15;"
3066                    "push %%rcx \n\t" "vmwrite %%rsp, %2 \n\t"
3067 #else
3068                    "pusha; push %%ecx \n\t" "vmwrite %%esp, %2 \n\t"
3069 #endif
3070                    /* Check whether vmlaunch or vmresume is needed */
3071                    "cmp $0, %1 \n\t"
3072                    /* Load guest registers.  Don't clobber flags. */
3073 #ifdef __x86_64__
3074                    "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%3), %%rax \n\t" "mov %c[rbx](%3), %%rbx \n\t" "mov %c[rdx](%3), %%rdx \n\t" "mov %c[rsi](%3), %%rsi \n\t" "mov %c[rdi](%3), %%rdi \n\t" "mov %c[rbp](%3), %%rbp \n\t" "mov %c[r8](%3),  %%r8  \n\t" "mov %c[r9](%3),  %%r9  \n\t" "mov %c[r10](%3), %%r10 \n\t" "mov %c[r11](%3), %%r11 \n\t" "mov %c[r12](%3), %%r12 \n\t" "mov %c[r13](%3), %%r13 \n\t" "mov %c[r14](%3), %%r14 \n\t" "mov %c[r15](%3), %%r15 \n\t" "mov %c[rcx](%3), %%rcx \n\t"      /* kills %3 (rcx) */
3075 #else
3076                    "mov %c[cr2](%3), %%eax \n\t" "mov %%eax,   %%cr2 \n\t" "mov %c[rax](%3), %%eax \n\t" "mov %c[rbx](%3), %%ebx \n\t" "mov %c[rdx](%3), %%edx \n\t" "mov %c[rsi](%3), %%esi \n\t" "mov %c[rdi](%3), %%edi \n\t" "mov %c[rbp](%3), %%ebp \n\t" "mov %c[rcx](%3), %%ecx \n\t"    /* kills %3 (ecx) */
3077 #endif
3078                    /* Enter guest mode */
3079                    "jne launched \n\t"
3080                    "vmlaunch \n\t"
3081                    "jmp litevm_vmx_return \n\t"
3082                    "launched: vmresume \n\t"
3083                    ".globl litevm_vmx_return \n\t" "litevm_vmx_return: "
3084                    /* Save guest registers, load host registers, keep flags */
3085 #ifdef __x86_64__
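                   /*
                    * The vcpu pointer was pushed last before entry; swap it
                    * back into %3 so guest registers can be spilled into the
                    * vcpu, leaving the guest's rcx value on the stack.
                    */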
3086                    "xchg %3,     0(%%rsp) \n\t"
3087                    "mov %%rax, %c[rax](%3) \n\t"
3088                    "mov %%rbx, %c[rbx](%3) \n\t"
3089                    "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3090                    "mov %%rdx, %c[rdx](%3) \n\t"
3091                    "mov %%rsi, %c[rsi](%3) \n\t"
3092                    "mov %%rdi, %c[rdi](%3) \n\t"
3093                    "mov %%rbp, %c[rbp](%3) \n\t"
3094                    "mov %%r8,  %c[r8](%3) \n\t"
3095                    "mov %%r9,  %c[r9](%3) \n\t"
3096                    "mov %%r10, %c[r10](%3) \n\t"
3097                    "mov %%r11, %c[r11](%3) \n\t"
3098                    "mov %%r12, %c[r12](%3) \n\t"
3099                    "mov %%r13, %c[r13](%3) \n\t"
3100                    "mov %%r14, %c[r14](%3) \n\t"
3101                    "mov %%r15, %c[r15](%3) \n\t"
3102                    "mov %%cr2, %%rax   \n\t"
3103                    "mov %%rax, %c[cr2](%3) \n\t"
3104                    "mov 0(%%rsp), %3 \n\t"
3105                    "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3106                    "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3107                    "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3108                    "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3109 #else
3110                    "xchg %3, 0(%%esp) \n\t"
3111                    "mov %%eax, %c[rax](%3) \n\t"
3112                    "mov %%ebx, %c[rbx](%3) \n\t"
3113                    "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3114                    "mov %%edx, %c[rdx](%3) \n\t"
3115                    "mov %%esi, %c[rsi](%3) \n\t"
3116                    "mov %%edi, %c[rdi](%3) \n\t"
3117                    "mov %%ebp, %c[rbp](%3) \n\t"
3118                    "mov %%cr2, %%eax  \n\t"
3119                    "mov %%eax, %c[cr2](%3) \n\t"
3120                    "mov 0(%%esp), %3 \n\t" "pop %%ecx; popa \n\t"
3121 #endif
3122 "setbe %0 \n\t" "popf \n\t":"=g"(fail)
3123 :                  "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3124                    "c"(vcpu),
3125                    [rax] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3126                    [rbx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3127                    [rcx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3128                    [rdx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3129                    [rsi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3130                    [rdi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3131                    [rbp] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3132 #ifdef __x86_64__
3133                    [r8] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8])),
3134                    [r9] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9])),
3135                    [r10] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3136                    [r11] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3137                    [r12] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3138                    [r13] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3139                    [r14] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3140                    [r15] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3141 #endif
3142                    [cr2] "i"(offsetof(struct litevm_vcpu, cr2))
3143                    :"cc", "memory");
3144
3145         ++litevm_stat.exits;
3146         printk("vm_run exits");
3147         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3148         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3149
3150         fx_save(vcpu->guest_fx_image);
3151         fx_restore(vcpu->host_fx_image);
3152
3153 #ifndef __x86_64__
3154 asm("mov %0, %%ds; mov %0, %%es": :"r"(__USER_DS));
3155 #endif
3156
3157         litevm_run->exit_type = 0;
3158         if (fail) {
3159                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3160                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3161         } else {
3162                 if (fs_gs_ldt_reload_needed) {
3163                         load_ldt(ldt_sel);
3164                         load_fs(fs_sel);
3165                         /*
3166                          * If we have to reload gs, we must take care to
3167                          * preserve our gs base.
3168                          */
3169                         disable_irq();
3170                         load_gs(gs_sel);
3171 #ifdef __x86_64__
3172                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3173 #endif
3174                         enable_irq();
3175
3176                         reload_tss();
3177                 }
3178                 vcpu->launched = 1;
3179                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
3180                 if (litevm_handle_exit(litevm_run, vcpu)) {
3181                         /* Give the scheduler a chance to reschedule. */
3182                         vcpu_put(vcpu);
3183 #warning "how to tell if signal is pending"
3184 /*
3185                         if (signal_pending(current)) {
3186                                 ++litevm_stat.signal_exits;
3187                                 return -EINTR;
3188                         }
3189 */
3190                         kthread_yield();
3191                         /* Cannot fail - no vcpu unplug yet. */
3192                         vcpu_load(litevm, vcpu_slot(vcpu));
3193                         goto again;
3194                 }
3195         }
3196
3197         vcpu_put(vcpu);
3198         printk("vm_run returns\n");
3199         print_func_exit();
3200         return 0;
3201 }
3202
3203 static int litevm_dev_ioctl_get_regs(struct litevm *litevm,
3204                                                                          struct litevm_regs *regs)
3205 {
3206         print_func_entry();
3207         struct litevm_vcpu *vcpu;
3208
3209         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3210                 print_func_exit();
3211                 return -EINVAL;
3212         }
3213
3214         vcpu = vcpu_load(litevm, regs->vcpu);
3215         if (!vcpu) {
3216                 print_func_exit();
3217                 return -ENOENT;
3218         }
3219
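        /*
         * General-purpose registers are cached in vcpu->regs across exits;
         * rsp, rip and rflags live in the VMCS guest-state area and are read
         * directly from it.
         */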
3220         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3221         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3222         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3223         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3224         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3225         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3226         regs->rsp = vmcs_readl(GUEST_RSP);
3227         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3228 #ifdef __x86_64__
3229         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3230         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3231         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3232         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3233         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3234         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3235         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3236         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3237 #endif
3238
3239         regs->rip = vmcs_readl(GUEST_RIP);
3240         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3241
3242         /*
3243          * Don't leak debug flags in case they were set for guest debugging
3244          */
3245         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3246                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3247
3248         vcpu_put(vcpu);
3249
3250         print_func_exit();
3251         return 0;
3252 }
3253
3254 static int litevm_dev_ioctl_set_regs(struct litevm *litevm,
3255                                                                          struct litevm_regs *regs)
3256 {
3257         print_func_entry();
3258         struct litevm_vcpu *vcpu;
3259
3260         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3261                 print_func_exit();
3262                 return -EINVAL;
3263         }
3264
3265         vcpu = vcpu_load(litevm, regs->vcpu);
3266         if (!vcpu) {
3267                 print_func_exit();
3268                 return -ENOENT;
3269         }
3270
3271         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3272         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3273         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3274         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3275         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3276         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3277         vmcs_writel(GUEST_RSP, regs->rsp);
3278         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3279 #ifdef __x86_64__
3280         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3281         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3282         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3283         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3284         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3285         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3286         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3287         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3288 #endif
3289
3290         vmcs_writel(GUEST_RIP, regs->rip);
3291         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3292
3293         vcpu_put(vcpu);
3294
3295         print_func_exit();
3296         return 0;
3297 }
3298
3299 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm,
3300                                                                           struct litevm_sregs *sregs)
3301 {
3302         print_func_entry();
3303         struct litevm_vcpu *vcpu;
3304
3305         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3306                 print_func_exit();
3307                 return -EINVAL;
3308         }
3309         vcpu = vcpu_load(litevm, sregs->vcpu);
3310         if (!vcpu) {
3311                 print_func_exit();
3312                 return -ENOENT;
3313         }
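/*
 * get_segment unpacks a guest segment from the VMCS: base, limit and
 * selector are read directly, and the access-rights (AR) bytes are split
 * into the individual attribute bits, with bit 16 marking an unusable
 * segment.
 */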
3314 #define get_segment(var, seg) \
3315         do { \
3316                 uint32_t ar; \
3317                 \
3318                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3319                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3320                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3321                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3322                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3323                 sregs->var.type = ar & 15; \
3324                 sregs->var.s = (ar >> 4) & 1; \
3325                 sregs->var.dpl = (ar >> 5) & 3; \
3326                 sregs->var.present = (ar >> 7) & 1; \
3327                 sregs->var.avl = (ar >> 12) & 1; \
3328                 sregs->var.l = (ar >> 13) & 1; \
3329                 sregs->var.db = (ar >> 14) & 1; \
3330                 sregs->var.g = (ar >> 15) & 1; \
3331                 sregs->var.unusable = (ar >> 16) & 1; \
3332         } while (0)
3333
3334         get_segment(cs, CS);
3335         get_segment(ds, DS);
3336         get_segment(es, ES);
3337         get_segment(fs, FS);
3338         get_segment(gs, GS);
3339         get_segment(ss, SS);
3340
3341         get_segment(tr, TR);
3342         get_segment(ldt, LDTR);
3343 #undef get_segment
3344
3345 #define get_dtable(var, table) \
3346         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3347                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3348
3349         get_dtable(idt, IDTR);
3350         get_dtable(gdt, GDTR);
3351 #undef get_dtable
3352
3353         sregs->cr0 = guest_cr0();
3354         sregs->cr2 = vcpu->cr2;
3355         sregs->cr3 = vcpu->cr3;
3356         sregs->cr4 = guest_cr4();
3357         sregs->cr8 = vcpu->cr8;
3358         sregs->efer = vcpu->shadow_efer;
3359         sregs->apic_base = vcpu->apic_base;
3360
3361         sregs->pending_int = vcpu->irq_summary != 0;
3362
3363         vcpu_put(vcpu);
3364
3365         print_func_exit();
3366         return 0;
3367 }
3368
3369 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm,
3370                                                                           struct litevm_sregs *sregs)
3371 {
3372         print_func_entry();
3373         struct litevm_vcpu *vcpu;
3374         int mmu_reset_needed = 0;
3375
3376         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3377                 print_func_exit();
3378                 return -EINVAL;
3379         }
3380         vcpu = vcpu_load(litevm, sregs->vcpu);
3381         if (!vcpu) {
3382                 print_func_exit();
3383                 return -ENOENT;
3384         }
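/*
 * set_segment is the inverse of get_segment above: it writes base, limit
 * and selector back to the VMCS and repacks the attribute bits into the
 * access-rights (AR) format, setting only the unusable bit (bit 16) when
 * the segment is marked unusable.
 */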
3385 #define set_segment(var, seg) \
3386         do { \
3387                 uint32_t ar; \
3388                 \
3389                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
3390                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
3391                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
3392                 if (sregs->var.unusable) { \
3393                         ar = (1 << 16); \
3394                 } else { \
3395                         ar = (sregs->var.type & 15); \
3396                         ar |= (sregs->var.s & 1) << 4; \
3397                         ar |= (sregs->var.dpl & 3) << 5; \
3398                         ar |= (sregs->var.present & 1) << 7; \
3399                         ar |= (sregs->var.avl & 1) << 12; \
3400                         ar |= (sregs->var.l & 1) << 13; \
3401                         ar |= (sregs->var.db & 1) << 14; \
3402                         ar |= (sregs->var.g & 1) << 15; \
3403                 } \