1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 void monitor(void *);
39
40 #define currentcpu (&per_cpu_info[core_id()])
41 #define QLOCK_init(x) {printd("qlock_init %p\n", x); qlock_init(x); printd("%p lock_inited\n", x);}
42 #define QLOCK(x) {printd("qlock %p\n", x); qlock(x); printd("%p locked\n", x);}
43 #define QUNLOCK(x) {printd("qunlock %p\n", x); qunlock(x); printd("%p unlocked\n", x);}
44 #define SPLI_irqsave(x){printd("spin_lock_init %p:", x); spinlock_init(x); printd("inited\n");}
45 #define SPLL(x){printd("spin_lock %p\n", x); spin_lock_irqsave(x); printd("%p locked\n", x);}
46 #define SPLU(x){printd("spin_unlock %p\n", x); spin_unlock(x); printd("%p unlocked\n", x);}
47 struct litevm_stat litevm_stat;
48
49 static struct litevm_stats_debugfs_item {
50         const char *name;
51         uint32_t *data;
52 } debugfs_entries[] = {
53         {"pf_fixed", &litevm_stat.pf_fixed},
54         {"pf_guest", &litevm_stat.pf_guest},
55         {"tlb_flush", &litevm_stat.tlb_flush},
56         {"invlpg", &litevm_stat.invlpg},
57         {"exits", &litevm_stat.exits},
58         {"io_exits", &litevm_stat.io_exits},
59         {"mmio_exits", &litevm_stat.mmio_exits},
60         {"signal_exits", &litevm_stat.signal_exits},
61         {"irq_exits", &litevm_stat.irq_exits},
62         {0, 0}
63 };
65
66 static struct dentry *debugfs_dir;
67
68 static const uint32_t vmx_msr_index[] = {
69 #ifdef __x86_64__
70         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
71 #endif
72         MSR_EFER,       /* XXX: upstream KVM also saves MSR_K6_STAR here */
73 };
74
75 static const char* vmx_msr_name[] = {
76 #ifdef __x86_64__
77         "MSR_SYSCALL_MASK", "MSR_LSTAR", "MSR_CSTAR", "MSR_KERNEL_GS_BASE",
78 #endif
79         "MSR_EFER",     // wtf? MSR_K6_STAR,
80 };
81
82 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
83
84 #ifdef __x86_64__
85 /*
86  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
87  * MSR save/load mechanism (works around CPU erratum AA24).
88  */
89 #define NR_BAD_MSRS 2
90 #else
91 #define NR_BAD_MSRS 0
92 #endif
93
94 #define TSS_IOPB_BASE_OFFSET 0x66
95 #define TSS_BASE_SIZE 0x68
96 #define TSS_IOPB_SIZE (65536 / 8)
97 #define TSS_REDIRECTION_SIZE (256 / 8)
98 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
99
100 #define MSR_IA32_VMX_BASIC_MSR                  0x480
101 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
102 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
103 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
104 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
105
106 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
107 #define LMSW_GUEST_MASK 0x0eULL
108 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
109 //#define CR4_VMXE 0x2000
110 #define CR8_RESEVED_BITS (~0x0fULL)
111 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
112
113 #ifdef __x86_64__
114 #define HOST_IS_64 1
115 #else
116 #define HOST_IS_64 0
117 #endif
118
119 /* Bit ops are not yet widely used in Akaros, and we're not sure where to put them. */
120 /**
121  * __ffs - find first set bit in word
122  * @word: The word to search
123  *
124  * Undefined if no bit exists, so code should check against 0 first.
125  */
126 static inline unsigned long __ffs(unsigned long word)
127 {
128         print_func_entry();
129         asm("rep; bsf %1,%0"
130             : "=r"(word) : "rm"(word));
131         print_func_exit();
132         return word;
133 }
134
135 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu,
136                                                                                         uint32_t msr)
137 {
138         print_func_entry();
139         int i;
140
141         for (i = 0; i < vcpu->nmsrs; ++i)
142                 if (vcpu->guest_msrs[i].index == msr) {
143                         print_func_exit();
144                         return &vcpu->guest_msrs[i];
145                 }
146         print_func_exit();
147         return 0;
148 }
149
150 struct descriptor_table {
151         uint16_t limit;
152         unsigned long base;
153 } __attribute__ ((packed));
154
155 static void get_gdt(struct descriptor_table *table)
156 {
157         print_func_entry();
158         asm("sgdt %0" : "=m"(*table));
159         print_func_exit();
160 }
161
162 static void get_idt(struct descriptor_table *table)
163 {
164         print_func_entry();
165         asm("sidt %0" : "=m"(*table));
166         print_func_exit();
167 }
168
169 static uint16_t read_fs(void)
170 {
171         //print_func_entry();
172         uint16_t seg;
173         asm("mov %%fs, %0":"=g"(seg));
174         //print_func_exit();
175         return seg;
176 }
177
178 static uint16_t read_gs(void)
179 {
180         //print_func_entry();
181         uint16_t seg;
182         asm("mov %%gs, %0":"=g"(seg));
183         //print_func_exit();
184         return seg;
185 }
186
187 static uint16_t read_ldt(void)
188 {
189         //print_func_entry();
190         uint16_t ldt;
191         asm("sldt %0":"=g"(ldt));
192         //print_func_exit();
193         return ldt;
194 }
195
196 static void load_fs(uint16_t sel)
197 {
198         //print_func_entry();
199         asm("mov %0, %%fs": :"g"(sel));
200         //print_func_exit();
201 }
202
203 static void load_gs(uint16_t sel)
204 {
205         //print_func_entry();
206         asm("mov %0, %%gs": :"g"(sel));
207         //print_func_exit();
208 }
209
210 #ifndef load_ldt
211 static void load_ldt(uint16_t sel)
212 {
213         //print_func_entry();
214         asm("lldt %0": :"g"(sel));
215         //print_func_exit();
216 }
217 #endif
218
219 static void fx_save(void *image)
220 {
221         //print_func_entry();
222         asm("fxsave (%0)"::"r"(image));
223         //print_func_exit();
224 }
225
226 static void fx_restore(void *image)
227 {
228         //print_func_entry();
229         asm("fxrstor (%0)"::"r"(image));
230         //print_func_exit();
231 }
232
233 static void fpu_init(void)
234 {
235         print_func_entry();
236         asm("finit");
237         print_func_exit();
238 }
239
240 struct segment_descriptor {
241         uint16_t limit_low;
242         uint16_t base_low;
243         uint8_t base_mid;
244         uint8_t type:4;
245         uint8_t system:1;
246         uint8_t dpl:2;
247         uint8_t present:1;
248         uint8_t limit_high:4;
249         uint8_t avl:1;
250         uint8_t long_mode:1;
251         uint8_t default_op:1;
252         uint8_t granularity:1;
253         uint8_t base_high;
254 } __attribute__ ((packed));
255
256 #ifdef __x86_64__
257 // LDT or TSS descriptor in the GDT. 16 bytes.
258 struct segment_descriptor_64 {
259         struct segment_descriptor s;
260         uint32_t base_higher;
261         uint32_t pad_zero;
262 };
263
264 #endif
265
266 static unsigned long segment_base(uint16_t selector)
267 {
268         print_func_entry();
269         struct descriptor_table gdt;
270         struct segment_descriptor *d;
271         unsigned long table_base;
272         typedef unsigned long ul;
273         unsigned long v;
274
275         asm("sgdt %0" : "=m"(gdt));
276         table_base = gdt.base;
277
278         if (selector & 4) {     /* from ldt */
279                 uint16_t ldt_selector;
280
281                 asm("sldt %0" : "=g"(ldt_selector));
282                 table_base = segment_base(ldt_selector);
283         }
284         d = (struct segment_descriptor *)(table_base + (selector & ~7));
285         v = d->base_low | ((ul) d->base_mid << 16) | ((ul) d->base_high << 24);
286 #ifdef __x86_64__
287         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
288                 v |= ((ul) ((struct segment_descriptor_64 *)d)->base_higher) << 32;
289 #endif
290         print_func_exit();
291         return v;
292 }
293
294 static unsigned long read_tr_base(void)
295 {
296         print_func_entry();
297         uint16_t tr;
298         asm("str %0" : "=g"(tr));
299         print_func_exit();
300         return segment_base(tr);
301 }
302
303 static void reload_tss(void)
304 {
305         print_func_entry();
306 #ifndef __x86_64__
307
308         /*
309          * VT restores TR but not its size.  Useless.
310          */
311         struct descriptor_table gdt;
312         struct segment_descriptor *descs;
313
314         get_gdt(&gdt);
315         descs = (void *)gdt.base;
316         descs[GD_TSS].type = 9; /* available TSS */
317         load_TR_desc();
318 #endif
319         print_func_exit();
320 }
321
322 static struct vmcs_descriptor {
323         int size;
324         int order;
325         uint32_t revision_id;
326 } vmcs_descriptor;
327
328 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
329 {
330         print_func_entry();
331         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
332         print_func_exit();
333         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
334 }
335
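/*
 * Copy between a guest-virtual buffer and a kernel buffer, one page at a
 * time.  Each page's gva is translated to a host-physical address through the
 * shadow MMU (gva_to_hpa); the copy stops early at the first unmapped page,
 * and the number of bytes actually copied is returned.
 */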
336 int litevm_read_guest(struct litevm_vcpu *vcpu,
337                                           gva_t addr, unsigned long size, void *dest)
338 {
339         print_func_entry();
340         unsigned char *host_buf = dest;
341         unsigned long req_size = size;
342
343         while (size) {
344                 hpa_t paddr;
345                 unsigned now;
346                 unsigned offset;
347                 hva_t guest_buf;
348
349                 paddr = gva_to_hpa(vcpu, addr);
350
351                 if (is_error_hpa(paddr))
352                         break;
353                 guest_buf = (hva_t) KADDR(paddr);
354                 offset = addr & ~PAGE_MASK;
355                 guest_buf |= offset;
356                 now = MIN(size, PAGE_SIZE - offset);
357                 memcpy(host_buf, (void *)guest_buf, now);
358                 host_buf += now;
359                 addr += now;
360                 size -= now;
361         }
362         print_func_exit();
363         return req_size - size;
364 }
365
366 int litevm_write_guest(struct litevm_vcpu *vcpu,
367                                            gva_t addr, unsigned long size, void *data)
368 {
369         print_func_entry();
370         unsigned char *host_buf = data;
371         unsigned long req_size = size;
372
373         while (size) {
374                 hpa_t paddr;
375                 unsigned now;
376                 unsigned offset;
377                 hva_t guest_buf;
378
379                 paddr = gva_to_hpa(vcpu, addr);
380
381                 if (is_error_hpa(paddr))
382                         break;
383
384                 guest_buf = (hva_t) KADDR(paddr);
385                 offset = addr & ~PAGE_MASK;
386                 guest_buf |= offset;
387                 now = MIN(size, PAGE_SIZE - offset);
388                 memcpy((void *)guest_buf, host_buf, now);
389                 host_buf += now;
390                 addr += now;
391                 size -= now;
392         }
393         print_func_exit();
394         return req_size - size;
395 }
396
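/*
 * IA32_VMX_BASIC (0x480) describes the VMCS region: bits 44:32 hold the
 * region size in bytes, and bits 30:0 hold the revision identifier that must
 * be written into the first dword of every VMCS (and the VMXON region).
 */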
397 static void setup_vmcs_descriptor(void)
398 {
399         print_func_entry();
400         uint64_t msr;
401
402         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
403         vmcs_descriptor.size = (msr >> 32) & 0x1fff;
404         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size >> PAGE_SHIFT);
405         vmcs_descriptor.revision_id = (uint32_t) msr;
406         printk("setup_vmcs_descriptor: msr 0x%x, size 0x%x order 0x%x id 0x%x\n",
407                    msr, vmcs_descriptor.size, vmcs_descriptor.order,
408                    vmcs_descriptor.revision_id);
409         print_func_exit();
410 };
411
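/*
 * VMCLEAR flushes any processor-cached state for this VMCS back to memory and
 * marks the VMCS "clear", so it can later be loaded with VMPTRLD, possibly on
 * another core.
 */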
412 static void vmcs_clear(struct vmcs *vmcs)
413 {
414         print_func_entry();
415         uint64_t phys_addr = PADDR(vmcs);
416         uint8_t error;
417         printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
418         asm volatile ("vmclear %1; setna %0":"=m" (error):"m"(phys_addr):"cc",
419                                   "memory");
420         if (error)
421                 printk("litevm: vmclear fail: %p/%llx\n", vmcs, phys_addr);
422         print_func_exit();
423 }
424
425 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
426 {
427         print_func_entry();
428         struct litevm_vcpu *vcpu = arg;
429         int cpu = core_id();
430         printd
431                 ("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n",
432                  cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
433
434         if (vcpu->cpu == cpu)
435                 vmcs_clear(vcpu->vmcs);
436
437         if (currentcpu->vmcs == vcpu->vmcs)
438                 currentcpu->vmcs = NULL;
439         print_func_exit();
440 }
441
442 static int vcpu_slot(struct litevm_vcpu *vcpu)
443 {
444         print_func_entry();
445         print_func_exit();
446         return vcpu - vcpu->litevm->vcpus;
447 }
448
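/*
 * A VMCS may be current on at most one logical processor at a time, so before
 * loading a vcpu that last ran elsewhere, the old core is asked (via IPI) to
 * VMCLEAR it; VMPTRLD then makes it the current VMCS on this core for all
 * subsequent vmread/vmwrite/vmlaunch operations.
 */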
449 /*
450  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
451  * vcpu mutex is already taken.
452  */
453 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
454 {
455         print_func_entry();
456         uint64_t phys_addr = PADDR(vcpu->vmcs);
457         int cpu;
458         cpu = core_id();
459
460         printk("__vcpu_load: vcpu->cpu %d cpu %d\n", vcpu->cpu, cpu);
461         if ((vcpu->cpu != cpu) && (vcpu->cpu != -1)){
462                 handler_wrapper_t *w;
463                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
464                 smp_call_wait(w);
465                 vcpu->launched = 0;
466         }
467
468         printk("2 ..");
469         if (currentcpu->vmcs != vcpu->vmcs) {
470                 uint8_t error;
471
472                 currentcpu->vmcs = vcpu->vmcs;
473                 asm volatile ("vmptrld %1; setna %0":"=m" (error):"m"(phys_addr):"cc");
474                 if (error) {
475                         printk("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
476                         error("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
477                 }
478         }
479
480         printk("3 ..");
481         if (vcpu->cpu != cpu) {
482                 struct descriptor_table dt;
483                 unsigned long sysenter_esp;
484
485                 vcpu->cpu = cpu;
486                 /*
487                  * Linux uses per-cpu TSS and GDT, so set these when switching
488                  * processors.
489                  */
490                 vmcs_writel(HOST_TR_BASE, read_tr_base());      /* 22.2.4 */
491                 get_gdt(&dt);
492                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
493
494                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
495                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
496         }
497         print_func_exit();
498         return vcpu;
499 }
500
501 /*
502  * Switches to specified vcpu, until a matching vcpu_put()
503  * And leaves it locked!
504  */
505 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
506 {
507         struct litevm_vcpu *ret;
508         print_func_entry();
509         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
510
511         printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
512
513         QLOCK(&vcpu->mutex);
514         printk("Locked\n");
515         if (!vcpu->vmcs) {
516                 QUNLOCK(&vcpu->mutex);
517                 printk("vcpu->vmcs for vcpu %p is NULL", vcpu);
518                 error("vcpu->vmcs is NULL");
519         }
520         ret = __vcpu_load(vcpu);
521         print_func_exit();
522         return ret;
523 }
524
525 static void vcpu_put(struct litevm_vcpu *vcpu)
526 {
527         print_func_entry();
528         //put_cpu();
529         QUNLOCK(&vcpu->mutex);
530         print_func_exit();
531 }
532
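/*
 * A VMCS region must be physically contiguous and initialized with the
 * revision identifier from IA32_VMX_BASIC before first use, hence the
 * contiguous-page allocation and the revision_id store below.
 */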
533 static struct vmcs *alloc_vmcs_cpu(int cpu)
534 {
535         print_func_entry();
536         int node = node_id();
537         struct vmcs *vmcs;
538
539         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
540         if (!vmcs) {
541                 print_func_exit();
542                 printk("no memory for vcpus");
543                 error("no memory for vcpus");
544         }
545         memset(vmcs, 0, vmcs_descriptor.size);
546         vmcs->revision_id = vmcs_descriptor.revision_id;        /* vmcs revision id */
547         print_func_exit();
548         return vmcs;
549 }
550
551 static struct vmcs *alloc_vmcs(void)
552 {
553         struct vmcs *ret;
554         print_func_entry();
555         ret = alloc_vmcs_cpu(core_id());
556         print_func_exit();
557         return ret;
558 }
559
560 static int cpu_has_litevm_support(void)
561 {
562         print_func_entry();
563         /* sigh ... qemu. */
564         char vid[16];
565         if (vendor_id(vid) < 0)
566                 return 0;
567         printk("vendor id is %s\n", vid);
568         if (vid[0] == 'Q') /* qemu */
569                 return 0;
570         if (vid[0] == 'A') /* AMD or qemu claiming to be AMD */
571                 return 0;
572         uint32_t ecx = cpuid_ecx(1);
573         print_func_exit();
574         return ecx & (1 << 5);  /* CPUID.1:ECX.VMX[bit 5] -> VT */
575 }
576
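/*
 * IA32_FEATURE_CONTROL: bit 0 is the lock bit, bit 2 enables VMXON outside
 * SMX.  (msr & 5) == 1 therefore means the BIOS locked the MSR with VMX left
 * disabled, and VMXON would #GP.
 */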
577 static int vmx_disabled_by_bios(void)
578 {
579         print_func_entry();
580         uint64_t msr;
581
582         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
583         print_func_exit();
584         return (msr & 5) == 1;  /* locked but not enabled */
585 }
586
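/*
 * Per-core VMX enable sequence: allocate a page-aligned VMXON region stamped
 * with the VMCS revision id, make sure IA32_FEATURE_CONTROL enables VMX (and
 * lock it), set CR4.VMXE, then execute VMXON with the region's physical
 * address.
 */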
587 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
588 {
589         print_func_entry();
590         int cpu = hw_core_id();
591         uint64_t phys_addr;
592         uint64_t old;
593         uint64_t status = 0;
594         currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
595                                                                                           KMALLOC_WAIT);
596         if (!currentcpu->vmxarea)
597                 return;
598         memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
599         currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
600         phys_addr = PADDR(currentcpu->vmxarea);
601         printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
602                    currentcpu->vmxarea, (void *)phys_addr);
603         if (phys_addr & 0xfff) {
604                 printk("fix vmxarea alignment!");
605         }
606         printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
607         old = read_msr(MSR_IA32_FEATURE_CONTROL);
608         printk("%d: vm_enable, old is %d\n", core_id(), old);
609         if ((old & 5) == 0) {
610                 /* enable and lock */
611                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
612                 old = read_msr(MSR_IA32_FEATURE_CONTROL);
613                 printk("%d:vm_enable, tried to set 5, old is 0x%llx\n", core_id(), old);
614         }
615         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
616         lcr4(rcr4() | CR4_VMXE);        /* FIXME: not cpu hotplug safe */
617         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
618         printk("%d:cr0 is %x\n", core_id(), rcr0());
619         lcr0(rcr0() | 0x20);
620         printk("%d:cr0 is %x\n", core_id(), rcr0());
621         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
622         outb(0x92, inb(0x92) | 2);
623         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
624         asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:":"=m" (status):"m"
625                                   (phys_addr):"memory", "cc");
626         printk("%d:vmxon status is %d\n", core_id(), status);
627         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
628         if (!status) {
629                 printk("%d:vm_enable: status says fail\n", core_id());
630         }
631         print_func_exit();
632 }
633
634 static void litevm_disable(void *garbage)
635 {
636         print_func_entry();
637         asm volatile ("vmxoff":::"cc");
638         print_func_exit();
639 }
640
641 struct litevm *vmx_open(void)
642 {
643         print_func_entry();
644         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
645         int i;
646
647         printk("vmx_open: litevm is %p\n", litevm);
648         if (!litevm) {
649                 printk("NO LITEVM! MAKES NO SENSE!\n");
650                 error("litevm alloc failed");
651                 print_func_exit();
652                 return 0;
653         }
654
655         SPLI_irqsave(&litevm->lock);
656         LIST_INIT(&litevm->link);
657         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
658                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
659                 printk("init vcpu %p\n", vcpu);
660
661                 QLOCK_init(&vcpu->mutex);
662                 vcpu->mmu.root_hpa = INVALID_PAGE;
663                 vcpu->litevm = litevm;
664                 LIST_INIT(&vcpu->link);
665         }
666         printk("vmx_open: busy %d\n", litevm->busy);
667         printk("return %p\n", litevm);
668         print_func_exit();
669         return litevm;
670 }
671
672 /*
673  * Free any memory in @free but not in @dont.
674  */
675 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
676                                                                          struct litevm_memory_slot *dont)
677 {
678         print_func_entry();
679         int i;
680
681         if (!dont || free->phys_mem != dont->phys_mem)
682                 if (free->phys_mem) {
683                         for (i = 0; i < free->npages; ++i) {
684                                 page_t *page = free->phys_mem[i];
685                                 page_decref(page);
686                                 assert(page_is_free(page2ppn(page)));
687                         }
688                         kfree(free->phys_mem);
689                 }
690
691         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
692                 kfree(free->dirty_bitmap);
693
694         free->phys_mem = 0;
695         free->npages = 0;
696         free->dirty_bitmap = 0;
697         print_func_exit();
698 }
699
700 static void litevm_free_physmem(struct litevm *litevm)
701 {
702         print_func_entry();
703         int i;
704
705         for (i = 0; i < litevm->nmemslots; ++i)
706                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
707         print_func_exit();
708 }
709
710 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
711 {
712         print_func_entry();
713         if (vcpu->vmcs) {
714                 handler_wrapper_t *w;
715                 smp_call_function_all(__vcpu_clear, vcpu, &w);
716                 smp_call_wait(w);
717                 //free_vmcs(vcpu->vmcs);
718                 vcpu->vmcs = 0;
719         }
720         print_func_exit();
721 }
722
723 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
724 {
725         print_func_entry();
726         litevm_free_vmcs(vcpu);
727         litevm_mmu_destroy(vcpu);
728         print_func_exit();
729 }
730
731 static void litevm_free_vcpus(struct litevm *litevm)
732 {
733         print_func_entry();
734         unsigned int i;
735
736         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
737                 litevm_free_vcpu(&litevm->vcpus[i]);
738         print_func_exit();
739 }
740
741 static int litevm_dev_release(struct litevm *litevm)
742 {
743         print_func_entry();
744
745         litevm_free_vcpus(litevm);
746         litevm_free_physmem(litevm);
747         kfree(litevm);
748         print_func_exit();
749         return 0;
750 }
751
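/*
 * vmread/vmwrite operate on whatever VMCS is current on this core (set by
 * vmptrld in __vcpu_load).  On failure the CPU sets flags and records a
 * numeric reason in the VM_INSTRUCTION_ERROR field.
 */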
752 unsigned long vmcs_readl(unsigned long field)
753 {
754         unsigned long value;
755
756         asm volatile ("vmread %1, %0":"=g" (value):"r"(field):"cc");
757         return value;
758 }
759
760 void vmcs_writel(unsigned long field, unsigned long value)
761 {
762         uint8_t error;
763
764         asm volatile ("vmwrite %1, %2; setna %0":"=g" (error):"r"(value),
765                                   "r"(field):"cc");
766         if (error)
767                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
768                            field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
769 }
770
771 static void vmcs_write16(unsigned long field, uint16_t value)
772 {
773         vmcs_writel(field, value);
774 }
775
776 static void vmcs_write64(unsigned long field, uint64_t value)
777 {
778         print_func_entry();
779 #ifdef __x86_64__
780         vmcs_writel(field, value);
781 #else
782         vmcs_writel(field, value);
783         asm volatile ("");
784         vmcs_writel(field + 1, value >> 32);
785 #endif
786         print_func_exit();
787 }
788
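/*
 * Queue a #GP(0) for the guest by filling in the VM-entry interruption
 * information and error-code fields; the exception is delivered on the next
 * VM entry.
 */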
789 static void inject_gp(struct litevm_vcpu *vcpu)
790 {
791         print_func_entry();
792         printd("inject_general_protection: rip 0x%lx\n", vmcs_readl(GUEST_RIP));
793         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
794         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
795                                  GP_VECTOR |
796                                  INTR_TYPE_EXCEPTION |
797                                  INTR_INFO_DELIEVER_CODE_MASK | INTR_INFO_VALID_MASK);
798         print_func_exit();
799 }
800
801 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
802 {
803         print_func_entry();
804         if (vcpu->rmode.active)
805                 vmcs_write32(EXCEPTION_BITMAP, ~0);
806         else
807                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
808         print_func_exit();
809 }
810
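/*
 * enter_pmode/enter_rmode flip the guest between emulated real mode (vm86
 * under the covers) and protected mode: the real TR and segment state saved
 * in vcpu->rmode is restored or flattened, RFLAGS.VM/IOPL are toggled, and
 * the exception bitmap is widened to ~0 in real mode so faults can be
 * emulated.
 */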
811 static void enter_pmode(struct litevm_vcpu *vcpu)
812 {
813         print_func_entry();
814         unsigned long flags;
815
816         vcpu->rmode.active = 0;
817
818         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
819         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
820         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
821
822         flags = vmcs_readl(GUEST_RFLAGS);
823         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
824         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
825         vmcs_writel(GUEST_RFLAGS, flags);
826
827         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
828                                 (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK));
829
830         update_exception_bitmap(vcpu);
831
832 #define FIX_PMODE_DATASEG(seg, save) {                          \
833                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
834                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
835                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
836                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
837         }
838
839         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
840         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
841         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
842         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
843         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
844
845         vmcs_write16(GUEST_CS_SELECTOR,
846                                  vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
847         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
848         print_func_exit();
849 }
850
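/*
 * vm86-based real-mode emulation needs a real TSS with an I/O permission
 * bitmap in guest-physical memory (the CPU consults it for port I/O while
 * RFLAGS.VM is set).  The last three pages of memory slot 0 are reserved for
 * it; init_rmode_tss() fills them in.
 */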
851 static int rmode_tss_base(struct litevm *litevm)
852 {
853         print_func_entry();
854         gfn_t base_gfn =
855                 litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
856         print_func_exit();
857         return base_gfn << PAGE_SHIFT;
858 }
859
860 static void enter_rmode(struct litevm_vcpu *vcpu)
861 {
862         print_func_entry();
863         unsigned long flags;
864
865         vcpu->rmode.active = 1;
866
867         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
868         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
869
870         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
871         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
872
873         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
874         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
875
876         flags = vmcs_readl(GUEST_RFLAGS);
877         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
878
879         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
880
881         printk("FLAGS 0x%x\n", flags);
882         vmcs_writel(GUEST_RFLAGS, flags);
883         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
884         update_exception_bitmap(vcpu);
885
886 #define FIX_RMODE_SEG(seg, save) {                                 \
887                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
888                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
889                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
890                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
891         }
892
893         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
894         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
895
896         FIX_RMODE_SEG(ES, vcpu->rmode.es);
897         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
898         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
899         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
900         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
901         print_func_exit();
902 }
903
904 static int init_rmode_tss(struct litevm *litevm)
905 {
906         print_func_entry();
907         struct page *p1, *p2, *p3;
908         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
909         char *page;
910
911         p1 = _gfn_to_page(litevm, fn++);
912         p2 = _gfn_to_page(litevm, fn++);
913         p3 = _gfn_to_page(litevm, fn);
914
915         if (!p1 || !p2 || !p3) {
916                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
917                 print_func_exit();
918                 return 0;
919         }
920
921         page = page2kva(p1);
922         memset(page, 0, PAGE_SIZE);
923         *(uint16_t *) (page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
924
925         page = page2kva(p2);
926         memset(page, 0, PAGE_SIZE);
927
928         page = page2kva(p3);
929         memset(page, 0, PAGE_SIZE);
930         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
931
932         print_func_exit();
933         return 1;
934 }
935
936 #ifdef __x86_64__
937
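/*
 * Hardware does not switch EFER for us here; instead the "IA-32e mode guest"
 * VM-entry control is kept in sync with the guest's EFER.LMA/LME, and the
 * guest EFER value itself lives in the MSR save/load area.
 */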
938 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
939 {
940         print_func_entry();
941         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
942
943         vcpu->shadow_efer = efer;
944         if (efer & EFER_LMA) {
945                 vmcs_write32(VM_ENTRY_CONTROLS,
946                                          vmcs_read32(VM_ENTRY_CONTROLS) |
947                                          VM_ENTRY_CONTROLS_IA32E_MASK);
948                 msr->data = efer;
949
950         } else {
951                 vmcs_write32(VM_ENTRY_CONTROLS,
952                                          vmcs_read32(VM_ENTRY_CONTROLS) &
953                                          ~VM_ENTRY_CONTROLS_IA32E_MASK);
954
955                 msr->data = efer & ~EFER_LME;
956         }
957         print_func_exit();
958 }
959
960 static void enter_lmode(struct litevm_vcpu *vcpu)
961 {
962         print_func_entry();
963         uint32_t guest_tr_ar;
964
965         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
966         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
967                 printd("%s: tss fixup for long mode. \n", __FUNCTION__);
968                 vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
969                                          | AR_TYPE_BUSY_64_TSS);
970         }
971
972         vcpu->shadow_efer |= EFER_LMA;
973
974         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
975         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
976                                  | VM_ENTRY_CONTROLS_IA32E_MASK);
977         print_func_exit();
978 }
979
980 static void exit_lmode(struct litevm_vcpu *vcpu)
981 {
982         print_func_entry();
983         vcpu->shadow_efer &= ~EFER_LMA;
984
985         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
986                                  & ~VM_ENTRY_CONTROLS_IA32E_MASK);
987         print_func_exit();
988 }
989
990 #endif
991
992 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
993 {
994         print_func_entry();
995         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
996                 enter_pmode(vcpu);
997
998         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
999                 enter_rmode(vcpu);
1000
1001 #ifdef __x86_64__
1002         if (vcpu->shadow_efer & EFER_LME) {
1003                 if (!is_paging() && (cr0 & CR0_PG_MASK))
1004                         enter_lmode(vcpu);
1005                 if (is_paging() && !(cr0 & CR0_PG_MASK))
1006                         exit_lmode(vcpu);
1007         }
1008 #endif
1009
1010         vmcs_writel(CR0_READ_SHADOW, cr0);
1011         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
1012         print_func_exit();
1013 }
1014
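/*
 * Under PAE paging (and outside long mode) the four PDPTEs are loaded from
 * the 32-byte-aligned table that CR3 points to, and the load faults if a
 * present PDPTE has reserved bits set.  Emulate that check against the
 * guest's memory before letting a CR0/CR3/CR4 change through.
 */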
1015 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
1016                                                                                  unsigned long cr3)
1017 {
1018         print_func_entry();
1019         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
1020         unsigned offset = (cr3 & (PAGE_SIZE - 1)) >> 5;
1021         int i;
1022         uint64_t pdpte;
1023         uint64_t *pdpt;
1024         struct litevm_memory_slot *memslot;
1025
1026         SPLL(&vcpu->litevm->lock);
1027         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
1028         /* FIXME: !memslot - emulate? 0xff? */
1029         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
1030
1031         for (i = 0; i < 4; ++i) {
1032                 pdpte = pdpt[offset + i];
1033                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
1034                         break;
1035         }
1036
1037         SPLU(&vcpu->litevm->lock);
1038
1039         print_func_exit();
1040         return i != 4;
1041 }
1042
1043 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1044 {
1045         print_func_entry();
1046         if (cr0 & CR0_RESEVED_BITS) {
1047                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, guest_cr0());
1048                 inject_gp(vcpu);
1049                 print_func_exit();
1050                 return;
1051         }
1052
1053         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
1054                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
1055                 inject_gp(vcpu);
1056                 print_func_exit();
1057                 return;
1058         }
1059
1060         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
1061                 printd("set_cr0: #GP, set PG flag " "and a clear PE flag\n");
1062                 inject_gp(vcpu);
1063                 print_func_exit();
1064                 return;
1065         }
1066
1067         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
1068 #ifdef __x86_64__
1069                 if ((vcpu->shadow_efer & EFER_LME)) {
1070                         uint32_t guest_cs_ar;
1071                         if (!is_pae()) {
1072                                 printd("set_cr0: #GP, start paging "
1073                                            "in long mode while PAE is disabled\n");
1074                                 inject_gp(vcpu);
1075                                 print_func_exit();
1076                                 return;
1077                         }
1078                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1079                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
1080                                 printd("set_cr0: #GP, start paging "
1081                                            "in long mode while CS.L == 1\n");
1082                                 inject_gp(vcpu);
1083                                 print_func_exit();
1084                                 return;
1085
1086                         }
1087                 } else
1088 #endif
1089                 if (is_pae() && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1090                         printd("set_cr0: #GP, pdptrs " "reserved bits\n");
1091                         inject_gp(vcpu);
1092                         print_func_exit();
1093                         return;
1094                 }
1095
1096         }
1097
1098         __set_cr0(vcpu, cr0);
1099         litevm_mmu_reset_context(vcpu);
1100         print_func_exit();
1101         return;
1102 }
1103
1104 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
1105 {
1106         print_func_entry();
1107         unsigned long cr0 = guest_cr0();
1108
1109         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
1110                 enter_pmode(vcpu);
1111                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
1112
1113         } else
1114                 printd("lmsw: unexpected\n");
1115
1116         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
1117                                 | (msw & LMSW_GUEST_MASK));
1118         print_func_exit();
1119 }
1120
1121 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1122 {
1123         print_func_entry();
1124         vmcs_writel(CR4_READ_SHADOW, cr4);
1125         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1126                                                                   LITEVM_RMODE_VM_CR4_ALWAYS_ON :
1127                                                                   LITEVM_PMODE_VM_CR4_ALWAYS_ON));
1128         print_func_exit();
1129 }
1130
1131 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1132 {
1133         print_func_entry();
1134         if (cr4 & CR4_RESEVED_BITS) {
1135                 printd("set_cr4: #GP, reserved bits\n");
1136                 inject_gp(vcpu);
1137                 print_func_exit();
1138                 return;
1139         }
1140
1141         if (is_long_mode()) {
1142                 if (!(cr4 & CR4_PAE_MASK)) {
1143                         printd("set_cr4: #GP, clearing PAE while " "in long mode\n");
1144                         inject_gp(vcpu);
1145                         print_func_exit();
1146                         return;
1147                 }
1148         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
1149                            && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1150                 printd("set_cr4: #GP, pdptrs reserved bits\n");
1151                 inject_gp(vcpu);
1152         }
1153
1154         if (cr4 & CR4_VMXE_MASK) {
1155                 printd("set_cr4: #GP, setting VMXE\n");
1156                 inject_gp(vcpu);
1157                 print_func_exit();
1158                 return;
1159         }
1160         __set_cr4(vcpu, cr4);
1161         SPLL(&vcpu->litevm->lock);
1162         litevm_mmu_reset_context(vcpu);
1163         SPLU(&vcpu->litevm->lock);
1164         print_func_exit();
1165 }
1166
1167 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1168 {
1169         print_func_entry();
1170         if (is_long_mode()) {
1171                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
1172                         printd("set_cr3: #GP, reserved bits\n");
1173                         inject_gp(vcpu);
1174                         print_func_exit();
1175                         return;
1176                 }
1177         } else {
1178                 if (cr3 & CR3_RESEVED_BITS) {
1179                         printd("set_cr3: #GP, reserved bits\n");
1180                         inject_gp(vcpu);
1181                         print_func_exit();
1182                         return;
1183                 }
1184                 if (is_paging() && is_pae() && pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1185                         printd("set_cr3: #GP, pdptrs " "reserved bits\n");
1186                         inject_gp(vcpu);
1187                         print_func_exit();
1188                         return;
1189                 }
1190         }
1191
1192         vcpu->cr3 = cr3;
1193         SPLL(&vcpu->litevm->lock);
1194         vcpu->mmu.new_cr3(vcpu);
1195         SPLU(&vcpu->litevm->lock);
1196         print_func_exit();
1197 }
1198
1199 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1200 {
1201         print_func_entry();
1202         if (cr8 & CR8_RESEVED_BITS) {
1203                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1204                 inject_gp(vcpu);
1205                 print_func_exit();
1206                 return;
1207         }
1208         vcpu->cr8 = cr8;
1209         print_func_exit();
1210 }
1211
1212 static uint32_t get_rdx_init_val(void)
1213 {
1214         print_func_entry();
1215         uint32_t val;
1216
1217         asm("movl $1, %%eax \n\t" "movl %%eax, %0 \n\t" : "=g"(val));
1218         print_func_exit();
1219         return val;
1220
1221 }
1222
1223 static void fx_init(struct litevm_vcpu *vcpu)
1224 {
1225         print_func_entry();
1226         struct __attribute__ ((__packed__)) fx_image_s {
1227                 uint16_t control;               //fcw
1228                 uint16_t status;                //fsw
1229                 uint16_t tag;                   // ftw
1230                 uint16_t opcode;                //fop
1231                 uint64_t ip;                    // fpu ip
1232                 uint64_t operand;               // fpu dp
1233                 uint32_t mxcsr;
1234                 uint32_t mxcsr_mask;
1235
1236         } *fx_image;
1237
1238         fx_save(vcpu->host_fx_image);
1239         fpu_init();
1240         fx_save(vcpu->guest_fx_image);
1241         fx_restore(vcpu->host_fx_image);
1242
1243         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1244         fx_image->mxcsr = 0x1f80;
1245         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1246                    0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1247         print_func_exit();
1248 }
1249
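/*
 * The VMX capability MSRs encode which control bits are flexible: a bit set
 * in the low 32 bits must be 1 in the control field, and a bit clear in the
 * high 32 bits must be 0.  OR-ing in the low word and AND-ing with the high
 * word below clamps the requested value into the allowed range.
 */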
1250 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field,
1251                                                                    uint32_t val)
1252 {
1253         uint32_t msr_high, msr_low;
1254         uint64_t msrval;
1255
1256         msrval = read_msr(msr);
1257         msr_low = msrval;
1258         msr_high = (msrval >> 32);
1259
1260         val &= msr_high;
1261         val |= msr_low;
1262         vmcs_write32(vmcs_field, val);
1263 }
1264
1265 /*
1266  * Sets up the vmcs for emulated real mode.
1267  */
1268 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1269 {
1270         print_func_entry();
1271
1272 /* no op on x86_64 */
1273 #define asmlinkage
1274         extern asmlinkage void litevm_vmx_return(void);
1275         uint32_t host_sysenter_cs;
1276         uint32_t junk;
1277         uint64_t a;
1278         struct descriptor_table dt;
1279         int i;
1280         int ret;
1281         uint64_t tsc;
1282         int nr_good_msrs;
1283
1284         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1285         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1286         vcpu->cr8 = 0;
1287         vcpu->apic_base = 0xfee00000 |
1288                 /*for vcpu 0 */ MSR_IA32_APICBASE_BSP |
1289                 MSR_IA32_APICBASE_ENABLE;
1290
1291         fx_init(vcpu);
1292
1293 #define SEG_SETUP(seg) do {                                     \
1294                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1295                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1296                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1297                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1298         } while (0)
1299
1300         /*
1301          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1302          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1303          */
1304         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1305         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1306         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1307         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1308
1309         SEG_SETUP(DS);
1310         SEG_SETUP(ES);
1311         SEG_SETUP(FS);
1312         SEG_SETUP(GS);
1313         SEG_SETUP(SS);
1314
1315         vmcs_write16(GUEST_TR_SELECTOR, 0);
1316         vmcs_writel(GUEST_TR_BASE, 0);
1317         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1318         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1319
1320         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1321         vmcs_writel(GUEST_LDTR_BASE, 0);
1322         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1323         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1324
1325         vmcs_write32(GUEST_SYSENTER_CS, 0);
1326         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1327         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1328
1329         vmcs_writel(GUEST_RFLAGS, 0x02);
1330         vmcs_writel(GUEST_RIP, 0xfff0);
1331         vmcs_writel(GUEST_RSP, 0);
1332
1333         vmcs_writel(GUEST_CR3, 0);
1334
1335         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1336         vmcs_writel(GUEST_DR7, 0x400);
1337
1338         vmcs_writel(GUEST_GDTR_BASE, 0);
1339         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1340
1341         vmcs_writel(GUEST_IDTR_BASE, 0);
1342         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1343
1344         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1345         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1346         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1347
1348         /* I/O */
1349         vmcs_write64(IO_BITMAP_A, 0);
1350         vmcs_write64(IO_BITMAP_B, 0);
1351
1352         tsc = read_tsc();
1353         vmcs_write64(TSC_OFFSET, -tsc);
1354
1355         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1356
1357         /* Special registers */
1358         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1359
1360         /* Control */
1361         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_EXT_INTR_MASK       /* 20.6.1 */
1362                                                    | PIN_BASED_NMI_EXITING      /* 20.6.1 */
1363                 );
1364         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_HLT_EXITING        /* 20.6.2 */
1365                                                    | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1366                                                    | CPU_BASED_CR8_STORE_EXITING        /* 20.6.2 */
1367                                                    | CPU_BASED_UNCOND_IO_EXITING        /* 20.6.2 */
1368                                                    | CPU_BASED_INVDPG_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING  /* 21.3 */
1369                 );
1370
1371         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1372         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1373         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1374         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1375
1376         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1377         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1378         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1379
1380         vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
1381         vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
1382         vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
1383         vmcs_write16(HOST_FS_SELECTOR, read_fs());      /* 22.2.4 */
1384         vmcs_write16(HOST_GS_SELECTOR, read_gs());      /* 22.2.4 */
1385         vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
1386
1387 #ifdef __x86_64__
1388         a = read_msr(MSR_FS_BASE);
1389         vmcs_writel(HOST_FS_BASE, a);   /* 22.2.4 */
1390         a = read_msr(MSR_GS_BASE);
1391         vmcs_writel(HOST_GS_BASE, a);   /* 22.2.4 */
1392 #else
1393         vmcs_writel(HOST_FS_BASE, 0);   /* 22.2.4 */
1394         vmcs_writel(HOST_GS_BASE, 0);   /* 22.2.4 */
1395 #endif
1396
1397         vmcs_write16(HOST_TR_SELECTOR, GD_TSS * 8);     /* 22.2.4 */
1398
1399         get_idt(&dt);
1400         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1401
1402         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return);        /* 22.2.5 */
1403
1404         /* IA32_SYSENTER_CS is a 32-bit value held in the low half of the MSR */
1405         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS);
1406         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1407         a = read_msr(MSR_IA32_SYSENTER_ESP);
1408         vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1409         a = read_msr(MSR_IA32_SYSENTER_EIP);
1410         vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1411
1412         ret = -ENOMEM;
1413         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1414         if (!vcpu->guest_msrs)
1415                 error("guest_msrs kmalloc failed");
1416         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1417         if (!vcpu->host_msrs)
1418                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1419
1420         for (i = 0; i < NR_VMX_MSR; ++i) {
1421                 uint32_t index = vmx_msr_index[i];
1422                 uint32_t data_low, data_high;
1423                 uint64_t data;
1424                 int j = vcpu->nmsrs;
1425
1426 #warning "need readmsr_safe"
1427 //      if (rdmsr_safe(index, &data_low, &data_high) < 0)
1428 //          continue;
1429                 data = read_msr(index);
1430                 vcpu->host_msrs[j].index = index;
1431                 vcpu->host_msrs[j].reserved = 0;
1432                 vcpu->host_msrs[j].data = data;
1433                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1434                 ++vcpu->nmsrs;
1435         }
1436         printk("msrs: %d\n", vcpu->nmsrs);
1437
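        /* The MSR save/load areas are arrays of struct vmx_msr_entry (index,
         * reserved, data): VM exit stores guest MSRs into the store area and
         * loads host MSRs from the exit-load area; VM entry loads guest MSRs
         * from the entry-load area.  The first NR_BAD_MSRS entries are skipped
         * here (erratum AA24, see above) and are presumably switched by hand
         * around guest entry/exit. */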
1438         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1439         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1440         vmcs_writel(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1441         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1442         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, (HOST_IS_64 << 9));        /* 22.2.1, 20.7.1 */
1443         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs);    /* 22.2.2 */
1444         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);     /* 22.2.2 */
1445         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs);    /* 22.2.2 */
1446
1447         /* 22.2.1, 20.8.1 */
1448         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CONTROLS, 0);
1449         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1450
1451         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1452         vmcs_writel(TPR_THRESHOLD, 0);
1453
1454         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1455         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1456
1457         __set_cr0(vcpu, 0x60000010);    // enter rmode
1458         __set_cr4(vcpu, 0);
1459 #ifdef __x86_64__
1460         __set_efer(vcpu, 0);
1461 #endif
1462
1463         ret = litevm_mmu_init(vcpu);
1464
1465         print_func_exit();
1466         return ret;
1467
1468 out_free_guest_msrs:
1469         kfree(vcpu->guest_msrs);
1470 out:
1471         return ret;
1472 }
1473
1474 /*
1475  * Sync the rsp and rip registers into the vcpu structure.  This allows
1476  * registers to be accessed by indexing vcpu->regs.
1477  */
1478 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1479 {
1480         print_func_entry();
1481         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1482         vcpu->rip = vmcs_readl(GUEST_RIP);
1483         print_func_exit();
1484 }
1485
1486 /*
1487  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1488  * modification.
1489  */
1490 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1491 {
1492         print_func_entry();
1493         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1494         vmcs_writel(GUEST_RIP, vcpu->rip);
1495         print_func_exit();
1496 }
1497
1498 /*
1499  * Creates some virtual cpus.  Good luck creating more than one.
1500  */
1501 int vmx_create_vcpu(struct litevm *litevm, int n)
1502 {
1503         print_func_entry();
1504         ERRSTACK(2);
1505         int r;
1506         struct litevm_vcpu *vcpu;
1507         struct vmcs *vmcs;
1508         char *errstring = NULL;
1509
1510         if (n < 0 || n >= LITEVM_MAX_VCPUS) {
1511                 printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1512                            LITEVM_MAX_VCPUS);
1513                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1514                           LITEVM_MAX_VCPUS);
1515         }
1516         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1517         vcpu = &litevm->vcpus[n];
1518
1519         printk("vmx_create_vcpu: @%d, %p\n", n, vcpu);
1520         QLOCK(&vcpu->mutex);
1521
1522         if (vcpu->vmcs) {
1523                 QUNLOCK(&vcpu->mutex);
1524                 printk("VM already exists\n");
1525                 error("VM already exists");
1526         }
1527         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1528         /* I'm a bad person */
1529         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
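        /* fxsave/fxrstor need an aligned save area, so round fx_buf up to
         * FX_IMAGE_ALIGN by hand and carve two back-to-back images out of it:
         * one for host state, one for guest state. */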
1530         uint64_t a = (uint64_t) vcpu->fx_buf;
1531         a += FX_IMAGE_ALIGN - 1;
1532         a /= FX_IMAGE_ALIGN;
1533         a *= FX_IMAGE_ALIGN;
1534
1535         vcpu->host_fx_image = (char *)a;
1536         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1537
1538         vcpu->cpu = -1; /* First load will set up TR */
1539         vcpu->litevm = litevm;
1540         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1541         if (waserror()){
1542                 printk("ERR 1 in %s, %s\n", __func__, current_errstr());
1543                 QUNLOCK(&vcpu->mutex);
1544                 litevm_free_vcpu(vcpu);
1545                 nexterror();
1546         }
1547         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1548         vmcs = alloc_vmcs();
1549         vmcs_clear(vmcs);
1550         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1551         printk("after vmcs_clear\n");
1552         vcpu->vmcs = vmcs;
1553         printk("vcpu %p set vmcs to %p\n", vcpu, vmcs);
1554         vcpu->launched = 0;
1555         printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
1556
1557         __vcpu_load(vcpu);
1558
1559         printk("PAST vcpu_load\n");
1560         if (waserror()) {
1561                 /* we really need to fix waserror() */
1562                 printk("vcpu_setup failed: %s\n", current_errstr());
1563                 QUNLOCK(&vcpu->mutex);
1564                 nexterror();
1565         }
1566
1567         /* need memory for the rmode_tss. I have no idea how this happened
1568          * originally in kvm.
1569          */
1570         /* this sucks. */
1571         QUNLOCK(&vcpu->mutex);
1572         void *v;
1573         struct litevm_memory_region vmr;
1574         vmr.slot = 0;
1575         vmr.flags = 0;
1576         vmr.guest_phys_addr = /* guess. */ 0x1000000;
1577         vmr.memory_size = 0x10000;
1578         vmr.init_data = NULL;
1579         if (vm_set_memory_region(litevm, &vmr))
1580                 printk("vm_set_memory_region failed");
1581
1582         printk("set memory region done\n");
1583
1584         if (!init_rmode_tss(litevm)) {
1585                 error("vcpu_setup: init_rmode_tss failed");
1586         }
1587
1588
1589         QLOCK(&vcpu->mutex);
1590         r = litevm_vcpu_setup(vcpu);
1591
1592         vcpu_put(vcpu);
1593
1594         printk("r is %d\n", r);
1595
1596         poperror();     /* pop both waserror() frames pushed above */
1597         poperror();
1598         if (!r) {
1599                 print_func_exit();
1600                 return 0;
1601         }
1602
1603         errstring = "vcpu setup failed";
1604 out_free_vcpus:
1605 out:
1606         print_func_exit();
1607         return r;
1608 }
1609
1610 /*
1611  * Allocate some memory and give it an address in the guest physical address
1612  * space.
1613  *
1614  * Discontiguous memory is allowed, mostly for framebuffers.
1615  */
1616 int vm_set_memory_region(struct litevm *litevm,
1617                                                  struct litevm_memory_region *mem)
1618 {
1619         print_func_entry();
1620         ERRSTACK(2);
1621         int r;
1622         gfn_t base_gfn;
1623         unsigned long npages;
1624         unsigned long i;
1625         struct litevm_memory_slot *memslot;
1626         struct litevm_memory_slot old, new;
1627         int memory_config_version;
1628         void *init_data = mem->init_data;
1629         int pass = 1;
1630         printk("%s: slot %d base 0x%llx size 0x%llx\n",
1631                __func__, mem->slot,
1632                mem->guest_phys_addr,
1633                mem->memory_size);
1634         /* should not happen but ... */
1635         if (!litevm)
1636                 error("NULL litevm in %s", __func__);
1637
1638         if (!mem)
1639                 error("NULL mem in %s", __func__);
1640         /* I don't care right now. *
1641         if (litevm->busy)
1642                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1643         */
1644         r = -EINVAL;
1645         /* General sanity checks */
1646         if (mem->memory_size & (PAGE_SIZE - 1))
1647                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1648         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1649                 error("guest_phys_addr 0x%llx is not page-aligned",
1650                           mem->guest_phys_addr);
1651         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1652                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1653         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1654                 error("0x%llx + 0x%llx overflows (sum is < 0x%llx)",
1655                           mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1656
1657         memslot = &litevm->memslots[mem->slot];
1658         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1659         npages = mem->memory_size >> PAGE_SHIFT;
1660
1661         if (!npages)
1662                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1663
1664         /* This is actually a very tricky for loop.  The use of error() is a
1665          * bit dangerous, so we don't use it much.  Consider a rewrite.  It
1666          * would be nice if akaros could do the allocation of a bunch of
1667          * pages for us.
1668          */
1669 raced:
1670         printk("raced: pass %d\n", pass);
1671         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1672         /* debugging aid: drop into the kernel monitor before taking the lock */
1673         monitor(NULL);
1674         SPLL(&litevm->lock);
1675         printk("locked\n");
1676
1677         if (waserror()) {
1678                 printk("error in %s, %s\n", __func__, current_errstr());
1679                 SPLU(&litevm->lock);
1680                 nexterror();
1681         }
1682
1683         memory_config_version = litevm->memory_config_version;
1684         new = old = *memslot;
1685         printk("memory_config_version %d\n", memory_config_version);
1686
1687         new.base_gfn = base_gfn;
1688         new.npages = npages;
1689         new.flags = mem->flags;
1690
1691         /* Disallow changing a memory slot's size. */
1692         r = -EINVAL;
1693         if (npages && old.npages && npages != old.npages)
1694                 error("npages is %lu, old.npages is %lu, can't change",
1695                           npages, old.npages);
1696
1697         /* Check for overlaps */
1698         r = -EEXIST;
1699         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1700                 struct litevm_memory_slot *s = &litevm->memslots[i];
1701                 printk("Region %lu: base gfn 0x%lx npages %lu\n",
1702                        i, s->base_gfn, s->npages);
1702                 if (s == memslot)
1703                         continue;
1704                 if (!((base_gfn + npages <= s->base_gfn) ||
1705                           (base_gfn >= s->base_gfn + s->npages)))
1706                         error("Overlap");
1707         }
1708         /*
1709          * Do memory allocations outside lock.  memory_config_version will
1710          * detect any races.
1711          */
1712         SPLU(&litevm->lock);
1713         printk("unlocked\n");
1714         poperror();
1715
1716         /* Deallocate if slot is being removed */
1717         if (!npages)
1718                 new.phys_mem = 0;
1719
1720         /* Free page dirty bitmap if unneeded */
1721         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1722                 new.dirty_bitmap = 0;
1723
1724         r = -ENOMEM;
1725
1726         /* Allocate if a slot is being created */
1727         if (npages && !new.phys_mem) {
1728                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1729
1730                 if (!new.phys_mem)
1731                         goto out_free;
1732
1733                 for (i = 0; i < npages; ++i) {
1734                         int ret;
1735                         ret = kpage_alloc(&new.phys_mem[i]);
1736                         printk("PAGEALLOC: va %p pa %p\n",
1737                                page2kva(new.phys_mem[i]), page2pa(new.phys_mem[i]));
1737                         if (ret != ESUCCESS)
1738                                 goto out_free;
1739                         if (init_data) {
1740                                 printk("init data memcpy(%p,%p,4096);\n",
1741                                            page2kva(new.phys_mem[i]), init_data);
1742                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1743                                 init_data += PAGE_SIZE;
1744                         } else {
1745                                 int j;
1746                                 //memset(page2kva(new.phys_mem[i]), 0xf4 /* hlt */, PAGE_SIZE);
1747                                 uint8_t *cp = page2kva(new.phys_mem[i]);
1748                                 memset(cp, 0, PAGE_SIZE);
1749                                 if (base_gfn < 0x100000) {
1750                                         for (j = 0; j < PAGE_SIZE; j += 2) {
1751                                                 // XORL %EAX, %EAX
1752                                                 cp[j] = 0x31; cp[j+1] = 0xc0;
1753                                         }
1754                                         // 1: jmp 1b
1755                                         cp[4094] = 0xeb;
1756                                         cp[4095] = 0xfe;
1757                                 }
1758                                 /* init_data is NULL on this path; don't advance it, or
1759                                  * the next page would memcpy() from a bogus pointer. */
1760                         }
1761                 }
1762         }
1763
1764         /* Allocate page dirty bitmap if needed */
1765         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1766                 /* round npages up to a multiple of BITS_PER_LONG, then bits -> bytes */
1767                 unsigned dirty_bytes =
1768                         (((npages + BITS_PER_LONG - 1) / BITS_PER_LONG) *
1769                          BITS_PER_LONG) / 8;
1770
1771                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1772                 if (!new.dirty_bitmap) {
1773                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1774                         goto out_free;
1775                 }
1776         }
1777
1778         SPLL(&litevm->lock);
1779         printk("locked\n");
1780         if (memory_config_version != litevm->memory_config_version) {
1781                 SPLU(&litevm->lock);
1782                 printk("unlocked, try again\n");
1783                 litevm_free_physmem_slot(&new, &old);
1784                 pass++;
1785                 goto raced;
1785         }
1786
1787         r = -EAGAIN;
1788         if (litevm->busy) {
1789                 printk("BUSY!\n");
1790                 goto out_unlock;
1791         }
1792
1793         if (mem->slot >= litevm->nmemslots)
1794                 litevm->nmemslots = mem->slot + 1;
1795
1796         *memslot = new;
1797         ++litevm->memory_config_version;
1798
1799         SPLU(&litevm->lock);
1800         printk("unlocked\n");
1801         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1802                 struct litevm_vcpu *vcpu;
1803
1804                 vcpu = vcpu_load(litevm, i);
1805                 if (!vcpu){
1806                         printk("%s: no cpu %d\n", __func__, i);
1807                         continue;
1808                 }
1809                 litevm_mmu_reset_context(vcpu);
1810                 vcpu_put(vcpu);
1811         }
1812
1813         litevm_free_physmem_slot(&old, &new);
1814         print_func_exit();
1815         return 0;
1816
1817 out_unlock:
1818         SPLU(&litevm->lock);
1819         printk("out_unlock\n");
1820 out_free:
1821         printk("out_free\n");
1822         litevm_free_physmem_slot(&new, &old);
1823 out:
1824         printk("vm_set_memory_region: return %d\n", r);
1825         print_func_exit();
1826         return r;
1827 }
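/* Illustrative sketch only (not compiled): how a caller might load a guest
 * image into slot 0 with vm_set_memory_region().  The function name and the
 * "image"/"image_size" parameters are hypothetical; vmx_create_vcpu() above
 * does essentially the same thing with init_data == NULL.
 */
#if 0
static int example_load_guest(struct litevm *litevm, void *image,
                              unsigned long image_size)
{
        struct litevm_memory_region mr;

        mr.slot = 0;
        mr.flags = 0;
        mr.guest_phys_addr = 0;         /* guest-physical base of the slot */
        /* round the size up to a whole number of pages */
        mr.memory_size = (image_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1UL);
        mr.init_data = image;           /* copied into the slot page by page */
        return vm_set_memory_region(litevm, &mr);
}
#endif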
1828
1829 #if 0
1830 /*
1831  * Get (and clear) the dirty memory log for a memory slot.
1832  */
1833 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1834                                                                                   struct litevm_dirty_log *log)
1835 {
1836         struct litevm_memory_slot *memslot;
1837         int r, i;
1838         int n;
1839         unsigned long any = 0;
1840
1841         SPLL(&litevm->lock);
1842
1843         /*
1844          * Prevent changes to guest memory configuration even while the lock
1845          * is not taken.
1846          */
1847         ++litevm->busy;
1848         SPLU(&litevm->lock);
1849         r = -EINVAL;
1850         if (log->slot >= LITEVM_MEMORY_SLOTS)
1851                 goto out;
1852
1853         memslot = &litevm->memslots[log->slot];
1854         r = -ENOENT;
1855         if (!memslot->dirty_bitmap)
1856                 goto out;
1857
1858         n = ALIGN(memslot->npages, 8) / 8;
1859
1860         for (i = 0; !any && i < n; ++i)
1861                 any = memslot->dirty_bitmap[i];
1862
1863         r = -EFAULT;
1864         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1865                 goto out;
1866
1867         if (any) {
1868                 SPLL(&litevm->lock);
1869                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1870                 SPLU(&litevm->lock);
1871                 memset(memslot->dirty_bitmap, 0, n);
1872                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1873                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1874
1875                         if (!vcpu)
1876                                 continue;
1877                         flush_guest_tlb(vcpu);
1878                         vcpu_put(vcpu);
1879                 }
1880         }
1881
1882         r = 0;
1883
1884 out:
1885         SPLL(&litevm->lock);
1886         --litevm->busy;
1887         SPLU(&litevm->lock);
1888         return r;
1889 }
1890 #endif
1891
1892 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1893 {
1894         print_func_entry();
1895         int i;
1896
1897         printk("%s: litevm %p gfn 0x%lx\n", __func__, litevm, gfn);
1898         for (i = 0; i < litevm->nmemslots; ++i) {
1899                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1900
1901                 printk("%s: slot %d gfn 0x%lx base_gfn 0x%lx npages 0x%lx\n",
1902                        __func__, i, gfn, memslot->base_gfn, memslot->npages);
1903                 if (gfn >= memslot->base_gfn
1904                         && gfn < memslot->base_gfn + memslot->npages) {
1905                         print_func_exit();
1906                         return memslot;
1907                 }
1908         }
1909         print_func_exit();
1910         return 0;
1911 }
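/* Illustrative sketch only (not compiled): one way to turn a guest-physical
 * address into a host kernel virtual address, using the same
 * gfn_to_memslot()/gfn_to_page()/page2kva() pattern that emulator_read_std()
 * uses below.  The helper name is hypothetical.
 */
#if 0
static void *example_gpa_to_host_kva(struct litevm *litevm, gpa_t gpa)
{
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct litevm_memory_slot *memslot = gfn_to_memslot(litevm, gfn);

        if (!memslot)
                return NULL;    /* gpa is not backed by any memory slot */
        /* page2kva() gives the host mapping of the backing page; add the
         * offset within the page. */
        return page2kva(gfn_to_page(memslot, gfn)) + (gpa & (PAGE_SIZE - 1));
}
#endif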
1912
1913 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1914 {
1915         print_func_entry();
1916         int i;
1917         struct litevm_memory_slot *memslot = 0;
1918         unsigned long rel_gfn;
1919
1920         for (i = 0; i < litevm->nmemslots; ++i) {
1921                 memslot = &litevm->memslots[i];
1922
1923                 if (gfn >= memslot->base_gfn
1924                         && gfn < memslot->base_gfn + memslot->npages) {
1925
1926                         if (!memslot || !memslot->dirty_bitmap) {
1927                                 print_func_exit();
1928                                 return;
1929                         }
1930
1931                         rel_gfn = gfn - memslot->base_gfn;
1932
1933                         /* avoid RMW */
1934                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1935                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1936                         print_func_exit();
1937                         return;
1938                 }
1939         }
1940         print_func_exit();
1941 }
1942
1943 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1944 {
1945         print_func_entry();
1946         unsigned long rip;
1947         uint32_t interruptibility;
1948
1949         rip = vmcs_readl(GUEST_RIP);
1950         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1951         vmcs_writel(GUEST_RIP, rip);
1952
1953         /*
1954          * We emulated an instruction, so temporary interrupt blocking
1955          * should be removed, if set.
1956          */
1957         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1958         if (interruptibility & 3)
1959                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3);
1960         print_func_exit();
1961 }
1962
1963 static int emulator_read_std(unsigned long addr,
1964                                                          unsigned long *val,
1965                                                          unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1966 {
1967         print_func_entry();
1968         struct litevm_vcpu *vcpu = ctxt->vcpu;
1969         void *data = val;
1970
1971         while (bytes) {
1972                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1973                 unsigned offset = addr & (PAGE_SIZE - 1);
1974                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
1975                         bytes : (unsigned)PAGE_SIZE - offset;
1976                 unsigned long pfn;
1977                 struct litevm_memory_slot *memslot;
1978                 void *page;
1979
1980                 if (gpa == UNMAPPED_GVA) {
1981                         print_func_exit();
1982                         return X86EMUL_PROPAGATE_FAULT;
1983                 }
1984                 pfn = gpa >> PAGE_SHIFT;
1985                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1986                 if (!memslot) {
1987                         print_func_exit();
1988                         return X86EMUL_UNHANDLEABLE;
1989                 }
1990                 page = page2kva(gfn_to_page(memslot, pfn));
1991
1992                 memcpy(data, page + offset, tocopy);
1993
1994                 bytes -= tocopy;
1995                 data += tocopy;
1996                 addr += tocopy;
1997         }
1998
1999         print_func_exit();
2000         return X86EMUL_CONTINUE;
2001 }
2002
2003 static int emulator_write_std(unsigned long addr,
2004                                                           unsigned long val,
2005                                                           unsigned int bytes, struct x86_emulate_ctxt *ctxt)
2006 {
2007         print_func_entry();
2008         printk("emulator_write_std: addr %lx n %d\n", addr, bytes);
2009         print_func_exit();
2010         return X86EMUL_UNHANDLEABLE;
2011 }
2012
2013 static int emulator_read_emulated(unsigned long addr,
2014                                                                   unsigned long *val,
2015                                                                   unsigned int bytes,
2016                                                                   struct x86_emulate_ctxt *ctxt)
2017 {
2018         print_func_entry();
2019         struct litevm_vcpu *vcpu = ctxt->vcpu;
2020
2021         if (vcpu->mmio_read_completed) {
2022                 memcpy(val, vcpu->mmio_data, bytes);
2023                 vcpu->mmio_read_completed = 0;
2024                 print_func_exit();
2025                 return X86EMUL_CONTINUE;
2026         } else if (emulator_read_std(addr, val, bytes, ctxt)
2027                            == X86EMUL_CONTINUE) {
2028                 print_func_exit();
2029                 return X86EMUL_CONTINUE;
2030         } else {
2031                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2032                 if (gpa == UNMAPPED_GVA) {
2033                         print_func_exit();
2034                         vcpu_printf(vcpu, "not present\n");
2035                         return X86EMUL_PROPAGATE_FAULT;
2035                 }
2036                 vcpu->mmio_needed = 1;
2037                 vcpu->mmio_phys_addr = gpa;
2038                 vcpu->mmio_size = bytes;
2039                 vcpu->mmio_is_write = 0;
2040
2041                 print_func_exit();
2042                 return X86EMUL_UNHANDLEABLE;
2043         }
2044 }
2045
2046 static int emulator_write_emulated(unsigned long addr,
2047                                                                    unsigned long val,
2048                                                                    unsigned int bytes,
2049                                                                    struct x86_emulate_ctxt *ctxt)
2050 {
2051         print_func_entry();
2052         struct litevm_vcpu *vcpu = ctxt->vcpu;
2053         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2054
2055         if (gpa == UNMAPPED_GVA) {
2056                 print_func_exit();
2057                 return X86EMUL_PROPAGATE_FAULT;
2058         }
2059
2060         vcpu->mmio_needed = 1;
2061         vcpu->mmio_phys_addr = gpa;
2062         vcpu->mmio_size = bytes;
2063         vcpu->mmio_is_write = 1;
2064         memcpy(vcpu->mmio_data, &val, bytes);
2065
2066         print_func_exit();
2067         return X86EMUL_CONTINUE;
2068 }
2069
2070 static int emulator_cmpxchg_emulated(unsigned long addr,
2071                                                                          unsigned long old,
2072                                                                          unsigned long new,
2073                                                                          unsigned int bytes,
2074                                                                          struct x86_emulate_ctxt *ctxt)
2075 {
2076         print_func_entry();
2077         static int reported;
2078
2079         if (!reported) {
2080                 reported = 1;
2081                 printk("litevm: emulating exchange as write\n");
2082         }
2083         print_func_exit();
2084         return emulator_write_emulated(addr, new, bytes, ctxt);
2085 }
2086
2087 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2088 {
2089         print_func_entry();
2090         static int reported;
2091         uint8_t opcodes[4];
2092         unsigned long rip = vmcs_readl(GUEST_RIP);
2093         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2094
2095         if (reported) {
2096                 print_func_exit();
2097                 return;
2098         }
2099
2100         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2101
2102         printk("emulation failed but !mmio_needed?"
2103                    " rip %lx %02x %02x %02x %02x\n",
2104                    rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2105         reported = 1;
2106         print_func_exit();
2107 }
2108
2109 struct x86_emulate_ops emulate_ops = {
2110         .read_std = emulator_read_std,
2111         .write_std = emulator_write_std,
2112         .read_emulated = emulator_read_emulated,
2113         .write_emulated = emulator_write_emulated,
2114         .cmpxchg_emulated = emulator_cmpxchg_emulated,
2115 };
2116
2117 enum emulation_result {
2118         EMULATE_DONE,                           /* no further processing */
2119         EMULATE_DO_MMIO,                        /* litevm_run filled with mmio request */
2120         EMULATE_FAIL,                           /* can't emulate this instruction */
2121 };
2122
2123 static int emulate_instruction(struct litevm_vcpu *vcpu,
2124                                                            struct litevm_run *run,
2125                                                            unsigned long cr2, uint16_t error_code)
2126 {
2127         print_func_entry();
2128         struct x86_emulate_ctxt emulate_ctxt;
2129         int r;
2130         uint32_t cs_ar;
2131
2132         vcpu_load_rsp_rip(vcpu);
2133
2134         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2135
2136         emulate_ctxt.vcpu = vcpu;
2137         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2138         emulate_ctxt.cr2 = cr2;
2139         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2140                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2141                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2142                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2143
2144         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2145                 emulate_ctxt.cs_base = 0;
2146                 emulate_ctxt.ds_base = 0;
2147                 emulate_ctxt.es_base = 0;
2148                 emulate_ctxt.ss_base = 0;
2149                 emulate_ctxt.gs_base = 0;
2150                 emulate_ctxt.fs_base = 0;
2151         } else {
2152                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2153                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2154                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2155                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2156                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2157                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2158         }
2159
2160         vcpu->mmio_is_write = 0;
2161         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2162
2163         if ((r || vcpu->mmio_is_write) && run) {
2164                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2165                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2166                 run->mmio.len = vcpu->mmio_size;
2167                 run->mmio.is_write = vcpu->mmio_is_write;
2168         }
2169
2170         if (r) {
2171                 if (!vcpu->mmio_needed) {
2172                         report_emulation_failure(&emulate_ctxt);
2173                         print_func_exit();
2174                         return EMULATE_FAIL;
2175                 }
2176                 print_func_exit();
2177                 return EMULATE_DO_MMIO;
2178         }
2179
2180         vcpu_put_rsp_rip(vcpu);
2181         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2182
2183         if (vcpu->mmio_is_write) {
2184                 print_func_exit();
2185                 return EMULATE_DO_MMIO;
2186         }
2187
2188         print_func_exit();
2189         return EMULATE_DONE;
2190 }
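/* Illustrative sketch only (not compiled): the round trip that EMULATE_DO_MMIO
 * implies.  handle_exception() below returns to the caller with exit_reason ==
 * LITEVM_EXIT_MMIO; a hypothetical driving loop services the access and
 * re-enters with mmio_completed set, which vm_run() turns into
 * mmio_read_completed so the faulting instruction can be re-emulated to
 * completion.  device_read()/device_write() stand in for a real device model.
 */
#if 0
static void example_mmio_loop(struct litevm *litevm, struct litevm_run *run)
{
        for (;;) {
                if (vm_run(litevm, run) < 0)
                        break;
                if (run->exit_reason != LITEVM_EXIT_MMIO)
                        break;
                if (run->mmio.is_write)
                        device_write(run->mmio.phys_addr, run->mmio.data,
                                     run->mmio.len);
                else
                        device_read(run->mmio.phys_addr, run->mmio.data,
                                    run->mmio.len);
                run->mmio_completed = 1;
        }
}
#endif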
2191
2192 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2193 {
2194         print_func_entry();
2195         print_func_exit();
2196         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2197 }
2198
2199 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2200 {
2201         print_func_entry();
2202         vmcs_writel(GUEST_GDTR_BASE, base);
2203         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2204         print_func_exit();
2205 }
2206
2207 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2208 {
2209         print_func_entry();
2210         vmcs_writel(GUEST_IDTR_BASE, base);
2211         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2212         print_func_exit();
2213 }
2214
2215 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2216                                    unsigned long *rflags)
2217 {
2218         print_func_entry();
2219         lmsw(vcpu, msw);
2220         *rflags = vmcs_readl(GUEST_RFLAGS);
2221         print_func_exit();
2222 }
2223
2224 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2225 {
2226         print_func_entry();
2227         switch (cr) {
2228                 case 0:
2229                         print_func_exit();
2230                         return guest_cr0();
2231                 case 2:
2232                         print_func_exit();
2233                         return vcpu->cr2;
2234                 case 3:
2235                         print_func_exit();
2236                         return vcpu->cr3;
2237                 case 4:
2238                         print_func_exit();
2239                         return guest_cr4();
2240                 default:
2241                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2242                         print_func_exit();
2243                         return 0;
2244         }
2245 }
2246
2247 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2248                                          unsigned long *rflags)
2249 {
2250         print_func_entry();
2251         switch (cr) {
2252                 case 0:
2253                         set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2254                         *rflags = vmcs_readl(GUEST_RFLAGS);
2255                         break;
2256                 case 2:
2257                         vcpu->cr2 = val;
2258                         break;
2259                 case 3:
2260                         set_cr3(vcpu, val);
2261                         break;
2262                 case 4:
2263                         set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2264                         break;
2265                 default:
2266                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2267         }
2268         print_func_exit();
2269 }
2270
2271 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2272                                                                   int vec, uint32_t err_code)
2273 {
2274         print_func_entry();
2275         if (!vcpu->rmode.active) {
2276                 print_func_exit();
2277                 return 0;
2278         }
2279
2280         if (vec == GP_VECTOR && err_code == 0)
2281                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2282                         print_func_exit();
2283                         return 1;
2284                 }
2285         print_func_exit();
2286         return 0;
2287 }
2288
2289 static int handle_exception(struct litevm_vcpu *vcpu,
2290                                                         struct litevm_run *litevm_run)
2291 {
2292         print_func_entry();
2293         uint32_t intr_info, error_code;
2294         unsigned long cr2, rip;
2295         uint32_t vect_info;
2296         enum emulation_result er;
2297
2298         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2299         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2300         printk("vect_info %x intr_info %x\n", vect_info, intr_info);
2301         printk("page fault? %d\n", is_page_fault(intr_info));
2302
2303         if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) {
2304                 printk("%s: unexpected, vectoring info 0x%x "
2305                            "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2306         }
2307
2308         if (is_external_interrupt(vect_info)) {
2309                 printk("external interrupt\n");
2310                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2311                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_pending), irq);
2312                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_summary),
2313                                                            irq / BITS_PER_LONG);
2314         }
2315
2316         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) {  /* nmi */
2317                 printk("nmi\n");
2318                 asm("int $2");
2319                 print_func_exit();
2320                 return 1;
2321         }
2322         error_code = 0;
2323         rip = vmcs_readl(GUEST_RIP);
2324         printk("GUEST_RIP %lx\n", rip);
2325         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2326                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2327         if (is_page_fault(intr_info)) {
2328                 printk("PAGE FAULT!\n");
2329                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2330
2331                 SPLL(&vcpu->litevm->lock);
2332                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2333                         SPLU(&vcpu->litevm->lock);
2334                         print_func_exit();
2335                         return 1;
2336                 }
2337
2338                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2339                 SPLU(&vcpu->litevm->lock);
2340
2341                 switch (er) {
2342                         case EMULATE_DONE:
2343                                 print_func_exit();
2344                                 return 1;
2345                         case EMULATE_DO_MMIO:
2346                                 ++litevm_stat.mmio_exits;
2347                                 litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2348                                 print_func_exit();
2349                                 return 0;
2350                         case EMULATE_FAIL:
2351                                 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2352                                 break;
2353                         default:
2354                                 assert(0);
2355                 }
2356         }
2357
2358         if (vcpu->rmode.active &&
2359                 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2360                                                            error_code)) {
2361                 printk("RMODE EXCEPTION might have been handled\n");
2362                 print_func_exit();
2363                 return 1;
2364         }
2365
2366         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
2367                 (INTR_TYPE_EXCEPTION | 1)) {
2368                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2369                 print_func_exit();
2370                 return 0;
2371         }
2372         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2373         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2374         litevm_run->ex.error_code = error_code;
2375         print_func_exit();
2376         return 0;
2377 }
2378
2379 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2380                                                                          struct litevm_run *litevm_run)
2381 {
2382         //print_func_entry();
2383         ++litevm_stat.irq_exits;
2384         //print_func_exit();
2385         return 1;
2386 }
2387
2388 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t * count)
2389 {
2390         print_func_entry();
2391         uint64_t inst;
2392         gva_t rip;
2393         int countr_size;
2394         int i, n;
2395
2396         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2397                 countr_size = 2;
2398         } else {
2399                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2400
2401                 countr_size = (cs_ar & AR_L_MASK) ? 8 : (cs_ar & AR_DB_MASK) ? 4 : 2;
2402         }
2403
2404         rip = vmcs_readl(GUEST_RIP);
2405         if (countr_size != 8)
2406                 rip += vmcs_readl(GUEST_CS_BASE);
2407
2408         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2409
2410         for (i = 0; i < n; i++) {
2411                 switch (((uint8_t *) & inst)[i]) {
2412                         case 0xf0:
2413                         case 0xf2:
2414                         case 0xf3:
2415                         case 0x2e:
2416                         case 0x36:
2417                         case 0x3e:
2418                         case 0x26:
2419                         case 0x64:
2420                         case 0x65:
2421                         case 0x66:
2422                                 break;
2423                         case 0x67:
2424                                 countr_size = (countr_size == 2) ? 4 : (countr_size >> 1);
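                                /* fall through */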
2425                         default:
2426                                 goto done;
2427                 }
2428         }
2429         print_func_exit();
2430         return 0;
2431 done:
2432         countr_size *= 8;
2433         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2434         print_func_exit();
2435         return 1;
2436 }
2437
2438 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2439 {
2440         print_func_entry();
2441         uint64_t exit_qualification;
2442
2443         ++litevm_stat.io_exits;
2444         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2445         litevm_run->exit_reason = LITEVM_EXIT_IO;
2446         if (exit_qualification & 8)
2447                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2448         else
2449                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2450         litevm_run->io.size = (exit_qualification & 7) + 1;
2451         litevm_run->io.string = (exit_qualification & 16) != 0;
2452         litevm_run->io.string_down
2453                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2454         litevm_run->io.rep = (exit_qualification & 32) != 0;
2455         litevm_run->io.port = exit_qualification >> 16;
2456         if (litevm_run->io.string) {
2457                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2458                         print_func_exit();
2459                         return 1;
2460                 }
2461                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2462         } else
2463                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX];       /* rax */
2464         print_func_exit();
2465         return 0;
2466 }
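/* Illustrative sketch only (not compiled): what a caller might do with the
 * litevm_run->io fields filled in above, for a simple non-string OUT to the
 * bochs-style debug port.  The port number (0xe9) and the dispatch context
 * are assumptions, not part of this driver.
 */
#if 0
static void example_handle_io_exit(struct litevm_run *run)
{
        if (run->exit_reason == LITEVM_EXIT_IO &&
            run->io.direction == LITEVM_EXIT_IO_OUT &&
            !run->io.string && run->io.port == 0xe9)
                printk("%c", (char)run->io.value);
}
#endif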
2467
2468 static int handle_invlpg(struct litevm_vcpu *vcpu,
2469                                                  struct litevm_run *litevm_run)
2470 {
2471         print_func_entry();
2472         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2473         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2474         SPLL(&vcpu->litevm->lock);
2475         vcpu->mmu.inval_page(vcpu, address);
2476         SPLU(&vcpu->litevm->lock);
2477         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2478         print_func_exit();
2479         return 1;
2480 }
2481
2482 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2483 {
2484         print_func_entry();
2485         uint64_t exit_qualification;
2486         int cr;
2487         int reg;
2488
2489 #ifdef LITEVM_DEBUG
2490         if (guest_cpl() != 0) {
2491                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2492                 inject_gp(vcpu);
2493                 print_func_exit();
2494                 return 1;
2495         }
2496 #endif
2497
2498         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2499         cr = exit_qualification & 15;
2500         reg = (exit_qualification >> 8) & 15;
2501         switch ((exit_qualification >> 4) & 3) {
2502                 case 0: /* mov to cr */
2503                         switch (cr) {
2504                                 case 0:
2505                                         vcpu_load_rsp_rip(vcpu);
2506                                         set_cr0(vcpu, vcpu->regs[reg]);
2507                                         skip_emulated_instruction(vcpu);
2508                                         print_func_exit();
2509                                         return 1;
2510                                 case 3:
2511                                         vcpu_load_rsp_rip(vcpu);
2512                                         set_cr3(vcpu, vcpu->regs[reg]);
2513                                         skip_emulated_instruction(vcpu);
2514                                         print_func_exit();
2515                                         return 1;
2516                                 case 4:
2517                                         vcpu_load_rsp_rip(vcpu);
2518                                         set_cr4(vcpu, vcpu->regs[reg]);
2519                                         skip_emulated_instruction(vcpu);
2520                                         print_func_exit();
2521                                         return 1;
2522                                 case 8:
2523                                         vcpu_load_rsp_rip(vcpu);
2524                                         set_cr8(vcpu, vcpu->regs[reg]);
2525                                         skip_emulated_instruction(vcpu);
2526                                         print_func_exit();
2527                                         return 1;
2528                         }
2529                         break;
2530                 case 1: /*mov from cr */
2531                         switch (cr) {
2532                                 case 3:
2533                                         vcpu_load_rsp_rip(vcpu);
2534                                         vcpu->regs[reg] = vcpu->cr3;
2535                                         vcpu_put_rsp_rip(vcpu);
2536                                         skip_emulated_instruction(vcpu);
2537                                         print_func_exit();
2538                                         return 1;
2539                                 case 8:
2540                                         printd("handle_cr: read CR8; cpu erratum AA15\n");
2541                                         vcpu_load_rsp_rip(vcpu);
2542                                         vcpu->regs[reg] = vcpu->cr8;
2543                                         vcpu_put_rsp_rip(vcpu);
2544                                         skip_emulated_instruction(vcpu);
2545                                         print_func_exit();
2546                                         return 1;
2547                         }
2548                         break;
2549                 case 3: /* lmsw */
2550                         lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2551
2552                         skip_emulated_instruction(vcpu);
2553                         print_func_exit();
2554                         return 1;
2555                 default:
2556                         break;
2557         }
2558         litevm_run->exit_reason = 0;
2559         printk("litevm: unhandled control register: op %d cr %d\n",
2560                    (int)(exit_qualification >> 4) & 3, cr);
2561         print_func_exit();
2562         return 0;
2563 }
2564
2565 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2566 {
2567         print_func_entry();
2568         uint64_t exit_qualification;
2569         unsigned long val;
2570         int dr, reg;
2571
2572         /*
2573          * FIXME: this code assumes the host is debugging the guest.
2574          *        need to deal with guest debugging itself too.
2575          */
2576         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2577         dr = exit_qualification & 7;
2578         reg = (exit_qualification >> 8) & 15;
2579         vcpu_load_rsp_rip(vcpu);
2580         if (exit_qualification & 16) {
2581                 /* mov from dr */
2582                 switch (dr) {
2583                         case 6:
2584                                 val = 0xffff0ff0;
2585                                 break;
2586                         case 7:
2587                                 val = 0x400;
2588                                 break;
2589                         default:
2590                                 val = 0;
2591                 }
2592                 vcpu->regs[reg] = val;
2593         } else {
2594                 /* mov to dr */
2595         }
2596         vcpu_put_rsp_rip(vcpu);
2597         skip_emulated_instruction(vcpu);
2598         print_func_exit();
2599         return 1;
2600 }
2601
2602 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2603 {
2604         print_func_entry();
2605         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2606         print_func_exit();
2607         return 0;
2608 }
2609
2610 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2611 {
2612         print_func_entry();
2613         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2614         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2615         uint64_t data;
2616
2617         if (guest_cpl() != 0) {
2618                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2619                 inject_gp(vcpu);
2620                 print_func_exit();
2621                 return 1;
2622         }
2623
2624         switch (ecx) {
2625                 case MSR_FS_BASE:
2626                         data = vmcs_readl(GUEST_FS_BASE);
2627                         break;
2628                 case MSR_GS_BASE:
2629                         data = vmcs_readl(GUEST_GS_BASE);
2630                         break;
2631                 case MSR_IA32_SYSENTER_CS:
2632                         data = vmcs_read32(GUEST_SYSENTER_CS);
2633                         break;
2634                 case MSR_IA32_SYSENTER_EIP:
2635                         data = vmcs_read32(GUEST_SYSENTER_EIP);
2636                         break;
2637                 case MSR_IA32_SYSENTER_ESP:
2638                         data = vmcs_read32(GUEST_SYSENTER_ESP);
2639                         break;
2640                 case MSR_IA32_MC0_CTL:
2641                 case MSR_IA32_MCG_STATUS:
2642                 case MSR_IA32_MCG_CAP:
2643                 case MSR_IA32_MC0_MISC:
2644                 case MSR_IA32_MC0_MISC + 4:
2645                 case MSR_IA32_MC0_MISC + 8:
2646                 case MSR_IA32_MC0_MISC + 12:
2647                 case MSR_IA32_MC0_MISC + 16:
2648                 case MSR_IA32_UCODE_REV:
2649                         /* MTRR registers */
2650                 case 0xfe:
2651                 case 0x200 ... 0x2ff:
2652                         data = 0;
2653                         break;
2654                 case MSR_IA32_APICBASE:
2655                         data = vcpu->apic_base;
2656                         break;
2657                 default:
2658                         if (msr) {
2659                                 data = msr->data;
2660                                 break;
2661                         }
2662                         printk("litevm: unhandled rdmsr: %x\n", ecx);
2663                         inject_gp(vcpu);
2664                         print_func_exit();
2665                         return 1;
2666         }
2667
2668         /* FIXME: handling of bits 32:63 of rax, rdx */
2669         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2670         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2671         skip_emulated_instruction(vcpu);
2672         print_func_exit();
2673         return 1;
2674 }
2675
2676 #ifdef __x86_64__
2677
2678 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2679 {
2680         print_func_entry();
2681         struct vmx_msr_entry *msr;
2682
2683         if (efer & EFER_RESERVED_BITS) {
2684                 printd("set_efer: 0x%llx #GP, reserved bits\n", efer);
2685                 inject_gp(vcpu);
2686                 print_func_exit();
2687                 return;
2688         }
2689
2690         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2691                 printd("set_efer: #GP, change LME while paging\n");
2692                 inject_gp(vcpu);
2693                 print_func_exit();
2694                 return;
2695         }
2696
2697         efer &= ~EFER_LMA;
2698         efer |= vcpu->shadow_efer & EFER_LMA;
2699
2700         vcpu->shadow_efer = efer;
2701
2702         msr = find_msr_entry(vcpu, MSR_EFER);
2703
2704         if (!(efer & EFER_LMA))
2705                 efer &= ~EFER_LME;
2706         msr->data = efer;
2707         skip_emulated_instruction(vcpu);
2708         print_func_exit();
2709 }
2710
2711 #endif
2712
2713 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2714
2715 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2716 {
2717         print_func_entry();
2718         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2719         struct vmx_msr_entry *msr;
2720         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2721                 | ((uint64_t) (vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2722
2723         if (guest_cpl() != 0) {
2724                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2725                 inject_gp(vcpu);
2726                 print_func_exit();
2727                 return 1;
2728         }
2729
2730         switch (ecx) {
2731                 case MSR_FS_BASE:
2732                         vmcs_writel(GUEST_FS_BASE, data);
2733                         break;
2734                 case MSR_GS_BASE:
2735                         vmcs_writel(GUEST_GS_BASE, data);
2736                         break;
2737                 case MSR_IA32_SYSENTER_CS:
2738                         vmcs_write32(GUEST_SYSENTER_CS, data);
2739                         break;
2740                 case MSR_IA32_SYSENTER_EIP:
2741                         vmcs_write32(GUEST_SYSENTER_EIP, data);
2742                         break;
2743                 case MSR_IA32_SYSENTER_ESP:
2744                         vmcs_write32(GUEST_SYSENTER_ESP, data);
2745                         break;
2746                 case MSR_EFER:
2747                         set_efer(vcpu, data);
2748                         print_func_exit();
2749                         return 1;
2750                 case MSR_IA32_MC0_STATUS:
2751                         printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data);
2752                         break;
2753                 case MSR_IA32_TIME_STAMP_COUNTER:{
2754                                 uint64_t tsc;
2755
2756                                 tsc = read_tsc();
2757                                 vmcs_write64(TSC_OFFSET, data - tsc);
2758                                 break;
2759                         }
2760                 case MSR_IA32_UCODE_REV:
2761                 case MSR_IA32_UCODE_WRITE:
2762                 case 0x200 ... 0x2ff:   /* MTRRs */
2763                         break;
2764                 case MSR_IA32_APICBASE:
2765                         vcpu->apic_base = data;
2766                         break;
2767                 default:
2768                         msr = find_msr_entry(vcpu, ecx);
2769                         if (msr) {
2770                                 msr->data = data;
2771                                 break;
2772                         }
2773                         printk("litevm: unhandled wrmsr: %x\n", ecx);
2774                         inject_gp(vcpu);
2775                         print_func_exit();
2776                         return 1;
2777         }
2778         skip_emulated_instruction(vcpu);
2779         print_func_exit();
2780         return 1;
2781 }
2782
2783 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2784                                                                    struct litevm_run *litevm_run)
2785 {
2786         print_func_entry();
2787         /* Turn off interrupt window reporting. */
2788         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2789                                  vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2790                                  & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2791         print_func_exit();
2792         return 1;
2793 }
2794
2795 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2796 {
2797         print_func_entry();
2798         skip_emulated_instruction(vcpu);
2799         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2800                 print_func_exit();
2801                 return 1;
2802         }
2803
2804         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2805         print_func_exit();
2806         return 0;
2807 }
2808
2809 /*
2810  * The exit handlers return 1 if the exit was handled fully and guest execution
2811  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2812  * to be done to userspace and return 0.
2813  */
2814 static int (*litevm_vmx_exit_handlers[]) (struct litevm_vcpu * vcpu,
2815                                           struct litevm_run * litevm_run) = {
2816         [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2817         [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2818         [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2819         [EXIT_REASON_INVLPG] = handle_invlpg,
2820         [EXIT_REASON_CR_ACCESS] = handle_cr,
2821         [EXIT_REASON_DR_ACCESS] = handle_dr,
2822         [EXIT_REASON_CPUID] = handle_cpuid,
2823         [EXIT_REASON_MSR_READ] = handle_rdmsr,
2824         [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2825         [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2826         [EXIT_REASON_HLT] = handle_halt,};
2827
2828 static const int litevm_vmx_max_exit_handlers =
2829         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
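/* Illustrative sketch only (not compiled): the shape of a new handler that
 * follows the convention above -- return 1 when the exit was handled fully and
 * the guest may resume, or fill in litevm_run and return 0 to defer to the
 * caller.  EXIT_REASON_VMCALL is used here only as a hypothetical example of a
 * reason this table does not yet handle.
 */
#if 0
static int example_handle_vmcall(struct litevm_vcpu *vcpu,
                                 struct litevm_run *litevm_run)
{
        skip_emulated_instruction(vcpu);        /* step past the VMCALL */
        litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
        litevm_run->hw.hardware_exit_reason = EXIT_REASON_VMCALL;
        return 0;                               /* let the caller decide */
}
/* ...and add "[EXIT_REASON_VMCALL] = example_handle_vmcall," to the table. */
#endif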
2830
2831 /*
2832  * The guest has exited.  See if we can fix it or if we need userspace
2833  * assistance.
2834  */
2835 static int litevm_handle_exit(struct litevm_run *litevm_run,
2836                                                           struct litevm_vcpu *vcpu)
2837 {
2838         //print_func_entry();
2839         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2840         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2841
2842 //printk("vectoring_info %08x exit_reason %x\n", vectoring_info, exit_reason);
2843         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2844                 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2845                 printk("%s: unexpected, valid vectoring info and "
2846                            "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2847         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2848         if (exit_reason < litevm_vmx_max_exit_handlers
2849                 && litevm_vmx_exit_handlers[exit_reason]) {
2850 //printk("reason is KNOWN\n");
2851                 //print_func_exit();
2852                 return litevm_vmx_exit_handlers[exit_reason] (vcpu, litevm_run);
2853         } else {
2854                 printk("reason is UNKNOWN\n");
2855                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2856                 litevm_run->hw.hardware_exit_reason = exit_reason;
2857         }
2858         //print_func_exit();
2859         return 0;
2860 }
2861
2862 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2863 {
2864         print_func_entry();
2865         uint16_t ent[2];
2866         uint16_t cs;
2867         uint16_t ip;
2868         unsigned long flags;
2869         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2870         uint16_t sp = vmcs_readl(GUEST_RSP);
2871         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2872
2873         if (sp > ss_limit || ((sp - 6) > sp)) {
2874                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2875                                         __FUNCTION__,
2876                                         vmcs_readl(GUEST_RSP),
2877                                         vmcs_readl(GUEST_SS_BASE), vmcs_read32(GUEST_SS_LIMIT));
2878                 print_func_exit();
2879                 return;
2880         }
2881
2882         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2883                 sizeof(ent)) {
2884                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2885                 print_func_exit();
2886                 return;
2887         }
2888
2889         flags = vmcs_readl(GUEST_RFLAGS);
2890         cs = vmcs_readl(GUEST_CS_BASE) >> 4;
2891         ip = vmcs_readl(GUEST_RIP);
2892
2893         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2894                 litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2895                 litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2896                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2897                 print_func_exit();
2898                 return;
2899         }
2900
2901         vmcs_writel(GUEST_RFLAGS, flags &
2902                                 ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2903         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2904         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2905         vmcs_writel(GUEST_RIP, ent[0]);
2906         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2907         print_func_exit();
2908 }
2909
2910 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2911 {
2912         print_func_entry();
2913         int word_index = __ffs(vcpu->irq_summary);
2914         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2915         int irq = word_index * BITS_PER_LONG + bit_index;
2916
2917         /* don't have clear_bit and I'm not sure the akaros
2918          * bitops are really going to work.
2919          */
2920         vcpu->irq_pending[word_index] &= ~(1 << bit_index);
2921         if (!vcpu->irq_pending[word_index])
2922                 vcpu->irq_summary &= ~(1 << word_index);
2923
2924         if (vcpu->rmode.active) {
2925                 inject_rmode_irq(vcpu, irq);
2926                 print_func_exit();
2927                 return;
2928         }
2929         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2930                                  irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2931         print_func_exit();
2932 }
2933
2934 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2935 {
2936         print_func_entry();
2937         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2938                 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2939                 /*
2940                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2941                  */
2942                 litevm_do_inject_irq(vcpu);
2943         else
2944                 /*
2945                  * Interrupts blocked.  Wait for unblock.
2946                  */
2947                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2948                                          vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2949                                          | CPU_BASED_VIRTUAL_INTR_PENDING);
2950         print_func_exit();
2951 }
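/* Illustrative sketch only (not compiled): how an interrupt could be queued
 * for injection.  handle_exception() above does exactly this when re-queueing
 * an external interrupt; vm_run() later calls litevm_try_inject_irq() once the
 * guest can accept it.  The helper name is hypothetical.
 */
#if 0
static void example_post_irq(struct litevm_vcpu *vcpu, int irq)
{
        /* mark the irq pending and flag its word in the summary bitmap */
        SET_BITMASK_BIT_ATOMIC((uint8_t *)&vcpu->irq_pending, irq);
        SET_BITMASK_BIT_ATOMIC((uint8_t *)&vcpu->irq_summary, irq / BITS_PER_LONG);
}
#endif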
2952
2953 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2954 {
2955         print_func_entry();
2956         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2957
2958 /*
2959         set_debugreg(dbg->bp[0], 0);
2960         set_debugreg(dbg->bp[1], 1);
2961         set_debugreg(dbg->bp[2], 2);
2962         set_debugreg(dbg->bp[3], 3);
2963 */
2964
2965         if (dbg->singlestep) {
2966                 unsigned long flags;
2967
2968                 flags = vmcs_readl(GUEST_RFLAGS);
2969                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2970                 vmcs_writel(GUEST_RFLAGS, flags);
2971         }
2972         print_func_exit();
2973 }
2974
2975 static void load_msrs(struct vmx_msr_entry *e, int n)
2976 {
2977         //print_func_entry();
2978         int i;
2979
2980         if (!e) {
2981                 printk("LOAD MSR WITH NULL POINTER?\n");
2982                 error("LOAD MSR WITH NULL POINTER?");
2983         }
2984         for (i = 0; i < n; ++i) {
2985                 //printk("Load MSR (%lx), with %lx\n", e[i].index, e[i].data);
2986                 write_msr(e[i].index, e[i].data);
2987                 //printk("Done\n");
2988         }
2989         //print_func_exit();
2990 }
2991
2992 static void save_msrs(struct vmx_msr_entry *e, int n)
2993 {
2994         //print_func_entry();
2995         int i;
2996
2997         for (i = 0; i < n; ++i)
2998                 e[i].data = read_msr(e[i].index);
2999         //print_func_exit();
3000 }
3001
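/*
 * Run the guest on the calling core.  The sequence is: load the vcpu, set
 * up host fs/gs/ldt state in the VMCS, inject a pending interrupt if one
 * can be delivered, swap FPU and MSR state, enter the guest with
 * vmlaunch/vmresume, and hand the exit to litevm_handle_exit(), looping
 * back for another entry while it returns nonzero.
 */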
3002 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
3003 {
3004         print_func_entry();
3005         struct litevm_vcpu *vcpu;
3006         uint8_t fail;
3007         uint16_t fs_sel, gs_sel, ldt_sel;
3008         int fs_gs_ldt_reload_needed;
3009
3010         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
3011                 error("vcpu is %d but must be in the range 0..%d\n",
3012                           litevm_run->vcpu, LITEVM_MAX_VCPUS - 1);
3013
3014         vcpu = vcpu_load(litevm, litevm_run->vcpu);
3015         if (!vcpu)
3016                 error("vcpu_load failed");
3017         printk("Loaded\n");
3018
3019         if (litevm_run->emulated) {
3020                 skip_emulated_instruction(vcpu);
3021                 litevm_run->emulated = 0;
3022         }
3023         printk("Emulated\n");
3024
3025         if (litevm_run->mmio_completed) {
3026                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
3027                 vcpu->mmio_read_completed = 1;
3028         }
3029         printk("mmio completed\n");
3030
3031         vcpu->mmio_needed = 0;
3032
3033 again:
3034         /*
3035          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
3036          * allow segment selectors with cpl > 0 or ti == 1.
3037          */
3038         fs_sel = read_fs();
3039         //printk("fs_sel %x\n", fs_sel);
3040         gs_sel = read_gs();
3041         //printk("gs_sel %x\n", gs_sel);
3042         ldt_sel = read_ldt();
3043         //printk("ldt_sel %x\n", ldt_sel);
3044         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
3045         if (!fs_gs_ldt_reload_needed) {
3046                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
3047                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
3048         } else {
3049                 vmcs_write16(HOST_FS_SELECTOR, 0);
3050                 vmcs_write16(HOST_GS_SELECTOR, 0);
3051         }
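        /*
         * Selectors that cannot live in the host-state area are zeroed here
         * and reloaded by hand after the VM exit (the fs_gs_ldt_reload_needed
         * path below).
         */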
3052         //printk("reloaded fs and gs\n");
3053
3054 #ifdef __x86_64__
3055         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
3056         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
3057         //printk("Set FS_BASE and GS_BASE");
3058 #endif
3059
3060         if (vcpu->irq_summary &&
3061                 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
3062                 litevm_try_inject_irq(vcpu);
3063
3064         if (vcpu->guest_debug.enabled)
3065                 litevm_guest_debug_pre(vcpu);
3066
3067         fx_save(vcpu->host_fx_image);
3068         fx_restore(vcpu->guest_fx_image);
3069
3070         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
3071         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3072
3073         printk("GO FOR IT! %08lx\n", vmcs_readl(GUEST_RIP));
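        /*
         * The asm block below saves the host general-purpose registers on
         * the stack, records the host RSP in the VMCS, loads the guest
         * registers from vcpu->regs, and enters the guest with vmlaunch on
         * the first entry or vmresume afterwards (vcpu->launched).  On VM
         * exit, control resumes at litevm_vmx_return, where the guest
         * registers and CR2 are written back to the vcpu and the host
         * registers are restored.
         */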
3074         asm(
3075                    /* Store host registers */
3076                    "pushf \n\t"
3077 #ifdef __x86_64__
3078                    "push %%rax; push %%rbx; push %%rdx;"
3079                    "push %%rsi; push %%rdi; push %%rbp;"
3080                    "push %%r8;  push %%r9;  push %%r10; push %%r11;"
3081                    "push %%r12; push %%r13; push %%r14; push %%r15;"
3082                    "push %%rcx \n\t" "vmwrite %%rsp, %2 \n\t"
3083 #else
3084                    "pusha; push %%ecx \n\t" "vmwrite %%esp, %2 \n\t"
3085 #endif
3086                    /* Check if vmlaunch or vmresume is needed */
3087                    "cmp $0, %1 \n\t"
3088                    /* Load guest registers.  Don't clobber flags. */
3089 #ifdef __x86_64__
3090                    "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%3), %%rax \n\t" "mov %c[rbx](%3), %%rbx \n\t" "mov %c[rdx](%3), %%rdx \n\t" "mov %c[rsi](%3), %%rsi \n\t" "mov %c[rdi](%3), %%rdi \n\t" "mov %c[rbp](%3), %%rbp \n\t" "mov %c[r8](%3),  %%r8  \n\t" "mov %c[r9](%3),  %%r9  \n\t" "mov %c[r10](%3), %%r10 \n\t" "mov %c[r11](%3), %%r11 \n\t" "mov %c[r12](%3), %%r12 \n\t" "mov %c[r13](%3), %%r13 \n\t" "mov %c[r14](%3), %%r14 \n\t" "mov %c[r15](%3), %%r15 \n\t" "mov %c[rcx](%3), %%rcx \n\t"      /* kills %3 (rcx) */
3091 #else
3092                    "mov %c[cr2](%3), %%eax \n\t" "mov %%eax,   %%cr2 \n\t" "mov %c[rax](%3), %%eax \n\t" "mov %c[rbx](%3), %%ebx \n\t" "mov %c[rdx](%3), %%edx \n\t" "mov %c[rsi](%3), %%esi \n\t" "mov %c[rdi](%3), %%edi \n\t" "mov %c[rbp](%3), %%ebp \n\t" "mov %c[rcx](%3), %%ecx \n\t"    /* kills %3 (ecx) */
3093 #endif
3094                    /* Enter guest mode */
3095                    "jne launched \n\t"
3096                    "vmlaunch \n\t"
3097                    "jmp litevm_vmx_return \n\t"
3098                    "launched: vmresume \n\t"
3099                    ".globl litevm_vmx_return \n\t" "litevm_vmx_return: "
3100                    /* Save guest registers, load host registers, keep flags */
3101 #ifdef __x86_64__
3102                    "xchg %3,     0(%%rsp) \n\t"
3103                    "mov %%rax, %c[rax](%3) \n\t"
3104                    "mov %%rbx, %c[rbx](%3) \n\t"
3105                    "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3106                    "mov %%rdx, %c[rdx](%3) \n\t"
3107                    "mov %%rsi, %c[rsi](%3) \n\t"
3108                    "mov %%rdi, %c[rdi](%3) \n\t"
3109                    "mov %%rbp, %c[rbp](%3) \n\t"
3110                    "mov %%r8,  %c[r8](%3) \n\t"
3111                    "mov %%r9,  %c[r9](%3) \n\t"
3112                    "mov %%r10, %c[r10](%3) \n\t"
3113                    "mov %%r11, %c[r11](%3) \n\t"
3114                    "mov %%r12, %c[r12](%3) \n\t"
3115                    "mov %%r13, %c[r13](%3) \n\t"
3116                    "mov %%r14, %c[r14](%3) \n\t"
3117                    "mov %%r15, %c[r15](%3) \n\t"
3118                    "mov %%cr2, %%rax   \n\t"
3119                    "mov %%rax, %c[cr2](%3) \n\t"
3120                    "mov 0(%%rsp), %3 \n\t"
3121                    "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3122                    "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3123                    "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3124                    "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3125 #else
3126                    "xchg %3, 0(%%esp) \n\t"
3127                    "mov %%eax, %c[rax](%3) \n\t"
3128                    "mov %%ebx, %c[rbx](%3) \n\t"
3129                    "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3130                    "mov %%edx, %c[rdx](%3) \n\t"
3131                    "mov %%esi, %c[rsi](%3) \n\t"
3132                    "mov %%edi, %c[rdi](%3) \n\t"
3133                    "mov %%ebp, %c[rbp](%3) \n\t"
3134                    "mov %%cr2, %%eax  \n\t"
3135                    "mov %%eax, %c[cr2](%3) \n\t"
3136                    "mov 0(%%esp), %3 \n\t" "pop %%ecx; popa \n\t"
3137 #endif
3138                    "setbe %0 \n\t" "popf \n\t":"=g"(fail)
3139                    :"r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3140                    "c"(vcpu),
3141                    [rax] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3142                    [rbx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3143                    [rcx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3144                    [rdx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3145                    [rsi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3146                    [rdi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3147                    [rbp] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3148 #ifdef __x86_64__
3149                    [r8] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8])),
3150                    [r9] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9])),
3151                    [r10] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3152                    [r11] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3153                    [r12] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3154                    [r13] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3155                    [r14] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3156                    [r15] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3157 #endif
3158                    [cr2] "i"(offsetof(struct litevm_vcpu, cr2))
3159                    :"cc", "memory");
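        /*
         * 'fail' is set (via setbe) only when vmlaunch/vmresume itself
         * failed; the error number is read from VM_INSTRUCTION_ERROR below.
         */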
3160
3161         ++litevm_stat.exits;
3162         printk("vm_run exits! %08lx flags %08lx\n", vmcs_readl(GUEST_RIP),
3163                 vmcs_readl(GUEST_RFLAGS));
3164         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3165         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3166
3167         fx_save(vcpu->guest_fx_image);
3168         fx_restore(vcpu->host_fx_image);
3169
3170 #ifndef __x86_64__
3171         asm("mov %0, %%ds; mov %0, %%es": :"r"(__USER_DS));
3172 #endif
3173
3174         litevm_run->exit_type = 0;
3175         if (fail) {
3176                 printk("FAIL\n");
3177                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3178                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3179                 printk("reason %d\n", litevm_run->exit_reason);
3180         } else {
3181                 printk("NOT FAIL\n");
3182                 if (fs_gs_ldt_reload_needed) {
3183                         load_ldt(ldt_sel);
3184                         load_fs(fs_sel);
3185                         /*
3186                          * If we have to reload gs, we must take care to
3187                          * preserve our gs base.
3188                          */
3189                         disable_irq();
3190                         load_gs(gs_sel);
3191 #ifdef __x86_64__
3192                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3193 #endif
3194                         enable_irq();
3195
3196                         reload_tss();
3197                 }
3198                 vcpu->launched = 1;
3199                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
3200                 //printk("Let's see why it exited\n");
3201                 if (litevm_handle_exit(litevm_run, vcpu)) {
3202                         /* Give scheduler a chance to reschedule. */
3203 #if 0
3204                         vcpu_put(vcpu);
3205 #warning "how to tell if signal is pending"
3206 /*
3207                         if (signal_pending(current)) {
3208                                 ++litevm_stat.signal_exits;
3209                                 return -EINTR;
3210                         }
3211 */
3212                         /* Consider getting rid of this for now;
3213                          * maybe it is just breaking things. */
3214                         kthread_yield();
3215                         /* Cannot fail -  no vcpu unplug yet. */
3216                         vcpu_load(litevm, vcpu_slot(vcpu));
3217 #endif
3218                         monitor(NULL);
3219                         goto again;
3220                 }
3221         }
3222
3223
3224         printk("vm_run exits! %08lx flags %08lx\n", vmcs_readl(GUEST_RIP),
3225                 vmcs_readl(GUEST_RFLAGS));
3226         vcpu_put(vcpu);
3227         printk("vm_run returns\n");
3228         print_func_exit();
3229         return 0;
3230 }
3231
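/*
 * ioctl backend: copy the vcpu's general-purpose registers out of
 * vcpu->regs, and RSP/RIP/RFLAGS out of the VMCS, into the caller's
 * litevm_regs, hiding TF/RF if they were set only for single-stepping.
 */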
3232 static int litevm_dev_ioctl_get_regs(struct litevm *litevm,
3233                                                                          struct litevm_regs *regs)
3234 {
3235         print_func_entry();
3236         struct litevm_vcpu *vcpu;
3237
3238         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3239                 print_func_exit();
3240                 return -EINVAL;
3241         }
3242
3243         vcpu = vcpu_load(litevm, regs->vcpu);
3244         if (!vcpu) {
3245                 print_func_exit();
3246                 return -ENOENT;
3247         }
3248
3249         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3250         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3251         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3252         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3253         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3254         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3255         regs->rsp = vmcs_readl(GUEST_RSP);
3256         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3257 #ifdef __x86_64__
3258         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3259         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3260         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3261         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3262         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3263         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3264         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3265         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3266 #endif
3267
3268         regs->rip = vmcs_readl(GUEST_RIP);
3269         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3270
3271         /*
3272          * Don't leak debug flags in case they were set for guest debugging
3273          */
3274         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3275                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3276
3277         vcpu_put(vcpu);
3278
3279         print_func_exit();
3280         return 0;
3281 }
3282
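/*
 * ioctl backend: the inverse of get_regs; write the caller-supplied values
 * into vcpu->regs and the VMCS (RSP, RIP, RFLAGS).
 */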
3283 static int litevm_dev_ioctl_set_regs(struct litevm *litevm,
3284                                                                          struct litevm_regs *regs)
3285 {
3286         print_func_entry();
3287         struct litevm_vcpu *vcpu;
3288
3289         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3290                 print_func_exit();
3291                 return -EINVAL;
3292         }
3293
3294         vcpu = vcpu_load(litevm, regs->vcpu);
3295         if (!vcpu) {
3296                 print_func_exit();
3297                 return -ENOENT;
3298         }
3299
3300         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3301         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3302         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3303         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3304         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3305         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3306         vmcs_writel(GUEST_RSP, regs->rsp);
3307         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3308 #ifdef __x86_64__
3309         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3310         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3311         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3312         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3313         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3314         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3315         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3316         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3317 #endif
3318
3319         vmcs_writel(GUEST_RIP, regs->rip);
3320         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3321
3322         vcpu_put(vcpu);
3323
3324         print_func_exit();
3325         return 0;
3326 }
3327
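/*
 * ioctl backend: report the guest's segment registers, descriptor tables,
 * control registers, EFER, APIC base, and whether an interrupt is pending.
 */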
3328 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm,
3329                                                                           struct litevm_sregs *sregs)
3330 {
3331         print_func_entry();
3332         struct litevm_vcpu *vcpu;
3333
3334         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3335                 print_func_exit();
3336                 return -EINVAL;
3337         }
3338         vcpu = vcpu_load(litevm, sregs->vcpu);
3339         if (!vcpu) {
3340                 print_func_exit();
3341                 return -ENOENT;
3342         }
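/*
 * get_segment() reads one segment register out of the VMCS and unpacks its
 * access-rights word: type, S, DPL, P, AVL, L, D/B, G and the VMX
 * "unusable" bit.
 */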
3343 #define get_segment(var, seg) \
3344         do { \
3345                 uint32_t ar; \
3346                 \
3347                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3348                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3349                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3350                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3351                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3352                 sregs->var.type = ar & 15; \
3353                 sregs->var.s = (ar >> 4) & 1; \
3354                 sregs->var.dpl = (ar >> 5) & 3; \
3355                 sregs->var.present = (ar >> 7) & 1; \
3356                 sregs->var.avl = (ar >> 12) & 1; \
3357                 sregs->var.l = (ar >> 13) & 1; \
3358                 sregs->var.db = (ar >> 14) & 1; \
3359                 sregs->var.g = (ar >> 15) & 1; \
3360                 sregs->var.unusable = (ar >> 16) & 1; \
3361         } while (0)
3362
3363         get_segment(cs, CS);
3364         get_segment(ds, DS);
3365         get_segment(es, ES);
3366         get_segment(fs, FS);
3367         get_segment(gs, GS);
3368         get_segment(ss, SS);
3369
3370         get_segment(tr, TR);
3371         get_segment(ldt, LDTR);
3372 #undef get_segment
3373
3374 #define get_dtable(var, table) \
3375         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3376                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3377
3378         get_dtable(idt, IDTR);
3379         get_dtable(gdt, GDTR);
3380 #undef get_dtable
3381
3382         sregs->cr0 = guest_cr0();
3383         sregs->cr2 = vcpu->cr2;
3384         sregs->cr3 = vcpu->cr3;
3385         sregs->cr4 = guest_cr4();
3386         sregs->cr8 = vcpu->cr8;
3387         sregs->efer = vcpu->shadow_efer;
3388         sregs->apic_base = vcpu->apic_base;
3389
3390         sregs->pending_int = vcpu->irq_summary != 0;
3391
3392         vcpu_put(vcpu);
3393
3394         print_func_exit();
3395         return 0;
3396 }
3397