More debugging for VMs.
[akaros.git] / kern / arch / x86 / vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 /* from linux */
39 #define __KERNEL_CS 0x10
40 #define __KERNEL_DS 0x18
41 /* used? Who knows */
42 #define GDT_ENTRY_TSS 0x24
43
44 #define currentcpu (&per_cpu_info[core_id()])
45 #define QLOCK_init(x) {printd("qlock_init %p\n", x); qlock_init(x); printd("%p lock_inited\n", x);}
46 #define QLOCK(x) {printd("qlock %p\n", x); qlock(x); printd("%p locked\n", x);}
47 #define QUNLOCK(x) {printd("qunlock %p\n", x); qunlock(x); printd("%p unlocked\n", x);}
48 #define SPLI_irqsave(x){printd("spin_lock_init %p:", x); spinlock_init(x); printd("inited\n");}
49 #define SPLL(x){printd("spin_lock %p\n", x); spin_lock_irqsave(x); printd("%p locked\n", x);}
50 #define SPLU(x){printd("spin_unlock %p\n", x); spin_unlock(x); printd("%p unlocked\n", x);}
51 struct litevm_stat litevm_stat;
52
53 static struct litevm_stats_debugfs_item {
54         const char *name;
55         uint32_t *data;
56 } debugfs_entries[] = {
57         {"pf_fixed", &litevm_stat.pf_fixed},
58         {"pf_guest", &litevm_stat.pf_guest},
59         {"tlb_flush", &litevm_stat.tlb_flush},
60         {"invlpg", &litevm_stat.invlpg},
61         {"exits", &litevm_stat.exits},
62         {"io_exits", &litevm_stat.io_exits},
63         {"mmio_exits", &litevm_stat.mmio_exits},
64         {"signal_exits", &litevm_stat.signal_exits},
65         {"irq_exits", &litevm_stat.irq_exits},
66         /* list terminator */
67         {0, 0}
68 };
69
70 static struct dentry *debugfs_dir;
71
72 static const uint32_t vmx_msr_index[] = {
73 #ifdef __x86_64__
74         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
75 #endif
76         MSR_EFER,       // wtf? MSR_K6_STAR,
77 };
78
79 static const char* vmx_msr_name[] = {
80 #ifdef __x86_64__
81         "MSR_SYSCALL_MASK", "MSR_LSTAR", "MSR_CSTAR", "MSR_KERNEL_GS_BASE",
82 #endif
83         "MSR_EFER",     // wtf? MSR_K6_STAR,
84 };
85
86 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
87
88 #ifdef __x86_64__
89 /*
90  * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
91  * mechanism (cpu bug AA24)
92  */
93 #define NR_BAD_MSRS 2
94 #else
95 #define NR_BAD_MSRS 0
96 #endif
97
98 #define TSS_IOPB_BASE_OFFSET 0x66
99 #define TSS_BASE_SIZE 0x68
100 #define TSS_IOPB_SIZE (65536 / 8)
101 #define TSS_REDIRECTION_SIZE (256 / 8)
102 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
103
104 #define MSR_IA32_VMX_BASIC_MSR                  0x480
105 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
106 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
107 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
108 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
109
110 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
111 #define LMSW_GUEST_MASK 0x0eULL
112 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
113 //#define CR4_VMXE 0x2000
114 #define CR8_RESEVED_BITS (~0x0fULL)
115 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
116
117 #ifdef __x86_64__
118 #define HOST_IS_64 1
119 #else
120 #define HOST_IS_64 0
121 #endif
122
123 /* bit ops not yet widely used in akaros and we're not sure where to put them. */
124 /**
125  * __ffs - find first set bit in word
126  * @word: The word to search
127  *
128  * Undefined if no bit exists, so code should check against 0 first.
129  */
130 static inline unsigned long __ffs(unsigned long word)
131 {
132         print_func_entry();
133         asm("rep; bsf %1, %0"
134             : "=r"(word) : "rm"(word));
135         print_func_exit();
136         return word;
137 }
138
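/* Linear scan of the vcpu's saved guest-MSR array; returns NULL (0) when
 * @msr is not among the MSRs we track for this guest. */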
139 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu,
140                                                                                         uint32_t msr)
141 {
142         print_func_entry();
143         int i;
144
145         for (i = 0; i < vcpu->nmsrs; ++i)
146                 if (vcpu->guest_msrs[i].index == msr) {
147                         print_func_exit();
148                         return &vcpu->guest_msrs[i];
149                 }
150         print_func_exit();
151         return 0;
152 }
153
154 struct descriptor_table {
155         uint16_t limit;
156         unsigned long base;
157 } __attribute__ ((packed));
158
159 static void get_gdt(struct descriptor_table *table)
160 {
161         print_func_entry();
162         asm("sgdt %0" : "=m"(*table));
163         print_func_exit();
164 }
165
166 static void get_idt(struct descriptor_table *table)
167 {
168         print_func_entry();
169         asm("sidt %0" : "=m"(*table));
170         print_func_exit();
171 }
172
173 static uint16_t read_fs(void)
174 {
175         print_func_entry();
176         uint16_t seg;
177         asm("mov %%fs, %0" : "=g"(seg));
178         print_func_exit();
179         return seg;
180 }
181
182 static uint16_t read_gs(void)
183 {
184         print_func_entry();
185         uint16_t seg;
186         asm("mov %%gs, %0" : "=g"(seg));
187         print_func_exit();
188         return seg;
189 }
190
191 static uint16_t read_ldt(void)
192 {
193         print_func_entry();
194         uint16_t ldt;
195         asm("sldt %0" : "=g"(ldt));
196         print_func_exit();
197         return ldt;
198 }
199
200 static void load_fs(uint16_t sel)
201 {
202         print_func_entry();
203         asm("mov %0, %%fs" : : "g"(sel));
204         print_func_exit();
205 }
206
207 static void load_gs(uint16_t sel)
208 {
209         print_func_entry();
210         asm("mov %0, %%gs" : : "g"(sel));
211         print_func_exit();
212 }
213
214 #ifndef load_ldt
215 static void load_ldt(uint16_t sel)
216 {
217         print_func_entry();
218         asm("lldt %0" : : "g"(sel));
219         print_func_exit();
220 }
221 #endif
222
223 static void fx_save(void *image)
224 {
225         print_func_entry();
226         asm("fxsave (%0)"::"r"(image));
227         print_func_exit();
228 }
229
230 static void fx_restore(void *image)
231 {
232         print_func_entry();
233         asm("fxrstor (%0)"::"r"(image));
234         print_func_exit();
235 }
236
237 static void fpu_init(void)
238 {
239         print_func_entry();
240         asm("finit");
241         print_func_exit();
242 }
243
244 struct segment_descriptor {
245         uint16_t limit_low;
246         uint16_t base_low;
247         uint8_t base_mid;
248         uint8_t type:4;
249         uint8_t system:1;
250         uint8_t dpl:2;
251         uint8_t present:1;
252         uint8_t limit_high:4;
253         uint8_t avl:1;
254         uint8_t long_mode:1;
255         uint8_t default_op:1;
256         uint8_t granularity:1;
257         uint8_t base_high;
258 } __attribute__ ((packed));
259
260 #ifdef __x86_64__
261 // LDT or TSS descriptor in the GDT. 16 bytes.
262 struct segment_descriptor_64 {
263         struct segment_descriptor s;
264         uint32_t base_higher;
265         uint32_t pad_zero;
266 };
267
268 #endif
269
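/* Recover the linear base address of the segment named by @selector by
 * reading its descriptor out of the GDT (or the LDT, for LDT selectors).
 * On x86_64, system descriptors are 16 bytes and carry 32 extra base bits. */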
270 static unsigned long segment_base(uint16_t selector)
271 {
272         print_func_entry();
273         struct descriptor_table gdt;
274         struct segment_descriptor *d;
275         unsigned long table_base;
276         typedef unsigned long ul;
277         unsigned long v;
278
279         asm("sgdt %0" : "=m"(gdt));
280         table_base = gdt.base;
281
282         if (selector & 4) {     /* from ldt */
283                 uint16_t ldt_selector;
284
285         asm("sldt %0" : "=g"(ldt_selector));
286                 table_base = segment_base(ldt_selector);
287         }
288         d = (struct segment_descriptor *)(table_base + (selector & ~7));
289         v = d->base_low | ((ul) d->base_mid << 16) | ((ul) d->base_high << 24);
290 #ifdef __x86_64__
291         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
292                 v |= ((ul) ((struct segment_descriptor_64 *)d)->base_higher) << 32;
293 #endif
294         print_func_exit();
295         return v;
296 }
297
298 static unsigned long read_tr_base(void)
299 {
300         print_func_entry();
301         uint16_t tr;
302         asm("str %0" : "=g"(tr));
303         print_func_exit();
304         return segment_base(tr);
305 }
306
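/* VM exit reloads TR but leaves the GDT descriptor alone, and ltr faults on
 * a busy TSS descriptor, so mark it available (type 9) before reloading. */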
307 static void reload_tss(void)
308 {
309         print_func_entry();
310 #ifndef __x86_64__
311
312         /*
313          * VT restores TR but not its size.  Useless.
314          */
315         struct descriptor_table gdt;
316         struct segment_descriptor *descs;
317
318         get_gdt(&gdt);
319         descs = (void *)gdt.base;
320         descs[GDT_ENTRY_TSS].type = 9;  /* available TSS */
321         load_TR_desc();
322 #endif
323         print_func_exit();
324 }
325
326 static struct vmcs_descriptor {
327         int size;
328         int order;
329         uint32_t revision_id;
330 } vmcs_descriptor;
331
332 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
333 {
334         print_func_entry();
335         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
336         print_func_exit();
337         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
338 }
339
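/* Copy up to @size bytes from guest-virtual @addr into @dest, translating
 * one page at a time; stops at the first unmapped page and returns the
 * number of bytes actually copied. */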
340 int litevm_read_guest(struct litevm_vcpu *vcpu,
341                                           gva_t addr, unsigned long size, void *dest)
342 {
343         print_func_entry();
344         unsigned char *host_buf = dest;
345         unsigned long req_size = size;
346
347         while (size) {
348                 hpa_t paddr;
349                 unsigned now;
350                 unsigned offset;
351                 hva_t guest_buf;
352
353                 paddr = gva_to_hpa(vcpu, addr);
354
355                 if (is_error_hpa(paddr))
356                         break;
357                 guest_buf = (hva_t) KADDR(paddr);
358                 offset = addr & ~PAGE_MASK;
359                 guest_buf |= offset;
360                 now = MIN(size, PAGE_SIZE - offset);
361                 memcpy(host_buf, (void *)guest_buf, now);
362                 host_buf += now;
363                 addr += now;
364                 size -= now;
365         }
366         print_func_exit();
367         return req_size - size;
368 }
369
370 int litevm_write_guest(struct litevm_vcpu *vcpu,
371                                            gva_t addr, unsigned long size, void *data)
372 {
373         print_func_entry();
374         unsigned char *host_buf = data;
375         unsigned long req_size = size;
376
377         while (size) {
378                 hpa_t paddr;
379                 unsigned now;
380                 unsigned offset;
381                 hva_t guest_buf;
382
383                 paddr = gva_to_hpa(vcpu, addr);
384
385                 if (is_error_hpa(paddr))
386                         break;
387
388                 guest_buf = (hva_t) KADDR(paddr);
389                 offset = addr & ~PAGE_MASK;
390                 guest_buf |= offset;
391                 now = MIN(size, PAGE_SIZE - offset);
392                 memcpy((void *)guest_buf, host_buf, now);
393                 host_buf += now;
394                 addr += now;
395                 size -= now;
396         }
397         print_func_exit();
398         return req_size - size;
399 }
400
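/* IA32_VMX_BASIC tells us how big a VMCS region must be and which revision
 * id has to be stamped into it before VMPTRLD/VMXON will accept it. */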
401 static void setup_vmcs_descriptor(void)
402 {
403         print_func_entry();
404         uint64_t msr;
405
406         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
407         vmcs_descriptor.size = (msr >> 32) & 0x1fff;
408         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size >> PAGE_SHIFT);
409         vmcs_descriptor.revision_id = (uint32_t) msr;
410         printk("setup_vmcs_descriptor: msr 0x%llx, size 0x%x order 0x%x id 0x%x\n",
411                msr, vmcs_descriptor.size, vmcs_descriptor.order,
412                vmcs_descriptor.revision_id);
413         print_func_exit();
414 };
415
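/* VMCLEAR flushes any cached state for this VMCS back to memory and marks
 * it inactive, so it can later be loaded on a different core. */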
416 static void vmcs_clear(struct vmcs *vmcs)
417 {
418         print_func_entry();
419         uint64_t phys_addr = PADDR(vmcs);
420         uint8_t error;
421         printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
422         asm volatile ("vmclear %1; setna %0":"=m" (error):"m"(phys_addr):"cc",
423                                   "memory");
424         if (error)
425                 printk("litevm: vmclear fail: %p/%llx\n", vmcs, phys_addr);
426         print_func_exit();
427 }
428
429 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
430 {
431         print_func_entry();
432         struct litevm_vcpu *vcpu = arg;
433         int cpu = core_id();
434         printd("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p "
435                "vcpu->vmcs %p\n",
436                cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
437
438         if (vcpu->cpu == cpu)
439                 vmcs_clear(vcpu->vmcs);
440
441         if (currentcpu->vmcs == vcpu->vmcs)
442                 currentcpu->vmcs = NULL;
443         print_func_exit();
444 }
445
446 static int vcpu_slot(struct litevm_vcpu *vcpu)
447 {
448         print_func_entry();
449         print_func_exit();
450         return vcpu - vcpu->litevm->vcpus;
451 }
452
453 /*
454  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
455  * vcpu mutex is already taken.
456  */
457 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
458 {
459         print_func_entry();
460         uint64_t phys_addr = PADDR(vcpu->vmcs);
461         int cpu;
462         cpu = core_id();
463
464         printk("__vcpu_load: vcpu->cpu %d cpu %d\n", vcpu->cpu, cpu);
465         if ((vcpu->cpu != cpu) && (vcpu->cpu != -1)){
466                 handler_wrapper_t *w;
467                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
468                 smp_call_wait(w);
469                 vcpu->launched = 0;
470         }
471
472         printk("2 ..");
473         if (currentcpu->vmcs != vcpu->vmcs) {
474                 uint8_t error;
475
476                 currentcpu->vmcs = vcpu->vmcs;
477                 asm volatile ("vmptrld %1; setna %0":"=m" (error):"m"(phys_addr):"cc");
478                 if (error) {
479                         printk("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
480                         error("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
481                 }
482         }
483
484         printk("3 ..");
485         if (vcpu->cpu != cpu) {
486                 struct descriptor_table dt;
487                 unsigned long sysenter_esp;
488
489                 vcpu->cpu = cpu;
490                 /*
491                  * Linux uses per-cpu TSS and GDT, so set these when switching
492                  * processors.
493                  */
494                 vmcs_writel(HOST_TR_BASE, read_tr_base());      /* 22.2.4 */
495                 get_gdt(&dt);
496                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
497
498                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
499                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
500         }
501         print_func_exit();
502         return vcpu;
503 }
504
505 /*
506  * Switches to specified vcpu, until a matching vcpu_put()
507  * And leaves it locked!
508  */
509 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
510 {
511         struct litevm_vcpu *ret;
512         print_func_entry();
513         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
514
515         printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
516
517         QLOCK(&vcpu->mutex);
518         printk("Locked\n");
519         if (!vcpu->vmcs) {
520                 QUNLOCK(&vcpu->mutex);
521                 printk("vcpu->vmcs for vcpu %p is NULL", vcpu);
522                 error("vcpu->vmcs is NULL");
523         }
524         ret = __vcpu_load(vcpu);
525         print_func_exit();
526         return ret;
527 }
528
529 static void vcpu_put(struct litevm_vcpu *vcpu)
530 {
531         print_func_entry();
532         //put_cpu();
533         QUNLOCK(&vcpu->mutex);
534         print_func_exit();
535 }
536
537 static struct vmcs *alloc_vmcs_cpu(int cpu)
538 {
539         print_func_entry();
540         int node = node_id();
541         struct vmcs *vmcs;
542
543         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
544         if (!vmcs) {
545                 print_func_exit();
546                 printk("no memory for vcpus");
547                 error("no memory for vcpus");
548         }
549         memset(vmcs, 0, vmcs_descriptor.size);
550         vmcs->revision_id = vmcs_descriptor.revision_id;        /* vmcs revision id */
551         print_func_exit();
552         return vmcs;
553 }
554
555 static struct vmcs *alloc_vmcs(void)
556 {
557         struct vmcs *ret;
558         print_func_entry();
559         ret = alloc_vmcs_cpu(core_id());
560         print_func_exit();
561         return ret;
562 }
563
564 static int cpu_has_litevm_support(void)
565 {
566         print_func_entry();
567         /* sigh ... qemu. */
568         char vid[16];
569         if (vendor_id(vid) < 0)
570                 return 0;
571         printk("vendor id is %s\n", vid);
572         if (vid[0] == 'Q') /* qemu */
573                 return 0;
574         if (vid[0] == 'A') /* AMD or qemu claiming to be AMD */
575                 return 0;
576         uint32_t ecx = cpuid_ecx(1);
577         print_func_exit();
578         return ecx & (1 << 5);  /* CPUID.1:ECX.VMX[bit 5] -> VT */
579 }
580
581 static int vmx_disabled_by_bios(void)
582 {
583         print_func_entry();
584         uint64_t msr;
585
586         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
587         print_func_exit();
588         return (msr & 5) == 1;  /* locked but not enabled */
589 }
590
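/* Per-core VMX bring-up: allocate and tag a VMXON region, set the lock and
 * enable bits in IA32_FEATURE_CONTROL if the BIOS left them clear, turn on
 * CR4.VMXE, and finally execute VMXON. */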
591 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
592 {
593         print_func_entry();
594         int cpu = hw_core_id();
595         uint64_t phys_addr;
596         uint64_t old;
597         uint64_t status = 0;
598         currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
599                                                                                           KMALLOC_WAIT);
600         if (!currentcpu->vmxarea)
601                 return;
602         memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
603         currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
604         phys_addr = PADDR(currentcpu->vmxarea);
605         printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
606                    currentcpu->vmxarea, (void *)phys_addr);
607         if (phys_addr & 0xfff) {
608                 printk("fix vmxarea alignment!");
609         }
610         printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
611         old = read_msr(MSR_IA32_FEATURE_CONTROL);
612         printk("%d: vm_enable, old is %d\n", core_id(), old);
613         if ((old & 5) == 0) {
614                 /* enable and lock */
615                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
616                 old = read_msr(MSR_IA32_FEATURE_CONTROL);
617                 printk("%d:vm_enable, tried to set 5, old is %d\n", core_id(), old);
618         }
619         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
620         lcr4(rcr4() | CR4_VMXE);        /* FIXME: not cpu hotplug safe */
621         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
622         printk("%d:cr0 is %x\n", core_id(), rcr0());
623         lcr0(rcr0() | 0x20);
624         printk("%d:cr0 is %x\n", core_id(), rcr0());
625         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
626         outb(0x92, inb(0x92) | 2);
627         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
628         asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:":"=m" (status):"m"
629                                   (phys_addr):"memory", "cc");
630         printk("%d:vmxon status is %d\n", core_id(), status);
631         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
632         if (!status) {
633                 printk("%d:vm_enable: status says fail\n", core_id());
634         }
635         print_func_exit();
636 }
637
638 static void litevm_disable(void *garbage)
639 {
640         print_func_entry();
641         asm volatile ("vmxoff":::"cc");
642         print_func_exit();
643 }
644
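/* Allocate the per-VM litevm struct and initialize each vcpu's mutex and MMU
 * state; no VMCS exists until vmx_create_vcpu() is called. */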
645 struct litevm *vmx_open(void)
646 {
647         print_func_entry();
648         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
649         int i;
650
651         printk("vmx_open: litevm is %p\n", litevm);
652         if (!litevm) {
653                 printk("NO LITEVM! MAKES NO SENSE!\n");
654                 error("litevm alloc failed");
655                 print_func_exit();
656                 return 0;
657         }
658
659         SPLI_irqsave(&litevm->lock);
660         LIST_INIT(&litevm->link);
661         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
662                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
663                 printk("init vcpu %p\n", vcpu);
664
665                 QLOCK_init(&vcpu->mutex);
666                 vcpu->mmu.root_hpa = INVALID_PAGE;
667                 vcpu->litevm = litevm;
668                 LIST_INIT(&vcpu->link);
669         }
670         printk("vmx_open: busy %d\n", litevm->busy);
671         printk("return %p\n", litevm);
672         print_func_exit();
673         return litevm;
674 }
675
676 /*
677  * Free any memory in @free but not in @dont.
678  */
679 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
680                                                                          struct litevm_memory_slot *dont)
681 {
682         print_func_entry();
683         int i;
684
685         if (!dont || free->phys_mem != dont->phys_mem)
686                 if (free->phys_mem) {
687                         for (i = 0; i < free->npages; ++i) {
688                                 page_t *page = free->phys_mem[i];
689                                 page_decref(page);
690                                 assert(page_is_free(page2ppn(page)));
691                         }
692                         kfree(free->phys_mem);
693                 }
694
695         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
696                 kfree(free->dirty_bitmap);
697
698         free->phys_mem = 0;
699         free->npages = 0;
700         free->dirty_bitmap = 0;
701         print_func_exit();
702 }
703
704 static void litevm_free_physmem(struct litevm *litevm)
705 {
706         print_func_entry();
707         int i;
708
709         for (i = 0; i < litevm->nmemslots; ++i)
710                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
711         print_func_exit();
712 }
713
714 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
715 {
716         print_func_entry();
717         if (vcpu->vmcs) {
718                 handler_wrapper_t *w;
719                 smp_call_function_all(__vcpu_clear, vcpu, &w);
720                 smp_call_wait(w);
721                 //free_vmcs(vcpu->vmcs);
722                 vcpu->vmcs = 0;
723         }
724         print_func_exit();
725 }
726
727 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
728 {
729         print_func_entry();
730         litevm_free_vmcs(vcpu);
731         litevm_mmu_destroy(vcpu);
732         print_func_exit();
733 }
734
735 static void litevm_free_vcpus(struct litevm *litevm)
736 {
737         print_func_entry();
738         unsigned int i;
739
740         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
741                 litevm_free_vcpu(&litevm->vcpus[i]);
742         print_func_exit();
743 }
744
745 static int litevm_dev_release(struct litevm *litevm)
746 {
747         print_func_entry();
748
749         litevm_free_vcpus(litevm);
750         litevm_free_physmem(litevm);
751         kfree(litevm);
752         print_func_exit();
753         return 0;
754 }
755
756 unsigned long vmcs_readl(unsigned long field)
757 {
758         unsigned long value;
759
760         asm volatile ("vmread %1, %0":"=g" (value):"r"(field):"cc");
761         return value;
762 }
763
764 void vmcs_writel(unsigned long field, unsigned long value)
765 {
766         uint8_t error;
767
768         asm volatile ("vmwrite %1, %2; setna %0":"=g" (error):"r"(value),
769                                   "r"(field):"cc");
770         if (error)
771                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
772                            field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
773 }
774
775 static void vmcs_write16(unsigned long field, uint16_t value)
776 {
777         vmcs_writel(field, value);
778 }
779
780 static void vmcs_write64(unsigned long field, uint64_t value)
781 {
782         print_func_entry();
783 #ifdef __x86_64__
784         vmcs_writel(field, value);
785 #else
786         vmcs_writel(field, value);
787         asm volatile ("");
788         vmcs_writel(field + 1, value >> 32);
789 #endif
790         print_func_exit();
791 }
792
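/* Queue a #GP(0) for the guest by filling the VM-entry interruption-info
 * field; it will be delivered on the next VM entry. */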
793 static void inject_gp(struct litevm_vcpu *vcpu)
794 {
795         print_func_entry();
796         printd("inject_general_protection: rip 0x%lx\n", vmcs_readl(GUEST_RIP));
797         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
798         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
799                                  GP_VECTOR |
800                                  INTR_TYPE_EXCEPTION |
801                                  INTR_INFO_DELIEVER_CODE_MASK | INTR_INFO_VALID_MASK);
802         print_func_exit();
803 }
804
805 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
806 {
807         print_func_entry();
808         if (vcpu->rmode.active)
809                 vmcs_write32(EXCEPTION_BITMAP, ~0);
810         else
811                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
812         print_func_exit();
813 }
814
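/* Leave emulated real mode: restore the TR state and IOPL we stashed in
 * enter_rmode() and give the data segments sane protected-mode attributes. */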
815 static void enter_pmode(struct litevm_vcpu *vcpu)
816 {
817         print_func_entry();
818         unsigned long flags;
819
820         vcpu->rmode.active = 0;
821
822         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
823         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
824         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
825
826         flags = vmcs_readl(GUEST_RFLAGS);
827         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
828         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
829         vmcs_writel(GUEST_RFLAGS, flags);
830
831         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
832                                 (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK));
833
834         update_exception_bitmap(vcpu);
835
836 #define FIX_PMODE_DATASEG(seg, save) {                          \
837                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
838                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
839                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
840                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
841         }
842
843         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
844         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
845         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
846         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
847         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
848
849         vmcs_write16(GUEST_CS_SELECTOR,
850                                  vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
851         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
852         print_func_exit();
853 }
854
855 static int rmode_tss_base(struct litevm *litevm)
856 {
857         print_func_entry();
858         gfn_t base_gfn =
859                 litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
860         print_func_exit();
861         return base_gfn << PAGE_SHIFT;
862 }
863
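/* Real mode is emulated with vm86: set VM and IOPL in RFLAGS, point TR at
 * the scratch TSS, and derive each segment selector from its base (base >> 4). */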
864 static void enter_rmode(struct litevm_vcpu *vcpu)
865 {
866         print_func_entry();
867         unsigned long flags;
868
869         vcpu->rmode.active = 1;
870
871         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
872         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
873
874         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
875         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
876
877         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
878         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
879
880         flags = vmcs_readl(GUEST_RFLAGS);
881         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
882
883         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
884
885         vmcs_writel(GUEST_RFLAGS, flags);
886         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
887         update_exception_bitmap(vcpu);
888
889 #define FIX_RMODE_SEG(seg, save) {                                 \
890                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
891                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
892                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
893                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
894         }
895
896         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
897         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
898
899         FIX_RMODE_SEG(ES, vcpu->rmode.es);
900         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
901         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
902         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
903         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
904         print_func_exit();
905 }
906
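/* Build the three-page TSS (base + interrupt redirection bitmap + I/O
 * bitmap) that the vm86-based real-mode emulation relies on. */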
907 static int init_rmode_tss(struct litevm *litevm)
908 {
909         print_func_entry();
910         struct page *p1, *p2, *p3;
911         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
912         char *page;
913
914         p1 = _gfn_to_page(litevm, fn++);
915         p2 = _gfn_to_page(litevm, fn++);
916         p3 = _gfn_to_page(litevm, fn);
917
918         if (!p1 || !p2 || !p3) {
919                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
920                 print_func_exit();
921                 return 0;
922         }
923
924         page = page2kva(p1);
925         memset(page, 0, PAGE_SIZE);
926         *(uint16_t *) (page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
927
928         page = page2kva(p2);
929         memset(page, 0, PAGE_SIZE);
930
931         page = page2kva(p3);
932         memset(page, 0, PAGE_SIZE);
933         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
934
935         print_func_exit();
936         return 1;
937 }
938
939 #ifdef __x86_64__
940
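/* Keep the VM-entry IA-32e-mode control and the guest's saved EFER in sync
 * with the requested EFER.LMA value. */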
941 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
942 {
943         print_func_entry();
944         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
945
946         vcpu->shadow_efer = efer;
947         if (efer & EFER_LMA) {
948                 vmcs_write32(VM_ENTRY_CONTROLS,
949                                          vmcs_read32(VM_ENTRY_CONTROLS) |
950                                          VM_ENTRY_CONTROLS_IA32E_MASK);
951                 msr->data = efer;
952
953         } else {
954                 vmcs_write32(VM_ENTRY_CONTROLS,
955                                          vmcs_read32(VM_ENTRY_CONTROLS) &
956                                          ~VM_ENTRY_CONTROLS_IA32E_MASK);
957
958                 msr->data = efer & ~EFER_LME;
959         }
960         print_func_exit();
961 }
962
963 static void enter_lmode(struct litevm_vcpu *vcpu)
964 {
965         print_func_entry();
966         uint32_t guest_tr_ar;
967
968         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
969         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
970                 printd("%s: tss fixup for long mode. \n", __FUNCTION__);
971                 vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
972                                          | AR_TYPE_BUSY_64_TSS);
973         }
974
975         vcpu->shadow_efer |= EFER_LMA;
976
977         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
978         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
979                                  | VM_ENTRY_CONTROLS_IA32E_MASK);
980         print_func_exit();
981 }
982
983 static void exit_lmode(struct litevm_vcpu *vcpu)
984 {
985         print_func_entry();
986         vcpu->shadow_efer &= ~EFER_LMA;
987
988         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
989                                  & ~VM_ENTRY_CONTROLS_IA32E_MASK);
990         print_func_exit();
991 }
992
993 #endif
994
995 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
996 {
997         print_func_entry();
998         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
999                 enter_pmode(vcpu);
1000
1001         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
1002                 enter_rmode(vcpu);
1003
1004 #ifdef __x86_64__
1005         if (vcpu->shadow_efer & EFER_LME) {
1006                 if (!is_paging() && (cr0 & CR0_PG_MASK))
1007                         enter_lmode(vcpu);
1008                 if (is_paging() && !(cr0 & CR0_PG_MASK))
1009                         exit_lmode(vcpu);
1010         }
1011 #endif
1012
1013         vmcs_writel(CR0_READ_SHADOW, cr0);
1014         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
1015         print_func_exit();
1016 }
1017
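/* With PAE paging the four PDPTEs are loaded from memory when CR3 is
 * written; refuse a cr3 whose present PDPTEs have reserved bits set. */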
1018 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
1019                                                                                  unsigned long cr3)
1020 {
1021         print_func_entry();
1022         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
1023         unsigned offset = (cr3 & (PAGE_SIZE - 1)) >> 5;
1024         int i;
1025         uint64_t pdpte;
1026         uint64_t *pdpt;
1027         struct litevm_memory_slot *memslot;
1028
1029         SPLL(&vcpu->litevm->lock);
1030         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
1031         /* FIXME: !memslot - emulate? 0xff? */
1032         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
1033
1034         for (i = 0; i < 4; ++i) {
1035                 pdpte = pdpt[offset + i];
1036                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
1037                         break;
1038         }
1039
1040         SPLU(&vcpu->litevm->lock);
1041
1042         print_func_exit();
1043         return i != 4;
1044 }
1045
1046 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1047 {
1048         print_func_entry();
1049         if (cr0 & CR0_RESEVED_BITS) {
1050                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, guest_cr0());
1051                 inject_gp(vcpu);
1052                 print_func_exit();
1053                 return;
1054         }
1055
1056         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
1057                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
1058                 inject_gp(vcpu);
1059                 print_func_exit();
1060                 return;
1061         }
1062
1063         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
1064                 printd("set_cr0: #GP, set PG flag " "and a clear PE flag\n");
1065                 inject_gp(vcpu);
1066                 print_func_exit();
1067                 return;
1068         }
1069
1070         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
1071 #ifdef __x86_64__
1072                 if ((vcpu->shadow_efer & EFER_LME)) {
1073                         uint32_t guest_cs_ar;
1074                         if (!is_pae()) {
1075                                 printd("set_cr0: #GP, start paging "
1076                                            "in long mode while PAE is disabled\n");
1077                                 inject_gp(vcpu);
1078                                 print_func_exit();
1079                                 return;
1080                         }
1081                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1082                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
1083                                 printd("set_cr0: #GP, start paging "
1084                                            "in long mode while CS.L == 1\n");
1085                                 inject_gp(vcpu);
1086                                 print_func_exit();
1087                                 return;
1088
1089                         }
1090                 } else
1091 #endif
1092                 if (is_pae() && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1093                         printd("set_cr0: #GP, pdptrs " "reserved bits\n");
1094                         inject_gp(vcpu);
1095                         print_func_exit();
1096                         return;
1097                 }
1098
1099         }
1100
1101         __set_cr0(vcpu, cr0);
1102         litevm_mmu_reset_context(vcpu);
1103         print_func_exit();
1104         return;
1105 }
1106
1107 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
1108 {
1109         print_func_entry();
1110         unsigned long cr0 = guest_cr0();
1111
1112         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
1113                 enter_pmode(vcpu);
1114                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
1115
1116         } else
1117                 printd("lmsw: unexpected\n");
1118
1119         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
1120                                 | (msw & LMSW_GUEST_MASK));
1121         print_func_exit();
1122 }
1123
1124 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1125 {
1126         print_func_entry();
1127         vmcs_writel(CR4_READ_SHADOW, cr4);
1128         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1129                                                                   LITEVM_RMODE_VM_CR4_ALWAYS_ON :
1130                                                                   LITEVM_PMODE_VM_CR4_ALWAYS_ON));
1131         print_func_exit();
1132 }
1133
1134 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1135 {
1136         print_func_entry();
1137         if (cr4 & CR4_RESEVED_BITS) {
1138                 printd("set_cr4: #GP, reserved bits\n");
1139                 inject_gp(vcpu);
1140                 print_func_exit();
1141                 return;
1142         }
1143
1144         if (is_long_mode()) {
1145                 if (!(cr4 & CR4_PAE_MASK)) {
1146                         printd("set_cr4: #GP, clearing PAE while " "in long mode\n");
1147                         inject_gp(vcpu);
1148                         print_func_exit();
1149                         return;
1150                 }
1151         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
1152                            && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1153                 printd("set_cr4: #GP, pdptrs reserved bits\n");
1154                 inject_gp(vcpu);
1155         }
1156
1157         if (cr4 & CR4_VMXE_MASK) {
1158                 printd("set_cr4: #GP, setting VMXE\n");
1159                 inject_gp(vcpu);
1160                 print_func_exit();
1161                 return;
1162         }
1163         __set_cr4(vcpu, cr4);
1164         SPLL(&vcpu->litevm->lock);
1165         litevm_mmu_reset_context(vcpu);
1166         SPLU(&vcpu->litevm->lock);
1167         print_func_exit();
1168 }
1169
1170 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1171 {
1172         print_func_entry();
1173         if (is_long_mode()) {
1174                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
1175                         printd("set_cr3: #GP, reserved bits\n");
1176                         inject_gp(vcpu);
1177                         print_func_exit();
1178                         return;
1179                 }
1180         } else {
1181                 if (cr3 & CR3_RESEVED_BITS) {
1182                         printd("set_cr3: #GP, reserved bits\n");
1183                         inject_gp(vcpu);
1184                         print_func_exit();
1185                         return;
1186                 }
1187                 if (is_paging() && is_pae() && pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1188                         printd("set_cr3: #GP, pdptrs " "reserved bits\n");
1189                         inject_gp(vcpu);
1190                         print_func_exit();
1191                         return;
1192                 }
1193         }
1194
1195         vcpu->cr3 = cr3;
1196         SPLL(&vcpu->litevm->lock);
1197         vcpu->mmu.new_cr3(vcpu);
1198         SPLU(&vcpu->litevm->lock);
1199         print_func_exit();
1200 }
1201
1202 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1203 {
1204         print_func_entry();
1205         if (cr8 & CR8_RESEVED_BITS) {
1206                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1207                 inject_gp(vcpu);
1208                 print_func_exit();
1209                 return;
1210         }
1211         vcpu->cr8 = cr8;
1212         print_func_exit();
1213 }
1214
1215 static uint32_t get_rdx_init_val(void)
1216 {
1217         print_func_entry();
1218         uint32_t val;
1219
1220         asm("movl $1, %%eax \n\t" "movl %%eax, %0 \n\t" : "=g"(val));
1221         print_func_exit();
1222         return val;
1223
1224 }
1225
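/* Capture a clean FXSAVE image for the guest (freshly finit'ed FPU, MXCSR at
 * its 0x1f80 reset value) while preserving the host's FPU state. */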
1226 static void fx_init(struct litevm_vcpu *vcpu)
1227 {
1228         print_func_entry();
1229         struct __attribute__ ((__packed__)) fx_image_s {
1230                 uint16_t control;               //fcw
1231                 uint16_t status;                //fsw
1232                 uint16_t tag;                   // ftw
1233                 uint16_t opcode;                //fop
1234                 uint64_t ip;                    // fpu ip
1235                 uint64_t operand;               // fpu dp
1236                 uint32_t mxcsr;
1237                 uint32_t mxcsr_mask;
1238
1239         } *fx_image;
1240
1241         fx_save(vcpu->host_fx_image);
1242         fpu_init();
1243         fx_save(vcpu->guest_fx_image);
1244         fx_restore(vcpu->host_fx_image);
1245
1246         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1247         fx_image->mxcsr = 0x1f80;
1248         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1249                    0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1250         print_func_exit();
1251 }
1252
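/* The VMX capability MSRs report must-be-one bits in their low 32 bits and
 * may-be-one bits in their high 32 bits; constrain @val to both before
 * writing the control field. */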
1253 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field,
1254                                                                    uint32_t val)
1255 {
1256         uint32_t msr_high, msr_low;
1257         uint64_t msrval;
1258
1259         msrval = read_msr(msr);
1260         msr_low = msrval;
1261         msr_high = (msrval >> 32);
1262
1263         val &= msr_high;
1264         val |= msr_low;
1265         vmcs_write32(vmcs_field, val);
1266 }
1267
1268 /*
1269  * Sets up the vmcs for emulated real mode.
1270  */
1271 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1272 {
1273         print_func_entry();
1274
1275 /* no op on x86_64 */
1276 #define asmlinkage
1277         extern asmlinkage void litevm_vmx_return(void);
1278         uint32_t host_sysenter_cs;
1279         uint32_t junk;
1280         uint64_t a;
1281         struct descriptor_table dt;
1282         int i;
1283         int ret;
1284         uint64_t tsc;
1285         int nr_good_msrs;
1286
1287         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1288         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1289         vcpu->cr8 = 0;
1290         vcpu->apic_base = 0xfee00000 |
1291                 /*for vcpu 0 */ MSR_IA32_APICBASE_BSP |
1292                 MSR_IA32_APICBASE_ENABLE;
1293
1294         fx_init(vcpu);
1295
1296 #define SEG_SETUP(seg) do {                                     \
1297                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1298                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1299                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1300                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1301         } while (0)
1302
1303         /*
1304          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1305          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1306          */
1307         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1308         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1309         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1310         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1311
1312         SEG_SETUP(DS);
1313         SEG_SETUP(ES);
1314         SEG_SETUP(FS);
1315         SEG_SETUP(GS);
1316         SEG_SETUP(SS);
1317
1318         vmcs_write16(GUEST_TR_SELECTOR, 0);
1319         vmcs_writel(GUEST_TR_BASE, 0);
1320         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1321         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1322
1323         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1324         vmcs_writel(GUEST_LDTR_BASE, 0);
1325         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1326         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1327
1328         vmcs_write32(GUEST_SYSENTER_CS, 0);
1329         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1330         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1331
1332         vmcs_writel(GUEST_RFLAGS, 0x02);
1333         vmcs_writel(GUEST_RIP, 0xfff0);
1334         vmcs_writel(GUEST_RSP, 0);
1335
1336         vmcs_writel(GUEST_CR3, 0);
1337
1338         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1339         vmcs_writel(GUEST_DR7, 0x400);
1340
1341         vmcs_writel(GUEST_GDTR_BASE, 0);
1342         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1343
1344         vmcs_writel(GUEST_IDTR_BASE, 0);
1345         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1346
1347         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1348         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1349         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1350
1351         /* I/O */
1352         vmcs_write64(IO_BITMAP_A, 0);
1353         vmcs_write64(IO_BITMAP_B, 0);
1354
1355         tsc = read_tsc();
1356         vmcs_write64(TSC_OFFSET, -tsc);
1357
1358         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1359
1360         /* Special registers */
1361         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1362
1363         /* Control */
1364         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_EXT_INTR_MASK       /* 20.6.1 */
1365                                                    | PIN_BASED_NMI_EXITING      /* 20.6.1 */
1366                 );
1367         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_HLT_EXITING        /* 20.6.2 */
1368                                                    | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1369                                                    | CPU_BASED_CR8_STORE_EXITING        /* 20.6.2 */
1370                                                    | CPU_BASED_UNCOND_IO_EXITING        /* 20.6.2 */
1371                                                    | CPU_BASED_INVDPG_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING  /* 21.3 */
1372                 );
1373
1374         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1375         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1376         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1377         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1378
1379         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1380         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1381         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1382
1383         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);    /* 22.2.4 */
1384         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1385         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1386         vmcs_write16(HOST_FS_SELECTOR, read_fs());      /* 22.2.4 */
1387         vmcs_write16(HOST_GS_SELECTOR, read_gs());      /* 22.2.4 */
1388         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1389
1390 #ifdef __x86_64__
1391         a = read_msr(MSR_FS_BASE);
1392         vmcs_writel(HOST_FS_BASE, a);   /* 22.2.4 */
1393         a = read_msr(MSR_GS_BASE);
1394         vmcs_writel(HOST_GS_BASE, a);   /* 22.2.4 */
1395 #else
1396         vmcs_writel(HOST_FS_BASE, 0);   /* 22.2.4 */
1397         vmcs_writel(HOST_GS_BASE, 0);   /* 22.2.4 */
1398 #endif
1399
1400         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS * 8);      /* 22.2.4 */
1401
1402         get_idt(&dt);
1403         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1404
1405         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return);        /* 22.2.5 */
1406
1407         /* it's the HIGH 32 bits! */
1408         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1409         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1410         a = read_msr(MSR_IA32_SYSENTER_ESP);
1411         vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1412         a = read_msr(MSR_IA32_SYSENTER_EIP);
1413         vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1414
1415         ret = -ENOMEM;
1416         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1417         if (!vcpu->guest_msrs)
1418                 error("guest_msrs kmalloc failed");
1419         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1420         if (!vcpu->host_msrs)
1421                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1422
1423         for (i = 0; i < NR_VMX_MSR; ++i) {
1424                 uint32_t index = vmx_msr_index[i];
1425                 uint32_t data_low, data_high;
1426                 uint64_t data;
1427                 int j = vcpu->nmsrs;
1428
1429 #warning "need readmsr_safe"
1430 //      if (rdmsr_safe(index, &data_low, &data_high) < 0)
1431 //          continue;
1432                 data = read_msr(index);
1433                 vcpu->host_msrs[j].index = index;
1434                 vcpu->host_msrs[j].reserved = 0;
1435                 vcpu->host_msrs[j].data = data;
1436                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1437                 ++vcpu->nmsrs;
1438         }
1439         printk("msrs: %d\n", vcpu->nmsrs);
1440
1441         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1442         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1443         vmcs_writel(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1444         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1445         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, (HOST_IS_64 << 9));        /* 22.2,1, 20.7.1 */
1446         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs);    /* 22.2.2 */
1447         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);     /* 22.2.2 */
1448         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs);    /* 22.2.2 */
1449
1450         /* 22.2.1, 20.8.1 */
1451         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CONTROLS, 0);
1452         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1453
1454         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1455         vmcs_writel(TPR_THRESHOLD, 0);
1456
1457         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1458         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1459
1460         __set_cr0(vcpu, 0x60000010);    // enter rmode
1461         __set_cr4(vcpu, 0);
1462 #ifdef __x86_64__
1463         __set_efer(vcpu, 0);
1464 #endif
1465
1466         ret = litevm_mmu_init(vcpu);
1467
1468         print_func_exit();
1469         return ret;
1470
1471 out_free_guest_msrs:
1472         kfree(vcpu->guest_msrs);
1473 out:
1474         return ret;
1475 }
1476
1477 /*
1478  * Sync the rsp and rip registers into the vcpu structure.  This allows
1479  * registers to be accessed by indexing vcpu->regs.
1480  */
1481 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1482 {
1483         print_func_entry();
1484         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1485         vcpu->rip = vmcs_readl(GUEST_RIP);
1486         print_func_exit();
1487 }
1488
1489 /*
1490  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1491  * modification.
1492  */
1493 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1494 {
1495         print_func_entry();
1496         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1497         vmcs_writel(GUEST_RIP, vcpu->rip);
1498         print_func_exit();
1499 }
1500
1501 /*
1502  * Creates some virtual cpus.  Good luck creating more than one.
1503  */
1504 int vmx_create_vcpu(struct litevm *litevm, int n)
1505 {
1506         print_func_entry();
1507         ERRSTACK(2);
1508         int r;
1509         struct litevm_vcpu *vcpu;
1510         struct vmcs *vmcs;
1511         char *errstring = NULL;
1512
1513         if (n < 0 || n >= LITEVM_MAX_VCPUS) {
1514                 printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1515                            LITEVM_MAX_VCPUS);
1516                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1517                           LITEVM_MAX_VCPUS);
1518         }
1519         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1520         vcpu = &litevm->vcpus[n];
1521
1522         printk("vmx_create_vcpu: @%d, %p\n", n, vcpu);
1523         QLOCK(&vcpu->mutex);
1524
1525         if (vcpu->vmcs) {
1526                 QUNLOCK(&vcpu->mutex);
1527                 printk("VM already exists\n");
1528                 error("VM already exists");
1529         }
1530         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1531         /* I'm a bad person */
1532         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1533         uint64_t a = (uint64_t) vcpu->fx_buf;
1534         a += FX_IMAGE_ALIGN - 1;
1535         a /= FX_IMAGE_ALIGN;
1536         a *= FX_IMAGE_ALIGN;
1537
1538         vcpu->host_fx_image = (char *)a;
1539         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1540
1541         vcpu->cpu = -1; /* First load will set up TR */
1542         vcpu->litevm = litevm;
1543         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1544         if (waserror()){
1545                 printk("ERR 1 in %s, %s\n", __func__, current_errstr());
1546                 QUNLOCK(&vcpu->mutex);
1547                 litevm_free_vcpu(vcpu);
1548                 nexterror();
1549         }
1550         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1551         vmcs = alloc_vmcs();
1552         vmcs_clear(vmcs);
1553         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1554         printk("after vmcs_clear\n");
1555         vcpu->vmcs = vmcs;
1556         printk("vcpu %p set vmcs to %p\n", vcpu, vmcs);
1557         vcpu->launched = 0;
1558         printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
1559
1560         __vcpu_load(vcpu);
1561
1562         printk("PAST vcpu_load\n");
1563         if (waserror()) {
1564                 /* we really need to fix waserror() */
1565                 printk("vcpu_setup failed: %s\n", current_errstr());
1566                 QUNLOCK(&vcpu->mutex);
1567                 nexterror();
1568         }
1569
1570         /* need memory for the rmode_tss. I have no idea how this happened
1571          * originally in kvm.
1572          */
1573         /* this sucks. */
1574         QUNLOCK(&vcpu->mutex);
1575         void *v;
1576         struct litevm_memory_region vmr;
1577         vmr.slot = 0;
1578         vmr.flags = 0;
1579         vmr.guest_phys_addr = /* guess. */ 0x1000000;
1580         vmr.memory_size = 0x10000;
1581         if (vm_set_memory_region(litevm, &vmr))
1582                 printk("vm_set_memory_region failed");
1583
1584         printk("set memory region done\n");
1585
1586         if (!init_rmode_tss(litevm)) {
1587                 error("vcpu_setup: init_rmode_tss failed");
1588         }
1589
1590
1591         QLOCK(&vcpu->mutex);
1592         r = litevm_vcpu_setup(vcpu);
1593
1594         vcpu_put(vcpu);
1595
1596         printk("r is %d\n", r);
1597
1598         if (!r) {
1599                 poperror();
1600                 print_func_exit();
1601                 return 0;
1602         }
1603
1604         errstring = "vcpu setup failed";
1605
1606 out_free_vcpus:
1607 out:
1608         print_func_exit();
1609         return r;
1610 }
1611
1612 /*
1613  * Allocate some memory and give it an address in the guest physical address
1614  * space.
1615  *
1616  * Discontiguous memory is allowed, mostly for framebuffers.
1617  */
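/*
 * A minimal usage sketch (hypothetical caller; the slot number and sizes here
 * are made up, not anything this file requires):
 *
 *	struct litevm_memory_region r = {
 *		.slot = 1,
 *		.flags = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size = 64 * PAGE_SIZE,
 *	};
 *	if (vm_set_memory_region(litevm, &r))
 *		printk("vm_set_memory_region failed\n");
 */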
1618 int vm_set_memory_region(struct litevm *litevm,
1619                                                  struct litevm_memory_region *mem)
1620 {
1621         print_func_entry();
1622         ERRSTACK(2);
1623         int r;
1624         gfn_t base_gfn;
1625         unsigned long npages;
1626         unsigned long i;
1627         struct litevm_memory_slot *memslot;
1628         struct litevm_memory_slot old, new;
1629         int memory_config_version;
1630         void *init_data = mem->init_data;
1631         int pass = 1;
1632         printk("%s: slot %d base 0x%llx memory_size %lld\n",
1633                __func__,
1634                mem->slot, mem->guest_phys_addr,
1635                mem->memory_size);
1636         /* should not happen but ... */
1637         if (!litevm)
1638                 error("NULL litevm in %s", __func__);
1639
1640         if (!mem)
1641                 error("NULL mem in %s", __func__);
1642         /* I don't care right now. *
1643         if (litevm->busy)
1644                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1645         */
1646         r = -EINVAL;
1647         /* General sanity checks */
1648         if (mem->memory_size & (PAGE_SIZE - 1))
1649                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1650         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1651                 error("guest_phys_addr 0x%llx is not page-aligned",
1652                           mem->guest_phys_addr);
1653         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1654                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1655         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1656                 error("0x%llx + 0x%llx is < 0x%llx",
1657                       mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1658
1659         memslot = &litevm->memslots[mem->slot];
1660         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1661         npages = mem->memory_size >> PAGE_SHIFT;
1662
1663         if (!npages)
1664                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1665
1666         /* This is actually a very tricky loop (note the raced: label and
1667          * goto below).  The use of error() is a bit dangerous here, so we
1668          * don't use it much.  Consider a rewrite.  It would be nice if
1669          * akaros could do the allocation of a bunch of pages for us.
1670          */
1671 raced:
1672         printk("raced: pass %d\n", pass);
1673         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
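        /* Debugging aid: drop into the kernel monitor before taking the lock. */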
1674         void monitor(void *);
1675         monitor(NULL);
1676         SPLL(&litevm->lock);
1677         printk("locked\n");
1678
1679         if (waserror()) {
1680                 printk("error in %s, %s\n", __func__, current_errstr());
1681                 SPLU(&litevm->lock);
1682                 nexterror();
1683         }
1684
1685         memory_config_version = litevm->memory_config_version;
1686         new = old = *memslot;
1687         printk("memory_config_version %d\n", memory_config_version);
1688
1689         new.base_gfn = base_gfn;
1690         new.npages = npages;
1691         new.flags = mem->flags;
1692
1693         /* Disallow changing a memory slot's size. */
1694         r = -EINVAL;
1695         if (npages && old.npages && npages != old.npages)
1696                 error("npages is %lu, old.npages is %lu, can't change",
1697                       npages, old.npages);
1698
1699         /* Check for overlaps */
1700         r = -EEXIST;
1701         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1702                 struct litevm_memory_slot *s = &litevm->memslots[i];
1703                 printk("Region %lu: base gfn 0x%llx npages %lu\n", i, (unsigned long long)s->base_gfn, (unsigned long)s->npages);
1704                 if (s == memslot)
1705                         continue;
1706                 if (!((base_gfn + npages <= s->base_gfn) ||
1707                           (base_gfn >= s->base_gfn + s->npages)))
1708                         error("Overlap");
1709         }
1710         /*
1711          * Do memory allocations outside lock.  memory_config_version will
1712          * detect any races.
1713          */
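        /* I.e. the pattern below is: snapshot memory_config_version under the
         * lock, drop the lock, do the allocations, retake the lock, and if
         * the version moved in the meantime, free what we built and go back
         * to the raced: label above. */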
1714         SPLU(&litevm->lock);
1715         printk("unlocked\n");
1716         poperror();
1717
1718         /* Deallocate if slot is being removed */
1719         if (!npages)
1720                 new.phys_mem = 0;
1721
1722         /* Free page dirty bitmap if unneeded */
1723         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1724                 new.dirty_bitmap = 0;
1725
1726         r = -ENOMEM;
1727
1728         /* Allocate if a slot is being created */
1729         if (npages && !new.phys_mem) {
1730                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1731
1732                 if (!new.phys_mem)
1733                         goto out_free;
1734
1735                 for (i = 0; i < npages; ++i) {
1736                         int ret;
1737                         ret = kpage_alloc(&new.phys_mem[i]);
1738                         if (ret != ESUCCESS)
1739                                 goto out_free;
1740                         if (init_data) {
1741                                 printk("init data memcpy(%p,%p,4096);\n",
1742                                            page2kva(new.phys_mem[i]), init_data);
1743                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1744                                 init_data += PAGE_SIZE;
1745                         }
1746                 }
1747         }
1748
1749         /* Allocate page dirty bitmap if needed */
1750         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1751                 unsigned dirty_bytes;   //ALIGN(npages, BITS_PER_LONG) / 8;
1752                 dirty_bytes =
1753                         (((npages + BITS_PER_LONG -
1754                            1) / BITS_PER_LONG) * BITS_PER_LONG) / 8;
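                /* I.e. round npages up to a whole number of longs, then
                 * convert bits to bytes: with BITS_PER_LONG == 64, npages of
                 * 100 rounds up to 128 bits, which is 16 bytes of bitmap. */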
1755
1756                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1757                 if (!new.dirty_bitmap) {
1758                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1759                         goto out_free;
1760                 }
1761         }
1762
1763         SPLL(&litevm->lock);
1764         printk("locked\n");
1765         if (memory_config_version != litevm->memory_config_version) {
1766                 SPLU(&litevm->lock);
1767                 printk("unlocked, try again\n");
1768                 litevm_free_physmem_slot(&new, &old);
1769                 goto raced;
1770         }
1771
1772         r = -EAGAIN;
1773         if (litevm->busy) {
1774                 printk("BUSY!\n");
1775                 goto out_unlock;
1776         }
1777
1778         if (mem->slot >= litevm->nmemslots)
1779                 litevm->nmemslots = mem->slot + 1;
1780
1781         *memslot = new;
1782         ++litevm->memory_config_version;
1783
1784         SPLU(&litevm->lock);
1785         printk("unlocked\n");
1786         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1787                 struct litevm_vcpu *vcpu;
1788
1789                 vcpu = vcpu_load(litevm, i);
1790                 if (!vcpu){
1791                         printk("%s: no cpu %d\n", __func__, i);
1792                         continue;
1793                 }
1794                 litevm_mmu_reset_context(vcpu);
1795                 vcpu_put(vcpu);
1796         }
1797
1798         litevm_free_physmem_slot(&old, &new);
1799         print_func_exit();
1800         return 0;
1801
1802 out_unlock:
1803         SPLU(&litevm->lock);
1804         printk("out_unlock\n");
1805 out_free:
1806         printk("out_free\n");
1807         litevm_free_physmem_slot(&new, &old);
1808 out:
1809         printk("vm_set_memory_region: return %d\n", r);
1810         print_func_exit();
1811         return r;
1812 }
1813
1814 #if 0
1815 /*
1816  * Get (and clear) the dirty memory log for a memory slot.
1817  */
1818 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1819                                                                                   struct litevm_dirty_log *log)
1820 {
1821         struct litevm_memory_slot *memslot;
1822         int r, i;
1823         int n;
1824         unsigned long any = 0;
1825
1826         SPLL(&litevm->lock);
1827
1828         /*
1829          * Prevent changes to guest memory configuration even while the lock
1830          * is not taken.
1831          */
1832         ++litevm->busy;
1833         SPLU(&litevm->lock);
1834         r = -EINVAL;
1835         if (log->slot >= LITEVM_MEMORY_SLOTS)
1836                 goto out;
1837
1838         memslot = &litevm->memslots[log->slot];
1839         r = -ENOENT;
1840         if (!memslot->dirty_bitmap)
1841                 goto out;
1842
1843         n = ALIGN(memslot->npages, 8) / 8;
1844
1845         for (i = 0; !any && i < n; ++i)
1846                 any = memslot->dirty_bitmap[i];
1847
1848         r = -EFAULT;
1849         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1850                 goto out;
1851
1852         if (any) {
1853                 SPLL(&litevm->lock);
1854                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1855                 SPLU(&litevm->lock);
1856                 memset(memslot->dirty_bitmap, 0, n);
1857                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1858                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1859
1860                         if (!vcpu)
1861                                 continue;
1862                         flush_guest_tlb(vcpu);
1863                         vcpu_put(vcpu);
1864                 }
1865         }
1866
1867         r = 0;
1868
1869 out:
1870         SPLL(&litevm->lock);
1871         --litevm->busy;
1872         SPLU(&litevm->lock);
1873         return r;
1874 }
1875 #endif
1876
1877 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1878 {
1879         print_func_entry();
1880         int i;
1881
1882         printk("%s: litevm %p gfn 0x%llx\n", __func__, litevm, (unsigned long long)gfn);
1883         for (i = 0; i < litevm->nmemslots; ++i) {
1884                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1885
1886                 if (gfn >= memslot->base_gfn
1887                         && gfn < memslot->base_gfn + memslot->npages) {
1888                         print_func_exit();
1889                         return memslot;
1890                 }
1891         }
1892         print_func_exit();
1893         return 0;
1894 }
1895
1896 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1897 {
1898         print_func_entry();
1899         int i;
1900         struct litevm_memory_slot *memslot = 0;
1901         unsigned long rel_gfn;
1902
1903         for (i = 0; i < litevm->nmemslots; ++i) {
1904                 memslot = &litevm->memslots[i];
1905
1906                 if (gfn >= memslot->base_gfn
1907                         && gfn < memslot->base_gfn + memslot->npages) {
1908
1909                         if (!memslot || !memslot->dirty_bitmap) {
1910                                 print_func_exit();
1911                                 return;
1912                         }
1913
1914                         rel_gfn = gfn - memslot->base_gfn;
1915
1916                         /* avoid RMW */
1917                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1918                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1919                         print_func_exit();
1920                         return;
1921                 }
1922         }
1923         print_func_exit();
1924 }
1925
1926 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1927 {
1928         print_func_entry();
1929         unsigned long rip;
1930         uint32_t interruptibility;
1931
1932         rip = vmcs_readl(GUEST_RIP);
1933         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1934         vmcs_writel(GUEST_RIP, rip);
1935
1936         /*
1937          * We emulated an instruction, so temporary interrupt blocking
1938          * should be removed, if set.
1939          */
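        /* Bits 0 and 1 of the interruptibility state are the "blocking by
         * STI" and "blocking by MOV SS" bits, hence the mask of 3 below. */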
1940         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1941         if (interruptibility & 3)
1942                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3);
1943         print_func_exit();
1944 }
1945
1946 static int emulator_read_std(unsigned long addr,
1947                                                          unsigned long *val,
1948                                                          unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1949 {
1950         print_func_entry();
1951         struct litevm_vcpu *vcpu = ctxt->vcpu;
1952         void *data = val;
1953
1954         while (bytes) {
1955                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1956                 unsigned offset = addr & (PAGE_SIZE - 1);
1957                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
1958                         bytes : (unsigned)PAGE_SIZE - offset;
1959                 unsigned long pfn;
1960                 struct litevm_memory_slot *memslot;
1961                 void *page;
1962
1963                 if (gpa == UNMAPPED_GVA) {
1964                         print_func_exit();
1965                         return X86EMUL_PROPAGATE_FAULT;
1966                 }
1967                 pfn = gpa >> PAGE_SHIFT;
1968                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1969                 if (!memslot) {
1970                         print_func_exit();
1971                         return X86EMUL_UNHANDLEABLE;
1972                 }
1973                 page = page2kva(gfn_to_page(memslot, pfn));
1974
1975                 memcpy(data, page + offset, tocopy);
1976
1977                 bytes -= tocopy;
1978                 data += tocopy;
1979                 addr += tocopy;
1980         }
1981
1982         print_func_exit();
1983         return X86EMUL_CONTINUE;
1984 }
1985
1986 static int emulator_write_std(unsigned long addr,
1987                                                           unsigned long val,
1988                                                           unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1989 {
1990         print_func_entry();
1991         printk("emulator_write_std: addr %lx n %d\n", addr, bytes);
1992         print_func_exit();
1993         return X86EMUL_UNHANDLEABLE;
1994 }
1995
1996 static int emulator_read_emulated(unsigned long addr,
1997                                                                   unsigned long *val,
1998                                                                   unsigned int bytes,
1999                                                                   struct x86_emulate_ctxt *ctxt)
2000 {
2001         print_func_entry();
2002         struct litevm_vcpu *vcpu = ctxt->vcpu;
2003
2004         if (vcpu->mmio_read_completed) {
2005                 memcpy(val, vcpu->mmio_data, bytes);
2006                 vcpu->mmio_read_completed = 0;
2007                 print_func_exit();
2008                 return X86EMUL_CONTINUE;
2009         } else if (emulator_read_std(addr, val, bytes, ctxt)
2010                            == X86EMUL_CONTINUE) {
2011                 print_func_exit();
2012                 return X86EMUL_CONTINUE;
2013         } else {
2014                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2015                 if (gpa == UNMAPPED_GVA) {
2016                         print_func_exit();
2017                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
2018                 }
2019                 vcpu->mmio_needed = 1;
2020                 vcpu->mmio_phys_addr = gpa;
2021                 vcpu->mmio_size = bytes;
2022                 vcpu->mmio_is_write = 0;
2023
2024                 print_func_exit();
2025                 return X86EMUL_UNHANDLEABLE;
2026         }
2027 }
2028
2029 static int emulator_write_emulated(unsigned long addr,
2030                                                                    unsigned long val,
2031                                                                    unsigned int bytes,
2032                                                                    struct x86_emulate_ctxt *ctxt)
2033 {
2034         print_func_entry();
2035         struct litevm_vcpu *vcpu = ctxt->vcpu;
2036         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2037
2038         if (gpa == UNMAPPED_GVA) {
2039                 print_func_exit();
2040                 return X86EMUL_PROPAGATE_FAULT;
2041         }
2042
2043         vcpu->mmio_needed = 1;
2044         vcpu->mmio_phys_addr = gpa;
2045         vcpu->mmio_size = bytes;
2046         vcpu->mmio_is_write = 1;
2047         memcpy(vcpu->mmio_data, &val, bytes);
2048
2049         print_func_exit();
2050         return X86EMUL_CONTINUE;
2051 }
2052
2053 static int emulator_cmpxchg_emulated(unsigned long addr,
2054                                                                          unsigned long old,
2055                                                                          unsigned long new,
2056                                                                          unsigned int bytes,
2057                                                                          struct x86_emulate_ctxt *ctxt)
2058 {
2059         print_func_entry();
2060         static int reported;
2061
2062         if (!reported) {
2063                 reported = 1;
2064                 printk("litevm: emulating exchange as write\n");
2065         }
2066         print_func_exit();
2067         return emulator_write_emulated(addr, new, bytes, ctxt);
2068 }
2069
2070 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2071 {
2072         print_func_entry();
2073         static int reported;
2074         uint8_t opcodes[4];
2075         unsigned long rip = vmcs_readl(GUEST_RIP);
2076         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2077
2078         if (reported) {
2079                 print_func_exit();
2080                 return;
2081         }
2082
2083         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2084
2085         printk("emulation failed but !mmio_needed?"
2086                    " rip %lx %02x %02x %02x %02x\n",
2087                    rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2088         reported = 1;
2089         print_func_exit();
2090 }
2091
2092 struct x86_emulate_ops emulate_ops = {
2093         .read_std = emulator_read_std,
2094         .write_std = emulator_write_std,
2095         .read_emulated = emulator_read_emulated,
2096         .write_emulated = emulator_write_emulated,
2097         .cmpxchg_emulated = emulator_cmpxchg_emulated,
2098 };
2099
2100 enum emulation_result {
2101         EMULATE_DONE,                           /* no further processing */
2102         EMULATE_DO_MMIO,                        /* litevm_run filled with mmio request */
2103         EMULATE_FAIL,                           /* can't emulate this instruction */
2104 };
2105
2106 static int emulate_instruction(struct litevm_vcpu *vcpu,
2107                                                            struct litevm_run *run,
2108                                                            unsigned long cr2, uint16_t error_code)
2109 {
2110         print_func_entry();
2111         struct x86_emulate_ctxt emulate_ctxt;
2112         int r;
2113         uint32_t cs_ar;
2114
2115         vcpu_load_rsp_rip(vcpu);
2116
2117         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2118
2119         emulate_ctxt.vcpu = vcpu;
2120         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2121         emulate_ctxt.cr2 = cr2;
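        /* Pick the emulation mode from guest state: EFLAGS.VM means
         * virtual-8086 (real mode), otherwise the CS access-rights L bit
         * selects 64-bit mode and the D/B bit picks 32- vs 16-bit. */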
2122         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2123                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2124                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2125                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2126
2127         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2128                 emulate_ctxt.cs_base = 0;
2129                 emulate_ctxt.ds_base = 0;
2130                 emulate_ctxt.es_base = 0;
2131                 emulate_ctxt.ss_base = 0;
2132                 emulate_ctxt.gs_base = 0;
2133                 emulate_ctxt.fs_base = 0;
2134         } else {
2135                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2136                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2137                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2138                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2139                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2140                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2141         }
2142
2143         vcpu->mmio_is_write = 0;
2144         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2145
2146         if ((r || vcpu->mmio_is_write) && run) {
2147                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2148                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2149                 run->mmio.len = vcpu->mmio_size;
2150                 run->mmio.is_write = vcpu->mmio_is_write;
2151         }
2152
2153         if (r) {
2154                 if (!vcpu->mmio_needed) {
2155                         report_emulation_failure(&emulate_ctxt);
2156                         print_func_exit();
2157                         return EMULATE_FAIL;
2158                 }
2159                 print_func_exit();
2160                 return EMULATE_DO_MMIO;
2161         }
2162
2163         vcpu_put_rsp_rip(vcpu);
2164         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2165
2166         if (vcpu->mmio_is_write) {
2167                 print_func_exit();
2168                 return EMULATE_DO_MMIO;
2169         }
2170
2171         print_func_exit();
2172         return EMULATE_DONE;
2173 }
2174
2175 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2176 {
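        /* Keep the upper 32 bits of the current CR value and replace the low
         * 32 bits with the new value, e.g. (made-up numbers)
         * mk_cr_64(0x1234567880050033ULL, 0x80000011) == 0x1234567880000011. */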
2177         print_func_entry();
2178         print_func_exit();
2179         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2180 }
2181
2182 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2183 {
2184         print_func_entry();
2185         vmcs_writel(GUEST_GDTR_BASE, base);
2186         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2187         print_func_exit();
2188 }
2189
2190 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2191 {
2192         print_func_entry();
2193         vmcs_writel(GUEST_IDTR_BASE, base);
2194         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2195         print_func_exit();
2196 }
2197
2198 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2199                                    unsigned long *rflags)
2200 {
2201         print_func_entry();
2202         lmsw(vcpu, msw);
2203         *rflags = vmcs_readl(GUEST_RFLAGS);
2204         print_func_exit();
2205 }
2206
2207 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2208 {
2209         print_func_entry();
2210         switch (cr) {
2211                 case 0:
2212                         print_func_exit();
2213                         return guest_cr0();
2214                 case 2:
2215                         print_func_exit();
2216                         return vcpu->cr2;
2217                 case 3:
2218                         print_func_exit();
2219                         return vcpu->cr3;
2220                 case 4:
2221                         print_func_exit();
2222                         return guest_cr4();
2223                 default:
2224                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2225                         print_func_exit();
2226                         return 0;
2227         }
2228 }
2229
2230 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2231                                          unsigned long *rflags)
2232 {
2233         print_func_entry();
2234         switch (cr) {
2235                 case 0:
2236                         set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2237                         *rflags = vmcs_readl(GUEST_RFLAGS);
2238                         break;
2239                 case 2:
2240                         vcpu->cr2 = val;
2241                         break;
2242                 case 3:
2243                         set_cr3(vcpu, val);
2244                         break;
2245                 case 4:
2246                         set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2247                         break;
2248                 default:
2249                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2250         }
2251         print_func_exit();
2252 }
2253
2254 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2255                                                                   int vec, uint32_t err_code)
2256 {
2257         print_func_entry();
2258         if (!vcpu->rmode.active) {
2259                 print_func_exit();
2260                 return 0;
2261         }
2262
2263         if (vec == GP_VECTOR && err_code == 0)
2264                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2265                         print_func_exit();
2266                         return 1;
2267                 }
2268         print_func_exit();
2269         return 0;
2270 }
2271
2272 static int handle_exception(struct litevm_vcpu *vcpu,
2273                                                         struct litevm_run *litevm_run)
2274 {
2275         print_func_entry();
2276         uint32_t intr_info, error_code;
2277         unsigned long cr2, rip;
2278         uint32_t vect_info;
2279         enum emulation_result er;
2280
2281         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2282         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2283         printk("vect_info %x intr_info %x\n", vect_info, intr_info);
2284         printk("page fault? %d\n", is_page_fault(intr_info));
2285
2286         if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) {
2287                 printk("%s: unexpected, vectoring info 0x%x "
2288                            "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2289         }
2290
2291         if (is_external_interrupt(vect_info)) {
2292                 printk("external interrupt\n");
2293                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2294                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_pending), irq);
2295                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_summary),
2296                                                            irq / BITS_PER_LONG);
2297         }
2298
2299         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) {  /* nmi */
2300                 printk("nmi\n");
2301                 asm("int $2");
2302                 print_func_exit();
2303                 return 1;
2304         }
2305         error_code = 0;
2306         rip = vmcs_readl(GUEST_RIP);
2307         printk("GUEST_RIP %lx\n", rip);
2308         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2309                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2310         if (is_page_fault(intr_info)) {
2311                 printk("PAGE FAULT!\n");
2312                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2313
2314                 SPLL(&vcpu->litevm->lock);
2315                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2316                         SPLU(&vcpu->litevm->lock);
2317                         print_func_exit();
2318                         return 1;
2319                 }
2320
2321                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2322                 SPLU(&vcpu->litevm->lock);
2323
2324                 switch (er) {
2325                         case EMULATE_DONE:
2326                                 print_func_exit();
2327                                 return 1;
2328                         case EMULATE_DO_MMIO:
2329                                 ++litevm_stat.mmio_exits;
2330                                 litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2331                                 print_func_exit();
2332                                 return 0;
2333                         case EMULATE_FAIL:
2334                                 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2335                                 break;
2336                         default:
2337                                 assert(0);
2338                 }
2339         }
2340
2341         if (vcpu->rmode.active &&
2342                 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2343                                                            error_code)) {
2344                 printk("RMODE EXCEPTION might have been handled\n");
2345                 print_func_exit();
2346                 return 1;
2347         }
2348
2349         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
2350                 (INTR_TYPE_EXCEPTION | 1)) {
2351                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2352                 print_func_exit();
2353                 return 0;
2354         }
2355         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2356         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2357         litevm_run->ex.error_code = error_code;
2358         print_func_exit();
2359         return 0;
2360 }
2361
2362 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2363                                                                          struct litevm_run *litevm_run)
2364 {
2365         print_func_entry();
2366         ++litevm_stat.irq_exits;
2367         print_func_exit();
2368         return 1;
2369 }
2370
2371 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t * count)
2372 {
2373         print_func_entry();
2374         uint64_t inst;
2375         gva_t rip;
2376         int countr_size;
2377         int i, n;
2378
2379         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2380                 countr_size = 2;
2381         } else {
2382                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2383
2384                 countr_size = (cs_ar & AR_L_MASK) ? 8 : (cs_ar & AR_DB_MASK) ? 4 : 2;
2385         }
2386
2387         rip = vmcs_readl(GUEST_RIP);
2388         if (countr_size != 8)
2389                 rip += vmcs_readl(GUEST_CS_BASE);
2390
2391         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2392
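        /* Walk the instruction bytes just read: LOCK (0xf0), REPNE/REP
         * (0xf2/0xf3), segment overrides (0x2e/0x36/0x3e/0x26/0x64/0x65) and
         * the operand-size prefix (0x66) are skipped; an address-size prefix
         * (0x67) toggles the count register width and then, like any
         * non-prefix byte, ends the scan (note the fall-through to default).
         * The repeat count is then RCX masked down to that width. */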
2393         for (i = 0; i < n; i++) {
2394                 switch (((uint8_t *) & inst)[i]) {
2395                         case 0xf0:
2396                         case 0xf2:
2397                         case 0xf3:
2398                         case 0x2e:
2399                         case 0x36:
2400                         case 0x3e:
2401                         case 0x26:
2402                         case 0x64:
2403                         case 0x65:
2404                         case 0x66:
2405                                 break;
2406                         case 0x67:
2407                                 countr_size = (countr_size == 2) ? 4 : (countr_size >> 1);
2408                         default:
2409                                 goto done;
2410                 }
2411         }
2412         print_func_exit();
2413         return 0;
2414 done:
2415         countr_size *= 8;
2416         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2417         print_func_exit();
2418         return 1;
2419 }
2420
2421 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2422 {
2423         print_func_entry();
2424         uint64_t exit_qualification;
2425
2426         ++litevm_stat.io_exits;
2427         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
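        /* Exit qualification for I/O instructions: bits 2:0 are the access
         * size minus one, bit 3 the direction (1 = IN), bit 4 string
         * instruction, bit 5 REP prefix, bits 31:16 the port number.  That
         * is what gets unpacked below. */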
2428         litevm_run->exit_reason = LITEVM_EXIT_IO;
2429         if (exit_qualification & 8)
2430                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2431         else
2432                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2433         litevm_run->io.size = (exit_qualification & 7) + 1;
2434         litevm_run->io.string = (exit_qualification & 16) != 0;
2435         litevm_run->io.string_down
2436                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2437         litevm_run->io.rep = (exit_qualification & 32) != 0;
2438         litevm_run->io.port = exit_qualification >> 16;
2439         if (litevm_run->io.string) {
2440                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2441                         print_func_exit();
2442                         return 1;
2443                 }
2444                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2445         } else
2446                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX];       /* rax */
2447         print_func_exit();
2448         return 0;
2449 }
2450
2451 static int handle_invlpg(struct litevm_vcpu *vcpu,
2452                                                  struct litevm_run *litevm_run)
2453 {
2454         print_func_entry();
2455         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2456         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2457         SPLL(&vcpu->litevm->lock);
2458         vcpu->mmu.inval_page(vcpu, address);
2459         SPLU(&vcpu->litevm->lock);
2460         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2461         print_func_exit();
2462         return 1;
2463 }
2464
2465 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2466 {
2467         print_func_entry();
2468         uint64_t exit_qualification;
2469         int cr;
2470         int reg;
2471
2472 #ifdef LITEVM_DEBUG
2473         if (guest_cpl() != 0) {
2474                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2475                 inject_gp(vcpu);
2476                 print_func_exit();
2477                 return 1;
2478         }
2479 #endif
2480
2481         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
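        /* Exit qualification for CR accesses: bits 3:0 are the CR number,
         * bits 5:4 the access type (0 = mov to cr, 1 = mov from cr, 3 = lmsw)
         * and bits 11:8 the general-purpose register involved. */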
2482         cr = exit_qualification & 15;
2483         reg = (exit_qualification >> 8) & 15;
2484         switch ((exit_qualification >> 4) & 3) {
2485                 case 0: /* mov to cr */
2486                         switch (cr) {
2487                                 case 0:
2488                                         vcpu_load_rsp_rip(vcpu);
2489                                         set_cr0(vcpu, vcpu->regs[reg]);
2490                                         skip_emulated_instruction(vcpu);
2491                                         print_func_exit();
2492                                         return 1;
2493                                 case 3:
2494                                         vcpu_load_rsp_rip(vcpu);
2495                                         set_cr3(vcpu, vcpu->regs[reg]);
2496                                         skip_emulated_instruction(vcpu);
2497                                         print_func_exit();
2498                                         return 1;
2499                                 case 4:
2500                                         vcpu_load_rsp_rip(vcpu);
2501                                         set_cr4(vcpu, vcpu->regs[reg]);
2502                                         skip_emulated_instruction(vcpu);
2503                                         print_func_exit();
2504                                         return 1;
2505                                 case 8:
2506                                         vcpu_load_rsp_rip(vcpu);
2507                                         set_cr8(vcpu, vcpu->regs[reg]);
2508                                         skip_emulated_instruction(vcpu);
2509                                         print_func_exit();
2510                                         return 1;
2511                         };
2512                         break;
2513                 case 1: /*mov from cr */
2514                         switch (cr) {
2515                                 case 3:
2516                                         vcpu_load_rsp_rip(vcpu);
2517                                         vcpu->regs[reg] = vcpu->cr3;
2518                                         vcpu_put_rsp_rip(vcpu);
2519                                         skip_emulated_instruction(vcpu);
2520                                         print_func_exit();
2521                                         return 1;
2522                                 case 8:
2523                                         printd("handle_cr: read CR8 " "cpu erratum AA15\n");
2524                                         vcpu_load_rsp_rip(vcpu);
2525                                         vcpu->regs[reg] = vcpu->cr8;
2526                                         vcpu_put_rsp_rip(vcpu);
2527                                         skip_emulated_instruction(vcpu);
2528                                         print_func_exit();
2529                                         return 1;
2530                         }
2531                         break;
2532                 case 3: /* lmsw */
2533                         lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2534
2535                         skip_emulated_instruction(vcpu);
2536                         print_func_exit();
2537                         return 1;
2538                 default:
2539                         break;
2540         }
2541         litevm_run->exit_reason = 0;
2542         printk("litevm: unhandled control register: op %d cr %d\n",
2543                    (int)(exit_qualification >> 4) & 3, cr);
2544         print_func_exit();
2545         return 0;
2546 }
2547
2548 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2549 {
2550         print_func_entry();
2551         uint64_t exit_qualification;
2552         unsigned long val;
2553         int dr, reg;
2554
2555         /*
2556          * FIXME: this code assumes the host is debugging the guest.
2557          *        need to deal with guest debugging itself too.
2558          */
2559         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2560         dr = exit_qualification & 7;
2561         reg = (exit_qualification >> 8) & 15;
2562         vcpu_load_rsp_rip(vcpu);
2563         if (exit_qualification & 16) {
2564                 /* mov from dr */
2565                 switch (dr) {
2566                         case 6:
2567                                 val = 0xffff0ff0;
2568                                 break;
2569                         case 7:
2570                                 val = 0x400;
2571                                 break;
2572                         default:
2573                                 val = 0;
2574                 }
2575                 vcpu->regs[reg] = val;
2576         } else {
2577                 /* mov to dr */
2578         }
2579         vcpu_put_rsp_rip(vcpu);
2580         skip_emulated_instruction(vcpu);
2581         print_func_exit();
2582         return 1;
2583 }
2584
2585 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2586 {
2587         print_func_entry();
2588         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2589         print_func_exit();
2590         return 0;
2591 }
2592
2593 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2594 {
2595         print_func_entry();
2596         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2597         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2598         uint64_t data;
2599
2600         if (guest_cpl() != 0) {
2601                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2602                 inject_gp(vcpu);
2603                 print_func_exit();
2604                 return 1;
2605         }
2606
2607         switch (ecx) {
2608                 case MSR_FS_BASE:
2609                         data = vmcs_readl(GUEST_FS_BASE);
2610                         break;
2611                 case MSR_GS_BASE:
2612                         data = vmcs_readl(GUEST_GS_BASE);
2613                         break;
2614                 case MSR_IA32_SYSENTER_CS:
2615                         data = vmcs_read32(GUEST_SYSENTER_CS);
2616                         break;
2617                 case MSR_IA32_SYSENTER_EIP:
2618                         data = vmcs_read32(GUEST_SYSENTER_EIP);
2619                         break;
2620                 case MSR_IA32_SYSENTER_ESP:
2621                         data = vmcs_read32(GUEST_SYSENTER_ESP);
2622                         break;
2623                 case MSR_IA32_MC0_CTL:
2624                 case MSR_IA32_MCG_STATUS:
2625                 case MSR_IA32_MCG_CAP:
2626                 case MSR_IA32_MC0_MISC:
2627                 case MSR_IA32_MC0_MISC + 4:
2628                 case MSR_IA32_MC0_MISC + 8:
2629                 case MSR_IA32_MC0_MISC + 12:
2630                 case MSR_IA32_MC0_MISC + 16:
2631                 case MSR_IA32_UCODE_REV:
2632                         /* MTRR registers */
2633                 case 0xfe:
2634                 case 0x200 ... 0x2ff:
2635                         data = 0;
2636                         break;
2637                 case MSR_IA32_APICBASE:
2638                         data = vcpu->apic_base;
2639                         break;
2640                 default:
2641                         if (msr) {
2642                                 data = msr->data;
2643                                 break;
2644                         }
2645                         printk("litevm: unhandled rdmsr: %x\n", ecx);
2646                         inject_gp(vcpu);
2647                         print_func_exit();
2648                         return 1;
2649         }
2650
2651         /* FIXME: handling of bits 32:63 of rax, rdx */
2652         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2653         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2654         skip_emulated_instruction(vcpu);
2655         print_func_exit();
2656         return 1;
2657 }
2658
2659 #ifdef __x86_64__
2660
2661 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2662 {
2663         print_func_entry();
2664         struct vmx_msr_entry *msr;
2665
2666         if (efer & EFER_RESERVED_BITS) {
2667                 printd("set_efer: 0x%llx #GP, reserved bits\n", efer);
2668                 inject_gp(vcpu);
2669                 print_func_exit();
2670                 return;
2671         }
2672
2673         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2674                 printd("set_efer: #GP, change LME while paging\n");
2675                 inject_gp(vcpu);
2676                 print_func_exit();
2677                 return;
2678         }
2679
2680         efer &= ~EFER_LMA;
2681         efer |= vcpu->shadow_efer & EFER_LMA;
2682
2683         vcpu->shadow_efer = efer;
2684
2685         msr = find_msr_entry(vcpu, MSR_EFER);
2686
2687         if (!(efer & EFER_LMA))
2688                 efer &= ~EFER_LME;
2689         msr->data = efer;
2690         skip_emulated_instruction(vcpu);
2691         print_func_exit();
2692 }
2693
2694 #endif
2695
2696 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2697
2698 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2699 {
2700         print_func_entry();
2701         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2702         struct vmx_msr_entry *msr;
2703         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2704                 | ((uint64_t) (vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2705
2706         if (guest_cpl() != 0) {
2707                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2708                 inject_gp(vcpu);
2709                 print_func_exit();
2710                 return 1;
2711         }
2712
2713         switch (ecx) {
2714                 case MSR_FS_BASE:
2715                         vmcs_writel(GUEST_FS_BASE, data);
2716                         break;
2717                 case MSR_GS_BASE:
2718                         vmcs_writel(GUEST_GS_BASE, data);
2719                         break;
2720                 case MSR_IA32_SYSENTER_CS:
2721                         vmcs_write32(GUEST_SYSENTER_CS, data);
2722                         break;
2723                 case MSR_IA32_SYSENTER_EIP:
2724                         vmcs_write32(GUEST_SYSENTER_EIP, data);
2725                         break;
2726                 case MSR_IA32_SYSENTER_ESP:
2727                         vmcs_write32(GUEST_SYSENTER_ESP, data);
2728                         break;
2729                 case MSR_EFER:
2730                         set_efer(vcpu, data);
2731                         print_func_exit();
2732                         return 1;
2733                 case MSR_IA32_MC0_STATUS:
2734                         printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data);
2735                         break;
2736                 case MSR_IA32_TIME_STAMP_COUNTER:{
2737                                 uint64_t tsc;
2738
2739                                 tsc = read_tsc();
2740                                 vmcs_write64(TSC_OFFSET, data - tsc);
2741                                 break;
2742                         }
2743                 case MSR_IA32_UCODE_REV:
2744                 case MSR_IA32_UCODE_WRITE:
2745                 case 0x200 ... 0x2ff:   /* MTRRs */
2746                         break;
2747                 case MSR_IA32_APICBASE:
2748                         vcpu->apic_base = data;
2749                         break;
2750                 default:
2751                         msr = find_msr_entry(vcpu, ecx);
2752                         if (msr) {
2753                                 msr->data = data;
2754                                 break;
2755                         }
2756                         printk("litevm: unhandled wrmsr: %x\n", ecx);
2757                         inject_gp(vcpu);
2758                         print_func_exit();
2759                         return 1;
2760         }
2761         skip_emulated_instruction(vcpu);
2762         print_func_exit();
2763         return 1;
2764 }
2765
2766 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2767                                                                    struct litevm_run *litevm_run)
2768 {
2769         print_func_entry();
2770         /* Turn off interrupt window reporting. */
2771         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2772                                  vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2773                                  & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2774         print_func_exit();
2775         return 1;
2776 }
2777
2778 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2779 {
2780         print_func_entry();
2781         skip_emulated_instruction(vcpu);
2782         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2783                 print_func_exit();
2784                 return 1;
2785         }
2786
2787         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2788         print_func_exit();
2789         return 0;
2790 }
2791
2792 /*
2793  * The exit handlers return 1 if the exit was handled fully and guest execution
2794  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2795  * to be done to userspace and return 0.
2796  */
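/*
 * A minimal sketch of that contract (handle_example is hypothetical and not
 * wired into the table below):
 *
 *	static int handle_example(struct litevm_vcpu *vcpu,
 *				  struct litevm_run *litevm_run)
 *	{
 *		if (handled_entirely_in_the_kernel)	// placeholder condition
 *			return 1;			// guest may resume
 *		litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
 *		return 0;				// userspace finishes it
 *	}
 */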
2797 static int (*litevm_vmx_exit_handlers[]) (struct litevm_vcpu * vcpu,
2798                                                                                   struct litevm_run * litevm_run) = {
2799 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2800                 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2801                 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2802                 [EXIT_REASON_INVLPG] = handle_invlpg,
2803                 [EXIT_REASON_CR_ACCESS] = handle_cr,
2804                 [EXIT_REASON_DR_ACCESS] = handle_dr,
2805                 [EXIT_REASON_CPUID] = handle_cpuid,
2806                 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2807                 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2808                 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2809                 [EXIT_REASON_HLT] = handle_halt,};
2810
2811 static const int litevm_vmx_max_exit_handlers =
2812         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2813
2814 /*
2815  * The guest has exited.  See if we can fix it or if we need userspace
2816  * assistance.
2817  */
2818 static int litevm_handle_exit(struct litevm_run *litevm_run,
2819                                                           struct litevm_vcpu *vcpu)
2820 {
2821         print_func_entry();
2822         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2823         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2824
2825         printk("vectoring_info %08x exit_reason %x\n", vectoring_info, exit_reason);
2826         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2827                 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2828                 printk("%s: unexpected, valid vectoring info and "
2829                            "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2830         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2831         if (exit_reason < litevm_vmx_max_exit_handlers
2832                 && litevm_vmx_exit_handlers[exit_reason]) {
2833                 printk("reason is KNOWN\n");
2834                 print_func_exit();
2835                 return litevm_vmx_exit_handlers[exit_reason] (vcpu, litevm_run);
2836         } else {
2837                 printk("reason is UNKNOWN\n");
2838                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2839                 litevm_run->hw.hardware_exit_reason = exit_reason;
2840         }
2841         print_func_exit();
2842         return 0;
2843 }
2844
2845 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2846 {
2847         print_func_entry();
2848         uint16_t ent[2];
2849         uint16_t cs;
2850         uint16_t ip;
2851         unsigned long flags;
2852         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2853         uint16_t sp = vmcs_readl(GUEST_RSP);
2854         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
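        /* Real-mode injection mimics what the CPU does for INT n: push FLAGS,
         * CS and IP on the guest stack, clear IF/TF/AC, and load CS:IP from
         * the 4-byte IVT entry at linear address irq * 4. */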
2855
2856         if (sp > ss_limit || ((sp - 6) > sp)) {
2857                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2858                                         __FUNCTION__,
2859                                         vmcs_readl(GUEST_RSP),
2860                                         vmcs_readl(GUEST_SS_BASE), vmcs_read32(GUEST_SS_LIMIT));
2861                 print_func_exit();
2862                 return;
2863         }
2864
2865         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2866                 sizeof(ent)) {
2867                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2868                 print_func_exit();
2869                 return;
2870         }
2871
2872         flags = vmcs_readl(GUEST_RFLAGS);
2873         cs = vmcs_readl(GUEST_CS_BASE) >> 4;
2874         ip = vmcs_readl(GUEST_RIP);
2875
2876         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2877                 litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2878                 litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2879                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2880                 print_func_exit();
2881                 return;
2882         }
2883
2884         vmcs_writel(GUEST_RFLAGS, flags &
2885                                 ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2886         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2887         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2888         vmcs_writel(GUEST_RIP, ent[0]);
2889         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2890         print_func_exit();
2891 }
2892
2893 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2894 {
2895         print_func_entry();
2896         int word_index = __ffs(vcpu->irq_summary);
2897         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2898         int irq = word_index * BITS_PER_LONG + bit_index;
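        /* irq_summary has one bit per word of irq_pending; the lowest set
         * summary bit picks the word and the lowest set bit in that word
         * picks the vector, so irq is the lowest pending interrupt number. */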
2899
2900         /* don't have clear_bit and I'm not sure the akaros
2901          * bitops are really going to work.
2902          */
2903         vcpu->irq_pending[word_index] &= ~(1 << bit_index);
2904         if (!vcpu->irq_pending[word_index])
2905                 vcpu->irq_summary &= ~(1 << word_index);
2906
2907         if (vcpu->rmode.active) {
2908                 inject_rmode_irq(vcpu, irq);
2909                 print_func_exit();
2910                 return;
2911         }
2912         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2913                                  irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2914         print_func_exit();
2915 }
2916
2917 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2918 {
2919         print_func_entry();
2920         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2921                 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2922                 /*
2923                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2924                  */
2925                 litevm_do_inject_irq(vcpu);
2926         else
2927                 /*
2928                  * Interrupts blocked.  Wait for unblock.
2929                  */
2930                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2931                                          vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2932                                          | CPU_BASED_VIRTUAL_INTR_PENDING);
2933         print_func_exit();
2934 }
2935
2936 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2937 {
2938         print_func_entry();
2939         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2940
2941 #warning "no debugging guests yet"
2942         assert(0);
2943 /*
2944         set_debugreg(dbg->bp[0], 0);
2945         set_debugreg(dbg->bp[1], 1);
2946         set_debugreg(dbg->bp[2], 2);
2947         set_debugreg(dbg->bp[3], 3);
2948 */
2949         if (dbg->singlestep) {
2950                 unsigned long flags;
2951
2952                 flags = vmcs_readl(GUEST_RFLAGS);
2953                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2954                 vmcs_writel(GUEST_RFLAGS, flags);
2955         }
2956         print_func_exit();
2957 }
2958
2959 static void load_msrs(struct vmx_msr_entry *e, int n)
2960 {
2961         print_func_entry();
2962         int i;
2963
2964         if (!e) {
2965                 printk("LOAD MSR WITH NULL POINTER?");
2966                 error("LOAD MSR WITH NULL POINTER?");
2967         }
2968         for (i = 0; i < n; ++i) {
2969                 printk("Load MSR 0x%x with 0x%llx\n", e[i].index, e[i].data);
2970                 write_msr(e[i].index, e[i].data);
2971                 printk("Done\n");
2972         }
2973         print_func_exit();
2974 }
2975
2976 static void save_msrs(struct vmx_msr_entry *e, int n)
2977 {
2978         print_func_entry();
2979         int i;
2980
2981         for (i = 0; i < n; ++i)
2982                 e[i].data = read_msr(e[i].index);
2983         print_func_exit();
2984 }
2985
2986 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2987 {
2988         print_func_entry();
2989         struct litevm_vcpu *vcpu;
2990         uint8_t fail;
2991         uint16_t fs_sel, gs_sel, ldt_sel;
2992         int fs_gs_ldt_reload_needed;
2993
2994         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2995                 error("vcpu is %d but must be in the range 0..%d\n",
2996                       litevm_run->vcpu, LITEVM_MAX_VCPUS - 1);
2997
2998         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2999         if (!vcpu)
3000                 error("vcpu_load failed");
3001         printk("Loaded\n");
3002
3003         if (litevm_run->emulated) {
3004                 skip_emulated_instruction(vcpu);
3005                 litevm_run->emulated = 0;
3006         }
3007         printk("Emulated\n");
3008
3009         if (litevm_run->mmio_completed) {
3010                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
3011                 vcpu->mmio_read_completed = 1;
3012         }
3013         printk("mmio completed\n");
3014
3015         vcpu->mmio_needed = 0;
3016
3017 again:
3018         /*
3019          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
3020          * allow segment selectors with cpl > 0 or ti == 1.
3021          */
3022         fs_sel = read_fs();
3023         printk("fs_sel %x\n", fs_sel);
3024         gs_sel = read_gs();
3025         printk("gs_sel %x\n", gs_sel);
3026         ldt_sel = read_ldt();
3027         printk("ldt_sel %x\n", ldt_sel);
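        /*
         * If fs or gs carries a non-zero RPL/TI, or an LDT is in use, the VMCS
         * host-state fields cannot hold them; store zeros instead and reload
         * the real selectors by hand after the exit.
         */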
3028         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
3029         if (!fs_gs_ldt_reload_needed) {
3030                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
3031                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
3032         } else {
3033                 vmcs_write16(HOST_FS_SELECTOR, 0);
3034                 vmcs_write16(HOST_GS_SELECTOR, 0);
3035         }
3036         printk("reloaded fs and gs\n");
3037
3038 #ifdef __x86_64__
3039         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
3040         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
3041         printk("Set FS_BASE and GS_BASE\n");
3042 #endif
3043
3044         printk("skipping IRQs for now\n");
3045         if (0)
3046         if (vcpu->irq_summary &&
3047                 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
3048                 litevm_try_inject_irq(vcpu);
3049
3050         printk("no debugging for now\n");
3051         if (0)
3052         if (vcpu->guest_debug.enabled)
3053                 litevm_guest_debug_pre(vcpu);
3054
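        /*
         * Hand the FPU over to the guest, stash the host MSRs, and load the
         * guest copies of the MSRs that have to be switched by hand.
         */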
3055         fx_save(vcpu->host_fx_image);
3056         fx_restore(vcpu->guest_fx_image);
3057
3058         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
3059         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3060
3061         printk("GO FOR IT!\n");
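        /*
         * The entry/exit trampoline: save host registers on the stack, record
         * the stack pointer in HOST_RSP, load the guest GPRs from vcpu->regs,
         * then VMLAUNCH on the first entry and VMRESUME afterwards.  Exits
         * come back through litevm_vmx_return, which saves the guest GPRs and
         * CR2 and restores the host registers.  'fail' is set if the entry
         * itself failed (the VMX instruction set CF or ZF).
         */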
3062         asm(
3063                    /* Store host registers */
3064                    "pushf \n\t"
3065 #ifdef __x86_64__
3066                    "push %%rax; push %%rbx; push %%rdx;"
3067                    "push %%rsi; push %%rdi; push %%rbp;"
3068                    "push %%r8;  push %%r9;  push %%r10; push %%r11;"
3069                    "push %%r12; push %%r13; push %%r14; push %%r15;"
3070                    "push %%rcx \n\t" "vmwrite %%rsp, %2 \n\t"
3071 #else
3072                    "pusha; push %%ecx \n\t" "vmwrite %%esp, %2 \n\t"
3073 #endif
3074                    /* Check if vmlaunch or vmresume is needed */
3075                    "cmp $0, %1 \n\t"
3076                    /* Load guest registers.  Don't clobber flags. */
3077 #ifdef __x86_64__
3078                    "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%3), %%rax \n\t" "mov %c[rbx](%3), %%rbx \n\t" "mov %c[rdx](%3), %%rdx \n\t" "mov %c[rsi](%3), %%rsi \n\t" "mov %c[rdi](%3), %%rdi \n\t" "mov %c[rbp](%3), %%rbp \n\t" "mov %c[r8](%3),  %%r8  \n\t" "mov %c[r9](%3),  %%r9  \n\t" "mov %c[r10](%3), %%r10 \n\t" "mov %c[r11](%3), %%r11 \n\t" "mov %c[r12](%3), %%r12 \n\t" "mov %c[r13](%3), %%r13 \n\t" "mov %c[r14](%3), %%r14 \n\t" "mov %c[r15](%3), %%r15 \n\t" "mov %c[rcx](%3), %%rcx \n\t"      /* kills %3 (rcx) */
3079 #else
3080                    "mov %c[cr2](%3), %%eax \n\t" "mov %%eax,   %%cr2 \n\t" "mov %c[rax](%3), %%eax \n\t" "mov %c[rbx](%3), %%ebx \n\t" "mov %c[rdx](%3), %%edx \n\t" "mov %c[rsi](%3), %%esi \n\t" "mov %c[rdi](%3), %%edi \n\t" "mov %c[rbp](%3), %%ebp \n\t" "mov %c[rcx](%3), %%ecx \n\t"    /* kills %3 (ecx) */
3081 #endif
3082                    /* Enter guest mode */
3083                    "jne launched \n\t"
3084                    "vmlaunch \n\t"
3085                    "jmp litevm_vmx_return \n\t"
3086                    "launched: vmresume \n\t"
3087                    ".globl litevm_vmx_return \n\t" "litevm_vmx_return: "
3088                    /* Save guest registers, load host registers, keep flags */
3089 #ifdef __x86_64__
3090                    "xchg %3,     0(%%rsp) \n\t"
3091                    "mov %%rax, %c[rax](%3) \n\t"
3092                    "mov %%rbx, %c[rbx](%3) \n\t"
3093                    "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3094                    "mov %%rdx, %c[rdx](%3) \n\t"
3095                    "mov %%rsi, %c[rsi](%3) \n\t"
3096                    "mov %%rdi, %c[rdi](%3) \n\t"
3097                    "mov %%rbp, %c[rbp](%3) \n\t"
3098                    "mov %%r8,  %c[r8](%3) \n\t"
3099                    "mov %%r9,  %c[r9](%3) \n\t"
3100                    "mov %%r10, %c[r10](%3) \n\t"
3101                    "mov %%r11, %c[r11](%3) \n\t"
3102                    "mov %%r12, %c[r12](%3) \n\t"
3103                    "mov %%r13, %c[r13](%3) \n\t"
3104                    "mov %%r14, %c[r14](%3) \n\t"
3105                    "mov %%r15, %c[r15](%3) \n\t"
3106                    "mov %%cr2, %%rax   \n\t"
3107                    "mov %%rax, %c[cr2](%3) \n\t"
3108                    "mov 0(%%rsp), %3 \n\t"
3109                    "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3110                    "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3111                    "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3112                    "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3113 #else
3114                    "xchg %3, 0(%%esp) \n\t"
3115                    "mov %%eax, %c[rax](%3) \n\t"
3116                    "mov %%ebx, %c[rbx](%3) \n\t"
3117                    "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3118                    "mov %%edx, %c[rdx](%3) \n\t"
3119                    "mov %%esi, %c[rsi](%3) \n\t"
3120                    "mov %%edi, %c[rdi](%3) \n\t"
3121                    "mov %%ebp, %c[rbp](%3) \n\t"
3122                    "mov %%cr2, %%eax  \n\t"
3123                    "mov %%eax, %c[cr2](%3) \n\t"
3124                    "mov 0(%%esp), %3 \n\t" "pop %%ecx; popa \n\t"
3125 #endif
3126                    "setbe %0 \n\t" "popf \n\t":"=g"(fail)
3127 :                  "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3128                    "c"(vcpu),
3129                    [rax] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3130                    [rbx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3131                    [rcx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3132                    [rdx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3133                    [rsi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3134                    [rdi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3135                    [rbp] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3136 #ifdef __x86_64__
3137                    [r8] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8])),
3138                    [r9] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9])),
3139                    [r10] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3140                    [r11] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3141                    [r12] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3142                    [r13] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3143                    [r14] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3144                    [r15] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3145 #endif
3146                    [cr2] "i"(offsetof(struct litevm_vcpu, cr2))
3147                    :"cc", "memory");
3148
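        /*
         * Back on the host side after an exit (or a failed entry): save the
         * guest's hand-switched MSRs and FPU state, then restore the host's.
         */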
3149         ++litevm_stat.exits;
3150         printk("vm_run exits\n");
3151         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3152         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3153
3154         fx_save(vcpu->guest_fx_image);
3155         fx_restore(vcpu->host_fx_image);
3156
3157 #ifndef __x86_64__
3158         asm("mov %0, %%ds; mov %0, %%es": :"r"(__USER_DS));
3159 #endif
3160
3161         litevm_run->exit_type = 0;
3162         if (fail) {
3163                 printk("FAIL\n");
3164                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3165                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3166                 printk("reason %d\n", litevm_run->exit_reason);
3167         } else {
3168                 printk("NOT FAIL\n");
3169                 if (fs_gs_ldt_reload_needed) {
3170                         load_ldt(ldt_sel);
3171                         load_fs(fs_sel);
3172                         /*
3173                          * If we have to reload gs, we must take care to
3174                          * preserve our gs base.
3175                          */
3176                         disable_irq();
3177                         load_gs(gs_sel);
3178 #ifdef __x86_64__
3179                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3180 #endif
3181                         enable_irq();
3182
3183                         reload_tss();
3184                 }
3185                 vcpu->launched = 1;
3186                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
3187                 printk("Let's see why it exited\n");
3188                 if (litevm_handle_exit(litevm_run, vcpu)) {
3189                         /* Give the scheduler a chance to reschedule. */
3190                         vcpu_put(vcpu);
3191 #warning "how to tell if signal is pending"
3192 /*
3193                         if (signal_pending(current)) {
3194                                 ++litevm_stat.signal_exits;
3195                                 return -EINTR;
3196                         }
3197 */
3198                         kthread_yield();
3199                         /* Cannot fail -  no vcpu unplug yet. */
3200                         vcpu_load(litevm, vcpu_slot(vcpu));
3201                         goto again;
3202                 }
3203         }
3204
3205         vcpu_put(vcpu);
3206         printk("vm_run returns\n");
3207         print_func_exit();
3208         return 0;
3209 }
3210
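/*
 * Copy the vcpu's general-purpose register state out to the caller; RSP, RIP,
 * and RFLAGS are read from the VMCS rather than the cached register array.
 */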
3211 static int litevm_dev_ioctl_get_regs(struct litevm *litevm,
3212                                                                          struct litevm_regs *regs)
3213 {
3214         print_func_entry();
3215         struct litevm_vcpu *vcpu;
3216
3217         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3218                 print_func_exit();
3219                 return -EINVAL;
3220         }
3221
3222         vcpu = vcpu_load(litevm, regs->vcpu);
3223         if (!vcpu) {
3224                 print_func_exit();
3225                 return -ENOENT;
3226         }
3227
3228         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3229         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3230         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3231         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3232         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3233         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3234         regs->rsp = vmcs_readl(GUEST_RSP);
3235         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3236 #ifdef __x86_64__
3237         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3238         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3239         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3240         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3241         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3242         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3243         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3244         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3245 #endif
3246
3247         regs->rip = vmcs_readl(GUEST_RIP);
3248         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3249
3250         /*
3251          * Don't leak debug flags in case they were set for guest debugging
3252          */
3253         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3254                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3255
3256         vcpu_put(vcpu);
3257
3258         print_func_exit();
3259         return 0;
3260 }
3261
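/*
 * Load caller-supplied general-purpose register state into the vcpu; RSP, RIP,
 * and RFLAGS are written straight into the VMCS.
 */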
3262 static int litevm_dev_ioctl_set_regs(struct litevm *litevm,
3263                                                                          struct litevm_regs *regs)
3264 {
3265         print_func_entry();
3266         struct litevm_vcpu *vcpu;
3267
3268         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3269                 print_func_exit();
3270                 return -EINVAL;
3271         }
3272
3273         vcpu = vcpu_load(litevm, regs->vcpu);
3274         if (!vcpu) {
3275                 print_func_exit();
3276                 return -ENOENT;
3277         }
3278
3279         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3280         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3281         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3282         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3283         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3284         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3285         vmcs_writel(GUEST_RSP, regs->rsp);
3286         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3287 #ifdef __x86_64__
3288         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3289         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3290         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3291         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3292         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3293         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3294         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3295         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3296 #endif
3297
3298         vmcs_writel(GUEST_RIP, regs->rip);
3299         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3300
3301         vcpu_put(vcpu);
3302
3303         print_func_exit();
3304         return 0;
3305 }
3306
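/*
 * Read the guest's segment, descriptor-table, and control register state out
 * of the VMCS and the vcpu into the caller's litevm_sregs.
 */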
3307 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm,
3308                                                                           struct litevm_sregs *sregs)
3309 {
3310         print_func_entry();
3311         struct litevm_vcpu *vcpu;
3312
3313         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3314                 print_func_exit();
3315                 return -EINVAL;
3316         }
3317         vcpu = vcpu_load(litevm, sregs->vcpu);
3318         if (!vcpu) {
3319                 print_func_exit();
3320                 return -ENOENT;
3321         }
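/*
 * Pull one guest segment register out of the VMCS: base, limit, selector, and
 * the access-rights byte unpacked into its individual fields.
 */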
3322 #define get_segment(var, seg) \
3323         do { \
3324                 uint32_t ar; \
3325                 \
3326                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3327                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3328                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3329                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3330                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3331                 sregs->var.type = ar & 15; \
3332                 sregs->var.s = (ar >> 4) & 1; \
3333                 sregs->var.dpl = (ar >> 5) & 3; \
3334                 sregs->var.present = (ar >> 7) & 1; \
3335                 sregs->var.avl = (ar >> 12) & 1; \
3336                 sregs->var.l = (ar >> 13) & 1; \
3337                 sregs->var.db = (ar >> 14) & 1; \
3338                 sregs->var.g = (ar >> 15) & 1; \
3339                 sregs->var.unusable = (ar >> 16) & 1; \
3340         } while (0)
3341
3342         get_segment(cs, CS);
3343         get_segment(ds, DS);
3344         get_segment(es, ES);
3345         get_segment(fs, FS);
3346         get_segment(gs, GS);
3347         get_segment(ss, SS);
3348
3349         get_segment(tr, TR);
3350         get_segment(ldt, LDTR);
3351 #undef get_segment
3352
3353 #define get_dtable(var, table) \
3354         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3355                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3356
3357         get_dtable(idt, IDTR);
3358         get_dtable(gdt, GDTR);
3359 #undef get_dtable
3360
3361         sregs->cr0 = guest_cr0();
3362         sregs->cr2 = vcpu->cr2;
3363         sregs->cr3 = vcpu->cr3;
3364         sregs->cr4 = guest_cr4();
3365         sregs->cr8 = vcpu->cr8;
3366         sregs->efer = vcpu->shadow_efer;
3367         sregs->apic_base = vcpu->apic_base;
3368
3369         sregs->pending_int = vcpu->irq_summary != 0;
3370
3371         vcpu_put(vcpu);
3372
3373         print_func_exit();
3374         return 0;
3375 }
3376
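/*
 * Push caller-supplied segment, descriptor-table, and control register state
 * into the VMCS and the vcpu.
 */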
3377 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm,
3378                                                                           struct litevm_sregs *sregs)
3379 {
3380         print_func_entry();
3381         struct litevm_vcpu *vcpu;
3382         int mmu_reset_needed = 0;
3383
3384         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3385                 print_func_exit();
3386                 return -EINVAL;
3387         }
3388         vcpu = vcpu_load(litevm, sregs->vcpu);
3389         if (!vcpu) {
3390                 print_func_exit();
3391                 return -ENOENT;
3392         }
3393 #define set_segment(var, seg) \
3394         do { \
3395                 uint32_t ar; \
3396                 \
3397                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
3398                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
3399                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
3400                 if (sregs->var.unusable) { \
3401                         ar = (1 << 16); \
3402                 } else { \
3403                         ar = (sregs->var.type & 15); \
3404                         ar |= (sregs->var.s & 1) << 4; \
3405                         ar |= (sregs->var.dpl & 3) << 5; \