9715e7ffd3b178af25861babf553ba47b0c69892
[akaros.git] / kern / arch / x86 / vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 /* from linux */
39 #define __KERNEL_CS 0x10
40 #define __KERNEL_DS 0x18
41 /* used? Who knows */
42 #define GDT_ENTRY_TSS 0x24
43
44 #define currentcpu (&per_cpu_info[core_id()])
45 #define QLOCK_init(x) {printk("qlock_init %p\n", x); qlock_init(x); printk("%p lock_inited\n", x);}
46 #define QLOCK(x) {printk("qlock %p\n", x); qlock(x); printk("%p locked\n", x);}
47 #define QUNLOCK(x) {printk("qunlock %p\n", x); qunlock(x); printk("%p unlocked\n", x);}
48 #define SPLI_irqsave(x){printk("spin_lock_init %p:", x); spinlock_init(x); printk("inited\n");}
49 #define SPLL(x){printk("spin_lock %p\n", x); spin_lock_irqsave(x); printk("%p locked\n", x);}
50 #define SPLU(x){printk("spin_unlock %p\n", x); spin_unlock(x); printk("%p unlocked\n", x);}
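/* Debugging wrappers: each of these traces the lock pointer with printk()
 * before and after the operation so lock-ordering problems show up in the
 * console log.  They otherwise just call the underlying qlock()/spinlock
 * primitives. */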
51 struct litevm_stat litevm_stat;
52
53 static struct litevm_stats_debugfs_item {
54         const char *name;
55         uint32_t *data;
56 } debugfs_entries[] = {
57         {"pf_fixed", &litevm_stat.pf_fixed},
58         {"pf_guest", &litevm_stat.pf_guest},
59         {"tlb_flush", &litevm_stat.tlb_flush},
60         {"invlpg", &litevm_stat.invlpg},
61         {"exits", &litevm_stat.exits},
62         {"io_exits", &litevm_stat.io_exits},
63         {"mmio_exits", &litevm_stat.mmio_exits},
64         {"signal_exits", &litevm_stat.signal_exits},
65         {"irq_exits", &litevm_stat.irq_exits},
66         {0, 0}
68 };
69
70 static struct dentry *debugfs_dir;
71
72 static const uint32_t vmx_msr_index[] = {
73 #ifdef __x86_64__
74         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
75 #endif
76         MSR_EFER,       // wtf? MSR_K6_STAR,
77 };
78
79 static const char* vmx_msr_name[] = {
80 #ifdef __x86_64__
81         "MSR_SYSCALL_MASK", "MSR_LSTAR", "MSR_CSTAR", "MSR_KERNEL_GS_BASE",
82 #endif
83         "MSR_EFER",     // wtf? MSR_K6_STAR,
84 };
85
86 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
87
88 #ifdef __x86_64__
89 /*
90  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard
91  * VT mechanism (CPU erratum AA24).
92  */
93 #define NR_BAD_MSRS 2
94 #else
95 #define NR_BAD_MSRS 0
96 #endif
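/* The first NR_BAD_MSRS entries of vmx_msr_index (MSR_SYSCALL_MASK and
 * MSR_LSTAR on x86_64) are kept out of the automatic MSR-load/store lists:
 * the VMCS addresses programmed in litevm_vcpu_setup() point at
 * guest_msrs + NR_BAD_MSRS and host_msrs + NR_BAD_MSRS, and the
 * load/store counts are reduced by NR_BAD_MSRS as well. */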
97
98 #define TSS_IOPB_BASE_OFFSET 0x66
99 #define TSS_BASE_SIZE 0x68
100 #define TSS_IOPB_SIZE (65536 / 8)
101 #define TSS_REDIRECTION_SIZE (256 / 8)
102 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
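/* Layout of the emulated real-mode TSS: a 0x68-byte base TSS, a 32-byte
 * interrupt-redirection bitmap, an 8 KiB I/O-permission bitmap, and one
 * trailing 0xff terminator byte -- 8329 bytes in all, which is why
 * init_rmode_tss() below populates three guest page frames. */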
103
104 #define MSR_IA32_VMX_BASIC_MSR                  0x480
105 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
106 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
107 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
108 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
109
110 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
111 #define LMSW_GUEST_MASK 0x0eULL
112 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
113 //#define CR4_VMXE 0x2000
114 #define CR8_RESEVED_BITS (~0x0fULL)
115 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
116
117 #ifdef __x86_64__
118 #define HOST_IS_64 1
119 #else
120 #define HOST_IS_64 0
121 #endif
122
123 /* bit ops not yet widely used in akaros and we're not sure where to put them. */
124 /**
125  * __ffs - find first set bit in word
126  * @word: The word to search
127  *
128  * Undefined if no bit exists, so code should check against 0 first.
129  */
130 static inline unsigned long __ffs(unsigned long word)
131 {
132         print_func_entry();
133         asm("rep; bsf %1, %0"
134             : "=r"(word) : "rm"(word));
135         print_func_exit();
136         return word;
137 }
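/* Example: __ffs(0x18) == 3, since bit 3 is the lowest set bit.  For an
 * input of 0, BSF leaves the destination register unchanged, so the result
 * is undefined -- callers must test for 0 first, as the comment above says. */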
138
139 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu,
140                                                                                         uint32_t msr)
141 {
142         print_func_entry();
143         int i;
144
145         for (i = 0; i < vcpu->nmsrs; ++i)
146                 if (vcpu->guest_msrs[i].index == msr) {
147                         print_func_exit();
148                         return &vcpu->guest_msrs[i];
149                 }
150         print_func_exit();
151         return 0;
152 }
153
154 struct descriptor_table {
155         uint16_t limit;
156         unsigned long base;
157 } __attribute__ ((packed));
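/* This packed struct matches the memory operand of SGDT/SIDT: a 16-bit
 * table limit immediately followed by the linear base address (8 bytes in
 * long mode), with no padding in between. */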
158
159 static void get_gdt(struct descriptor_table *table)
160 {
161         print_func_entry();
162         asm("sgdt %0" : "=m"(*table));
163         print_func_exit();
164 }
165
166 static void get_idt(struct descriptor_table *table)
167 {
168         print_func_entry();
169         asm("sidt %0" : "=m"(*table));
170         print_func_exit();
171 }
172
173 static uint16_t read_fs(void)
174 {
175         print_func_entry();
176         uint16_t seg;
177         asm("mov %%fs, %0" : "=g"(seg));
178         print_func_exit();
179         return seg;
180 }
181
182 static uint16_t read_gs(void)
183 {
184         print_func_entry();
185         uint16_t seg;
186         asm("mov %%gs, %0" : "=g"(seg));
187         print_func_exit();
188         return seg;
189 }
190
191 static uint16_t read_ldt(void)
192 {
193         print_func_entry();
194         uint16_t ldt;
195         asm("sldt %0" : "=g"(ldt));
196         print_func_exit();
197         return ldt;
198 }
199
200 static void load_fs(uint16_t sel)
201 {
202         print_func_entry();
203         asm("mov %0, %%fs" : : "g"(sel));
204         print_func_exit();
205 }
206
207 static void load_gs(uint16_t sel)
208 {
209         print_func_entry();
210         asm("mov %0, %%gs" : : "g"(sel));
211         print_func_exit();
212 }
213
214 #ifndef load_ldt
215 static void load_ldt(uint16_t sel)
216 {
217         print_func_entry();
218         asm("lldt %0" : : "g"(sel));
219         print_func_exit();
220 }
221 #endif
222
223 static void fx_save(void *image)
224 {
225         print_func_entry();
226         asm("fxsave (%0)"::"r"(image));
227         print_func_exit();
228 }
229
230 static void fx_restore(void *image)
231 {
232         print_func_entry();
233         asm("fxrstor (%0)"::"r"(image));
234         print_func_exit();
235 }
236
237 static void fpu_init(void)
238 {
239         print_func_entry();
240         asm("finit");
241         print_func_exit();
242 }
243
244 struct segment_descriptor {
245         uint16_t limit_low;
246         uint16_t base_low;
247         uint8_t base_mid;
248         uint8_t type:4;
249         uint8_t system:1;
250         uint8_t dpl:2;
251         uint8_t present:1;
252         uint8_t limit_high:4;
253         uint8_t avl:1;
254         uint8_t long_mode:1;
255         uint8_t default_op:1;
256         uint8_t granularity:1;
257         uint8_t base_high;
258 } __attribute__ ((packed));
259
260 #ifdef __x86_64__
261 // LDT or TSS descriptor in the GDT. 16 bytes.
262 struct segment_descriptor_64 {
263         struct segment_descriptor s;
264         uint32_t base_higher;
265         uint32_t pad_zero;
266 };
267
268 #endif
269
270 static unsigned long segment_base(uint16_t selector)
271 {
272         print_func_entry();
273         struct descriptor_table gdt;
274         struct segment_descriptor *d;
275         unsigned long table_base;
276         typedef unsigned long ul;
277         unsigned long v;
278
279         asm("sgdt %0" : "=m"(gdt));
280         table_base = gdt.base;
281
282         if (selector & 4) {     /* from ldt */
283                 uint16_t ldt_selector;
284
285         asm("sldt %0" : "=g"(ldt_selector));
286                 table_base = segment_base(ldt_selector);
287         }
288         d = (struct segment_descriptor *)(table_base + (selector & ~7));
289         v = d->base_low | ((ul) d->base_mid << 16) | ((ul) d->base_high << 24);
290 #ifdef __x86_64__
291         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
292                 v |= ((ul) ((struct segment_descriptor_64 *)d)->base_higher) << 32;
293 #endif
294         print_func_exit();
295         return v;
296 }
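/* Bit 2 of the selector (TI) picks the LDT over the GDT, and the low three
 * bits are masked off to form a byte offset into the table.  The base is
 * reassembled from the split descriptor fields:
 *
 *      base = base_low | (base_mid << 16) | (base_high << 24);
 *
 * and 64-bit system descriptors (LDT/TSS types 2, 9, 11) contribute an
 * extra base_higher dword for bits 63:32. */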
297
298 static unsigned long read_tr_base(void)
299 {
300         print_func_entry();
301         uint16_t tr;
302         asm("str %0" : "=g"(tr));
303         print_func_exit();
304         return segment_base(tr);
305 }
306
307 static void reload_tss(void)
308 {
309         print_func_entry();
310 #ifndef __x86_64__
311
312         /*
313          * VT restores TR but not its size.  Useless.
314          */
315         struct descriptor_table gdt;
316         struct segment_descriptor *descs;
317
318         get_gdt(&gdt);
319         descs = (void *)gdt.base;
320         descs[GDT_ENTRY_TSS].type = 9;  /* available TSS */
321         load_TR_desc();
322 #endif
323         print_func_exit();
324 }
325
326 static struct vmcs_descriptor {
327         int size;
328         int order;
329         uint32_t revision_id;
330 } vmcs_descriptor;
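/* Filled in by setup_vmcs_descriptor() from the IA32_VMX_BASIC MSR: the
 * low 32 bits give the revision identifier that must be written into the
 * first dword of every VMXON/VMCS region, bits 44:32 give the region size,
 * and 'order' is the log2 page count used when allocating regions. */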
331
332 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
333 {
334         print_func_entry();
335         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
336         print_func_exit();
337         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
338 }
339
340 int litevm_read_guest(struct litevm_vcpu *vcpu,
341                                           gva_t addr, unsigned long size, void *dest)
342 {
343         print_func_entry();
344         unsigned char *host_buf = dest;
345         unsigned long req_size = size;
346
347         while (size) {
348                 hpa_t paddr;
349                 unsigned now;
350                 unsigned offset;
351                 hva_t guest_buf;
352
353                 paddr = gva_to_hpa(vcpu, addr);
354
355                 if (is_error_hpa(paddr))
356                         break;
357                 guest_buf = (hva_t) KADDR(paddr);
358                 offset = addr & ~PAGE_MASK;
359                 guest_buf |= offset;
360                 now = MIN(size, PAGE_SIZE - offset);
361                 memcpy(host_buf, (void *)guest_buf, now);
362                 host_buf += now;
363                 addr += now;
364                 size -= now;
365         }
366         print_func_exit();
367         return req_size - size;
368 }
369
370 int litevm_write_guest(struct litevm_vcpu *vcpu,
371                                            gva_t addr, unsigned long size, void *data)
372 {
373         print_func_entry();
374         unsigned char *host_buf = data;
375         unsigned long req_size = size;
376
377         while (size) {
378                 hpa_t paddr;
379                 unsigned now;
380                 unsigned offset;
381                 hva_t guest_buf;
382
383                 paddr = gva_to_hpa(vcpu, addr);
384
385                 if (is_error_hpa(paddr))
386                         break;
387
388                 guest_buf = (hva_t) KADDR(paddr);
389                 offset = addr & ~PAGE_MASK;
390                 guest_buf |= offset;
391                 now = MIN(size, PAGE_SIZE - offset);
392                 memcpy((void *)guest_buf, host_buf, now);
393                 host_buf += now;
394                 addr += now;
395                 size -= now;
396         }
397         print_func_exit();
398         return req_size - size;
399 }
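/* Both guest-copy helpers walk at most one page per loop iteration,
 * translating each page with gva_to_hpa(), and return the number of bytes
 * actually copied (short if a translation fails).  A minimal, hypothetical
 * caller sketch:
 *
 *      uint8_t insn[16];
 *      int n = litevm_read_guest(vcpu, vmcs_readl(GUEST_RIP),
 *                                sizeof(insn), insn);
 *      if (n < sizeof(insn))
 *              printk("short read at guest rip\n");
 */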
400
401 static void setup_vmcs_descriptor(void)
402 {
403         print_func_entry();
404         uint64_t msr;
405
406         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
407         vmcs_descriptor.size = (msr >> 32) & 0x1fff;
408         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size >> PAGE_SHIFT);
409         vmcs_descriptor.revision_id = (uint32_t) msr;
410         printk("setup_vmcs_descriptor: msr 0x%llx, size 0x%x order 0x%x id 0x%x\n",
411                    msr, vmcs_descriptor.size, vmcs_descriptor.order,
412                    vmcs_descriptor.revision_id);
413         print_func_exit();
414 };
415
416 static void vmcs_clear(struct vmcs *vmcs)
417 {
418         print_func_entry();
419         uint64_t phys_addr = PADDR(vmcs);
420         uint8_t error;
421         printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
422         asm volatile ("vmclear %1; setna %0":"=m" (error):"m"(phys_addr):"cc",
423                                   "memory");
424         if (error)
425                 printk("litevm: vmclear fail: %p/%llx\n", vmcs, phys_addr);
426         print_func_exit();
427 }
428
429 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
430 {
431         print_func_entry();
432         struct litevm_vcpu *vcpu = arg;
433         int cpu = core_id();
434         printd
435                 ("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n",
436                  cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
437
438         if (vcpu->cpu == cpu)
439                 vmcs_clear(vcpu->vmcs);
440
441         if (currentcpu->vmcs == vcpu->vmcs)
442                 currentcpu->vmcs = NULL;
443         print_func_exit();
444 }
445
446 static int vcpu_slot(struct litevm_vcpu *vcpu)
447 {
448         print_func_entry();
449         print_func_exit();
450         return vcpu - vcpu->litevm->vcpus;
451 }
452
453 /*
454  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
455  * vcpu mutex is already taken.
456  */
457 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
458 {
459         print_func_entry();
460         uint64_t phys_addr = PADDR(vcpu->vmcs);
461         int cpu;
462         cpu = core_id();
463
464         printk("__vcpu_load: vcpu->cpu %d cpu %d\n", vcpu->cpu, cpu);
465         if ((vcpu->cpu != cpu) && (vcpu->cpu != -1)){
466                 handler_wrapper_t *w;
467                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
468                 smp_call_wait(w);
469                 vcpu->launched = 0;
470         }
471
472         printk("2 ..");
473         if (currentcpu->vmcs != vcpu->vmcs) {
474                 uint8_t error;
475
476                 currentcpu->vmcs = vcpu->vmcs;
477                 asm volatile ("vmptrld %1; setna %0":"=m" (error):"m"(phys_addr):"cc");
478                 if (error) {
479                         printk("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
480                         error("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
481                 }
482         }
483
484         printk("3 ..");
485         if (vcpu->cpu != cpu) {
486                 struct descriptor_table dt;
487                 unsigned long sysenter_esp;
488
489                 vcpu->cpu = cpu;
490                 /*
491                  * Linux uses per-cpu TSS and GDT, so set these when switching
492                  * processors.
493                  */
494                 vmcs_writel(HOST_TR_BASE, read_tr_base());      /* 22.2.4 */
495                 get_gdt(&dt);
496                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
497
498                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
499                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
500         }
501         print_func_exit();
502         return vcpu;
503 }
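/* Migration logic: if this vcpu's VMCS is still current on another core,
 * IPI that core to VMCLEAR it first; then VMPTRLD it here if it isn't
 * already the current VMCS; and if the vcpu changed cores, refresh the
 * host-state fields that are per-cpu (TR base, GDTR base, SYSENTER_ESP). */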
504
505 /*
506  * Switches to specified vcpu, until a matching vcpu_put()
507  * And leaves it locked!
508  */
509 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
510 {
511         struct litevm_vcpu *ret;
512         print_func_entry();
513         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
514
515         printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
516
517         QLOCK(&vcpu->mutex);
518         printk("Locked\n");
519         if (!vcpu->vmcs) {
520                 QUNLOCK(&vcpu->mutex);
521                 printk("vcpu->vmcs for vcpu %p is NULL", vcpu);
522                 error("vcpu->vmcs is NULL");
523         }
524         ret = __vcpu_load(vcpu);
525         print_func_exit();
526         return ret;
527 }
528
529 static void vcpu_put(struct litevm_vcpu *vcpu)
530 {
531         print_func_entry();
532         //put_cpu();
533         QUNLOCK(&vcpu->mutex);
534         print_func_exit();
535 }
536
537 static struct vmcs *alloc_vmcs_cpu(int cpu)
538 {
539         print_func_entry();
540         int node = node_id();
541         struct vmcs *vmcs;
542
543         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
544         if (!vmcs) {
545                 print_func_exit();
546                 printk("no memory for vcpus");
547                 error("no memory for vcpus");
548         }
549         memset(vmcs, 0, vmcs_descriptor.size);
550         vmcs->revision_id = vmcs_descriptor.revision_id;        /* vmcs revision id */
551         print_func_exit();
552         return vmcs;
553 }
554
555 static struct vmcs *alloc_vmcs(void)
556 {
557         struct vmcs *ret;
558         print_func_entry();
559         ret = alloc_vmcs_cpu(core_id());
560         print_func_exit();
561         return ret;
562 }
563
564 static int cpu_has_litevm_support(void)
565 {
566         print_func_entry();
567         /* sigh ... qemu. */
568         char vid[16];
569         if (vendor_id(vid) < 0)
570                 return 0;
571         printk("vendor id is %s\n", vid);
572         if (vid[0] == 'Q') /* qemu */
573                 return 0;
574         if (vid[0] == 'A') /* AMD or qemu claiming to be AMD */
575                 return 0;
576         uint32_t ecx = cpuid_ecx(1);
577         print_func_exit();
578         return ecx & (1 << 5);  /* CPUID.1:ECX.VMX[bit 5] -> VT */
579 }
580
581 static int vmx_disabled_by_bios(void)
582 {
583         print_func_entry();
584         uint64_t msr;
585
586         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
587         print_func_exit();
588         return (msr & 5) == 1;  /* locked but not enabled */
589 }
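/* IA32_FEATURE_CONTROL: bit 0 is the lock bit, bit 2 enables VMXON outside
 * SMX.  (msr & 5) == 1 therefore means the BIOS locked the MSR with VMX
 * left disabled; vm_enable() below sets both bits itself whenever the MSR
 * is still unlocked. */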
590
591 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
592 {
593         print_func_entry();
594         int cpu = hw_core_id();
595         uint64_t phys_addr;
596         uint64_t old;
597         uint64_t status = 0;
598         currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
599                                                                                           KMALLOC_WAIT);
600         if (!currentcpu->vmxarea)
601                 return;
602         memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
603         currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
604         phys_addr = PADDR(currentcpu->vmxarea);
605         printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
606                    currentcpu->vmxarea, (void *)phys_addr);
607         if (phys_addr & 0xfff) {
608                 printk("fix vmxarea alignment!");
609         }
610         printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
611         old = read_msr(MSR_IA32_FEATURE_CONTROL);
612         printk("%d: vm_enable, old is 0x%llx\n", core_id(), old);
613         if ((old & 5) == 0) {
614                 /* enable and lock */
615                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
616                 old = read_msr(MSR_IA32_FEATURE_CONTROL);
617                 printk("%d:vm_enable, tried to set 5, old is 0x%llx\n", core_id(), old);
618         }
619         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
620         lcr4(rcr4() | CR4_VMXE);        /* FIXME: not cpu hotplug safe */
621         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
622         printk("%d:cr0 is %x\n", core_id(), rcr0());
623         lcr0(rcr0() | 0x20);
624         printk("%d:cr0 is %x\n", core_id(), rcr0());
625         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
626         outb(0x92, inb(0x92) | 2);
627         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
628         asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:":"=m" (status):"m"
629                                   (phys_addr):"memory", "cc");
630         printk("%d:vmxon status is 0x%llx\n", core_id(), status);
631         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
632         if (!status) {
633                 printk("%d:vm_enable: status says fail\n", core_id());
634         }
635         print_func_exit();
636 }
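/* Enable sequence, per core: allocate a VMXON region tagged with the VMCS
 * revision id, set IA32_FEATURE_CONTROL (lock + enable) if it is still
 * unlocked, set CR4.VMXE and CR0.NE (the 0x20 bit), make sure the A20 gate
 * is on via port 0x92, then execute VMXON on the region's physical address,
 * using the CF/ZF failure convention to report success in 'status'. */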
637
638 static void litevm_disable(void *garbage)
639 {
640         print_func_entry();
641         asm volatile ("vmxoff":::"cc");
642         print_func_exit();
643 }
644
645 struct litevm *vmx_open(void)
646 {
647         print_func_entry();
648         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
649         int i;
650
651         printk("vmx_open: litevm is %p\n", litevm);
652         if (!litevm) {
653                 printk("NO LITEVM! MAKES NO SENSE!\n");
654                 error("litevm alloc failed");
655                 print_func_exit();
656                 return 0;
657         }
658
659         SPLI_irqsave(&litevm->lock);
660         LIST_INIT(&litevm->link);
661         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
662                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
663                 printk("init vcpu %p\n", vcpu);
664
665                 QLOCK_init(&vcpu->mutex);
666                 vcpu->mmu.root_hpa = INVALID_PAGE;
667                 vcpu->litevm = litevm;
668                 LIST_INIT(&vcpu->link);
669         }
670         printk("vmx_open: busy %d\n", litevm->busy);
671         printk("return %p\n", litevm);
672         print_func_exit();
673         return litevm;
674 }
675
676 /*
677  * Free any memory in @free but not in @dont.
678  */
679 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
680                                                                          struct litevm_memory_slot *dont)
681 {
682         print_func_entry();
683         int i;
684
685         if (!dont || free->phys_mem != dont->phys_mem)
686                 if (free->phys_mem) {
687                         for (i = 0; i < free->npages; ++i) {
688                                 page_t *page = free->phys_mem[i];
689                                 page_decref(page);
690                                 assert(page_is_free(page2ppn(page)));
691                         }
692                         kfree(free->phys_mem);
693                 }
694
695         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
696                 kfree(free->dirty_bitmap);
697
698         free->phys_mem = 0;
699         free->npages = 0;
700         free->dirty_bitmap = 0;
701         print_func_exit();
702 }
703
704 static void litevm_free_physmem(struct litevm *litevm)
705 {
706         print_func_entry();
707         int i;
708
709         for (i = 0; i < litevm->nmemslots; ++i)
710                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
711         print_func_exit();
712 }
713
714 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
715 {
716         print_func_entry();
717         if (vcpu->vmcs) {
718                 handler_wrapper_t *w;
719                 smp_call_function_all(__vcpu_clear, vcpu, &w);
720                 smp_call_wait(w);
721                 //free_vmcs(vcpu->vmcs);
722                 vcpu->vmcs = 0;
723         }
724         print_func_exit();
725 }
726
727 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
728 {
729         print_func_entry();
730         litevm_free_vmcs(vcpu);
731         litevm_mmu_destroy(vcpu);
732         print_func_exit();
733 }
734
735 static void litevm_free_vcpus(struct litevm *litevm)
736 {
737         print_func_entry();
738         unsigned int i;
739
740         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
741                 litevm_free_vcpu(&litevm->vcpus[i]);
742         print_func_exit();
743 }
744
745 static int litevm_dev_release(struct litevm *litevm)
746 {
747         print_func_entry();
748
749         litevm_free_vcpus(litevm);
750         litevm_free_physmem(litevm);
751         kfree(litevm);
752         print_func_exit();
753         return 0;
754 }
755
756 unsigned long vmcs_readl(unsigned long field)
757 {
758         print_func_entry();
759         unsigned long value;
760
761         asm volatile ("vmread %1, %0":"=g" (value):"r"(field):"cc");
762         print_func_exit();
763         return value;
764 }
765
766 void vmcs_writel(unsigned long field, unsigned long value)
767 {
768         print_func_entry();
769         uint8_t error;
770
771         asm volatile ("vmwrite %1, %2; setna %0":"=g" (error):"r"(value),
772                                   "r"(field):"cc");
773         if (error)
774                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
775                            field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
776         print_func_exit();
777 }
778
779 static void vmcs_write16(unsigned long field, uint16_t value)
780 {
781         print_func_entry();
782         vmcs_writel(field, value);
783         print_func_exit();
784 }
785
786 static void vmcs_write64(unsigned long field, uint64_t value)
787 {
788         print_func_entry();
789 #ifdef __x86_64__
790         vmcs_writel(field, value);
791 #else
792         vmcs_writel(field, value);
793         asm volatile ("");
794         vmcs_writel(field + 1, value >> 32);
795 #endif
796         print_func_exit();
797 }
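/* On 32-bit hosts a 64-bit VMCS field has to be written as two 32-bit
 * halves; the VMCS encoding of the high half is the field encoding plus 1,
 * which is what the field + 1 write above relies on.  On x86_64 a single
 * vmwrite of the full value suffices. */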
798
799 static void inject_gp(struct litevm_vcpu *vcpu)
800 {
801         print_func_entry();
802         printd("inject_general_protection: rip 0x%lx\n", vmcs_readl(GUEST_RIP));
803         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
804         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
805                                  GP_VECTOR |
806                                  INTR_TYPE_EXCEPTION |
807                                  INTR_INFO_DELIEVER_CODE_MASK | INTR_INFO_VALID_MASK);
808         print_func_exit();
809 }
810
811 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
812 {
813         print_func_entry();
814         if (vcpu->rmode.active)
815                 vmcs_write32(EXCEPTION_BITMAP, ~0);
816         else
817                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
818         print_func_exit();
819 }
820
821 static void enter_pmode(struct litevm_vcpu *vcpu)
822 {
823         print_func_entry();
824         unsigned long flags;
825
826         vcpu->rmode.active = 0;
827
828         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
829         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
830         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
831
832         flags = vmcs_readl(GUEST_RFLAGS);
833         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
834         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
835         vmcs_writel(GUEST_RFLAGS, flags);
836
837         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
838                                 (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK));
839
840         update_exception_bitmap(vcpu);
841
842 #define FIX_PMODE_DATASEG(seg, save) {                          \
843                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
844                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
845                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
846                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
847         }
848
849         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
850         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
851         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
852         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
853         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
854
855         vmcs_write16(GUEST_CS_SELECTOR,
856                                  vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
857         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
858         print_func_exit();
859 }
860
861 static int rmode_tss_base(struct litevm *litevm)
862 {
863         print_func_entry();
864         gfn_t base_gfn =
865                 litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
866         print_func_exit();
867         return base_gfn << PAGE_SHIFT;
868 }
869
870 static void enter_rmode(struct litevm_vcpu *vcpu)
871 {
872         print_func_entry();
873         unsigned long flags;
874
875         vcpu->rmode.active = 1;
876
877         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
878         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
879
880         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
881         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
882
883         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
884         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
885
886         flags = vmcs_readl(GUEST_RFLAGS);
887         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
888
889         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
890
891         vmcs_writel(GUEST_RFLAGS, flags);
892         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
893         update_exception_bitmap(vcpu);
894
895 #define FIX_RMODE_SEG(seg, save) {                                 \
896                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
897                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
898                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
899                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
900         }
901
902         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
903         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
904
905         FIX_RMODE_SEG(ES, vcpu->rmode.es);
906         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
907         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
908         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
909         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
910         print_func_exit();
911 }
912
913 static int init_rmode_tss(struct litevm *litevm)
914 {
915         print_func_entry();
916         struct page *p1, *p2, *p3;
917         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
918         char *page;
919
920         p1 = _gfn_to_page(litevm, fn++);
921         p2 = _gfn_to_page(litevm, fn++);
922         p3 = _gfn_to_page(litevm, fn);
923
924         if (!p1 || !p2 || !p3) {
925                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
926                 print_func_exit();
927                 return 0;
928         }
929
930         page = page2kva(p1);
931         memset(page, 0, PAGE_SIZE);
932         *(uint16_t *) (page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
933
934         page = page2kva(p2);
935         memset(page, 0, PAGE_SIZE);
936
937         page = page2kva(p3);
938         memset(page, 0, PAGE_SIZE);
939         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
940
941         print_func_exit();
942         return 1;
943 }
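/* The three frames make up the vm86-mode TSS: the first holds the zeroed
 * base TSS with its I/O-map base (offset 0x66, TSS_IOPB_BASE_OFFSET)
 * pointing just past the interrupt-redirection bitmap, and the last byte of
 * the third frame -- offset RMODE_TSS_SIZE - 1 from the TSS start -- is set
 * to 0xff to terminate the I/O-permission bitmap. */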
944
945 #ifdef __x86_64__
946
947 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
948 {
949         print_func_entry();
950         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
951
952         vcpu->shadow_efer = efer;
953         if (efer & EFER_LMA) {
954                 vmcs_write32(VM_ENTRY_CONTROLS,
955                                          vmcs_read32(VM_ENTRY_CONTROLS) |
956                                          VM_ENTRY_CONTROLS_IA32E_MASK);
957                 msr->data = efer;
958
959         } else {
960                 vmcs_write32(VM_ENTRY_CONTROLS,
961                                          vmcs_read32(VM_ENTRY_CONTROLS) &
962                                          ~VM_ENTRY_CONTROLS_IA32E_MASK);
963
964                 msr->data = efer & ~EFER_LME;
965         }
966         print_func_exit();
967 }
968
969 static void enter_lmode(struct litevm_vcpu *vcpu)
970 {
971         print_func_entry();
972         uint32_t guest_tr_ar;
973
974         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
975         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
976                 printd("%s: tss fixup for long mode. \n", __FUNCTION__);
977                 vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
978                                          | AR_TYPE_BUSY_64_TSS);
979         }
980
981         vcpu->shadow_efer |= EFER_LMA;
982
983         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
984         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
985                                  | VM_ENTRY_CONTROLS_IA32E_MASK);
986         print_func_exit();
987 }
988
989 static void exit_lmode(struct litevm_vcpu *vcpu)
990 {
991         print_func_entry();
992         vcpu->shadow_efer &= ~EFER_LMA;
993
994         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
995                                  & ~VM_ENTRY_CONTROLS_IA32E_MASK);
996         print_func_exit();
997 }
998
999 #endif
1000
1001 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1002 {
1003         print_func_entry();
1004         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
1005                 enter_pmode(vcpu);
1006
1007         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
1008                 enter_rmode(vcpu);
1009
1010 #ifdef __x86_64__
1011         if (vcpu->shadow_efer & EFER_LME) {
1012                 if (!is_paging() && (cr0 & CR0_PG_MASK))
1013                         enter_lmode(vcpu);
1014                 if (is_paging() && !(cr0 & CR0_PG_MASK))
1015                         exit_lmode(vcpu);
1016         }
1017 #endif
1018
1019         vmcs_writel(CR0_READ_SHADOW, cr0);
1020         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
1021         print_func_exit();
1022 }
1023
1024 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
1025                                                                                  unsigned long cr3)
1026 {
1027         print_func_entry();
1028         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
1029         unsigned offset = (cr3 & (PAGE_SIZE - 1)) >> 5;
1030         int i;
1031         uint64_t pdpte;
1032         uint64_t *pdpt;
1033         struct litevm_memory_slot *memslot;
1034
1035         SPLL(&vcpu->litevm->lock);
1036         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
1037         /* FIXME: !memslot - emulate? 0xff? */
1038         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
1039
1040         for (i = 0; i < 4; ++i) {
1041                 pdpte = pdpt[offset + i];
1042                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
1043                         break;
1044         }
1045
1046         SPLU(&vcpu->litevm->lock);
1047
1048         print_func_exit();
1049         return i != 4;
1050 }
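/* In PAE mode CR3 points at four 8-byte PDPTEs on a 32-byte boundary.  The
 * loop above loads them from guest memory and reports whether any present
 * entry (bit 0 set) also has a bit set in the 0xfffffff0000001e6 reserved
 * mask; the CR0/CR3/CR4 setters use this to decide whether to inject #GP. */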
1051
1052 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1053 {
1054         print_func_entry();
1055         if (cr0 & CR0_RESEVED_BITS) {
1056                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, guest_cr0());
1057                 inject_gp(vcpu);
1058                 print_func_exit();
1059                 return;
1060         }
1061
1062         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
1063                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
1064                 inject_gp(vcpu);
1065                 print_func_exit();
1066                 return;
1067         }
1068
1069         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
1070                 printd("set_cr0: #GP, set PG flag " "and a clear PE flag\n");
1071                 inject_gp(vcpu);
1072                 print_func_exit();
1073                 return;
1074         }
1075
1076         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
1077 #ifdef __x86_64__
1078                 if ((vcpu->shadow_efer & EFER_LME)) {
1079                         uint32_t guest_cs_ar;
1080                         if (!is_pae()) {
1081                                 printd("set_cr0: #GP, start paging "
1082                                            "in long mode while PAE is disabled\n");
1083                                 inject_gp(vcpu);
1084                                 print_func_exit();
1085                                 return;
1086                         }
1087                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1088                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
1089                                 printd("set_cr0: #GP, start paging "
1090                                            "in long mode while CS.L == 1\n");
1091                                 inject_gp(vcpu);
1092                                 print_func_exit();
1093                                 return;
1094
1095                         }
1096                 } else
1097 #endif
1098                 if (is_pae() && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1099                         printd("set_cr0: #GP, pdptrs " "reserved bits\n");
1100                         inject_gp(vcpu);
1101                         print_func_exit();
1102                         return;
1103                 }
1104
1105         }
1106
1107         __set_cr0(vcpu, cr0);
1108         litevm_mmu_reset_context(vcpu);
1109         print_func_exit();
1110         return;
1111 }
1112
1113 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
1114 {
1115         print_func_entry();
1116         unsigned long cr0 = guest_cr0();
1117
1118         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
1119                 enter_pmode(vcpu);
1120                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
1121
1122         } else
1123                 printd("lmsw: unexpected\n");
1124
1125         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
1126                                 | (msw & LMSW_GUEST_MASK));
1127         print_func_exit();
1128 }
1129
1130 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1131 {
1132         print_func_entry();
1133         vmcs_writel(CR4_READ_SHADOW, cr4);
1134         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1135                                                                   LITEVM_RMODE_VM_CR4_ALWAYS_ON :
1136                                                                   LITEVM_PMODE_VM_CR4_ALWAYS_ON));
1137         print_func_exit();
1138 }
1139
1140 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1141 {
1142         print_func_entry();
1143         if (cr4 & CR4_RESEVED_BITS) {
1144                 printd("set_cr4: #GP, reserved bits\n");
1145                 inject_gp(vcpu);
1146                 print_func_exit();
1147                 return;
1148         }
1149
1150         if (is_long_mode()) {
1151                 if (!(cr4 & CR4_PAE_MASK)) {
1152                         printd("set_cr4: #GP, clearing PAE while " "in long mode\n");
1153                         inject_gp(vcpu);
1154                         print_func_exit();
1155                         return;
1156                 }
1157         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
1158                            && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1159                 printd("set_cr4: #GP, pdptrs reserved bits\n");
1160                 inject_gp(vcpu);
1161         }
1162
1163         if (cr4 & CR4_VMXE_MASK) {
1164                 printd("set_cr4: #GP, setting VMXE\n");
1165                 inject_gp(vcpu);
1166                 print_func_exit();
1167                 return;
1168         }
1169         __set_cr4(vcpu, cr4);
1170         SPLL(&vcpu->litevm->lock);
1171         litevm_mmu_reset_context(vcpu);
1172         SPLU(&vcpu->litevm->lock);
1173         print_func_exit();
1174 }
1175
1176 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1177 {
1178         print_func_entry();
1179         if (is_long_mode()) {
1180                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
1181                         printd("set_cr3: #GP, reserved bits\n");
1182                         inject_gp(vcpu);
1183                         print_func_exit();
1184                         return;
1185                 }
1186         } else {
1187                 if (cr3 & CR3_RESEVED_BITS) {
1188                         printd("set_cr3: #GP, reserved bits\n");
1189                         inject_gp(vcpu);
1190                         print_func_exit();
1191                         return;
1192                 }
1193                 if (is_paging() && is_pae() && pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1194                         printd("set_cr3: #GP, pdptrs " "reserved bits\n");
1195                         inject_gp(vcpu);
1196                         print_func_exit();
1197                         return;
1198                 }
1199         }
1200
1201         vcpu->cr3 = cr3;
1202         SPLL(&vcpu->litevm->lock);
1203         vcpu->mmu.new_cr3(vcpu);
1204         SPLU(&vcpu->litevm->lock);
1205         print_func_exit();
1206 }
1207
1208 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1209 {
1210         print_func_entry();
1211         if (cr8 & CR8_RESEVED_BITS) {
1212                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1213                 inject_gp(vcpu);
1214                 print_func_exit();
1215                 return;
1216         }
1217         vcpu->cr8 = cr8;
1218         print_func_exit();
1219 }
1220
1221 static uint32_t get_rdx_init_val(void)
1222 {
1223         print_func_entry();
1224         uint32_t val;
1225
1226         asm("movl $1, %%eax\n\t" "movl %%eax, %0\n\t" : "=g"(val));
1227         print_func_exit();
1228         return val;
1229
1230 }
1231
1232 static void fx_init(struct litevm_vcpu *vcpu)
1233 {
1234         print_func_entry();
1235         struct __attribute__ ((__packed__)) fx_image_s {
1236                 uint16_t control;               //fcw
1237                 uint16_t status;                //fsw
1238                 uint16_t tag;                   // ftw
1239                 uint16_t opcode;                //fop
1240                 uint64_t ip;                    // fpu ip
1241                 uint64_t operand;               // fpu dp
1242                 uint32_t mxcsr;
1243                 uint32_t mxcsr_mask;
1244
1245         } *fx_image;
1246
1247         fx_save(vcpu->host_fx_image);
1248         fpu_init();
1249         fx_save(vcpu->guest_fx_image);
1250         fx_restore(vcpu->host_fx_image);
1251
1252         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1253         fx_image->mxcsr = 0x1f80;
1254         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1255                    0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1256         print_func_exit();
1257 }
1258
1259 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field,
1260                                                                    uint32_t val)
1261 {
1262         print_func_entry();
1263         uint32_t msr_high, msr_low;
1264         uint64_t msrval;
1265
1266         msrval = read_msr(msr);
1267         msr_low = msrval;
1268         msr_high = (msrval >> 32);
1269
1270         val &= msr_high;
1271         val |= msr_low;
1272         vmcs_write32(vmcs_field, val);
1273         print_func_exit();
1274 }
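/* The VMX capability MSRs encode allowed control settings in two halves:
 * bits set in the low dword must be 1 in the control, and bits clear in the
 * high dword must be 0.  Masking with msr_high and then OR-ing in msr_low
 * enforces both.  For example (made-up numbers), with msr_low = 0x16 and
 * msr_high = 0xfffffffe, a requested val of 0x1 becomes
 * (0x1 & 0xfffffffe) | 0x16 = 0x16. */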
1275
1276 /*
1277  * Sets up the vmcs for emulated real mode.
1278  */
1279 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1280 {
1281         print_func_entry();
1282
1283 /* no op on x86_64 */
1284 #define asmlinkage
1285         extern asmlinkage void litevm_vmx_return(void);
1286         uint32_t host_sysenter_cs;
1287         uint32_t junk;
1288         uint64_t a;
1289         struct descriptor_table dt;
1290         int i;
1291         int ret;
1292         uint64_t tsc;
1293         int nr_good_msrs;
1294
1295         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1296         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1297         vcpu->cr8 = 0;
1298         vcpu->apic_base = 0xfee00000 |
1299                 /*for vcpu 0 */ MSR_IA32_APICBASE_BSP |
1300                 MSR_IA32_APICBASE_ENABLE;
1301
1302         fx_init(vcpu);
1303
1304 #define SEG_SETUP(seg) do {                                     \
1305                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1306                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1307                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1308                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1309         } while (0)
1310
1311         /*
1312          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1313          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1314          */
1315         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1316         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1317         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1318         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1319
1320         SEG_SETUP(DS);
1321         SEG_SETUP(ES);
1322         SEG_SETUP(FS);
1323         SEG_SETUP(GS);
1324         SEG_SETUP(SS);
1325
1326         vmcs_write16(GUEST_TR_SELECTOR, 0);
1327         vmcs_writel(GUEST_TR_BASE, 0);
1328         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1329         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1330
1331         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1332         vmcs_writel(GUEST_LDTR_BASE, 0);
1333         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1334         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1335
1336         vmcs_write32(GUEST_SYSENTER_CS, 0);
1337         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1338         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1339
1340         vmcs_writel(GUEST_RFLAGS, 0x02);
1341         vmcs_writel(GUEST_RIP, 0xfff0);
1342         vmcs_writel(GUEST_RSP, 0);
1343
1344         vmcs_writel(GUEST_CR3, 0);
1345
1346         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1347         vmcs_writel(GUEST_DR7, 0x400);
1348
1349         vmcs_writel(GUEST_GDTR_BASE, 0);
1350         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1351
1352         vmcs_writel(GUEST_IDTR_BASE, 0);
1353         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1354
1355         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1356         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1357         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1358
1359         /* I/O */
1360         vmcs_write64(IO_BITMAP_A, 0);
1361         vmcs_write64(IO_BITMAP_B, 0);
1362
1363         tsc = read_tsc();
1364         vmcs_write64(TSC_OFFSET, -tsc);
1365
1366         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1367
1368         /* Special registers */
1369         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1370
1371         /* Control */
1372         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_EXT_INTR_MASK       /* 20.6.1 */
1373                                                    | PIN_BASED_NMI_EXITING      /* 20.6.1 */
1374                 );
1375         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_HLT_EXITING        /* 20.6.2 */
1376                                                    | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1377                                                    | CPU_BASED_CR8_STORE_EXITING        /* 20.6.2 */
1378                                                    | CPU_BASED_UNCOND_IO_EXITING        /* 20.6.2 */
1379                                                    | CPU_BASED_INVDPG_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING  /* 21.3 */
1380                 );
1381
1382         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1383         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1384         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1385         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1386
1387         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1388         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1389         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1390
1391         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);    /* 22.2.4 */
1392         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1393         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1394         vmcs_write16(HOST_FS_SELECTOR, read_fs());      /* 22.2.4 */
1395         vmcs_write16(HOST_GS_SELECTOR, read_gs());      /* 22.2.4 */
1396         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1397
1398 #ifdef __x86_64__
1399         a = read_msr(MSR_FS_BASE);
1400         vmcs_writel(HOST_FS_BASE, a);   /* 22.2.4 */
1401         a = read_msr(MSR_GS_BASE);
1402         vmcs_writel(HOST_GS_BASE, a);   /* 22.2.4 */
1403 #else
1404         vmcs_writel(HOST_FS_BASE, 0);   /* 22.2.4 */
1405         vmcs_writel(HOST_GS_BASE, 0);   /* 22.2.4 */
1406 #endif
1407
1408         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS * 8);      /* 22.2.4 */
1409
1410         get_idt(&dt);
1411         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1412
1413         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return);        /* 22.2.5 */
1414
1415         /* it's the HIGH 32 bits! */
1416         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1417         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1418         a = read_msr(MSR_IA32_SYSENTER_ESP);
1419         vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1420         a = read_msr(MSR_IA32_SYSENTER_EIP);
1421         vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1422
1423         ret = -ENOMEM;
1424         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1425         if (!vcpu->guest_msrs)
1426                 error("guest_msrs kmalloc failed");
1427         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1428         if (!vcpu->host_msrs)
1429                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1430
1431         for (i = 0; i < NR_VMX_MSR; ++i) {
1432                 uint32_t index = vmx_msr_index[i];
1433                 uint32_t data_low, data_high;
1434                 uint64_t data;
1435                 int j = vcpu->nmsrs;
1436
1437 #warning "need readmsr_safe"
1438 //      if (rdmsr_safe(index, &data_low, &data_high) < 0)
1439 //          continue;
1440                 data = read_msr(index);
1441                 vcpu->host_msrs[j].index = index;
1442                 vcpu->host_msrs[j].reserved = 0;
1443                 vcpu->host_msrs[j].data = data;
1444                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1445                 ++vcpu->nmsrs;
1446         }
1447         printk("msrs: %d\n", vcpu->nmsrs);
1448
1449         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1450         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1451         vmcs_writel(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1452         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1453         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, (HOST_IS_64 << 9));        /* 22.2.1, 20.7.1 */
1454         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs);    /* 22.2.2 */
1455         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);     /* 22.2.2 */
1456         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs);    /* 22.2.2 */
1457
1458         /* 22.2.1, 20.8.1 */
1459         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CONTROLS, 0);
1460         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1461
1462         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1463         vmcs_writel(TPR_THRESHOLD, 0);
1464
1465         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1466         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1467
1468         __set_cr0(vcpu, 0x60000010);    // enter rmode
1469         __set_cr4(vcpu, 0);
1470 #ifdef __x86_64__
1471         __set_efer(vcpu, 0);
1472 #endif
1473
1474         ret = litevm_mmu_init(vcpu);
1475
1476         print_func_exit();
1477         return ret;
1478
1479 out_free_guest_msrs:
1480         kfree(vcpu->guest_msrs);
1481 out:
1482         return ret;
1483 }
1484
1485 /*
1486  * Sync the rsp and rip registers into the vcpu structure.  This allows
1487  * registers to be accessed by indexing vcpu->regs.
1488  */
1489 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1490 {
1491         print_func_entry();
1492         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1493         vcpu->rip = vmcs_readl(GUEST_RIP);
1494         print_func_exit();
1495 }
1496
1497 /*
1498  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1499  * modification.
1500  */
1501 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1502 {
1503         print_func_entry();
1504         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1505         vmcs_writel(GUEST_RIP, vcpu->rip);
1506         print_func_exit();
1507 }
1508
1509 /*
1510  * Creates some virtual cpus.  Good luck creating more than one.
1511  */
1512 int vmx_create_vcpu(struct litevm *litevm, int n)
1513 {
1514         print_func_entry();
1515         ERRSTACK(2);
1516         int r;
1517         struct litevm_vcpu *vcpu;
1518         struct vmcs *vmcs;
1519         char *errstring = NULL;
1520
1521         if (n < 0 || n >= LITEVM_MAX_VCPUS) {
1522                 printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1523                            LITEVM_MAX_VCPUS);
1524                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1525                           LITEVM_MAX_VCPUS);
1526         }
1527         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1528         vcpu = &litevm->vcpus[n];
1529
1530         printk("vmx_create_vcpu: @%d, %p\n", n, vcpu);
1531         QLOCK(&vcpu->mutex);
1532
1533         if (vcpu->vmcs) {
1534                 QUNLOCK(&vcpu->mutex);
1535                 printk("VM already exists\n");
1536                 error("VM already exists");
1537         }
1538         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1539         /* I'm a bad person */
1540         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1541         uint64_t a = (uint64_t) vcpu->fx_buf;
1542         a += FX_IMAGE_ALIGN - 1;
1543         a /= FX_IMAGE_ALIGN;
1544         a *= FX_IMAGE_ALIGN;
1545
1546         vcpu->host_fx_image = (char *)a;
1547         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1548
1549         vcpu->cpu = -1; /* First load will set up TR */
1550         vcpu->litevm = litevm;
1551         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1552         if (waserror()){
1553                 printk("ERR 1 in %s, %s\n", __func__, current_errstr());
1554                 QUNLOCK(&vcpu->mutex);
1555                 litevm_free_vcpu(vcpu);
1556                 nexterror();
1557         }
1558         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1559         vmcs = alloc_vmcs();
1560         vmcs_clear(vmcs);
1561         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1562         printk("after vmcs_clear\n");
1563         vcpu->vmcs = vmcs;
1564         printk("vcpu %p set vmcs to %p\n", vcpu, vmcs);
1565         vcpu->launched = 0;
1566         printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
1567
1568         __vcpu_load(vcpu);
1569
1570         printk("PAST vcpu_load\n");
1571         if (waserror()) {
1572                 /* we really need to fix waserror() */
1573                 printk("vcpu_setup failed: %s\n", current_errstr());
1574                 QUNLOCK(&vcpu->mutex);
1575                 nexterror();
1576         }
1577
1578         /* need memory for the rmode_tss. I have no idea how this happened
1579          * originally in kvm.
1580          */
1581         /* this sucks. */
1582         QUNLOCK(&vcpu->mutex);
1583         void *v;
1584         struct litevm_memory_region vmr;
1585         vmr.slot = 0;
1586         vmr.flags = 0;
1587         vmr.guest_phys_addr = /* guess. */ 0x1000000;
1588         vmr.memory_size = 0x10000;
1589         if (vm_set_memory_region(litevm, &vmr))
1590                 printk("vm_set_memory_region failed");
1591
1592         printk("set memory region done\n");
1593
1594         if (!init_rmode_tss(litevm)) {
1595                 error("vcpu_setup: init_rmode_tss failed");
1596         }
1597
1598
1599         QLOCK(&vcpu->mutex);
1600         r = litevm_vcpu_setup(vcpu);
1601
1602         vcpu_put(vcpu);
1603
1604         printk("r is %d\n", r);
1605
1606         if (!r) {
1607                 poperror();
1608                 print_func_exit();
1609                 return 0;
1610         }
1611
1612         errstring = "vcpu setup failed";
1613
1614 out_free_vcpus:
1615 out:
1616         print_func_exit();
1617         return r;
1618 }
1619
1620 /*
1621  * Allocate some memory and give it an address in the guest physical address
1622  * space.
1623  *
1624  * Discontiguous memory is allowed, mostly for framebuffers.
1625  */
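/*
 * A minimal sketch of a caller, mirroring the rmode_tss setup in
 * vmx_create_vcpu above (the slot number, address and size here are
 * illustrative, not required values):
 *
 *	struct litevm_memory_region r;
 *
 *	r.slot = 1;
 *	r.flags = 0;
 *	r.guest_phys_addr = 0x200000;
 *	r.memory_size = 0x100000;
 *	r.init_data = NULL;
 *	if (vm_set_memory_region(litevm, &r))
 *		printk("vm_set_memory_region failed\n");
 */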
1626 int vm_set_memory_region(struct litevm *litevm,
1627                                                  struct litevm_memory_region *mem)
1628 {
1629         print_func_entry();
1630         ERRSTACK(2);
1631         int r;
1632         gfn_t base_gfn;
1633         unsigned long npages;
1634         unsigned long i;
1635         struct litevm_memory_slot *memslot;
1636         struct litevm_memory_slot old, new;
1637         int memory_config_version;
1638         void *init_data = mem->init_data;
1639         int pass = 1;
1640
1641         printk("litevm %p\n", litevm);
1642         /* should not happen but ... */
1643         if (!litevm)
1644                 error("NULL litevm in %s", __func__);
1645
1646         if (!mem)
1647                 error("NULL mem in %s", __func__);
1648         /* I don't care right now. *
1649         if (litevm->busy)
1650                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1651         */
1652         r = -EINVAL;
1653         /* General sanity checks */
1654         if (mem->memory_size & (PAGE_SIZE - 1))
1655                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1656         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1657                 error("guest_phys_addr 0x%llx is not page-aligned",
1658                           mem->guest_phys_addr);
1659         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1660                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1661         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1662                 error("0x%llx + 0x%llx is < 0x%llx",
1663                           mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1664
1665         memslot = &litevm->memslots[mem->slot];
1666         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1667         npages = mem->memory_size >> PAGE_SHIFT;
1668
1669         if (!npages)
1670                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1671
1672         /* this is actually a very tricky retry loop. The use of
1673          * error is a bit dangerous, so we don't use it much.
1674          * consider a rewrite. Would be nice if akaros could do the
1675          * allocation of a bunch of pages for us.
1676          */
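        /* Retry protocol: snapshot memory_config_version under the lock,
         * drop the lock for the allocations, then retake it and compare.
         * If another configuration change slipped in, free what we built
         * and jump back here to start over.
         */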
1677 raced:
1678         printk("raced: pass %d\n", pass);
1679         printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
1680         monitor(NULL);
1681         SPLL(&litevm->lock);
1682         printk("locked\n");
1683
1684         if (waserror()) {
1685                 printk("error in %s, %s\n", __func__, current_errstr());
1686                 SPLU(&litevm->lock);
1687                 nexterror();
1688         }
1689
1690         memory_config_version = litevm->memory_config_version;
1691         new = old = *memslot;
1692         printk("memory_config_version %d\n", memory_config_version);
1693
1694         new.base_gfn = base_gfn;
1695         new.npages = npages;
1696         new.flags = mem->flags;
1697
1698         /* Disallow changing a memory slot's size. */
1699         r = -EINVAL;
1700         if (npages && old.npages && npages != old.npages)
1701                 error("npages is %lu, old.npages is %lu, can't change",
1702                           npages, old.npages);
1703
1704         /* Check for overlaps */
1705         r = -EEXIST;
1706         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1707                 struct litevm_memory_slot *s = &litevm->memslots[i];
1708
1709                 if (s == memslot)
1710                         continue;
1711                 if (!((base_gfn + npages <= s->base_gfn) ||
1712                           (base_gfn >= s->base_gfn + s->npages)))
1713                         error("Overlap");
1714         }
1715         /*
1716          * Do memory allocations outside lock.  memory_config_version will
1717          * detect any races.
1718          */
1719         SPLU(&litevm->lock);
1720         printk("unlocked\n");
1721         poperror();
1722
1723         /* Deallocate if slot is being removed */
1724         if (!npages)
1725                 new.phys_mem = 0;
1726
1727         /* Free page dirty bitmap if unneeded */
1728         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1729                 new.dirty_bitmap = 0;
1730
1731         r = -ENOMEM;
1732
1733         /* Allocate if a slot is being created */
1734         if (npages && !new.phys_mem) {
1735                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1736
1737                 if (!new.phys_mem)
1738                         goto out_free;
1739
1740                 for (i = 0; i < npages; ++i) {
1741                         int ret;
1742                         ret = kpage_alloc(&new.phys_mem[i]);
1743                         if (ret != ESUCCESS)
1744                                 goto out_free;
1745                         if (init_data) {
1746                                 printk("init data memcpy(%p,%p,4096);\n",
1747                                            page2kva(new.phys_mem[i]), init_data);
1748                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1749                                 init_data += PAGE_SIZE;
1750                         }
1751                 }
1752         }
1753
1754         /* Allocate page dirty bitmap if needed */
1755         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1756                 unsigned dirty_bytes;   //ALIGN(npages, BITS_PER_LONG) / 8;
1757                 dirty_bytes =
1758                         (((npages + BITS_PER_LONG -
1759                            1) / BITS_PER_LONG) * BITS_PER_LONG) / 8;
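                /* i.e. round npages up to a multiple of BITS_PER_LONG bits,
                 * then convert to bytes: e.g. npages = 100 with 64-bit longs
                 * rounds up to 128 bits = 16 bytes.
                 */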
1760
1761                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1762                 if (!new.dirty_bitmap) {
1763                         printk("VM: alloc of %u bytes for map failed\n", dirty_bytes);
1764                         goto out_free;
1765                 }
1766         }
1767
1768         SPLL(&litevm->lock);
1769         printk("locked\n");
1770         if (memory_config_version != litevm->memory_config_version) {
1771                 SPLU(&litevm->lock);
1772                 printk("unlocked, try again\n");
1773                 litevm_free_physmem_slot(&new, &old);
1774                 goto raced;
1775         }
1776
1777         r = -EAGAIN;
1778         if (litevm->busy) {
1779                 printk("BUSY!\n");
1780                 goto out_unlock;
1781         }
1782
1783         if (mem->slot >= litevm->nmemslots)
1784                 litevm->nmemslots = mem->slot + 1;
1785
1786         *memslot = new;
1787         ++litevm->memory_config_version;
1788
1789         SPLU(&litevm->lock);
1790         printk("unlocked\n");
1791         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1792                 struct litevm_vcpu *vcpu;
1793
1794                 vcpu = vcpu_load(litevm, i);
1795                 if (!vcpu){
1796                         printk("%s: no cpu %d\n", __func__, i);
1797                         continue;
1798                 }
1799                 litevm_mmu_reset_context(vcpu);
1800                 vcpu_put(vcpu);
1801         }
1802
1803         litevm_free_physmem_slot(&old, &new);
1804         print_func_exit();
1805         return 0;
1806
1807 out_unlock:
1808         SPLU(&litevm->lock);
1809         printk("out_unlock\n");
1810 out_free:
1811         printk("out_free\n");
1812         litevm_free_physmem_slot(&new, &old);
1813 out:
1814         printk("vm_set_memory_region: return %d\n", r);
1815         print_func_exit();
1816         return r;
1817 }
1818
1819 #if 0
1820 /*
1821  * Get (and clear) the dirty memory log for a memory slot.
1822  */
1823 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1824                                                                                   struct litevm_dirty_log *log)
1825 {
1826         struct litevm_memory_slot *memslot;
1827         int r, i;
1828         int n;
1829         unsigned long any = 0;
1830
1831         SPLL(&litevm->lock);
1832
1833         /*
1834          * Prevent changes to guest memory configuration even while the lock
1835          * is not taken.
1836          */
1837         ++litevm->busy;
1838         SPLU(&litevm->lock);
1839         r = -EINVAL;
1840         if (log->slot >= LITEVM_MEMORY_SLOTS)
1841                 goto out;
1842
1843         memslot = &litevm->memslots[log->slot];
1844         r = -ENOENT;
1845         if (!memslot->dirty_bitmap)
1846                 goto out;
1847
1848         n = ALIGN(memslot->npages, 8) / 8;
1849
1850         for (i = 0; !any && i < n; ++i)
1851                 any = memslot->dirty_bitmap[i];
1852
1853         r = -EFAULT;
1854         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1855                 goto out;
1856
1857         if (any) {
1858                 SPLL(&litevm->lock);
1859                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1860                 SPLU(&litevm->lock);
1861                 memset(memslot->dirty_bitmap, 0, n);
1862                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1863                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1864
1865                         if (!vcpu)
1866                                 continue;
1867                         flush_guest_tlb(vcpu);
1868                         vcpu_put(vcpu);
1869                 }
1870         }
1871
1872         r = 0;
1873
1874 out:
1875         SPLL(&litevm->lock);
1876         --litevm->busy;
1877         SPLU(&litevm->lock);
1878         return r;
1879 }
1880 #endif
1881
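/*
 * Map a guest frame number to the memory slot that contains it, or return
 * 0 if no slot covers that gfn.
 */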
1882 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1883 {
1884         print_func_entry();
1885         int i;
1886
1887         printk("%s: litevm %p gfn %lx\n", __func__, litevm, gfn);
1888         for (i = 0; i < litevm->nmemslots; ++i) {
1889                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1890
1891                 if (gfn >= memslot->base_gfn
1892                         && gfn < memslot->base_gfn + memslot->npages) {
1893                         print_func_exit();
1894                         return memslot;
1895                 }
1896         }
1897         print_func_exit();
1898         return 0;
1899 }
1900
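/*
 * Record that the guest dirtied the page at gfn: find the owning slot and,
 * if it keeps a dirty bitmap, set the page's bit (skipping the atomic
 * read-modify-write when the bit is already set).
 */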
1901 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1902 {
1903         print_func_entry();
1904         int i;
1905         struct litevm_memory_slot *memslot = 0;
1906         unsigned long rel_gfn;
1907
1908         for (i = 0; i < litevm->nmemslots; ++i) {
1909                 memslot = &litevm->memslots[i];
1910
1911                 if (gfn >= memslot->base_gfn
1912                         && gfn < memslot->base_gfn + memslot->npages) {
1913
1914                         if (!memslot || !memslot->dirty_bitmap) {
1915                                 print_func_exit();
1916                                 return;
1917                         }
1918
1919                         rel_gfn = gfn - memslot->base_gfn;
1920
1921                         /* avoid RMW */
1922                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1923                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1924                         print_func_exit();
1925                         return;
1926                 }
1927         }
1928         print_func_exit();
1929 }
1930
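/*
 * Advance the guest RIP past the instruction that caused this exit, using
 * the instruction length recorded in the VMCS, and clear any STI/MOV-SS
 * interrupt shadow so a pending interrupt can be delivered.
 */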
1931 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1932 {
1933         print_func_entry();
1934         unsigned long rip;
1935         uint32_t interruptibility;
1936
1937         rip = vmcs_readl(GUEST_RIP);
1938         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1939         vmcs_writel(GUEST_RIP, rip);
1940
1941         /*
1942          * We emulated an instruction, so temporary interrupt blocking
1943          * should be removed, if set.
1944          */
1945         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1946         if (interruptibility & 3)
1947                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3);
1948         print_func_exit();
1949 }
1950
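/*
 * Read guest-virtual memory for the emulator: translate each page through
 * the vcpu's MMU, then copy out of the backing host page, stopping at the
 * first unmapped address or missing memslot.
 */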
1951 static int emulator_read_std(unsigned long addr,
1952                                                          unsigned long *val,
1953                                                          unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1954 {
1955         print_func_entry();
1956         struct litevm_vcpu *vcpu = ctxt->vcpu;
1957         void *data = val;
1958
1959         while (bytes) {
1960                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1961                 unsigned offset = addr & (PAGE_SIZE - 1);
1962                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
1963                         bytes : (unsigned)PAGE_SIZE - offset;
1964                 unsigned long pfn;
1965                 struct litevm_memory_slot *memslot;
1966                 void *page;
1967
1968                 if (gpa == UNMAPPED_GVA) {
1969                         print_func_exit();
1970                         return X86EMUL_PROPAGATE_FAULT;
1971                 }
1972                 pfn = gpa >> PAGE_SHIFT;
1973                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1974                 if (!memslot) {
1975                         print_func_exit();
1976                         return X86EMUL_UNHANDLEABLE;
1977                 }
1978                 page = page2kva(gfn_to_page(memslot, pfn));
1979
1980                 memcpy(data, page + offset, tocopy);
1981
1982                 bytes -= tocopy;
1983                 data += tocopy;
1984                 addr += tocopy;
1985         }
1986
1987         print_func_exit();
1988         return X86EMUL_CONTINUE;
1989 }
1990
1991 static int emulator_write_std(unsigned long addr,
1992                                                           unsigned long val,
1993                                                           unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1994 {
1995         print_func_entry();
1996         printk("emulator_write_std: addr %lx n %d\n", addr, bytes);
1997         print_func_exit();
1998         return X86EMUL_UNHANDLEABLE;
1999 }
2000
2001 static int emulator_read_emulated(unsigned long addr,
2002                                                                   unsigned long *val,
2003                                                                   unsigned int bytes,
2004                                                                   struct x86_emulate_ctxt *ctxt)
2005 {
2006         print_func_entry();
2007         struct litevm_vcpu *vcpu = ctxt->vcpu;
2008
2009         if (vcpu->mmio_read_completed) {
2010                 memcpy(val, vcpu->mmio_data, bytes);
2011                 vcpu->mmio_read_completed = 0;
2012                 print_func_exit();
2013                 return X86EMUL_CONTINUE;
2014         } else if (emulator_read_std(addr, val, bytes, ctxt)
2015                            == X86EMUL_CONTINUE) {
2016                 print_func_exit();
2017                 return X86EMUL_CONTINUE;
2018         } else {
2019                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2020                 if (gpa == UNMAPPED_GVA) {
2021                         print_func_exit();
2022                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
2023                 }
2024                 vcpu->mmio_needed = 1;
2025                 vcpu->mmio_phys_addr = gpa;
2026                 vcpu->mmio_size = bytes;
2027                 vcpu->mmio_is_write = 0;
2028
2029                 print_func_exit();
2030                 return X86EMUL_UNHANDLEABLE;
2031         }
2032 }
2033
2034 static int emulator_write_emulated(unsigned long addr,
2035                                                                    unsigned long val,
2036                                                                    unsigned int bytes,
2037                                                                    struct x86_emulate_ctxt *ctxt)
2038 {
2039         print_func_entry();
2040         struct litevm_vcpu *vcpu = ctxt->vcpu;
2041         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2042
2043         if (gpa == UNMAPPED_GVA) {
2044                 print_func_exit();
2045                 return X86EMUL_PROPAGATE_FAULT;
2046         }
2047
2048         vcpu->mmio_needed = 1;
2049         vcpu->mmio_phys_addr = gpa;
2050         vcpu->mmio_size = bytes;
2051         vcpu->mmio_is_write = 1;
2052         memcpy(vcpu->mmio_data, &val, bytes);
2053
2054         print_func_exit();
2055         return X86EMUL_CONTINUE;
2056 }
2057
2058 static int emulator_cmpxchg_emulated(unsigned long addr,
2059                                                                          unsigned long old,
2060                                                                          unsigned long new,
2061                                                                          unsigned int bytes,
2062                                                                          struct x86_emulate_ctxt *ctxt)
2063 {
2064         print_func_entry();
2065         static int reported;
2066
2067         if (!reported) {
2068                 reported = 1;
2069                 printk("litevm: emulating exchange as write\n");
2070         }
2071         print_func_exit();
2072         return emulator_write_emulated(addr, new, bytes, ctxt);
2073 }
2074
2075 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2076 {
2077         print_func_entry();
2078         static int reported;
2079         uint8_t opcodes[4];
2080         unsigned long rip = vmcs_readl(GUEST_RIP);
2081         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2082
2083         if (reported) {
2084                 print_func_exit();
2085                 return;
2086         }
2087
2088         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2089
2090         printk("emulation failed but !mmio_needed?"
2091                    " rip %lx %02x %02x %02x %02x\n",
2092                    rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2093         reported = 1;
2094         print_func_exit();
2095 }
2096
2097 struct x86_emulate_ops emulate_ops = {
2098         .read_std = emulator_read_std,
2099         .write_std = emulator_write_std,
2100         .read_emulated = emulator_read_emulated,
2101         .write_emulated = emulator_write_emulated,
2102         .cmpxchg_emulated = emulator_cmpxchg_emulated,
2103 };
2104
2105 enum emulation_result {
2106         EMULATE_DONE,                           /* no further processing */
2107         EMULATE_DO_MMIO,                        /* litevm_run filled with mmio request */
2108         EMULATE_FAIL,                           /* can't emulate this instruction */
2109 };
2110
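/*
 * Feed the faulting instruction to the x86 emulator: build an emulation
 * context from the current VMCS segment state, run x86_emulate_memop(),
 * and report whether we are done, need userspace to perform MMIO, or
 * failed outright.
 */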
2111 static int emulate_instruction(struct litevm_vcpu *vcpu,
2112                                                            struct litevm_run *run,
2113                                                            unsigned long cr2, uint16_t error_code)
2114 {
2115         print_func_entry();
2116         struct x86_emulate_ctxt emulate_ctxt;
2117         int r;
2118         uint32_t cs_ar;
2119
2120         vcpu_load_rsp_rip(vcpu);
2121
2122         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2123
2124         emulate_ctxt.vcpu = vcpu;
2125         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2126         emulate_ctxt.cr2 = cr2;
2127         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2128                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2129                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2130                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2131
2132         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2133                 emulate_ctxt.cs_base = 0;
2134                 emulate_ctxt.ds_base = 0;
2135                 emulate_ctxt.es_base = 0;
2136                 emulate_ctxt.ss_base = 0;
2137                 emulate_ctxt.gs_base = 0;
2138                 emulate_ctxt.fs_base = 0;
2139         } else {
2140                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2141                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2142                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2143                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2144                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2145                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2146         }
2147
2148         vcpu->mmio_is_write = 0;
2149         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2150
2151         if ((r || vcpu->mmio_is_write) && run) {
2152                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2153                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2154                 run->mmio.len = vcpu->mmio_size;
2155                 run->mmio.is_write = vcpu->mmio_is_write;
2156         }
2157
2158         if (r) {
2159                 if (!vcpu->mmio_needed) {
2160                         report_emulation_failure(&emulate_ctxt);
2161                         print_func_exit();
2162                         return EMULATE_FAIL;
2163                 }
2164                 print_func_exit();
2165                 return EMULATE_DO_MMIO;
2166         }
2167
2168         vcpu_put_rsp_rip(vcpu);
2169         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2170
2171         if (vcpu->mmio_is_write) {
2172                 print_func_exit();
2173                 return EMULATE_DO_MMIO;
2174         }
2175
2176         print_func_exit();
2177         return EMULATE_DONE;
2178 }
2179
2180 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2181 {
2182         print_func_entry();
2183         print_func_exit();
2184         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2185 }
2186
2187 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2188 {
2189         print_func_entry();
2190         vmcs_writel(GUEST_GDTR_BASE, base);
2191         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2192         print_func_exit();
2193 }
2194
2195 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2196 {
2197         print_func_entry();
2198         vmcs_writel(GUEST_IDTR_BASE, base);
2199         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2200         print_func_exit();
2201 }
2202
2203 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2204                                    unsigned long *rflags)
2205 {
2206         print_func_entry();
2207         lmsw(vcpu, msw);
2208         *rflags = vmcs_readl(GUEST_RFLAGS);
2209         print_func_exit();
2210 }
2211
2212 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2213 {
2214         print_func_entry();
2215         switch (cr) {
2216                 case 0:
2217                         print_func_exit();
2218                         return guest_cr0();
2219                 case 2:
2220                         print_func_exit();
2221                         return vcpu->cr2;
2222                 case 3:
2223                         print_func_exit();
2224                         return vcpu->cr3;
2225                 case 4:
2226                         print_func_exit();
2227                         return guest_cr4();
2228                 default:
2229                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2230                         print_func_exit();
2231                         return 0;
2232         }
2233 }
2234
2235 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2236                                          unsigned long *rflags)
2237 {
2238         print_func_entry();
2239         switch (cr) {
2240                 case 0:
2241                         set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2242                         *rflags = vmcs_readl(GUEST_RFLAGS);
2243                         break;
2244                 case 2:
2245                         vcpu->cr2 = val;
2246                         break;
2247                 case 3:
2248                         set_cr3(vcpu, val);
2249                         break;
2250                 case 4:
2251                         set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2252                         break;
2253                 default:
2254                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2255         }
2256         print_func_exit();
2257 }
2258
2259 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2260                                                                   int vec, uint32_t err_code)
2261 {
2262         print_func_entry();
2263         if (!vcpu->rmode.active) {
2264                 print_func_exit();
2265                 return 0;
2266         }
2267
2268         if (vec == GP_VECTOR && err_code == 0)
2269                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2270                         print_func_exit();
2271                         return 1;
2272                 }
2273         print_func_exit();
2274         return 0;
2275 }
2276
2277 static int handle_exception(struct litevm_vcpu *vcpu,
2278                                                         struct litevm_run *litevm_run)
2279 {
2280         print_func_entry();
2281         uint32_t intr_info, error_code;
2282         unsigned long cr2, rip;
2283         uint32_t vect_info;
2284         enum emulation_result er;
2285
2286         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2287         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2288
2289         if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) {
2290                 printk("%s: unexpected, vectoring info 0x%x "
2291                            "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2292         }
2293
2294         if (is_external_interrupt(vect_info)) {
2295                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2296                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_pending), irq);
2297                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_summary),
2298                                                            irq / BITS_PER_LONG);
2299         }
2300
2301         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) {  /* nmi */
2302                 asm("int $2");
2303                 print_func_exit();
2304                 return 1;
2305         }
2306         error_code = 0;
2307         rip = vmcs_readl(GUEST_RIP);
2308         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2309                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2310         if (is_page_fault(intr_info)) {
2311                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2312
2313                 SPLL(&vcpu->litevm->lock);
2314                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2315                         SPLU(&vcpu->litevm->lock);
2316                         print_func_exit();
2317                         return 1;
2318                 }
2319
2320                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2321                 SPLU(&vcpu->litevm->lock);
2322
2323                 switch (er) {
2324                         case EMULATE_DONE:
2325                                 print_func_exit();
2326                                 return 1;
2327                         case EMULATE_DO_MMIO:
2328                                 ++litevm_stat.mmio_exits;
2329                                 litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2330                                 print_func_exit();
2331                                 return 0;
2332                         case EMULATE_FAIL:
2333                                 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2334                                 break;
2335                         default:
2336                                 assert(0);
2337                 }
2338         }
2339
2340         if (vcpu->rmode.active &&
2341                 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2342                                                            error_code)) {
2343                 print_func_exit();
2344                 return 1;
2345         }
2346
2347         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
2348                 (INTR_TYPE_EXCEPTION | 1)) {
2349                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2350                 print_func_exit();
2351                 return 0;
2352         }
2353         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2354         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2355         litevm_run->ex.error_code = error_code;
2356         print_func_exit();
2357         return 0;
2358 }
2359
2360 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2361                                                                          struct litevm_run *litevm_run)
2362 {
2363         print_func_entry();
2364         ++litevm_stat.irq_exits;
2365         print_func_exit();
2366         return 1;
2367 }
2368
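/*
 * For a string I/O exit, work out the repeat count: scan the instruction
 * prefixes to find the address size (0x67 overrides it), then mask RCX
 * down to that many bits.  Returns 1 with *count filled in, 0 otherwise.
 */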
2369 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t * count)
2370 {
2371         print_func_entry();
2372         uint64_t inst;
2373         gva_t rip;
2374         int countr_size;
2375         int i, n;
2376
2377         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2378                 countr_size = 2;
2379         } else {
2380                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2381
2382                 countr_size = (cs_ar & AR_L_MASK) ? 8 : (cs_ar & AR_DB_MASK) ? 4 : 2;
2383         }
2384
2385         rip = vmcs_readl(GUEST_RIP);
2386         if (countr_size != 8)
2387                 rip += vmcs_readl(GUEST_CS_BASE);
2388
2389         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2390
2391         for (i = 0; i < n; i++) {
2392                 switch (((uint8_t *) & inst)[i]) {
2393                         case 0xf0:
2394                         case 0xf2:
2395                         case 0xf3:
2396                         case 0x2e:
2397                         case 0x36:
2398                         case 0x3e:
2399                         case 0x26:
2400                         case 0x64:
2401                         case 0x65:
2402                         case 0x66:
2403                                 break;
2404                         case 0x67:
2405                                 countr_size = (countr_size == 2) ? 4 : (countr_size >> 1);
2406                         default:
2407                                 goto done;
2408                 }
2409         }
2410         print_func_exit();
2411         return 0;
2412 done:
2413         countr_size *= 8;
2414         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2415         print_func_exit();
2416         return 1;
2417 }
2418
2419 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2420 {
2421         print_func_entry();
2422         uint64_t exit_qualification;
2423
2424         ++litevm_stat.io_exits;
2425         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2426         litevm_run->exit_reason = LITEVM_EXIT_IO;
2427         if (exit_qualification & 8)
2428                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2429         else
2430                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2431         litevm_run->io.size = (exit_qualification & 7) + 1;
2432         litevm_run->io.string = (exit_qualification & 16) != 0;
2433         litevm_run->io.string_down
2434                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2435         litevm_run->io.rep = (exit_qualification & 32) != 0;
2436         litevm_run->io.port = exit_qualification >> 16;
2437         if (litevm_run->io.string) {
2438                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2439                         print_func_exit();
2440                         return 1;
2441                 }
2442                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2443         } else
2444                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX];       /* rax */
2445         print_func_exit();
2446         return 0;
2447 }
2448
2449 static int handle_invlpg(struct litevm_vcpu *vcpu,
2450                                                  struct litevm_run *litevm_run)
2451 {
2452         print_func_entry();
2453         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2454         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2455         SPLL(&vcpu->litevm->lock);
2456         vcpu->mmu.inval_page(vcpu, address);
2457         SPLU(&vcpu->litevm->lock);
2458         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2459         print_func_exit();
2460         return 1;
2461 }
2462
2463 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2464 {
2465         print_func_entry();
2466         uint64_t exit_qualification;
2467         int cr;
2468         int reg;
2469
2470 #ifdef LITEVM_DEBUG
2471         if (guest_cpl() != 0) {
2472                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2473                 inject_gp(vcpu);
2474                 print_func_exit();
2475                 return 1;
2476         }
2477 #endif
2478
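        /* Exit qualification for CR accesses (per the SDM): bits 3:0 give the
         * control register number, bits 5:4 the access type (0 = mov to CR,
         * 1 = mov from CR, 3 = lmsw), and bits 11:8 the GP register involved.
         */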
2479         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2480         cr = exit_qualification & 15;
2481         reg = (exit_qualification >> 8) & 15;
2482         switch ((exit_qualification >> 4) & 3) {
2483                 case 0: /* mov to cr */
2484                         switch (cr) {
2485                                 case 0:
2486                                         vcpu_load_rsp_rip(vcpu);
2487                                         set_cr0(vcpu, vcpu->regs[reg]);
2488                                         skip_emulated_instruction(vcpu);
2489                                         print_func_exit();
2490                                         return 1;
2491                                 case 3:
2492                                         vcpu_load_rsp_rip(vcpu);
2493                                         set_cr3(vcpu, vcpu->regs[reg]);
2494                                         skip_emulated_instruction(vcpu);
2495                                         print_func_exit();
2496                                         return 1;
2497                                 case 4:
2498                                         vcpu_load_rsp_rip(vcpu);
2499                                         set_cr4(vcpu, vcpu->regs[reg]);
2500                                         skip_emulated_instruction(vcpu);
2501                                         print_func_exit();
2502                                         return 1;
2503                                 case 8:
2504                                         vcpu_load_rsp_rip(vcpu);
2505                                         set_cr8(vcpu, vcpu->regs[reg]);
2506                                         skip_emulated_instruction(vcpu);
2507                                         print_func_exit();
2508                                         return 1;
2509                         };
2510                         break;
2511                 case 1: /*mov from cr */
2512                         switch (cr) {
2513                                 case 3:
2514                                         vcpu_load_rsp_rip(vcpu);
2515                                         vcpu->regs[reg] = vcpu->cr3;
2516                                         vcpu_put_rsp_rip(vcpu);
2517                                         skip_emulated_instruction(vcpu);
2518                                         print_func_exit();
2519                                         return 1;
2520                                 case 8:
2521                                         printd("handle_cr: read CR8 " "cpu erratum AA15\n");
2522                                         vcpu_load_rsp_rip(vcpu);
2523                                         vcpu->regs[reg] = vcpu->cr8;
2524                                         vcpu_put_rsp_rip(vcpu);
2525                                         skip_emulated_instruction(vcpu);
2526                                         print_func_exit();
2527                                         return 1;
2528                         }
2529                         break;
2530                 case 3: /* lmsw */
2531                         lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2532
2533                         skip_emulated_instruction(vcpu);
2534                         print_func_exit();
2535                         return 1;
2536                 default:
2537                         break;
2538         }
2539         litevm_run->exit_reason = 0;
2540         printk("litevm: unhandled control register: op %d cr %d\n",
2541                    (int)(exit_qualification >> 4) & 3, cr);
2542         print_func_exit();
2543         return 0;
2544 }
2545
2546 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2547 {
2548         print_func_entry();
2549         uint64_t exit_qualification;
2550         unsigned long val;
2551         int dr, reg;
2552
2553         /*
2554          * FIXME: this code assumes the host is debugging the guest.
2555          *        need to deal with guest debugging itself too.
2556          */
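        /* Exit qualification for DR accesses: bits 2:0 give the debug register
         * number, bit 4 the direction (set = mov from DR), and bits 11:8 the
         * GP register involved.
         */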
2557         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2558         dr = exit_qualification & 7;
2559         reg = (exit_qualification >> 8) & 15;
2560         vcpu_load_rsp_rip(vcpu);
2561         if (exit_qualification & 16) {
2562                 /* mov from dr */
2563                 switch (dr) {
2564                         case 6:
2565                                 val = 0xffff0ff0;
2566                                 break;
2567                         case 7:
2568                                 val = 0x400;
2569                                 break;
2570                         default:
2571                                 val = 0;
2572                 }
2573                 vcpu->regs[reg] = val;
2574         } else {
2575                 /* mov to dr */
2576         }
2577         vcpu_put_rsp_rip(vcpu);
2578         skip_emulated_instruction(vcpu);
2579         print_func_exit();
2580         return 1;
2581 }
2582
2583 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2584 {
2585         print_func_entry();
2586         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2587         print_func_exit();
2588         return 0;
2589 }
2590
2591 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2592 {
2593         print_func_entry();
2594         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2595         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2596         uint64_t data;
2597
2598         if (guest_cpl() != 0) {
2599                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2600                 inject_gp(vcpu);
2601                 print_func_exit();
2602                 return 1;
2603         }
2604
2605         switch (ecx) {
2606                 case MSR_FS_BASE:
2607                         data = vmcs_readl(GUEST_FS_BASE);
2608                         break;
2609                 case MSR_GS_BASE:
2610                         data = vmcs_readl(GUEST_GS_BASE);
2611                         break;
2612                 case MSR_IA32_SYSENTER_CS:
2613                         data = vmcs_read32(GUEST_SYSENTER_CS);
2614                         break;
2615                 case MSR_IA32_SYSENTER_EIP:
2616                         data = vmcs_read32(GUEST_SYSENTER_EIP);
2617                         break;
2618                 case MSR_IA32_SYSENTER_ESP:
2619                         data = vmcs_read32(GUEST_SYSENTER_ESP);
2620                         break;
2621                 case MSR_IA32_MC0_CTL:
2622                 case MSR_IA32_MCG_STATUS:
2623                 case MSR_IA32_MCG_CAP:
2624                 case MSR_IA32_MC0_MISC:
2625                 case MSR_IA32_MC0_MISC + 4:
2626                 case MSR_IA32_MC0_MISC + 8:
2627                 case MSR_IA32_MC0_MISC + 12:
2628                 case MSR_IA32_MC0_MISC + 16:
2629                 case MSR_IA32_UCODE_REV:
2630                         /* MTRR registers */
2631                 case 0xfe:
2632                 case 0x200 ... 0x2ff:
2633                         data = 0;
2634                         break;
2635                 case MSR_IA32_APICBASE:
2636                         data = vcpu->apic_base;
2637                         break;
2638                 default:
2639                         if (msr) {
2640                                 data = msr->data;
2641                                 break;
2642                         }
2643                         printk("litevm: unhandled rdmsr: %x\n", ecx);
2644                         inject_gp(vcpu);
2645                         print_func_exit();
2646                         return 1;
2647         }
2648
2649         /* FIXME: handling of bits 32:63 of rax, rdx */
2650         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2651         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2652         skip_emulated_instruction(vcpu);
2653         print_func_exit();
2654         return 1;
2655 }
2656
2657 #ifdef __x86_64__
2658
2659 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2660 {
2661         print_func_entry();
2662         struct vmx_msr_entry *msr;
2663
2664         if (efer & EFER_RESERVED_BITS) {
2665                 printd("set_efer: 0x%llx #GP, reserved bits\n", efer);
2666                 inject_gp(vcpu);
2667                 print_func_exit();
2668                 return;
2669         }
2670
2671         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2672                 printd("set_efer: #GP, change LME while paging\n");
2673                 inject_gp(vcpu);
2674                 print_func_exit();
2675                 return;
2676         }
2677
2678         efer &= ~EFER_LMA;
2679         efer |= vcpu->shadow_efer & EFER_LMA;
2680
2681         vcpu->shadow_efer = efer;
2682
2683         msr = find_msr_entry(vcpu, MSR_EFER);
2684
2685         if (!(efer & EFER_LMA))
2686                 efer &= ~EFER_LME;
2687         msr->data = efer;
2688         skip_emulated_instruction(vcpu);
2689         print_func_exit();
2690 }
2691
2692 #endif
2693
2694 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2695
2696 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2697 {
2698         print_func_entry();
2699         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2700         struct vmx_msr_entry *msr;
2701         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2702                 | ((uint64_t) (vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2703
2704         if (guest_cpl() != 0) {
2705                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2706                 inject_gp(vcpu);
2707                 print_func_exit();
2708                 return 1;
2709         }
2710
2711         switch (ecx) {
2712                 case MSR_FS_BASE:
2713                         vmcs_writel(GUEST_FS_BASE, data);
2714                         break;
2715                 case MSR_GS_BASE:
2716                         vmcs_writel(GUEST_GS_BASE, data);
2717                         break;
2718                 case MSR_IA32_SYSENTER_CS:
2719                         vmcs_write32(GUEST_SYSENTER_CS, data);
2720                         break;
2721                 case MSR_IA32_SYSENTER_EIP:
2722                         vmcs_write32(GUEST_SYSENTER_EIP, data);
2723                         break;
2724                 case MSR_IA32_SYSENTER_ESP:
2725                         vmcs_write32(GUEST_SYSENTER_ESP, data);
2726                         break;
2727                 case MSR_EFER:
2728                         set_efer(vcpu, data);
2729                         print_func_exit();
2730                         return 1;
2731                 case MSR_IA32_MC0_STATUS:
2732                         printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data);
2733                         break;
2734                 case MSR_IA32_TIME_STAMP_COUNTER:{
2735                                 uint64_t tsc;
2736
2737                                 tsc = read_tsc();
2738                                 vmcs_write64(TSC_OFFSET, data - tsc);
2739                                 break;
2740                         }
2741                 case MSR_IA32_UCODE_REV:
2742                 case MSR_IA32_UCODE_WRITE:
2743                 case 0x200 ... 0x2ff:   /* MTRRs */
2744                         break;
2745                 case MSR_IA32_APICBASE:
2746                         vcpu->apic_base = data;
2747                         break;
2748                 default:
2749                         msr = find_msr_entry(vcpu, ecx);
2750                         if (msr) {
2751                                 msr->data = data;
2752                                 break;
2753                         }
2754                         printk("litevm: unhandled wrmsr: %x\n", ecx);
2755                         inject_gp(vcpu);
2756                         print_func_exit();
2757                         return 1;
2758         }
2759         skip_emulated_instruction(vcpu);
2760         print_func_exit();
2761         return 1;
2762 }
2763
2764 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2765                                                                    struct litevm_run *litevm_run)
2766 {
2767         print_func_entry();
2768         /* Turn off interrupt window reporting. */
2769         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2770                                  vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2771                                  & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2772         print_func_exit();
2773         return 1;
2774 }
2775
2776 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2777 {
2778         print_func_entry();
2779         skip_emulated_instruction(vcpu);
2780         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2781                 print_func_exit();
2782                 return 1;
2783         }
2784
2785         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2786         print_func_exit();
2787         return 0;
2788 }
2789
2790 /*
2791  * The exit handlers return 1 if the exit was handled fully and guest execution
2792  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2793  * to be done to userspace and return 0.
2794  */
2795 static int (*litevm_vmx_exit_handlers[]) (struct litevm_vcpu * vcpu,
2796                                                                                   struct litevm_run * litevm_run) = {
2797 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2798                 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2799                 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2800                 [EXIT_REASON_INVLPG] = handle_invlpg,
2801                 [EXIT_REASON_CR_ACCESS] = handle_cr,
2802                 [EXIT_REASON_DR_ACCESS] = handle_dr,
2803                 [EXIT_REASON_CPUID] = handle_cpuid,
2804                 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2805                 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2806                 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2807                 [EXIT_REASON_HLT] = handle_halt,};
2808
2809 static const int litevm_vmx_max_exit_handlers =
2810         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2811
2812 /*
2813  * The guest has exited.  See if we can fix it or if we need userspace
2814  * assistance.
2815  */
2816 static int litevm_handle_exit(struct litevm_run *litevm_run,
2817                                                           struct litevm_vcpu *vcpu)
2818 {
2819         print_func_entry();
2820         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2821         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2822
2823         printk("vectoring_info %08x exit_reason %x\n", vectoring_info, exit_reason);
2824         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2825                 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2826                 printk("%s: unexpected, valid vectoring info and "
2827                            "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2828         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2829         if (exit_reason < litevm_vmx_max_exit_handlers
2830                 && litevm_vmx_exit_handlers[exit_reason]) {
2831                 printk("reason is KNOWN\n");
2832                 print_func_exit();
2833                 return litevm_vmx_exit_handlers[exit_reason] (vcpu, litevm_run);
2834         } else {
2835                 printk("reason is UNKNOWN\n");
2836                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2837                 litevm_run->hw.hardware_exit_reason = exit_reason;
2838         }
2839         print_func_exit();
2840         return 0;
2841 }
2842
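/*
 * Deliver an interrupt to a real-mode guest by hand: push FLAGS, CS and IP
 * onto the guest stack and load CS:IP from the vector's real-mode IVT entry
 * (4 bytes at guest physical irq * 4).
 */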
2843 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2844 {
2845         print_func_entry();
2846         uint16_t ent[2];
2847         uint16_t cs;
2848         uint16_t ip;
2849         unsigned long flags;
2850         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2851         uint16_t sp = vmcs_readl(GUEST_RSP);
2852         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2853
2854         if (sp > ss_limit || ((sp - 6) > sp)) {
2855                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2856                                         __FUNCTION__,
2857                                         vmcs_readl(GUEST_RSP),
2858                                         vmcs_readl(GUEST_SS_BASE), vmcs_read32(GUEST_SS_LIMIT));
2859                 print_func_exit();
2860                 return;
2861         }
2862
2863         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2864                 sizeof(ent)) {
2865                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2866                 print_func_exit();
2867                 return;
2868         }
2869
2870         flags = vmcs_readl(GUEST_RFLAGS);
2871         cs = vmcs_readl(GUEST_CS_BASE) >> 4;
2872         ip = vmcs_readl(GUEST_RIP);
2873
2874         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2875                 litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2876                 litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2877                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2878                 print_func_exit();
2879                 return;
2880         }
2881
2882         vmcs_writel(GUEST_RFLAGS, flags &
2883                                 ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2884         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2885         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2886         vmcs_writel(GUEST_RIP, ent[0]);
2887         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2888         print_func_exit();
2889 }
2890
2891 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2892 {
2893         print_func_entry();
2894         int word_index = __ffs(vcpu->irq_summary);
2895         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2896         int irq = word_index * BITS_PER_LONG + bit_index;
2897
2898         /* don't have clear_bit and I'm not sure the akaros
2899          * bitops are really going to work.
2900          */
2901         vcpu->irq_pending[word_index] &= ~(1 << bit_index);
2902         if (!vcpu->irq_pending[word_index])
2903                 vcpu->irq_summary &= ~(1 << word_index);
2904
2905         if (vcpu->rmode.active) {
2906                 inject_rmode_irq(vcpu, irq);
2907                 print_func_exit();
2908                 return;
2909         }
2910         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2911                                  irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2912         print_func_exit();
2913 }
2914
2915 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2916 {
2917         print_func_entry();
2918         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2919                 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2920                 /*
2921                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2922                  */
2923                 litevm_do_inject_irq(vcpu);
2924         else
2925                 /*
2926                  * Interrupts blocked.  Wait for unblock.
2927                  */
2928                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2929                                          vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2930                                          | CPU_BASED_VIRTUAL_INTR_PENDING);
2931         print_func_exit();
2932 }
2933
2934 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2935 {
2936         print_func_entry();
2937         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2938
2939 #warning "no debugging guests yet"
2940         assert(0);
2941 /*
2942         set_debugreg(dbg->bp[0], 0);
2943         set_debugreg(dbg->bp[1], 1);
2944         set_debugreg(dbg->bp[2], 2);
2945         set_debugreg(dbg->bp[3], 3);
2946 */
2947         if (dbg->singlestep) {
2948                 unsigned long flags;
2949
2950                 flags = vmcs_readl(GUEST_RFLAGS);
2951                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2952                 vmcs_writel(GUEST_RFLAGS, flags);
2953         }
2954         print_func_exit();
2955 }
2956
2957 static void load_msrs(struct vmx_msr_entry *e, int n)
2958 {
2959         print_func_entry();
2960         int i;
2961
2962         if (!e) {
2963                 printk("LOAD MSR WITH NULL POINTER?\n");
2964                 error("LOAD MSR WITH NULL POINTER?");
2965         }
2966         for (i = 0; i < n; ++i) {
2967                 printk("Load MSR (%x), with %lx\n", e[i].index, e[i].data);
2968                 write_msr(e[i].index, e[i].data);
2969                 printk("Done\n");
2970         }
2971         print_func_exit();
2972 }
2973
2974 static void save_msrs(struct vmx_msr_entry *e, int n)
2975 {
2976         print_func_entry();
2977         int i;
2978
2979         for (i = 0; i < n; ++i)
2980                 e[i].data = read_msr(e[i].index);
2981         print_func_exit();
2982 }
2983
2984 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2985 {
2986         print_func_entry();
2987         struct litevm_vcpu *vcpu;
2988         uint8_t fail;
2989         uint16_t fs_sel, gs_sel, ldt_sel;
2990         int fs_gs_ldt_reload_needed;
2991
2992         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2993                 error("vcpu is %d but must be in the range 0..%d\n",
2994                           litevm_run->vcpu, LITEVM_MAX_VCPUS - 1);
2995
2996         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2997         if (!vcpu)
2998                 error("vcpu_load failed");
2999         printk("Loaded\n");
3000
3001         if (litevm_run->emulated) {
3002                 skip_emulated_instruction(vcpu);
3003                 litevm_run->emulated = 0;
3004         }
3005         printk("Emulated\n");
3006
3007         if (litevm_run->mmio_completed) {
3008                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
3009                 vcpu->mmio_read_completed = 1;
3010         }
3011         printk("mmio completed\n");
3012
3013         vcpu->mmio_needed = 0;
3014
3015 again:
3016         /*
3017          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
3018          * allow segment selectors with cpl > 0 or ti == 1.
3019          */
3020         fs_sel = read_fs();
3021         printk("fs_sel %x\n", fs_sel);
3022         gs_sel = read_gs();
3023         printk("gs_sel %x\n", gs_sel);
3024         ldt_sel = read_ldt();
3025         printk("ldt_sel %x\n", ldt_sel);
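        /*
         * Nonzero if fs or gs has RPL/TI bits set (low 3 bits) or a non-null
         * LDT is loaded; in that case the selectors cannot be kept in the
         * VMCS host-state fields and must be reloaded by hand after VM exit.
         */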
3026         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
3027         if (!fs_gs_ldt_reload_needed) {
3028                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
3029                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
3030         } else {
3031                 vmcs_write16(HOST_FS_SELECTOR, 0);
3032                 vmcs_write16(HOST_GS_SELECTOR, 0);
3033         }
3034         printk("reloaded fs and gs\n");
3035
3036 #ifdef __x86_64__
3037         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
3038         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
3039         printk("Set FS_BASE and GS_BASE\n");
3040 #endif
3041
3042         printk("skipping IRQs for now\n");
3043         if (0)
3044                 if (vcpu->irq_summary &&
3045                         !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
3046                         litevm_try_inject_irq(vcpu);
3047
3048         printk("no debugging for now\n");
3049         if (0)
3050                 if (vcpu->guest_debug.enabled)
3051                         litevm_guest_debug_pre(vcpu);
3052
3053         fx_save(vcpu->host_fx_image);
3054         fx_restore(vcpu->guest_fx_image);
3055
3056         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
3057         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3058
3059         printk("GO FOR IT!\n");
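        /*
         * The asm block below: push the host's GPRs and flags, record the
         * current stack pointer in the VMCS HOST_RSP field, load the guest's
         * GPRs and CR2 from vcpu->regs/vcpu->cr2, then execute VMLAUNCH on
         * the first entry or VMRESUME afterwards.  Control returns at
         * litevm_vmx_return on VM exit (or falls through on a failed entry);
         * there we save the guest GPRs/CR2 back into the vcpu and restore the
         * host registers.  "setbe %0" captures a failed VMLAUNCH/VMRESUME
         * (CF or ZF set) into 'fail'.
         */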
3060         asm(
3061                    /* Store host registers */
3062                    "pushf \n\t"
3063 #ifdef __x86_64__
3064                    "push %%rax; push %%rbx; push %%rdx;"
3065                    "push %%rsi; push %%rdi; push %%rbp;"
3066                    "push %%r8;  push %%r9;  push %%r10; push %%r11;"
3067                    "push %%r12; push %%r13; push %%r14; push %%r15;"
3068                    "push %%rcx \n\t" "vmwrite %%rsp, %2 \n\t"
3069 #else
3070                    "pusha; push %%ecx \n\t" "vmwrite %%esp, %2 \n\t"
3071 #endif
3072                    /* Check if vmlaunch or vmresume is needed */
3073                    "cmp $0, %1 \n\t"
3074                    /* Load guest registers.  Don't clobber flags. */
3075 #ifdef __x86_64__
3076                    "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%3), %%rax \n\t" "mov %c[rbx](%3), %%rbx \n\t" "mov %c[rdx](%3), %%rdx \n\t" "mov %c[rsi](%3), %%rsi \n\t" "mov %c[rdi](%3), %%rdi \n\t" "mov %c[rbp](%3), %%rbp \n\t" "mov %c[r8](%3),  %%r8  \n\t" "mov %c[r9](%3),  %%r9  \n\t" "mov %c[r10](%3), %%r10 \n\t" "mov %c[r11](%3), %%r11 \n\t" "mov %c[r12](%3), %%r12 \n\t" "mov %c[r13](%3), %%r13 \n\t" "mov %c[r14](%3), %%r14 \n\t" "mov %c[r15](%3), %%r15 \n\t" "mov %c[rcx](%3), %%rcx \n\t"      /* kills %3 (rcx) */
3077 #else
3078                    "mov %c[cr2](%3), %%eax \n\t" "mov %%eax,   %%cr2 \n\t" "mov %c[rax](%3), %%eax \n\t" "mov %c[rbx](%3), %%ebx \n\t" "mov %c[rdx](%3), %%edx \n\t" "mov %c[rsi](%3), %%esi \n\t" "mov %c[rdi](%3), %%edi \n\t" "mov %c[rbp](%3), %%ebp \n\t" "mov %c[rcx](%3), %%ecx \n\t"    /* kills %3 (ecx) */
3079 #endif
3080                    /* Enter guest mode */
3081                    "jne launched \n\t"
3082                    "vmlaunch \n\t"
3083                    "jmp litevm_vmx_return \n\t"
3084                    "launched: vmresume \n\t"
3085                    ".globl litevm_vmx_return \n\t" "litevm_vmx_return: "
3086                    /* Save guest registers, load host registers, keep flags */
3087 #ifdef __x86_64__
3088                    "xchg %3,     0(%%rsp) \n\t"
3089                    "mov %%rax, %c[rax](%3) \n\t"
3090                    "mov %%rbx, %c[rbx](%3) \n\t"
3091                    "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3092                    "mov %%rdx, %c[rdx](%3) \n\t"
3093                    "mov %%rsi, %c[rsi](%3) \n\t"
3094                    "mov %%rdi, %c[rdi](%3) \n\t"
3095                    "mov %%rbp, %c[rbp](%3) \n\t"
3096                    "mov %%r8,  %c[r8](%3) \n\t"
3097                    "mov %%r9,  %c[r9](%3) \n\t"
3098                    "mov %%r10, %c[r10](%3) \n\t"
3099                    "mov %%r11, %c[r11](%3) \n\t"
3100                    "mov %%r12, %c[r12](%3) \n\t"
3101                    "mov %%r13, %c[r13](%3) \n\t"
3102                    "mov %%r14, %c[r14](%3) \n\t"
3103                    "mov %%r15, %c[r15](%3) \n\t"
3104                    "mov %%cr2, %%rax   \n\t"
3105                    "mov %%rax, %c[cr2](%3) \n\t"
3106                    "mov 0(%%rsp), %3 \n\t"
3107                    "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3108                    "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3109                    "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3110                    "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3111 #else
3112                    "xchg %3, 0(%%esp) \n\t"
3113                    "mov %%eax, %c[rax](%3) \n\t"
3114                    "mov %%ebx, %c[rbx](%3) \n\t"
3115                    "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3116                    "mov %%edx, %c[rdx](%3) \n\t"
3117                    "mov %%esi, %c[rsi](%3) \n\t"
3118                    "mov %%edi, %c[rdi](%3) \n\t"
3119                    "mov %%ebp, %c[rbp](%3) \n\t"
3120                    "mov %%cr2, %%eax  \n\t"
3121                    "mov %%eax, %c[cr2](%3) \n\t"
3122                    "mov 0(%%esp), %3 \n\t" "pop %%ecx; popa \n\t"
3123 #endif
3124                    "setbe %0 \n\t" "popf \n\t" : "=g"(fail)
3125                    : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3126                      "c"(vcpu),
3127                    [rax] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3128                    [rbx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3129                    [rcx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3130                    [rdx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3131                    [rsi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3132                    [rdi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3133                    [rbp] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3134 #ifdef __x86_64__
3135                    [r8] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8])),
3136                    [r9] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9])),
3137                    [r10] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3138                    [r11] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3139                    [r12] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3140                    [r13] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3141                    [r14] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3142                    [r15] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3143 #endif
3144                    [cr2] "i"(offsetof(struct litevm_vcpu, cr2))
3145                    :"cc", "memory");
3146
3147         ++litevm_stat.exits;
3148         printk("vm_run exits\n");
3149         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3150         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3151
3152         fx_save(vcpu->guest_fx_image);
3153         fx_restore(vcpu->host_fx_image);
3154
3155 #ifndef __x86_64__
3156         asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3157 #endif
3158
3159         litevm_run->exit_type = 0;
3160         if (fail) {
3161                 printk("FAIL\n");
3162                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3163                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3164                 printk("reason %d\n", litevm_run->exit_reason);
3165         } else {
3166                 printk("NOT FAIL\n");
3167                 if (fs_gs_ldt_reload_needed) {
3168                         load_ldt(ldt_sel);
3169                         load_fs(fs_sel);
3170                         /*
3171                          * If we have to reload gs, we must take care to
3172                          * preserve our gs base.
3173                          */
3174                         disable_irq();
3175                         load_gs(gs_sel);
3176 #ifdef __x86_64__
3177                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3178 #endif
3179                         enable_irq();
3180
3181                         reload_tss();
3182                 }
3183                 vcpu->launched = 1;
3184                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
3185                 printk("Let's see why it exited\n");
3186                 if (litevm_handle_exit(litevm_run, vcpu)) {
3187                         /* Give the scheduler a chance to reschedule. */
3188                         vcpu_put(vcpu);
3189 #warning "how to tell if signal is pending"
3190 /*
3191                         if (signal_pending(current)) {
3192                                 ++litevm_stat.signal_exits;
3193                                 return -EINTR;
3194                         }
3195 */
3196                         kthread_yield();
3197                         /* Cannot fail -  no vcpu unplug yet. */
3198                         vcpu_load(litevm, vcpu_slot(vcpu));
3199                         goto again;
3200                 }
3201         }
3202
3203         vcpu_put(vcpu);
3204         printk("vm_run returns\n");
3205         print_func_exit();
3206         return 0;
3207 }
3208
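/*
 * Read back a vcpu's general-purpose register state: GPRs come from the
 * cached vcpu->regs array, while RSP, RIP and RFLAGS live in the VMCS.
 */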
3209 static int litevm_dev_ioctl_get_regs(struct litevm *litevm,
3210                                                                          struct litevm_regs *regs)
3211 {
3212         print_func_entry();
3213         struct litevm_vcpu *vcpu;
3214
3215         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3216                 print_func_exit();
3217                 return -EINVAL;
3218         }
3219
3220         vcpu = vcpu_load(litevm, regs->vcpu);
3221         if (!vcpu) {
3222                 print_func_exit();
3223                 return -ENOENT;
3224         }
3225
3226         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3227         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3228         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3229         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3230         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3231         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3232         regs->rsp = vmcs_readl(GUEST_RSP);
3233         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3234 #ifdef __x86_64__
3235         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3236         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3237         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3238         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3239         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3240         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3241         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3242         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3243 #endif
3244
3245         regs->rip = vmcs_readl(GUEST_RIP);
3246         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3247
3248         /*
3249          * Don't leak debug flags in case they were set for guest debugging
3250          */
3251         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3252                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3253
3254         vcpu_put(vcpu);
3255
3256         print_func_exit();
3257         return 0;
3258 }
3259
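/*
 * The inverse of the above: write the caller-supplied GPRs into vcpu->regs
 * and push RSP, RIP and RFLAGS into the VMCS.
 */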
3260 static int litevm_dev_ioctl_set_regs(struct litevm *litevm,
3261                                                                          struct litevm_regs *regs)
3262 {
3263         print_func_entry();
3264         struct litevm_vcpu *vcpu;
3265
3266         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3267                 print_func_exit();
3268                 return -EINVAL;
3269         }
3270
3271         vcpu = vcpu_load(litevm, regs->vcpu);
3272         if (!vcpu) {
3273                 print_func_exit();
3274                 return -ENOENT;
3275         }
3276
3277         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3278         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3279         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3280         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3281         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3282         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3283         vmcs_writel(GUEST_RSP, regs->rsp);
3284         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3285 #ifdef __x86_64__
3286         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3287         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3288         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3289         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3290         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3291         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3292         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3293         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3294 #endif
3295
3296         vmcs_writel(GUEST_RIP, regs->rip);
3297         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3298
3299         vcpu_put(vcpu);
3300
3301         print_func_exit();
3302         return 0;
3303 }
3304
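/*
 * Read back the vcpu's "special" registers: segment registers, descriptor
 * tables, control registers, EFER and the APIC base, mostly straight out of
 * the VMCS guest-state area.
 */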
3305 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm,
3306                                                                           struct litevm_sregs *sregs)
3307 {
3308         print_func_entry();
3309         struct litevm_vcpu *vcpu;
3310
3311         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3312                 print_func_exit();
3313                 return -EINVAL;
3314         }
3315         vcpu = vcpu_load(litevm, sregs->vcpu);
3316         if (!vcpu) {
3317                 print_func_exit();
3318                 return -ENOENT;
3319         }
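/*
 * get_segment() unpacks the VMX access-rights ("AR bytes") field, which
 * mirrors the segment-descriptor attribute layout: bits 3:0 type, bit 4 S,
 * bits 6:5 DPL, bit 7 present, bit 12 AVL, bit 13 L, bit 14 D/B, bit 15 G,
 * and bit 16 the VMX-specific "unusable" flag.
 */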
3320 #define get_segment(var, seg) \
3321         do { \
3322                 uint32_t ar; \
3323                 \
3324                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3325                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3326                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3327                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3328                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3329                 sregs->var.type = ar & 15; \
3330                 sregs->var.s = (ar >> 4) & 1; \
3331                 sregs->var.dpl = (ar >> 5) & 3; \
3332                 sregs->var.present = (ar >> 7) & 1; \
3333                 sregs->var.avl = (ar >> 12) & 1; \
3334                 sregs->var.l = (ar >> 13) & 1; \
3335                 sregs->var.db = (ar >> 14) & 1; \
3336                 sregs->var.g = (ar >> 15) & 1; \
3337                 sregs->var.unusable = (ar >> 16) & 1; \
3338         } while (0);
3339
3340         get_segment(cs, CS);
3341         get_segment(ds, DS);
3342         get_segment(es, ES);
3343         get_segment(fs, FS);
3344         get_segment(gs, GS);
3345         get_segment(ss, SS);
3346
3347         get_segment(tr, TR);
3348         get_segment(ldt, LDTR);
3349 #undef get_segment
3350
3351 #define get_dtable(var, table) \
3352         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3353                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3354
3355         get_dtable(idt, IDTR);
3356         get_dtable(gdt, GDTR);
3357 #undef get_dtable
3358
3359         sregs->cr0 = guest_cr0();
3360         sregs->cr2 = vcpu->cr2;
3361         sregs->cr3 = vcpu->cr3;
3362         sregs->cr4 = guest_cr4();
3363         sregs->cr8 = vcpu->cr8;
3364         sregs->efer = vcpu->shadow_efer;
3365         sregs->apic_base = vcpu->apic_base;
3366
3367         sregs->pending_int = vcpu->irq_summary != 0;
3368
3369         vcpu_put(vcpu);
3370
3371         print_func_exit();
3372         return 0;
3373 }
3374
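/*
 * Write the caller-supplied special registers into the VMCS and the vcpu,
 * re-packing each segment's attributes into the AR-bytes format.  If CR0,
 * CR3, CR4 or EFER changed in a way that affects translation, the shadow MMU
 * context is rebuilt via litevm_mmu_reset_context().
 */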
3375 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm,
3376                                                                           struct litevm_sregs *sregs)
3377 {
3378         print_func_entry();
3379         struct litevm_vcpu *vcpu;
3380         int mmu_reset_needed = 0;
3381
3382         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3383                 print_func_exit();
3384                 return -EINVAL;
3385         }
3386         vcpu = vcpu_load(litevm, sregs->vcpu);
3387         if (!vcpu) {
3388                 print_func_exit();
3389                 return -ENOENT;
3390         }
3391 #define set_segment(var, seg) \
3392         do { \
3393                 uint32_t ar; \
3394                 \
3395                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
3396                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
3397                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
3398                 if (sregs->var.unusable) { \
3399                         ar = (1 << 16); \
3400                 } else { \
3401                         ar = (sregs->var.type & 15); \
3402                         ar |= (sregs->var.s & 1) << 4; \
3403                         ar |= (sregs->var.dpl & 3) << 5; \
3404                         ar |= (sregs->var.present & 1) << 7; \
3405                         ar |= (sregs->var.avl & 1) << 12; \
3406                         ar |= (sregs->var.l & 1) << 13; \
3407                         ar |= (sregs->var.db & 1) << 14; \
3408                         ar |= (sregs->var.g & 1) << 15; \
3409                 } \
3410                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
3411         } while (0);
3412
3413         set_segment(cs, CS);
3414         set_segment(ds, DS);
3415         set_segment(es, ES);
3416         set_segment(fs, FS);
3417         set_segment(gs, GS);
3418         set_segment(ss, SS);
3419
3420         set_segment(tr, TR);
3421
3422         set_segment(ldt, LDTR);
3423 #undef set_segment
3424
3425 #define set_dtable(var, table) \
3426         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
3427         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
3428
3429         set_dtable(idt, IDTR);
3430         set_dtable(gdt, GDTR);
3431 #undef set_dtable
3432
3433         vcpu->cr2 = sregs->cr2;
3434         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
3435         vcpu->cr3 = sregs->cr3;
3436
3437         vcpu->cr8 = sregs->cr8;
3438
3439         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3440 #ifdef __x86_64__
3441         __set_efer(vcpu, sregs->efer);
3442 #endif
3443         vcpu->apic_base = sregs->apic_base;
3444
3445         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3446         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3447         update_exception_bitmap(vcpu);
3448         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3449         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3450
3451         mmu_reset_needed |= guest_cr4() != sregs->cr4;
3452         __set_cr4(vcpu, sregs->cr4);
3453
3454         if (mmu_reset_needed)
3455                 litevm_mmu_reset_context(vcpu);
3456         vcpu_put(vcpu);
3457
3458         print_func_exit();
3459         return 0;
3460 }
3461
3462 /*
3463  * Translate a guest virtual address to a guest physical address.
3464  */
3465 static int litevm_dev_ioctl_translate(struct litevm *litevm,
3466                                                                           struct litevm_translation *tr)
3467 {
3468         print_func_entry();
3469         unsigned long vaddr = tr->linear_address;
3470         struct litevm_vcpu *vcpu;
3471         gpa_t gpa;
3472
3473         vcpu = vcpu_load(litevm, tr->vcpu);
3474         if (!vcpu) {
3475                 print_func_exit();
3476                 return -ENOENT;
3477         }
3478         SPLL(&litevm->lock);
3479         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3480         tr->physical_address = gpa;
3481         tr->valid = gpa != UNMAPPED_GVA;
3482         tr->writeable = 1;
3483         tr->usermode = 0;
3484         SPLU(&litevm->lock);
3485         vcpu_put(vcpu);
3486
3487         print_func_exit();
3488         return 0;
3489 }
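/*
 * A minimal usage sketch (illustrative only, not called anywhere; the litevm
 * pointer and gva are assumed to come from the caller):
 *
 *	struct litevm_translation tr;
 *
 *	tr.vcpu = 0;
 *	tr.linear_address = gva;
 *	if (litevm_dev_ioctl_translate(litevm, &tr) == 0 && tr.valid)
 *		printk("gpa 0x%llx\n", (unsigned long long)tr.physical_address);
 */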
3490
3491 #if 0
3492 static int litevm_dev_ioctl_interrupt(struct litevm *litevm,
3493                                                                           struct litevm_interrupt *irq)
3494 {
3495         struct litevm_vcpu *vcpu;
3496
3497         if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
3498                 return -EINVAL;
3499         if (irq->irq < 0 || irq->irq >= 256)
3500                 return -EINVAL;
3501         vcpu = vcpu_load(litevm, irq->vcpu);
3502         if (!vcpu)
3503                 return -ENOENT;
3504
3505         set_bit(irq->irq, vcpu->irq_pending);
3506         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
3507
3508         vcpu_put(vcpu);
3509
3510         return 0;
3511 }
3512 #endif
3513
3514 #if 0
3515 static int litevm_dev_ioctl_debug_guest(struct litevm *litevm,
3516                                                                                 struct litevm_debug_guest *dbg)
3517 {
3518         struct litevm_vcpu *vcpu;
3519         unsigned long dr7 = 0x400;
3520         uint32_t exception_bitmap;
3521         int old_singlestep;
3522
3523         if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS)
3524                 return -EINVAL;
3525         vcpu = vcpu_load(litevm, dbg->vcpu);
3526         if (!vcpu)
3527                 return -ENOENT;
3528
3529         exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
3530         old_singlestep = vcpu->guest_debug.singlestep;
3531
3532         vcpu->guest_debug.enabled = dbg->enabled;
3533         if (vcpu->guest_debug.enabled) {
3534                 int i;
3535
3536                 dr7 |= 0x200;   /* exact */
3537                 for (i = 0; i < 4; ++i) {
3538                         if (!dbg->breakpoints[i].enabled)
3539                                 continue;
3540                         vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
3541                         dr7 |= 2 << (i * 2);    /* global enable */
3542                         dr7 |= 0 << (i * 4 + 16);       /* execution breakpoint */
3543                 }
3544
3545                 exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
3546
3547                 vcpu->guest_debug.singlestep = dbg->singlestep;
3548         } else {
3549                 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
3550                 vcpu->guest_debug.singlestep = 0;
3551         }
3552
3553         if (old_singlestep && !vcpu->guest_debug.singlestep) {
3554                 unsigned long flags;
3555
3556                 flags = vmcs_readl(GUEST_RFLAGS);
3557                 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3558                 vmcs_writel(GUEST_RFLAGS, flags);
3559         }
3560
3561         vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
3562         vmcs_writel(GUEST_DR7, dr7);
3563
3564         vcpu_put(vcpu);
3565
3566         return 0;
3567 }
3568 #endif
3569
3570 #if 0
3571 long litevm_control(struct litevm *litevm, int command, unsigned long arg)
3572 {
3573         int r = -EINVAL;
3574
3575         switch (command) {
3576                 case LITEVM_CREATE_VCPU:{
3577                                 r = create_vcpu(litevm, arg);
3578                                 if (r)
3579                                         goto out;
3580                                 break;
3581                         }
3582                 case LITEVM_RUN:{
3583                                 struct litevm_run litevm_run;
3584
3585                                 r = -EFAULT;
3586                                 if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run))
3587                                         goto out;
3588                                 r = litevm_dev_ioctl_run(litevm, &litevm_run);
3589                                 if (r < 0)
3590                                         goto out;
3591                                 r = -EFAULT;
3592                                 if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run))
3593                                         goto out;
3594                                 r = 0;
3595                                 break;
3596                         }
3597                 case LITEVM_GET_REGS:{
3598                                 struct litevm_regs litevm_regs;
3599
3600                                 r = -EFAULT;
3601                                 if (copy_from_user
3602                                         (&litevm_regs, (void *)arg, sizeof litevm_regs))
3603                                         goto out;
3604                                 r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs);
3605                                 if (r)
3606                                         goto out;
3607                                 r = -EFAULT;
3608                                 if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs))
3609                                         goto out;
3610                                 r = 0;
3611                                 break;
3612                         }
3613                 case LITEVM_SET_REGS:{
3614                                 struct litevm_regs litevm_regs;
3615
3616                                 r = -EFAULT;
3617                                 if (copy_from_user
3618                                         (&litevm_regs, (void *)arg, sizeof litevm_regs))
3619                                         goto out;
3620                                 r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs);
3621                                 if (r)
3622                                         goto out;
3623                                 r = 0;
3624                                 break;
3625                         }
3626                 case LITEVM_GET_SREGS:{
3627                                 struct litevm_sregs litevm_sregs;
3628
3629                                 r = -EFAULT;
3630                                 if (copy_from_user
3631                                         (&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3632                                         goto out;
3633                                 r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs);
3634                                 if (r)
3635                                         goto out;
3636                                 r = -EFAULT;
3637                                 if (copy_to_user
3638                                         ((void *)arg, &litevm_sregs, sizeof litevm_sregs))
3639                                         goto out;
3640                                 r = 0;
3641                                 break;
3642                         }
3643                 case LITEVM_SET_SREGS:{
3644                                 struct litevm_sregs litevm_sregs;
3645
3646                                 r = -EFAULT;
3647                                 if (copy_from_user
3648                                         (&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3649                                         goto out;
3650                                 r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs);
3651                                 if (r)
3652                                         goto out;
3653                                 r = 0;
3654                                 break;
3655                         }
3656                 case LITEVM_TRANSLATE:{
3657                                 struct litevm_translation tr;
3658
3659                                 r = -EFAULT;
3660                                 if (copy_from_user(&tr, (void *)arg, sizeof tr))
3661                                         goto out;
3662                                 r = litevm_dev_ioctl_translate(litevm, &tr);
3663                                 if (r)
3664                                         goto out;
3665                                 r = -EFAULT;
3666                                 if (copy_to_user((void *)arg, &tr, sizeof tr))
3667                                         goto out;
3668                                 r = 0;
3669                                 break;
3670                         }
3671                 case LITEVM_INTERRUPT:{
3672                                 struct litevm_interrupt irq;
3673
3674                                 r = -EFAULT;
3675                                 if (copy_from_user(&irq, (void *)arg, sizeof irq))
3676                                         goto out;
3677                                 r = litevm_dev_ioctl_interrupt(litevm, &irq);
3678                                 if (r)
3679                                         goto out;
3680                                 r = 0;
3681                                 break;
3682                         }
3683                 case LITEVM_DEBUG_GUEST:{
3684                                 struct litevm_debug_guest dbg;
3685
3686                                 r = -EFAULT;
3687                                 if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
3688                                         goto out;
3689                                 r = litevm_dev_ioctl_debug_guest(litevm, &dbg);
3690                                 if (r)
3691                                         goto out;
3692                                 r = 0;
3693                                 break;
3694                         }
3695                 case LITEVM_SET_MEMORY_REGION:{
3696                                 struct litevm_memory_region litevm_mem;
3697
3698                                 r = -EFAULT;
3699                                 if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem))
3700                                         goto out;
3701                                 r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem);
3702                                 if (r)
3703                                         goto out;
3704                                 break;
3705                         }
3706                 case LITEVM_GET_DIRTY_LOG:{
3707                                 struct litevm_dirty_log log;
3708
3709                                 r = -EFAULT;
3710                                 if (copy_from_user(&log, (void *)arg, sizeof log))
3711                                         goto out;
3712                                 r = litevm_dev_ioctl_get_dirty_log(litevm, &log);
3713                                 if (r)
3714                                         goto out;
3715                                 break;
3716                         }
3717                 default:
3718                         ;
3719         }
3720 out:
3721         return r;
3722 }
3723 #endif
3724
3725 #if 0
3726 static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3727 {
3728         struct litevm *litevm = vma->vm_file->private_data;
3729         struct litevm_memory_slot *slot;
3730         struct page *page;
3731
3732         slot = gfn_to_memslot(litevm, vmf->pgoff);
3733         if (!slot)
3734                 return VM_FAULT_SIGBUS;
3735         page = gfn_to_page(slot, vmf->pgoff);
3736         if (!page)
3737                 return VM_FAULT_SIGBUS;
3738
3739         get_page(page);
3740         vmf->page = page;
3741         return 0;
3742 }
3743 #endif
3744
3745 #if 0
3746 static int litevm_reboot(struct notifier_block *notifier, unsigned long val,
3747                                                  void *v)
3748 {
3749         panic("litevm_reboot");
3750         if (val == SYS_RESTART) {
3751                 /*
3752                  * Some (well, at least mine) BIOSes hang on reboot if
3753                  * in vmx root mode.
3754                  */
3755                 printk("litevm: exiting vmx mode\n");
3756                 handler_wrapper_t *w;
3757                 smp_call_function_all(litevm_disable, 0, &w);
3758                 smp_call_wait(w);
3759         }
3760         return NOTIFY_OK;
3761         return 0;
3762 }
3763 #endif
3764
3765 hpa_t bad_page_address;
3766
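/*
 * Module init: verify VT-x is present and not disabled by the BIOS, size up
 * the VMCS via setup_vmcs_descriptor(), enable VMX operation on every core
 * with smp_call_function_all(vm_enable, ...), and allocate the zeroed page
 * whose physical address is stored in bad_page_address.
 */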
3767 int vmx_init(void)
3768 {
3769         print_func_entry();
3770         handler_wrapper_t *w;
3771         int r = 0;
3772
3773         if (!cpu_has_litevm_support()) {
3774                 printk("litevm: no hardware support\n");
3775                 print_func_exit();
3776                 return -EOPNOTSUPP;
3777         }
3778         if (vmx_disabled_by_bios()) {
3779                 printk("litevm: disabled by bios\n");
3780                 print_func_exit();
3781                 return -EOPNOTSUPP;
3782         }
3783
3784         setup_vmcs_descriptor();
3785         smp_call_function_all(vm_enable, 0, &w);
3786         if (smp_call_wait(w)) {
3787                 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
3788         }
3789
3790         if ((bad_page_address = PADDR(kpage_zalloc_addr())) == 0ULL) {
3791                 r = -ENOMEM;
3792         }
3793
3794         print_func_exit();
3795         return r;
3796 }
3797
3798 static void litevm_exit(void)
3799 {
3800         print_func_entry();
3801         //free_litevm_area();
3802         //__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3803         print_func_exit();
3804 }