kern/arch/x86/vmx.c (akaros.git, commit 76daa4de63757e6280af7f94abd6dd3882c49972)
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 #define currentcpu (&per_cpu_info[core_id()])
39
40 struct litevm_stat litevm_stat;
41
42 static struct litevm_stats_debugfs_item {
43         const char *name;
44         uint32_t *data;
45 } debugfs_entries[] = {
46         { "pf_fixed", &litevm_stat.pf_fixed },
47         { "pf_guest", &litevm_stat.pf_guest },
48         { "tlb_flush", &litevm_stat.tlb_flush },
49         { "invlpg", &litevm_stat.invlpg },
50         { "exits", &litevm_stat.exits },
51         { "io_exits", &litevm_stat.io_exits },
52         { "mmio_exits", &litevm_stat.mmio_exits },
53         { "signal_exits", &litevm_stat.signal_exits },
54         { "irq_exits", &litevm_stat.irq_exits },
55         { 0, 0 }
56 };
57
58 static struct dentry *debugfs_dir;
59
60 static const uint32_t vmx_msr_index[] = {
61 #ifdef __x86_64__
62         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
63 #endif
64         MSR_EFER, /* MSR_K6_STAR from the original KVM list is omitted here */
65 };
66 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
67
68 #ifdef __x86_64__
69 /*
70  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
71  * mechanism (works around CPU erratum AA24).
72  */
73 #define NR_BAD_MSRS 2
74 #else
75 #define NR_BAD_MSRS 0
76 #endif
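/*
 * Sketch of how NR_BAD_MSRS is applied (see litevm_vcpu_setup() below): the
 * first NR_BAD_MSRS entries of vmx_msr_index are excluded from the VMCS
 * MSR-autoload/store areas by passing guest_msrs + NR_BAD_MSRS and
 * host_msrs + NR_BAD_MSRS as the VM_ENTRY/VM_EXIT MSR area addresses, and by
 * programming the counts as nmsrs - NR_BAD_MSRS.
 */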
77
78 #define TSS_IOPB_BASE_OFFSET 0x66
79 #define TSS_BASE_SIZE 0x68
80 #define TSS_IOPB_SIZE (65536 / 8)
81 #define TSS_REDIRECTION_SIZE (256 / 8)
82 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
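/*
 * Layout of the real-mode guest TSS built by init_rmode_tss(): the 0x68-byte
 * base TSS, then the 256-bit interrupt redirection bitmap, then the 64K-bit
 * I/O permission bitmap, then a final terminating 0xff byte.  At 8329 bytes it
 * spans three pages, which is why rmode_tss_base() places it in the last three
 * pages of memory slot 0.
 */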
83
84 #define MSR_IA32_VMX_BASIC_MSR                  0x480
85 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
86 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
87 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
88 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
89
90 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
91 #define LMSW_GUEST_MASK 0x0eULL
92 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
93 //#define CR4_VMXE 0x2000
94 #define CR8_RESEVED_BITS (~0x0fULL)
95 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
96
97 #ifdef __x86_64__
98 #define HOST_IS_64 1
99 #else
100 #define HOST_IS_64 0
101 #endif
102
103 /* Bit ops are not yet widely used in Akaros, and it's not clear where they should live. */
104 /**
105  * __ffs - find first set bit in word
106  * @word: The word to search
107  *
108  * Undefined if no bit exists, so code should check against 0 first.
109  */
110 static inline unsigned long __ffs(unsigned long word)
111 {
112         print_func_entry();
113         asm("rep; bsf %1,%0"
114                 : "=r" (word)
115                 : "rm" (word));
116         print_func_exit();
117         return word;
118 }
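/*
 * Worked example: __ffs(0x18) returns 3, since bit 3 is the lowest set bit of
 * 0b11000.  Remember the result is undefined for a zero argument.
 */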
119
120 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu, uint32_t msr)
121 {
122         print_func_entry();
123         int i;
124
125         for (i = 0; i < vcpu->nmsrs; ++i)
126                 if (vcpu->guest_msrs[i].index == msr) {
127                         print_func_exit();
128                         return &vcpu->guest_msrs[i];
129                 }
130         print_func_exit();
131         return 0;
132 }
133
134 struct descriptor_table {
135         uint16_t limit;
136         unsigned long base;
137 } __attribute__((packed));
138
139 static void get_gdt(struct descriptor_table *table)
140 {
141         print_func_entry();
142         asm ("sgdt %0" : "=m"(*table));
143         print_func_exit();
144 }
145
146 static void get_idt(struct descriptor_table *table)
147 {
148         print_func_entry();
149         asm ("sidt %0" : "=m"(*table));
150         print_func_exit();
151 }
152
153 static uint16_t read_fs(void)
154 {
155         print_func_entry();
156         uint16_t seg;
157         asm ("mov %%fs, %0" : "=g"(seg));
158         print_func_exit();
159         return seg;
160 }
161
162 static uint16_t read_gs(void)
163 {
164         print_func_entry();
165         uint16_t seg;
166         asm ("mov %%gs, %0" : "=g"(seg));
167         print_func_exit();
168         return seg;
169 }
170
171 static uint16_t read_ldt(void)
172 {
173         print_func_entry();
174         uint16_t ldt;
175         asm ("sldt %0" : "=g"(ldt));
176         print_func_exit();
177         return ldt;
178 }
179
180 static void load_fs(uint16_t sel)
181 {
182         print_func_entry();
183         asm ("mov %0, %%fs" : : "g"(sel));
184         print_func_exit();
185 }
186
187 static void load_gs(uint16_t sel)
188 {
189         print_func_entry();
190         asm ("mov %0, %%gs" : : "g"(sel));
191         print_func_exit();
192 }
193
194 #ifndef load_ldt
195 static void load_ldt(uint16_t sel)
196 {
197         print_func_entry();
198         asm ("lldt %0" : : "g"(sel));
199         print_func_exit();
200 }
201 #endif
202
203 static void fx_save(void *image)
204 {
205         print_func_entry();
206         asm ("fxsave (%0)":: "r" (image));
207         print_func_exit();
208 }
209
210 static void fx_restore(void *image)
211 {
212         print_func_entry();
213         asm ("fxrstor (%0)":: "r" (image));
214         print_func_exit();
215 }
216
217 static void fpu_init(void)
218 {
219         print_func_entry();
220         asm ("finit");
221         print_func_exit();
222 }
223
224 struct segment_descriptor {
225         uint16_t limit_low;
226         uint16_t base_low;
227         uint8_t  base_mid;
228         uint8_t  type : 4;
229         uint8_t  system : 1;
230         uint8_t  dpl : 2;
231         uint8_t  present : 1;
232         uint8_t  limit_high : 4;
233         uint8_t  avl : 1;
234         uint8_t  long_mode : 1;
235         uint8_t  default_op : 1;
236         uint8_t  granularity : 1;
237         uint8_t  base_high;
238 } __attribute__((packed));
239
240 #ifdef __x86_64__
241 // LDT or TSS descriptor in the GDT. 16 bytes.
242 struct segment_descriptor_64 {
243         struct segment_descriptor s;
244         uint32_t base_higher;
245         uint32_t pad_zero;
246 };
247
248 #endif
249
250 static unsigned long segment_base(uint16_t selector)
251 {
252         print_func_entry();
253         struct descriptor_table gdt;
254         struct segment_descriptor *d;
255         unsigned long table_base;
256         typedef unsigned long ul;
257         unsigned long v;
258
259         asm ("sgdt %0" : "=m"(gdt));
260         table_base = gdt.base;
261
262         if (selector & 4) {           /* from ldt */
263                 uint16_t ldt_selector;
264
265                 asm ("sldt %0" : "=g"(ldt_selector));
266                 table_base = segment_base(ldt_selector);
267         }
268         d = (struct segment_descriptor *)(table_base + (selector & ~7));
269         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
270 #ifdef __x86_64__
271         if (d->system == 0
272             && (d->type == 2 || d->type == 9 || d->type == 11))
273                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
274 #endif
275         print_func_exit();
276         return v;
277 }
278
279 static unsigned long read_tr_base(void)
280 {
281         print_func_entry();
282         uint16_t tr;
283         asm ("str %0" : "=g"(tr));
284         print_func_exit();
285         return segment_base(tr);
286 }
287
288 static void reload_tss(void)
289 {
290         print_func_entry();
291 #ifndef __x86_64__
292
293         /*
294          * VT restores TR but not its size.  Useless.
295          */
296         struct descriptor_table gdt;
297         struct segment_descriptor *descs;
298
299         get_gdt(&gdt);
300         descs = (void *)gdt.base;
301         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
302         load_TR_desc();
303 #endif
304         print_func_exit();
305 }
306
307 static struct vmcs_descriptor {
308         int size;
309         int order;
310         uint32_t revision_id;
311 } vmcs_descriptor;
312
313 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
314 {
315         print_func_entry();
316         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
317         print_func_exit();
318         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
319 }
320
321
322
323 int litevm_read_guest(struct litevm_vcpu *vcpu,
324                              gva_t addr,
325                              unsigned long size,
326                              void *dest)
327 {
328         print_func_entry();
329         unsigned char *host_buf = dest;
330         unsigned long req_size = size;
331
332         while (size) {
333                 hpa_t paddr;
334                 unsigned now;
335                 unsigned offset;
336                 hva_t guest_buf;
337
338                 paddr = gva_to_hpa(vcpu, addr);
339
340                 if (is_error_hpa(paddr))
341                         break;
342                 guest_buf = (hva_t)KADDR(paddr);
343                 offset = addr & ~PAGE_MASK;
344                 guest_buf |= offset;
345                 now = MIN(size, PAGE_SIZE - offset);
346                 memcpy(host_buf, (void*)guest_buf, now);
347                 host_buf += now;
348                 addr += now;
349                 size -= now;
350         }
351         print_func_exit();
352         return req_size - size;
353 }
354
355 int litevm_write_guest(struct litevm_vcpu *vcpu,
356                              gva_t addr,
357                              unsigned long size,
358                              void *data)
359 {
360         print_func_entry();
361         unsigned char *host_buf = data;
362         unsigned long req_size = size;
363
364         while (size) {
365                 hpa_t paddr;
366                 unsigned now;
367                 unsigned offset;
368                 hva_t guest_buf;
369
370                 paddr = gva_to_hpa(vcpu, addr);
371
372                 if (is_error_hpa(paddr))
373                         break;
374
375                 guest_buf = (hva_t)KADDR(paddr);
376                 offset = addr & ~PAGE_MASK;
377                 guest_buf |= offset;
378                 now = MIN(size, PAGE_SIZE - offset);
379                 memcpy((void*)guest_buf, host_buf, now);
380                 host_buf += now;
381                 addr += now;
382                 size -= now;
383         }
384         print_func_exit();
385         return req_size - size;
386 }
387
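/*
 * IA32_VMX_BASIC describes the VMXON/VMCS region: bits 30:0 hold the VMCS
 * revision identifier and bits 44:32 the region size in bytes (hence the
 * ">> 32" and "& 0x1fff" below).  setup_vmcs_descriptor() caches these so
 * alloc_vmcs_cpu() and vm_enable() can size and stamp their regions.
 */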
388 static void setup_vmcs_descriptor(void)
389 {
390         print_func_entry();
391         uint64_t msr;
392
393         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
394         vmcs_descriptor.size = (msr>>32) & 0x1fff;
395         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size>>PAGE_SHIFT);
396         vmcs_descriptor.revision_id = (uint32_t)msr;
397         printk("setup_vmcs_descriptor: msr 0x%llx, size 0x%x order 0x%x id 0x%x\n",
398                msr, vmcs_descriptor.size, vmcs_descriptor.order,
399                vmcs_descriptor.revision_id);
400         print_func_exit();
401 }
402
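/*
 * VMX instructions report failure through the flags: CF for VMfailInvalid and
 * ZF for VMfailValid.  The "setna" (set if CF or ZF is set) following each
 * vmclear/vmptrld/vmwrite below captures that into an error byte we can log.
 */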
403 static void vmcs_clear(struct vmcs *vmcs)
404 {
405         print_func_entry();
406         uint64_t phys_addr = PADDR(vmcs);
407         uint8_t error;
408         printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
409         asm volatile ("vmclear %1; setna %0"
410                        : "=m"(error) : "m"(phys_addr) : "cc", "memory" );
411         if (error)
412                 printk("litevm: vmclear fail: %p/%llx\n",
413                        vmcs, phys_addr);
414         print_func_exit();
415 }
416
417 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
418 {
419         print_func_entry();
420         struct litevm_vcpu *vcpu = arg;
421         int cpu = core_id();
422         printd("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n", 
423                cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
424
425         if (vcpu->cpu == cpu)
426                 vmcs_clear(vcpu->vmcs);
427
428         if (currentcpu->vmcs == vcpu->vmcs)
429                 currentcpu->vmcs = NULL;
430         print_func_exit();
431 }
432
433 static int vcpu_slot(struct litevm_vcpu *vcpu)
434 {
435         print_func_entry();
436         print_func_exit();
437         return vcpu - vcpu->litevm->vcpus;
438 }
439
440 /*
441  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
442  * vcpu mutex is already taken.
443  */
444 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
445 {
446         print_func_entry();
447         uint64_t phys_addr = PADDR(vcpu->vmcs);
448         int cpu;
449         cpu = core_id();
450
451         if (vcpu->cpu != cpu) {
452                 handler_wrapper_t *w;
453                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
454                 smp_call_wait(w);
455                 vcpu->launched = 0;
456         }
457         if (currentcpu->vmcs != vcpu->vmcs) {
458                 uint8_t error;
459
460                 currentcpu->vmcs = vcpu->vmcs;
461                 asm volatile ("vmptrld %1; setna %0"
462                                : "=m"(error) : "m"(phys_addr) : "cc" );
463                 if (error){
464                         printk("litevm: vmptrld %p/%llx fail\n",
465                                vcpu->vmcs, phys_addr);
466                         error("litevm: vmptrld %p/%llx fail\n",
467                                vcpu->vmcs, phys_addr);
468                 }
469         }
470
471         if (vcpu->cpu != cpu) {
472                 struct descriptor_table dt;
473                 unsigned long sysenter_esp;
474
475                 vcpu->cpu = cpu;
476                 /*
477                  * Linux uses per-cpu TSS and GDT, so set these when switching
478                  * processors.
479                  */
480                 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
481                 get_gdt(&dt);
482                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
483
484                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
485                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
486         }
487         print_func_exit();
488         return vcpu;
489 }
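/*
 * A note on the dance above: a VMCS that was last active on another core must
 * be vmclear'd there (the smp_call_function_single to __vcpu_clear) before it
 * can be vmptrld'd here, and the per-core host state (TR base, GDT base,
 * SYSENTER_ESP) must be rewritten since it differs across cores.
 */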
490
491 /*
492  * Switches to specified vcpu, until a matching vcpu_put()
493  */
494 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
495 {
496         print_func_entry();
497         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
498
499         printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
500
501         qlock(&vcpu->mutex);
502         if (!vcpu->vmcs) {
503                 qunlock(&vcpu->mutex);
504                 error("vcpu->vmcs is NULL");
505         }
506         print_func_exit();
507         return __vcpu_load(vcpu);
508 }
509
510 static void vcpu_put(struct litevm_vcpu *vcpu)
511 {
512         print_func_entry();
513         //put_cpu();
514         qunlock(&vcpu->mutex);
515         print_func_exit();
516 }
517
518
519 static struct vmcs *alloc_vmcs_cpu(int cpu)
520 {
521         print_func_entry();
522         int node = node_id();
523         struct vmcs *vmcs;
524
525         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
526         if (!vmcs) {
527                 print_func_exit();
528                 return 0;
529         }
530         memset(vmcs, 0, vmcs_descriptor.size);
531         vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
532         print_func_exit();
533         return vmcs;
534 }
535
536 static struct vmcs *alloc_vmcs(void)
537 {
538         struct vmcs *ret;
539         print_func_entry();
540         ret = alloc_vmcs_cpu(core_id());
541         print_func_exit();
542         return ret;
543 }
544
545 static int cpu_has_litevm_support(void)
546 {
547         print_func_entry();
548         uint32_t ecx = cpuid_ecx(1);
549         print_func_exit();
550         return ecx & (1 << 5); /* CPUID.1:ECX.VMX[bit 5] -> VT */
551 }
552
553 static int vmx_disabled_by_bios(void)
554 {
555         print_func_entry();
556         uint64_t msr;
557
558         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
559         print_func_exit();
560         return (msr & 5) == 1; /* locked but not enabled */
561 }
562
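/*
 * VMXON prerequisites handled below: IA32_FEATURE_CONTROL must be locked with
 * VMX enabled outside SMX (bits 0 and 2, hence the "| 5"), CR4.VMXE and
 * CR0.NE (0x20) must be set, and the VMXON region must be a page-aligned,
 * zeroed area stamped with the VMCS revision id.
 */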
563 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
564 {
565         print_func_entry();
566         int cpu = hw_core_id();
567         uint64_t phys_addr;
568         uint64_t old;
569         uint64_t status = 0;
570         currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
571                                                   KMALLOC_WAIT);
572         if (! currentcpu->vmxarea)
573                 return;
574         memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
575         currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
576         phys_addr = PADDR(currentcpu->vmxarea);
577         printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
578                currentcpu->vmxarea, (void *)phys_addr);
579         if (phys_addr & 0xfff){
580                 printk("fix vmxarea alignment!");
581         }
582         printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
583         old = read_msr(MSR_IA32_FEATURE_CONTROL);
584         printk("%d: vm_enable, old is 0x%llx\n", core_id(), old);
585         if ((old & 5) == 0){
586                 /* enable and lock */
587                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
588                 old = read_msr(MSR_IA32_FEATURE_CONTROL);
589                 printk("%d:vm_enable, tried to set 5, old is 0x%llx\n", core_id(), old);
590         }
591         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
592         lcr4(rcr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
593         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
594         printk("%d:cr0 is %x\n", core_id(), rcr0());
595         lcr0(rcr0() | 0x20);
596         printk("%d:cr0 is %x\n", core_id(), rcr0());
597         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
598         outb(0x92, inb(0x92)|2);
599         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
600         asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:"       \
601                       : "=m" (status) : "m"(phys_addr) : "memory", "cc");
602         printk("%d:vmxon status is %llu\n", core_id(), status);
603         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
604         if (! status){
605                 printk("%d:vm_enable: status says fail\n", core_id());
606         }
607         print_func_exit();
608 }
609
610 static void litevm_disable(void *garbage)
611 {
612         print_func_entry();
613         asm volatile ("vmxoff" : : : "cc");
614         print_func_exit();
615 }
616
617 struct litevm *vmx_open(void)
618 {
619         print_func_entry();
620         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
621         int i;
622
623         if (!litevm) {
624                 printk("NO LITEVM! MAKES NO SENSE!\n");
625                 error("litevm alloc failed");
626                 print_func_exit();
627                 return 0;
628         }
629
630         spinlock_init_irqsave(&litevm->lock);
631         LIST_INIT(&litevm->link);
632         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
633                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
634
635                 qlock_init(&vcpu->mutex);
636                 vcpu->mmu.root_hpa = INVALID_PAGE;
637                 LIST_INIT(&vcpu->link);
638         }
639         printk("vmx_open: busy %d\n", litevm->busy);
640         printk("return %p\n", litevm);
641         print_func_exit();
642         return litevm;
643 }
644
645 /*
646  * Free any memory in @free but not in @dont.
647  */
648 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
649                                   struct litevm_memory_slot *dont)
650 {
651         print_func_entry();
652         int i;
653
654         if (!dont || free->phys_mem != dont->phys_mem)
655                 if (free->phys_mem) {
656                         for (i = 0; i < free->npages; ++i){
657                                 page_t *page = free->phys_mem[i];
658                                 page_decref(page);
659                                 assert(page_is_free(page2ppn(page)));
660                         }
661                         kfree(free->phys_mem);
662                 }
663
664         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
665                 kfree(free->dirty_bitmap);
666
667         free->phys_mem = 0;
668         free->npages = 0;
669         free->dirty_bitmap = 0;
670         print_func_exit();
671 }
672
673 static void litevm_free_physmem(struct litevm *litevm)
674 {
675         print_func_entry();
676         int i;
677
678         for (i = 0; i < litevm->nmemslots; ++i)
679                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
680         print_func_exit();
681 }
682
683 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
684 {
685         print_func_entry();
686         if (vcpu->vmcs) {
687                 handler_wrapper_t *w;
688                 smp_call_function_all(__vcpu_clear, vcpu, &w);
689                 smp_call_wait(w);
690                 //free_vmcs(vcpu->vmcs);
691                 vcpu->vmcs = 0;
692         }
693         print_func_exit();
694 }
695
696 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
697 {
698         print_func_entry();
699         litevm_free_vmcs(vcpu);
700         litevm_mmu_destroy(vcpu);
701         print_func_exit();
702 }
703
704 static void litevm_free_vcpus(struct litevm *litevm)
705 {
706         print_func_entry();
707         unsigned int i;
708
709         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
710                 litevm_free_vcpu(&litevm->vcpus[i]);
711         print_func_exit();
712 }
713
714 static int litevm_dev_release(struct litevm *litevm)
715 {
716         print_func_entry();
717
718         litevm_free_vcpus(litevm);
719         litevm_free_physmem(litevm);
720         kfree(litevm);
721         print_func_exit();
722         return 0;
723 }
724
725 unsigned long vmcs_readl(unsigned long field)
726 {
727         print_func_entry();
728         unsigned long value;
729
730         asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
731         print_func_exit();
732         return value;
733 }
734
735 void vmcs_writel(unsigned long field, unsigned long value)
736 {
737         print_func_entry();
738         uint8_t error;
739
740         asm volatile ("vmwrite %1, %2; setna %0"
741                        : "=g"(error) : "r"(value), "r"(field) : "cc" );
742         if (error)
743                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
744                        field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
745         print_func_exit();
746 }
747
748 static void vmcs_write16(unsigned long field, uint16_t value)
749 {
750         print_func_entry();
751         vmcs_writel(field, value);
752         print_func_exit();
753 }
754
755 static void vmcs_write64(unsigned long field, uint64_t value)
756 {
757         print_func_entry();
758 #ifdef __x86_64__
759         vmcs_writel(field, value);
760 #else
761         vmcs_writel(field, value);
762         asm volatile ("");
763         vmcs_writel(field+1, value >> 32);
764 #endif
765         print_func_exit();
766 }
767
768 static void inject_gp(struct litevm_vcpu *vcpu)
769 {
770         print_func_entry();
771         printd("inject_general_protection: rip 0x%lx\n",
772                vmcs_readl(GUEST_RIP));
773         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
774         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
775                      GP_VECTOR |
776                      INTR_TYPE_EXCEPTION |
777                      INTR_INFO_DELIEVER_CODE_MASK |
778                      INTR_INFO_VALID_MASK);
779         print_func_exit();
780 }
781
782 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
783 {
784         print_func_entry();
785         if (vcpu->rmode.active)
786                 vmcs_write32(EXCEPTION_BITMAP, ~0);
787         else
788                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
789         print_func_exit();
790 }
791
792 static void enter_pmode(struct litevm_vcpu *vcpu)
793 {
794         print_func_entry();
795         unsigned long flags;
796
797         vcpu->rmode.active = 0;
798
799         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
800         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
801         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
802
803         flags = vmcs_readl(GUEST_RFLAGS);
804         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
805         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
806         vmcs_writel(GUEST_RFLAGS, flags);
807
808         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
809                         (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK) );
810
811         update_exception_bitmap(vcpu);
812
813         #define FIX_PMODE_DATASEG(seg, save) {                          \
814                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
815                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
816                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
817                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
818         }
819
820         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
821         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
822         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
823         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
824         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
825
826         vmcs_write16(GUEST_CS_SELECTOR,
827                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
828         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
829         print_func_exit();
830 }
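/*
 * Access-rights bytes used here: 0x93 is a present, DPL-0, read/write data
 * segment and 0x9b a present, DPL-0, execute/read code segment.
 * enter_rmode() below uses 0xf3 (DPL-3 read/write data) so the vm86-mode
 * guest can use its segments.
 */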
831
832 static int rmode_tss_base(struct litevm* litevm)
833 {
834         print_func_entry();
835         gfn_t base_gfn = litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
836         print_func_exit();
837         return base_gfn << PAGE_SHIFT;
838 }
839
840 static void enter_rmode(struct litevm_vcpu *vcpu)
841 {
842         print_func_entry();
843         unsigned long flags;
844
845         vcpu->rmode.active = 1;
846
847         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
848         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
849
850         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
851         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
852
853         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
854         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
855
856         flags = vmcs_readl(GUEST_RFLAGS);
857         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
858
859         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
860
861         vmcs_writel(GUEST_RFLAGS, flags);
862         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
863         update_exception_bitmap(vcpu);
864
865         #define FIX_RMODE_SEG(seg, save) {                                 \
866                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
867                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
868                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
869                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
870         }
871
872         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
873         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
874
875         FIX_RMODE_SEG(ES, vcpu->rmode.es);
876         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
877         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
878         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
879         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
880         print_func_exit();
881 }
882
883 static int init_rmode_tss(struct litevm* litevm)
884 {
885         print_func_entry();
886         struct page *p1, *p2, *p3;
887         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
888         char *page;
889
890         p1 = _gfn_to_page(litevm, fn++);
891         p2 = _gfn_to_page(litevm, fn++);
892         p3 = _gfn_to_page(litevm, fn);
893
894         if (!p1 || !p2 || !p3) {
895                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
896                 print_func_exit();
897                 return 0;
898         }
899
900         page = page2kva(p1);
901         memset(page, 0, PAGE_SIZE);
902         *(uint16_t*)(page + TSS_IOPB_BASE_OFFSET) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
903
904         page = page2kva(p2);
905         memset(page, 0, PAGE_SIZE);
906
907         page = page2kva(p3);
908         memset(page, 0, PAGE_SIZE);
909         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
910
911         print_func_exit();
912         return 1;
913 }
914
915 #ifdef __x86_64__
916
917 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
918 {
919         print_func_entry();
920         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
921
922         vcpu->shadow_efer = efer;
923         if (efer & EFER_LMA) {
924                 vmcs_write32(VM_ENTRY_CONTROLS,
925                                      vmcs_read32(VM_ENTRY_CONTROLS) |
926                                      VM_ENTRY_CONTROLS_IA32E_MASK);
927                 msr->data = efer;
928
929         } else {
930                 vmcs_write32(VM_ENTRY_CONTROLS,
931                                      vmcs_read32(VM_ENTRY_CONTROLS) &
932                                      ~VM_ENTRY_CONTROLS_IA32E_MASK);
933
934                 msr->data = efer & ~EFER_LME;
935         }
936         print_func_exit();
937 }
938
939 static void enter_lmode(struct litevm_vcpu *vcpu)
940 {
941         print_func_entry();
942         uint32_t guest_tr_ar;
943
944         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
945         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
946                 printd("%s: tss fixup for long mode. \n",
947                        __FUNCTION__);
948                 vmcs_write32(GUEST_TR_AR_BYTES,
949                              (guest_tr_ar & ~AR_TYPE_MASK)
950                              | AR_TYPE_BUSY_64_TSS);
951         }
952
953         vcpu->shadow_efer |= EFER_LMA;
954
955         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
956         vmcs_write32(VM_ENTRY_CONTROLS,
957                      vmcs_read32(VM_ENTRY_CONTROLS)
958                      | VM_ENTRY_CONTROLS_IA32E_MASK);
959         print_func_exit();
960 }
961
962 static void exit_lmode(struct litevm_vcpu *vcpu)
963 {
964         print_func_entry();
965         vcpu->shadow_efer &= ~EFER_LMA;
966
967         vmcs_write32(VM_ENTRY_CONTROLS,
968                      vmcs_read32(VM_ENTRY_CONTROLS)
969                      & ~VM_ENTRY_CONTROLS_IA32E_MASK);
970         print_func_exit();
971 }
972
973 #endif
974
975 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
976 {
977         print_func_entry();
978         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
979                 enter_pmode(vcpu);
980
981         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
982                 enter_rmode(vcpu);
983
984 #ifdef __x86_64__
985         if (vcpu->shadow_efer & EFER_LME) {
986                 if (!is_paging() && (cr0 & CR0_PG_MASK))
987                         enter_lmode(vcpu);
988                 if (is_paging() && !(cr0 & CR0_PG_MASK))
989                         exit_lmode(vcpu);
990         }
991 #endif
992
993         vmcs_writel(CR0_READ_SHADOW, cr0);
994         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
995         print_func_exit();
996 }
997
998 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
999                                          unsigned long cr3)
1000 {
1001         print_func_entry();
1002         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
1003         unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
1004         int i;
1005         uint64_t pdpte;
1006         uint64_t *pdpt;
1007         struct litevm_memory_slot *memslot;
1008
1009         spin_lock_irqsave(&vcpu->litevm->lock);
1010         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
1011         /* FIXME: !memslot - emulate? 0xff? */
1012         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
1013
1014         for (i = 0; i < 4; ++i) {
1015                 pdpte = pdpt[offset + i];
1016                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
1017                         break;
1018         }
1019
1020         spin_unlock(&vcpu->litevm->lock);
1021
1022         print_func_exit();
1023         return i != 4;
1024 }
1025
1026 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1027 {
1028         print_func_entry();
1029         if (cr0 & CR0_RESEVED_BITS) {
1030                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
1031                        cr0, guest_cr0());
1032                 inject_gp(vcpu);
1033                 print_func_exit();
1034                 return;
1035         }
1036
1037         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
1038                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
1039                 inject_gp(vcpu);
1040                 print_func_exit();
1041                 return;
1042         }
1043
1044         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
1045                 printd("set_cr0: #GP, set PG flag "
1046                        "and a clear PE flag\n");
1047                 inject_gp(vcpu);
1048                 print_func_exit();
1049                 return;
1050         }
1051
1052         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
1053 #ifdef __x86_64__
1054                 if ((vcpu->shadow_efer & EFER_LME)) {
1055                         uint32_t guest_cs_ar;
1056                         if (!is_pae()) {
1057                                 printd("set_cr0: #GP, start paging "
1058                                        "in long mode while PAE is disabled\n");
1059                                 inject_gp(vcpu);
1060                                 print_func_exit();
1061                                 return;
1062                         }
1063                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1064                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
1065                                 printd("set_cr0: #GP, start paging "
1066                                        "in long mode while CS.L == 1\n");
1067                                 inject_gp(vcpu);
1068                                 print_func_exit();
1069                                 return;
1070
1071                         }
1072                 } else
1073 #endif
1074                 if (is_pae() &&
1075                             pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1076                         printd("set_cr0: #GP, pdptrs "
1077                                "reserved bits\n");
1078                         inject_gp(vcpu);
1079                         print_func_exit();
1080                         return;
1081                 }
1082
1083         }
1084
1085         __set_cr0(vcpu, cr0);
1086         litevm_mmu_reset_context(vcpu);
1087         print_func_exit();
1088         return;
1089 }
1090
1091 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
1092 {
1093         print_func_entry();
1094         unsigned long cr0 = guest_cr0();
1095
1096         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
1097                 enter_pmode(vcpu);
1098                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
1099
1100         } else
1101                 printd("lmsw: unexpected\n");
1102
1103         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
1104                                 | (msw & LMSW_GUEST_MASK));
1105         print_func_exit();
1106 }
1107
1108 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1109 {
1110         print_func_entry();
1111         vmcs_writel(CR4_READ_SHADOW, cr4);
1112         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1113                     LITEVM_RMODE_VM_CR4_ALWAYS_ON : LITEVM_PMODE_VM_CR4_ALWAYS_ON));
1114         print_func_exit();
1115 }
1116
1117 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1118 {
1119         print_func_entry();
1120         if (cr4 & CR4_RESEVED_BITS) {
1121                 printd("set_cr4: #GP, reserved bits\n");
1122                 inject_gp(vcpu);
1123                 print_func_exit();
1124                 return;
1125         }
1126
1127         if (is_long_mode()) {
1128                 if (!(cr4 & CR4_PAE_MASK)) {
1129                         printd("set_cr4: #GP, clearing PAE while "
1130                                "in long mode\n");
1131                         inject_gp(vcpu);
1132                         print_func_exit();
1133                         return;
1134                 }
1135         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
1136                    && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1137                 printd("set_cr4: #GP, pdptrs reserved bits\n");
1138                 inject_gp(vcpu);
1139         }
1140
1141         if (cr4 & CR4_VMXE_MASK) {
1142                 printd("set_cr4: #GP, setting VMXE\n");
1143                 inject_gp(vcpu);
1144                 print_func_exit();
1145                 return;
1146         }
1147         __set_cr4(vcpu, cr4);
1148         spin_lock_irqsave(&vcpu->litevm->lock);
1149         litevm_mmu_reset_context(vcpu);
1150         spin_unlock(&vcpu->litevm->lock);
1151         print_func_exit();
1152 }
1153
1154 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1155 {
1156         print_func_entry();
1157         if (is_long_mode()) {
1158                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
1159                         printd("set_cr3: #GP, reserved bits\n");
1160                         inject_gp(vcpu);
1161                         print_func_exit();
1162                         return;
1163                 }
1164         } else {
1165                 if (cr3 & CR3_RESEVED_BITS) {
1166                         printd("set_cr3: #GP, reserved bits\n");
1167                         inject_gp(vcpu);
1168                         print_func_exit();
1169                         return;
1170                 }
1171                 if (is_paging() && is_pae() &&
1172                     pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1173                         printd("set_cr3: #GP, pdptrs "
1174                                "reserved bits\n");
1175                         inject_gp(vcpu);
1176                         print_func_exit();
1177                         return;
1178                 }
1179         }
1180
1181         vcpu->cr3 = cr3;
1182         spin_lock_irqsave(&vcpu->litevm->lock);
1183         vcpu->mmu.new_cr3(vcpu);
1184         spin_unlock(&vcpu->litevm->lock);
1185         print_func_exit();
1186 }
1187
1188 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1189 {
1190         print_func_entry();
1191         if ( cr8 & CR8_RESEVED_BITS) {
1192                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1193                 inject_gp(vcpu);
1194                 print_func_exit();
1195                 return;
1196         }
1197         vcpu->cr8 = cr8;
1198         print_func_exit();
1199 }
1200
1201 static uint32_t get_rdx_init_val(void)
1202 {
1203         print_func_entry();
1204         uint32_t val;
1205
1206         asm ("movl $1, %%eax \n\t"
1207              "movl %%eax, %0 \n\t" : "=g"(val) );
1208         print_func_exit();
1209         return val;
1210
1211 }
1212
1213 static void fx_init(struct litevm_vcpu *vcpu)
1214 {
1215         print_func_entry();
1216         struct __attribute__ ((__packed__)) fx_image_s {
1217                 uint16_t control; //fcw
1218                 uint16_t status; //fsw
1219                 uint16_t tag; // ftw
1220                 uint16_t opcode; //fop
1221                 uint64_t ip; // fpu ip
1222                 uint64_t operand;// fpu dp
1223                 uint32_t mxcsr;
1224                 uint32_t mxcsr_mask;
1225
1226         } *fx_image;
1227
1228         fx_save(vcpu->host_fx_image);
1229         fpu_init();
1230         fx_save(vcpu->guest_fx_image);
1231         fx_restore(vcpu->host_fx_image);
1232
1233         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1234         fx_image->mxcsr = 0x1f80;
1235         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1236                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1237         print_func_exit();
1238 }
1239
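/*
 * Each VMX capability MSR consulted here (IA32_VMX_PINBASED_CTLS and friends)
 * encodes the legal settings of its control field: the low 32 bits are the
 * control bits that must be 1 and the high 32 bits those that may be 1.
 * Masking with the high half and ORing in the low half clamps the requested
 * value to what the CPU allows.
 */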
1240 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field, uint32_t val)
1241 {
1242         print_func_entry();
1243         uint32_t msr_high, msr_low;
1244         uint64_t msrval;
1245
1246         msrval = read_msr(msr);
1247         msr_low = msrval;
1248         msr_high = (msrval>>32);
1249
1250         val &= msr_high;
1251         val |= msr_low;
1252         vmcs_write32(vmcs_field, val);
1253         print_func_exit();
1254 }
1255
1256 /*
1257  * Sets up the vmcs for emulated real mode.
1258  */
1259 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1260 {
1261         print_func_entry();
1262 /* no op on x86_64 */
1263 #define asmlinkage
1264         extern asmlinkage void litevm_vmx_return(void);
1265         uint32_t host_sysenter_cs;
1266         uint32_t junk;
1267         uint64_t a;
1268         struct descriptor_table dt;
1269         int i;
1270         int ret;
1271         uint64_t tsc;
1272         int nr_good_msrs;
1273
1274
1275         if (!init_rmode_tss(vcpu->litevm)) {
1276                 error("vcpu_setup: init_rmode_tss failed");
1277         }
1278
1279         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1280         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1281         vcpu->cr8 = 0;
1282         vcpu->apic_base = 0xfee00000 |
1283                         /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1284                         MSR_IA32_APICBASE_ENABLE;
1285
1286         fx_init(vcpu);
1287
1288 #define SEG_SETUP(seg) do {                                     \
1289                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1290                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1291                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1292                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1293         } while (0)
1294
1295         /*
1296          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1297          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1298          */
1299         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1300         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1301         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1302         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1303
1304         SEG_SETUP(DS);
1305         SEG_SETUP(ES);
1306         SEG_SETUP(FS);
1307         SEG_SETUP(GS);
1308         SEG_SETUP(SS);
1309
1310         vmcs_write16(GUEST_TR_SELECTOR, 0);
1311         vmcs_writel(GUEST_TR_BASE, 0);
1312         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1313         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1314
1315         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1316         vmcs_writel(GUEST_LDTR_BASE, 0);
1317         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1318         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1319
1320         vmcs_write32(GUEST_SYSENTER_CS, 0);
1321         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1322         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1323
1324         vmcs_writel(GUEST_RFLAGS, 0x02);
1325         vmcs_writel(GUEST_RIP, 0xfff0);
1326         vmcs_writel(GUEST_RSP, 0);
1327
1328         vmcs_writel(GUEST_CR3, 0);
1329
1330         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1331         vmcs_writel(GUEST_DR7, 0x400);
1332
1333         vmcs_writel(GUEST_GDTR_BASE, 0);
1334         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1335
1336         vmcs_writel(GUEST_IDTR_BASE, 0);
1337         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1338
1339         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1340         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1341         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1342
1343         /* I/O */
1344         vmcs_write64(IO_BITMAP_A, 0);
1345         vmcs_write64(IO_BITMAP_B, 0);
1346
1347         tsc = read_tsc();
1348         vmcs_write64(TSC_OFFSET, -tsc);
1349
1350         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1351
1352         /* Special registers */
1353         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1354
1355         /* Control */
1356         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
1357                                PIN_BASED_VM_EXEC_CONTROL,
1358                                PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
1359                                | PIN_BASED_NMI_EXITING   /* 20.6.1 */
1360                         );
1361         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
1362                                CPU_BASED_VM_EXEC_CONTROL,
1363                                CPU_BASED_HLT_EXITING         /* 20.6.2 */
1364                                | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
1365                                | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
1366                                | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
1367                                | CPU_BASED_INVDPG_EXITING
1368                                | CPU_BASED_MOV_DR_EXITING
1369                                | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
1370                         );
1371
1372         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1373         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1374         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1375         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1376
1377         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1378         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1379         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1380
1381 #warning "not setting selectors; do we need them?"
1382 #if 0
1383         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
1384         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1385         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1386 #endif
1387         vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
1388         vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
1389 #if 0
1390         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1391 #endif
1392 #ifdef __x86_64__
1393         a = read_msr(MSR_FS_BASE);
1394         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1395         a = read_msr(MSR_GS_BASE);
1396         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1397 #else
1398         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1399         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1400 #endif
1401
1402 #warning "Not setting HOST_TR_SELECTOR"
1403 #if 0
1404         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
1405 #endif
1406
1407         get_idt(&dt);
1408         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1409
1410
1411         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */
1412
1413         /* it's the HIGH 32 bits! */
1414         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1415         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1416         a = read_msr(MSR_IA32_SYSENTER_ESP);
1417         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
1418         a = read_msr(MSR_IA32_SYSENTER_EIP);
1419         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
1420
1421         ret = -ENOMEM;
1422         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1423         if (!vcpu->guest_msrs)
1424                 error("guest_msrs kmalloc failed");
1425         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1426         if (!vcpu->host_msrs)
1427                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1428
1429         for (i = 0; i < NR_VMX_MSR; ++i) {
1430                 uint32_t index = vmx_msr_index[i];
1431                 uint32_t data_low, data_high;
1432                 uint64_t data;
1433                 int j = vcpu->nmsrs;
1434
1435 #warning "need readmsr_safe"
1436 //              if (rdmsr_safe(index, &data_low, &data_high) < 0)
1437 //                      continue;
1438                 data = read_msr(index);
1439                 vcpu->host_msrs[j].index = index;
1440                 vcpu->host_msrs[j].reserved = 0;
1441                 vcpu->host_msrs[j].data = data;
1442                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1443                 ++vcpu->nmsrs;
1444         }
1445         printk("msrs: %d\n", vcpu->nmsrs);
1446
1447         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1448         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
1449                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1450         vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1451                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1452         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1453                     PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1454         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
1455                                (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */
1456         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1457         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
1458         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1459
1460
1461         /* 22.2.1, 20.8.1 */
1462         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
1463                                VM_ENTRY_CONTROLS, 0);
1464         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1465
1466         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1467         vmcs_writel(TPR_THRESHOLD, 0);
1468
1469         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1470         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1471
1472         __set_cr0(vcpu, 0x60000010); // enter rmode
1473         __set_cr4(vcpu, 0);
1474 #ifdef __x86_64__
1475         __set_efer(vcpu, 0);
1476 #endif
1477
1478         ret = litevm_mmu_init(vcpu);
1479
1480         print_func_exit();
1481         return ret;
1482
1483 out_free_guest_msrs:
1484         kfree(vcpu->guest_msrs);
1485 out:
1486         return ret;
1487 }
1488
1489 /*
1490  * Sync the rsp and rip registers into the vcpu structure.  This allows
1491  * registers to be accessed by indexing vcpu->regs.
1492  */
1493 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1494 {
1495         print_func_entry();
1496         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1497         vcpu->rip = vmcs_readl(GUEST_RIP);
1498         print_func_exit();
1499 }
1500
1501 /*
1502  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1503  * modification.
1504  */
1505 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1506 {
1507         print_func_entry();
1508         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1509         vmcs_writel(GUEST_RIP, vcpu->rip);
1510         print_func_exit();
1511 }
1512
1513 /*
1514  * Creates some virtual cpus.  Good luck creating more than one.
1515  */
1516 int vmx_create_vcpu(struct litevm *litevm, int n)
1517 {
1518         print_func_entry();
1519         ERRSTACK(1);
1520         int r;
1521         struct litevm_vcpu *vcpu;
1522         struct vmcs *vmcs;
1523         char *errstring = NULL;
1524
1525         if (n < 0 || n >= LITEVM_MAX_VCPUS){
1526                 printk("%d is out of range; LITEVM_MAX_VCPUS is %d\n", n, LITEVM_MAX_VCPUS);
1527                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n, LITEVM_MAX_VCPUS);
1528         }
1529
1530         vcpu = &litevm->vcpus[n];
1531
1532         qlock(&vcpu->mutex);
1533
1534         if (vcpu->vmcs) {
1535                 qunlock(&vcpu->mutex);
1536                 printk("VM already exists\n");
1537                 error("VM already exists");
1538         }
1539
1540         /* Manually round fx_buf up to FX_IMAGE_ALIGN; no ALIGN() helper is used here. */
1541         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1542         uint64_t a = (uint64_t) vcpu->fx_buf;
1543         a += FX_IMAGE_ALIGN-1;
1544         a /= FX_IMAGE_ALIGN;
1545         a *= FX_IMAGE_ALIGN;
1546
1547         vcpu->host_fx_image = (char*)a;
1548         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1549
1550         vcpu->cpu = -1;  /* First load will set up TR */
1551         vcpu->litevm = litevm;
1552
1553         vmcs = alloc_vmcs();
1554         if (!vmcs) {
1555                 errstring = "vmcs allocate failed";
1556                 printk("%s\n", errstring);
1557                 qunlock(&vcpu->mutex);
1558                 goto out_free_vcpus;
1559         }
1560         vmcs_clear(vmcs);
1561         printk("after vmcs_clear\n");
1562         vcpu->vmcs = vmcs;
1563         vcpu->launched = 0;
1564         printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
1565         error("before vcpu_load");
1566         __vcpu_load(vcpu);
1567
1568         printk("PAST vcpu_load\n");
1569         #warning unmatched waserror!
1570         if (waserror()){
1571                 /* we really need to fix waserror() */
1572                 poperror();
1573                 goto out_free_vcpus;
1574         }
1575
1576         r = litevm_vcpu_setup(vcpu);
1577
1578         vcpu_put(vcpu);
1579
1580         printk("r is %d\n", r);
1581
1582         if (! r) {
1583                 
1584                 print_func_exit();
1585                 return 0;
1586         }
1587
1588         errstring = "vcpu setup failed";
1589
1590 out_free_vcpus:
1591         printk("out_free_vcpus: life sucks\n");
1592         litevm_free_vcpu(vcpu);
1593         error(errstring);
1594 out:
1595         print_func_exit();
1596         return r;
1597 }
1598
1599 /*
1600  * Allocate some memory and give it an address in the guest physical address
1601  * space.
1602  *
1603  * Discontiguous memory is allowed, mostly for framebuffers.
1604  */
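/*
 * A minimal usage sketch (field names taken from the checks below; the sizes
 * and values are purely illustrative):
 *
 *      struct litevm_memory_region mem = {
 *              .slot = 0,
 *              .flags = 0,
 *              .guest_phys_addr = 0,
 *              .memory_size = 16 * PAGE_SIZE,
 *              .init_data = NULL,
 *      };
 *      vm_set_memory_region(litevm, &mem);
 */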
1605 int vm_set_memory_region(struct litevm *litevm,
1606                                            struct litevm_memory_region *mem)
1607 {
1608         print_func_entry();
1609         ERRSTACK(2);
1610         int r;
1611         gfn_t base_gfn;
1612         unsigned long npages;
1613         unsigned long i;
1614         struct litevm_memory_slot *memslot;
1615         struct litevm_memory_slot old, new;
1616         int memory_config_version;
1617         void *init_data = mem->init_data;
1618         int pass = 1;
1619
1620         printk("litevm %p\n", litevm);
1621         /* should not happen but ... */
1622         if (! litevm)
1623                 error("NULL litevm in %s", __func__);
1624
1625         if (!mem)
1626                 error("NULL mem in %s", __func__);
1627
1628         if (litevm->busy)
1629                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1630         r = -EINVAL;
1631         /* General sanity checks */
1632         if (mem->memory_size & (PAGE_SIZE - 1))
1633                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1634         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1635                 error("guest_phys_addr 0x%llx is not page-aligned", mem->guest_phys_addr);
1636         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1637                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1638         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1639                 error("0x%llx + 0x%llx is < 0x%llx",
1640                       mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1641
1642         memslot = &litevm->memslots[mem->slot];
1643         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1644         npages = mem->memory_size >> PAGE_SHIFT;
1645
1646         if (!npages)
1647                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1648
1649         /* This is actually a very tricky loop: on a configuration race we
1650          * jump back to 'raced' and retry.  error() is a bit dangerous here,
1651          * so we don't use it much; consider a rewrite.  It would be nice if
1652          * Akaros could allocate a bunch of pages for us in one go.
1653          */
1654 raced:
1655         printk("raced: pass %d\n", pass++);
1656         spin_lock_irqsave(&litevm->lock);
1657         printk("locked\n");
1658
1659         if (waserror()){
1660                 spin_unlock(&litevm->lock);
1661                 nexterror();
1662         }
1663                 
1664         memory_config_version = litevm->memory_config_version;
1665         new = old = *memslot;
1666
1667         new.base_gfn = base_gfn;
1668         new.npages = npages;
1669         new.flags = mem->flags;
1670
1671         /* Disallow changing a memory slot's size. */
1672         r = -EINVAL;
1673         if (npages && old.npages && npages != old.npages)
1674                 error("npages is %lu, old.npages is %lu, can't change",
1675                       npages, old.npages);
1676
1677         /* Check for overlaps */
1678         r = -EEXIST;
1679         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1680                 struct litevm_memory_slot *s = &litevm->memslots[i];
1681
1682                 if (s == memslot)
1683                         continue;
1684                 if (!((base_gfn + npages <= s->base_gfn) ||
1685                       (base_gfn >= s->base_gfn + s->npages)))
1686                         error("Overlap");
1687         }
1688         /*
1689          * Do memory allocations outside lock.  memory_config_version will
1690          * detect any races.
1691          */
1692         spin_unlock(&litevm->lock);
1693         printk("unlocked\n");
1694         poperror();
1695
1696         /* Deallocate if slot is being removed */
1697         if (!npages)
1698                 new.phys_mem = 0;
1699
1700         /* Free page dirty bitmap if unneeded */
1701         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1702                 new.dirty_bitmap = 0;
1703
1704         r = -ENOMEM;
1705
1706         /* Allocate if a slot is being created */
1707         if (npages && !new.phys_mem) {
1708                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1709
1710                 if (!new.phys_mem)
1711                         goto out_free;
1712
1713                 for (i = 0; i < npages; ++i) {
1714                         int ret;
1715                         ret = kpage_alloc(&new.phys_mem[i]);
1716                         if (ret != ESUCCESS)
1717                                 goto out_free;
1718                         if (init_data){
1719                                 printk("init data memcpy(%p,%p,4096);\n",
1720                                        page2kva(new.phys_mem[i]), init_data);
1721                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1722                                 init_data += PAGE_SIZE;
1723                         }
1724                 }
1725         }
1726
1727         /* Allocate page dirty bitmap if needed */
1728         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1729                 unsigned dirty_bytes;//ALIGN(npages, BITS_PER_LONG) / 8;
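                     /* One dirty bit per page, rounded up to a whole number of
                      * longs and expressed in bytes. */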
1730                 dirty_bytes = (((npages + BITS_PER_LONG-1)/BITS_PER_LONG)*BITS_PER_LONG)/8;
1731
1732                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1733                 if (!new.dirty_bitmap){
1734                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1735                         goto out_free;
1736                 }
1737         }
1738
1739         spin_lock_irqsave(&litevm->lock);
1740         printk("locked\n");
1741         if (memory_config_version != litevm->memory_config_version) {
1742                 spin_unlock(&litevm->lock);
1743                 printk("unlocked, try again\n");
1744                 litevm_free_physmem_slot(&new, &old);
1745                 goto raced;
1746         }
1747
1748         r = -EAGAIN;
1749         if (litevm->busy){
1750                 printk("BUSY!\n");
1751                 goto out_unlock;
1752         }
1753
1754         if (mem->slot >= litevm->nmemslots)
1755                 litevm->nmemslots = mem->slot + 1;
1756
1757         *memslot = new;
1758         ++litevm->memory_config_version;
1759
1760         spin_unlock(&litevm->lock);
1761         printk("unlocked\n");
1762         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1763                 struct litevm_vcpu *vcpu;
1764
1765                 vcpu = vcpu_load(litevm, i);
1766                 if (!vcpu)
1767                         continue;
1768                 litevm_mmu_reset_context(vcpu);
1769                 vcpu_put(vcpu);
1770         }
1771
1772         litevm_free_physmem_slot(&old, &new);
1773         print_func_exit();
1774         return 0;
1775
1776 out_unlock:
1777         spin_unlock(&litevm->lock);
1778         printk("out_unlock\n");
1779 out_free:
1780         printk("out_free\n");
1781         litevm_free_physmem_slot(&new, &old);
1782 out:
1783         printk("vm_set_memory_region: return %d\n", r);
1784         print_func_exit();
1785         return r;
1786 }
1787
1788 #if 0
1789 /*
1790  * Get (and clear) the dirty memory log for a memory slot.
1791  */
1792 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1793                                        struct litevm_dirty_log *log)
1794 {
1795         struct litevm_memory_slot *memslot;
1796         int r, i;
1797         int n;
1798         unsigned long any = 0;
1799
1800         spin_lock_irqsave(&litevm->lock);
1801
1802         /*
1803          * Prevent changes to guest memory configuration even while the lock
1804          * is not taken.
1805          */
1806         ++litevm->busy;
1807         spin_unlock(&litevm->lock);
1808         r = -EINVAL;
1809         if (log->slot >= LITEVM_MEMORY_SLOTS)
1810                 goto out;
1811
1812         memslot = &litevm->memslots[log->slot];
1813         r = -ENOENT;
1814         if (!memslot->dirty_bitmap)
1815                 goto out;
1816
1817         n = ALIGN(memslot->npages, 8) / 8;
1818
1819         for (i = 0; !any && i < n; ++i)
1820                 any = memslot->dirty_bitmap[i];
1821
1822         r = -EFAULT;
1823         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1824                 goto out;
1825
1826
1827         if (any) {
1828                 spin_lock_irqsave(&litevm->lock);
1829                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1830                 spin_unlock(&litevm->lock);
1831                 memset(memslot->dirty_bitmap, 0, n);
1832                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1833                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1834
1835                         if (!vcpu)
1836                                 continue;
1837                         flush_guest_tlb(vcpu);
1838                         vcpu_put(vcpu);
1839                 }
1840         }
1841
1842         r = 0;
1843
1844 out:
1845         spin_lock_irqsave(&litevm->lock);
1846         --litevm->busy;
1847         spin_unlock(&litevm->lock);
1848         return r;
1849 }
1850 #endif
1851
1852 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1853 {
1854         print_func_entry();
1855         int i;
1856
1857         for (i = 0; i < litevm->nmemslots; ++i) {
1858                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1859
1860                 if (gfn >= memslot->base_gfn
1861                     && gfn < memslot->base_gfn + memslot->npages) {
1862                         print_func_exit();
1863                         return memslot;
1864                 }
1865         }
1866         print_func_exit();
1867         return 0;
1868 }
1869
1870 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1871 {
1872         print_func_entry();
1873         int i;
1874         struct litevm_memory_slot *memslot = 0;
1875         unsigned long rel_gfn;
1876
1877         for (i = 0; i < litevm->nmemslots; ++i) {
1878                 memslot = &litevm->memslots[i];
1879
1880                 if (gfn >= memslot->base_gfn
1881                     && gfn < memslot->base_gfn + memslot->npages) {
1882
1883                         if (!memslot || !memslot->dirty_bitmap) {
1884                                 print_func_exit();
1885                                 return;
1886                         }
1887
1888                         rel_gfn = gfn - memslot->base_gfn;
1889
1890                         /* avoid RMW */
1891                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1892                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1893                         print_func_exit();
1894                         return;
1895                 }
1896         }
1897         print_func_exit();
1898 }
1899
1900 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1901 {
1902         print_func_entry();
1903         unsigned long rip;
1904         uint32_t interruptibility;
1905
1906         rip = vmcs_readl(GUEST_RIP);
1907         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1908         vmcs_writel(GUEST_RIP, rip);
1909
1910         /*
1911          * We emulated an instruction, so temporary interrupt blocking
1912          * should be removed, if set.
1913          */
1914         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1915         if (interruptibility & 3)
1916                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
1917                              interruptibility & ~3);
1918         print_func_exit();
1919 }
1920
1921 static int emulator_read_std(unsigned long addr,
1922                              unsigned long *val,
1923                              unsigned int bytes,
1924                              struct x86_emulate_ctxt *ctxt)
1925 {
1926         print_func_entry();
1927         struct litevm_vcpu *vcpu = ctxt->vcpu;
1928         void *data = val;
1929
1930         while (bytes) {
1931                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1932                 unsigned offset = addr & (PAGE_SIZE-1);
1933                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ? 
1934                         bytes : (unsigned)PAGE_SIZE - offset;
1935                 unsigned long pfn;
1936                 struct litevm_memory_slot *memslot;
1937                 void *page;
1938
1939                 if (gpa == UNMAPPED_GVA) {
1940                         print_func_exit();
1941                         return X86EMUL_PROPAGATE_FAULT;
1942                 }
1943                 pfn = gpa >> PAGE_SHIFT;
1944                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1945                 if (!memslot) {
1946                         print_func_exit();
1947                         return X86EMUL_UNHANDLEABLE;
1948                 }
1949                 page = page2kva(gfn_to_page(memslot, pfn));
1950
1951                 memcpy(data, page + offset, tocopy);
1952
1953                 bytes -= tocopy;
1954                 data += tocopy;
1955                 addr += tocopy;
1956         }
1957
1958         print_func_exit();
1959         return X86EMUL_CONTINUE;
1960 }
1961
1962 static int emulator_write_std(unsigned long addr,
1963                               unsigned long val,
1964                               unsigned int bytes,
1965                               struct x86_emulate_ctxt *ctxt)
1966 {
1967         print_func_entry();
1968         printk("emulator_write_std: addr %lx n %d\n",
1969                addr, bytes);
1970         print_func_exit();
1971         return X86EMUL_UNHANDLEABLE;
1972 }
1973
1974 static int emulator_read_emulated(unsigned long addr,
1975                                   unsigned long *val,
1976                                   unsigned int bytes,
1977                                   struct x86_emulate_ctxt *ctxt)
1978 {
1979         print_func_entry();
1980         struct litevm_vcpu *vcpu = ctxt->vcpu;
1981
1982         if (vcpu->mmio_read_completed) {
1983                 memcpy(val, vcpu->mmio_data, bytes);
1984                 vcpu->mmio_read_completed = 0;
1985                 print_func_exit();
1986                 return X86EMUL_CONTINUE;
1987         } else if (emulator_read_std(addr, val, bytes, ctxt)
1988                    == X86EMUL_CONTINUE) {
1989                 print_func_exit();
1990                 return X86EMUL_CONTINUE;
1991         }
1992         else {
1993                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1994                 if (gpa == UNMAPPED_GVA) {
1995                         print_func_exit();
1996                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
1997                 }
1998                 vcpu->mmio_needed = 1;
1999                 vcpu->mmio_phys_addr = gpa;
2000                 vcpu->mmio_size = bytes;
2001                 vcpu->mmio_is_write = 0;
2002
2003                 print_func_exit();
2004                 return X86EMUL_UNHANDLEABLE;
2005         }
2006 }
2007
2008 static int emulator_write_emulated(unsigned long addr,
2009                                    unsigned long val,
2010                                    unsigned int bytes,
2011                                    struct x86_emulate_ctxt *ctxt)
2012 {
2013         print_func_entry();
2014         struct litevm_vcpu *vcpu = ctxt->vcpu;
2015         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2016
2017         if (gpa == UNMAPPED_GVA) {
2018                 print_func_exit();
2019                 return X86EMUL_PROPAGATE_FAULT;
2020         }
2021
2022         vcpu->mmio_needed = 1;
2023         vcpu->mmio_phys_addr = gpa;
2024         vcpu->mmio_size = bytes;
2025         vcpu->mmio_is_write = 1;
2026         memcpy(vcpu->mmio_data, &val, bytes);
2027
2028         print_func_exit();
2029         return X86EMUL_CONTINUE;
2030 }
2031
2032 static int emulator_cmpxchg_emulated(unsigned long addr,
2033                                      unsigned long old,
2034                                      unsigned long new,
2035                                      unsigned int bytes,
2036                                      struct x86_emulate_ctxt *ctxt)
2037 {
2038         print_func_entry();
2039         static int reported;
2040
2041         if (!reported) {
2042                 reported = 1;
2043                 printk("litevm: emulating exchange as write\n");
2044         }
2045         print_func_exit();
2046         return emulator_write_emulated(addr, new, bytes, ctxt);
2047 }
2048
2049 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2050 {
2051         print_func_entry();
2052         static int reported;
2053         uint8_t opcodes[4];
2054         unsigned long rip = vmcs_readl(GUEST_RIP);
2055         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2056
2057         if (reported) {
2058                 print_func_exit();
2059                 return;
2060         }
2061
2062         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2063
2064         printk("emulation failed but !mmio_needed?"
2065                " rip %lx %02x %02x %02x %02x\n",
2066                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2067         reported = 1;
2068         print_func_exit();
2069 }
2070
2071 struct x86_emulate_ops emulate_ops = {
2072         .read_std            = emulator_read_std,
2073         .write_std           = emulator_write_std,
2074         .read_emulated       = emulator_read_emulated,
2075         .write_emulated      = emulator_write_emulated,
2076         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2077 };
2078
2079 enum emulation_result {
2080         EMULATE_DONE,       /* no further processing */
2081         EMULATE_DO_MMIO,      /* litevm_run filled with mmio request */
2082         EMULATE_FAIL,         /* can't emulate this instruction */
2083 };
2084
2085 static int emulate_instruction(struct litevm_vcpu *vcpu,
2086                                struct litevm_run *run,
2087                                unsigned long cr2,
2088                                uint16_t error_code)
2089 {
2090         print_func_entry();
2091         struct x86_emulate_ctxt emulate_ctxt;
2092         int r;
2093         uint32_t cs_ar;
2094
2095         vcpu_load_rsp_rip(vcpu);
2096
2097         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2098
2099         emulate_ctxt.vcpu = vcpu;
2100         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2101         emulate_ctxt.cr2 = cr2;
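             /* Pick the emulation mode from guest state: the VM flag selects
              * real mode, CS.L selects 64-bit mode, CS.D/B selects 32-bit,
              * otherwise we emulate 16-bit protected mode. */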
2102         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2103                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2104                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2105                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2106
2107         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2108                 emulate_ctxt.cs_base = 0;
2109                 emulate_ctxt.ds_base = 0;
2110                 emulate_ctxt.es_base = 0;
2111                 emulate_ctxt.ss_base = 0;
2112                 emulate_ctxt.gs_base = 0;
2113                 emulate_ctxt.fs_base = 0;
2114         } else {
2115                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2116                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2117                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2118                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2119                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2120                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2121         }
2122
2123         vcpu->mmio_is_write = 0;
2124         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2125
2126         if ((r || vcpu->mmio_is_write) && run) {
2127                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2128                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2129                 run->mmio.len = vcpu->mmio_size;
2130                 run->mmio.is_write = vcpu->mmio_is_write;
2131         }
2132
2133         if (r) {
2134                 if (!vcpu->mmio_needed) {
2135                         report_emulation_failure(&emulate_ctxt);
2136                         print_func_exit();
2137                         return EMULATE_FAIL;
2138                 }
2139                 print_func_exit();
2140                 return EMULATE_DO_MMIO;
2141         }
2142
2143         vcpu_put_rsp_rip(vcpu);
2144         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2145
2146         if (vcpu->mmio_is_write) {
2147                 print_func_exit();
2148                 return EMULATE_DO_MMIO;
2149         }
2150
2151         print_func_exit();
2152         return EMULATE_DONE;
2153 }
2154
2155 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2156 {
2157         print_func_entry();
2158         print_func_exit();
2159         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2160 }
2161
2162 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2163 {
2164         print_func_entry();
2165         vmcs_writel(GUEST_GDTR_BASE, base);
2166         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2167         print_func_exit();
2168 }
2169
2170 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2171 {
2172         print_func_entry();
2173         vmcs_writel(GUEST_IDTR_BASE, base);
2174         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2175         print_func_exit();
2176 }
2177
2178 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2179                    unsigned long *rflags)
2180 {
2181         print_func_entry();
2182         lmsw(vcpu, msw);
2183         *rflags = vmcs_readl(GUEST_RFLAGS);
2184         print_func_exit();
2185 }
2186
2187 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2188 {
2189         print_func_entry();
2190         switch (cr) {
2191         case 0:
2192                 print_func_exit();
2193                 return guest_cr0();
2194         case 2:
2195                 print_func_exit();
2196                 return vcpu->cr2;
2197         case 3:
2198                 print_func_exit();
2199                 return vcpu->cr3;
2200         case 4:
2201                 print_func_exit();
2202                 return guest_cr4();
2203         default:
2204                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2205                 print_func_exit();
2206                 return 0;
2207         }
2208 }
2209
2210 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2211                      unsigned long *rflags)
2212 {
2213         print_func_entry();
2214         switch (cr) {
2215         case 0:
2216                 set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2217                 *rflags = vmcs_readl(GUEST_RFLAGS);
2218                 break;
2219         case 2:
2220                 vcpu->cr2 = val;
2221                 break;
2222         case 3:
2223                 set_cr3(vcpu, val);
2224                 break;
2225         case 4:
2226                 set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2227                 break;
2228         default:
2229                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2230         }
2231         print_func_exit();
2232 }
2233
2234 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2235                                   int vec, uint32_t err_code)
2236 {
2237         print_func_entry();
2238         if (!vcpu->rmode.active) {
2239                 print_func_exit();
2240                 return 0;
2241         }
2242
2243         if (vec == GP_VECTOR && err_code == 0)
2244                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2245                         print_func_exit();
2246                         return 1;
2247                 }
2248         print_func_exit();
2249         return 0;
2250 }
2251
2252 static int handle_exception(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2253 {
2254         print_func_entry();
2255         uint32_t intr_info, error_code;
2256         unsigned long cr2, rip;
2257         uint32_t vect_info;
2258         enum emulation_result er;
2259
2260         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2261         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2262
2263         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2264                                                 !is_page_fault(intr_info)) {
2265                 printk("%s: unexpected, vectoring info 0x%x "
2266                        "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2267         }
2268
2269         if (is_external_interrupt(vect_info)) {
2270                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2271                 SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_pending), irq);
2272                 SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_summary), irq / BITS_PER_LONG);
2273         }
2274
2275         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
2276                 asm ("int $2");
2277                 print_func_exit();
2278                 return 1;
2279         }
2280         error_code = 0;
2281         rip = vmcs_readl(GUEST_RIP);
2282         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2283                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2284         if (is_page_fault(intr_info)) {
2285                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2286
2287                 spin_lock_irqsave(&vcpu->litevm->lock);
2288                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2289                         spin_unlock(&vcpu->litevm->lock);
2290                         print_func_exit();
2291                         return 1;
2292                 }
2293
2294                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2295                 spin_unlock(&vcpu->litevm->lock);
2296
2297                 switch (er) {
2298                 case EMULATE_DONE:
2299                         print_func_exit();
2300                         return 1;
2301                 case EMULATE_DO_MMIO:
2302                         ++litevm_stat.mmio_exits;
2303                         litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2304                         print_func_exit();
2305                         return 0;
2306                  case EMULATE_FAIL:
2307                         vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2308                         break;
2309                 default:
2310                         assert(0);
2311                 }
2312         }
2313
2314         if (vcpu->rmode.active &&
2315             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2316                                    error_code)) {
2317                 print_func_exit();
2318                 return 1;
2319         }
2320
2321         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
2322                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2323                 print_func_exit();
2324                 return 0;
2325         }
2326         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2327         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2328         litevm_run->ex.error_code = error_code;
2329         print_func_exit();
2330         return 0;
2331 }
2332
2333 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2334                                      struct litevm_run *litevm_run)
2335 {
2336         print_func_entry();
2337         ++litevm_stat.irq_exits;
2338         print_func_exit();
2339         return 1;
2340 }
2341
2342
2343 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t *count)
2344 {
2345         print_func_entry();
2346         uint64_t inst;
2347         gva_t rip;
2348         int countr_size;
2349         int i, n;
2350
2351         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2352                 countr_size = 2;
2353         } else {
2354                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2355
2356                 countr_size = (cs_ar & AR_L_MASK) ? 8:
2357                               (cs_ar & AR_DB_MASK) ? 4: 2;
2358         }
2359
2360         rip =  vmcs_readl(GUEST_RIP);
2361         if (countr_size != 8)
2362                 rip += vmcs_readl(GUEST_CS_BASE);
2363
2364         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2365
2366         for (i = 0; i < n; i++) {
2367                 switch (((uint8_t*)&inst)[i]) {
2368                 case 0xf0:
2369                 case 0xf2:
2370                 case 0xf3:
2371                 case 0x2e:
2372                 case 0x36:
2373                 case 0x3e:
2374                 case 0x26:
2375                 case 0x64:
2376                 case 0x65:
2377                 case 0x66:
2378                         break;
2379                 case 0x67:
2380                         countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
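                             /* 0x67 is the address-size override prefix: flip
                              * the counter width, then fall through to done. */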
2381                 default:
2382                         goto done;
2383                 }
2384         }
2385         print_func_exit();
2386         return 0;
2387 done:
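             /* countr_size is in bytes here; convert it to bits and mask RCX
              * down to the effective address size to recover the REP count. */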
2388         countr_size *= 8;
2389         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2390         print_func_exit();
2391         return 1;
2392 }
2393
2394 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2395 {
2396         print_func_entry();
2397         uint64_t exit_qualification;
2398
2399         ++litevm_stat.io_exits;
2400         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2401         litevm_run->exit_reason = LITEVM_EXIT_IO;
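             /* Decode the I/O exit qualification: bits 2:0 are the access size
              * minus one, bit 3 the direction (1 = in), bit 4 the string flag,
              * bit 5 the REP prefix, and bits 31:16 the port number. */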
2402         if (exit_qualification & 8)
2403                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2404         else
2405                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2406         litevm_run->io.size = (exit_qualification & 7) + 1;
2407         litevm_run->io.string = (exit_qualification & 16) != 0;
2408         litevm_run->io.string_down
2409                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2410         litevm_run->io.rep = (exit_qualification & 32) != 0;
2411         litevm_run->io.port = exit_qualification >> 16;
2412         if (litevm_run->io.string) {
2413                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2414                         print_func_exit();
2415                         return 1;
2416                 }
2417                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2418         } else
2419                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
2420         print_func_exit();
2421         return 0;
2422 }
2423
2424 static int handle_invlpg(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2425 {
2426         print_func_entry();
2427         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2428         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2429         spin_lock_irqsave(&vcpu->litevm->lock);
2430         vcpu->mmu.inval_page(vcpu, address);
2431         spin_unlock(&vcpu->litevm->lock);
2432         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2433         print_func_exit();
2434         return 1;
2435 }
2436
2437 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2438 {
2439         print_func_entry();
2440         uint64_t exit_qualification;
2441         int cr;
2442         int reg;
2443
2444 #ifdef LITEVM_DEBUG
2445         if (guest_cpl() != 0) {
2446                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2447                 inject_gp(vcpu);
2448                 print_func_exit();
2449                 return 1;
2450         }
2451 #endif
2452
2453         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
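             /* Exit qualification for CR accesses: bits 3:0 give the control
              * register, bits 5:4 the access type (0 = mov to cr, 1 = mov from
              * cr, 3 = lmsw), and bits 11:8 the general register involved. */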
2454         cr = exit_qualification & 15;
2455         reg = (exit_qualification >> 8) & 15;
2456         switch ((exit_qualification >> 4) & 3) {
2457         case 0: /* mov to cr */
2458                 switch (cr) {
2459                 case 0:
2460                         vcpu_load_rsp_rip(vcpu);
2461                         set_cr0(vcpu, vcpu->regs[reg]);
2462                         skip_emulated_instruction(vcpu);
2463                         print_func_exit();
2464                         return 1;
2465                 case 3:
2466                         vcpu_load_rsp_rip(vcpu);
2467                         set_cr3(vcpu, vcpu->regs[reg]);
2468                         skip_emulated_instruction(vcpu);
2469                         print_func_exit();
2470                         return 1;
2471                 case 4:
2472                         vcpu_load_rsp_rip(vcpu);
2473                         set_cr4(vcpu, vcpu->regs[reg]);
2474                         skip_emulated_instruction(vcpu);
2475                         print_func_exit();
2476                         return 1;
2477                 case 8:
2478                         vcpu_load_rsp_rip(vcpu);
2479                         set_cr8(vcpu, vcpu->regs[reg]);
2480                         skip_emulated_instruction(vcpu);
2481                         print_func_exit();
2482                         return 1;
2483                 };
2484                 break;
2485         case 1: /*mov from cr*/
2486                 switch (cr) {
2487                 case 3:
2488                         vcpu_load_rsp_rip(vcpu);
2489                         vcpu->regs[reg] = vcpu->cr3;
2490                         vcpu_put_rsp_rip(vcpu);
2491                         skip_emulated_instruction(vcpu);
2492                         print_func_exit();
2493                         return 1;
2494                 case 8:
2495                         printd("handle_cr: read CR8 "
2496                                "cpu erratum AA15\n");
2497                         vcpu_load_rsp_rip(vcpu);
2498                         vcpu->regs[reg] = vcpu->cr8;
2499                         vcpu_put_rsp_rip(vcpu);
2500                         skip_emulated_instruction(vcpu);
2501                         print_func_exit();
2502                         return 1;
2503                 }
2504                 break;
2505         case 3: /* lmsw */
2506                 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2507
2508                 skip_emulated_instruction(vcpu);
2509                 print_func_exit();
2510                 return 1;
2511         default:
2512                 break;
2513         }
2514         litevm_run->exit_reason = 0;
2515         printk("litevm: unhandled control register: op %d cr %d\n",
2516                (int)(exit_qualification >> 4) & 3, cr);
2517         print_func_exit();
2518         return 0;
2519 }
2520
2521 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2522 {
2523         print_func_entry();
2524         uint64_t exit_qualification;
2525         unsigned long val;
2526         int dr, reg;
2527
2528         /*
2529          * FIXME: this code assumes the host is debugging the guest.
2530          *        need to deal with guest debugging itself too.
2531          */
2532         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
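             /* Exit qualification for DR accesses: bits 2:0 give the debug
              * register, bit 4 the direction (1 = mov from dr), and bits 11:8
              * the general register involved. */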
2533         dr = exit_qualification & 7;
2534         reg = (exit_qualification >> 8) & 15;
2535         vcpu_load_rsp_rip(vcpu);
2536         if (exit_qualification & 16) {
2537                 /* mov from dr */
2538                 switch (dr) {
2539                 case 6:
2540                         val = 0xffff0ff0;
2541                         break;
2542                 case 7:
2543                         val = 0x400;
2544                         break;
2545                 default:
2546                         val = 0;
2547                 }
2548                 vcpu->regs[reg] = val;
2549         } else {
2550                 /* mov to dr */
2551         }
2552         vcpu_put_rsp_rip(vcpu);
2553         skip_emulated_instruction(vcpu);
2554         print_func_exit();
2555         return 1;
2556 }
2557
2558 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2559 {
2560         print_func_entry();
2561         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2562         print_func_exit();
2563         return 0;
2564 }
2565
2566 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2567 {
2568         print_func_entry();
2569         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2570         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2571         uint64_t data;
2572
2573         if (guest_cpl() != 0) {
2574                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2575                 inject_gp(vcpu);
2576                 print_func_exit();
2577                 return 1;
2578         }
2579
2580         switch (ecx) {
2581         case MSR_FS_BASE:
2582                 data = vmcs_readl(GUEST_FS_BASE);
2583                 break;
2584         case MSR_GS_BASE:
2585                 data = vmcs_readl(GUEST_GS_BASE);
2586                 break;
2587         case MSR_IA32_SYSENTER_CS:
2588                 data = vmcs_read32(GUEST_SYSENTER_CS);
2589                 break;
2590         case MSR_IA32_SYSENTER_EIP:
2591                 data = vmcs_read32(GUEST_SYSENTER_EIP);
2592                 break;
2593         case MSR_IA32_SYSENTER_ESP:
2594                 data = vmcs_read32(GUEST_SYSENTER_ESP);
2595                 break;
2596         case MSR_IA32_MC0_CTL:
2597         case MSR_IA32_MCG_STATUS:
2598         case MSR_IA32_MCG_CAP:
2599         case MSR_IA32_MC0_MISC:
2600         case MSR_IA32_MC0_MISC+4:
2601         case MSR_IA32_MC0_MISC+8:
2602         case MSR_IA32_MC0_MISC+12:
2603         case MSR_IA32_MC0_MISC+16:
2604         case MSR_IA32_UCODE_REV:
2605                 /* MTRR registers */
2606         case 0xfe:
2607         case 0x200 ... 0x2ff:
2608                 data = 0;
2609                 break;
2610         case MSR_IA32_APICBASE:
2611                 data = vcpu->apic_base;
2612                 break;
2613         default:
2614                 if (msr) {
2615                         data = msr->data;
2616                         break;
2617                 }
2618                 printk("litevm: unhandled rdmsr: %x\n", ecx);
2619                 inject_gp(vcpu);
2620                 print_func_exit();
2621                 return 1;
2622         }
2623
2624         /* FIXME: handling of bits 32:63 of rax, rdx */
2625         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2626         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2627         skip_emulated_instruction(vcpu);
2628         print_func_exit();
2629         return 1;
2630 }
2631
2632 #ifdef __x86_64__
2633
2634 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2635 {
2636         print_func_entry();
2637         struct vmx_msr_entry *msr;
2638
2639         if (efer & EFER_RESERVED_BITS) {
2640                 printd("set_efer: 0x%llx #GP, reserved bits\n",
2641                        efer);
2642                 inject_gp(vcpu);
2643                 print_func_exit();
2644                 return;
2645         }
2646
2647         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2648                 printd("set_efer: #GP, change LME while paging\n");
2649                 inject_gp(vcpu);
2650                 print_func_exit();
2651                 return;
2652         }
2653
2654         efer &= ~EFER_LMA;
2655         efer |= vcpu->shadow_efer & EFER_LMA;
2656
2657         vcpu->shadow_efer = efer;
2658
2659         msr = find_msr_entry(vcpu, MSR_EFER);
2660
2661         if (!(efer & EFER_LMA))
2662             efer &= ~EFER_LME;
2663         msr->data = efer;
2664         skip_emulated_instruction(vcpu);
2665         print_func_exit();
2666 }
2667
2668 #endif
2669
2670 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2671
2672 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2673 {
2674         print_func_entry();
2675         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2676         struct vmx_msr_entry *msr;
2677         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2678                 | ((uint64_t)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2679
2680         if (guest_cpl() != 0) {
2681                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2682                 inject_gp(vcpu);
2683                 print_func_exit();
2684                 return 1;
2685         }
2686
2687         switch (ecx) {
2688         case MSR_FS_BASE:
2689                 vmcs_writel(GUEST_FS_BASE, data);
2690                 break;
2691         case MSR_GS_BASE:
2692                 vmcs_writel(GUEST_GS_BASE, data);
2693                 break;
2694         case MSR_IA32_SYSENTER_CS:
2695                 vmcs_write32(GUEST_SYSENTER_CS, data);
2696                 break;
2697         case MSR_IA32_SYSENTER_EIP:
2698                 vmcs_write32(GUEST_SYSENTER_EIP, data);
2699                 break;
2700         case MSR_IA32_SYSENTER_ESP:
2701                 vmcs_write32(GUEST_SYSENTER_ESP, data);
2702                 break;
2703         case MSR_EFER:
2704                 set_efer(vcpu, data);
2705                 print_func_exit();
2706                 return 1;
2707         case MSR_IA32_MC0_STATUS:
2708                 printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
2709                             , __FUNCTION__, data);
2710                 break;
2711         case MSR_IA32_TIME_STAMP_COUNTER: {
2712                 uint64_t tsc;
2713                 
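                     /* Writes to the guest TSC are emulated by adjusting the
                      * VMCS TSC offset relative to the current host TSC. */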
2714                 tsc = read_tsc();
2715                 vmcs_write64(TSC_OFFSET, data - tsc);
2716                 break;
2717         }
2718         case MSR_IA32_UCODE_REV:
2719         case MSR_IA32_UCODE_WRITE:
2720         case 0x200 ... 0x2ff: /* MTRRs */
2721                 break;
2722         case MSR_IA32_APICBASE:
2723                 vcpu->apic_base = data;
2724                 break;
2725         default:
2726                 msr = find_msr_entry(vcpu, ecx);
2727                 if (msr) {
2728                         msr->data = data;
2729                         break;
2730                 }
2731                 printk("litevm: unhandled wrmsr: %x\n", ecx);
2732                 inject_gp(vcpu);
2733                 print_func_exit();
2734                 return 1;
2735         }
2736         skip_emulated_instruction(vcpu);
2737         print_func_exit();
2738         return 1;
2739 }
2740
2741 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2742                                    struct litevm_run *litevm_run)
2743 {
2744         print_func_entry();
2745         /* Turn off interrupt window reporting. */
2746         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2747                      vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2748                      & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2749         print_func_exit();
2750         return 1;
2751 }
2752
2753 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2754 {
2755         print_func_entry();
2756         skip_emulated_instruction(vcpu);
2757         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2758                 print_func_exit();
2759                 return 1;
2760         }
2761
2762         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2763         print_func_exit();
2764         return 0;
2765 }
2766
2767 /*
2768  * The exit handlers return 1 if the exit was handled fully and guest execution
2769  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2770  * to be done to userspace and return 0.
2771  */
2772 static int (*litevm_vmx_exit_handlers[])(struct litevm_vcpu *vcpu,
2773                                       struct litevm_run *litevm_run) = {
2774         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
2775         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
2776         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
2777         [EXIT_REASON_INVLPG]                  = handle_invlpg,
2778         [EXIT_REASON_CR_ACCESS]               = handle_cr,
2779         [EXIT_REASON_DR_ACCESS]               = handle_dr,
2780         [EXIT_REASON_CPUID]                   = handle_cpuid,
2781         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
2782         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
2783         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
2784         [EXIT_REASON_HLT]                     = handle_halt,
2785 };
2786
2787 static const int litevm_vmx_max_exit_handlers =
2788         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2789
2790 /*
2791  * The guest has exited.  See if we can fix it or if we need userspace
2792  * assistance.
2793  */
2794 static int litevm_handle_exit(struct litevm_run *litevm_run, struct litevm_vcpu *vcpu)
2795 {
2796         print_func_entry();
2797         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2798         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2799
2800         if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
2801                                 exit_reason != EXIT_REASON_EXCEPTION_NMI )
2802                 printk("%s: unexpected, valid vectoring info and "
2803                        "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2804         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2805         if (exit_reason < litevm_vmx_max_exit_handlers
2806             && litevm_vmx_exit_handlers[exit_reason]) {
2807                 print_func_exit();
2808                 return litevm_vmx_exit_handlers[exit_reason](vcpu, litevm_run);
2809         }
2810         else {
2811                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2812                 litevm_run->hw.hardware_exit_reason = exit_reason;
2813         }
2814         print_func_exit();
2815         return 0;
2816 }
2817
2818 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2819 {
2820         print_func_entry();
2821         uint16_t ent[2];
2822         uint16_t cs;
2823         uint16_t ip;
2824         unsigned long flags;
2825         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2826         uint16_t sp =  vmcs_readl(GUEST_RSP);
2827         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2828
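             /* Emulate real-mode interrupt delivery: push FLAGS, CS and IP on
              * the guest stack, then vector through the IVT entry for irq. */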
2829         if (sp > ss_limit || ((sp - 6) > sp)) {
2830                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2831                             __FUNCTION__,
2832                             vmcs_readl(GUEST_RSP),
2833                             vmcs_readl(GUEST_SS_BASE),
2834                             vmcs_read32(GUEST_SS_LIMIT));
2835                 print_func_exit();
2836                 return;
2837         }
2838
2839         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2840                                                                 sizeof(ent)) {
2841                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2842                 print_func_exit();
2843                 return;
2844         }
2845
2846         flags =  vmcs_readl(GUEST_RFLAGS);
2847         cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
2848         ip =  vmcs_readl(GUEST_RIP);
2849
2850
2851         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2852             litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2853             litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2854                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2855                 print_func_exit();
2856                 return;
2857         }
2858
2859         vmcs_writel(GUEST_RFLAGS, flags &
2860                     ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2861         vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
2862         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2863         vmcs_writel(GUEST_RIP, ent[0]);
2864         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2865         print_func_exit();
2866 }
2867
2868 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2869 {
2870         print_func_entry();
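             /* Pick the lowest pending interrupt: irq_summary says which word
              * of irq_pending is non-zero, irq_pending gives the bit within. */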
2871         int word_index = __ffs(vcpu->irq_summary);
2872         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2873         int irq = word_index * BITS_PER_LONG + bit_index;
2874
2875         /* don't have clear_bit and I'm not sure the akaros
2876          * bitops are really going to work.
2877          */
2878         vcpu->irq_pending[word_index] &= ~(1 << bit_index);
2879         if (!vcpu->irq_pending[word_index])
2880                 vcpu->irq_summary &= ~ (1 << word_index);
2881
2882         if (vcpu->rmode.active) {
2883                 inject_rmode_irq(vcpu, irq);
2884                 print_func_exit();
2885                 return;
2886         }
2887         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2888                         irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2889         print_func_exit();
2890 }
2891
2892 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2893 {
2894         print_func_entry();
2895         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2896             && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2897                 /*
2898                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2899                  */
2900                 litevm_do_inject_irq(vcpu);
2901         else
2902                 /*
2903                  * Interrupts blocked.  Wait for unblock.
2904                  */
2905                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2906                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2907                              | CPU_BASED_VIRTUAL_INTR_PENDING);
2908         print_func_exit();
2909 }
2910
2911 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2912 {
2913         print_func_entry();
2914         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2915
2916 #warning "no debugging guests yet"
2917         assert(0);
2918 /*
2919         set_debugreg(dbg->bp[0], 0);
2920         set_debugreg(dbg->bp[1], 1);
2921         set_debugreg(dbg->bp[2], 2);
2922         set_debugreg(dbg->bp[3], 3);
2923 */
2924         if (dbg->singlestep) {
2925                 unsigned long flags;
2926
2927                 flags = vmcs_readl(GUEST_RFLAGS);
2928                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2929                 vmcs_writel(GUEST_RFLAGS, flags);
2930         }
2931         print_func_exit();
2932 }
2933
2934 static void load_msrs(struct vmx_msr_entry *e, int n)
2935 {
2936         print_func_entry();
2937         int i;
2938
2939         for (i = 0; i < n; ++i)
2940                 write_msr(e[i].index, e[i].data);
2941         print_func_exit();
2942 }
2943
2944 static void save_msrs(struct vmx_msr_entry *e, int n)
2945 {
2946         print_func_entry();
2947         int i;
2948
2949         for (i = 0; i < n; ++i)
2950                 e[i].data = read_msr(e[i].index);
2951         print_func_exit();
2952 }
2953
2954 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2955 {
2956         print_func_entry();
2957         struct litevm_vcpu *vcpu;
2958         uint8_t fail;
2959         uint16_t fs_sel, gs_sel, ldt_sel;
2960         int fs_gs_ldt_reload_needed;
2961
2962         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2963                 error("vcpu is %d but must be in the range 0..%d\n",
2964                       litevm_run->vcpu, LITEVM_MAX_VCPUS - 1);
2965
2966         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2967         if (!vcpu)
2968                 error("vcpu_load failed");
2969
2970         if (litevm_run->emulated) {
2971                 skip_emulated_instruction(vcpu);
2972                 litevm_run->emulated = 0;
2973         }
2974
2975         if (litevm_run->mmio_completed) {
2976                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
2977                 vcpu->mmio_read_completed = 1;
2978         }
2979
2980         vcpu->mmio_needed = 0;
2981
2982 again:
2983         /*
2984          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2985          * allow segment selectors with cpl > 0 or ti == 1.
2986          */
2987         fs_sel = read_fs();
2988         gs_sel = read_gs();
2989         ldt_sel = read_ldt();
2990         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
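             /* Selectors with a non-zero RPL or the TI bit set (or a live LDT)
              * cannot be restored via the VMCS host-state fields, so note that
              * we must reload fs/gs/ldt by hand after the VM exit. */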
2991         if (!fs_gs_ldt_reload_needed) {
2992                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2993                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2994         } else {
2995                 vmcs_write16(HOST_FS_SELECTOR, 0);
2996                 vmcs_write16(HOST_GS_SELECTOR, 0);
2997         }
2998
2999 #ifdef __x86_64__
3000         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
3001         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
3002 #endif
3003
3004         if (vcpu->irq_summary &&
3005             !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
3006                 litevm_try_inject_irq(vcpu);
3007
3008         if (vcpu->guest_debug.enabled)
3009                 litevm_guest_debug_pre(vcpu);
3010
3011         fx_save(vcpu->host_fx_image);
3012         fx_restore(vcpu->guest_fx_image);
3013
3014         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
3015         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3016
3017         asm (
3018                 /* Store host registers */
3019                 "pushf \n\t"
3020 #ifdef __x86_64__
3021                 "push %%rax; push %%rbx; push %%rdx;"
3022                 "push %%rsi; push %%rdi; push %%rbp;"
3023                 "push %%r8;  push %%r9;  push %%r10; push %%r11;"
3024                 "push %%r12; push %%r13; push %%r14; push %%r15;"
3025                 "push %%rcx \n\t"
3026                 "vmwrite %%rsp, %2 \n\t"
3027 #else
3028                 "pusha; push %%ecx \n\t"
3029                 "vmwrite %%esp, %2 \n\t"
3030 #endif
3031                 /* Check if vmlaunch or vmresume is needed */
3032                 "cmp $0, %1 \n\t"
3033                 /* Load guest registers.  Don't clobber flags. */
3034 #ifdef __x86_64__
3035                 "mov %c[cr2](%3), %%rax \n\t"
3036                 "mov %%rax, %%cr2 \n\t"
3037                 "mov %c[rax](%3), %%rax \n\t"
3038                 "mov %c[rbx](%3), %%rbx \n\t"
3039                 "mov %c[rdx](%3), %%rdx \n\t"
3040                 "mov %c[rsi](%3), %%rsi \n\t"
3041                 "mov %c[rdi](%3), %%rdi \n\t"
3042                 "mov %c[rbp](%3), %%rbp \n\t"
3043                 "mov %c[r8](%3),  %%r8  \n\t"
3044                 "mov %c[r9](%3),  %%r9  \n\t"
3045                 "mov %c[r10](%3), %%r10 \n\t"
3046                 "mov %c[r11](%3), %%r11 \n\t"
3047                 "mov %c[r12](%3), %%r12 \n\t"
3048                 "mov %c[r13](%3), %%r13 \n\t"
3049                 "mov %c[r14](%3), %%r14 \n\t"
3050                 "mov %c[r15](%3), %%r15 \n\t"
3051                 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
3052 #else
3053                 "mov %c[cr2](%3), %%eax \n\t"
3054                 "mov %%eax,   %%cr2 \n\t"
3055                 "mov %c[rax](%3), %%eax \n\t"
3056                 "mov %c[rbx](%3), %%ebx \n\t"
3057                 "mov %c[rdx](%3), %%edx \n\t"
3058                 "mov %c[rsi](%3), %%esi \n\t"
3059                 "mov %c[rdi](%3), %%edi \n\t"
3060                 "mov %c[rbp](%3), %%ebp \n\t"
3061                 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
3062 #endif
3063                 /* Enter guest mode */
3064                 "jne launched \n\t"
3065                 "vmlaunch \n\t"
3066                 "jmp litevm_vmx_return \n\t"
3067                 "launched: vmresume \n\t"
3068                 ".globl litevm_vmx_return \n\t"
3069                 "litevm_vmx_return: "
3070                 /* Save guest registers, load host registers, keep flags */
3071 #ifdef __x86_64__
3072                 "xchg %3,     0(%%rsp) \n\t"
3073                 "mov %%rax, %c[rax](%3) \n\t"
3074                 "mov %%rbx, %c[rbx](%3) \n\t"
3075                 "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3076                 "mov %%rdx, %c[rdx](%3) \n\t"
3077                 "mov %%rsi, %c[rsi](%3) \n\t"
3078                 "mov %%rdi, %c[rdi](%3) \n\t"
3079                 "mov %%rbp, %c[rbp](%3) \n\t"
3080                 "mov %%r8,  %c[r8](%3) \n\t"
3081                 "mov %%r9,  %c[r9](%3) \n\t"
3082                 "mov %%r10, %c[r10](%3) \n\t"
3083                 "mov %%r11, %c[r11](%3) \n\t"
3084                 "mov %%r12, %c[r12](%3) \n\t"
3085                 "mov %%r13, %c[r13](%3) \n\t"
3086                 "mov %%r14, %c[r14](%3) \n\t"
3087                 "mov %%r15, %c[r15](%3) \n\t"
3088                 "mov %%cr2, %%rax   \n\t"
3089                 "mov %%rax, %c[cr2](%3) \n\t"
3090                 "mov 0(%%rsp), %3 \n\t"
3091
3092                 "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3093                 "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3094                 "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3095                 "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3096 #else
3097                 "xchg %3, 0(%%esp) \n\t"
3098                 "mov %%eax, %c[rax](%3) \n\t"
3099                 "mov %%ebx, %c[rbx](%3) \n\t"
3100                 "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3101                 "mov %%edx, %c[rdx](%3) \n\t"
3102                 "mov %%esi, %c[rsi](%3) \n\t"
3103                 "mov %%edi, %c[rdi](%3) \n\t"
3104                 "mov %%ebp, %c[rbp](%3) \n\t"
3105                 "mov %%cr2, %%eax  \n\t"
3106                 "mov %%eax, %c[cr2](%3) \n\t"
3107                 "mov 0(%%esp), %3 \n\t"
3108
3109                 "pop %%ecx; popa \n\t"
3110 #endif
3111                 "setbe %0 \n\t"
3112                 "popf \n\t"
3113               : "=g" (fail)
3114               : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3115                 "c"(vcpu),
3116                 [rax]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3117                 [rbx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3118                 [rcx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3119                 [rdx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3120                 [rsi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3121                 [rdi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3122                 [rbp]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3123 #ifdef __x86_64__
3124                 [r8 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8 ])),
3125                 [r9 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9 ])),
3126                 [r10]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3127                 [r11]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3128                 [r12]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3129                 [r13]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3130                 [r14]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3131                 [r15]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3132 #endif
3133                 [cr2]"i"(offsetof(struct litevm_vcpu, cr2))
3134               : "cc", "memory" );
3135
3136         ++litevm_stat.exits;
3137         printk("vm_run exits\n");
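             /*
              * Manually swap back the MSRs and the FPU image that the VMCS
              * host-state area does not restore for us on VM exit.
              */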
3138         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3139         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3140
3141         fx_save(vcpu->guest_fx_image);
3142         fx_restore(vcpu->host_fx_image);
3143
3144 #ifndef __x86_64__
3145         asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3146 #endif
3147
3148         litevm_run->exit_type = 0;
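             /*
              * 'fail' holds the CF|ZF result latched by the setbe above:
              * vmlaunch/vmresume report VMfailInvalid via CF and VMfailValid
              * via ZF, so a nonzero value means we never entered the guest
              * and, for VMfailValid, VM_INSTRUCTION_ERROR gives the reason.
              */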
3149         if (fail) {
3150                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3151                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3152         } else {
3153                 if (fs_gs_ldt_reload_needed) {
3154                         load_ldt(ldt_sel);
3155                         load_fs(fs_sel);
3156                         /*
3157                          * If we have to reload gs, we must take care to
3158                          * preserve our gs base.
3159                          */
3160                         disable_irq();
3161                         load_gs(gs_sel);
3162 #ifdef __x86_64__
3163                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3164 #endif
3165                         enable_irq();
3166
3167                         reload_tss();
3168                 }
3169                 vcpu->launched = 1;
3170                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
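                     /*
                      * A nonzero return from litevm_handle_exit() means the
                      * exit was handled in the kernel and the guest should be
                      * re-entered; zero means control goes back to userspace.
                      */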
3171                 if (litevm_handle_exit(litevm_run, vcpu)) {
3172                         /* Give scheduler a chance to reschedule. */
3173                         vcpu_put(vcpu);
3174 #warning "how to tell if signal is pending"
3175 /*
3176                         if (signal_pending(current)) {
3177                                 ++litevm_stat.signal_exits;
3178                                 return -EINTR;
3179                         }
3180 */
3181                         kthread_yield();
3182                         /* Cannot fail -  no vcpu unplug yet. */
3183                         vcpu_load(litevm, vcpu_slot(vcpu));
3184                         goto again;
3185                 }
3186         }
3187
3188         vcpu_put(vcpu);
3189         printk("vm_run returns\n");
3190         print_func_exit();
3191         return 0;
3192 }
3193
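     /*
      * Guest register accessors backing the LITEVM_GET_REGS and
      * LITEVM_SET_REGS commands.  As a rough usage sketch (hypothetical
      * caller, not part of this file), a monitor that wanted to skip the
      * instruction at the current guest RIP might do:
      *
      *	struct litevm_regs regs = { .vcpu = 0 };
      *
      *	if (!litevm_dev_ioctl_get_regs(litevm, &regs)) {
      *		regs.rip += insn_len;		// insn_len assumed known
      *		litevm_dev_ioctl_set_regs(litevm, &regs);
      *	}
      */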
3194 static int litevm_dev_ioctl_get_regs(struct litevm *litevm, struct litevm_regs *regs)
3195 {
3196         print_func_entry();
3197         struct litevm_vcpu *vcpu;
3198
3199         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3200                 print_func_exit();
3201                 return -EINVAL;
3202         }
3203
3204         vcpu = vcpu_load(litevm, regs->vcpu);
3205         if (!vcpu) {
3206                 print_func_exit();
3207                 return -ENOENT;
3208         }
3209
3210         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3211         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3212         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3213         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3214         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3215         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3216         regs->rsp = vmcs_readl(GUEST_RSP);
3217         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3218 #ifdef __x86_64__
3219         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3220         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3221         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3222         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3223         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3224         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3225         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3226         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3227 #endif
3228
3229         regs->rip = vmcs_readl(GUEST_RIP);
3230         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3231
3232         /*
3233          * Don't leak debug flags in case they were set for guest debugging
3234          */
3235         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3236                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3237
3238         vcpu_put(vcpu);
3239
3240         print_func_exit();
3241         return 0;
3242 }
3243
3244 static int litevm_dev_ioctl_set_regs(struct litevm *litevm, struct litevm_regs *regs)
3245 {
3246         print_func_entry();
3247         struct litevm_vcpu *vcpu;
3248
3249         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3250                 print_func_exit();
3251                 return -EINVAL;
3252         }
3253
3254         vcpu = vcpu_load(litevm, regs->vcpu);
3255         if (!vcpu) {
3256                 print_func_exit();
3257                 return -ENOENT;
3258         }
3259
3260         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3261         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3262         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3263         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3264         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3265         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3266         vmcs_writel(GUEST_RSP, regs->rsp);
3267         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3268 #ifdef __x86_64__
3269         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3270         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3271         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3272         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3273         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3274         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3275         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3276         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3277 #endif
3278
3279         vmcs_writel(GUEST_RIP, regs->rip);
3280         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3281
3282         vcpu_put(vcpu);
3283
3284         print_func_exit();
3285         return 0;
3286 }
3287
3288 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
3289 {
3290         print_func_entry();
3291         struct litevm_vcpu *vcpu;
3292
3293         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3294                 print_func_exit();
3295                 return -EINVAL;
3296         }
3297         vcpu = vcpu_load(litevm, sregs->vcpu);
3298         if (!vcpu) {
3299                 print_func_exit();
3300                 return -ENOENT;
3301         }
3302
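     /*
      * Unpack the VMX access-rights word for each segment: bits 3:0 are the
      * type, bit 4 S, bits 6:5 DPL, bit 7 present, bit 12 AVL, bit 13 L,
      * bit 14 D/B, bit 15 G, and bit 16 the "unusable" flag.
      */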
3303 #define get_segment(var, seg) \
3304         do { \
3305                 uint32_t ar; \
3306                 \
3307                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3308                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3309                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3310                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3311                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3312                 sregs->var.type = ar & 15; \
3313                 sregs->var.s = (ar >> 4) & 1; \
3314                 sregs->var.dpl = (ar >> 5) & 3; \
3315                 sregs->var.present = (ar >> 7) & 1; \
3316                 sregs->var.avl = (ar >> 12) & 1; \
3317                 sregs->var.l = (ar >> 13) & 1; \
3318                 sregs->var.db = (ar >> 14) & 1; \
3319                 sregs->var.g = (ar >> 15) & 1; \
3320                 sregs->var.unusable = (ar >> 16) & 1; \
3321         } while (0)
3322
3323         get_segment(cs, CS);
3324         get_segment(ds, DS);
3325         get_segment(es, ES);
3326         get_segment(fs, FS);
3327         get_segment(gs, GS);
3328         get_segment(ss, SS);
3329
3330         get_segment(tr, TR);
3331         get_segment(ldt, LDTR);
3332 #undef get_segment
3333
3334 #define get_dtable(var, table) \
3335         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3336                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3337
3338         get_dtable(idt, IDTR);
3339         get_dtable(gdt, GDTR);
3340 #undef get_dtable
3341
3342         sregs->cr0 = guest_cr0();
3343         sregs->cr2 = vcpu->cr2;
3344         sregs->cr3 = vcpu->cr3;
3345         sregs->cr4 = guest_cr4();
3346         sregs->cr8 = vcpu->cr8;
3347         sregs->efer = vcpu->shadow_efer;
3348         sregs->apic_base = vcpu->apic_base;
3349
3350         sregs->pending_int = vcpu->irq_summary != 0;
3351
3352         vcpu_put(vcpu);
3353
3354         print_func_exit();
3355         return 0;
3356 }
3357
3358 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
3359 {
3360         print_func_entry();
3361         struct litevm_vcpu *vcpu;
3362         int mmu_reset_needed = 0;
3363
3364         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3365                 print_func_exit();
3366                 return -EINVAL;
3367         }
3368         vcpu = vcpu_load(litevm, sregs->vcpu);
3369         if (!vcpu) {
3370                 print_func_exit();
3371                 return -ENOENT;
3372         }
3373
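     /*
      * Rebuild the access-rights word from the caller-supplied fields; an
      * unusable segment is encoded by setting only bit 16.
      */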
3374 #define set_segment(var, seg) \
3375         do { \
3376                 uint32_t ar; \
3377                 \
3378                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
3379                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
3380                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
3381                 if (sregs->var.unusable) { \
3382                         ar = (1 << 16); \
3383                 } else { \
3384                         ar = (sregs->var.type & 15); \
3385                         ar |= (sregs->var.s & 1) << 4; \
3386                         ar |= (sregs->var.dpl & 3) << 5; \
3387                         ar |= (sregs->var.present & 1) << 7; \
3388                         ar |= (sregs->var.avl & 1) << 12; \
3389                         ar |= (sregs->var.l & 1) << 13; \
3390                         ar |= (sregs->var.db & 1) << 14; \
3391                         ar |= (sregs->var.g & 1) << 15; \
3392                 } \
3393                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
3394         } while (0)
3395
3396         set_segment(cs, CS);
3397         set_segment(ds, DS);
3398         set_segment(es, ES);
3399         set_segment(fs, FS);
3400         set_segment(gs, GS);
3401         set_segment(ss, SS);
3402
3403         set_segment(tr, TR);
3404
3405         set_segment(ldt, LDTR);
3406 #undef set_segment
3407
3408 #define set_dtable(var, table) \
3409         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
3410         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
3411
3412         set_dtable(idt, IDTR);
3413         set_dtable(gdt, GDTR);
3414 #undef set_dtable
3415
3416         vcpu->cr2 = sregs->cr2;
3417         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
3418         vcpu->cr3 = sregs->cr3;
3419
3420         vcpu->cr8 = sregs->cr8;
3421
3422         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3423 #ifdef __x86_64__
3424         __set_efer(vcpu, sregs->efer);
3425 #endif
3426         vcpu->apic_base = sregs->apic_base;
3427
3428         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3429         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3430         update_exception_bitmap(vcpu);
3431         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3432         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3433
3434         mmu_reset_needed |= guest_cr4() != sregs->cr4;
3435         __set_cr4(vcpu, sregs->cr4);
3436
3437         if (mmu_reset_needed)
3438                 litevm_mmu_reset_context(vcpu);
3439         vcpu_put(vcpu);
3440
3441         print_func_exit();
3442         return 0;
3443 }
3444
3445 /*
3446  * Translate a guest virtual address to a guest physical address.
3447  */
3448 static int litevm_dev_ioctl_translate(struct litevm *litevm, struct litevm_translation *tr)
3449 {
3450         print_func_entry();
3451         unsigned long vaddr = tr->linear_address;
3452         struct litevm_vcpu *vcpu;
3453         gpa_t gpa;
3454
3455         vcpu = vcpu_load(litevm, tr->vcpu);
3456         if (!vcpu) {
3457                 print_func_exit();
3458                 return -ENOENT;
3459         }
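             /*
              * Do the gva->gpa walk through the vcpu's MMU under the VM lock;
              * UNMAPPED_GVA marks an address with no current translation.
              */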
3460         spin_lock_irqsave(&litevm->lock);
3461         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3462         tr->physical_address = gpa;
3463         tr->valid = gpa != UNMAPPED_GVA;
3464         tr->writeable = 1;
3465         tr->usermode = 0;
3466         spin_unlock_irqsave(&litevm->lock);
3467         vcpu_put(vcpu);
3468
3469         print_func_exit();
3470         return 0;
3471 }
3472
3473 #if 0
3474 static int litevm_dev_ioctl_interrupt(struct litevm *litevm, struct litevm_interrupt *irq)
3475 {
3476         struct litevm_vcpu *vcpu;
3477
3478         if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
3479                 return -EINVAL;
3480         if (irq->irq < 0 || irq->irq >= 256)
3481                 return -EINVAL;
3482         vcpu = vcpu_load(litevm, irq->vcpu);
3483         if (!vcpu)
3484                 return -ENOENT;
3485
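             /*
              * Pending interrupts form a two-level bitmap: one bit per vector
              * in irq_pending[], plus a per-word summary bit so the guest
              * entry path can find pending vectors quickly.
              */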
3486         set_bit(irq->irq, vcpu->irq_pending);
3487         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
3488
3489         vcpu_put(vcpu);
3490
3491         return 0;
3492 }
3493 #endif
3494
3495 #if 0
3496 static int litevm_dev_ioctl_debug_guest(struct litevm *litevm,
3497                                      struct litevm_debug_guest *dbg)
3498 {
3499         struct litevm_vcpu *vcpu;
3500         unsigned long dr7 = 0x400;
3501         uint32_t exception_bitmap;
3502         int old_singlestep;
3503
3504         if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS)
3505                 return -EINVAL;
3506         vcpu = vcpu_load(litevm, dbg->vcpu);
3507         if (!vcpu)
3508                 return -ENOENT;
3509
3510         exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
3511         old_singlestep = vcpu->guest_debug.singlestep;
3512
3513         vcpu->guest_debug.enabled = dbg->enabled;
3514         if (vcpu->guest_debug.enabled) {
3515                 int i;
3516
3517                 dr7 |= 0x200;  /* exact */
3518                 for (i = 0; i < 4; ++i) {
3519                         if (!dbg->breakpoints[i].enabled)
3520                                 continue;
3521                         vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
3522                         dr7 |= 2 << (i*2);    /* global enable */
3523                         dr7 |= 0 << (i*4+16); /* execution breakpoint */
3524                 }
3525
3526                 exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
3527
3528                 vcpu->guest_debug.singlestep = dbg->singlestep;
3529         } else {
3530                 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
3531                 vcpu->guest_debug.singlestep = 0;
3532         }
3533
3534         if (old_singlestep && !vcpu->guest_debug.singlestep) {
3535                 unsigned long flags;
3536
3537                 flags = vmcs_readl(GUEST_RFLAGS);
3538                 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3539                 vmcs_writel(GUEST_RFLAGS, flags);
3540         }
3541
3542         vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
3543         vmcs_writel(GUEST_DR7, dr7);
3544
3545         vcpu_put(vcpu);
3546
3547         return 0;
3548 }
3549 #endif
3550
3551 #if 0
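     /*
      * Top-level command dispatcher: each command copies its argument struct
      * in from user space, runs the matching handler, and copies any results
      * back out before returning.
      */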
3552 long litevm_control(struct litevm *litevm, int command, unsigned long arg)
3553 {
3554         int r = -EINVAL;
3555
3556         switch (command) {
3557         case LITEVM_CREATE_VCPU: {
3558                 r = create_vcpu(litevm, arg);
3559                 if (r)
3560                         goto out;
3561                 break;
3562         }
3563         case LITEVM_RUN: {
3564                 struct litevm_run litevm_run;
3565
3566                 r = -EFAULT;
3567                 if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run))
3568                         goto out;
3569                 r = litevm_dev_ioctl_run(litevm, &litevm_run);
3570                 if (r < 0)
3571                         goto out;
3572                 r = -EFAULT;
3573                 if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run))
3574                         goto out;
3575                 r = 0;
3576                 break;
3577         }
3578         case LITEVM_GET_REGS: {
3579                 struct litevm_regs litevm_regs;
3580
3581                 r = -EFAULT;
3582                 if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
3583                         goto out;
3584                 r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs);
3585                 if (r)
3586                         goto out;
3587                 r = -EFAULT;
3588                 if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs))
3589                         goto out;
3590                 r = 0;
3591                 break;
3592         }
3593         case LITEVM_SET_REGS: {
3594                 struct litevm_regs litevm_regs;
3595
3596                 r = -EFAULT;
3597                 if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
3598                         goto out;
3599                 r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs);
3600                 if (r)
3601                         goto out;
3602                 r = 0;
3603                 break;
3604         }
3605         case LITEVM_GET_SREGS: {
3606                 struct litevm_sregs litevm_sregs;
3607
3608                 r = -EFAULT;
3609                 if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3610                         goto out;
3611                 r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs);
3612                 if (r)
3613                         goto out;
3614                 r = -EFAULT;
3615                 if (copy_to_user((void *)arg, &litevm_sregs, sizeof litevm_sregs))
3616                         goto out;
3617                 r = 0;
3618                 break;
3619         }
3620         case LITEVM_SET_SREGS: {
3621                 struct litevm_sregs litevm_sregs;
3622
3623                 r = -EFAULT;
3624                 if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3625                         goto out;
3626                 r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs);
3627                 if (r)
3628                         goto out;
3629                 r = 0;
3630                 break;
3631         }
3632         case LITEVM_TRANSLATE: {
3633                 struct litevm_translation tr;
3634
3635                 r = -EFAULT;
3636                 if (copy_from_user(&tr, (void *)arg, sizeof tr))
3637                         goto out;
3638                 r = litevm_dev_ioctl_translate(litevm, &tr);
3639                 if (r)
3640                         goto out;
3641                 r = -EFAULT;
3642                 if (copy_to_user((void *)arg, &tr, sizeof tr))
3643                         goto out;
3644                 r = 0;
3645                 break;
3646         }
3647         case LITEVM_INTERRUPT: {
3648                 struct litevm_interrupt irq;
3649
3650                 r = -EFAULT;
3651                 if (copy_from_user(&irq, (void *)arg, sizeof irq))
3652                         goto out;
3653                 r = litevm_dev_ioctl_interrupt(litevm, &irq);
3654                 if (r)
3655                         goto out;
3656                 r = 0;
3657                 break;
3658         }
3659         case LITEVM_DEBUG_GUEST: {
3660                 struct litevm_debug_guest dbg;
3661
3662                 r = -EFAULT;
3663                 if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
3664                         goto out;
3665                 r = litevm_dev_ioctl_debug_guest(litevm, &dbg);
3666                 if (r)
3667                         goto out;
3668                 r = 0;
3669                 break;
3670         }
3671         case LITEVM_SET_MEMORY_REGION: {
3672                 struct litevm_memory_region litevm_mem;
3673
3674                 r = -EFAULT;
3675                 if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem))
3676                         goto out;
3677                 r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem);
3678                 if (r)
3679                         goto out;
3680                 break;
3681         }
3682         case LITEVM_GET_DIRTY_LOG: {
3683                 struct litevm_dirty_log log;
3684
3685                 r = -EFAULT;
3686                 if (copy_from_user(&log, (void *)arg, sizeof log))
3687                         goto out;
3688                 r = litevm_dev_ioctl_get_dirty_log(litevm, &log);
3689                 if (r)
3690                         goto out;
3691                 break;
3692         }
3693         default:
3694                 ;
3695         }
3696 out:
3697         return r;
3698 }
3699 #endif
3700
3701 #if 0
3702 static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3703 {
3704         struct litevm *litevm = vma->vm_file->private_data;
3705         struct litevm_memory_slot *slot;
3706         struct page *page;
3707
3708         slot = gfn_to_memslot(litevm, vmf->pgoff);
3709         if (!slot)
3710                 return VM_FAULT_SIGBUS;
3711         page = gfn_to_page(slot, vmf->pgoff);
3712         if (!page)
3713                 return VM_FAULT_SIGBUS;
3714
3715         get_page(page);
3716         vmf->page = page;
3717         return 0;
3718 }
3719 #endif
3720
3721 #if 0
3722 static int litevm_reboot(struct notifier_block *notifier, unsigned long val,
3723                        void *v)
3724 {
3725         panic("litevm_reboot");
3726         if (val == SYS_RESTART) {
3727                 /*
3728                  * Some (well, at least mine) BIOSes hang on reboot if
3729                  * in vmx root mode.
3730                  */
3731                 printk("litevm: exiting vmx mode\n");
3732                 handler_wrapper_t *w;
3733                 smp_call_function_all(litevm_disable, 0, &w);
3734                 smp_call_wait(w);
3735         }
3736         return NOTIFY_OK;
3738 }
3739 #endif
3740
3741 hpa_t bad_page_address;
3742
3743 int vmx_init(void)
3744 {
3745         print_func_entry();
3746         handler_wrapper_t *w;
3747         int r = 0;
3748
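             /*
              * Bail out early if VT-x is absent or the BIOS has locked it
              * off; otherwise set up the VMCS descriptor and enable VMX
              * operation on every core via a cross-core call.
              */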
3749         if (!cpu_has_litevm_support()) {
3750                 printk("litevm: no hardware support\n");
3751                 print_func_exit();
3752                 return -EOPNOTSUPP;
3753         }
3754         if (vmx_disabled_by_bios()) {
3755                 printk("litevm: disabled by bios\n");
3756                 print_func_exit();
3757                 return -EOPNOTSUPP;
3758         }
3759
3760         setup_vmcs_descriptor();
3761         smp_call_function_all(vm_enable, 0, &w);
3762         if (smp_call_wait(w)) {
3763                 printk("vmx_init: smp_call_wait failed. Expect a panic.\n");
3764         }
3765
3766         if ((bad_page_address = PADDR(kpage_zalloc_addr())) == 0ULL) {
3767                 r = -ENOMEM;
3768         }
3769
3770         print_func_exit();
3771         return r;
3772 }
3773
3774 static void litevm_exit(void)
3775 {
3776         print_func_entry();
3777         //free_litevm_area();
3778         //__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3779         print_func_exit();
3780 }
3781
3782