x86: Fixes bug in pml callbacks
[akaros.git] / kern / arch / x86 / vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 #define currentcpu (&per_cpu_info[core_id()])
39
40 struct litevm_stat litevm_stat;
41
42 static struct litevm_stats_debugfs_item {
43         const char *name;
44         uint32_t *data;
45 } debugfs_entries[] = {
46         { "pf_fixed", &litevm_stat.pf_fixed },
47         { "pf_guest", &litevm_stat.pf_guest },
48         { "tlb_flush", &litevm_stat.tlb_flush },
49         { "invlpg", &litevm_stat.invlpg },
50         { "exits", &litevm_stat.exits },
51         { "io_exits", &litevm_stat.io_exits },
52         { "mmio_exits", &litevm_stat.mmio_exits },
53         { "signal_exits", &litevm_stat.signal_exits },
54         { "irq_exits", &litevm_stat.irq_exits },
55         { 0, 0 }
56 };
57
58 static struct dentry *debugfs_dir;
59
60 static const uint32_t vmx_msr_index[] = {
61 #ifdef __x86_64__
62         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
63 #endif
64         MSR_EFER, /* note: Linux's vmx_msr_index also lists MSR_K6_STAR here */
65 };
66 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
67
68 #ifdef __x86_64__
69 /*
70  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
71  * mechanism (CPU erratum AA24).
72  */
73 #define NR_BAD_MSRS 2
74 #else
75 #define NR_BAD_MSRS 0
76 #endif
77
78 #define TSS_IOPB_BASE_OFFSET 0x66
79 #define TSS_BASE_SIZE 0x68
80 #define TSS_IOPB_SIZE (65536 / 8)
81 #define TSS_REDIRECTION_SIZE (256 / 8)
82 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
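/*
 * Sanity check on the math above: RMODE_TSS_SIZE is 0x68 + 32 + 8192 + 1 =
 * 8329 bytes, i.e. the TSS header, the interrupt-redirection bitmap, the full
 * I/O permission bitmap, and its mandatory trailing 0xff terminator byte.
 * That is why init_rmode_tss() below works with three guest pages.
 */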
83
84 #define MSR_IA32_VMX_BASIC_MSR                  0x480
85 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
86 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
87 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
88 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
89
90 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
91 #define LMSW_GUEST_MASK 0x0eULL
92 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
93 //#define CR4_VMXE 0x2000
94 #define CR8_RESEVED_BITS (~0x0fULL)
95 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
96
97 #ifdef __x86_64__
98 #define HOST_IS_64 1
99 #else
100 #define HOST_IS_64 0
101 #endif
102
103 /* Bit ops are not yet widely used in akaros, and we're not sure where to put them. */
104 /**
105  * __ffs - find first set bit in word
106  * @word: The word to search
107  *
108  * Undefined if no bit exists, so code should check against 0 first.
109  */
110 static inline unsigned long __ffs(unsigned long word)
111 {
112         print_func_entry();
113         asm("rep; bsf %1,%0"
114                 : "=r" (word)
115                 : "rm" (word));
116         print_func_exit();
117         return word;
118 }
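/*
 * Example: __ffs(0x18) returns 3, since bit 3 is the lowest bit set in
 * 0b11000.  Callers must check for a zero argument themselves, e.g.
 * "if (mask) bit = __ffs(mask);".
 */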
119
120 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu, uint32_t msr)
121 {
122         print_func_entry();
123         int i;
124
125         for (i = 0; i < vcpu->nmsrs; ++i)
126                 if (vcpu->guest_msrs[i].index == msr) {
127                         print_func_exit();
128                         return &vcpu->guest_msrs[i];
129                 }
130         print_func_exit();
131         return 0;
132 }
133
134 struct descriptor_table {
135         uint16_t limit;
136         unsigned long base;
137 } __attribute__((packed));
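/*
 * This matches the memory operand of SGDT/SIDT: a 16-bit limit followed by
 * the linear base address (4 bytes on 32-bit, 8 bytes on x86_64), which is
 * why the struct is packed.
 */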
138
139 static void get_gdt(struct descriptor_table *table)
140 {
141         print_func_entry();
142         asm ("sgdt %0" : "=m"(*table));
143         print_func_exit();
144 }
145
146 static void get_idt(struct descriptor_table *table)
147 {
148         print_func_entry();
149         asm ("sidt %0" : "=m"(*table));
150         print_func_exit();
151 }
152
153 static uint16_t read_fs(void)
154 {
155         print_func_entry();
156         uint16_t seg;
157         asm ("mov %%fs, %0" : "=g"(seg));
158         print_func_exit();
159         return seg;
160 }
161
162 static uint16_t read_gs(void)
163 {
164         print_func_entry();
165         uint16_t seg;
166         asm ("mov %%gs, %0" : "=g"(seg));
167         print_func_exit();
168         return seg;
169 }
170
171 static uint16_t read_ldt(void)
172 {
173         print_func_entry();
174         uint16_t ldt;
175         asm ("sldt %0" : "=g"(ldt));
176         print_func_exit();
177         return ldt;
178 }
179
180 static void load_fs(uint16_t sel)
181 {
182         print_func_entry();
183         asm ("mov %0, %%fs" : : "g"(sel));
184         print_func_exit();
185 }
186
187 static void load_gs(uint16_t sel)
188 {
189         print_func_entry();
190         asm ("mov %0, %%gs" : : "g"(sel));
191         print_func_exit();
192 }
193
194 #ifndef load_ldt
195 static void load_ldt(uint16_t sel)
196 {
197         print_func_entry();
198         asm ("lldt %0" : : "g"(sel));
199         print_func_exit();
200 }
201 #endif
202
203 static void fx_save(void *image)
204 {
205         print_func_entry();
206         asm ("fxsave (%0)":: "r" (image));
207         print_func_exit();
208 }
209
210 static void fx_restore(void *image)
211 {
212         print_func_entry();
213         asm ("fxrstor (%0)":: "r" (image));
214         print_func_exit();
215 }
216
217 static void fpu_init(void)
218 {
219         print_func_entry();
220         asm ("finit");
221         print_func_exit();
222 }
223
224 struct segment_descriptor {
225         uint16_t limit_low;
226         uint16_t base_low;
227         uint8_t  base_mid;
228         uint8_t  type : 4;
229         uint8_t  system : 1;
230         uint8_t  dpl : 2;
231         uint8_t  present : 1;
232         uint8_t  limit_high : 4;
233         uint8_t  avl : 1;
234         uint8_t  long_mode : 1;
235         uint8_t  default_op : 1;
236         uint8_t  granularity : 1;
237         uint8_t  base_high;
238 } __attribute__((packed));
239
240 #ifdef __x86_64__
241 // LDT or TSS descriptor in the GDT. 16 bytes.
242 struct segment_descriptor_64 {
243         struct segment_descriptor s;
244         uint32_t base_higher;
245         uint32_t pad_zero;
246 };
247
248 #endif
249
250 static unsigned long segment_base(uint16_t selector)
251 {
252         print_func_entry();
253         struct descriptor_table gdt;
254         struct segment_descriptor *d;
255         unsigned long table_base;
256         typedef unsigned long ul;
257         unsigned long v;
258
259         asm ("sgdt %0" : "=m"(gdt));
260         table_base = gdt.base;
261
262         if (selector & 4) {           /* from ldt */
263                 uint16_t ldt_selector;
264
265                 asm ("sldt %0" : "=g"(ldt_selector));
266                 table_base = segment_base(ldt_selector);
267         }
268         d = (struct segment_descriptor *)(table_base + (selector & ~7));
269         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
270 #ifdef __x86_64__
271         if (d->system == 0
272             && (d->type == 2 || d->type == 9 || d->type == 11))
273                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
274 #endif
275         print_func_exit();
276         return v;
277 }
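/*
 * The descriptor's base is scattered across the entry: bits 15:0 in base_low,
 * 23:16 in base_mid, 31:24 in base_high, and (for 16-byte system descriptors
 * on x86_64) bits 63:32 in base_higher.  For example, base_low = 0x2000,
 * base_mid = 0x01, base_high = 0x00 reassembles to a base of 0x00012000.
 */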
278
279 static unsigned long read_tr_base(void)
280 {
281         print_func_entry();
282         uint16_t tr;
283         asm ("str %0" : "=g"(tr));
284         print_func_exit();
285         return segment_base(tr);
286 }
287
288 static void reload_tss(void)
289 {
290         print_func_entry();
291 #ifndef __x86_64__
292
293         /*
294          * VT restores TR but not its size.  Useless.
295          */
296         struct descriptor_table gdt;
297         struct segment_descriptor *descs;
298
299         get_gdt(&gdt);
300         descs = (void *)gdt.base;
301         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
302         load_TR_desc();
303 #endif
304         print_func_exit();
305 }
306
307 static struct vmcs_descriptor {
308         int size;
309         int order;
310         uint32_t revision_id;
311 } vmcs_descriptor;
312
313 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
314 {
315         print_func_entry();
316         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
317         print_func_exit();
318         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
319 }
320
321
322
323 int litevm_read_guest(struct litevm_vcpu *vcpu,
324                              gva_t addr,
325                              unsigned long size,
326                              void *dest)
327 {
328         print_func_entry();
329         unsigned char *host_buf = dest;
330         unsigned long req_size = size;
331
332         while (size) {
333                 hpa_t paddr;
334                 unsigned now;
335                 unsigned offset;
336                 hva_t guest_buf;
337
338                 paddr = gva_to_hpa(vcpu, addr);
339
340                 if (is_error_hpa(paddr))
341                         break;
342                 guest_buf = (hva_t)KADDR(paddr);
343                 offset = addr & ~PAGE_MASK;
344                 guest_buf |= offset;
345                 now = MIN(size, PAGE_SIZE - offset);
346                 memcpy(host_buf, (void*)guest_buf, now);
347                 host_buf += now;
348                 addr += now;
349                 size -= now;
350         }
351         print_func_exit();
352         return req_size - size;
353 }
354
355 int litevm_write_guest(struct litevm_vcpu *vcpu,
356                              gva_t addr,
357                              unsigned long size,
358                              void *data)
359 {
360         print_func_entry();
361         unsigned char *host_buf = data;
362         unsigned long req_size = size;
363
364         while (size) {
365                 hpa_t paddr;
366                 unsigned now;
367                 unsigned offset;
368                 hva_t guest_buf;
369
370                 paddr = gva_to_hpa(vcpu, addr);
371
372                 if (is_error_hpa(paddr))
373                         break;
374
375                 guest_buf = (hva_t)KADDR(paddr);
376                 offset = addr & ~PAGE_MASK;
377                 guest_buf |= offset;
378                 now = MIN(size, PAGE_SIZE - offset);
379                 memcpy((void*)guest_buf, host_buf, now);
380                 host_buf += now;
381                 addr += now;
382                 size -= now;
383         }
384         print_func_exit();
385         return req_size - size;
386 }
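#if 0   /* Illustrative sketch only, not compiled: copy a dword out of the
         * guest at a guest-virtual address.  'vcpu' and 'gva' are placeholders
         * here.  Both copy routines return the number of bytes actually moved,
         * which comes up short if a page in the range is unmapped. */
        uint32_t val;

        if (litevm_read_guest(vcpu, gva, sizeof(val), &val) != sizeof(val))
                printk("guest read at %p failed\n", (void *)gva);
#endif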
387
388 static void setup_vmcs_descriptor(void)
389 {
390         print_func_entry();
391         uint64_t msr;
392
393         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
394         vmcs_descriptor.size = (msr>>32) & 0x1fff;
395         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size>>PAGE_SHIFT);
396         vmcs_descriptor.revision_id = (uint32_t)msr;
397         printk("setup_vmcs_descriptor: msr 0x%llx, size 0x%x order 0x%x id 0x%x\n",
398                msr, vmcs_descriptor.size, vmcs_descriptor.order,
399                vmcs_descriptor.revision_id);
400         print_func_exit();
401 };
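/*
 * IA32_VMX_BASIC (MSR 0x480) holds the VMCS revision identifier in bits 30:0
 * and the VMCS region size in bits 44:32, which is what the shift and the
 * 0x1fff mask above pull out.
 */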
402
403 static void vmcs_clear(struct vmcs *vmcs)
404 {
405         print_func_entry();
406         uint64_t phys_addr = PADDR(vmcs);
407         uint8_t error;
408         printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
409         asm volatile ("vmclear %1; setna %0"
410                        : "=m"(error) : "m"(phys_addr) : "cc", "memory" );
411         if (error)
412                 printk("litevm: vmclear fail: %p/%llx\n",
413                        vmcs, phys_addr);
414         print_func_exit();
415 }
416
417 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
418 {
419         print_func_entry();
420         struct litevm_vcpu *vcpu = arg;
421         int cpu = core_id();
422         printd("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n", 
423                cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
424
425         if (vcpu->cpu == cpu)
426                 vmcs_clear(vcpu->vmcs);
427
428         if (currentcpu->vmcs == vcpu->vmcs)
429                 currentcpu->vmcs = NULL;
430         print_func_exit();
431 }
432
433 static int vcpu_slot(struct litevm_vcpu *vcpu)
434 {
435         print_func_entry();
436         print_func_exit();
437         return vcpu - vcpu->litevm->vcpus;
438 }
439
440 /*
441  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
442  * vcpu mutex is already taken.
443  */
444 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
445 {
446         print_func_entry();
447         uint64_t phys_addr = PADDR(vcpu->vmcs);
448         int cpu;
449         cpu = core_id();
450
451         if (vcpu->cpu != cpu) {
452                 handler_wrapper_t *w;
453                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
454                 smp_call_wait(w);
455                 vcpu->launched = 0;
456         }
457         if (currentcpu->vmcs != vcpu->vmcs) {
458                 uint8_t error;
459
460                 currentcpu->vmcs = vcpu->vmcs;
461                 asm volatile ("vmptrld %1; setna %0"
462                                : "=m"(error) : "m"(phys_addr) : "cc" );
463                 if (error){
464                         printk("litevm: vmptrld %p/%llx fail\n",
465                                vcpu->vmcs, phys_addr);
466                         error("litevm: vmptrld %p/%llx fail\n",
467                                vcpu->vmcs, phys_addr);
468                 }
469         }
470
471         if (vcpu->cpu != cpu) {
472                 struct descriptor_table dt;
473                 unsigned long sysenter_esp;
474
475                 vcpu->cpu = cpu;
476                 /*
477                  * Linux uses per-cpu TSS and GDT, so set these when switching
478                  * processors.
479                  */
480                 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
481                 get_gdt(&dt);
482                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
483
484                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
485                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
486         }
487         print_func_exit();
488         return vcpu;
489 }
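/*
 * Note the migration dance above: a VMCS must be VMCLEARed on the core where
 * it was last active before it can be VMPTRLDed on a new one, so when a vcpu
 * moves we ship __vcpu_clear to vcpu->cpu via smp_call_function_single() and
 * wait for it before loading the VMCS here.
 */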
490
491 /*
492  * Switches to specified vcpu, until a matching vcpu_put()
493  */
494 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
495 {
496         print_func_entry();
497         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
498
499         printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
500
501         qlock(&vcpu->mutex);
502         if (!vcpu->vmcs) {
503                 qunlock(&vcpu->mutex);
504                 error("vcpu->vmcs is NULL");
505         }
506         print_func_exit();
507         return __vcpu_load(vcpu);
508 }
509
510 static void vcpu_put(struct litevm_vcpu *vcpu)
511 {
512         print_func_entry();
513         //put_cpu();
514         qunlock(&vcpu->mutex);
515         print_func_exit();
516 }
517
518
519 static struct vmcs *alloc_vmcs_cpu(int cpu)
520 {
521         print_func_entry();
522         int node = node_id();
523         struct vmcs *vmcs;
524
525         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
526         if (!vmcs) {
527                 print_func_exit();
528                 return 0;
529         }
530         memset(vmcs, 0, vmcs_descriptor.size);
531         vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
532         print_func_exit();
533         return vmcs;
534 }
535
536 static struct vmcs *alloc_vmcs(void)
537 {
538         struct vmcs *ret;
539         print_func_entry();
540         ret = alloc_vmcs_cpu(core_id());
541         print_func_exit();
542         return ret;
543 }
544
545 static int cpu_has_litevm_support(void)
546 {
547         print_func_entry();
548         uint32_t ecx = cpuid_ecx(1);
549         print_func_exit();
550         return ecx & (1 << 5); /* CPUID.1:ECX.VMX[bit 5] -> VT */
551 }
552
553 static int vmx_disabled_by_bios(void)
554 {
555         print_func_entry();
556         uint64_t msr;
557
558         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
559         print_func_exit();
560         return (msr & 5) == 1; /* locked but not enabled */
561 }
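/*
 * In IA32_FEATURE_CONTROL, bit 0 is the lock bit and bit 2 enables VMXON
 * outside SMX, so (msr & 5) == 1 means the BIOS locked the MSR with VMX left
 * off.  vm_enable() below sets both bits itself when the MSR is still
 * unlocked.
 */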
562
563 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
564 {
565         print_func_entry();
566         int cpu = hw_core_id();
567         uint64_t phys_addr;
568         uint64_t old;
569         uint64_t status = 0;
570         currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
571                                                   KMALLOC_WAIT);
572         if (! currentcpu->vmxarea)
573                 return;
574         memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
575         currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
576         phys_addr = PADDR(currentcpu->vmxarea);
577         printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
578                currentcpu->vmxarea, (void *)phys_addr);
579         if (phys_addr & 0xfff){
580                 printk("fix vmxarea alignment!");
581         }
582         printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
583         old = read_msr(MSR_IA32_FEATURE_CONTROL);
584         printk("%d: vm_enable, old is %lld\n", core_id(), old);
585         if ((old & 5) == 0){
586                 /* enable and lock */
587                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
588                 old = read_msr(MSR_IA32_FEATURE_CONTROL);
589                 printk("%d:vm_enable, tried to set 5, old is %lld\n", core_id(), old);
590         }
591         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
592         lcr4(rcr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
593         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
594         printk("%d:cr0 is %x\n", core_id(), rcr0());
595         lcr0(rcr0() | 0x20);
596         printk("%d:cr0 is %x\n", core_id(), rcr0());
597         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
598         outb(0x92, inb(0x92)|2);
599         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
600         asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:"       \
601                       : "=m" (status) : "m"(phys_addr) : "memory", "cc");
602         printk("%d:vmxon status is %lld\n", core_id(), status);
603         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
604         if (! status){
605                 printk("%d:vm_enable: status says fail\n", core_id());
606         }
607         print_func_exit();
608 }
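/*
 * Recap of the VMXON prerequisites handled above: a page-aligned VMXON region
 * stamped with the VMCS revision id, IA32_FEATURE_CONTROL locked with VMX
 * enabled, CR4.VMXE set, and CR0.NE set (the 0x20 OR'd into CR0).  The port
 * 0x92 poke additionally makes sure the A20 gate is open.
 */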
609
610 static void litevm_disable(void *garbage)
611 {
612         print_func_entry();
613         asm volatile ("vmxoff" : : : "cc");
614         print_func_exit();
615 }
616
617 struct litevm *vmx_open(void)
618 {
619         print_func_entry();
620         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
621         int i;
622
623         if (!litevm) {
624                 printk("NO LITEVM! MAKES NO SENSE!\n");
625                 error("litevm alloc failed");
626                 print_func_exit();
627                 return 0;
628         }
629
630         spinlock_init_irqsave(&litevm->lock);
631         LIST_INIT(&litevm->link);
632         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
633                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
634
635                 qlock_init(&vcpu->mutex);
636                 vcpu->mmu.root_hpa = INVALID_PAGE;
637                 LIST_INIT(&vcpu->link);
638         }
639         printk("vmx_open: busy %d\n", litevm->busy);
640         printk("return %p\n", litevm);
641         print_func_exit();
642         return litevm;
643 }
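/*
 * Rough usage sketch for this driver: vmx_open() returns a struct litevm,
 * vm_set_memory_region() (below) installs guest-physical memory, and
 * vmx_create_vcpu() sets up a vcpu, after which the guest can be launched.
 */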
644
645 /*
646  * Free any memory in @free but not in @dont.
647  */
648 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
649                                   struct litevm_memory_slot *dont)
650 {
651         print_func_entry();
652         int i;
653
654         if (!dont || free->phys_mem != dont->phys_mem)
655                 if (free->phys_mem) {
656                         for (i = 0; i < free->npages; ++i){
657                                 page_t *page = free->phys_mem[i];
658                                 page_decref(page);
659                                 assert(page_is_free(page2ppn(page)));
660                         }
661                         kfree(free->phys_mem);
662                 }
663
664         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
665                 kfree(free->dirty_bitmap);
666
667         free->phys_mem = 0;
668         free->npages = 0;
669         free->dirty_bitmap = 0;
670         print_func_exit();
671 }
672
673 static void litevm_free_physmem(struct litevm *litevm)
674 {
675         print_func_entry();
676         int i;
677
678         for (i = 0; i < litevm->nmemslots; ++i)
679                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
680         print_func_exit();
681 }
682
683 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
684 {
685         print_func_entry();
686         if (vcpu->vmcs) {
687                 handler_wrapper_t *w;
688                 smp_call_function_all(__vcpu_clear, vcpu, &w);
689                 smp_call_wait(w);
690                 //free_vmcs(vcpu->vmcs);
691                 vcpu->vmcs = 0;
692         }
693         print_func_exit();
694 }
695
696 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
697 {
698         print_func_entry();
699         litevm_free_vmcs(vcpu);
700         litevm_mmu_destroy(vcpu);
701         print_func_exit();
702 }
703
704 static void litevm_free_vcpus(struct litevm *litevm)
705 {
706         print_func_entry();
707         unsigned int i;
708
709         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
710                 litevm_free_vcpu(&litevm->vcpus[i]);
711         print_func_exit();
712 }
713
714 static int litevm_dev_release(struct litevm *litevm)
715 {
716         print_func_entry();
717
718         litevm_free_vcpus(litevm);
719         litevm_free_physmem(litevm);
720         kfree(litevm);
721         print_func_exit();
722         return 0;
723 }
724
725 unsigned long vmcs_readl(unsigned long field)
726 {
727         print_func_entry();
728         unsigned long value;
729
730         asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
731         print_func_exit();
732         return value;
733 }
734
735 void vmcs_writel(unsigned long field, unsigned long value)
736 {
737         print_func_entry();
738         uint8_t error;
739
740         asm volatile ("vmwrite %1, %2; setna %0"
741                        : "=g"(error) : "r"(value), "r"(field) : "cc" );
742         if (error)
743                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
744                        field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
745         print_func_exit();
746 }
747
748 static void vmcs_write16(unsigned long field, uint16_t value)
749 {
750         print_func_entry();
751         vmcs_writel(field, value);
752         print_func_exit();
753 }
754
755 static void vmcs_write64(unsigned long field, uint64_t value)
756 {
757         print_func_entry();
758 #ifdef __x86_64__
759         vmcs_writel(field, value);
760 #else
761         vmcs_writel(field, value);
762         asm volatile ("");
763         vmcs_writel(field+1, value >> 32);
764 #endif
765         print_func_exit();
766 }
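/*
 * On 32-bit hosts a 64-bit VMCS field takes two VMWRITEs: the field encoding
 * for the low half and encoding+1 for the high half.  So, for example,
 * vmcs_write64(TSC_OFFSET, -tsc) there ends up writing TSC_OFFSET and
 * TSC_OFFSET+1.
 */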
767
768 static void inject_gp(struct litevm_vcpu *vcpu)
769 {
770         print_func_entry();
771         printd("inject_general_protection: rip 0x%lx\n",
772                vmcs_readl(GUEST_RIP));
773         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
774         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
775                      GP_VECTOR |
776                      INTR_TYPE_EXCEPTION |
777                      INTR_INFO_DELIEVER_CODE_MASK |
778                      INTR_INFO_VALID_MASK);
779         print_func_exit();
780 }
781
782 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
783 {
784         print_func_entry();
785         if (vcpu->rmode.active)
786                 vmcs_write32(EXCEPTION_BITMAP, ~0);
787         else
788                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
789         print_func_exit();
790 }
791
792 static void enter_pmode(struct litevm_vcpu *vcpu)
793 {
794         print_func_entry();
795         unsigned long flags;
796
797         vcpu->rmode.active = 0;
798
799         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
800         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
801         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
802
803         flags = vmcs_readl(GUEST_RFLAGS);
804         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
805         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
806         vmcs_writel(GUEST_RFLAGS, flags);
807
808         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
809                         (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK) );
810
811         update_exception_bitmap(vcpu);
812
813         #define FIX_PMODE_DATASEG(seg, save) {                          \
814                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
815                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
816                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
817                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
818         }
819
820         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
821         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
822         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
823         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
824         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
825
826         vmcs_write16(GUEST_CS_SELECTOR,
827                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
828         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
829         print_func_exit();
830 }
831
832 static int rmode_tss_base(struct litevm* litevm)
833 {
834         print_func_entry();
835         gfn_t base_gfn = litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
836         print_func_exit();
837         return base_gfn << PAGE_SHIFT;
838 }
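/*
 * The emulated real-mode TSS sits in the top three pages of memory slot 0:
 * with, say, a 256-page slot starting at gfn 0, it occupies gfns 253-255.
 */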
839
840 static void enter_rmode(struct litevm_vcpu *vcpu)
841 {
842         print_func_entry();
843         unsigned long flags;
844
845         vcpu->rmode.active = 1;
846
847         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
848         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
849
850         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
851         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
852
853         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
854         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
855
856         flags = vmcs_readl(GUEST_RFLAGS);
857         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
858
859         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
860
861         vmcs_writel(GUEST_RFLAGS, flags);
862         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
863         update_exception_bitmap(vcpu);
864
865         #define FIX_RMODE_SEG(seg, save) {                                 \
866                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
867                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
868                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
869                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
870         }
871
872         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
873         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
874
875         FIX_RMODE_SEG(ES, vcpu->rmode.es);
876         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
877         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
878         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
879         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
880         print_func_exit();
881 }
882
883 static int init_rmode_tss(struct litevm* litevm)
884 {
885         print_func_entry();
886         struct page *p1, *p2, *p3;
887         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
888         char *page;
889
890         p1 = _gfn_to_page(litevm, fn++);
891         p2 = _gfn_to_page(litevm, fn++);
892         p3 = _gfn_to_page(litevm, fn);
893
894         if (!p1 || !p2 || !p3) {
895                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
896                 print_func_exit();
897                 return 0;
898         }
899
900         page = page2kva(p1);
901         memset(page, 0, PAGE_SIZE);
902         *(uint16_t*)(page + TSS_IOPB_BASE_OFFSET) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
903
904         page = page2kva(p2);
905         memset(page, 0, PAGE_SIZE);
906
907         page = page2kva(p3);
908         memset(page, 0, PAGE_SIZE);
909         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
910
911         print_func_exit();
912         return 1;
913 }
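/*
 * The first page gets the TSS header with the I/O-bitmap base offset stored
 * at TSS_IOPB_BASE_OFFSET; the very last byte of the TSS region (in the third
 * page) is set to 0xff, the terminator the CPU requires after the I/O
 * permission bitmap.
 */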
914
915 #ifdef __x86_64__
916
917 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
918 {
919         print_func_entry();
920         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
921
922         vcpu->shadow_efer = efer;
923         if (efer & EFER_LMA) {
924                 vmcs_write32(VM_ENTRY_CONTROLS,
925                                      vmcs_read32(VM_ENTRY_CONTROLS) |
926                                      VM_ENTRY_CONTROLS_IA32E_MASK);
927                 msr->data = efer;
928
929         } else {
930                 vmcs_write32(VM_ENTRY_CONTROLS,
931                                      vmcs_read32(VM_ENTRY_CONTROLS) &
932                                      ~VM_ENTRY_CONTROLS_IA32E_MASK);
933
934                 msr->data = efer & ~EFER_LME;
935         }
936         print_func_exit();
937 }
938
939 static void enter_lmode(struct litevm_vcpu *vcpu)
940 {
941         print_func_entry();
942         uint32_t guest_tr_ar;
943
944         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
945         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
946                 printd("%s: tss fixup for long mode. \n",
947                        __FUNCTION__);
948                 vmcs_write32(GUEST_TR_AR_BYTES,
949                              (guest_tr_ar & ~AR_TYPE_MASK)
950                              | AR_TYPE_BUSY_64_TSS);
951         }
952
953         vcpu->shadow_efer |= EFER_LMA;
954
955         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
956         vmcs_write32(VM_ENTRY_CONTROLS,
957                      vmcs_read32(VM_ENTRY_CONTROLS)
958                      | VM_ENTRY_CONTROLS_IA32E_MASK);
959         print_func_exit();
960 }
961
962 static void exit_lmode(struct litevm_vcpu *vcpu)
963 {
964         print_func_entry();
965         vcpu->shadow_efer &= ~EFER_LMA;
966
967         vmcs_write32(VM_ENTRY_CONTROLS,
968                      vmcs_read32(VM_ENTRY_CONTROLS)
969                      & ~VM_ENTRY_CONTROLS_IA32E_MASK);
970         print_func_exit();
971 }
972
973 #endif
974
975 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
976 {
977         print_func_entry();
978         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
979                 enter_pmode(vcpu);
980
981         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
982                 enter_rmode(vcpu);
983
984 #ifdef __x86_64__
985         if (vcpu->shadow_efer & EFER_LME) {
986                 if (!is_paging() && (cr0 & CR0_PG_MASK))
987                         enter_lmode(vcpu);
988                 if (is_paging() && !(cr0 & CR0_PG_MASK))
989                         exit_lmode(vcpu);
990         }
991 #endif
992
993         vmcs_writel(CR0_READ_SHADOW, cr0);
994         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
995         print_func_exit();
996 }
997
998 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
999                                          unsigned long cr3)
1000 {
1001         print_func_entry();
1002         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
1003         unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
1004         int i;
1005         uint64_t pdpte;
1006         uint64_t *pdpt;
1007         struct litevm_memory_slot *memslot;
1008
1009         spin_lock_irqsave(&vcpu->litevm->lock);
1010         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
1011         /* FIXME: !memslot - emulate? 0xff? */
1012         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
1013
1014         for (i = 0; i < 4; ++i) {
1015                 pdpte = pdpt[offset + i];
1016                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
1017                         break;
1018         }
1019
1020         spin_unlock(&vcpu->litevm->lock);
1021
1022         print_func_exit();
1023         return i != 4;
1024 }
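/*
 * The mask 0xfffffff0000001e6 covers what a present PAE PDPTE must keep
 * clear: the low bits are flag positions reserved in PDPTEs, and the high
 * bits are physical-address bits beyond a 36-bit address width.
 */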
1025
1026 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1027 {
1028         print_func_entry();
1029         if (cr0 & CR0_RESEVED_BITS) {
1030                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
1031                        cr0, guest_cr0());
1032                 inject_gp(vcpu);
1033                 print_func_exit();
1034                 return;
1035         }
1036
1037         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
1038                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
1039                 inject_gp(vcpu);
1040                 print_func_exit();
1041                 return;
1042         }
1043
1044         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
1045                 printd("set_cr0: #GP, set PG flag "
1046                        "and a clear PE flag\n");
1047                 inject_gp(vcpu);
1048                 print_func_exit();
1049                 return;
1050         }
1051
1052         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
1053 #ifdef __x86_64__
1054                 if ((vcpu->shadow_efer & EFER_LME)) {
1055                         uint32_t guest_cs_ar;
1056                         if (!is_pae()) {
1057                                 printd("set_cr0: #GP, start paging "
1058                                        "in long mode while PAE is disabled\n");
1059                                 inject_gp(vcpu);
1060                                 print_func_exit();
1061                                 return;
1062                         }
1063                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1064                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
1065                                 printd("set_cr0: #GP, start paging "
1066                                        "in long mode while CS.L == 1\n");
1067                                 inject_gp(vcpu);
1068                                 print_func_exit();
1069                                 return;
1070
1071                         }
1072                 } else
1073 #endif
1074                 if (is_pae() &&
1075                             pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1076                         printd("set_cr0: #GP, pdptrs "
1077                                "reserved bits\n");
1078                         inject_gp(vcpu);
1079                         print_func_exit();
1080                         return;
1081                 }
1082
1083         }
1084
1085         __set_cr0(vcpu, cr0);
1086         litevm_mmu_reset_context(vcpu);
1087         print_func_exit();
1088         return;
1089 }
1090
1091 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
1092 {
1093         print_func_entry();
1094         unsigned long cr0 = guest_cr0();
1095
1096         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
1097                 enter_pmode(vcpu);
1098                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
1099
1100         } else
1101                 printd("lmsw: unexpected\n");
1102
1103         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
1104                                 | (msw & LMSW_GUEST_MASK));
1105         print_func_exit();
1106 }
1107
1108 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1109 {
1110         print_func_entry();
1111         vmcs_writel(CR4_READ_SHADOW, cr4);
1112         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1113                     LITEVM_RMODE_VM_CR4_ALWAYS_ON : LITEVM_PMODE_VM_CR4_ALWAYS_ON));
1114         print_func_exit();
1115 }
1116
1117 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1118 {
1119         print_func_entry();
1120         if (cr4 & CR4_RESEVED_BITS) {
1121                 printd("set_cr4: #GP, reserved bits\n");
1122                 inject_gp(vcpu);
1123                 print_func_exit();
1124                 return;
1125         }
1126
1127         if (is_long_mode()) {
1128                 if (!(cr4 & CR4_PAE_MASK)) {
1129                         printd("set_cr4: #GP, clearing PAE while "
1130                                "in long mode\n");
1131                         inject_gp(vcpu);
1132                         print_func_exit();
1133                         return;
1134                 }
1135         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
1136                    && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1137                 printd("set_cr4: #GP, pdptrs reserved bits\n");
1138                 inject_gp(vcpu);
1139         }
1140
1141         if (cr4 & CR4_VMXE_MASK) {
1142                 printd("set_cr4: #GP, setting VMXE\n");
1143                 inject_gp(vcpu);
1144                 print_func_exit();
1145                 return;
1146         }
1147         __set_cr4(vcpu, cr4);
1148         spin_lock_irqsave(&vcpu->litevm->lock);
1149         litevm_mmu_reset_context(vcpu);
1150         spin_unlock(&vcpu->litevm->lock);
1151         print_func_exit();
1152 }
1153
1154 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1155 {
1156         print_func_entry();
1157         if (is_long_mode()) {
1158                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
1159                         printd("set_cr3: #GP, reserved bits\n");
1160                         inject_gp(vcpu);
1161                         print_func_exit();
1162                         return;
1163                 }
1164         } else {
1165                 if (cr3 & CR3_RESEVED_BITS) {
1166                         printd("set_cr3: #GP, reserved bits\n");
1167                         inject_gp(vcpu);
1168                         print_func_exit();
1169                         return;
1170                 }
1171                 if (is_paging() && is_pae() &&
1172                     pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1173                         printd("set_cr3: #GP, pdptrs "
1174                                "reserved bits\n");
1175                         inject_gp(vcpu);
1176                         print_func_exit();
1177                         return;
1178                 }
1179         }
1180
1181         vcpu->cr3 = cr3;
1182         spin_lock_irqsave(&vcpu->litevm->lock);
1183         vcpu->mmu.new_cr3(vcpu);
1184         spin_unlock(&vcpu->litevm->lock);
1185         print_func_exit();
1186 }
1187
1188 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1189 {
1190         print_func_entry();
1191         if ( cr8 & CR8_RESEVED_BITS) {
1192                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1193                 inject_gp(vcpu);
1194                 print_func_exit();
1195                 return;
1196         }
1197         vcpu->cr8 = cr8;
1198         print_func_exit();
1199 }
1200
1201 static uint32_t get_rdx_init_val(void)
1202 {
1203         print_func_entry();
1204         uint32_t val;
1205
1206         asm ("movl $1, %%eax \n\t"
1207              "movl %%eax, %0 \n\t" : "=g"(val) );
1208         print_func_exit();
1209         return val;
1210
1211 }
1212
1213 static void fx_init(struct litevm_vcpu *vcpu)
1214 {
1215         print_func_entry();
1216         struct __attribute__ ((__packed__)) fx_image_s {
1217                 uint16_t control; //fcw
1218                 uint16_t status; //fsw
1219                 uint16_t tag; // ftw
1220                 uint16_t opcode; //fop
1221                 uint64_t ip; // fpu ip
1222                 uint64_t operand;// fpu dp
1223                 uint32_t mxcsr;
1224                 uint32_t mxcsr_mask;
1225
1226         } *fx_image;
1227
1228         fx_save(vcpu->host_fx_image);
1229         fpu_init();
1230         fx_save(vcpu->guest_fx_image);
1231         fx_restore(vcpu->host_fx_image);
1232
1233         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1234         fx_image->mxcsr = 0x1f80;
1235         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1236                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1237         print_func_exit();
1238 }
1239
1240 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field, uint32_t val)
1241 {
1242         print_func_entry();
1243         uint32_t msr_high, msr_low;
1244         uint64_t msrval;
1245
1246         msrval = read_msr(msr);
1247         msr_low = msrval;
1248         msr_high = (msrval>>32);
1249
1250         val &= msr_high;
1251         val |= msr_low;
1252         vmcs_write32(vmcs_field, val);
1253         print_func_exit();
1254 }
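/*
 * Each VMX capability MSR reports, in its low 32 bits, control bits that must
 * be 1 and, in its high 32 bits, bits that may be 1.  The (val & msr_high) |
 * msr_low combination above therefore clamps the requested controls to what
 * the CPU supports while forcing the mandatory bits on.
 */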
1255
1256 /*
1257  * Sets up the vmcs for emulated real mode.
1258  */
1259 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1260 {
1261         print_func_entry();
1262 /* no op on x86_64 */
1263 #define asmlinkage
1264         extern asmlinkage void litevm_vmx_return(void);
1265         uint32_t host_sysenter_cs;
1266         uint32_t junk;
1267         uint64_t a;
1268         struct descriptor_table dt;
1269         int i;
1270         int ret;
1271         uint64_t tsc;
1272         int nr_good_msrs;
1273
1274
1275         if (!init_rmode_tss(vcpu->litevm)) {
1276                 error("vcpu_setup: init_rmode_tss failed");
1277         }
1278
1279         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1280         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1281         vcpu->cr8 = 0;
1282         vcpu->apic_base = 0xfee00000 |
1283                         /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1284                         MSR_IA32_APICBASE_ENABLE;
1285
1286         fx_init(vcpu);
1287
1288 #define SEG_SETUP(seg) do {                                     \
1289                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1290                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1291                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1292                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1293         } while (0)
1294
1295         /*
1296          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1297          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1298          */
1299         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1300         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1301         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1302         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1303
1304         SEG_SETUP(DS);
1305         SEG_SETUP(ES);
1306         SEG_SETUP(FS);
1307         SEG_SETUP(GS);
1308         SEG_SETUP(SS);
1309
1310         vmcs_write16(GUEST_TR_SELECTOR, 0);
1311         vmcs_writel(GUEST_TR_BASE, 0);
1312         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1313         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1314
1315         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1316         vmcs_writel(GUEST_LDTR_BASE, 0);
1317         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1318         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1319
1320         vmcs_write32(GUEST_SYSENTER_CS, 0);
1321         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1322         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1323
1324         vmcs_writel(GUEST_RFLAGS, 0x02);
1325         vmcs_writel(GUEST_RIP, 0xfff0);
1326         vmcs_writel(GUEST_RSP, 0);
1327
1328         vmcs_writel(GUEST_CR3, 0);
1329
1330         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1331         vmcs_writel(GUEST_DR7, 0x400);
1332
1333         vmcs_writel(GUEST_GDTR_BASE, 0);
1334         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1335
1336         vmcs_writel(GUEST_IDTR_BASE, 0);
1337         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1338
1339         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1340         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1341         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1342
1343         /* I/O */
1344         vmcs_write64(IO_BITMAP_A, 0);
1345         vmcs_write64(IO_BITMAP_B, 0);
1346
1347         tsc = read_tsc();
1348         vmcs_write64(TSC_OFFSET, -tsc);
1349
1350         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1351
1352         /* Special registers */
1353         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1354
1355         /* Control */
1356         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
1357                                PIN_BASED_VM_EXEC_CONTROL,
1358                                PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
1359                                | PIN_BASED_NMI_EXITING   /* 20.6.1 */
1360                         );
1361         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
1362                                CPU_BASED_VM_EXEC_CONTROL,
1363                                CPU_BASED_HLT_EXITING         /* 20.6.2 */
1364                                | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
1365                                | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
1366                                | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
1367                                | CPU_BASED_INVDPG_EXITING
1368                                | CPU_BASED_MOV_DR_EXITING
1369                                | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
1370                         );
1371
1372         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1373         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1374         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1375         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1376
1377         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1378         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1379         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1380
1381 #warning "not setting selectors; do we need them?"
1382 #if 0
1383         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
1384         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1385         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1386 #endif
1387         vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
1388         vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
1389 #if 0
1390         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1391 #endif
1392 #ifdef __x86_64__
1393         a = read_msr(MSR_FS_BASE);
1394         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1395         a = read_msr(MSR_GS_BASE);
1396         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1397 #else
1398         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1399         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1400 #endif
1401
1402 #warning "Not setting HOST_TR_SELECTOR"
1403 #if 0
1404         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
1405 #endif
1406
1407         get_idt(&dt);
1408         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1409
1410
1411         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */
1412
1413         /* it's the HIGH 32 bits! */
1414         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1415         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1416         a = read_msr(MSR_IA32_SYSENTER_ESP);
1417         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
1418         a = read_msr(MSR_IA32_SYSENTER_EIP);
1419         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
1420
1421         ret = -ENOMEM;
1422         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1423         if (!vcpu->guest_msrs)
1424                 error("guest_msrs kmalloc failed");
1425         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1426         if (!vcpu->host_msrs)
1427                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1428
1429         for (i = 0; i < NR_VMX_MSR; ++i) {
1430                 uint32_t index = vmx_msr_index[i];
1431                 uint32_t data_low, data_high;
1432                 uint64_t data;
1433                 int j = vcpu->nmsrs;
1434
1435 #warning "need readmsr_safe"
1436 //              if (rdmsr_safe(index, &data_low, &data_high) < 0)
1437 //                      continue;
1438                 data = read_msr(index);
1439                 vcpu->host_msrs[j].index = index;
1440                 vcpu->host_msrs[j].reserved = 0;
1441                 vcpu->host_msrs[j].data = data;
1442                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1443                 ++vcpu->nmsrs;
1444         }
1445         printk("msrs: %d\n", vcpu->nmsrs);
1446
1447         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1448         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
1449                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1450         vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1451                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1452         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1453                     PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1454         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
1455                                (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */
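        /* Bit 9 of the VM-exit controls is "host address-space size"; setting
         * it only when HOST_IS_64 keeps a 64-bit host in long mode across a
         * VM exit. */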
1456         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1457         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
1458         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1459
1460
1461         /* 22.2.1, 20.8.1 */
1462         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
1463                                VM_ENTRY_CONTROLS, 0);
1464         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1465
1466         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1467         vmcs_writel(TPR_THRESHOLD, 0);
1468
1469         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1470         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1471
1472         __set_cr0(vcpu, 0x60000010); // enter rmode
1473         __set_cr4(vcpu, 0);
1474 #ifdef __x86_64__
1475         __set_efer(vcpu, 0);
1476 #endif
1477
1478         ret = litevm_mmu_init(vcpu);
1479
1480         print_func_exit();
1481         return ret;
1482
1483 out_free_guest_msrs:
1484         kfree(vcpu->guest_msrs);
1485 out:
1486         return ret;
1487 }
1488
1489 /*
1490  * Sync the rsp and rip registers into the vcpu structure.  This allows
1491  * registers to be accessed by indexing vcpu->regs.
1492  */
1493 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1494 {
1495         print_func_entry();
1496         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1497         vcpu->rip = vmcs_readl(GUEST_RIP);
1498         print_func_exit();
1499 }
1500
1501 /*
1502  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1503  * modification.
1504  */
1505 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1506 {
1507         print_func_entry();
1508         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1509         vmcs_writel(GUEST_RIP, vcpu->rip);
1510         print_func_exit();
1511 }
1512
1513 /*
1514  * Creates some virtual cpus.  Good luck creating more than one.
1515  */
1516 int vmx_create_vcpu(struct litevm *litevm, int n)
1517 {
1518         print_func_entry();
1519         ERRSTACK(1);
1520         int r;
1521         struct litevm_vcpu *vcpu;
1522         struct vmcs *vmcs;
1523         char *errstring = NULL;
1524
1525         if (n < 0 || n >= LITEVM_MAX_VCPUS){
1526                 printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n, LITEVM_MAX_VCPUS);
1527                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n, LITEVM_MAX_VCPUS);
1528         }
1529
1530         vcpu = &litevm->vcpus[n];
1531
1532         qlock(&vcpu->mutex);
1533
1534         if (vcpu->vmcs) {
1535                 qunlock(&vcpu->mutex);
1536                 printk("VM already exists\n");
1537                 error("VM already exists");
1538         }
1539
1540         /* Round vcpu->fx_buf up to the next FX_IMAGE_ALIGN boundary by hand,
1541          * i.e. ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN). */
1542         uint64_t a = (uint64_t) vcpu->fx_buf;
1543         a += FX_IMAGE_ALIGN-1;
1544         a /= FX_IMAGE_ALIGN;
1545         a *= FX_IMAGE_ALIGN;
1546
1547         vcpu->host_fx_image = (char*)a;
1548         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1549
1550         vcpu->cpu = -1;  /* First load will set up TR */
1551         vcpu->litevm = litevm;
1552
1553         vmcs = alloc_vmcs();
1554         if (!vmcs) {
1555                 errstring = "vmcs allocate failed";
1556                 printk("%s\n", errstring);
1557                 qunlock(&vcpu->mutex);
1558                 goto out_free_vcpus;
1559         }
1560         vmcs_clear(vmcs);
1561         printk("after vmcs_clear\n");
1562         vcpu->vmcs = vmcs;
1563         vcpu->launched = 0;
1564         printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
1565         printk("before vcpu_load\n");
1566         __vcpu_load(vcpu);
1567
1568         printk("PAST vcpu_load\n");
1569         if (waserror()){
1570                 /* we really need to fix waserror() */
1571                 poperror();
1572                 goto out_free_vcpus;
1573         }
1574
1575         r = litevm_vcpu_setup(vcpu);
1576
1577         vcpu_put(vcpu);
1578
1579         printk("r is %d\n", r);
1580
1581         if (! r) {
1582                 
1583                 print_func_exit();
1584                 return 0;
1585         }
1586
1587         errstring = "vcpu setup failed";
1588
1589 out_free_vcpus:
1590         printk("out_free_vcpus: life sucks\n");
1591         litevm_free_vcpu(vcpu);
1592         error(errstring);
1593 out:
1594         print_func_exit();
1595         return r;
1596 }
1597
1598 /*
1599  * Allocate some memory and give it an address in the guest physical address
1600  * space.
1601  *
1602  * Discontiguous memory is allowed, mostly for framebuffers.
1603  */
1604 int vm_set_memory_region(struct litevm *litevm,
1605                                            struct litevm_memory_region *mem)
1606 {
1607         print_func_entry();
1608         ERRSTACK(2);
1609         int r;
1610         gfn_t base_gfn;
1611         unsigned long npages;
1612         unsigned long i;
1613         struct litevm_memory_slot *memslot;
1614         struct litevm_memory_slot old, new;
1615         int memory_config_version;
1616         void *init_data = mem->init_data;
1617         int pass = 1;
1618
1619         printk("litevm %p\n", litevm);
1620         /* should not happen but ... */
1621         if (! litevm)
1622                 error("NULL litevm in %s", __func__);
1623
1624         if (!mem)
1625                 error("NULL mem in %s", __func__);
1626
1627         if (litevm->busy)
1628                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1629         r = -EINVAL;
1630         /* General sanity checks */
1631         if (mem->memory_size & (PAGE_SIZE - 1))
1632                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1633         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1634                 error("guest_phys_addr 0x%llx is not page-aligned", mem->guest_phys_addr);
1635         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1636                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1637         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1638                 error("0x%llx + 0x%llx is < 0x%llx",
1639                       mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1640
1641         memslot = &litevm->memslots[mem->slot];
1642         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1643         npages = mem->memory_size >> PAGE_SHIFT;
1644
1645         if (!npages)
1646                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1647
1648         /* This is actually a very tricky loop.  The use of error() is a
1649          * bit dangerous, so we don't use it much.  Consider a rewrite.
1650          * It would be nice if akaros could do the allocation of a bunch
1651          * of pages for us.
1652          */
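        /*
         * Shape of the retry protocol: under the lock we snapshot
         * memory_config_version and validate the request, then drop the lock
         * to do the page and bitmap allocations.  After retaking the lock, a
         * changed memory_config_version means someone reconfigured guest
         * memory in the meantime, so we free what we allocated and jump back
         * here to try again.
         */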
1653 raced:
1654         printk("raced: pass %d\n", pass++);
1655         spin_lock_irqsave(&litevm->lock);
1656         printk("locked\n");
1657
1658         if (waserror()){
1659                 spin_unlock(&litevm->lock);
1660                 nexterror();
1661         }
1662                 
1663         memory_config_version = litevm->memory_config_version;
1664         new = old = *memslot;
1665
1666         new.base_gfn = base_gfn;
1667         new.npages = npages;
1668         new.flags = mem->flags;
1669
1670         /* Disallow changing a memory slot's size. */
1671         r = -EINVAL;
1672         if (npages && old.npages && npages != old.npages)
1673                 error("npages is %lu, old.npages is %lu, can't change",
1674                       npages, old.npages);
1675
1676         /* Check for overlaps */
1677         r = -EEXIST;
1678         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1679                 struct litevm_memory_slot *s = &litevm->memslots[i];
1680
1681                 if (s == memslot)
1682                         continue;
1683                 if (!((base_gfn + npages <= s->base_gfn) ||
1684                       (base_gfn >= s->base_gfn + s->npages)))
1685                         error("Overlap");
1686         }
1687         /*
1688          * Do memory allocations outside lock.  memory_config_version will
1689          * detect any races.
1690          */
1691         spin_unlock(&litevm->lock);
1692         printk("unlocked\n");
1693         poperror();
1694
1695         /* Deallocate if slot is being removed */
1696         if (!npages)
1697                 new.phys_mem = 0;
1698
1699         /* Free page dirty bitmap if unneeded */
1700         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1701                 new.dirty_bitmap = 0;
1702
1703         r = -ENOMEM;
1704
1705         /* Allocate if a slot is being created */
1706         if (npages && !new.phys_mem) {
1707                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1708
1709                 if (!new.phys_mem)
1710                         goto out_free;
1711
1712                 for (i = 0; i < npages; ++i) {
1713                         int ret;
1714                         ret = kpage_alloc(&new.phys_mem[i]);
1715                         if (ret != ESUCCESS)
1716                                 goto out_free;
1717                         if (init_data){
1718                                 printk("init data memcpy(%p,%p,4096);\n",
1719                                        page2kva(new.phys_mem[i]), init_data);
1720                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1721                                 init_data += PAGE_SIZE;
1722                         }
1723                 }
1724         }
1725
1726         /* Allocate page dirty bitmap if needed */
1727         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
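                /* The bitmap needs one bit per page, rounded up to a whole
                 * number of longs.  E.g. with BITS_PER_LONG == 64 and
                 * npages == 100, this rounds up to 128 bits == 16 bytes. */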
1728                 unsigned dirty_bytes;   /* ALIGN(npages, BITS_PER_LONG) / 8 */
1729                 dirty_bytes = (((npages + BITS_PER_LONG - 1) / BITS_PER_LONG) * BITS_PER_LONG) / 8;
1730
1731                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1732                 if (!new.dirty_bitmap){
1733                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1734                         goto out_free;
1735                 }
1736         }
1737
1738         spin_lock_irqsave(&litevm->lock);
1739         printk("locked\n");
1740         if (memory_config_version != litevm->memory_config_version) {
1741                 spin_unlock(&litevm->lock);
1742                 printk("unlocked, try again\n");
1743                 litevm_free_physmem_slot(&new, &old);
1744                 goto raced;
1745         }
1746
1747         r = -EAGAIN;
1748         if (litevm->busy){
1749                 printk("BUSY!\n");
1750                 goto out_unlock;
1751         }
1752
1753         if (mem->slot >= litevm->nmemslots)
1754                 litevm->nmemslots = mem->slot + 1;
1755
1756         *memslot = new;
1757         ++litevm->memory_config_version;
1758
1759         spin_unlock(&litevm->lock);
1760         printk("unlocked\n");
1761         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1762                 struct litevm_vcpu *vcpu;
1763
1764                 vcpu = vcpu_load(litevm, i);
1765                 if (!vcpu)
1766                         continue;
1767                 litevm_mmu_reset_context(vcpu);
1768                 vcpu_put(vcpu);
1769         }
1770
1771         litevm_free_physmem_slot(&old, &new);
1772         print_func_exit();
1773         return 0;
1774
1775 out_unlock:
1776         spin_unlock(&litevm->lock);
1777         printk("out_unlock\n");
1778 out_free:
1779         printk("out_free\n");
1780         litevm_free_physmem_slot(&new, &old);
1781 out:
1782         printk("vm_set_memory_region: return %d\n", r);
1783         print_func_exit();
1784         return r;
1785 }
1786
1787 #if 0
1788 /*
1789  * Get (and clear) the dirty memory log for a memory slot.
1790  */
1791 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1792                                        struct litevm_dirty_log *log)
1793 {
1794         struct litevm_memory_slot *memslot;
1795         int r, i;
1796         int n;
1797         unsigned long any = 0;
1798
1799         spin_lock_irqsave(&litevm->lock);
1800
1801         /*
1802          * Prevent changes to guest memory configuration even while the lock
1803          * is not taken.
1804          */
1805         ++litevm->busy;
1806         spin_unlock(&litevm->lock);
1807         r = -EINVAL;
1808         if (log->slot >= LITEVM_MEMORY_SLOTS)
1809                 goto out;
1810
1811         memslot = &litevm->memslots[log->slot];
1812         r = -ENOENT;
1813         if (!memslot->dirty_bitmap)
1814                 goto out;
1815
1816         n = ALIGN(memslot->npages, 8) / 8;
1817
1818         for (i = 0; !any && i < n; ++i)
1819                 any = memslot->dirty_bitmap[i];
1820
1821         r = -EFAULT;
1822         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1823                 goto out;
1824
1825
1826         if (any) {
1827                 spin_lock_irqsave(&litevm->lock);
1828                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1829                 spin_unlock(&litevm->lock);
1830                 memset(memslot->dirty_bitmap, 0, n);
1831                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1832                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1833
1834                         if (!vcpu)
1835                                 continue;
1836                         flush_guest_tlb(vcpu);
1837                         vcpu_put(vcpu);
1838                 }
1839         }
1840
1841         r = 0;
1842
1843 out:
1844         spin_lock_irqsave(&litevm->lock);
1845         --litevm->busy;
1846         spin_unlock(&litevm->lock);
1847         return r;
1848 }
1849 #endif
1850
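/*
 * Find the memory slot whose guest-frame range covers @gfn.  Returns 0 (NULL)
 * if no registered slot contains that frame.
 */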
1851 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1852 {
1853         print_func_entry();
1854         int i;
1855
1856         for (i = 0; i < litevm->nmemslots; ++i) {
1857                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1858
1859                 if (gfn >= memslot->base_gfn
1860                     && gfn < memslot->base_gfn + memslot->npages) {
1861                         print_func_exit();
1862                         return memslot;
1863                 }
1864         }
1865         print_func_exit();
1866         return 0;
1867 }
1868
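/*
 * Set the dirty bit for @gfn in the owning slot's dirty bitmap, if that slot
 * exists and has dirty logging enabled.
 */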
1869 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1870 {
1871         print_func_entry();
1872         int i;
1873         struct litevm_memory_slot *memslot = 0;
1874         unsigned long rel_gfn;
1875
1876         for (i = 0; i < litevm->nmemslots; ++i) {
1877                 memslot = &litevm->memslots[i];
1878
1879                 if (gfn >= memslot->base_gfn
1880                     && gfn < memslot->base_gfn + memslot->npages) {
1881
1882                         if (!memslot || !memslot->dirty_bitmap) {
1883                                 print_func_exit();
1884                                 return;
1885                         }
1886
1887                         rel_gfn = gfn - memslot->base_gfn;
1888
1889                         /* avoid RMW */
1890                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1891                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1892                         print_func_exit();
1893                         return;
1894                 }
1895         }
1896         print_func_exit();
1897 }
1898
1899 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1900 {
1901         print_func_entry();
1902         unsigned long rip;
1903         uint32_t interruptibility;
1904
1905         rip = vmcs_readl(GUEST_RIP);
1906         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1907         vmcs_writel(GUEST_RIP, rip);
1908
1909         /*
1910          * We emulated an instruction, so temporary interrupt blocking
1911          * should be removed, if set.
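         * Bits 0 and 1 of the interruptibility field are "blocking by STI"
         * and "blocking by MOV SS", which is why the low two bits are
         * cleared below.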
1912          */
1913         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1914         if (interruptibility & 3)
1915                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
1916                              interruptibility & ~3);
1917         print_func_exit();
1918 }
1919
1920 static int emulator_read_std(unsigned long addr,
1921                              unsigned long *val,
1922                              unsigned int bytes,
1923                              struct x86_emulate_ctxt *ctxt)
1924 {
1925         print_func_entry();
1926         struct litevm_vcpu *vcpu = ctxt->vcpu;
1927         void *data = val;
1928
1929         while (bytes) {
1930                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1931                 unsigned offset = addr & (PAGE_SIZE-1);
1932                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ? 
1933                         bytes : (unsigned)PAGE_SIZE - offset;
1934                 unsigned long pfn;
1935                 struct litevm_memory_slot *memslot;
1936                 void *page;
1937
1938                 if (gpa == UNMAPPED_GVA) {
1939                         print_func_exit();
1940                         return X86EMUL_PROPAGATE_FAULT;
1941                 }
1942                 pfn = gpa >> PAGE_SHIFT;
1943                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1944                 if (!memslot) {
1945                         print_func_exit();
1946                         return X86EMUL_UNHANDLEABLE;
1947                 }
1948                 page = page2kva(gfn_to_page(memslot, pfn));
1949
1950                 memcpy(data, page + offset, tocopy);
1951
1952                 bytes -= tocopy;
1953                 data += tocopy;
1954                 addr += tocopy;
1955         }
1956
1957         print_func_exit();
1958         return X86EMUL_CONTINUE;
1959 }
1960
1961 static int emulator_write_std(unsigned long addr,
1962                               unsigned long val,
1963                               unsigned int bytes,
1964                               struct x86_emulate_ctxt *ctxt)
1965 {
1966         print_func_entry();
1967         printk("emulator_write_std: addr %lx n %d\n",
1968                addr, bytes);
1969         print_func_exit();
1970         return X86EMUL_UNHANDLEABLE;
1971 }
1972
1973 static int emulator_read_emulated(unsigned long addr,
1974                                   unsigned long *val,
1975                                   unsigned int bytes,
1976                                   struct x86_emulate_ctxt *ctxt)
1977 {
1978         print_func_entry();
1979         struct litevm_vcpu *vcpu = ctxt->vcpu;
1980
1981         if (vcpu->mmio_read_completed) {
1982                 memcpy(val, vcpu->mmio_data, bytes);
1983                 vcpu->mmio_read_completed = 0;
1984                 print_func_exit();
1985                 return X86EMUL_CONTINUE;
1986         } else if (emulator_read_std(addr, val, bytes, ctxt)
1987                    == X86EMUL_CONTINUE) {
1988                 print_func_exit();
1989                 return X86EMUL_CONTINUE;
1990         }
1991         else {
1992                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1993                 if (gpa == UNMAPPED_GVA) {
1994                         print_func_exit();
1995                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
1996                 }
1997                 vcpu->mmio_needed = 1;
1998                 vcpu->mmio_phys_addr = gpa;
1999                 vcpu->mmio_size = bytes;
2000                 vcpu->mmio_is_write = 0;
2001
2002                 print_func_exit();
2003                 return X86EMUL_UNHANDLEABLE;
2004         }
2005 }
2006
2007 static int emulator_write_emulated(unsigned long addr,
2008                                    unsigned long val,
2009                                    unsigned int bytes,
2010                                    struct x86_emulate_ctxt *ctxt)
2011 {
2012         print_func_entry();
2013         struct litevm_vcpu *vcpu = ctxt->vcpu;
2014         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
2015
2016         if (gpa == UNMAPPED_GVA) {
2017                 print_func_exit();
2018                 return X86EMUL_PROPAGATE_FAULT;
2019         }
2020
2021         vcpu->mmio_needed = 1;
2022         vcpu->mmio_phys_addr = gpa;
2023         vcpu->mmio_size = bytes;
2024         vcpu->mmio_is_write = 1;
2025         memcpy(vcpu->mmio_data, &val, bytes);
2026
2027         print_func_exit();
2028         return X86EMUL_CONTINUE;
2029 }
2030
2031 static int emulator_cmpxchg_emulated(unsigned long addr,
2032                                      unsigned long old,
2033                                      unsigned long new,
2034                                      unsigned int bytes,
2035                                      struct x86_emulate_ctxt *ctxt)
2036 {
2037         print_func_entry();
2038         static int reported;
2039
2040         if (!reported) {
2041                 reported = 1;
2042                 printk("litevm: emulating exchange as write\n");
2043         }
2044         print_func_exit();
2045         return emulator_write_emulated(addr, new, bytes, ctxt);
2046 }
2047
2048 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2049 {
2050         print_func_entry();
2051         static int reported;
2052         uint8_t opcodes[4];
2053         unsigned long rip = vmcs_readl(GUEST_RIP);
2054         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2055
2056         if (reported) {
2057                 print_func_exit();
2058                 return;
2059         }
2060
2061         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2062
2063         printk("emulation failed but !mmio_needed?"
2064                " rip %lx %02x %02x %02x %02x\n",
2065                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2066         reported = 1;
2067         print_func_exit();
2068 }
2069
2070 struct x86_emulate_ops emulate_ops = {
2071         .read_std            = emulator_read_std,
2072         .write_std           = emulator_write_std,
2073         .read_emulated       = emulator_read_emulated,
2074         .write_emulated      = emulator_write_emulated,
2075         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2076 };
2077
2078 enum emulation_result {
2079         EMULATE_DONE,       /* no further processing */
2080         EMULATE_DO_MMIO,      /* litevm_run filled with mmio request */
2081         EMULATE_FAIL,         /* can't emulate this instruction */
2082 };
2083
2084 static int emulate_instruction(struct litevm_vcpu *vcpu,
2085                                struct litevm_run *run,
2086                                unsigned long cr2,
2087                                uint16_t error_code)
2088 {
2089         print_func_entry();
2090         struct x86_emulate_ctxt emulate_ctxt;
2091         int r;
2092         uint32_t cs_ar;
2093
2094         vcpu_load_rsp_rip(vcpu);
2095
2096         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2097
2098         emulate_ctxt.vcpu = vcpu;
2099         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2100         emulate_ctxt.cr2 = cr2;
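        /*
         * Pick the emulation mode from guest state: the VM flag means
         * virtual-8086/real mode, CS.L means 64-bit mode, and otherwise
         * CS.D/B distinguishes 32-bit from 16-bit protected mode.
         */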
2101         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2102                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2103                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2104                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2105
2106         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2107                 emulate_ctxt.cs_base = 0;
2108                 emulate_ctxt.ds_base = 0;
2109                 emulate_ctxt.es_base = 0;
2110                 emulate_ctxt.ss_base = 0;
2111                 emulate_ctxt.gs_base = 0;
2112                 emulate_ctxt.fs_base = 0;
2113         } else {
2114                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2115                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2116                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2117                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2118                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2119                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2120         }
2121
2122         vcpu->mmio_is_write = 0;
2123         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2124
2125         if ((r || vcpu->mmio_is_write) && run) {
2126                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2127                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2128                 run->mmio.len = vcpu->mmio_size;
2129                 run->mmio.is_write = vcpu->mmio_is_write;
2130         }
2131
2132         if (r) {
2133                 if (!vcpu->mmio_needed) {
2134                         report_emulation_failure(&emulate_ctxt);
2135                         print_func_exit();
2136                         return EMULATE_FAIL;
2137                 }
2138                 print_func_exit();
2139                 return EMULATE_DO_MMIO;
2140         }
2141
2142         vcpu_put_rsp_rip(vcpu);
2143         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2144
2145         if (vcpu->mmio_is_write) {
2146                 print_func_exit();
2147                 return EMULATE_DO_MMIO;
2148         }
2149
2150         print_func_exit();
2151         return EMULATE_DONE;
2152 }
2153
2154 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2155 {
2156         print_func_entry();
2157         print_func_exit();
2158         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2159 }
2160
2161 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2162 {
2163         print_func_entry();
2164         vmcs_writel(GUEST_GDTR_BASE, base);
2165         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2166         print_func_exit();
2167 }
2168
2169 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2170 {
2171         print_func_entry();
2172         vmcs_writel(GUEST_IDTR_BASE, base);
2173         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2174         print_func_exit();
2175 }
2176
2177 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2178                    unsigned long *rflags)
2179 {
2180         print_func_entry();
2181         lmsw(vcpu, msw);
2182         *rflags = vmcs_readl(GUEST_RFLAGS);
2183         print_func_exit();
2184 }
2185
2186 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2187 {
2188         print_func_entry();
2189         switch (cr) {
2190         case 0:
2191                 print_func_exit();
2192                 return guest_cr0();
2193         case 2:
2194                 print_func_exit();
2195                 return vcpu->cr2;
2196         case 3:
2197                 print_func_exit();
2198                 return vcpu->cr3;
2199         case 4:
2200                 print_func_exit();
2201                 return guest_cr4();
2202         default:
2203                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2204                 print_func_exit();
2205                 return 0;
2206         }
2207 }
2208
2209 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2210                      unsigned long *rflags)
2211 {
2212         print_func_entry();
2213         switch (cr) {
2214         case 0:
2215                 set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2216                 *rflags = vmcs_readl(GUEST_RFLAGS);
2217                 break;
2218         case 2:
2219                 vcpu->cr2 = val;
2220                 break;
2221         case 3:
2222                 set_cr3(vcpu, val);
2223                 break;
2224         case 4:
2225                 set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2226                 break;
2227         default:
2228                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2229         }
2230         print_func_exit();
2231 }
2232
2233 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2234                                   int vec, uint32_t err_code)
2235 {
2236         print_func_entry();
2237         if (!vcpu->rmode.active) {
2238                 print_func_exit();
2239                 return 0;
2240         }
2241
2242         if (vec == GP_VECTOR && err_code == 0)
2243                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2244                         print_func_exit();
2245                         return 1;
2246                 }
2247         print_func_exit();
2248         return 0;
2249 }
2250
2251 static int handle_exception(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2252 {
2253         print_func_entry();
2254         uint32_t intr_info, error_code;
2255         unsigned long cr2, rip;
2256         uint32_t vect_info;
2257         enum emulation_result er;
2258
2259         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2260         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2261
2262         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2263                                                 !is_page_fault(intr_info)) {
2264                 printk("%s: unexpected, vectoring info 0x%x "
2265                        "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2266         }
2267
2268         if (is_external_interrupt(vect_info)) {
2269                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2270                 SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_pending), irq);
2271                 SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_summary), irq / BITS_PER_LONG);
2272         }
2273
2274         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
2275                 asm ("int $2");
2276                 print_func_exit();
2277                 return 1;
2278         }
2279         error_code = 0;
2280         rip = vmcs_readl(GUEST_RIP);
2281         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2282                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2283         if (is_page_fault(intr_info)) {
2284                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2285
2286                 spin_lock_irqsave(&vcpu->litevm->lock);
2287                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2288                         spin_unlock(&vcpu->litevm->lock);
2289                         print_func_exit();
2290                         return 1;
2291                 }
2292
2293                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2294                 spin_unlock(&vcpu->litevm->lock);
2295
2296                 switch (er) {
2297                 case EMULATE_DONE:
2298                         print_func_exit();
2299                         return 1;
2300                 case EMULATE_DO_MMIO:
2301                         ++litevm_stat.mmio_exits;
2302                         litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2303                         print_func_exit();
2304                         return 0;
2305                 case EMULATE_FAIL:
2306                         vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2307                         break;
2308                 default:
2309                         assert(0);
2310                 }
2311         }
2312
2313         if (vcpu->rmode.active &&
2314             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2315                                    error_code)) {
2316                 print_func_exit();
2317                 return 1;
2318         }
2319
2320         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
2321                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2322                 print_func_exit();
2323                 return 0;
2324         }
2325         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2326         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2327         litevm_run->ex.error_code = error_code;
2328         print_func_exit();
2329         return 0;
2330 }
2331
2332 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2333                                      struct litevm_run *litevm_run)
2334 {
2335         print_func_entry();
2336         ++litevm_stat.irq_exits;
2337         print_func_exit();
2338         return 1;
2339 }
2340
2341
2342 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t *count)
2343 {
2344         print_func_entry();
2345         uint64_t inst;
2346         gva_t rip;
2347         int countr_size;
2348         int i, n;
2349
2350         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2351                 countr_size = 2;
2352         } else {
2353                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2354
2355                 countr_size = (cs_ar & AR_L_MASK) ? 8:
2356                               (cs_ar & AR_DB_MASK) ? 4: 2;
2357         }
2358
2359         rip =  vmcs_readl(GUEST_RIP);
2360         if (countr_size != 8)
2361                 rip += vmcs_readl(GUEST_CS_BASE);
2362
2363         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2364
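        /*
         * Scan the instruction bytes we just read: legacy prefixes (lock,
         * rep, segment overrides, operand size) are skipped, a 0x67
         * address-size prefix toggles the counter width and then falls
         * through to the default case, and the first non-prefix byte ends
         * the scan.
         */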
2365         for (i = 0; i < n; i++) {
2366                 switch (((uint8_t*)&inst)[i]) {
2367                 case 0xf0:
2368                 case 0xf2:
2369                 case 0xf3:
2370                 case 0x2e:
2371                 case 0x36:
2372                 case 0x3e:
2373                 case 0x26:
2374                 case 0x64:
2375                 case 0x65:
2376                 case 0x66:
2377                         break;
2378                 case 0x67:
2379                         countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
2380                 default:
2381                         goto done;
2382                 }
2383         }
2384         print_func_exit();
2385         return 0;
2386 done:
2387         countr_size *= 8;
2388         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2389         print_func_exit();
2390         return 1;
2391 }
2392
2393 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2394 {
2395         print_func_entry();
2396         uint64_t exit_qualification;
2397
2398         ++litevm_stat.io_exits;
2399         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
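        /*
         * Exit qualification layout for I/O instructions: bits 2:0 hold the
         * access size minus one, bit 3 is the direction (1 = in), bit 4 is
         * set for string instructions, bit 5 for REP prefixes, and bits
         * 31:16 hold the port number.
         */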
2400         litevm_run->exit_reason = LITEVM_EXIT_IO;
2401         if (exit_qualification & 8)
2402                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2403         else
2404                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2405         litevm_run->io.size = (exit_qualification & 7) + 1;
2406         litevm_run->io.string = (exit_qualification & 16) != 0;
2407         litevm_run->io.string_down
2408                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2409         litevm_run->io.rep = (exit_qualification & 32) != 0;
2410         litevm_run->io.port = exit_qualification >> 16;
2411         if (litevm_run->io.string) {
2412                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2413                         print_func_exit();
2414                         return 1;
2415                 }
2416                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2417         } else
2418                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
2419         print_func_exit();
2420         return 0;
2421 }
2422
2423 static int handle_invlpg(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2424 {
2425         print_func_entry();
2426         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2427         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2428         spin_lock_irqsave(&vcpu->litevm->lock);
2429         vcpu->mmu.inval_page(vcpu, address);
2430         spin_unlock(&vcpu->litevm->lock);
2431         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2432         print_func_exit();
2433         return 1;
2434 }
2435
2436 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2437 {
2438         print_func_entry();
2439         uint64_t exit_qualification;
2440         int cr;
2441         int reg;
2442
2443 #ifdef LITEVM_DEBUG
2444         if (guest_cpl() != 0) {
2445                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2446                 inject_gp(vcpu);
2447                 print_func_exit();
2448                 return 1;
2449         }
2450 #endif
2451
2452         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2453         cr = exit_qualification & 15;
2454         reg = (exit_qualification >> 8) & 15;
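        /*
         * For CR accesses the exit qualification encodes the control
         * register in bits 3:0, the access type in bits 5:4 (0 = MOV to CR,
         * 1 = MOV from CR, 3 = LMSW), and the GPR involved in bits 11:8.
         */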
2455         switch ((exit_qualification >> 4) & 3) {
2456         case 0: /* mov to cr */
2457                 switch (cr) {
2458                 case 0:
2459                         vcpu_load_rsp_rip(vcpu);
2460                         set_cr0(vcpu, vcpu->regs[reg]);
2461                         skip_emulated_instruction(vcpu);
2462                         print_func_exit();
2463                         return 1;
2464                 case 3:
2465                         vcpu_load_rsp_rip(vcpu);
2466                         set_cr3(vcpu, vcpu->regs[reg]);
2467                         skip_emulated_instruction(vcpu);
2468                         print_func_exit();
2469                         return 1;
2470                 case 4:
2471                         vcpu_load_rsp_rip(vcpu);
2472                         set_cr4(vcpu, vcpu->regs[reg]);
2473                         skip_emulated_instruction(vcpu);
2474                         print_func_exit();
2475                         return 1;
2476                 case 8:
2477                         vcpu_load_rsp_rip(vcpu);
2478                         set_cr8(vcpu, vcpu->regs[reg]);
2479                         skip_emulated_instruction(vcpu);
2480                         print_func_exit();
2481                         return 1;
2482                 };
2483                 break;
2484         case 1: /*mov from cr*/
2485                 switch (cr) {
2486                 case 3:
2487                         vcpu_load_rsp_rip(vcpu);
2488                         vcpu->regs[reg] = vcpu->cr3;
2489                         vcpu_put_rsp_rip(vcpu);
2490                         skip_emulated_instruction(vcpu);
2491                         print_func_exit();
2492                         return 1;
2493                 case 8:
2494                         printd("handle_cr: read CR8 "
2495                                "cpu erratum AA15\n");
2496                         vcpu_load_rsp_rip(vcpu);
2497                         vcpu->regs[reg] = vcpu->cr8;
2498                         vcpu_put_rsp_rip(vcpu);
2499                         skip_emulated_instruction(vcpu);
2500                         print_func_exit();
2501                         return 1;
2502                 }
2503                 break;
2504         case 3: /* lmsw */
2505                 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2506
2507                 skip_emulated_instruction(vcpu);
2508                 print_func_exit();
2509                 return 1;
2510         default:
2511                 break;
2512         }
2513         litevm_run->exit_reason = 0;
2514         printk("litevm: unhandled control register: op %d cr %d\n",
2515                (int)(exit_qualification >> 4) & 3, cr);
2516         print_func_exit();
2517         return 0;
2518 }
2519
2520 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2521 {
2522         print_func_entry();
2523         uint64_t exit_qualification;
2524         unsigned long val;
2525         int dr, reg;
2526
2527         /*
2528          * FIXME: this code assumes the host is debugging the guest.
2529          *        need to deal with guest debugging itself too.
2530          */
2531         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2532         dr = exit_qualification & 7;
2533         reg = (exit_qualification >> 8) & 15;
2534         vcpu_load_rsp_rip(vcpu);
2535         if (exit_qualification & 16) {
2536                 /* mov from dr */
2537                 switch (dr) {
2538                 case 6:
2539                         val = 0xffff0ff0;
2540                         break;
2541                 case 7:
2542                         val = 0x400;
2543                         break;
2544                 default:
2545                         val = 0;
2546                 }
2547                 vcpu->regs[reg] = val;
2548         } else {
2549                 /* mov to dr */
2550         }
2551         vcpu_put_rsp_rip(vcpu);
2552         skip_emulated_instruction(vcpu);
2553         print_func_exit();
2554         return 1;
2555 }
2556
2557 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2558 {
2559         print_func_entry();
2560         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2561         print_func_exit();
2562         return 0;
2563 }
2564
2565 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2566 {
2567         print_func_entry();
2568         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2569         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2570         uint64_t data;
2571
2572         if (guest_cpl() != 0) {
2573                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2574                 inject_gp(vcpu);
2575                 print_func_exit();
2576                 return 1;
2577         }
2578
2579         switch (ecx) {
2580         case MSR_FS_BASE:
2581                 data = vmcs_readl(GUEST_FS_BASE);
2582                 break;
2583         case MSR_GS_BASE:
2584                 data = vmcs_readl(GUEST_GS_BASE);
2585                 break;
2586         case MSR_IA32_SYSENTER_CS:
2587                 data = vmcs_read32(GUEST_SYSENTER_CS);
2588                 break;
2589         case MSR_IA32_SYSENTER_EIP:
2590                 data = vmcs_read32(GUEST_SYSENTER_EIP);
2591                 break;
2592         case MSR_IA32_SYSENTER_ESP:
2593                 data = vmcs_read32(GUEST_SYSENTER_ESP);
2594                 break;
2595         case MSR_IA32_MC0_CTL:
2596         case MSR_IA32_MCG_STATUS:
2597         case MSR_IA32_MCG_CAP:
2598         case MSR_IA32_MC0_MISC:
2599         case MSR_IA32_MC0_MISC+4:
2600         case MSR_IA32_MC0_MISC+8:
2601         case MSR_IA32_MC0_MISC+12:
2602         case MSR_IA32_MC0_MISC+16:
2603         case MSR_IA32_UCODE_REV:
2604                 /* MTRR registers */
2605         case 0xfe:
2606         case 0x200 ... 0x2ff:
2607                 data = 0;
2608                 break;
2609         case MSR_IA32_APICBASE:
2610                 data = vcpu->apic_base;
2611                 break;
2612         default:
2613                 if (msr) {
2614                         data = msr->data;
2615                         break;
2616                 }
2617                 printk("litevm: unhandled rdmsr: %x\n", ecx);
2618                 inject_gp(vcpu);
2619                 print_func_exit();
2620                 return 1;
2621         }
2622
2623         /* FIXME: handling of bits 32:63 of rax, rdx */
2624         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2625         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2626         skip_emulated_instruction(vcpu);
2627         print_func_exit();
2628         return 1;
2629 }
2630
2631 #ifdef __x86_64__
2632
2633 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2634 {
2635         print_func_entry();
2636         struct vmx_msr_entry *msr;
2637
2638         if (efer & EFER_RESERVED_BITS) {
2639                 printd("set_efer: 0x%llx #GP, reserved bits\n",
2640                        efer);
2641                 inject_gp(vcpu);
2642                 print_func_exit();
2643                 return;
2644         }
2645
2646         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2647                 printd("set_efer: #GP, change LME while paging\n");
2648                 inject_gp(vcpu);
2649                 print_func_exit();
2650                 return;
2651         }
2652
2653         efer &= ~EFER_LMA;
2654         efer |= vcpu->shadow_efer & EFER_LMA;
2655
2656         vcpu->shadow_efer = efer;
2657
2658         msr = find_msr_entry(vcpu, MSR_EFER);
2659
2660         if (!(efer & EFER_LMA))
2661             efer &= ~EFER_LME;
2662         msr->data = efer;
2663         skip_emulated_instruction(vcpu);
2664         print_func_exit();
2665 }
2666
2667 #endif
2668
2669 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2670
2671 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2672 {
2673         print_func_entry();
2674         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2675         struct vmx_msr_entry *msr;
2676         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2677                 | ((uint64_t)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2678
2679         if (guest_cpl() != 0) {
2680                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2681                 inject_gp(vcpu);
2682                 print_func_exit();
2683                 return 1;
2684         }
2685
2686         switch (ecx) {
2687         case MSR_FS_BASE:
2688                 vmcs_writel(GUEST_FS_BASE, data);
2689                 break;
2690         case MSR_GS_BASE:
2691                 vmcs_writel(GUEST_GS_BASE, data);
2692                 break;
2693         case MSR_IA32_SYSENTER_CS:
2694                 vmcs_write32(GUEST_SYSENTER_CS, data);
2695                 break;
2696         case MSR_IA32_SYSENTER_EIP:
2697                 vmcs_write32(GUEST_SYSENTER_EIP, data);
2698                 break;
2699         case MSR_IA32_SYSENTER_ESP:
2700                 vmcs_write32(GUEST_SYSENTER_ESP, data);
2701                 break;
2702         case MSR_EFER:
2703                 set_efer(vcpu, data);
2704                 print_func_exit();
2705                 return 1;
2706         case MSR_IA32_MC0_STATUS:
2707                 printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
2708                             , __FUNCTION__, data);
2709                 break;
2710         case MSR_IA32_TIME_STAMP_COUNTER: {
2711                 uint64_t tsc;
2712                 
2713                 tsc = read_tsc();
2714                 vmcs_write64(TSC_OFFSET, data - tsc);
2715                 break;
2716         }
2717         case MSR_IA32_UCODE_REV:
2718         case MSR_IA32_UCODE_WRITE:
2719         case 0x200 ... 0x2ff: /* MTRRs */
2720                 break;
2721         case MSR_IA32_APICBASE:
2722                 vcpu->apic_base = data;
2723                 break;
2724         default:
2725                 msr = find_msr_entry(vcpu, ecx);
2726                 if (msr) {
2727                         msr->data = data;
2728                         break;
2729                 }
2730                 printk("litevm: unhandled wrmsr: %x\n", ecx);
2731                 inject_gp(vcpu);
2732                 print_func_exit();
2733                 return 1;
2734         }
2735         skip_emulated_instruction(vcpu);
2736         print_func_exit();
2737         return 1;
2738 }
2739
2740 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2741                                    struct litevm_run *litevm_run)
2742 {
2743         print_func_entry();
2744         /* Turn off interrupt window reporting. */
2745         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2746                      vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2747                      & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2748         print_func_exit();
2749         return 1;
2750 }
2751
2752 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2753 {
2754         print_func_entry();
2755         skip_emulated_instruction(vcpu);
2756         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2757                 print_func_exit();
2758                 return 1;
2759         }
2760
2761         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2762         print_func_exit();
2763         return 0;
2764 }
2765
2766 /*
2767  * The exit handlers return 1 if the exit was handled fully and guest execution
2768  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2769  * to be done to userspace and return 0.
2770  */
2771 static int (*litevm_vmx_exit_handlers[])(struct litevm_vcpu *vcpu,
2772                                       struct litevm_run *litevm_run) = {
2773         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
2774         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
2775         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
2776         [EXIT_REASON_INVLPG]                  = handle_invlpg,
2777         [EXIT_REASON_CR_ACCESS]               = handle_cr,
2778         [EXIT_REASON_DR_ACCESS]               = handle_dr,
2779         [EXIT_REASON_CPUID]                   = handle_cpuid,
2780         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
2781         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
2782         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
2783         [EXIT_REASON_HLT]                     = handle_halt,
2784 };
2785
2786 static const int litevm_vmx_max_exit_handlers =
2787         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2788
2789 /*
2790  * The guest has exited.  See if we can fix it or if we need userspace
2791  * assistance.
2792  */
2793 static int litevm_handle_exit(struct litevm_run *litevm_run, struct litevm_vcpu *vcpu)
2794 {
2795         print_func_entry();
2796         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2797         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2798
2799         if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
2800                                 exit_reason != EXIT_REASON_EXCEPTION_NMI )
2801                 printk("%s: unexpected, valid vectoring info and "
2802                        "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2803         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2804         if (exit_reason < litevm_vmx_max_exit_handlers
2805             && litevm_vmx_exit_handlers[exit_reason]) {
2806                 print_func_exit();
2807                 return litevm_vmx_exit_handlers[exit_reason](vcpu, litevm_run);
2808         }
2809         else {
2810                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2811                 litevm_run->hw.hardware_exit_reason = exit_reason;
2812         }
2813         print_func_exit();
2814         return 0;
2815 }
2816
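/*
 * Deliver an interrupt to a real-mode guest by hand: look up the 4-byte IVT
 * entry for @irq (offset:segment), push FLAGS, CS and IP on the guest stack
 * as the hardware would, clear IF/TF/AC, and point CS:IP at the handler.
 */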
2817 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2818 {
2819         print_func_entry();
2820         uint16_t ent[2];
2821         uint16_t cs;
2822         uint16_t ip;
2823         unsigned long flags;
2824         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2825         uint16_t sp =  vmcs_readl(GUEST_RSP);
2826         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2827
2828         if (sp > ss_limit || ((sp - 6) > sp)) {
2829                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2830                             __FUNCTION__,
2831                             vmcs_readl(GUEST_RSP),
2832                             vmcs_readl(GUEST_SS_BASE),
2833                             vmcs_read32(GUEST_SS_LIMIT));
2834                 print_func_exit();
2835                 return;
2836         }
2837
2838         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2839                                                                 sizeof(ent)) {
2840                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2841                 print_func_exit();
2842                 return;
2843         }
2844
2845         flags =  vmcs_readl(GUEST_RFLAGS);
2846         cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
2847         ip =  vmcs_readl(GUEST_RIP);
2848
2849
2850         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2851             litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2852             litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2853                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2854                 print_func_exit();
2855                 return;
2856         }
2857
2858         vmcs_writel(GUEST_RFLAGS, flags &
2859                     ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2860         vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
2861         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2862         vmcs_writel(GUEST_RIP, ent[0]);
2863         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2864         print_func_exit();
2865 }
2866
2867 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2868 {
2869         print_func_entry();
2870         int word_index = __ffs(vcpu->irq_summary);
2871         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2872         int irq = word_index * BITS_PER_LONG + bit_index;
2873
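        /*
         * irq_pending is two-level: each bit of irq_summary says whether the
         * corresponding word of irq_pending[] has any bits set, so two
         * __ffs() calls find the lowest pending IRQ.
         */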
2874         /* We don't have clear_bit, and I'm not sure the akaros
2875          * bitops are really going to work here, so clear by hand.
2876          */
2877         vcpu->irq_pending[word_index] &= ~(1 << bit_index);
2878         if (!vcpu->irq_pending[word_index])
2879                 vcpu->irq_summary &= ~ (1 << word_index);
2880
2881         if (vcpu->rmode.active) {
2882                 inject_rmode_irq(vcpu, irq);
2883                 print_func_exit();
2884                 return;
2885         }
2886         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2887                         irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2888         print_func_exit();
2889 }
2890
2891 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2892 {
2893         print_func_entry();
2894         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2895             && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2896                 /*
2897                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2898                  */
2899                 litevm_do_inject_irq(vcpu);
2900         else
2901                 /*
2902                  * Interrupts blocked.  Wait for unblock.
2903                  */
2904                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2905                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2906                              | CPU_BASED_VIRTUAL_INTR_PENDING);
2907         print_func_exit();
2908 }
2909
2910 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2911 {
2912         print_func_entry();
2913         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2914
2915 #warning "no debugging guests yet"
2916         assert(0);
2917 /*
2918         set_debugreg(dbg->bp[0], 0);
2919         set_debugreg(dbg->bp[1], 1);
2920         set_debugreg(dbg->bp[2], 2);
2921         set_debugreg(dbg->bp[3], 3);
2922 */
2923         if (dbg->singlestep) {
2924                 unsigned long flags;
2925
2926                 flags = vmcs_readl(GUEST_RFLAGS);
2927                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2928                 vmcs_writel(GUEST_RFLAGS, flags);
2929         }
2930         print_func_exit();
2931 }
2932
2933 static void load_msrs(struct vmx_msr_entry *e, int n)
2934 {
2935         print_func_entry();
2936         int i;
2937
2938         for (i = 0; i < n; ++i)
2939                 write_msr(e[i].index, e[i].data);
2940         print_func_exit();
2941 }
2942
2943 static void save_msrs(struct vmx_msr_entry *e, int n)
2944 {
2945         print_func_entry();
2946         int i;
2947
2948         for (i = 0; i < n; ++i)
2949                 e[i].data = read_msr(e[i].index);
2950         print_func_exit();
2951 }
2952
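/*
 * Run the given vcpu until it needs userspace attention.  Exits the kernel
 * can handle completely (see litevm_vmx_exit_handlers) loop straight back
 * into the guest; anything else is reported through litevm_run.
 */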
2953 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2954 {
2955         print_func_entry();
2956         struct litevm_vcpu *vcpu;
2957         uint8_t fail;
2958         uint16_t fs_sel, gs_sel, ldt_sel;
2959         int fs_gs_ldt_reload_needed;
2960
2961         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2962                 error("vcpu is %d but must be in the range 0..%d\n",
2963                       litevm_run->vcpu, LITEVM_MAX_VCPUS - 1);
2964
2965         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2966         if (!vcpu)
2967                 error("vcpu_load failed");
2968
2969         if (litevm_run->emulated) {
2970                 skip_emulated_instruction(vcpu);
2971                 litevm_run->emulated = 0;
2972         }
2973
2974         if (litevm_run->mmio_completed) {
2975                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
2976                 vcpu->mmio_read_completed = 1;
2977         }
2978
2979         vcpu->mmio_needed = 0;
2980
2981 again:
2982         /*
2983          * Set host fs and gs selectors.  Unfortunately, the VMCS host-state
2984          * checks (Intel SDM sec. 22.2.3) do not allow segment selectors with
2985          * cpl > 0 or ti == 1.
2986          */
2986         fs_sel = read_fs();
2987         gs_sel = read_gs();
2988         ldt_sel = read_ldt();
2989         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
2990         if (!fs_gs_ldt_reload_needed) {
2991                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2992                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2993         } else {
2994                 vmcs_write16(HOST_FS_SELECTOR, 0);
2995                 vmcs_write16(HOST_GS_SELECTOR, 0);
2996         }
2997
2998 #ifdef __x86_64__
2999         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
3000         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
3001 #endif
3002
3003         if (vcpu->irq_summary &&
3004             !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
3005                 litevm_try_inject_irq(vcpu);
3006
3007         if (vcpu->guest_debug.enabled)
3008                 litevm_guest_debug_pre(vcpu);
3009
3010         fx_save(vcpu->host_fx_image);
3011         fx_restore(vcpu->guest_fx_image);
3012
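        /*
         * Swap MSR state by hand around the entry: save the host values for
         * all of the vcpu's MSRs, then load the guest copies of the first
         * NR_BAD_MSRS (the ones the VT entry/exit mechanism does not switch
         * for us).  The inverse happens right after the exit below.
         */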
3013         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
3014         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3015
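        /*
         * The inline asm below is the world switch: push the host GPRs and
         * flags, record the host stack pointer in the VMCS (HOST_RSP), load
         * the guest GPRs and CR2 from the vcpu, then VMLAUNCH on the first
         * run or VMRESUME afterwards.  On VM exit, execution resumes at
         * litevm_vmx_return, where the guest registers and CR2 are written
         * back into the vcpu and the host registers are popped; "setbe"
         * records a failed VM entry in 'fail'.
         */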
3016         asm (
3017                 /* Store host registers */
3018                 "pushf \n\t"
3019 #ifdef __x86_64__
3020                 "push %%rax; push %%rbx; push %%rdx;"
3021                 "push %%rsi; push %%rdi; push %%rbp;"
3022                 "push %%r8;  push %%r9;  push %%r10; push %%r11;"
3023                 "push %%r12; push %%r13; push %%r14; push %%r15;"
3024                 "push %%rcx \n\t"
3025                 "vmwrite %%rsp, %2 \n\t"
3026 #else
3027                 "pusha; push %%ecx \n\t"
3028                 "vmwrite %%esp, %2 \n\t"
3029 #endif
3030                 /* Check if vmlaunch or vmresume is needed */
3031                 "cmp $0, %1 \n\t"
3032                 /* Load guest registers.  Don't clobber flags. */
3033 #ifdef __x86_64__
3034                 "mov %c[cr2](%3), %%rax \n\t"
3035                 "mov %%rax, %%cr2 \n\t"
3036                 "mov %c[rax](%3), %%rax \n\t"
3037                 "mov %c[rbx](%3), %%rbx \n\t"
3038                 "mov %c[rdx](%3), %%rdx \n\t"
3039                 "mov %c[rsi](%3), %%rsi \n\t"
3040                 "mov %c[rdi](%3), %%rdi \n\t"
3041                 "mov %c[rbp](%3), %%rbp \n\t"
3042                 "mov %c[r8](%3),  %%r8  \n\t"
3043                 "mov %c[r9](%3),  %%r9  \n\t"
3044                 "mov %c[r10](%3), %%r10 \n\t"
3045                 "mov %c[r11](%3), %%r11 \n\t"
3046                 "mov %c[r12](%3), %%r12 \n\t"
3047                 "mov %c[r13](%3), %%r13 \n\t"
3048                 "mov %c[r14](%3), %%r14 \n\t"
3049                 "mov %c[r15](%3), %%r15 \n\t"
3050                 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
3051 #else
3052                 "mov %c[cr2](%3), %%eax \n\t"
3053                 "mov %%eax,   %%cr2 \n\t"
3054                 "mov %c[rax](%3), %%eax \n\t"
3055                 "mov %c[rbx](%3), %%ebx \n\t"
3056                 "mov %c[rdx](%3), %%edx \n\t"
3057                 "mov %c[rsi](%3), %%esi \n\t"
3058                 "mov %c[rdi](%3), %%edi \n\t"
3059                 "mov %c[rbp](%3), %%ebp \n\t"
3060                 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
3061 #endif
3062                 /* Enter guest mode */
3063                 "jne launched \n\t"
3064                 "vmlaunch \n\t"
3065                 "jmp litevm_vmx_return \n\t"
3066                 "launched: vmresume \n\t"
3067                 ".globl litevm_vmx_return \n\t"
3068                 "litevm_vmx_return: "
3069                 /* Save guest registers, load host registers, keep flags */
3070 #ifdef __x86_64__
3071                 "xchg %3,     0(%%rsp) \n\t"
3072                 "mov %%rax, %c[rax](%3) \n\t"
3073                 "mov %%rbx, %c[rbx](%3) \n\t"
3074                 "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3075                 "mov %%rdx, %c[rdx](%3) \n\t"
3076                 "mov %%rsi, %c[rsi](%3) \n\t"
3077                 "mov %%rdi, %c[rdi](%3) \n\t"
3078                 "mov %%rbp, %c[rbp](%3) \n\t"
3079                 "mov %%r8,  %c[r8](%3) \n\t"
3080                 "mov %%r9,  %c[r9](%3) \n\t"
3081                 "mov %%r10, %c[r10](%3) \n\t"
3082                 "mov %%r11, %c[r11](%3) \n\t"
3083                 "mov %%r12, %c[r12](%3) \n\t"
3084                 "mov %%r13, %c[r13](%3) \n\t"
3085                 "mov %%r14, %c[r14](%3) \n\t"
3086                 "mov %%r15, %c[r15](%3) \n\t"
3087                 "mov %%cr2, %%rax   \n\t"
3088                 "mov %%rax, %c[cr2](%3) \n\t"
3089                 "mov 0(%%rsp), %3 \n\t"
3090
3091                 "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3092                 "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3093                 "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3094                 "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3095 #else
3096                 "xchg %3, 0(%%esp) \n\t"
3097                 "mov %%eax, %c[rax](%3) \n\t"
3098                 "mov %%ebx, %c[rbx](%3) \n\t"
3099                 "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3100                 "mov %%edx, %c[rdx](%3) \n\t"
3101                 "mov %%esi, %c[rsi](%3) \n\t"
3102                 "mov %%edi, %c[rdi](%3) \n\t"
3103                 "mov %%ebp, %c[rbp](%3) \n\t"
3104                 "mov %%cr2, %%eax  \n\t"
3105                 "mov %%eax, %c[cr2](%3) \n\t"
3106                 "mov 0(%%esp), %3 \n\t"
3107
3108                 "pop %%ecx; popa \n\t"
3109 #endif
3110                 "setbe %0 \n\t"
3111                 "popf \n\t"
3112               : "=g" (fail)
3113               : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3114                 "c"(vcpu),
3115                 [rax]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3116                 [rbx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3117                 [rcx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3118                 [rdx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3119                 [rsi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3120                 [rdi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3121                 [rbp]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3122 #ifdef __x86_64__
3123                 [r8 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8 ])),
3124                 [r9 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9 ])),
3125                 [r10]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3126                 [r11]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3127                 [r12]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3128                 [r13]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3129                 [r14]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3130                 [r15]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3131 #endif
3132                 [cr2]"i"(offsetof(struct litevm_vcpu, cr2))
3133               : "cc", "memory" );
3134
3135         ++litevm_stat.exits;
3136         printk("vm_run exits\n");
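        /*
         * Only the first NR_BAD_MSRS guest/host MSRs are switched by hand
         * here; presumably the remaining entries are restored through the
         * VMCS exit controls elsewhere.
         */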
3137         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3138         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3139
3140         fx_save(vcpu->guest_fx_image);
3141         fx_restore(vcpu->host_fx_image);
3142
3143 #ifndef __x86_64__
3144         asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3145 #endif
3146
3147         litevm_run->exit_type = 0;
3148         if (fail) {
3149                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3150                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3151         } else {
3152                 if (fs_gs_ldt_reload_needed) {
3153                         load_ldt(ldt_sel);
3154                         load_fs(fs_sel);
3155                         /*
3156                          * If we have to reload gs, we must take care to
3157                          * preserve our gs base.
3158                          */
3159                         disable_irq();
3160                         load_gs(gs_sel);
3161 #ifdef __x86_64__
3162                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3163 #endif
3164                         enable_irq();
3165
3166                         reload_tss();
3167                 }
3168                 vcpu->launched = 1;
3169                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
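                /*
                 * litevm_handle_exit() returning nonzero means the exit was
                 * handled entirely in the kernel: drop the vcpu, give the
                 * scheduler a turn, then reload and re-enter via 'again'.
                 * A zero return falls through so the caller can inspect
                 * litevm_run.
                 */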
3170                 if (litevm_handle_exit(litevm_run, vcpu)) {
3171                         /* Give the scheduler a chance to reschedule. */
3172                         vcpu_put(vcpu);
3173 #warning "how to tell if signal is pending"
3174 /*
3175                         if (signal_pending(current)) {
3176                                 ++litevm_stat.signal_exits;
3177                                 return -EINTR;
3178                         }
3179 */
3180                         kthread_yield();
3181                         /* Cannot fail -  no vcpu unplug yet. */
3182                         vcpu_load(litevm, vcpu_slot(vcpu));
3183                         goto again;
3184                 }
3185         }
3186
3187         vcpu_put(vcpu);
3188         printk("vm_run returns\n");
3189         print_func_exit();
3190         return 0;
3191 }
3192
3193 static int litevm_dev_ioctl_get_regs(struct litevm *litevm, struct litevm_regs *regs)
3194 {
3195         print_func_entry();
3196         struct litevm_vcpu *vcpu;
3197
3198         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3199                 print_func_exit();
3200                 return -EINVAL;
3201         }
3202
3203         vcpu = vcpu_load(litevm, regs->vcpu);
3204         if (!vcpu) {
3205                 print_func_exit();
3206                 return -ENOENT;
3207         }
3208
3209         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3210         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3211         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3212         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3213         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3214         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3215         regs->rsp = vmcs_readl(GUEST_RSP);
3216         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3217 #ifdef __x86_64__
3218         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3219         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3220         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3221         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3222         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3223         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3224         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3225         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3226 #endif
3227
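        /* RIP, RSP and RFLAGS live in the VMCS rather than the regs[] array,
         * hence the vmcs_readl() calls. */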
3228         regs->rip = vmcs_readl(GUEST_RIP);
3229         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3230
3231         /*
3232          * Don't leak debug flags in case they were set for guest debugging
3233          */
3234         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3235                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3236
3237         vcpu_put(vcpu);
3238
3239         print_func_exit();
3240         return 0;
3241 }
3242
3243 static int litevm_dev_ioctl_set_regs(struct litevm *litevm, struct litevm_regs *regs)
3244 {
3245         print_func_entry();
3246         struct litevm_vcpu *vcpu;
3247
3248         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3249                 print_func_exit();
3250                 return -EINVAL;
3251         }
3252
3253         vcpu = vcpu_load(litevm, regs->vcpu);
3254         if (!vcpu) {
3255                 print_func_exit();
3256                 return -ENOENT;
3257         }
3258
3259         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3260         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3261         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3262         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3263         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3264         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3265         vmcs_writel(GUEST_RSP, regs->rsp);
3266         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3267 #ifdef __x86_64__
3268         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3269         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3270         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3271         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3272         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3273         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3274         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3275         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3276 #endif
3277
3278         vmcs_writel(GUEST_RIP, regs->rip);
3279         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3280
3281         vcpu_put(vcpu);
3282
3283         print_func_exit();
3284         return 0;
3285 }
3286
3287 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
3288 {
3289         print_func_entry();
3290         struct litevm_vcpu *vcpu;
3291
3292         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3293                 print_func_exit();
3294                 return -EINVAL;
3295         }
3296         vcpu = vcpu_load(litevm, sregs->vcpu);
3297         if (!vcpu) {
3298                 print_func_exit();
3299                 return -ENOENT;
3300         }
3301
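/*
 * Unpack the VMCS access-rights word for each segment: type in bits 3:0,
 * S bit 4, DPL bits 6:5, P bit 7, AVL bit 12, L bit 13, D/B bit 14,
 * G bit 15, and the "unusable" flag in bit 16.
 */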
3302 #define get_segment(var, seg) \
3303         do { \
3304                 uint32_t ar; \
3305                 \
3306                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3307                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3308                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3309                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3310                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3311                 sregs->var.type = ar & 15; \
3312                 sregs->var.s = (ar >> 4) & 1; \
3313                 sregs->var.dpl = (ar >> 5) & 3; \
3314                 sregs->var.present = (ar >> 7) & 1; \
3315                 sregs->var.avl = (ar >> 12) & 1; \
3316                 sregs->var.l = (ar >> 13) & 1; \
3317                 sregs->var.db = (ar >> 14) & 1; \
3318                 sregs->var.g = (ar >> 15) & 1; \
3319                 sregs->var.unusable = (ar >> 16) & 1; \
3320         } while (0)
3321
3322         get_segment(cs, CS);
3323         get_segment(ds, DS);
3324         get_segment(es, ES);
3325         get_segment(fs, FS);
3326         get_segment(gs, GS);
3327         get_segment(ss, SS);
3328
3329         get_segment(tr, TR);
3330         get_segment(ldt, LDTR);
3331 #undef get_segment
3332
3333 #define get_dtable(var, table) \
3334         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3335                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3336
3337         get_dtable(idt, IDTR);
3338         get_dtable(gdt, GDTR);
3339 #undef get_dtable
3340
3341         sregs->cr0 = guest_cr0();
3342         sregs->cr2 = vcpu->cr2;
3343         sregs->cr3 = vcpu->cr3;
3344         sregs->cr4 = guest_cr4();
3345         sregs->cr8 = vcpu->cr8;
3346         sregs->efer = vcpu->shadow_efer;
3347         sregs->apic_base = vcpu->apic_base;
3348
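        /* Any nonzero irq_summary word means at least one interrupt is
         * queued in irq_pending, so report it as a pending interrupt. */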
3349         sregs->pending_int = vcpu->irq_summary != 0;
3350
3351         vcpu_put(vcpu);
3352
3353         print_func_exit();
3354         return 0;
3355 }
3356
3357 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
3358 {
3359         print_func_entry();
3360         struct litevm_vcpu *vcpu;
3361         int mmu_reset_needed = 0;
3362
3363         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3364                 print_func_exit();
3365                 return -EINVAL;
3366         }
3367         vcpu = vcpu_load(litevm, sregs->vcpu);
3368         if (!vcpu) {
3369                 print_func_exit();
3370                 return -ENOENT;
3371         }
3372
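/*
 * Rebuild the VMCS access-rights word from the split fields; an unusable
 * segment is encoded with only the unusable bit (bit 16) set.
 */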
3373 #define set_segment(var, seg) \
3374         do { \
3375                 uint32_t ar; \
3376                 \
3377                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
3378                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
3379                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
3380                 if (sregs->var.unusable) { \
3381                         ar = (1 << 16); \
3382                 } else { \
3383                         ar = (sregs->var.type & 15); \
3384                         ar |= (sregs->var.s & 1) << 4; \
3385                         ar |= (sregs->var.dpl & 3) << 5; \
3386                         ar |= (sregs->var.present & 1) << 7; \
3387                         ar |= (sregs->var.avl & 1) << 12; \
3388                         ar |= (sregs->var.l & 1) << 13; \
3389                         ar |= (sregs->var.db & 1) << 14; \
3390                         ar |= (sregs->var.g & 1) << 15; \
3391                 } \
3392                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
3393         } while (0)
3394
3395         set_segment(cs, CS);
3396         set_segment(ds, DS);
3397         set_segment(es, ES);
3398         set_segment(fs, FS);
3399         set_segment(gs, GS);
3400         set_segment(ss, SS);
3401
3402         set_segment(tr, TR);
3403
3404         set_segment(ldt, LDTR);
3405 #undef set_segment
3406
3407 #define set_dtable(var, table) \
3408         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
3409         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
3410
3411         set_dtable(idt, IDTR);
3412         set_dtable(gdt, GDTR);
3413 #undef set_dtable
3414
3415         vcpu->cr2 = sregs->cr2;
3416         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
3417         vcpu->cr3 = sregs->cr3;
3418
3419         vcpu->cr8 = sregs->cr8;
3420
3421         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3422 #ifdef __x86_64__
3423         __set_efer(vcpu, sregs->efer);
3424 #endif
3425         vcpu->apic_base = sregs->apic_base;
3426
3427         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3428         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3429         update_exception_bitmap(vcpu);
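        /*
         * The guest reads CR0 through the read shadow, so it sees exactly
         * what was requested, while the real GUEST_CR0 keeps the
         * LITEVM_VM_CR0_ALWAYS_ON bits that VMX requires set.
         */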
3430         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3431         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3432
3433         mmu_reset_needed |= guest_cr4() != sregs->cr4;
3434         __set_cr4(vcpu, sregs->cr4);
3435
3436         if (mmu_reset_needed)
3437                 litevm_mmu_reset_context(vcpu);
3438         vcpu_put(vcpu);
3439
3440         print_func_exit();
3441         return 0;
3442 }
3443
3444 /*
3445  * Translate a guest virtual address to a guest physical address.
3446  */
3447 static int litevm_dev_ioctl_translate(struct litevm *litevm, struct litevm_translation *tr)
3448 {
3449         print_func_entry();
3450         unsigned long vaddr = tr->linear_address;
3451         struct litevm_vcpu *vcpu;
3452         gpa_t gpa;
3453
3454         vcpu = vcpu_load(litevm, tr->vcpu);
3455         if (!vcpu) {
3456                 print_func_exit();
3457                 return -ENOENT;
3458         }
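        /* Walk the guest page tables under the VM lock so the MMU state
         * cannot change underneath the translation. */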
3459         spin_lock_irqsave(&litevm->lock);
3460         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3461         tr->physical_address = gpa;
3462         tr->valid = gpa != UNMAPPED_GVA;
3463         tr->writeable = 1;
3464         tr->usermode = 0;
3465         spin_unlock_irqsave(&litevm->lock);
3466         vcpu_put(vcpu);
3467
3468         print_func_exit();
3469         return 0;
3470 }
3471
3472 #if 0
3473 static int litevm_dev_ioctl_interrupt(struct litevm *litevm, struct litevm_interrupt *irq)
3474 {
3475         struct litevm_vcpu *vcpu;
3476
3477         if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
3478                 return -EINVAL;
3479         if (irq->irq < 0 || irq->irq >= 256)
3480                 return -EINVAL;
3481         vcpu = vcpu_load(litevm, irq->vcpu);
3482         if (!vcpu)
3483                 return -ENOENT;
3484
3485         set_bit(irq->irq, vcpu->irq_pending);
3486         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
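        /* Two-level bookkeeping: irq_pending is the per-vector bitmap and
         * each irq_summary bit marks a word of it that has something set. */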
3487
3488         vcpu_put(vcpu);
3489
3490         return 0;
3491 }