Fix a simple bug in vmx support
[akaros.git] / kern / arch / x86 / vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 #define currentcpu (&per_cpu_info[core_id()])
39
40 struct litevm_stat litevm_stat;
41
42 static struct litevm_stats_debugfs_item {
43         const char *name;
44         uint32_t *data;
45 } debugfs_entries[] = {
46         { "pf_fixed", &litevm_stat.pf_fixed },
47         { "pf_guest", &litevm_stat.pf_guest },
48         { "tlb_flush", &litevm_stat.tlb_flush },
49         { "invlpg", &litevm_stat.invlpg },
50         { "exits", &litevm_stat.exits },
51         { "io_exits", &litevm_stat.io_exits },
52         { "mmio_exits", &litevm_stat.mmio_exits },
53         { "signal_exits", &litevm_stat.signal_exits },
54         { "irq_exits", &litevm_stat.irq_exits },
55         { 0, 0 }
56 };
58
59 static struct dentry *debugfs_dir;
60
61 static const uint32_t vmx_msr_index[] = {
62 #ifdef __x86_64__
63         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
64 #endif
65         MSR_EFER,       /* TODO: original KVM also saved MSR_K6_STAR here */
66 };
67
68 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
69
70 #ifdef __x86_64__
71 /*
72  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
73  * save/load mechanism (CPU erratum AA24).
74  */
75 #define NR_BAD_MSRS 2
76 #else
77 #define NR_BAD_MSRS 0
78 #endif
79
80 #define TSS_IOPB_BASE_OFFSET 0x66
81 #define TSS_BASE_SIZE 0x68
82 #define TSS_IOPB_SIZE (65536 / 8)
83 #define TSS_REDIRECTION_SIZE (256 / 8)
84 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
85
86 #define MSR_IA32_VMX_BASIC_MSR                  0x480
87 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
88 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
89 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
90 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
91
92 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
93 #define LMSW_GUEST_MASK 0x0eULL
94 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
95 //#define CR4_VMXE 0x2000
96 #define CR8_RESEVED_BITS (~0x0fULL)
97 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
98
99 #ifdef __x86_64__
100 #define HOST_IS_64 1
101 #else
102 #define HOST_IS_64 0
103 #endif
104
105 /* Bit ops aren't yet widely used in Akaros and we're not sure where they should live, so define what we need here. */
106 /**
107  * __ffs - find first set bit in word
108  * @word: The word to search
109  *
110  * Undefined if no bit exists, so code should check against 0 first.
111  */
112 static inline unsigned long __ffs(unsigned long word)
113 {
114         print_func_entry();
115 asm("rep; bsf %1,%0":"=r"(word)
116 :               "rm"(word));
117         print_func_exit();
118         return word;
119 }
120
121 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu,
122                                                                                         uint32_t msr)
123 {
124         print_func_entry();
125         int i;
126
127         for (i = 0; i < vcpu->nmsrs; ++i)
128                 if (vcpu->guest_msrs[i].index == msr) {
129                         print_func_exit();
130                         return &vcpu->guest_msrs[i];
131                 }
132         print_func_exit();
133         return 0;
134 }
135
136 struct descriptor_table {
137         uint16_t limit;
138         unsigned long base;
139 } __attribute__ ((packed));
140
141 static void get_gdt(struct descriptor_table *table)
142 {
143         print_func_entry();
144 asm("sgdt %0":"=m"(*table));
145         print_func_exit();
146 }
147
148 static void get_idt(struct descriptor_table *table)
149 {
150         print_func_entry();
151 asm("sidt %0":"=m"(*table));
152         print_func_exit();
153 }
154
155 static uint16_t read_fs(void)
156 {
157         print_func_entry();
158         uint16_t seg;
159 asm("mov %%fs, %0":"=g"(seg));
160         print_func_exit();
161         return seg;
162 }
163
164 static uint16_t read_gs(void)
165 {
166         print_func_entry();
167         uint16_t seg;
168 asm("mov %%gs, %0":"=g"(seg));
169         print_func_exit();
170         return seg;
171 }
172
173 static uint16_t read_ldt(void)
174 {
175         print_func_entry();
176         uint16_t ldt;
177 asm("sldt %0":"=g"(ldt));
178         print_func_exit();
179         return ldt;
180 }
181
182 static void load_fs(uint16_t sel)
183 {
184         print_func_entry();
185 asm("mov %0, %%fs": :"g"(sel));
186         print_func_exit();
187 }
188
189 static void load_gs(uint16_t sel)
190 {
191         print_func_entry();
192 asm("mov %0, %%gs": :"g"(sel));
193         print_func_exit();
194 }
195
196 #ifndef load_ldt
197 static void load_ldt(uint16_t sel)
198 {
199         print_func_entry();
200 asm("lldt %0": :"g"(sel));
201         print_func_exit();
202 }
203 #endif
204
205 static void fx_save(void *image)
206 {
207         print_func_entry();
208         asm("fxsave (%0)"::"r"(image));
209         print_func_exit();
210 }
211
212 static void fx_restore(void *image)
213 {
214         print_func_entry();
215         asm("fxrstor (%0)"::"r"(image));
216         print_func_exit();
217 }
218
219 static void fpu_init(void)
220 {
221         print_func_entry();
222         asm("finit");
223         print_func_exit();
224 }
225
226 struct segment_descriptor {
227         uint16_t limit_low;
228         uint16_t base_low;
229         uint8_t base_mid;
230         uint8_t type:4;
231         uint8_t system:1;
232         uint8_t dpl:2;
233         uint8_t present:1;
234         uint8_t limit_high:4;
235         uint8_t avl:1;
236         uint8_t long_mode:1;
237         uint8_t default_op:1;
238         uint8_t granularity:1;
239         uint8_t base_high;
240 } __attribute__ ((packed));
241
242 #ifdef __x86_64__
243 // LDT or TSS descriptor in the GDT. 16 bytes.
244 struct segment_descriptor_64 {
245         struct segment_descriptor s;
246         uint32_t base_higher;
247         uint32_t pad_zero;
248 };
249
250 #endif
251
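/*
 * Return the base address of the segment named by @selector, read from the
 * GDT (or from the current LDT when the selector's TI bit is set).
 */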
252 static unsigned long segment_base(uint16_t selector)
253 {
254         print_func_entry();
255         struct descriptor_table gdt;
256         struct segment_descriptor *d;
257         unsigned long table_base;
258         typedef unsigned long ul;
259         unsigned long v;
260
261 asm("sgdt %0":"=m"(gdt));
262         table_base = gdt.base;
263
264         if (selector & 4) {     /* from ldt */
265                 uint16_t ldt_selector;
266
267 asm("sldt %0":"=g"(ldt_selector));
268                 table_base = segment_base(ldt_selector);
269         }
270         d = (struct segment_descriptor *)(table_base + (selector & ~7));
271         v = d->base_low | ((ul) d->base_mid << 16) | ((ul) d->base_high << 24);
272 #ifdef __x86_64__
273         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
274                 v |= ((ul) ((struct segment_descriptor_64 *)d)->base_higher) << 32;
275 #endif
276         print_func_exit();
277         return v;
278 }
279
280 static unsigned long read_tr_base(void)
281 {
282         print_func_entry();
283         uint16_t tr;
284 asm("str %0":"=g"(tr));
285         print_func_exit();
286         return segment_base(tr);
287 }
288
289 static void reload_tss(void)
290 {
291         print_func_entry();
292 #ifndef __x86_64__
293
294         /*
295          * VT restores TR but not its size.  Useless.
296          */
297         struct descriptor_table gdt;
298         struct segment_descriptor *descs;
299
300         get_gdt(&gdt);
301         descs = (void *)gdt.base;
302         descs[GDT_ENTRY_TSS].type = 9;  /* available TSS */
303         load_TR_desc();
304 #endif
305         print_func_exit();
306 }
307
308 static struct vmcs_descriptor {
309         int size;
310         int order;
311         uint32_t revision_id;
312 } vmcs_descriptor;
313
314 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
315 {
316         print_func_entry();
317         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
318         print_func_exit();
319         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
320 }
321
322 int litevm_read_guest(struct litevm_vcpu *vcpu,
323                                           gva_t addr, unsigned long size, void *dest)
324 {
325         print_func_entry();
326         unsigned char *host_buf = dest;
327         unsigned long req_size = size;
328
329         while (size) {
330                 hpa_t paddr;
331                 unsigned now;
332                 unsigned offset;
333                 hva_t guest_buf;
334
335                 paddr = gva_to_hpa(vcpu, addr);
336
337                 if (is_error_hpa(paddr))
338                         break;
339                 guest_buf = (hva_t) KADDR(paddr);
340                 offset = addr & ~PAGE_MASK;
341                 guest_buf |= offset;
342                 now = MIN(size, PAGE_SIZE - offset);
343                 memcpy(host_buf, (void *)guest_buf, now);
344                 host_buf += now;
345                 addr += now;
346                 size -= now;
347         }
348         print_func_exit();
349         return req_size - size;
350 }
351
352 int litevm_write_guest(struct litevm_vcpu *vcpu,
353                                            gva_t addr, unsigned long size, void *data)
354 {
355         print_func_entry();
356         unsigned char *host_buf = data;
357         unsigned long req_size = size;
358
359         while (size) {
360                 hpa_t paddr;
361                 unsigned now;
362                 unsigned offset;
363                 hva_t guest_buf;
364
365                 paddr = gva_to_hpa(vcpu, addr);
366
367                 if (is_error_hpa(paddr))
368                         break;
369
370                 guest_buf = (hva_t) KADDR(paddr);
371                 offset = addr & ~PAGE_MASK;
372                 guest_buf |= offset;
373                 now = MIN(size, PAGE_SIZE - offset);
374                 memcpy((void *)guest_buf, host_buf, now);
375                 host_buf += now;
376                 addr += now;
377                 size -= now;
378         }
379         print_func_exit();
380         return req_size - size;
381 }
382
383 static void setup_vmcs_descriptor(void)
384 {
385         print_func_entry();
386         uint64_t msr;
387
388         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
389         vmcs_descriptor.size = (msr >> 32) & 0x1fff;
390         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size >> PAGE_SHIFT);
391         vmcs_descriptor.revision_id = (uint32_t) msr;
392         printk("setup_vmcs_descriptor: msr 0x%x, size 0x%x order 0x%x id 0x%x\n",
393                    msr, vmcs_descriptor.size, vmcs_descriptor.order,
394                    vmcs_descriptor.revision_id);
395         print_func_exit();
396 };
397
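/*
 * VMCLEAR the given VMCS so it is flushed from the CPU and left inactive,
 * ready to be made current again with VMPTRLD (possibly on another core).
 */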
398 static void vmcs_clear(struct vmcs *vmcs)
399 {
400         print_func_entry();
401         uint64_t phys_addr = PADDR(vmcs);
402         uint8_t error;
403         printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
404         asm volatile ("vmclear %1; setna %0":"=m" (error):"m"(phys_addr):"cc",
405                                   "memory");
406         if (error)
407                 printk("litevm: vmclear fail: %p/%llx\n", vmcs, phys_addr);
408         print_func_exit();
409 }
410
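/*
 * smp_call_function handler: runs on the core that last ran @arg's vcpu so
 * that vcpu's VMCS can be cleared there before it is loaded elsewhere.
 */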
411 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
412 {
413         print_func_entry();
414         struct litevm_vcpu *vcpu = arg;
415         int cpu = core_id();
416         printd
417                 ("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n",
418                  cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
419
420         if (vcpu->cpu == cpu)
421                 vmcs_clear(vcpu->vmcs);
422
423         if (currentcpu->vmcs == vcpu->vmcs)
424                 currentcpu->vmcs = NULL;
425         print_func_exit();
426 }
427
428 static int vcpu_slot(struct litevm_vcpu *vcpu)
429 {
430         print_func_entry();
431         print_func_exit();
432         return vcpu - vcpu->litevm->vcpus;
433 }
434
435 /*
436  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
437  * vcpu mutex is already taken.
438  */
439 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
440 {
441         print_func_entry();
442         uint64_t phys_addr = PADDR(vcpu->vmcs);
443         int cpu;
444         cpu = core_id();
445
446         if ((vcpu->cpu != cpu) && (vcpu->cpu != -1)){
447                 handler_wrapper_t *w;
448                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
449                 smp_call_wait(w);
450                 vcpu->launched = 0;
451         }
452         if (currentcpu->vmcs != vcpu->vmcs) {
453                 uint8_t error;
454
455                 currentcpu->vmcs = vcpu->vmcs;
456                 asm volatile ("vmptrld %1; setna %0":"=m" (error):"m"(phys_addr):"cc");
457                 if (error) {
458                         printk("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
459                         error("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
460                 }
461         }
462
463         if (vcpu->cpu != cpu) {
464                 struct descriptor_table dt;
465                 unsigned long sysenter_esp;
466
467                 vcpu->cpu = cpu;
468                 /*
469                  * Linux uses per-cpu TSS and GDT, so set these when switching
470                  * processors.
471                  */
472                 vmcs_writel(HOST_TR_BASE, read_tr_base());      /* 22.2.4 */
473                 get_gdt(&dt);
474                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
475
476                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
477                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
478         }
479         print_func_exit();
480         return vcpu;
481 }
482
483 /*
484  * Switches to specified vcpu, until a matching vcpu_put()
485  */
486 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
487 {
488         print_func_entry();
489         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
490
491         printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
492
493         qlock(&vcpu->mutex);
494         if (!vcpu->vmcs) {
495                 qunlock(&vcpu->mutex);
496                 error("vcpu->vmcs is NULL");
497         }
498         print_func_exit();
499         return __vcpu_load(vcpu);
500 }
501
502 static void vcpu_put(struct litevm_vcpu *vcpu)
503 {
504         print_func_entry();
505         //put_cpu();
506         qunlock(&vcpu->mutex);
507         print_func_exit();
508 }
509
510 static struct vmcs *alloc_vmcs_cpu(int cpu)
511 {
512         print_func_entry();
513         int node = node_id();
514         struct vmcs *vmcs;
515
516         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
517         if (!vmcs) {
518                 print_func_exit();
519                 return 0;
520         }
521         memset(vmcs, 0, vmcs_descriptor.size);
522         vmcs->revision_id = vmcs_descriptor.revision_id;        /* vmcs revision id */
523         print_func_exit();
524         return vmcs;
525 }
526
527 static struct vmcs *alloc_vmcs(void)
528 {
529         struct vmcs *ret;
530         print_func_entry();
531         ret = alloc_vmcs_cpu(core_id());
532         print_func_exit();
533         return ret;
534 }
535
536 static int cpu_has_litevm_support(void)
537 {
538         print_func_entry();
539         /* sigh ... qemu. */
540         char vid[16];
541         if (vendor_id(vid) < 0)
542                 return 0;
543         printk("vendor id is %s\n", vid);
544         if (vid[0] == 'Q') /* qemu */
545                 return 0;
546         if (vid[0] == 'A') /* AMD or qemu claiming to be AMD */
547                 return 0;
548         uint32_t ecx = cpuid_ecx(1);
549         print_func_exit();
550         return ecx & (1 << 5);  /* CPUID.1:ECX.VMX[bit 5] -> VT */
551 }
552
553 static int vmx_disabled_by_bios(void)
554 {
555         print_func_entry();
556         uint64_t msr;
557
558         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
559         print_func_exit();
560         return (msr & 5) == 1;  /* lock bit (0) set, VMXON-enable bit (2) clear: locked but not enabled */
561 }
562
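/*
 * Per-core VMX bring-up: allocate and initialize this core's VMXON region,
 * set the lock/enable bits in IA32_FEATURE_CONTROL if the BIOS left them
 * clear, turn on CR4.VMXE, and finally execute VMXON.
 */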
563 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
564 {
565         print_func_entry();
566         int cpu = hw_core_id();
567         uint64_t phys_addr;
568         uint64_t old;
569         uint64_t status = 0;
570         currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
571                                                                                           KMALLOC_WAIT);
572         if (!currentcpu->vmxarea)
573                 return;
574         memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
575         currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
576         phys_addr = PADDR(currentcpu->vmxarea);
577         printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
578                    currentcpu->vmxarea, (void *)phys_addr);
579         if (phys_addr & 0xfff) {
580                 printk("fix vmxarea alignment!");
581         }
582         printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
583         old = read_msr(MSR_IA32_FEATURE_CONTROL);
584         printk("%d: vm_enable, old is %d\n", core_id(), old);
585         if ((old & 5) == 0) {
586                 /* enable and lock */
587                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
588                 old = read_msr(MSR_IA32_FEATURE_CONTROL);
589                 printk("%d:vm_enable, tried to set 5, old is %d\n", core_id(), old);
590         }
591         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
592         lcr4(rcr4() | CR4_VMXE);        /* FIXME: not cpu hotplug safe */
593         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
594         printk("%d:cr0 is %x\n", core_id(), rcr0());
595         lcr0(rcr0() | 0x20);
596         printk("%d:cr0 is %x\n", core_id(), rcr0());
597         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
598         outb(0x92, inb(0x92) | 2);
599         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
600         asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:":"=m" (status):"m"
601                                   (phys_addr):"memory", "cc");
602         printk("%d:vmxon status is %d\n", core_id(), status);
603         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
604         if (!status) {
605                 printk("%d:vm_enable: status says fail\n", core_id());
606         }
607         print_func_exit();
608 }
609
610 static void litevm_disable(void *garbage)
611 {
612         print_func_entry();
613         asm volatile ("vmxoff":::"cc");
614         print_func_exit();
615 }
616
617 struct litevm *vmx_open(void)
618 {
619         print_func_entry();
620         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
621         int i;
622
623         if (!litevm) {
624                 printk("NO LITEVM! MAKES NO SENSE!\n");
625                 error("litevm alloc failed");
626                 print_func_exit();
627                 return 0;
628         }
629
630         spinlock_init_irqsave(&litevm->lock);
631         LIST_INIT(&litevm->link);
632         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
633                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
634
635                 qlock_init(&vcpu->mutex);
636                 vcpu->mmu.root_hpa = INVALID_PAGE;
637                 LIST_INIT(&vcpu->link);
638         }
639         printk("vmx_open: busy %d\n", litevm->busy);
640         printk("return %p\n", litevm);
641         print_func_exit();
642         return litevm;
643 }
644
645 /*
646  * Free any memory in @free but not in @dont.
647  */
648 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
649                                                                          struct litevm_memory_slot *dont)
650 {
651         print_func_entry();
652         int i;
653
654         if (!dont || free->phys_mem != dont->phys_mem)
655                 if (free->phys_mem) {
656                         for (i = 0; i < free->npages; ++i) {
657                                 page_t *page = free->phys_mem[i];
658                                 page_decref(page);
659                                 assert(page_is_free(page2ppn(page)));
660                         }
661                         kfree(free->phys_mem);
662                 }
663
664         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
665                 kfree(free->dirty_bitmap);
666
667         free->phys_mem = 0;
668         free->npages = 0;
669         free->dirty_bitmap = 0;
670         print_func_exit();
671 }
672
673 static void litevm_free_physmem(struct litevm *litevm)
674 {
675         print_func_entry();
676         int i;
677
678         for (i = 0; i < litevm->nmemslots; ++i)
679                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
680         print_func_exit();
681 }
682
683 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
684 {
685         print_func_entry();
686         if (vcpu->vmcs) {
687                 handler_wrapper_t *w;
688                 smp_call_function_all(__vcpu_clear, vcpu, &w);
689                 smp_call_wait(w);
690                 //free_vmcs(vcpu->vmcs);
691                 vcpu->vmcs = 0;
692         }
693         print_func_exit();
694 }
695
696 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
697 {
698         print_func_entry();
699         litevm_free_vmcs(vcpu);
700         litevm_mmu_destroy(vcpu);
701         print_func_exit();
702 }
703
704 static void litevm_free_vcpus(struct litevm *litevm)
705 {
706         print_func_entry();
707         unsigned int i;
708
709         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
710                 litevm_free_vcpu(&litevm->vcpus[i]);
711         print_func_exit();
712 }
713
714 static int litevm_dev_release(struct litevm *litevm)
715 {
716         print_func_entry();
717
718         litevm_free_vcpus(litevm);
719         litevm_free_physmem(litevm);
720         kfree(litevm);
721         print_func_exit();
722         return 0;
723 }
724
725 unsigned long vmcs_readl(unsigned long field)
726 {
727         print_func_entry();
728         unsigned long value;
729
730         asm volatile ("vmread %1, %0":"=g" (value):"r"(field):"cc");
731         print_func_exit();
732         return value;
733 }
734
735 void vmcs_writel(unsigned long field, unsigned long value)
736 {
737         print_func_entry();
738         uint8_t error;
739
740         asm volatile ("vmwrite %1, %2; setna %0":"=g" (error):"r"(value),
741                                   "r"(field):"cc");
742         if (error)
743                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
744                            field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
745         print_func_exit();
746 }
747
748 static void vmcs_write16(unsigned long field, uint16_t value)
749 {
750         print_func_entry();
751         vmcs_writel(field, value);
752         print_func_exit();
753 }
754
755 static void vmcs_write64(unsigned long field, uint64_t value)
756 {
757         print_func_entry();
758 #ifdef __x86_64__
759         vmcs_writel(field, value);
760 #else
761         vmcs_writel(field, value);
762         asm volatile ("");
763         vmcs_writel(field + 1, value >> 32);
764 #endif
765         print_func_exit();
766 }
767
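/*
 * Queue a #GP(0) for the guest via the VM-entry interruption-information
 * field; it will be delivered on the next VM entry.
 */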
768 static void inject_gp(struct litevm_vcpu *vcpu)
769 {
770         print_func_entry();
771         printd("inject_general_protection: rip 0x%lx\n", vmcs_readl(GUEST_RIP));
772         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
773         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
774                                  GP_VECTOR |
775                                  INTR_TYPE_EXCEPTION |
776                                  INTR_INFO_DELIEVER_CODE_MASK | INTR_INFO_VALID_MASK);
777         print_func_exit();
778 }
779
780 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
781 {
782         print_func_entry();
783         if (vcpu->rmode.active)
784                 vmcs_write32(EXCEPTION_BITMAP, ~0);
785         else
786                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
787         print_func_exit();
788 }
789
790 static void enter_pmode(struct litevm_vcpu *vcpu)
791 {
792         print_func_entry();
793         unsigned long flags;
794
795         vcpu->rmode.active = 0;
796
797         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
798         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
799         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
800
801         flags = vmcs_readl(GUEST_RFLAGS);
802         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
803         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
804         vmcs_writel(GUEST_RFLAGS, flags);
805
806         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
807                                 (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK));
808
809         update_exception_bitmap(vcpu);
810
811 #define FIX_PMODE_DATASEG(seg, save) {                          \
812                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
813                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
814                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
815                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
816         }
817
818         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
819         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
820         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
821         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
822         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
823
824         vmcs_write16(GUEST_CS_SELECTOR,
825                                  vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
826         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
827         print_func_exit();
828 }
829
830 static int rmode_tss_base(struct litevm *litevm)
831 {
832         print_func_entry();
833         gfn_t base_gfn =
834                 litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
835         print_func_exit();
836         return base_gfn << PAGE_SHIFT;
837 }
838
839 static void enter_rmode(struct litevm_vcpu *vcpu)
840 {
841         print_func_entry();
842         unsigned long flags;
843
844         vcpu->rmode.active = 1;
845
846         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
847         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
848
849         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
850         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
851
852         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
853         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
854
855         flags = vmcs_readl(GUEST_RFLAGS);
856         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
857
858         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
859
860         vmcs_writel(GUEST_RFLAGS, flags);
861         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
862         update_exception_bitmap(vcpu);
863
864 #define FIX_RMODE_SEG(seg, save) {                                 \
865                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
866                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
867                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
868                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
869         }
870
871         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
872         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
873
874         FIX_RMODE_SEG(ES, vcpu->rmode.es);
875         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
876         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
877         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
878         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
879         print_func_exit();
880 }
881
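/*
 * Set up the dummy TSS (the top three pages of memslot 0) used while the
 * guest runs real-mode code under vm86: zero the pages, point the I/O bitmap
 * offset past the redirection map, and terminate the I/O bitmap with an
 * all-ones byte.
 */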
882 static int init_rmode_tss(struct litevm *litevm)
883 {
884         print_func_entry();
885         struct page *p1, *p2, *p3;
886         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
887         char *page;
888
889         p1 = _gfn_to_page(litevm, fn++);
890         p2 = _gfn_to_page(litevm, fn++);
891         p3 = _gfn_to_page(litevm, fn);
892
893         if (!p1 || !p2 || !p3) {
894                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
895                 print_func_exit();
896                 return 0;
897         }
898
899         page = page2kva(p1);
900         memset(page, 0, PAGE_SIZE);
901         *(uint16_t *) (page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
902
903         page = page2kva(p2);
904         memset(page, 0, PAGE_SIZE);
905
906         page = page2kva(p3);
907         memset(page, 0, PAGE_SIZE);
908         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
909
910         print_func_exit();
911         return 1;
912 }
913
914 #ifdef __x86_64__
915
916 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
917 {
918         print_func_entry();
919         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
920
921         vcpu->shadow_efer = efer;
922         if (efer & EFER_LMA) {
923                 vmcs_write32(VM_ENTRY_CONTROLS,
924                                          vmcs_read32(VM_ENTRY_CONTROLS) |
925                                          VM_ENTRY_CONTROLS_IA32E_MASK);
926                 msr->data = efer;
927
928         } else {
929                 vmcs_write32(VM_ENTRY_CONTROLS,
930                                          vmcs_read32(VM_ENTRY_CONTROLS) &
931                                          ~VM_ENTRY_CONTROLS_IA32E_MASK);
932
933                 msr->data = efer & ~EFER_LME;
934         }
935         print_func_exit();
936 }
937
938 static void enter_lmode(struct litevm_vcpu *vcpu)
939 {
940         print_func_entry();
941         uint32_t guest_tr_ar;
942
943         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
944         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
945                 printd("%s: tss fixup for long mode. \n", __FUNCTION__);
946                 vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
947                                          | AR_TYPE_BUSY_64_TSS);
948         }
949
950         vcpu->shadow_efer |= EFER_LMA;
951
952         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
953         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
954                                  | VM_ENTRY_CONTROLS_IA32E_MASK);
955         print_func_exit();
956 }
957
958 static void exit_lmode(struct litevm_vcpu *vcpu)
959 {
960         print_func_entry();
961         vcpu->shadow_efer &= ~EFER_LMA;
962
963         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
964                                  & ~VM_ENTRY_CONTROLS_IA32E_MASK);
965         print_func_exit();
966 }
967
968 #endif
969
970 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
971 {
972         print_func_entry();
973         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
974                 enter_pmode(vcpu);
975
976         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
977                 enter_rmode(vcpu);
978
979 #ifdef __x86_64__
980         if (vcpu->shadow_efer & EFER_LME) {
981                 if (!is_paging() && (cr0 & CR0_PG_MASK))
982                         enter_lmode(vcpu);
983                 if (is_paging() && !(cr0 & CR0_PG_MASK))
984                         exit_lmode(vcpu);
985         }
986 #endif
987
988         vmcs_writel(CR0_READ_SHADOW, cr0);
989         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
990         print_func_exit();
991 }
992
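/*
 * With PAE paging, CR3 points at four page-directory-pointer-table entries;
 * check them for reserved bits that would raise #GP on real hardware.
 */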
993 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
994                                                                                  unsigned long cr3)
995 {
996         print_func_entry();
997         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
998         unsigned offset = (cr3 & (PAGE_SIZE - 1)) >> 5;
999         int i;
1000         uint64_t pdpte;
1001         uint64_t *pdpt;
1002         struct litevm_memory_slot *memslot;
1003
1004         spin_lock_irqsave(&vcpu->litevm->lock);
1005         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
1006         /* FIXME: !memslot - emulate? 0xff? */
1007         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
1008
1009         for (i = 0; i < 4; ++i) {
1010                 pdpte = pdpt[offset + i];
1011                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
1012                         break;
1013         }
1014
1015         spin_unlock(&vcpu->litevm->lock);
1016
1017         print_func_exit();
1018         return i != 4;
1019 }
1020
1021 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1022 {
1023         print_func_entry();
1024         if (cr0 & CR0_RESEVED_BITS) {
1025                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, guest_cr0());
1026                 inject_gp(vcpu);
1027                 print_func_exit();
1028                 return;
1029         }
1030
1031         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
1032                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
1033                 inject_gp(vcpu);
1034                 print_func_exit();
1035                 return;
1036         }
1037
1038         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
1039                 printd("set_cr0: #GP, set PG flag " "and a clear PE flag\n");
1040                 inject_gp(vcpu);
1041                 print_func_exit();
1042                 return;
1043         }
1044
1045         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
1046 #ifdef __x86_64__
1047                 if ((vcpu->shadow_efer & EFER_LME)) {
1048                         uint32_t guest_cs_ar;
1049                         if (!is_pae()) {
1050                                 printd("set_cr0: #GP, start paging "
1051                                            "in long mode while PAE is disabled\n");
1052                                 inject_gp(vcpu);
1053                                 print_func_exit();
1054                                 return;
1055                         }
1056                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1057                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
1058                                 printd("set_cr0: #GP, start paging "
1059                                            "in long mode while CS.L == 1\n");
1060                                 inject_gp(vcpu);
1061                                 print_func_exit();
1062                                 return;
1063
1064                         }
1065                 } else
1066 #endif
1067                 if (is_pae() && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1068                         printd("set_cr0: #GP, pdptrs " "reserved bits\n");
1069                         inject_gp(vcpu);
1070                         print_func_exit();
1071                         return;
1072                 }
1073
1074         }
1075
1076         __set_cr0(vcpu, cr0);
1077         litevm_mmu_reset_context(vcpu);
1078         print_func_exit();
1079         return;
1080 }
1081
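/*
 * Emulate LMSW: it may set PE (switching the guest into protected mode) but
 * otherwise only updates the low CR0 bits covered by LMSW_GUEST_MASK.
 */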
1082 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
1083 {
1084         print_func_entry();
1085         unsigned long cr0 = guest_cr0();
1086
1087         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
1088                 enter_pmode(vcpu);
1089                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
1090
1091         } else
1092                 printd("lmsw: unexpected\n");
1093
1094         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
1095                                 | (msw & LMSW_GUEST_MASK));
1096         print_func_exit();
1097 }
1098
1099 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1100 {
1101         print_func_entry();
1102         vmcs_writel(CR4_READ_SHADOW, cr4);
1103         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1104                                                                   LITEVM_RMODE_VM_CR4_ALWAYS_ON :
1105                                                                   LITEVM_PMODE_VM_CR4_ALWAYS_ON));
1106         print_func_exit();
1107 }
1108
1109 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1110 {
1111         print_func_entry();
1112         if (cr4 & CR4_RESEVED_BITS) {
1113                 printd("set_cr4: #GP, reserved bits\n");
1114                 inject_gp(vcpu);
1115                 print_func_exit();
1116                 return;
1117         }
1118
1119         if (is_long_mode()) {
1120                 if (!(cr4 & CR4_PAE_MASK)) {
1121                         printd("set_cr4: #GP, clearing PAE while " "in long mode\n");
1122                         inject_gp(vcpu);
1123                         print_func_exit();
1124                         return;
1125                 }
1126         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
1127                            && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1128                 printd("set_cr4: #GP, pdptrs reserved bits\n");
1129                 inject_gp(vcpu);
1130         }
1131
1132         if (cr4 & CR4_VMXE_MASK) {
1133                 printd("set_cr4: #GP, setting VMXE\n");
1134                 inject_gp(vcpu);
1135                 print_func_exit();
1136                 return;
1137         }
1138         __set_cr4(vcpu, cr4);
1139         spin_lock_irqsave(&vcpu->litevm->lock);
1140         litevm_mmu_reset_context(vcpu);
1141         spin_unlock(&vcpu->litevm->lock);
1142         print_func_exit();
1143 }
1144
1145 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1146 {
1147         print_func_entry();
1148         if (is_long_mode()) {
1149                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
1150                         printd("set_cr3: #GP, reserved bits\n");
1151                         inject_gp(vcpu);
1152                         print_func_exit();
1153                         return;
1154                 }
1155         } else {
1156                 if (cr3 & CR3_RESEVED_BITS) {
1157                         printd("set_cr3: #GP, reserved bits\n");
1158                         inject_gp(vcpu);
1159                         print_func_exit();
1160                         return;
1161                 }
1162                 if (is_paging() && is_pae() && pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1163                         printd("set_cr3: #GP, pdptrs " "reserved bits\n");
1164                         inject_gp(vcpu);
1165                         print_func_exit();
1166                         return;
1167                 }
1168         }
1169
1170         vcpu->cr3 = cr3;
1171         spin_lock_irqsave(&vcpu->litevm->lock);
1172         vcpu->mmu.new_cr3(vcpu);
1173         spin_unlock(&vcpu->litevm->lock);
1174         print_func_exit();
1175 }
1176
1177 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1178 {
1179         print_func_entry();
1180         if (cr8 & CR8_RESEVED_BITS) {
1181                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1182                 inject_gp(vcpu);
1183                 print_func_exit();
1184                 return;
1185         }
1186         vcpu->cr8 = cr8;
1187         print_func_exit();
1188 }
1189
1190 static uint32_t get_rdx_init_val(void)
1191 {
1192         print_func_entry();
1193         uint32_t val;
1194
1195 asm("movl $1, %%eax \n\t" "movl %%eax, %0 \n\t":"=g"(val));
1196         print_func_exit();
1197         return val;
1198
1199 }
1200
1201 static void fx_init(struct litevm_vcpu *vcpu)
1202 {
1203         print_func_entry();
1204         struct __attribute__ ((__packed__)) fx_image_s {
1205                 uint16_t control;               //fcw
1206                 uint16_t status;                //fsw
1207                 uint16_t tag;                   // ftw
1208                 uint16_t opcode;                //fop
1209                 uint64_t ip;                    // fpu ip
1210                 uint64_t operand;               // fpu dp
1211                 uint32_t mxcsr;
1212                 uint32_t mxcsr_mask;
1213
1214         } *fx_image;
1215
1216         fx_save(vcpu->host_fx_image);
1217         fpu_init();
1218         fx_save(vcpu->guest_fx_image);
1219         fx_restore(vcpu->host_fx_image);
1220
1221         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1222         fx_image->mxcsr = 0x1f80;
1223         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1224                    0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1225         print_func_exit();
1226 }
1227
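/*
 * Write a VMX control field, forcing on the bits the capability MSR's low
 * 32 bits say must be 1 and clearing the bits its high 32 bits say must be 0.
 */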
1228 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field,
1229                                                                    uint32_t val)
1230 {
1231         print_func_entry();
1232         uint32_t msr_high, msr_low;
1233         uint64_t msrval;
1234
1235         msrval = read_msr(msr);
1236         msr_low = msrval;
1237         msr_high = (msrval >> 32);
1238
1239         val &= msr_high;
1240         val |= msr_low;
1241         vmcs_write32(vmcs_field, val);
1242         print_func_exit();
1243 }
1244
1245 /*
1246  * Sets up the vmcs for emulated real mode.
1247  */
1248 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1249 {
1250         print_func_entry();
1251 /* no op on x86_64 */
1252 #define asmlinkage
1253         extern asmlinkage void litevm_vmx_return(void);
1254         uint32_t host_sysenter_cs;
1255         uint32_t junk;
1256         uint64_t a;
1257         struct descriptor_table dt;
1258         int i;
1259         int ret;
1260         uint64_t tsc;
1261         int nr_good_msrs;
1262
1263         if (!init_rmode_tss(vcpu->litevm)) {
1264                 error("vcpu_setup: init_rmode_tss failed");
1265         }
1266
1267         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1268         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1269         vcpu->cr8 = 0;
1270         vcpu->apic_base = 0xfee00000 |
1271                 /*for vcpu 0 */ MSR_IA32_APICBASE_BSP |
1272                 MSR_IA32_APICBASE_ENABLE;
1273
1274         fx_init(vcpu);
1275
1276 #define SEG_SETUP(seg) do {                                     \
1277                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1278                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1279                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1280                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1281         } while (0)
1282
1283         /*
1284          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1285          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1286          */
1287         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1288         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1289         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1290         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1291
1292         SEG_SETUP(DS);
1293         SEG_SETUP(ES);
1294         SEG_SETUP(FS);
1295         SEG_SETUP(GS);
1296         SEG_SETUP(SS);
1297
1298         vmcs_write16(GUEST_TR_SELECTOR, 0);
1299         vmcs_writel(GUEST_TR_BASE, 0);
1300         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1301         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1302
1303         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1304         vmcs_writel(GUEST_LDTR_BASE, 0);
1305         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1306         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1307
1308         vmcs_write32(GUEST_SYSENTER_CS, 0);
1309         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1310         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1311
1312         vmcs_writel(GUEST_RFLAGS, 0x02);
1313         vmcs_writel(GUEST_RIP, 0xfff0);
1314         vmcs_writel(GUEST_RSP, 0);
1315
1316         vmcs_writel(GUEST_CR3, 0);
1317
1318         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1319         vmcs_writel(GUEST_DR7, 0x400);
1320
1321         vmcs_writel(GUEST_GDTR_BASE, 0);
1322         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1323
1324         vmcs_writel(GUEST_IDTR_BASE, 0);
1325         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1326
1327         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1328         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1329         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1330
1331         /* I/O */
1332         vmcs_write64(IO_BITMAP_A, 0);
1333         vmcs_write64(IO_BITMAP_B, 0);
1334
1335         tsc = read_tsc();
1336         vmcs_write64(TSC_OFFSET, -tsc);
1337
1338         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1339
1340         /* Special registers */
1341         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1342
1343         /* Control */
1344         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_EXT_INTR_MASK       /* 20.6.1 */
1345                                                    | PIN_BASED_NMI_EXITING      /* 20.6.1 */
1346                 );
1347         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_HLT_EXITING        /* 20.6.2 */
1348                                                    | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1349                                                    | CPU_BASED_CR8_STORE_EXITING        /* 20.6.2 */
1350                                                    | CPU_BASED_UNCOND_IO_EXITING        /* 20.6.2 */
1351                                                    | CPU_BASED_INVDPG_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING  /* 21.3 */
1352                 );
1353
1354         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1355         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1356         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1357         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1358
1359         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1360         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1361         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1362
1363 #warning "not setting selectors; do we need them?"
1364 #if 0
1365         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);    /* 22.2.4 */
1366         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1367         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1368 #endif
1369         vmcs_write16(HOST_FS_SELECTOR, read_fs());      /* 22.2.4 */
1370         vmcs_write16(HOST_GS_SELECTOR, read_gs());      /* 22.2.4 */
1371 #if 0
1372         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1373 #endif
1374 #ifdef __x86_64__
1375         a = read_msr(MSR_FS_BASE);
1376         vmcs_writel(HOST_FS_BASE, a);   /* 22.2.4 */
1377         a = read_msr(MSR_GS_BASE);
1378         vmcs_writel(HOST_GS_BASE, a);   /* 22.2.4 */
1379 #else
1380         vmcs_writel(HOST_FS_BASE, 0);   /* 22.2.4 */
1381         vmcs_writel(HOST_GS_BASE, 0);   /* 22.2.4 */
1382 #endif
1383
1384 #warning "Not setting HOST_TR_SELECTOR"
1385 #if 0
1386         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS * 8);      /* 22.2.4 */
1387 #endif
1388
1389         get_idt(&dt);
1390         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1391
1392         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return);        /* 22.2.5 */
1393
1394         /* it's the HIGH 32 bits! */
1395         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1396         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1397         a = read_msr(MSR_IA32_SYSENTER_ESP);
1398         vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1399         a = read_msr(MSR_IA32_SYSENTER_EIP);
1400         vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1401
1402         ret = -ENOMEM;
1403         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1404         if (!vcpu->guest_msrs)
1405                 error("guest_msrs kmalloc failed");
1406         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1407         if (!vcpu->host_msrs)
1408                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1409
1410         for (i = 0; i < NR_VMX_MSR; ++i) {
1411                 uint32_t index = vmx_msr_index[i];
1412                 uint32_t data_low, data_high;
1413                 uint64_t data;
1414                 int j = vcpu->nmsrs;
1415
1416 #warning "need readmsr_safe"
1417 //      if (rdmsr_safe(index, &data_low, &data_high) < 0)
1418 //          continue;
1419                 data = read_msr(index);
1420                 vcpu->host_msrs[j].index = index;
1421                 vcpu->host_msrs[j].reserved = 0;
1422                 vcpu->host_msrs[j].data = data;
1423                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1424                 ++vcpu->nmsrs;
1425         }
1426         printk("msrs: %d\n", vcpu->nmsrs);
1427
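        /*
         * Leave the first NR_BAD_MSRS entries (MSR_SYSCALL_MASK and MSR_LSTAR,
         * per the AA24 note above) out of the automatic VM-entry/exit MSR
         * lists; only the remaining "good" MSRs are saved and loaded by
         * hardware.
         */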
1428         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1429         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1430         vmcs_writel(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1431         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1432         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, (HOST_IS_64 << 9));        /* 22.2.1, 20.7.1 */
1433         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs);    /* 22.2.2 */
1434         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);     /* 22.2.2 */
1435         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs);    /* 22.2.2 */
1436
1437         /* 22.2.1, 20.8.1 */
1438         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CONTROLS, 0);
1439         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1440
1441         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1442         vmcs_writel(TPR_THRESHOLD, 0);
1443
1444         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1445         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1446
1447         __set_cr0(vcpu, 0x60000010);    // enter rmode
1448         __set_cr4(vcpu, 0);
1449 #ifdef __x86_64__
1450         __set_efer(vcpu, 0);
1451 #endif
1452
1453         ret = litevm_mmu_init(vcpu);
1454
1455         print_func_exit();
1456         return ret;
1457
1458 out_free_guest_msrs:
1459         kfree(vcpu->guest_msrs);
1460 out:
1461         return ret;
1462 }
1463
1464 /*
1465  * Sync the rsp and rip registers into the vcpu structure.  This allows
1466  * registers to be accessed by indexing vcpu->regs.
1467  */
1468 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1469 {
1470         print_func_entry();
1471         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1472         vcpu->rip = vmcs_readl(GUEST_RIP);
1473         print_func_exit();
1474 }
1475
1476 /*
1477  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1478  * modification.
1479  */
1480 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1481 {
1482         print_func_entry();
1483         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1484         vmcs_writel(GUEST_RIP, vcpu->rip);
1485         print_func_exit();
1486 }
1487
1488 /*
1489  * Creates some virtual cpus.  Good luck creating more than one.
1490  */
1491 int vmx_create_vcpu(struct litevm *litevm, int n)
1492 {
1493         print_func_entry();
1494         ERRSTACK(1);
1495         int r;
1496         struct litevm_vcpu *vcpu;
1497         struct vmcs *vmcs;
1498         char *errstring = NULL;
1499
1500         if (n < 0 || n >= LITEVM_MAX_VCPUS) {
1501                 printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1502                            LITEVM_MAX_VCPUS);
1503                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1504                           LITEVM_MAX_VCPUS);
1505         }
1506
1507         vcpu = &litevm->vcpus[n];
1508
1509         qlock(&vcpu->mutex);
1510
1511         if (vcpu->vmcs) {
1512                 qunlock(&vcpu->mutex);
1513                 printk("VM already exists\n");
1514                 error("VM already exists");
1515         }
1516
1517         /* I'm a bad person */
1518         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1519         uint64_t a = (uint64_t) vcpu->fx_buf;
1520         a += FX_IMAGE_ALIGN - 1;
1521         a /= FX_IMAGE_ALIGN;
1522         a *= FX_IMAGE_ALIGN;
1523
1524         vcpu->host_fx_image = (char *)a;
1525         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1526
1527         vcpu->cpu = -1; /* First load will set up TR */
1528         vcpu->litevm = litevm;
1529
1530         vmcs = alloc_vmcs();
1531         if (!vmcs) {
1532                 errstring = "vmcs allocate failed";
1533                 printk("%s\n", errstring);
1534                 qunlock(&vcpu->mutex);
1535                 goto out_free_vcpus;
1536         }
1537         vmcs_clear(vmcs);
1538         printk("after vmcs_clear\n");
1539         vcpu->vmcs = vmcs;
1540         vcpu->launched = 0;
1541         printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
1542         error("before vcpu_load");
1543         __vcpu_load(vcpu);
1544
1545         printk("PAST vcpu_load\n");
1546 #warning unmatched waserror!
1547         if (waserror()) {
1548                 /* we really need to fix waserror() */
1549                 poperror();
1550                 goto out_free_vcpus;
1551         }
1552
1553         r = litevm_vcpu_setup(vcpu);
1554
1555         vcpu_put(vcpu);
1556
1557         printk("r is %d\n", r);
1558
1559         if (!r) {
1560
1561                 print_func_exit();
1562                 return 0;
1563         }
1564
1565         errstring = "vcup set failed";
1566
1567 out_free_vcpus:
1568         printk("out_free_vcpus: life sucks\n");
1569         litevm_free_vcpu(vcpu);
1570         error(errstring);
1571 out:
1572         print_func_exit();
1573         return r;
1574 }
1575
1576 /*
1577  * Allocate some memory and give it an address in the guest physical address
1578  * space.
1579  *
1580  * Discontiguous memory is allowed, mostly for framebuffers.
1581  */
1582 int vm_set_memory_region(struct litevm *litevm,
1583                                                  struct litevm_memory_region *mem)
1584 {
1585         print_func_entry();
1586         ERRSTACK(2);
1587         int r;
1588         gfn_t base_gfn;
1589         unsigned long npages;
1590         unsigned long i;
1591         struct litevm_memory_slot *memslot;
1592         struct litevm_memory_slot old, new;
1593         int memory_config_version;
1594         void *init_data = mem->init_data;
1595         int pass = 1;
1596
1597         printk("litevm %p\n", litevm);
1598         /* should not happen but ... */
1599         if (!litevm)
1600                 error("NULL litevm in %s", __func__);
1601
1602         if (!mem)
1603                 error("NULL mem in %s", __func__);
1604
1605         if (litevm->busy)
1606                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1607         r = -EINVAL;
1608         /* General sanity checks */
1609         if (mem->memory_size & (PAGE_SIZE - 1))
1610                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1611         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1612                 error("guest_phys_addr 0x%llx is not page-aligned",
1613                           mem->guest_phys_addr);
1614         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1615                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1616         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1617                 error("0x%x + 0x%x is < 0x%x",
1618                           mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1619
1620         memslot = &litevm->memslots[mem->slot];
1621         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1622         npages = mem->memory_size >> PAGE_SHIFT;
1623
1624         if (!npages)
1625                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1626
1627         /* this is actually a very tricky for loop. The use of
1628          * error is a bit dangerous, so we don't use it much.
1629          * consider a rewrite. Would be nice if akaros could do the
1630          * allocation of a bunch of pages for us.
1631          */
1632 raced:
1633         printk("raced: pass %d\n", pass);
1634         spin_lock_irqsave(&litevm->lock);
1635         printk("locked\n");
1636
1637         if (waserror()) {
1638                 spin_unlock(&litevm->lock);
1639                 nexterror();
1640         }
1641
1642         memory_config_version = litevm->memory_config_version;
1643         new = old = *memslot;
1644
1645         new.base_gfn = base_gfn;
1646         new.npages = npages;
1647         new.flags = mem->flags;
1648
1649         /* Disallow changing a memory slot's size. */
1650         r = -EINVAL;
1651         if (npages && old.npages && npages != old.npages)
1652                 error("npages is %lu, old.npages is %lu, can't change",
1653                           npages, old.npages);
1654
1655         /* Check for overlaps */
1656         r = -EEXIST;
1657         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1658                 struct litevm_memory_slot *s = &litevm->memslots[i];
1659
1660                 if (s == memslot)
1661                         continue;
1662                 if (!((base_gfn + npages <= s->base_gfn) ||
1663                           (base_gfn >= s->base_gfn + s->npages)))
1664                         error("Overlap");
1665         }
1666         /*
1667          * Do memory allocations outside lock.  memory_config_version will
1668          * detect any races.
1669          */
1670         spin_unlock(&litevm->lock);
1671         printk("unlocked\n");
1672         poperror();
1673
1674         /* Deallocate if slot is being removed */
1675         if (!npages)
1676                 new.phys_mem = 0;
1677
1678         /* Free page dirty bitmap if unneeded */
1679         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1680                 new.dirty_bitmap = 0;
1681
1682         r = -ENOMEM;
1683
1684         /* Allocate if a slot is being created */
1685         if (npages && !new.phys_mem) {
1686                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1687
1688                 if (!new.phys_mem)
1689                         goto out_free;
1690
1691                 for (i = 0; i < npages; ++i) {
1692                         int ret;
1693                         ret = kpage_alloc(&new.phys_mem[i]);
1694                         if (ret != ESUCCESS)
1695                                 goto out_free;
1696                         if (init_data) {
1697                                 printk("init data memcpy(%p,%p,4096);\n",
1698                                            page2kva(new.phys_mem[i]), init_data);
1699                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1700                                 init_data += PAGE_SIZE;
1701                         }
1702                 }
1703         }
1704
1705         /* Allocate page dirty bitmap if needed */
1706         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1707                 unsigned dirty_bytes;   //ALIGN(npages, BITS_PER_LONG) / 8;
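                /* One dirty bit per page: round npages up to a whole number of
                 * longs, then convert that bit count to bytes. */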
1708                 dirty_bytes =
1709                         (((npages + BITS_PER_LONG -
1710                            1) / BITS_PER_LONG) * BITS_PER_LONG) / 8;
1711
1712                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1713                 if (!new.dirty_bitmap) {
1714                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1715                         goto out_free;
1716                 }
1717         }
1718
1719         spin_lock_irqsave(&litevm->lock);
1720         printk("locked\n");
1721         if (memory_config_version != litevm->memory_config_version) {
1722                 spin_unlock(&litevm->lock);
1723                 printk("unlocked, try again\n");
1724                 litevm_free_physmem_slot(&new, &old);
1725                 goto raced;
1726         }
1727
1728         r = -EAGAIN;
1729         if (litevm->busy) {
1730                 printk("BUSY!\n");
1731                 goto out_unlock;
1732         }
1733
1734         if (mem->slot >= litevm->nmemslots)
1735                 litevm->nmemslots = mem->slot + 1;
1736
1737         *memslot = new;
1738         ++litevm->memory_config_version;
1739
1740         spin_unlock(&litevm->lock);
1741         printk("unlocked\n");
1742         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1743                 struct litevm_vcpu *vcpu;
1744
1745                 vcpu = vcpu_load(litevm, i);
1746                 if (!vcpu)
1747                         continue;
1748                 litevm_mmu_reset_context(vcpu);
1749                 vcpu_put(vcpu);
1750         }
1751
1752         litevm_free_physmem_slot(&old, &new);
1753         print_func_exit();
1754         return 0;
1755
1756 out_unlock:
1757         spin_unlock(&litevm->lock);
1758         printk("out_unlock\n");
1759 out_free:
1760         printk("out_free\n");
1761         litevm_free_physmem_slot(&new, &old);
1762 out:
1763         printk("vm_set_memory_region: return %d\n", r);
1764         print_func_exit();
1765         return r;
1766 }
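/*
 * Usage sketch (not part of the original file): a hypothetical caller
 * describing 16 pages of guest RAM at guest-physical address 0 in slot 0.
 * Only fields this function actually reads (slot, guest_phys_addr,
 * memory_size, flags, init_data) are shown, and 'litevm' is assumed to be a
 * previously created struct litevm *.  On success the call returns 0; on
 * failure it either throws via error() or returns a negative errno.
 *
 *	struct litevm_memory_region region = {
 *		.slot = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size = 16 * PAGE_SIZE,
 *		.flags = 0,		// or LITEVM_MEM_LOG_DIRTY_PAGES
 *		.init_data = NULL,	// no initial contents to copy in
 *	};
 *	if (vm_set_memory_region(litevm, &region) == 0)
 *		printk("slot 0 configured\n");
 */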
1767
1768 #if 0
1769 /*
1770  * Get (and clear) the dirty memory log for a memory slot.
1771  */
1772 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1773                                                                                   struct litevm_dirty_log *log)
1774 {
1775         struct litevm_memory_slot *memslot;
1776         int r, i;
1777         int n;
1778         unsigned long any = 0;
1779
1780         spin_lock_irqsave(&litevm->lock);
1781
1782         /*
1783          * Prevent changes to guest memory configuration even while the lock
1784          * is not taken.
1785          */
1786         ++litevm->busy;
1787         spin_unlock(&litevm->lock);
1788         r = -EINVAL;
1789         if (log->slot >= LITEVM_MEMORY_SLOTS)
1790                 goto out;
1791
1792         memslot = &litevm->memslots[log->slot];
1793         r = -ENOENT;
1794         if (!memslot->dirty_bitmap)
1795                 goto out;
1796
1797         n = ALIGN(memslot->npages, 8) / 8;
1798
1799         for (i = 0; !any && i < n; ++i)
1800                 any = memslot->dirty_bitmap[i];
1801
1802         r = -EFAULT;
1803         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1804                 goto out;
1805
1806         if (any) {
1807                 spin_lock_irqsave(&litevm->lock);
1808                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1809                 spin_unlock(&litevm->lock);
1810                 memset(memslot->dirty_bitmap, 0, n);
1811                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1812                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1813
1814                         if (!vcpu)
1815                                 continue;
1816                         flush_guest_tlb(vcpu);
1817                         vcpu_put(vcpu);
1818                 }
1819         }
1820
1821         r = 0;
1822
1823 out:
1824         spin_lock_irqsave(&litevm->lock);
1825         --litevm->busy;
1826         spin_unlock(&litevm->lock);
1827         return r;
1828 }
1829 #endif
1830
1831 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1832 {
1833         print_func_entry();
1834         int i;
1835
1836         for (i = 0; i < litevm->nmemslots; ++i) {
1837                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1838
1839                 if (gfn >= memslot->base_gfn
1840                         && gfn < memslot->base_gfn + memslot->npages) {
1841                         print_func_exit();
1842                         return memslot;
1843                 }
1844         }
1845         print_func_exit();
1846         return 0;
1847 }
1848
1849 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1850 {
1851         print_func_entry();
1852         int i;
1853         struct litevm_memory_slot *memslot = 0;
1854         unsigned long rel_gfn;
1855
1856         for (i = 0; i < litevm->nmemslots; ++i) {
1857                 memslot = &litevm->memslots[i];
1858
1859                 if (gfn >= memslot->base_gfn
1860                         && gfn < memslot->base_gfn + memslot->npages) {
1861
1862                         if (!memslot || !memslot->dirty_bitmap) {
1863                                 print_func_exit();
1864                                 return;
1865                         }
1866
1867                         rel_gfn = gfn - memslot->base_gfn;
1868
1869                         /* avoid RMW */
1870                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1871                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1872                         print_func_exit();
1873                         return;
1874                 }
1875         }
1876         print_func_exit();
1877 }
1878
1879 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1880 {
1881         print_func_entry();
1882         unsigned long rip;
1883         uint32_t interruptibility;
1884
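        /* Advance the guest RIP past the exiting instruction, using the length
         * the CPU recorded in the VMCS at exit time. */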
1885         rip = vmcs_readl(GUEST_RIP);
1886         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1887         vmcs_writel(GUEST_RIP, rip);
1888
1889         /*
1890          * We emulated an instruction, so temporary interrupt blocking
1891          * should be removed, if set.
1892          */
1893         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1894         if (interruptibility & 3)
1895                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3);
1896         print_func_exit();
1897 }
1898
1899 static int emulator_read_std(unsigned long addr,
1900                                                          unsigned long *val,
1901                                                          unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1902 {
1903         print_func_entry();
1904         struct litevm_vcpu *vcpu = ctxt->vcpu;
1905         void *data = val;
1906
1907         while (bytes) {
1908                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1909                 unsigned offset = addr & (PAGE_SIZE - 1);
1910                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
1911                         bytes : (unsigned)PAGE_SIZE - offset;
1912                 unsigned long pfn;
1913                 struct litevm_memory_slot *memslot;
1914                 void *page;
1915
1916                 if (gpa == UNMAPPED_GVA) {
1917                         print_func_exit();
1918                         return X86EMUL_PROPAGATE_FAULT;
1919                 }
1920                 pfn = gpa >> PAGE_SHIFT;
1921                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1922                 if (!memslot) {
1923                         print_func_exit();
1924                         return X86EMUL_UNHANDLEABLE;
1925                 }
1926                 page = page2kva(gfn_to_page(memslot, pfn));
1927
1928                 memcpy(data, page + offset, tocopy);
1929
1930                 bytes -= tocopy;
1931                 data += tocopy;
1932                 addr += tocopy;
1933         }
1934
1935         print_func_exit();
1936         return X86EMUL_CONTINUE;
1937 }
1938
1939 static int emulator_write_std(unsigned long addr,
1940                                                           unsigned long val,
1941                                                           unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1942 {
1943         print_func_entry();
1944         printk("emulator_write_std: addr %lx n %d\n", addr, bytes);
1945         print_func_exit();
1946         return X86EMUL_UNHANDLEABLE;
1947 }
1948
1949 static int emulator_read_emulated(unsigned long addr,
1950                                                                   unsigned long *val,
1951                                                                   unsigned int bytes,
1952                                                                   struct x86_emulate_ctxt *ctxt)
1953 {
1954         print_func_entry();
1955         struct litevm_vcpu *vcpu = ctxt->vcpu;
1956
1957         if (vcpu->mmio_read_completed) {
1958                 memcpy(val, vcpu->mmio_data, bytes);
1959                 vcpu->mmio_read_completed = 0;
1960                 print_func_exit();
1961                 return X86EMUL_CONTINUE;
1962         } else if (emulator_read_std(addr, val, bytes, ctxt)
1963                            == X86EMUL_CONTINUE) {
1964                 print_func_exit();
1965                 return X86EMUL_CONTINUE;
1966         } else {
1967                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1968                 if (gpa == UNMAPPED_GVA) {
1969                         print_func_exit();
1970                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
1971                 }
1972                 vcpu->mmio_needed = 1;
1973                 vcpu->mmio_phys_addr = gpa;
1974                 vcpu->mmio_size = bytes;
1975                 vcpu->mmio_is_write = 0;
1976
1977                 print_func_exit();
1978                 return X86EMUL_UNHANDLEABLE;
1979         }
1980 }
1981
1982 static int emulator_write_emulated(unsigned long addr,
1983                                                                    unsigned long val,
1984                                                                    unsigned int bytes,
1985                                                                    struct x86_emulate_ctxt *ctxt)
1986 {
1987         print_func_entry();
1988         struct litevm_vcpu *vcpu = ctxt->vcpu;
1989         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1990
1991         if (gpa == UNMAPPED_GVA) {
1992                 print_func_exit();
1993                 return X86EMUL_PROPAGATE_FAULT;
1994         }
1995
1996         vcpu->mmio_needed = 1;
1997         vcpu->mmio_phys_addr = gpa;
1998         vcpu->mmio_size = bytes;
1999         vcpu->mmio_is_write = 1;
2000         memcpy(vcpu->mmio_data, &val, bytes);
2001
2002         print_func_exit();
2003         return X86EMUL_CONTINUE;
2004 }
2005
2006 static int emulator_cmpxchg_emulated(unsigned long addr,
2007                                                                          unsigned long old,
2008                                                                          unsigned long new,
2009                                                                          unsigned int bytes,
2010                                                                          struct x86_emulate_ctxt *ctxt)
2011 {
2012         print_func_entry();
2013         static int reported;
2014
2015         if (!reported) {
2016                 reported = 1;
2017                 printk("litevm: emulating exchange as write\n");
2018         }
2019         print_func_exit();
2020         return emulator_write_emulated(addr, new, bytes, ctxt);
2021 }
2022
2023 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2024 {
2025         print_func_entry();
2026         static int reported;
2027         uint8_t opcodes[4];
2028         unsigned long rip = vmcs_readl(GUEST_RIP);
2029         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2030
2031         if (reported) {
2032                 print_func_exit();
2033                 return;
2034         }
2035
2036         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2037
2038         printk("emulation failed but !mmio_needed?"
2039                    " rip %lx %02x %02x %02x %02x\n",
2040                    rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2041         reported = 1;
2042         print_func_exit();
2043 }
2044
2045 struct x86_emulate_ops emulate_ops = {
2046         .read_std = emulator_read_std,
2047         .write_std = emulator_write_std,
2048         .read_emulated = emulator_read_emulated,
2049         .write_emulated = emulator_write_emulated,
2050         .cmpxchg_emulated = emulator_cmpxchg_emulated,
2051 };
2052
2053 enum emulation_result {
2054         EMULATE_DONE,                           /* no further processing */
2055         EMULATE_DO_MMIO,                        /* litevm_run filled with mmio request */
2056         EMULATE_FAIL,                           /* can't emulate this instruction */
2057 };
2058
2059 static int emulate_instruction(struct litevm_vcpu *vcpu,
2060                                                            struct litevm_run *run,
2061                                                            unsigned long cr2, uint16_t error_code)
2062 {
2063         print_func_entry();
2064         struct x86_emulate_ctxt emulate_ctxt;
2065         int r;
2066         uint32_t cs_ar;
2067
2068         vcpu_load_rsp_rip(vcpu);
2069
2070         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2071
2072         emulate_ctxt.vcpu = vcpu;
2073         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2074         emulate_ctxt.cr2 = cr2;
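        /* Select the emulator mode from guest state: real/virtual-8086 mode if
         * EFLAGS.VM is set, 64-bit if CS.L is set, 32-bit if CS.D/B is set,
         * and 16-bit protected mode otherwise. */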
2075         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2076                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2077                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2078                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2079
2080         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2081                 emulate_ctxt.cs_base = 0;
2082                 emulate_ctxt.ds_base = 0;
2083                 emulate_ctxt.es_base = 0;
2084                 emulate_ctxt.ss_base = 0;
2085                 emulate_ctxt.gs_base = 0;
2086                 emulate_ctxt.fs_base = 0;
2087         } else {
2088                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2089                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2090                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2091                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2092                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2093                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2094         }
2095
2096         vcpu->mmio_is_write = 0;
2097         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2098
2099         if ((r || vcpu->mmio_is_write) && run) {
2100                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2101                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2102                 run->mmio.len = vcpu->mmio_size;
2103                 run->mmio.is_write = vcpu->mmio_is_write;
2104         }
2105
2106         if (r) {
2107                 if (!vcpu->mmio_needed) {
2108                         report_emulation_failure(&emulate_ctxt);
2109                         print_func_exit();
2110                         return EMULATE_FAIL;
2111                 }
2112                 print_func_exit();
2113                 return EMULATE_DO_MMIO;
2114         }
2115
2116         vcpu_put_rsp_rip(vcpu);
2117         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2118
2119         if (vcpu->mmio_is_write) {
2120                 print_func_exit();
2121                 return EMULATE_DO_MMIO;
2122         }
2123
2124         print_func_exit();
2125         return EMULATE_DONE;
2126 }
2127
2128 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2129 {
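        /* Replace the low 32 bits of the current control register value with
         * new_val, preserving the upper half. */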
2130         print_func_entry();
2131         print_func_exit();
2132         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2133 }
2134
2135 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2136 {
2137         print_func_entry();
2138         vmcs_writel(GUEST_GDTR_BASE, base);
2139         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2140         print_func_exit();
2141 }
2142
2143 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2144 {
2145         print_func_entry();
2146         vmcs_writel(GUEST_IDTR_BASE, base);
2147         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2148         print_func_exit();
2149 }
2150
2151 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2152                                    unsigned long *rflags)
2153 {
2154         print_func_entry();
2155         lmsw(vcpu, msw);
2156         *rflags = vmcs_readl(GUEST_RFLAGS);
2157         print_func_exit();
2158 }
2159
2160 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2161 {
2162         print_func_entry();
2163         switch (cr) {
2164                 case 0:
2165                         print_func_exit();
2166                         return guest_cr0();
2167                 case 2:
2168                         print_func_exit();
2169                         return vcpu->cr2;
2170                 case 3:
2171                         print_func_exit();
2172                         return vcpu->cr3;
2173                 case 4:
2174                         print_func_exit();
2175                         return guest_cr4();
2176                 default:
2177                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2178                         print_func_exit();
2179                         return 0;
2180         }
2181 }
2182
2183 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2184                                          unsigned long *rflags)
2185 {
2186         print_func_entry();
2187         switch (cr) {
2188                 case 0:
2189                         set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2190                         *rflags = vmcs_readl(GUEST_RFLAGS);
2191                         break;
2192                 case 2:
2193                         vcpu->cr2 = val;
2194                         break;
2195                 case 3:
2196                         set_cr3(vcpu, val);
2197                         break;
2198                 case 4:
2199                         set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2200                         break;
2201                 default:
2202                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2203         }
2204         print_func_exit();
2205 }
2206
2207 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2208                                                                   int vec, uint32_t err_code)
2209 {
2210         print_func_entry();
2211         if (!vcpu->rmode.active) {
2212                 print_func_exit();
2213                 return 0;
2214         }
2215
2216         if (vec == GP_VECTOR && err_code == 0)
2217                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2218                         print_func_exit();
2219                         return 1;
2220                 }
2221         print_func_exit();
2222         return 0;
2223 }
2224
2225 static int handle_exception(struct litevm_vcpu *vcpu,
2226                                                         struct litevm_run *litevm_run)
2227 {
2228         print_func_entry();
2229         uint32_t intr_info, error_code;
2230         unsigned long cr2, rip;
2231         uint32_t vect_info;
2232         enum emulation_result er;
2233
2234         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2235         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2236
2237         if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) {
2238                 printk("%s: unexpected, vectoring info 0x%x "
2239                            "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2240         }
2241
2242         if (is_external_interrupt(vect_info)) {
2243                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2244                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_pending), irq);
2245                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_summary),
2246                                                            irq / BITS_PER_LONG);
2247         }
2248
2249         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) {  /* nmi */
2250                 asm("int $2");
2251                 print_func_exit();
2252                 return 1;
2253         }
2254         error_code = 0;
2255         rip = vmcs_readl(GUEST_RIP);
2256         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2257                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2258         if (is_page_fault(intr_info)) {
2259                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2260
2261                 spin_lock_irqsave(&vcpu->litevm->lock);
2262                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2263                         spin_unlock(&vcpu->litevm->lock);
2264                         print_func_exit();
2265                         return 1;
2266                 }
2267
2268                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2269                 spin_unlock(&vcpu->litevm->lock);
2270
2271                 switch (er) {
2272                         case EMULATE_DONE:
2273                                 print_func_exit();
2274                                 return 1;
2275                         case EMULATE_DO_MMIO:
2276                                 ++litevm_stat.mmio_exits;
2277                                 litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2278                                 print_func_exit();
2279                                 return 0;
2280                         case EMULATE_FAIL:
2281                                 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2282                                 break;
2283                         default:
2284                                 assert(0);
2285                 }
2286         }
2287
2288         if (vcpu->rmode.active &&
2289                 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2290                                                            error_code)) {
2291                 print_func_exit();
2292                 return 1;
2293         }
2294
2295         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
2296                 (INTR_TYPE_EXCEPTION | 1)) {
2297                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2298                 print_func_exit();
2299                 return 0;
2300         }
2301         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2302         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2303         litevm_run->ex.error_code = error_code;
2304         print_func_exit();
2305         return 0;
2306 }
2307
2308 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2309                                                                          struct litevm_run *litevm_run)
2310 {
2311         print_func_entry();
2312         ++litevm_stat.irq_exits;
2313         print_func_exit();
2314         return 1;
2315 }
2316
2317 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t * count)
2318 {
2319         print_func_entry();
2320         uint64_t inst;
2321         gva_t rip;
2322         int countr_size;
2323         int i, n;
2324
2325         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2326                 countr_size = 2;
2327         } else {
2328                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2329
2330                 countr_size = (cs_ar & AR_L_MASK) ? 8 : (cs_ar & AR_DB_MASK) ? 4 : 2;
2331         }
2332
2333         rip = vmcs_readl(GUEST_RIP);
2334         if (countr_size != 8)
2335                 rip += vmcs_readl(GUEST_CS_BASE);
2336
2337         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2338
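        /* Scan the instruction bytes for x86 prefixes: lock (0xf0), rep
         * (0xf2/0xf3), segment overrides, and the operand-size prefix (0x66)
         * are skipped; the address-size prefix (0x67) changes the width of the
         * count register; any other byte ends the scan. */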
2339         for (i = 0; i < n; i++) {
2340                 switch (((uint8_t *) & inst)[i]) {
2341                         case 0xf0:
2342                         case 0xf2:
2343                         case 0xf3:
2344                         case 0x2e:
2345                         case 0x36:
2346                         case 0x3e:
2347                         case 0x26:
2348                         case 0x64:
2349                         case 0x65:
2350                         case 0x66:
2351                                 break;
2352                         case 0x67:
2353                                 countr_size = (countr_size == 2) ? 4 : (countr_size >> 1);
2354                         default:
2355                                 goto done;
2356                 }
2357         }
2358         print_func_exit();
2359         return 0;
2360 done:
2361         countr_size *= 8;
2362         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2363         print_func_exit();
2364         return 1;
2365 }
2366
2367 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2368 {
2369         print_func_entry();
2370         uint64_t exit_qualification;
2371
2372         ++litevm_stat.io_exits;
2373         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
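        /* Decode the I/O exit qualification: bits 2:0 are the access size
         * minus one, bit 3 is set for IN, bit 4 for a string instruction,
         * bit 5 for a REP prefix, and bits 31:16 hold the port number. */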
2374         litevm_run->exit_reason = LITEVM_EXIT_IO;
2375         if (exit_qualification & 8)
2376                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2377         else
2378                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2379         litevm_run->io.size = (exit_qualification & 7) + 1;
2380         litevm_run->io.string = (exit_qualification & 16) != 0;
2381         litevm_run->io.string_down
2382                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2383         litevm_run->io.rep = (exit_qualification & 32) != 0;
2384         litevm_run->io.port = exit_qualification >> 16;
2385         if (litevm_run->io.string) {
2386                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2387                         print_func_exit();
2388                         return 1;
2389                 }
2390                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2391         } else
2392                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX];       /* rax */
2393         print_func_exit();
2394         return 0;
2395 }
2396
2397 static int handle_invlpg(struct litevm_vcpu *vcpu,
2398                                                  struct litevm_run *litevm_run)
2399 {
2400         print_func_entry();
2401         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2402         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2403         spin_lock_irqsave(&vcpu->litevm->lock);
2404         vcpu->mmu.inval_page(vcpu, address);
2405         spin_unlock(&vcpu->litevm->lock);
2406         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2407         print_func_exit();
2408         return 1;
2409 }
2410
2411 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2412 {
2413         print_func_entry();
2414         uint64_t exit_qualification;
2415         int cr;
2416         int reg;
2417
2418 #ifdef LITEVM_DEBUG
2419         if (guest_cpl() != 0) {
2420                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2421                 inject_gp(vcpu);
2422                 print_func_exit();
2423                 return 1;
2424         }
2425 #endif
2426
2427         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
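        /* Exit qualification for CR accesses: bits 3:0 give the control
         * register number, bits 5:4 the access type (0 = mov to CR, 1 = mov
         * from CR, 3 = LMSW), and bits 11:8 the general-purpose register. */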
2428         cr = exit_qualification & 15;
2429         reg = (exit_qualification >> 8) & 15;
2430         switch ((exit_qualification >> 4) & 3) {
2431                 case 0: /* mov to cr */
2432                         switch (cr) {
2433                                 case 0:
2434                                         vcpu_load_rsp_rip(vcpu);
2435                                         set_cr0(vcpu, vcpu->regs[reg]);
2436                                         skip_emulated_instruction(vcpu);
2437                                         print_func_exit();
2438                                         return 1;
2439                                 case 3:
2440                                         vcpu_load_rsp_rip(vcpu);
2441                                         set_cr3(vcpu, vcpu->regs[reg]);
2442                                         skip_emulated_instruction(vcpu);
2443                                         print_func_exit();
2444                                         return 1;
2445                                 case 4:
2446                                         vcpu_load_rsp_rip(vcpu);
2447                                         set_cr4(vcpu, vcpu->regs[reg]);
2448                                         skip_emulated_instruction(vcpu);
2449                                         print_func_exit();
2450                                         return 1;
2451                                 case 8:
2452                                         vcpu_load_rsp_rip(vcpu);
2453                                         set_cr8(vcpu, vcpu->regs[reg]);
2454                                         skip_emulated_instruction(vcpu);
2455                                         print_func_exit();
2456                                         return 1;
2457                         };
2458                         break;
2459                 case 1: /*mov from cr */
2460                         switch (cr) {
2461                                 case 3:
2462                                         vcpu_load_rsp_rip(vcpu);
2463                                         vcpu->regs[reg] = vcpu->cr3;
2464                                         vcpu_put_rsp_rip(vcpu);
2465                                         skip_emulated_instruction(vcpu);
2466                                         print_func_exit();
2467                                         return 1;
2468                                 case 8:
2469                                         printd("handle_cr: read CR8 " "cpu erratum AA15\n");
2470                                         vcpu_load_rsp_rip(vcpu);
2471                                         vcpu->regs[reg] = vcpu->cr8;
2472                                         vcpu_put_rsp_rip(vcpu);
2473                                         skip_emulated_instruction(vcpu);
2474                                         print_func_exit();
2475                                         return 1;
2476                         }
2477                         break;
2478                 case 3: /* lmsw */
2479                         lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2480
2481                         skip_emulated_instruction(vcpu);
2482                         print_func_exit();
2483                         return 1;
2484                 default:
2485                         break;
2486         }
2487         litevm_run->exit_reason = 0;
2488         printk("litevm: unhandled control register: op %d cr %d\n",
2489                    (int)(exit_qualification >> 4) & 3, cr);
2490         print_func_exit();
2491         return 0;
2492 }
2493
2494 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2495 {
2496         print_func_entry();
2497         uint64_t exit_qualification;
2498         unsigned long val;
2499         int dr, reg;
2500
2501         /*
2502          * FIXME: this code assumes the host is debugging the guest.
2503          *        need to deal with guest debugging itself too.
2504          */
2505         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
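        /* Exit qualification for DR accesses: bits 2:0 give the debug register
         * number, bit 4 the direction (set = mov from DR), and bits 11:8 the
         * general-purpose register involved. */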
2506         dr = exit_qualification & 7;
2507         reg = (exit_qualification >> 8) & 15;
2508         vcpu_load_rsp_rip(vcpu);
2509         if (exit_qualification & 16) {
2510                 /* mov from dr */
2511                 switch (dr) {
2512                         case 6:
2513                                 val = 0xffff0ff0;
2514                                 break;
2515                         case 7:
2516                                 val = 0x400;
2517                                 break;
2518                         default:
2519                                 val = 0;
2520                 }
2521                 vcpu->regs[reg] = val;
2522         } else {
2523                 /* mov to dr */
2524         }
2525         vcpu_put_rsp_rip(vcpu);
2526         skip_emulated_instruction(vcpu);
2527         print_func_exit();
2528         return 1;
2529 }
2530
2531 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2532 {
2533         print_func_entry();
2534         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2535         print_func_exit();
2536         return 0;
2537 }
2538
2539 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2540 {
2541         print_func_entry();
2542         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2543         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2544         uint64_t data;
2545
2546         if (guest_cpl() != 0) {
2547                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2548                 inject_gp(vcpu);
2549                 print_func_exit();
2550                 return 1;
2551         }
2552
2553         switch (ecx) {
2554                 case MSR_FS_BASE:
2555                         data = vmcs_readl(GUEST_FS_BASE);
2556                         break;
2557                 case MSR_GS_BASE:
2558                         data = vmcs_readl(GUEST_GS_BASE);
2559                         break;
2560                 case MSR_IA32_SYSENTER_CS:
2561                         data = vmcs_read32(GUEST_SYSENTER_CS);
2562                         break;
2563                 case MSR_IA32_SYSENTER_EIP:
2564                         data = vmcs_read32(GUEST_SYSENTER_EIP);
2565                         break;
2566                 case MSR_IA32_SYSENTER_ESP:
2567                         data = vmcs_read32(GUEST_SYSENTER_ESP);
2568                         break;
2569                 case MSR_IA32_MC0_CTL:
2570                 case MSR_IA32_MCG_STATUS:
2571                 case MSR_IA32_MCG_CAP:
2572                 case MSR_IA32_MC0_MISC:
2573                 case MSR_IA32_MC0_MISC + 4:
2574                 case MSR_IA32_MC0_MISC + 8:
2575                 case MSR_IA32_MC0_MISC + 12:
2576                 case MSR_IA32_MC0_MISC + 16:
2577                 case MSR_IA32_UCODE_REV:
2578                         /* MTRR registers */
2579                 case 0xfe:
2580                 case 0x200 ... 0x2ff:
2581                         data = 0;
2582                         break;
2583                 case MSR_IA32_APICBASE:
2584                         data = vcpu->apic_base;
2585                         break;
2586                 default:
2587                         if (msr) {
2588                                 data = msr->data;
2589                                 break;
2590                         }
2591                         printk("litevm: unhandled rdmsr: %x\n", ecx);
2592                         inject_gp(vcpu);
2593                         print_func_exit();
2594                         return 1;
2595         }
2596
2597         /* FIXME: handling of bits 32:63 of rax, rdx */
2598         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2599         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2600         skip_emulated_instruction(vcpu);
2601         print_func_exit();
2602         return 1;
2603 }
2604
2605 #ifdef __x86_64__
2606
2607 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2608 {
2609         print_func_entry();
2610         struct vmx_msr_entry *msr;
2611
2612         if (efer & EFER_RESERVED_BITS) {
2613                 printd("set_efer: 0x%llx #GP, reserved bits\n", efer);
2614                 inject_gp(vcpu);
2615                 print_func_exit();
2616                 return;
2617         }
2618
2619         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2620                 printd("set_efer: #GP, change LME while paging\n");
2621                 inject_gp(vcpu);
2622                 print_func_exit();
2623                 return;
2624         }
2625
2626         efer &= ~EFER_LMA;
2627         efer |= vcpu->shadow_efer & EFER_LMA;
2628
2629         vcpu->shadow_efer = efer;
2630
2631         msr = find_msr_entry(vcpu, MSR_EFER);
2632
2633         if (!(efer & EFER_LMA))
2634                 efer &= ~EFER_LME;
2635         msr->data = efer;
2636         skip_emulated_instruction(vcpu);
2637         print_func_exit();
2638 }
2639
2640 #endif
2641
2642 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2643
2644 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2645 {
2646         print_func_entry();
2647         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2648         struct vmx_msr_entry *msr;
2649         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2650                 | ((uint64_t) (vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2651
2652         if (guest_cpl() != 0) {
2653                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2654                 inject_gp(vcpu);
2655                 print_func_exit();
2656                 return 1;
2657         }
2658
2659         switch (ecx) {
2660                 case MSR_FS_BASE:
2661                         vmcs_writel(GUEST_FS_BASE, data);
2662                         break;
2663                 case MSR_GS_BASE:
2664                         vmcs_writel(GUEST_GS_BASE, data);
2665                         break;
2666                 case MSR_IA32_SYSENTER_CS:
2667                         vmcs_write32(GUEST_SYSENTER_CS, data);
2668                         break;
2669                 case MSR_IA32_SYSENTER_EIP:
2670                         vmcs_write32(GUEST_SYSENTER_EIP, data);
2671                         break;
2672                 case MSR_IA32_SYSENTER_ESP:
2673                         vmcs_write32(GUEST_SYSENTER_ESP, data);
2674                         break;
2675                 case MSR_EFER:
2676                         set_efer(vcpu, data);
2677                         print_func_exit();
2678                         return 1;
2679                 case MSR_IA32_MC0_STATUS:
2680                         printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data);
2681                         break;
2682                 case MSR_IA32_TIME_STAMP_COUNTER:{
2683                                 uint64_t tsc;
2684
2685                                 tsc = read_tsc();
2686                                 vmcs_write64(TSC_OFFSET, data - tsc);
2687                                 break;
2688                         }
2689                 case MSR_IA32_UCODE_REV:
2690                 case MSR_IA32_UCODE_WRITE:
2691                 case 0x200 ... 0x2ff:   /* MTRRs */
2692                         break;
2693                 case MSR_IA32_APICBASE:
2694                         vcpu->apic_base = data;
2695                         break;
2696                 default:
2697                         msr = find_msr_entry(vcpu, ecx);
2698                         if (msr) {
2699                                 msr->data = data;
2700                                 break;
2701                         }
2702                         printk("litevm: unhandled wrmsr: %x\n", ecx);
2703                         inject_gp(vcpu);
2704                         print_func_exit();
2705                         return 1;
2706         }
2707         skip_emulated_instruction(vcpu);
2708         print_func_exit();
2709         return 1;
2710 }
2711
2712 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2713                                                                    struct litevm_run *litevm_run)
2714 {
2715         print_func_entry();
2716         /* Turn off interrupt window reporting. */
2717         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2718                                  vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2719                                  & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2720         print_func_exit();
2721         return 1;
2722 }
2723
2724 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2725 {
2726         print_func_entry();
2727         skip_emulated_instruction(vcpu);
2728         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2729                 print_func_exit();
2730                 return 1;
2731         }
2732
2733         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2734         print_func_exit();
2735         return 0;
2736 }
2737
2738 /*
2739  * The exit handlers return 1 if the exit was handled fully and guest execution
2740  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2741  * to be done to userspace and return 0.
2742  */
2743 static int (*litevm_vmx_exit_handlers[]) (struct litevm_vcpu * vcpu,
2744                                                                                   struct litevm_run * litevm_run) = {
2745         [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2746         [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2747         [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2748         [EXIT_REASON_INVLPG] = handle_invlpg,
2749         [EXIT_REASON_CR_ACCESS] = handle_cr,
2750         [EXIT_REASON_DR_ACCESS] = handle_dr,
2751         [EXIT_REASON_CPUID] = handle_cpuid,
2752         [EXIT_REASON_MSR_READ] = handle_rdmsr,
2753         [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2754         [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2755         [EXIT_REASON_HLT] = handle_halt,
};
2756
2757 static const int litevm_vmx_max_exit_handlers =
2758         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2759
2760 /*
2761  * The guest has exited.  See if we can fix it or if we need userspace
2762  * assistance.
2763  */
2764 static int litevm_handle_exit(struct litevm_run *litevm_run,
2765                                                           struct litevm_vcpu *vcpu)
2766 {
2767         print_func_entry();
2768         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2769         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2770
2771         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2772                 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2773                 printk("%s: unexpected, valid vectoring info and "
2774                            "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2775         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2776         if (exit_reason < litevm_vmx_max_exit_handlers
2777                 && litevm_vmx_exit_handlers[exit_reason]) {
2778                 print_func_exit();
2779                 return litevm_vmx_exit_handlers[exit_reason] (vcpu, litevm_run);
2780         } else {
2781                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2782                 litevm_run->hw.hardware_exit_reason = exit_reason;
2783         }
2784         print_func_exit();
2785         return 0;
2786 }
2787
2788 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2789 {
2790         print_func_entry();
2791         uint16_t ent[2];
2792         uint16_t cs;
2793         uint16_t ip;
2794         unsigned long flags;
2795         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2796         uint16_t sp = vmcs_readl(GUEST_RSP);
2797         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2798
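        /* Emulate real-mode interrupt delivery by hand: read the CS:IP pair
         * from the IVT entry for this vector, push FLAGS, CS and IP onto the
         * guest stack, mask IF/TF/AC, and point CS:IP at the handler. */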
2799         if (sp > ss_limit || ((sp - 6) > sp)) {
2800                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2801                                         __FUNCTION__,
2802                                         vmcs_readl(GUEST_RSP),
2803                                         vmcs_readl(GUEST_SS_BASE), vmcs_read32(GUEST_SS_LIMIT));
2804                 print_func_exit();
2805                 return;
2806         }
2807
2808         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2809                 sizeof(ent)) {
2810                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2811                 print_func_exit();
2812                 return;
2813         }
2814
2815         flags = vmcs_readl(GUEST_RFLAGS);
2816         cs = vmcs_readl(GUEST_CS_BASE) >> 4;
2817         ip = vmcs_readl(GUEST_RIP);
2818
2819         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2820                 litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2821                 litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2822                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2823                 print_func_exit();
2824                 return;
2825         }
2826
2827         vmcs_writel(GUEST_RFLAGS, flags &
2828                                 ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2829         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2830         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2831         vmcs_writel(GUEST_RIP, ent[0]);
2832         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2833         print_func_exit();
2834 }
2835
2836 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2837 {
2838         print_func_entry();
2839         int word_index = __ffs(vcpu->irq_summary);
2840         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2841         int irq = word_index * BITS_PER_LONG + bit_index;
2842
2843         /* don't have clear_bit and I'm not sure the akaros
2844          * bitops are really going to work.
2845          */
2846         vcpu->irq_pending[word_index] &= ~(1UL << bit_index);
2847         if (!vcpu->irq_pending[word_index])
2848                 vcpu->irq_summary &= ~(1UL << word_index);
2849
2850         if (vcpu->rmode.active) {
2851                 inject_rmode_irq(vcpu, irq);
2852                 print_func_exit();
2853                 return;
2854         }
2855         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2856                                  irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2857         print_func_exit();
2858 }
2859
2860 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2861 {
2862         print_func_entry();
2863         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2864                 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2865                 /*
2866                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2867                  */
2868                 litevm_do_inject_irq(vcpu);
2869         else
2870                 /*
2871                  * Interrupts blocked.  Wait for unblock.
2872                  */
2873                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2874                                          vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2875                                          | CPU_BASED_VIRTUAL_INTR_PENDING);
2876         print_func_exit();
2877 }
2878
2879 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2880 {
2881         print_func_entry();
2882         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2883
2884 #warning "no debugging guests yet"
2885         assert(0);
2886 /*
2887         set_debugreg(dbg->bp[0], 0);
2888         set_debugreg(dbg->bp[1], 1);
2889         set_debugreg(dbg->bp[2], 2);
2890         set_debugreg(dbg->bp[3], 3);
2891 */
2892         if (dbg->singlestep) {
2893                 unsigned long flags;
2894
2895                 flags = vmcs_readl(GUEST_RFLAGS);
2896                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2897                 vmcs_writel(GUEST_RFLAGS, flags);
2898         }
2899         print_func_exit();
2900 }
2901
2902 static void load_msrs(struct vmx_msr_entry *e, int n)
2903 {
2904         print_func_entry();
2905         int i;
2906
2907         for (i = 0; i < n; ++i)
2908                 write_msr(e[i].index, e[i].data);
2909         print_func_exit();
2910 }
2911
2912 static void save_msrs(struct vmx_msr_entry *e, int n)
2913 {
2914         print_func_entry();
2915         int i;
2916
2917         for (i = 0; i < n; ++i)
2918                 e[i].data = read_msr(e[i].index);
2919         print_func_exit();
2920 }
2921
2922 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2923 {
2924         print_func_entry();
2925         struct litevm_vcpu *vcpu;
2926         uint8_t fail;
2927         uint16_t fs_sel, gs_sel, ldt_sel;
2928         int fs_gs_ldt_reload_needed;
2929
2930         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2931                 error("vcpu is %d but must be in the range %d..%d\n",
2932                           litevm_run->vcpu, 0, LITEVM_MAX_VCPUS - 1);
2933
2934         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2935         if (!vcpu)
2936                 error("vcpu_load failed");
2937
2938         if (litevm_run->emulated) {
2939                 skip_emulated_instruction(vcpu);
2940                 litevm_run->emulated = 0;
2941         }
2942
2943         if (litevm_run->mmio_completed) {
2944                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
2945                 vcpu->mmio_read_completed = 1;
2946         }
2947
2948         vcpu->mmio_needed = 0;
2949
2950 again:
2951         /*
2952          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2953          * allow segment selectors with cpl > 0 or ti == 1.
2954          */
2955         fs_sel = read_fs();
2956         gs_sel = read_gs();
2957         ldt_sel = read_ldt();
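        /* A selector's low three bits are its TI and RPL fields; if any are
         * set, or the LDT is in use, the live values cannot be kept as host
         * state and FS/GS/LDT must be reloaded after the VM exit. */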
2958         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
2959         if (!fs_gs_ldt_reload_needed) {
2960                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2961                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2962         } else {
2963                 vmcs_write16(HOST_FS_SELECTOR, 0);
2964                 vmcs_write16(HOST_GS_SELECTOR, 0);
2965         }
2966
2967 #ifdef __x86_64__
2968         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2969         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2970 #endif
2971
2972         if (vcpu->irq_summary &&
2973                 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2974                 litevm_try_inject_irq(vcpu);
2975
2976         if (vcpu->guest_debug.enabled)
2977                 litevm_guest_debug_pre(vcpu);
2978
2979         fx_save(vcpu->host_fx_image);
2980         fx_restore(vcpu->guest_fx_image);
2981
2982         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
2983         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2984
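        /* The inline asm below saves the host general-purpose registers on the
         * stack, records the stack pointer in the VMCS HOST_RSP field, loads
         * the guest registers from vcpu->regs, and executes VMLAUNCH on the
         * first entry (VMRESUME afterwards).  On VM exit, control resumes at
         * litevm_vmx_return, where the guest registers are written back to
         * vcpu->regs and the host registers are restored; 'fail' records
         * whether the VM entry itself failed (CF/ZF set). */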
2985         asm(
2986                    /* Store host registers */
2987                    "pushf \n\t"
2988 #ifdef __x86_64__
2989                    "push %%rax; push %%rbx; push %%rdx;"
2990                    "push %%rsi; push %%rdi; push %%rbp;"
2991                    "push %%r8;  push %%r9;  push %%r10; push %%r11;"
2992                    "push %%r12; push %%r13; push %%r14; push %%r15;"
2993                    "push %%rcx \n\t" "vmwrite %%rsp, %2 \n\t"
2994 #else
2995                    "pusha; push %%ecx \n\t" "vmwrite %%esp, %2 \n\t"
2996 #endif
2997                    /* Check if vmlaunch or vmresume is needed */
2998                    "cmp $0, %1 \n\t"
2999                    /* Load guest registers.  Don't clobber flags. */
3000 #ifdef __x86_64__
3001                    "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%3), %%rax \n\t" "mov %c[rbx](%3), %%rbx \n\t" "mov %c[rdx](%3), %%rdx \n\t" "mov %c[rsi](%3), %%rsi \n\t" "mov %c[rdi](%3), %%rdi \n\t" "mov %c[rbp](%3), %%rbp \n\t" "mov %c[r8](%3),  %%r8  \n\t" "mov %c[r9](%3),  %%r9  \n\t" "mov %c[r10](%3), %%r10 \n\t" "mov %c[r11](%3), %%r11 \n\t" "mov %c[r12](%3), %%r12 \n\t" "mov %c[r13](%3), %%r13 \n\t" "mov %c[r14](%3), %%r14 \n\t" "mov %c[r15](%3), %%r15 \n\t" "mov %c[rcx](%3), %%rcx \n\t"      /* kills %3 (rcx) */
3002 #else
3003                    "mov %c[cr2](%3), %%eax \n\t" "mov %%eax,   %%cr2 \n\t" "mov %c[rax](%3), %%eax \n\t" "mov %c[rbx](%3), %%ebx \n\t" "mov %c[rdx](%3), %%edx \n\t" "mov %c[rsi](%3), %%esi \n\t" "mov %c[rdi](%3), %%edi \n\t" "mov %c[rbp](%3), %%ebp \n\t" "mov %c[rcx](%3), %%ecx \n\t"    /* kills %3 (ecx) */
3004 #endif
3005                    /* Enter guest mode */
3006                    "jne launched \n\t"
3007                    "vmlaunch \n\t"
3008                    "jmp litevm_vmx_return \n\t"
3009                    "launched: vmresume \n\t"
3010                    ".globl litevm_vmx_return \n\t" "litevm_vmx_return: "
3011                    /* Save guest registers, load host registers, keep flags */
3012 #ifdef __x86_64__
3013                    "xchg %3,     0(%%rsp) \n\t"
3014                    "mov %%rax, %c[rax](%3) \n\t"
3015                    "mov %%rbx, %c[rbx](%3) \n\t"
3016                    "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3017                    "mov %%rdx, %c[rdx](%3) \n\t"
3018                    "mov %%rsi, %c[rsi](%3) \n\t"
3019                    "mov %%rdi, %c[rdi](%3) \n\t"
3020                    "mov %%rbp, %c[rbp](%3) \n\t"
3021                    "mov %%r8,  %c[r8](%3) \n\t"
3022                    "mov %%r9,  %c[r9](%3) \n\t"
3023                    "mov %%r10, %c[r10](%3) \n\t"
3024                    "mov %%r11, %c[r11](%3) \n\t"
3025                    "mov %%r12, %c[r12](%3) \n\t"
3026                    "mov %%r13, %c[r13](%3) \n\t"
3027                    "mov %%r14, %c[r14](%3) \n\t"
3028                    "mov %%r15, %c[r15](%3) \n\t"
3029                    "mov %%cr2, %%rax   \n\t"
3030                    "mov %%rax, %c[cr2](%3) \n\t"
3031                    "mov 0(%%rsp), %3 \n\t"
3032                    "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3033                    "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3034                    "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3035                    "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3036 #else
3037                    "xchg %3, 0(%%esp) \n\t"
3038                    "mov %%eax, %c[rax](%3) \n\t"
3039                    "mov %%ebx, %c[rbx](%3) \n\t"
3040                    "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3041                    "mov %%edx, %c[rdx](%3) \n\t"
3042                    "mov %%esi, %c[rsi](%3) \n\t"
3043                    "mov %%edi, %c[rdi](%3) \n\t"
3044                    "mov %%ebp, %c[rbp](%3) \n\t"
3045                    "mov %%cr2, %%eax  \n\t"
3046                    "mov %%eax, %c[cr2](%3) \n\t"
3047                    "mov 0(%%esp), %3 \n\t" "pop %%ecx; popa \n\t"
3048 #endif
3049 "setbe %0 \n\t" "popf \n\t":"=g"(fail)
3050 :                  "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3051                    "c"(vcpu),
3052                    [rax] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3053                    [rbx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3054                    [rcx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3055                    [rdx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3056                    [rsi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3057                    [rdi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3058                    [rbp] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3059 #ifdef __x86_64__
3060                    [r8] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8])),
3061                    [r9] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9])),
3062                    [r10] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3063                    [r11] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3064                    [r12] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3065                    [r13] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3066                    [r14] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3067                    [r15] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3068 #endif
3069                    [cr2] "i"(offsetof(struct litevm_vcpu, cr2))
3070                    :"cc", "memory");
3071
3072         ++litevm_stat.exits;
3073         printk("vm_run exits\n");
3074         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3075         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3076
3077         fx_save(vcpu->guest_fx_image);
3078         fx_restore(vcpu->host_fx_image);
3079
3080 #ifndef __x86_64__
3081         asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3082 #endif
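	/*
	 * On a 32-bit host, put a known-good data selector back into DS and
	 * ES; the selectors left in place by the VM exit are not assumed to
	 * be usable by the rest of the kernel.
	 */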
3083
3084         litevm_run->exit_type = 0;
3085         if (fail) {
3086                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3087                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3088         } else {
3089                 if (fs_gs_ldt_reload_needed) {
3090                         load_ldt(ldt_sel);
3091                         load_fs(fs_sel);
3092                         /*
3093                          * If we have to reload gs, we must take care to
3094                          * preserve our gs base.
3095                          */
3096                         disable_irq();
3097                         load_gs(gs_sel);
3098 #ifdef __x86_64__
3099                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3100 #endif
3101                         enable_irq();
3102
3103                         reload_tss();
3104                 }
3105                 vcpu->launched = 1;
3106                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
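		/*
		 * A non-zero return from litevm_handle_exit() means the exit
		 * was handled entirely in the kernel and the guest should be
		 * re-entered; a zero return leaves the exit details in
		 * litevm_run for the caller to act on.
		 */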
3107                 if (litevm_handle_exit(litevm_run, vcpu)) {
3108                         /* Give scheduler a chance to reschedule. */
3109                         vcpu_put(vcpu);
3110 #warning "how to tell if signal is pending"
3111 /*
3112                         if (signal_pending(current)) {
3113                                 ++litevm_stat.signal_exits;
3114                                 return -EINTR;
3115                         }
3116 */
3117                         kthread_yield();
3118                         /* Cannot fail -  no vcpu unplug yet. */
3119                         vcpu_load(litevm, vcpu_slot(vcpu));
3120                         goto again;
3121                 }
3122         }
3123
3124         vcpu_put(vcpu);
3125         printk("vm_run returns\n");
3126         print_func_exit();
3127         return 0;
3128 }
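
/*
 * Rough sketch of how a caller might drive the run path above.  The loop is
 * inferred from the litevm_run fields used in this file; the LITEVM_RUN ioctl
 * name and the vm_fd descriptor are assumptions, not part of this file.
 *
 *	struct litevm_run run;
 *
 *	memset(&run, 0, sizeof(run));
 *	for (;;) {
 *		if (ioctl(vm_fd, LITEVM_RUN, &run) < 0)
 *			break;
 *		if (run.exit_type == LITEVM_EXIT_TYPE_FAIL_ENTRY)
 *			break;		// exit_reason holds VM_INSTRUCTION_ERROR
 *		// LITEVM_EXIT_TYPE_VM_EXIT: act on exit_reason.  For an MMIO
 *		// read, put the value in run.mmio.data and set
 *		// run.mmio_completed = 1 before the next LITEVM_RUN.
 *	}
 */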
3129
3130 static int litevm_dev_ioctl_get_regs(struct litevm *litevm,
3131                                                                          struct litevm_regs *regs)
3132 {
3133         print_func_entry();
3134         struct litevm_vcpu *vcpu;
3135
3136         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3137                 print_func_exit();
3138                 return -EINVAL;
3139         }
3140
3141         vcpu = vcpu_load(litevm, regs->vcpu);
3142         if (!vcpu) {
3143                 print_func_exit();
3144                 return -ENOENT;
3145         }
3146
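	/*
	 * General-purpose registers are cached in vcpu->regs[]; RSP, RIP and
	 * RFLAGS live in the VMCS and are read back with vmcs_readl().
	 */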
3147         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3148         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3149         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3150         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3151         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3152         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3153         regs->rsp = vmcs_readl(GUEST_RSP);
3154         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3155 #ifdef __x86_64__
3156         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3157         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3158         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3159         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3160         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3161         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3162         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3163         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3164 #endif
3165
3166         regs->rip = vmcs_readl(GUEST_RIP);
3167         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3168
3169         /*
3170          * Don't leak debug flags in case they were set for guest debugging
3171          */
3172         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3173                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3174
3175         vcpu_put(vcpu);
3176
3177         print_func_exit();
3178         return 0;
3179 }
3180
3181 static int litevm_dev_ioctl_set_regs(struct litevm *litevm,
3182                                                                          struct litevm_regs *regs)
3183 {
3184         print_func_entry();
3185         struct litevm_vcpu *vcpu;
3186
3187         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3188                 print_func_exit();
3189                 return -EINVAL;
3190         }
3191
3192         vcpu = vcpu_load(litevm, regs->vcpu);
3193         if (!vcpu) {
3194                 print_func_exit();
3195                 return -ENOENT;
3196         }
3197
3198         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3199         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3200         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3201         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3202         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3203         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3204         vmcs_writel(GUEST_RSP, regs->rsp);
3205         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3206 #ifdef __x86_64__
3207         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3208         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3209         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3210         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3211         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3212         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3213         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3214         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3215 #endif
3216
3217         vmcs_writel(GUEST_RIP, regs->rip);
3218         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3219
3220         vcpu_put(vcpu);
3221
3222         print_func_exit();
3223         return 0;
3224 }
3225
3226 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm,
3227                                                                           struct litevm_sregs *sregs)
3228 {
3229         print_func_entry();
3230         struct litevm_vcpu *vcpu;
3231
3232         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3233                 print_func_exit();
3234                 return -EINVAL;
3235         }
3236         vcpu = vcpu_load(litevm, sregs->vcpu);
3237         if (!vcpu) {
3238                 print_func_exit();
3239                 return -ENOENT;
3240         }
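/*
 * Each guest segment is described in the VMCS by a base, a limit, a selector
 * and an access-rights word.  get_segment() unpacks the access-rights bits
 * (type, S, DPL, P, AVL, L, D/B, G, unusable) into the flat fields of
 * struct litevm_sregs.
 */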
3241 #define get_segment(var, seg) \
3242         do { \
3243                 uint32_t ar; \
3244                 \
3245                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3246                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3247                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3248                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3249                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3250                 sregs->var.type = ar & 15; \
3251                 sregs->var.s = (ar >> 4) & 1; \
3252                 sregs->var.dpl = (ar >> 5) & 3; \
3253                 sregs->var.present = (ar >> 7) & 1; \
3254                 sregs->var.avl = (ar >> 12) & 1; \
3255                 sregs->var.l = (ar >> 13) & 1; \
3256                 sregs->var.db = (ar >> 14) & 1; \
3257                 sregs->var.g = (ar >> 15) & 1; \
3258                 sregs->var.unusable = (ar >> 16) & 1; \
3259         } while (0)
3260
3261         get_segment(cs, CS);
3262         get_segment(ds, DS);
3263         get_segment(es, ES);
3264         get_segment(fs, FS);
3265         get_segment(gs, GS);
3266         get_segment(ss, SS);
3267
3268         get_segment(tr, TR);
3269         get_segment(ldt, LDTR);
3270 #undef get_segment
3271
3272 #define get_dtable(var, table) \
3273         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3274                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3275
3276         get_dtable(idt, IDTR);
3277         get_dtable(gdt, GDTR);
3278 #undef get_dtable
3279
3280         sregs->cr0 = guest_cr0();
3281         sregs->cr2 = vcpu->cr2;
3282         sregs->cr3 = vcpu->cr3;
3283         sregs->cr4 = guest_cr4();
3284         sregs->cr8 = vcpu->cr8;
3285         sregs->efer = vcpu->shadow_efer;
3286         sregs->apic_base = vcpu->apic_base;
3287
3288         sregs->pending_int = vcpu->irq_summary != 0;
3289
3290         vcpu_put(vcpu);
3291
3292         print_func_exit();
3293         return 0;
3294 }
3295
3296 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm,
3297                                                                           struct litevm_sregs *sregs)
3298 {
3299         print_func_entry();
3300         struct litevm_vcpu *vcpu;
3301         int mmu_reset_needed = 0;
3302
3303         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3304                 print_func_exit();
3305                 return -EINVAL;
3306         }
3307         vcpu = vcpu_load(litevm, sregs->vcpu);
3308         if (!vcpu) {
3309                 print_func_exit();
3310                 return -ENOENT;
3311         }
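/*
 * set_segment() is the inverse of get_segment() above: it repacks the flat
 * litevm_sregs fields into a VMCS access-rights word, writing only the
 * "unusable" bit (1 << 16) when the segment is marked unusable.
 */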
3312 #define set_segment(var, seg) \
3313         do { \
3314                 uint32_t ar; \
3315                 \
3316                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
3317                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
3318                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
3319                 if (sregs->var.unusable) { \
3320                         ar = (1 << 16); \
3321                 } else { \
3322                         ar = (sregs->var.type & 15); \
3323                         ar |= (sregs->var.s & 1) << 4; \
3324                         ar |= (sregs->var.dpl & 3) << 5; \
3325                         ar |= (sregs->var.present & 1) << 7; \
3326                         ar |= (sregs->var.avl & 1) << 12; \
3327                         ar |= (sregs->var.l & 1) << 13; \
3328                         ar |= (sregs->var.db & 1) << 14; \
3329                         ar |= (sregs->var.g & 1) << 15; \
3330                 } \
3331                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
3332         } while (0)
3333
3334         set_segment(cs, CS);
3335         set_segment(ds, DS);
3336         set_segment(es, ES);
3337         set_segment(fs, FS);
3338         set_segment(gs, GS);
3339         set_segment(ss, SS);
3340
3341         set_segment(tr, TR);
3342
3343         set_segment(ldt, LDTR);
3344 #undef set_segment
3345
3346 #define set_dtable(var, table) \
3347         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
3348         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
3349
3350         set_dtable(idt, IDTR);
3351         set_dtable(gdt, GDTR);
3352 #undef set_dtable
3353
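	/*
	 * Track whether any paging-related state (CR0, CR3, CR4, EFER)
	 * actually changed; only in that case is the vcpu's MMU context
	 * rebuilt at the end of this function.
	 */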
3354         vcpu->cr2 = sregs->cr2;
3355         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
3356         vcpu->cr3 = sregs->cr3;
3357
3358         vcpu->cr8 = sregs->cr8;
3359
3360         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3361 #ifdef __x86_64__
3362         __set_efer(vcpu, sregs->efer);
3363 #endif
3364         vcpu->apic_base = sregs->apic_base;
3365
3366         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3367         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3368         update_exception_bitmap(vcpu);
3369         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3370         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3371
3372         mmu_reset_needed |= guest_cr4() != sregs->cr4;
3373         __set_cr4(vcpu, sregs->cr4);
3374
3375         if (mmu_reset_needed)
3376                 litevm_mmu_reset_context(vcpu);
3377         vcpu_put(vcpu);
3378
3379         print_func_exit();
3380         return 0;
3381 }
3382
3383 /*
3384  * Translate a guest virtual address to a guest physical address.
3385  */
3386 static int litevm_dev_ioctl_translate(struct litevm *litevm,
3387                                                                           struct litevm_translation *tr)
3388 {
3389         print_func_entry();
3390         unsigned long vaddr = tr->linear_address;
3391         struct litevm_vcpu *vcpu;
3392         gpa_t gpa;
3393
3394         vcpu = vcpu_load(litevm, tr->vcpu);
3395         if (!vcpu) {
3396                 print_func_exit();
3397                 return -ENOENT;
3398         }
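	/*
	 * Translate through the MMU's gva_to_gpa() hook while holding the VM
	 * lock; UNMAPPED_GVA means no translation currently exists for this
	 * virtual address.
	 */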
3399         spin_lock_irqsave(&litevm->lock);
3400         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3401         tr->physical_address = gpa;
3402         tr->valid = gpa != UNMAPPED_GVA;
3403         tr->writeable = 1;
3404         tr->usermode = 0;
3405         spin_unlock_irqsave(&litevm->lock);
3406         vcpu_put(vcpu);
3407
3408         print_func_exit();
3409         return 0;
3410 }
3411
3412 #if 0
3413 static int litevm_dev_ioctl_interrupt(struct litevm *litevm,
3414