1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 #define currentcpu (&per_cpu_info[core_id()])
39
40 struct litevm_stat litevm_stat;
41
42 static struct litevm_stats_debugfs_item {
43         const char *name;
44         uint32_t *data;
45 } debugfs_entries[] = {
46         { "pf_fixed", &litevm_stat.pf_fixed },
47         { "pf_guest", &litevm_stat.pf_guest },
48         { "tlb_flush", &litevm_stat.tlb_flush },
49         { "invlpg", &litevm_stat.invlpg },
50         { "exits", &litevm_stat.exits },
51         { "io_exits", &litevm_stat.io_exits },
52         { "mmio_exits", &litevm_stat.mmio_exits },
53         { "signal_exits", &litevm_stat.signal_exits },
54         { "irq_exits", &litevm_stat.irq_exits },
55         { 0, 0 }
56 };
57
58 static struct dentry *debugfs_dir;
59
60 static const uint32_t vmx_msr_index[] = {
61 #ifdef __x86_64__
62         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
63 #endif
64         MSR_EFER, // wtf? MSR_K6_STAR,
65 };
66 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
67
68 #ifdef __x86_64__
69 /*
70  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
71  * mechanism (CPU erratum AA24).
72  */
73 #define NR_BAD_MSRS 2
74 #else
75 #define NR_BAD_MSRS 0
76 #endif
77
78 #define TSS_IOPB_BASE_OFFSET 0x66
79 #define TSS_BASE_SIZE 0x68
80 #define TSS_IOPB_SIZE (65536 / 8)
81 #define TSS_REDIRECTION_SIZE (256 / 8)
82 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
83
84 #define MSR_IA32_VMX_BASIC_MSR                  0x480
85 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
86 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
87 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
88 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
89
90 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
91 #define LMSW_GUEST_MASK 0x0eULL
92 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
93 //#define CR4_VMXE 0x2000
94 #define CR8_RESEVED_BITS (~0x0fULL)
95 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
96
97 #ifdef __x86_64__
98 #define HOST_IS_64 1
99 #else
100 #define HOST_IS_64 0
101 #endif
102
103 /* Bit ops are not yet widely used in Akaros, and we're not sure where to put them. */
104 /**
105  * __ffs - find first set bit in word
106  * @word: The word to search
107  *
108  * Undefined if no bit exists, so code should check against 0 first.
109  */
110 static inline unsigned long __ffs(unsigned long word)
111 {
112         asm("rep; bsf %1,%0"
113                 : "=r" (word)
114                 : "rm" (word));
115         return word;
116 }
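/*
 * Example (a sketch, not in the original source; 'mask' and 'bit' are
 * hypothetical names): because the result is undefined for a zero input,
 * callers are expected to guard first, e.g.
 *
 *	if (mask)
 *		bit = __ffs(mask);	// index of the least-significant set bit
 */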
117
118 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu, uint32_t msr)
119 {
120         int i;
121
122         for (i = 0; i < vcpu->nmsrs; ++i)
123                 if (vcpu->guest_msrs[i].index == msr)
124                         return &vcpu->guest_msrs[i];
125         return 0;
126 }
127
128 struct descriptor_table {
129         uint16_t limit;
130         unsigned long base;
131 } __attribute__((packed));
132
133 static void get_gdt(struct descriptor_table *table)
134 {
135         asm ("sgdt %0" : "=m"(*table));
136 }
137
138 static void get_idt(struct descriptor_table *table)
139 {
140         asm ("sidt %0" : "=m"(*table));
141 }
142
143 static uint16_t read_fs(void)
144 {
145         uint16_t seg;
146         asm ("mov %%fs, %0" : "=g"(seg));
147         return seg;
148 }
149
150 static uint16_t read_gs(void)
151 {
152         uint16_t seg;
153         asm ("mov %%gs, %0" : "=g"(seg));
154         return seg;
155 }
156
157 static uint16_t read_ldt(void)
158 {
159         uint16_t ldt;
160         asm ("sldt %0" : "=g"(ldt));
161         return ldt;
162 }
163
164 static void load_fs(uint16_t sel)
165 {
166         asm ("mov %0, %%fs" : : "g"(sel));
167 }
168
169 static void load_gs(uint16_t sel)
170 {
171         asm ("mov %0, %%gs" : : "g"(sel));
172 }
173
174 #ifndef load_ldt
175 static void load_ldt(uint16_t sel)
176 {
177         asm ("lldt %0" : : "g"(sel));
178 }
179 #endif
180
181 static void fx_save(void *image)
182 {
183         asm ("fxsave (%0)":: "r" (image));
184 }
185
186 static void fx_restore(void *image)
187 {
188         asm ("fxrstor (%0)":: "r" (image));
189 }
190
191 static void fpu_init(void)
192 {
193         asm ("finit");
194 }
195
196 struct segment_descriptor {
197         uint16_t limit_low;
198         uint16_t base_low;
199         uint8_t  base_mid;
200         uint8_t  type : 4;
201         uint8_t  system : 1;
202         uint8_t  dpl : 2;
203         uint8_t  present : 1;
204         uint8_t  limit_high : 4;
205         uint8_t  avl : 1;
206         uint8_t  long_mode : 1;
207         uint8_t  default_op : 1;
208         uint8_t  granularity : 1;
209         uint8_t  base_high;
210 } __attribute__((packed));
211
212 #ifdef __x86_64__
213 // LDT or TSS descriptor in the GDT. 16 bytes.
214 struct segment_descriptor_64 {
215         struct segment_descriptor s;
216         uint32_t base_higher;
217         uint32_t pad_zero;
218 };
219
220 #endif
221
222 static unsigned long segment_base(uint16_t selector)
223 {
224         struct descriptor_table gdt;
225         struct segment_descriptor *d;
226         unsigned long table_base;
227         typedef unsigned long ul;
228         unsigned long v;
229
230         asm ("sgdt %0" : "=m"(gdt));
231         table_base = gdt.base;
232
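        /*
         * Bit 2 of the selector is the table indicator (TI): 1 means the
         * descriptor lives in the LDT rather than the GDT.  The low three
         * bits (RPL and TI) are masked off below to form a byte offset into
         * the table.
         */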
233         if (selector & 4) {           /* from ldt */
234                 uint16_t ldt_selector;
235
236                 asm ("sldt %0" : "=g"(ldt_selector));
237                 table_base = segment_base(ldt_selector);
238         }
239         d = (struct segment_descriptor *)(table_base + (selector & ~7));
240         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
241 #ifdef __x86_64__
242         if (d->system == 0
243             && (d->type == 2 || d->type == 9 || d->type == 11))
244                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
245 #endif
246         return v;
247 }
248
249 static unsigned long read_tr_base(void)
250 {
251         uint16_t tr;
252         asm ("str %0" : "=g"(tr));
253         return segment_base(tr);
254 }
255
256 static void reload_tss(void)
257 {
258 #ifndef __x86_64__
259
260         /*
261          * VT restores TR but not its size.  Useless.
262          */
263         struct descriptor_table gdt;
264         struct segment_descriptor *descs;
265
266         get_gdt(&gdt);
267         descs = (void *)gdt.base;
268         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
269         load_TR_desc();
270 #endif
271 }
272
273 static struct vmcs_descriptor {
274         int size;
275         int order;
276         uint32_t revision_id;
277 } vmcs_descriptor;
278
279 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
280 {
281         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
282         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
283 }
284
285
286
287 int litevm_read_guest(struct litevm_vcpu *vcpu,
288                              gva_t addr,
289                              unsigned long size,
290                              void *dest)
291 {
292         unsigned char *host_buf = dest;
293         unsigned long req_size = size;
294
295         while (size) {
296                 hpa_t paddr;
297                 unsigned now;
298                 unsigned offset;
299                 hva_t guest_buf;
300
301                 paddr = gva_to_hpa(vcpu, addr);
302
303                 if (is_error_hpa(paddr))
304                         break;
305                 guest_buf = (hva_t)KADDR(paddr);
306                 offset = addr & ~PAGE_MASK;
307                 guest_buf |= offset;
308                 now = MIN(size, PAGE_SIZE - offset);
309                 memcpy(host_buf, (void*)guest_buf, now);
310                 host_buf += now;
311                 addr += now;
312                 size -= now;
313         }
314         return req_size - size;
315 }
316
317 int litevm_write_guest(struct litevm_vcpu *vcpu,
318                              gva_t addr,
319                              unsigned long size,
320                              void *data)
321 {
322         unsigned char *host_buf = data;
323         unsigned long req_size = size;
324
325         while (size) {
326                 hpa_t paddr;
327                 unsigned now;
328                 unsigned offset;
329                 hva_t guest_buf;
330
331                 paddr = gva_to_hpa(vcpu, addr);
332
333                 if (is_error_hpa(paddr))
334                         break;
335
336                 guest_buf = (hva_t)KADDR(paddr);
337                 offset = addr & ~PAGE_MASK;
338                 guest_buf |= offset;
339                 now = MIN(size, PAGE_SIZE - offset);
340                 memcpy((void*)guest_buf, host_buf, now);
341                 host_buf += now;
342                 addr += now;
343                 size -= now;
344         }
345         return req_size - size;
346 }
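/*
 * Usage sketch (hypothetical caller, not part of the original code): pull a
 * small structure out of guest memory and treat a short read as a fault.
 * 'gva' and 'struct some_guest_struct' are assumptions for illustration.
 *
 *	struct some_guest_struct gs;
 *
 *	if (litevm_read_guest(vcpu, gva, sizeof(gs), &gs) != sizeof(gs))
 *		inject_gp(vcpu);	// part of the range was unmapped
 */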
347
348 static void setup_vmcs_descriptor(void)
349 {
350         uint64_t msr;
351
352         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
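        /*
         * Per the SDM, IA32_VMX_BASIC reports the VMCS revision identifier in
         * bits 30:0 and the VMCS region size in bits 44:32; those are the two
         * fields extracted below (0x1fff masks the 13 size bits).
         */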
353         vmcs_descriptor.size = (msr>>32) & 0x1fff;
354         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size>>PAGE_SHIFT);
355         vmcs_descriptor.revision_id = (uint32_t)msr;
356 };
357
358 static void vmcs_clear(struct vmcs *vmcs)
359 {
360         uint64_t phys_addr = PADDR(vmcs);
361         uint8_t error;
362
363         asm volatile ("vmclear %1; setna %0"
364                        : "=m"(error) : "m"(phys_addr) : "cc", "memory" );
365         if (error)
366                 printk("litevm: vmclear fail: %p/%llx\n",
367                        vmcs, phys_addr);
368 }
369
370 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
371 {
372         struct litevm_vcpu *vcpu = arg;
373         int cpu = core_id();
374         printd("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n", 
375                cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
376
377         if (vcpu->cpu == cpu)
378                 vmcs_clear(vcpu->vmcs);
379
380         if (currentcpu->vmcs == vcpu->vmcs)
381                 currentcpu->vmcs = NULL;
382 }
383
384 static int vcpu_slot(struct litevm_vcpu *vcpu)
385 {
386         return vcpu - vcpu->litevm->vcpus;
387 }
388
389 /*
390  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
391  * vcpu mutex is already taken.
392  */
393 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
394 {
395         uint64_t phys_addr = PADDR(vcpu->vmcs);
396         int cpu;
397         cpu = core_id();
398
399         if (vcpu->cpu != cpu) {
400                 handler_wrapper_t *w;
401                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
402                 smp_call_wait(w);
403                 vcpu->launched = 0;
404         }
405         if (currentcpu->vmcs != vcpu->vmcs) {
406                 uint8_t error;
407
408                 currentcpu->vmcs = vcpu->vmcs;
409                 asm volatile ("vmptrld %1; setna %0"
410                                : "=m"(error) : "m"(phys_addr) : "cc" );
411                 if (error){
412                         printk("litevm: vmptrld %p/%llx fail\n",
413                                vcpu->vmcs, phys_addr);
414                         error("litevm: vmptrld %p/%llx fail\n",
415                                vcpu->vmcs, phys_addr);
416                 }
417         }
418
419         if (vcpu->cpu != cpu) {
420                 struct descriptor_table dt;
421                 unsigned long sysenter_esp;
422
423                 vcpu->cpu = cpu;
424                 /*
425                  * Linux uses per-cpu TSS and GDT, so set these when switching
426                  * processors.
427                  */
428                 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
429                 get_gdt(&dt);
430                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
431
432                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
433                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
434         }
435         return vcpu;
436 }
437
438 /*
439  * Switches to specified vcpu, until a matching vcpu_put()
440  */
441 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
442 {
443         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
444
445         qlock(&vcpu->mutex);
446         if (!vcpu->vmcs) {
447                 qunlock(&vcpu->mutex);
448                 error("vcpu->vmcs is NULL");
449         }
450         return __vcpu_load(vcpu);
451 }
452
453 static void vcpu_put(struct litevm_vcpu *vcpu)
454 {
455         //put_cpu();
456         qunlock(&vcpu->mutex);
457 }
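/*
 * Usage sketch (hypothetical caller): any access to a vcpu's VMCS is
 * bracketed by vcpu_load()/vcpu_put(), which take the vcpu mutex and make its
 * VMCS current on this core.  'slot' and 'new_rip' are illustrative names.
 *
 *	struct litevm_vcpu *vcpu = vcpu_load(litevm, slot);
 *
 *	if (vcpu) {
 *		vmcs_writel(GUEST_RIP, new_rip);	// hypothetical update
 *		vcpu_put(vcpu);
 *	}
 */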
458
459
460 static struct vmcs *alloc_vmcs_cpu(int cpu)
461 {
462         int node = node_id();
463         struct vmcs *vmcs;
464
465         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
466         if (!vmcs)
467                 return 0;
468         memset(vmcs, 0, vmcs_descriptor.size);
469         vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
470         return vmcs;
471 }
472
473 static struct vmcs *alloc_vmcs(void)
474 {
475         return alloc_vmcs_cpu(core_id());
476 }
477
478 static int cpu_has_litevm_support(void)
479 {
480         uint32_t ecx = cpuid_ecx(1);
481         return ecx & (1 << 5); /* CPUID.1:ECX.VMX[bit 5] -> VT */
482 }
483
484 static int vmx_disabled_by_bios(void)
485 {
486         uint64_t msr;
487
488         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
489         return (msr & 5) == 1; /* locked but not enabled */
490 }
491
492 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
493 {
494         int cpu = hw_core_id();
495         uint64_t phys_addr = PADDR(&currentcpu->vmxarea);
496         uint64_t old;
497
498         old = read_msr(MSR_IA32_FEATURE_CONTROL);
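        /*
         * In IA32_FEATURE_CONTROL, bit 0 is the lock bit and bit 2 enables
         * VMXON outside SMX operation; hence the 0x5 masks used here and in
         * vmx_disabled_by_bios() above.
         */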
499         if ((old & 5) == 0)
500                 /* enable and lock */
501                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
502         lcr4(rcr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
503         asm volatile ("vmxon %0" : : "m"(phys_addr) : "memory", "cc");
504 }
505
506 static void litevm_disable(void *garbage)
507 {
508         asm volatile ("vmxoff" : : : "cc");
509 }
510
511 struct litevm *vmx_open(void)
512 {
513         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
514         int i;
515
516         if (!litevm)
517                 return 0;
518
519         spinlock_init_irqsave(&litevm->lock);
520         LIST_INIT(&litevm->link);
521         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
522                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
523
524                 qlock_init(&vcpu->mutex);
525                 vcpu->mmu.root_hpa = INVALID_PAGE;
526                 LIST_INIT(&vcpu->link);
527         }
528         printk("vmx_open: busy %d\n", litevm->busy);
529         printk("return %p\n", litevm);
530         return litevm;
531 }
532
533 /*
534  * Free any memory in @free but not in @dont.
535  */
536 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
537                                   struct litevm_memory_slot *dont)
538 {
539         int i;
540
541         if (!dont || free->phys_mem != dont->phys_mem)
542                 if (free->phys_mem) {
543                         for (i = 0; i < free->npages; ++i){
544                                 page_t *page = free->phys_mem[i];
545                                 page_decref(page);
546                                 assert(page_is_free(page2ppn(page)));
547                         }
548                         kfree(free->phys_mem);
549                 }
550
551         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
552                 kfree(free->dirty_bitmap);
553
554         free->phys_mem = 0;
555         free->npages = 0;
556         free->dirty_bitmap = 0;
557 }
558
559 static void litevm_free_physmem(struct litevm *litevm)
560 {
561         int i;
562
563         for (i = 0; i < litevm->nmemslots; ++i)
564                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
565 }
566
567 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
568 {
569         if (vcpu->vmcs) {
570                 handler_wrapper_t *w;
571                 smp_call_function_all(__vcpu_clear, vcpu, &w);
572                 smp_call_wait(w);
573                 //free_vmcs(vcpu->vmcs);
574                 vcpu->vmcs = 0;
575         }
576 }
577
578 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
579 {
580         litevm_free_vmcs(vcpu);
581         litevm_mmu_destroy(vcpu);
582 }
583
584 static void litevm_free_vcpus(struct litevm *litevm)
585 {
586         unsigned int i;
587
588         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
589                 litevm_free_vcpu(&litevm->vcpus[i]);
590 }
591
592 static int litevm_dev_release(struct litevm *litevm)
593 {
594
595         litevm_free_vcpus(litevm);
596         litevm_free_physmem(litevm);
597         kfree(litevm);
598         return 0;
599 }
600
601 unsigned long vmcs_readl(unsigned long field)
602 {
603         unsigned long value;
604
605         asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
606         return value;
607 }
608
609 void vmcs_writel(unsigned long field, unsigned long value)
610 {
611         uint8_t error;
612
613         asm volatile ("vmwrite %1, %2; setna %0"
614                        : "=g"(error) : "r"(value), "r"(field) : "cc" );
615         if (error)
616                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
617                        field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
618 }
619
620 static void vmcs_write16(unsigned long field, uint16_t value)
621 {
622         vmcs_writel(field, value);
623 }
624
625 static void vmcs_write64(unsigned long field, uint64_t value)
626 {
627 #ifdef __x86_64__
628         vmcs_writel(field, value);
629 #else
630         vmcs_writel(field, value);
631         asm volatile ("");
632         vmcs_writel(field+1, value >> 32);
633 #endif
634 }
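/*
 * Note: a 64-bit VMCS field has a separate "high" access encoding at
 * field + 1, which is why the 32-bit path above writes the upper half with a
 * second vmcs_writel().
 */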
635
636 static void inject_gp(struct litevm_vcpu *vcpu)
637 {
638         printd("inject_general_protection: rip 0x%lx\n",
639                vmcs_readl(GUEST_RIP));
640         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
641         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
642                      GP_VECTOR |
643                      INTR_TYPE_EXCEPTION |
644                      INTR_INFO_DELIEVER_CODE_MASK |
645                      INTR_INFO_VALID_MASK);
646 }
647
648 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
649 {
650         if (vcpu->rmode.active)
651                 vmcs_write32(EXCEPTION_BITMAP, ~0);
652         else
653                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
654 }
655
656 static void enter_pmode(struct litevm_vcpu *vcpu)
657 {
658         unsigned long flags;
659
660         vcpu->rmode.active = 0;
661
662         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
663         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
664         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
665
666         flags = vmcs_readl(GUEST_RFLAGS);
667         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
668         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
669         vmcs_writel(GUEST_RFLAGS, flags);
670
671         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
672                         (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK) );
673
674         update_exception_bitmap(vcpu);
675
676         #define FIX_PMODE_DATASEG(seg, save) {                          \
677                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
678                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
679                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
680                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
681         }
682
683         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
684         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
685         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
686         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
687         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
688
689         vmcs_write16(GUEST_CS_SELECTOR,
690                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
691         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
692 }
693
694 static int rmode_tss_base(struct litevm* litevm)
695 {
696         gfn_t base_gfn = litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
697         return base_gfn << PAGE_SHIFT;
698 }
699
700 static void enter_rmode(struct litevm_vcpu *vcpu)
701 {
702         unsigned long flags;
703
704         vcpu->rmode.active = 1;
705
706         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
707         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
708
709         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
710         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
711
712         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
713         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
714
715         flags = vmcs_readl(GUEST_RFLAGS);
716         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
717
718         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
719
720         vmcs_writel(GUEST_RFLAGS, flags);
721         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
722         update_exception_bitmap(vcpu);
723
724         #define FIX_RMODE_SEG(seg, save) {                                 \
725                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
726                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
727                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
728                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
729         }
730
731         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
732         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
733
734         FIX_RMODE_SEG(ES, vcpu->rmode.es);
735         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
736         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
737         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
738         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
739 }
740
741 static int init_rmode_tss(struct litevm* litevm)
742 {
743         struct page *p1, *p2, *p3;
744         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
745         char *page;
746
747         p1 = _gfn_to_page(litevm, fn++);
748         p2 = _gfn_to_page(litevm, fn++);
749         p3 = _gfn_to_page(litevm, fn);
750
751         if (!p1 || !p2 || !p3) {
752                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
753                 return 0;
754         }
755
756         page = page2kva(p1);
757         memset(page, 0, PAGE_SIZE);
758         *(uint16_t*)(page + TSS_IOPB_BASE_OFFSET) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
759
760         page = page2kva(p2);
761         memset(page, 0, PAGE_SIZE);
762
763         page = page2kva(p3);
764         memset(page, 0, PAGE_SIZE);
765         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
766
767         return 1;
768 }
769
770 #ifdef __x86_64__
771
772 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
773 {
774         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
775
776         vcpu->shadow_efer = efer;
777         if (efer & EFER_LMA) {
778                 vmcs_write32(VM_ENTRY_CONTROLS,
779                                      vmcs_read32(VM_ENTRY_CONTROLS) |
780                                      VM_ENTRY_CONTROLS_IA32E_MASK);
781                 msr->data = efer;
782
783         } else {
784                 vmcs_write32(VM_ENTRY_CONTROLS,
785                                      vmcs_read32(VM_ENTRY_CONTROLS) &
786                                      ~VM_ENTRY_CONTROLS_IA32E_MASK);
787
788                 msr->data = efer & ~EFER_LME;
789         }
790 }
791
792 static void enter_lmode(struct litevm_vcpu *vcpu)
793 {
794         uint32_t guest_tr_ar;
795
796         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
797         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
798                 printd("%s: tss fixup for long mode. \n",
799                        __FUNCTION__);
800                 vmcs_write32(GUEST_TR_AR_BYTES,
801                              (guest_tr_ar & ~AR_TYPE_MASK)
802                              | AR_TYPE_BUSY_64_TSS);
803         }
804
805         vcpu->shadow_efer |= EFER_LMA;
806
807         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
808         vmcs_write32(VM_ENTRY_CONTROLS,
809                      vmcs_read32(VM_ENTRY_CONTROLS)
810                      | VM_ENTRY_CONTROLS_IA32E_MASK);
811 }
812
813 static void exit_lmode(struct litevm_vcpu *vcpu)
814 {
815         vcpu->shadow_efer &= ~EFER_LMA;
816
817         vmcs_write32(VM_ENTRY_CONTROLS,
818                      vmcs_read32(VM_ENTRY_CONTROLS)
819                      & ~VM_ENTRY_CONTROLS_IA32E_MASK);
820 }
821
822 #endif
823
824 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
825 {
826         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
827                 enter_pmode(vcpu);
828
829         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
830                 enter_rmode(vcpu);
831
832 #ifdef __x86_64__
833         if (vcpu->shadow_efer & EFER_LME) {
834                 if (!is_paging() && (cr0 & CR0_PG_MASK))
835                         enter_lmode(vcpu);
836                 if (is_paging() && !(cr0 & CR0_PG_MASK))
837                         exit_lmode(vcpu);
838         }
839 #endif
840
841         vmcs_writel(CR0_READ_SHADOW, cr0);
842         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
843 }
844
845 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
846                                          unsigned long cr3)
847 {
848         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
849         unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
850         int i;
851         uint64_t pdpte;
852         uint64_t *pdpt;
853         struct litevm_memory_slot *memslot;
854
855         spin_lock_irqsave(&vcpu->litevm->lock);
856         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
857         /* FIXME: !memslot - emulate? 0xff? */
858         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
859
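        /*
         * A PDPTE is rejected if it is present (bit 0) and any bit of the
         * mask below is set: bits 1-2 and 5-8 are reserved in PAE paging, and
         * bits 36-63 are beyond the physical address width assumed here.
         */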
860         for (i = 0; i < 4; ++i) {
861                 pdpte = pdpt[offset + i];
862                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
863                         break;
864         }
865
866         spin_unlock_irqsave(&vcpu->litevm->lock);
867
868         return i != 4;
869 }
870
871 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
872 {
873         if (cr0 & CR0_RESEVED_BITS) {
874                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
875                        cr0, guest_cr0());
876                 inject_gp(vcpu);
877                 return;
878         }
879
880         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
881                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
882                 inject_gp(vcpu);
883                 return;
884         }
885
886         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
887                 printd("set_cr0: #GP, set PG flag "
888                        "and a clear PE flag\n");
889                 inject_gp(vcpu);
890                 return;
891         }
892
893         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
894 #ifdef __x86_64__
895                 if ((vcpu->shadow_efer & EFER_LME)) {
896                         uint32_t guest_cs_ar;
897                         if (!is_pae()) {
898                                 printd("set_cr0: #GP, start paging "
899                                        "in long mode while PAE is disabled\n");
900                                 inject_gp(vcpu);
901                                 return;
902                         }
903                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
904                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
905                                 printd("set_cr0: #GP, start paging "
906                                        "in long mode while CS.L == 1\n");
907                                 inject_gp(vcpu);
908                                 return;
909
910                         }
911                 } else
912 #endif
913                 if (is_pae() &&
914                             pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
915                         printd("set_cr0: #GP, pdptrs "
916                                "reserved bits\n");
917                         inject_gp(vcpu);
918                         return;
919                 }
920
921         }
922
923         __set_cr0(vcpu, cr0);
924         litevm_mmu_reset_context(vcpu);
925         return;
926 }
927
928 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
929 {
930         unsigned long cr0 = guest_cr0();
931
932         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
933                 enter_pmode(vcpu);
934                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
935
936         } else
937                 printd("lmsw: unexpected\n");
938
939         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
940                                 | (msw & LMSW_GUEST_MASK));
941 }
942
943 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
944 {
945         vmcs_writel(CR4_READ_SHADOW, cr4);
946         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
947                     LITEVM_RMODE_VM_CR4_ALWAYS_ON : LITEVM_PMODE_VM_CR4_ALWAYS_ON));
948 }
949
950 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
951 {
952         if (cr4 & CR4_RESEVED_BITS) {
953                 printd("set_cr4: #GP, reserved bits\n");
954                 inject_gp(vcpu);
955                 return;
956         }
957
958         if (is_long_mode()) {
959                 if (!(cr4 & CR4_PAE_MASK)) {
960                         printd("set_cr4: #GP, clearing PAE while "
961                                "in long mode\n");
962                         inject_gp(vcpu);
963                         return;
964                 }
965         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
966                    && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
967                 printd("set_cr4: #GP, pdptrs reserved bits\n");
968                 inject_gp(vcpu);
                return;
969         }
970
971         if (cr4 & CR4_VMXE_MASK) {
972                 printd("set_cr4: #GP, setting VMXE\n");
973                 inject_gp(vcpu);
974                 return;
975         }
976         __set_cr4(vcpu, cr4);
977         spin_lock_irqsave(&vcpu->litevm->lock);
978         litevm_mmu_reset_context(vcpu);
979         spin_unlock_irqsave(&vcpu->litevm->lock);
980 }
981
982 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
983 {
984         if (is_long_mode()) {
985                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
986                         printd("set_cr3: #GP, reserved bits\n");
987                         inject_gp(vcpu);
988                         return;
989                 }
990         } else {
991                 if (cr3 & CR3_RESEVED_BITS) {
992                         printd("set_cr3: #GP, reserved bits\n");
993                         inject_gp(vcpu);
994                         return;
995                 }
996                 if (is_paging() && is_pae() &&
997                     pdptrs_have_reserved_bits_set(vcpu, cr3)) {
998                         printd("set_cr3: #GP, pdptrs "
999                                "reserved bits\n");
1000                         inject_gp(vcpu);
1001                         return;
1002                 }
1003         }
1004
1005         vcpu->cr3 = cr3;
1006         spin_lock_irqsave(&vcpu->litevm->lock);
1007         vcpu->mmu.new_cr3(vcpu);
1008         spin_unlock_irqsave(&vcpu->litevm->lock);
1009 }
1010
1011 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1012 {
1013         if ( cr8 & CR8_RESEVED_BITS) {
1014                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1015                 inject_gp(vcpu);
1016                 return;
1017         }
1018         vcpu->cr8 = cr8;
1019 }
1020
1021 static uint32_t get_rdx_init_val(void)
1022 {
1023         uint32_t val;
1024
1025         asm ("movl $1, %%eax \n\t"
1026              "movl %%eax, %0 \n\t" : "=g"(val) );
1027         return val;
1028
1029 }
1030
1031 static void fx_init(struct litevm_vcpu *vcpu)
1032 {
1033         struct __attribute__ ((__packed__)) fx_image_s {
1034                 uint16_t control; //fcw
1035                 uint16_t status; //fsw
1036                 uint16_t tag; // ftw
1037                 uint16_t opcode; //fop
1038                 uint64_t ip; // fpu ip
1039                 uint64_t operand;// fpu dp
1040                 uint32_t mxcsr;
1041                 uint32_t mxcsr_mask;
1042
1043         } *fx_image;
1044
1045         fx_save(vcpu->host_fx_image);
1046         fpu_init();
1047         fx_save(vcpu->guest_fx_image);
1048         fx_restore(vcpu->host_fx_image);
1049
1050         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1051         fx_image->mxcsr = 0x1f80;
1052         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1053                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1054 }
1055
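/*
 * The VMX capability MSRs encode, for each 32-bit control field, which bits
 * are allowed to be 0 (low word: a 1 there means the control must be set) and
 * which are allowed to be 1 (high word: a 0 there means the control must stay
 * clear).  vmcs_write32_fixedbits() ORs in the mandatory bits and masks off
 * the forbidden ones before writing the field.
 */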
1056 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field, uint32_t val)
1057 {
1058         uint32_t msr_high, msr_low;
1059         uint64_t msrval;
1060
1061         msrval = read_msr(msr);
1062         msr_low = msrval;
1063         msr_high = (msrval>>32);
1064
1065         val &= msr_high;
1066         val |= msr_low;
1067         vmcs_write32(vmcs_field, val);
1068 }
1069
1070 /*
1071  * Sets up the vmcs for emulated real mode.
1072  */
1073 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1074 {
1075 /* no op on x86_64 */
1076 #define asmlinkage
1077         extern asmlinkage void litevm_vmx_return(void);
1078         uint32_t host_sysenter_cs;
1079         uint32_t junk;
1080         uint64_t a;
1081         struct descriptor_table dt;
1082         int i;
1083         int ret;
1084         uint64_t tsc;
1085         int nr_good_msrs;
1086
1087
1088         if (!init_rmode_tss(vcpu->litevm)) {
1089                 error("vcpu_setup: init_rmode_tss failed");
1090         }
1091
1092         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1093         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1094         vcpu->cr8 = 0;
1095         vcpu->apic_base = 0xfee00000 |
1096                         /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1097                         MSR_IA32_APICBASE_ENABLE;
1098
1099         fx_init(vcpu);
1100
1101 #define SEG_SETUP(seg) do {                                     \
1102                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1103                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1104                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1105                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1106         } while (0)
1107
1108         /*
1109          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1110          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1111          */
1112         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1113         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1114         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1115         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1116
1117         SEG_SETUP(DS);
1118         SEG_SETUP(ES);
1119         SEG_SETUP(FS);
1120         SEG_SETUP(GS);
1121         SEG_SETUP(SS);
1122
1123         vmcs_write16(GUEST_TR_SELECTOR, 0);
1124         vmcs_writel(GUEST_TR_BASE, 0);
1125         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1126         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1127
1128         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1129         vmcs_writel(GUEST_LDTR_BASE, 0);
1130         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1131         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1132
1133         vmcs_write32(GUEST_SYSENTER_CS, 0);
1134         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1135         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1136
1137         vmcs_writel(GUEST_RFLAGS, 0x02);
1138         vmcs_writel(GUEST_RIP, 0xfff0);
1139         vmcs_writel(GUEST_RSP, 0);
1140
1141         vmcs_writel(GUEST_CR3, 0);
1142
1143         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1144         vmcs_writel(GUEST_DR7, 0x400);
1145
1146         vmcs_writel(GUEST_GDTR_BASE, 0);
1147         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1148
1149         vmcs_writel(GUEST_IDTR_BASE, 0);
1150         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1151
1152         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1153         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1154         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1155
1156         /* I/O */
1157         vmcs_write64(IO_BITMAP_A, 0);
1158         vmcs_write64(IO_BITMAP_B, 0);
1159
1160         tsc = read_tsc();
1161         vmcs_write64(TSC_OFFSET, -tsc);
1162
1163         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1164
1165         /* Special registers */
1166         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1167
1168         /* Control */
1169         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
1170                                PIN_BASED_VM_EXEC_CONTROL,
1171                                PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
1172                                | PIN_BASED_NMI_EXITING   /* 20.6.1 */
1173                         );
1174         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
1175                                CPU_BASED_VM_EXEC_CONTROL,
1176                                CPU_BASED_HLT_EXITING         /* 20.6.2 */
1177                                | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
1178                                | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
1179                                | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
1180                                | CPU_BASED_INVDPG_EXITING
1181                                | CPU_BASED_MOV_DR_EXITING
1182                                | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
1183                         );
1184
1185         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1186         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1187         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1188         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1189
1190         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1191         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1192         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1193
1194 #warning "not setting selectors; do we need them?"
1195 #if 0
1196         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
1197         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1198         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1199 #endif
1200         vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
1201         vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
1202 #if 0
1203         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1204 #endif
1205 #ifdef __x86_64__
1206         a = read_msr(MSR_FS_BASE);
1207         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1208         a = read_msr(MSR_GS_BASE);
1209         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1210 #else
1211         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1212         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1213 #endif
1214
1215 #warning "Not setting HOST_TR_SELECTOR"
1216 #if 0
1217         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
1218 #endif
1219
1220         get_idt(&dt);
1221         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1222
1223
1224         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */
1225
1226         /* it's the HIGH 32 bits! */
1227         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1228         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1229         a = read_msr(MSR_IA32_SYSENTER_ESP);
1230         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
1231         a = read_msr(MSR_IA32_SYSENTER_EIP);
1232         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
1233
1234         ret = -ENOMEM;
1235         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1236         if (!vcpu->guest_msrs)
1237                 error("guest_msrs kmalloc failed");
1238         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1239         if (!vcpu->host_msrs)
1240                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1241
1242         for (i = 0; i < NR_VMX_MSR; ++i) {
1243                 uint32_t index = vmx_msr_index[i];
1244                 uint32_t data_low, data_high;
1245                 uint64_t data;
1246                 int j = vcpu->nmsrs;
1247
1248 #warning "need readmsr_safe"
1249 //              if (rdmsr_safe(index, &data_low, &data_high) < 0)
1250 //                      continue;
1251                 data = read_msr(index);
1252                 vcpu->host_msrs[j].index = index;
1253                 vcpu->host_msrs[j].reserved = 0;
1254                 vcpu->host_msrs[j].data = data;
1255                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1256                 ++vcpu->nmsrs;
1257         }
1258         printk("msrs: %d\n", vcpu->nmsrs);
1259
1260         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1261         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
1262                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1263         vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1264                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1265         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1266                     PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1267         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
1268                                (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */
1269         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1270         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
1271         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1272
1273
1274         /* 22.2.1, 20.8.1 */
1275         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
1276                                VM_ENTRY_CONTROLS, 0);
1277         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1278
1279         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1280         vmcs_writel(TPR_THRESHOLD, 0);
1281
1282         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1283         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1284
1285         __set_cr0(vcpu, 0x60000010); // enter rmode
1286         __set_cr4(vcpu, 0);
1287 #ifdef __x86_64__
1288         __set_efer(vcpu, 0);
1289 #endif
1290
1291         ret = litevm_mmu_init(vcpu);
1292
1293         return ret;
1294
1295 out_free_guest_msrs:
1296         kfree(vcpu->guest_msrs);
1297 out:
1298         return ret;
1299 }
1300
1301 /*
1302  * Sync the rsp and rip registers into the vcpu structure.  This allows
1303  * registers to be accessed by indexing vcpu->regs.
1304  */
1305 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1306 {
1307         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1308         vcpu->rip = vmcs_readl(GUEST_RIP);
1309 }
1310
1311 /*
1312  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1313  * modification.
1314  */
1315 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1316 {
1317         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1318         vmcs_writel(GUEST_RIP, vcpu->rip);
1319 }
1320
1321 /*
1322  * Creates some virtual cpus.  Good luck creating more than one.
1323  */
1324 int vmx_create_vcpu(struct litevm *litevm, int n)
1325 {
1326         ERRSTACK(1);
1327         int r;
1328         struct litevm_vcpu *vcpu;
1329         struct vmcs *vmcs;
1330         char *errstring = NULL;
1331
1332         if (n < 0 || n >= LITEVM_MAX_VCPUS)
1333                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n, LITEVM_MAX_VCPUS);
1334
1335         vcpu = &litevm->vcpus[n];
1336
1337         qlock(&vcpu->mutex);
1338
1339         if (vcpu->vmcs) {
1340                 qunlock(&vcpu->mutex);
1341                 error("VM already exists");
1342         }
1343
1344         /* I'm a bad person */
1345         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1346         uint64_t a = (uint64_t) vcpu->fx_buf;
1347         a += FX_IMAGE_ALIGN-1;
1348         a /= FX_IMAGE_ALIGN;
1349         a *= FX_IMAGE_ALIGN;
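        /* a is now vcpu->fx_buf rounded up to an FX_IMAGE_ALIGN boundary; the
         * host and guest FXSAVE images are laid out back to back from there. */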
1350
1351         vcpu->host_fx_image = (char*)a;
1352         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1353
1354         vcpu->cpu = -1;  /* First load will set up TR */
1355         vcpu->litevm = litevm;
1356         vmcs = alloc_vmcs();
1357         if (!vmcs) {
1358                 errstring = "vmcs allocate failed";
1359                 qunlock(&vcpu->mutex);
1360                 goto out_free_vcpus;
1361         }
1362         vmcs_clear(vmcs);
1363         vcpu->vmcs = vmcs;
1364         vcpu->launched = 0;
1365
1366         __vcpu_load(vcpu);
1367
1368         if (waserror()){
1369                 /* we really need to fix waserror() */
1370                 poperror();
1371                 goto out_free_vcpus;
1372         }
1373
1374         r = litevm_vcpu_setup(vcpu);
1375
1376         vcpu_put(vcpu);
1377
1378         if (! r)
1379                 return 0;
1380
1381         errstring = "vcpu setup failed";
1382
1383 out_free_vcpus:
1384         printk("out_free_vcpus: life sucks\n");
1385         litevm_free_vcpu(vcpu);
1386         error(errstring);
1387 out:
1388         return r;
1389 }
1390
1391 /*
1392  * Allocate some memory and give it an address in the guest physical address
1393  * space.
1394  *
1395  * Discontiguous memory is allowed, mostly for framebuffers.
1396  */
1397 int vm_set_memory_region(struct litevm *litevm,
1398                                            struct litevm_memory_region *mem)
1399 {
1400         ERRSTACK(2);
1401         int r;
1402         gfn_t base_gfn;
1403         unsigned long npages;
1404         unsigned long i;
1405         struct litevm_memory_slot *memslot;
1406         struct litevm_memory_slot old, new;
1407         int memory_config_version;
1408         void *init_data = mem->init_data;
1409         int pass = 1;
1410
1411         printk("litevm %p\n", litevm);
1412         /* should not happen but ... */
1413         if (! litevm)
1414                 error("NULL litevm in %s", __func__);
1415
1416         if (!mem)
1417                 error("NULL mem in %s", __func__);
1418
1419         if (litevm->busy)
1420                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1421         r = -EINVAL;
1422         /* General sanity checks */
1423         if (mem->memory_size & (PAGE_SIZE - 1))
1424                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1425         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1426                 error("guest_phys_addr 0x%llx is not page-aligned", mem->guest_phys_addr);
1427         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1428                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1429         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1430                 error("0x%llx + 0x%llx wraps around (< 0x%llx)",
1431                       mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1432
1433         memslot = &litevm->memslots[mem->slot];
1434         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1435         npages = mem->memory_size >> PAGE_SHIFT;
1436
1437         if (!npages)
1438                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1439
1440         /* This is actually a very tricky retry loop.  The use of
1441          * error() is a bit dangerous, so we don't use it much.
1442          * Consider a rewrite.  It would be nice if Akaros could do the
1443          * allocation of a bunch of pages for us.
1444          */
1445 raced:
1446         printk("raced: pass %d\n", pass);
1447         spin_lock_irqsave(&litevm->lock);
1448         printk("locked\n");
1449
1450         if (waserror()){
1451                 spin_unlock_irqsave(&litevm->lock);
1452                 nexterror();
1453         }
1454                 
1455         memory_config_version = litevm->memory_config_version;
1456         new = old = *memslot;
1457
1458         new.base_gfn = base_gfn;
1459         new.npages = npages;
1460         new.flags = mem->flags;
1461
1462         /* Disallow changing a memory slot's size. */
1463         r = -EINVAL;
1464         if (npages && old.npages && npages != old.npages)
1465                 error("npages is %lu, old.npages is %lu, can't change",
1466                       npages, old.npages);
1467
1468         /* Check for overlaps */
1469         r = -EEXIST;
1470         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1471                 struct litevm_memory_slot *s = &litevm->memslots[i];
1472
1473                 if (s == memslot)
1474                         continue;
1475                 if (!((base_gfn + npages <= s->base_gfn) ||
1476                       (base_gfn >= s->base_gfn + s->npages)))
1477                         error("Overlap");
1478         }
1479         /*
1480          * Do memory allocations outside lock.  memory_config_version will
1481          * detect any races.
1482          */
1483         spin_unlock_irqsave(&litevm->lock);
1484         printk("unlocked\n");
1485         poperror();
1486
1487         /* Deallocate if slot is being removed */
1488         if (!npages)
1489                 new.phys_mem = 0;
1490
1491         /* Free page dirty bitmap if unneeded */
1492         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1493                 new.dirty_bitmap = 0;
1494
1495         r = -ENOMEM;
1496
1497         /* Allocate if a slot is being created */
1498         if (npages && !new.phys_mem) {
1499                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1500
1501                 if (!new.phys_mem)
1502                         goto out_free;
1503
1504                 for (i = 0; i < npages; ++i) {
1505                         int ret;
1506                         ret = kpage_alloc(&new.phys_mem[i]);
1507                         if (ret != ESUCCESS)
1508                                 goto out_free;
1509                         if (init_data){
1510                                 printk("init data memcpy(%p,%p,4096);\n",
1511                                        page2kva(new.phys_mem[i]), init_data);
1512                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1513                                 init_data += PAGE_SIZE;
1514                         }
1515                 }
1516         }
1517
1518         /* Allocate page dirty bitmap if needed */
1519         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1520                 unsigned dirty_bytes;//ALIGN(npages, BITS_PER_LONG) / 8;
1521                 dirty_bytes = (((npages + BITS_PER_LONG-1)/BITS_PER_LONG)*BITS_PER_LONG)/8;
1522
1523                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1524                 if (!new.dirty_bitmap){
1525                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1526                         goto out_free;
1527                 }
1528         }
1529
1530         spin_lock_irqsave(&litevm->lock);
1531         printk("locked\n");
1532         if (memory_config_version != litevm->memory_config_version) {
1533                 spin_unlock_irqsave(&litevm->lock);
1534                 printk("unlocked, try again\n");
1535                 litevm_free_physmem_slot(&new, &old);
1536                 goto raced;
1537         }
1538
1539         r = -EAGAIN;
1540         if (litevm->busy){
1541                 printk("BUSY!\n");
1542                 goto out_unlock;
1543         }
1544
1545         if (mem->slot >= litevm->nmemslots)
1546                 litevm->nmemslots = mem->slot + 1;
1547
1548         *memslot = new;
1549         ++litevm->memory_config_version;
1550
1551         spin_unlock_irqsave(&litevm->lock);
1552         printk("unlocked\n");
1553         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1554                 struct litevm_vcpu *vcpu;
1555
1556                 vcpu = vcpu_load(litevm, i);
1557                 if (!vcpu)
1558                         continue;
1559                 litevm_mmu_reset_context(vcpu);
1560                 vcpu_put(vcpu);
1561         }
1562
1563         litevm_free_physmem_slot(&old, &new);
1564         return 0;
1565
1566 out_unlock:
1567         spin_unlock_irqsave(&litevm->lock);
1568         printk("out_unlock\n");
1569 out_free:
1570         printk("out_free\n");
1571         litevm_free_physmem_slot(&new, &old);
1572 out:
1573         printk("vm_set_memory_region: return %d\n", r);
1574         return r;
1575 }
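/*
 * Usage sketch (hypothetical caller): install 16 MB of guest RAM at guest
 * physical address 0 in slot 0.  Failures surface through the return value or
 * through error().
 *
 *	struct litevm_memory_region mem = {
 *		.slot = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size = 16 * 1024 * 1024,
 *	};
 *
 *	vm_set_memory_region(litevm, &mem);
 */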
1576
1577 #if 0
1578 /*
1579  * Get (and clear) the dirty memory log for a memory slot.
1580  */
1581 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1582                                        struct litevm_dirty_log *log)
1583 {
1584         struct litevm_memory_slot *memslot;
1585         int r, i;
1586         int n;
1587         unsigned long any = 0;
1588
1589         spin_lock_irqsave(&litevm->lock);
1590
1591         /*
1592          * Prevent changes to guest memory configuration even while the lock
1593          * is not taken.
1594          */
1595         ++litevm->busy;
1596         spin_unlock(&litevm->lock);
1597         r = -EINVAL;
1598         if (log->slot >= LITEVM_MEMORY_SLOTS)
1599                 goto out;
1600
1601         memslot = &litevm->memslots[log->slot];
1602         r = -ENOENT;
1603         if (!memslot->dirty_bitmap)
1604                 goto out;
1605
1606         n = ALIGN(memslot->npages, 8) / 8;
1607
1608         for (i = 0; !any && i < n; ++i)
1609                 any = memslot->dirty_bitmap[i];
1610
1611         r = -EFAULT;
1612         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1613                 goto out;
1614
1615
1616         if (any) {
1617                 spin_lock_irqsave(&litevm->lock);
1618                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1619                 spin_unlock(&litevm->lock);
1620                 memset(memslot->dirty_bitmap, 0, n);
1621                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1622                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1623
1624                         if (!vcpu)
1625                                 continue;
1626                         flush_guest_tlb(vcpu);
1627                         vcpu_put(vcpu);
1628                 }
1629         }
1630
1631         r = 0;
1632
1633 out:
1634         spin_lock_irqsave(&litevm->lock);
1635         --litevm->busy;
1636         spin_unlock(&litevm->lock);
1637         return r;
1638 }
1639 #endif
1640
1641 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1642 {
1643         int i;
1644
1645         for (i = 0; i < litevm->nmemslots; ++i) {
1646                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1647
1648                 if (gfn >= memslot->base_gfn
1649                     && gfn < memslot->base_gfn + memslot->npages)
1650                         return memslot;
1651         }
1652         return 0;
1653 }
1654
1655 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1656 {
1657         int i;
1658         struct litevm_memory_slot *memslot = 0;
1659         unsigned long rel_gfn;
1660
1661         for (i = 0; i < litevm->nmemslots; ++i) {
1662                 memslot = &litevm->memslots[i];
1663
1664                 if (gfn >= memslot->base_gfn
1665                     && gfn < memslot->base_gfn + memslot->npages) {
1666
1667                         if (!memslot->dirty_bitmap)
1668                                 return;
1669
1670                         rel_gfn = gfn - memslot->base_gfn;
1671
1672                         /* avoid RMW */
1673                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1674                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1675                         return;
1676                 }
1677         }
1678 }
1679
1680 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1681 {
1682         unsigned long rip;
1683         uint32_t interruptibility;
1684
1685         rip = vmcs_readl(GUEST_RIP);
1686         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1687         vmcs_writel(GUEST_RIP, rip);
1688
1689         /*
1690          * We emulated an instruction, so temporary interrupt blocking
1691          * should be removed, if set.
1692          */
1693         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1694         if (interruptibility & 3)
1695                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
1696                              interruptibility & ~3);
1697 }
1698
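/*
 * Read guest-virtual memory for the emulator: translate each chunk through
 * the guest page tables (mmu.gva_to_gpa), look up the backing memslot, and
 * copy at most up to the next page boundary per iteration.
 */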
1699 static int emulator_read_std(unsigned long addr,
1700                              unsigned long *val,
1701                              unsigned int bytes,
1702                              struct x86_emulate_ctxt *ctxt)
1703 {
1704         struct litevm_vcpu *vcpu = ctxt->vcpu;
1705         void *data = val;
1706
1707         while (bytes) {
1708                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1709                 unsigned offset = addr & (PAGE_SIZE-1);
1710                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
1711                         bytes : (unsigned)PAGE_SIZE - offset;
1712                 unsigned long pfn;
1713                 struct litevm_memory_slot *memslot;
1714                 void *page;
1715
1716                 if (gpa == UNMAPPED_GVA)
1717                         return X86EMUL_PROPAGATE_FAULT;
1718                 pfn = gpa >> PAGE_SHIFT;
1719                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1720                 if (!memslot)
1721                         return X86EMUL_UNHANDLEABLE;
1722                 page = page2kva(gfn_to_page(memslot, pfn));
1723
1724                 memcpy(data, page + offset, tocopy);
1725
1726                 bytes -= tocopy;
1727                 data += tocopy;
1728                 addr += tocopy;
1729         }
1730
1731         return X86EMUL_CONTINUE;
1732 }
1733
1734 static int emulator_write_std(unsigned long addr,
1735                               unsigned long val,
1736                               unsigned int bytes,
1737                               struct x86_emulate_ctxt *ctxt)
1738 {
1739         printk("emulator_write_std: addr %lx n %d\n",
1740                addr, bytes);
1741         return X86EMUL_UNHANDLEABLE;
1742 }
1743
1744 static int emulator_read_emulated(unsigned long addr,
1745                                   unsigned long *val,
1746                                   unsigned int bytes,
1747                                   struct x86_emulate_ctxt *ctxt)
1748 {
1749         struct litevm_vcpu *vcpu = ctxt->vcpu;
1750
1751         if (vcpu->mmio_read_completed) {
1752                 memcpy(val, vcpu->mmio_data, bytes);
1753                 vcpu->mmio_read_completed = 0;
1754                 return X86EMUL_CONTINUE;
1755         } else if (emulator_read_std(addr, val, bytes, ctxt)
1756                    == X86EMUL_CONTINUE)
1757                 return X86EMUL_CONTINUE;
1758         else {
1759                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1760                 if (gpa == UNMAPPED_GVA)
1761                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
1762                 vcpu->mmio_needed = 1;
1763                 vcpu->mmio_phys_addr = gpa;
1764                 vcpu->mmio_size = bytes;
1765                 vcpu->mmio_is_write = 0;
1766
1767                 return X86EMUL_UNHANDLEABLE;
1768         }
1769 }
1770
1771 static int emulator_write_emulated(unsigned long addr,
1772                                    unsigned long val,
1773                                    unsigned int bytes,
1774                                    struct x86_emulate_ctxt *ctxt)
1775 {
1776         struct litevm_vcpu *vcpu = ctxt->vcpu;
1777         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1778
1779         if (gpa == UNMAPPED_GVA)
1780                 return X86EMUL_PROPAGATE_FAULT;
1781
1782         vcpu->mmio_needed = 1;
1783         vcpu->mmio_phys_addr = gpa;
1784         vcpu->mmio_size = bytes;
1785         vcpu->mmio_is_write = 1;
1786         memcpy(vcpu->mmio_data, &val, bytes);
1787
1788         return X86EMUL_CONTINUE;
1789 }
1790
1791 static int emulator_cmpxchg_emulated(unsigned long addr,
1792                                      unsigned long old,
1793                                      unsigned long new,
1794                                      unsigned int bytes,
1795                                      struct x86_emulate_ctxt *ctxt)
1796 {
1797         static int reported;
1798
1799         if (!reported) {
1800                 reported = 1;
1801                 printk("litevm: emulating exchange as write\n");
1802         }
1803         return emulator_write_emulated(addr, new, bytes, ctxt);
1804 }
1805
1806 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1807 {
1808         static int reported;
1809         uint8_t opcodes[4];
1810         unsigned long rip = vmcs_readl(GUEST_RIP);
1811         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
1812
1813         if (reported)
1814                 return;
1815
1816         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1817
1818         printk("emulation failed but !mmio_needed?"
1819                " rip %lx %02x %02x %02x %02x\n",
1820                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1821         reported = 1;
1822 }
1823
1824 struct x86_emulate_ops emulate_ops = {
1825         .read_std            = emulator_read_std,
1826         .write_std           = emulator_write_std,
1827         .read_emulated       = emulator_read_emulated,
1828         .write_emulated      = emulator_write_emulated,
1829         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1830 };
1831
1832 enum emulation_result {
1833         EMULATE_DONE,       /* no further processing */
1834         EMULATE_DO_MMIO,      /* litevm_run filled with mmio request */
1835         EMULATE_FAIL,         /* can't emulate this instruction */
1836 };
1837
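/*
 * Run the x86 instruction emulator on the instruction at the guest RIP
 * (used after unresolved page faults and for real-mode exceptions).  The
 * CPU mode and segment bases are snapshotted from the VMCS; if the emulator
 * hits MMIO, the access is copied into litevm_run so userspace can complete
 * it and re-enter with mmio_completed set.
 */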
1838 static int emulate_instruction(struct litevm_vcpu *vcpu,
1839                                struct litevm_run *run,
1840                                unsigned long cr2,
1841                                uint16_t error_code)
1842 {
1843         struct x86_emulate_ctxt emulate_ctxt;
1844         int r;
1845         uint32_t cs_ar;
1846
1847         vcpu_load_rsp_rip(vcpu);
1848
1849         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1850
1851         emulate_ctxt.vcpu = vcpu;
1852         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
1853         emulate_ctxt.cr2 = cr2;
1854         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1855                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
1856                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
1857                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1858
1859         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1860                 emulate_ctxt.cs_base = 0;
1861                 emulate_ctxt.ds_base = 0;
1862                 emulate_ctxt.es_base = 0;
1863                 emulate_ctxt.ss_base = 0;
1864                 emulate_ctxt.gs_base = 0;
1865                 emulate_ctxt.fs_base = 0;
1866         } else {
1867                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
1868                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
1869                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
1870                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
1871                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
1872                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
1873         }
1874
1875         vcpu->mmio_is_write = 0;
1876         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1877
1878         if ((r || vcpu->mmio_is_write) && run) {
1879                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1880                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1881                 run->mmio.len = vcpu->mmio_size;
1882                 run->mmio.is_write = vcpu->mmio_is_write;
1883         }
1884
1885         if (r) {
1886                 if (!vcpu->mmio_needed) {
1887                         report_emulation_failure(&emulate_ctxt);
1888                         return EMULATE_FAIL;
1889                 }
1890                 return EMULATE_DO_MMIO;
1891         }
1892
1893         vcpu_put_rsp_rip(vcpu);
1894         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
1895
1896         if (vcpu->mmio_is_write)
1897                 return EMULATE_DO_MMIO;
1898
1899         return EMULATE_DONE;
1900 }
1901
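/* Merge a 32-bit mov-to-CR value into the low half of the current 64-bit
 * register, preserving the upper bits: e.g. mk_cr_64(0xffffffff00000000ULL,
 * 0x33) == 0xffffffff00000033ULL. */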
1902 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
1903 {
1904         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1905 }
1906
1907 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
1908 {
1909         vmcs_writel(GUEST_GDTR_BASE, base);
1910         vmcs_write32(GUEST_GDTR_LIMIT, limit);
1911 }
1912
1913 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
1914 {
1915         vmcs_writel(GUEST_IDTR_BASE, base);
1916         vmcs_write32(GUEST_IDTR_LIMIT, limit);
1917 }
1918
1919 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
1920                    unsigned long *rflags)
1921 {
1922         lmsw(vcpu, msw);
1923         *rflags = vmcs_readl(GUEST_RFLAGS);
1924 }
1925
1926 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
1927 {
1928         switch (cr) {
1929         case 0:
1930                 return guest_cr0();
1931         case 2:
1932                 return vcpu->cr2;
1933         case 3:
1934                 return vcpu->cr3;
1935         case 4:
1936                 return guest_cr4();
1937         default:
1938                 vcpu_printf(vcpu, "%s: unexpected cr %d\n", __FUNCTION__, cr);
1939                 return 0;
1940         }
1941 }
1942
1943 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
1944                      unsigned long *rflags)
1945 {
1946         switch (cr) {
1947         case 0:
1948                 set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
1949                 *rflags = vmcs_readl(GUEST_RFLAGS);
1950                 break;
1951         case 2:
1952                 vcpu->cr2 = val;
1953                 break;
1954         case 3:
1955                 set_cr3(vcpu, val);
1956                 break;
1957         case 4:
1958                 set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
1959                 break;
1960         default:
1961                 vcpu_printf(vcpu, "%s: unexpected cr %d\n", __FUNCTION__, cr);
1962         }
1963 }
1964
1965 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
1966                                   int vec, uint32_t err_code)
1967 {
1968         if (!vcpu->rmode.active)
1969                 return 0;
1970
1971         if (vec == GP_VECTOR && err_code == 0)
1972                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
1973                         return 1;
1974         return 0;
1975 }
1976
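/*
 * Exception/NMI exit handler.  Page faults are offered to the shadow MMU
 * first; only if it cannot resolve them do we fall back to the emulator
 * (and possibly to userspace for MMIO).  Real-mode #GPs are emulated, #DB
 * is reported as a debug exit, and anything else goes to userspace as an
 * exception exit.
 */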
1977 static int handle_exception(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
1978 {
1979         uint32_t intr_info, error_code;
1980         unsigned long cr2, rip;
1981         uint32_t vect_info;
1982         enum emulation_result er;
1983
1984         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1985         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1986
1987         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1988                                                 !is_page_fault(intr_info)) {
1989                 printk("%s: unexpected, vectoring info 0x%x "
1990                        "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1991         }
1992
1993         if (is_external_interrupt(vect_info)) {
1994                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1995                 SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_pending), irq);
1996                 SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_summary), irq / BITS_PER_LONG);
1997         }
1998
1999         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
2000                 asm ("int $2");
2001                 return 1;
2002         }
2003         error_code = 0;
2004         rip = vmcs_readl(GUEST_RIP);
2005         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2006                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2007         if (is_page_fault(intr_info)) {
2008                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2009
2010                 spin_lock_irqsave(&vcpu->litevm->lock);
2011                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2012                         spin_unlock(&vcpu->litevm->lock);
2013                         return 1;
2014                 }
2015
2016                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2017                 spin_unlock(&vcpu->litevm->lock);
2018
2019                 switch (er) {
2020                 case EMULATE_DONE:
2021                         return 1;
2022                 case EMULATE_DO_MMIO:
2023                         ++litevm_stat.mmio_exits;
2024                         litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2025                         return 0;
2026                 case EMULATE_FAIL:
2027                         vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2028                         break;
2029                 default:
2030                         assert(0);
2031                 }
2032         }
2033
2034         if (vcpu->rmode.active &&
2035             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2036                                                                 error_code))
2037                 return 1;
2038
2039         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
2040                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2041                 return 0;
2042         }
2043         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2044         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2045         litevm_run->ex.error_code = error_code;
2046         return 0;
2047 }
2048
2049 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2050                                      struct litevm_run *litevm_run)
2051 {
2052         ++litevm_stat.irq_exits;
2053         return 1;
2054 }
2055
2056
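/*
 * For string I/O the repeat count lives in rCX.  Scan the instruction's
 * prefix bytes at the guest RIP to work out the effective address size
 * (16/32/64 bits) and mask rCX down to that width.
 */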
2057 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t *count)
2058 {
2059         uint64_t inst;
2060         gva_t rip;
2061         int countr_size;
2062         int i, n;
2063
2064         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2065                 countr_size = 2;
2066         } else {
2067                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2068
2069                 countr_size = (cs_ar & AR_L_MASK) ? 8:
2070                               (cs_ar & AR_DB_MASK) ? 4: 2;
2071         }
2072
2073         rip =  vmcs_readl(GUEST_RIP);
2074         if (countr_size != 8)
2075                 rip += vmcs_readl(GUEST_CS_BASE);
2076
2077         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2078
2079         for (i = 0; i < n; i++) {
2080                 switch (((uint8_t*)&inst)[i]) {
2081                 case 0xf0:
2082                 case 0xf2:
2083                 case 0xf3:
2084                 case 0x2e:
2085                 case 0x36:
2086                 case 0x3e:
2087                 case 0x26:
2088                 case 0x64:
2089                 case 0x65:
2090                 case 0x66:
2091                         break;
2092                 case 0x67:
2093                         countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
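                        /* fall through: an address-size prefix also ends the scan */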
2094                 default:
2095                         goto done;
2096                 }
2097         }
2098         return 0;
2099 done:
2100         countr_size *= 8;
2101         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2102         return 1;
2103 }
2104
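/*
 * I/O instruction exit: decode the exit qualification (direction, size,
 * string/rep flags, port) into litevm_run->io and defer the actual access
 * to userspace.  Plain in/out report the value straight from rAX.
 */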
2105 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2106 {
2107         uint64_t exit_qualification;
2108
2109         ++litevm_stat.io_exits;
2110         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2111         litevm_run->exit_reason = LITEVM_EXIT_IO;
2112         if (exit_qualification & 8)
2113                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2114         else
2115                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2116         litevm_run->io.size = (exit_qualification & 7) + 1;
2117         litevm_run->io.string = (exit_qualification & 16) != 0;
2118         litevm_run->io.string_down
2119                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2120         litevm_run->io.rep = (exit_qualification & 32) != 0;
2121         litevm_run->io.port = exit_qualification >> 16;
2122         if (litevm_run->io.string) {
2123                 if (!get_io_count(vcpu, &litevm_run->io.count))
2124                         return 1;
2125                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2126         } else
2127                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
2128         return 0;
2129 }
2130
2131 static int handle_invlpg(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2132 {
2133         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2134         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2135         spin_lock_irqsave(&vcpu->litevm->lock);
2136         vcpu->mmu.inval_page(vcpu, address);
2137         spin_unlock(&vcpu->litevm->lock);
2138         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2139         return 1;
2140 }
2141
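/*
 * Control-register access exit.  The exit qualification encodes the CR
 * number, the access type (mov to/from CR, clts, lmsw) and the GPR involved;
 * mov-to-CR0/3/4/8 goes through the usual set_crN() paths, reads of CR3/CR8
 * come from the shadow copies, and lmsw is handled in place.
 */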
2142 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2143 {
2144         uint64_t exit_qualification;
2145         int cr;
2146         int reg;
2147
2148 #ifdef LITEVM_DEBUG
2149         if (guest_cpl() != 0) {
2150                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2151                 inject_gp(vcpu);
2152                 return 1;
2153         }
2154 #endif
2155
2156         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2157         cr = exit_qualification & 15;
2158         reg = (exit_qualification >> 8) & 15;
2159         switch ((exit_qualification >> 4) & 3) {
2160         case 0: /* mov to cr */
2161                 switch (cr) {
2162                 case 0:
2163                         vcpu_load_rsp_rip(vcpu);
2164                         set_cr0(vcpu, vcpu->regs[reg]);
2165                         skip_emulated_instruction(vcpu);
2166                         return 1;
2167                 case 3:
2168                         vcpu_load_rsp_rip(vcpu);
2169                         set_cr3(vcpu, vcpu->regs[reg]);
2170                         skip_emulated_instruction(vcpu);
2171                         return 1;
2172                 case 4:
2173                         vcpu_load_rsp_rip(vcpu);
2174                         set_cr4(vcpu, vcpu->regs[reg]);
2175                         skip_emulated_instruction(vcpu);
2176                         return 1;
2177                 case 8:
2178                         vcpu_load_rsp_rip(vcpu);
2179                         set_cr8(vcpu, vcpu->regs[reg]);
2180                         skip_emulated_instruction(vcpu);
2181                         return 1;
2182                 }
2183                 break;
2184         case 1: /*mov from cr*/
2185                 switch (cr) {
2186                 case 3:
2187                         vcpu_load_rsp_rip(vcpu);
2188                         vcpu->regs[reg] = vcpu->cr3;
2189                         vcpu_put_rsp_rip(vcpu);
2190                         skip_emulated_instruction(vcpu);
2191                         return 1;
2192                 case 8:
2193                         printd("handle_cr: read CR8 "
2194                                "cpu erratum AA15\n");
2195                         vcpu_load_rsp_rip(vcpu);
2196                         vcpu->regs[reg] = vcpu->cr8;
2197                         vcpu_put_rsp_rip(vcpu);
2198                         skip_emulated_instruction(vcpu);
2199                         return 1;
2200                 }
2201                 break;
2202         case 3: /* lmsw */
2203                 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2204
2205                 skip_emulated_instruction(vcpu);
2206                 return 1;
2207         default:
2208                 break;
2209         }
2210         litevm_run->exit_reason = 0;
2211         printk("litevm: unhandled control register: op %d cr %d\n",
2212                (int)(exit_qualification >> 4) & 3, cr);
2213         return 0;
2214 }
2215
2216 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2217 {
2218         uint64_t exit_qualification;
2219         unsigned long val;
2220         int dr, reg;
2221
2222         /*
2223          * FIXME: this code assumes the host is debugging the guest.
2224          *        need to deal with guest debugging itself too.
2225          */
2226         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2227         dr = exit_qualification & 7;
2228         reg = (exit_qualification >> 8) & 15;
2229         vcpu_load_rsp_rip(vcpu);
2230         if (exit_qualification & 16) {
2231                 /* mov from dr */
2232                 switch (dr) {
2233                 case 6:
2234                         val = 0xffff0ff0;
2235                         break;
2236                 case 7:
2237                         val = 0x400;
2238                         break;
2239                 default:
2240                         val = 0;
2241                 }
2242                 vcpu->regs[reg] = val;
2243         } else {
2244                 /* mov to dr */
2245         }
2246         vcpu_put_rsp_rip(vcpu);
2247         skip_emulated_instruction(vcpu);
2248         return 1;
2249 }
2250
2251 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2252 {
2253         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2254         return 0;
2255 }
2256
2257 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2258 {
2259         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2260         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2261         uint64_t data;
2262
2263 #ifdef LITEVM_DEBUG
2264         if (guest_cpl() != 0) {
2265                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2266                 inject_gp(vcpu);
2267                 return 1;
2268         }
2269 #endif
2270
2271         switch (ecx) {
2272 #ifdef __x86_64__
2273         case MSR_FS_BASE:
2274                 data = vmcs_readl(GUEST_FS_BASE);
2275                 break;
2276         case MSR_GS_BASE:
2277                 data = vmcs_readl(GUEST_GS_BASE);
2278                 break;
2279 #endif
2280         case MSR_IA32_SYSENTER_CS:
2281                 data = vmcs_read32(GUEST_SYSENTER_CS);
2282                 break;
2283         case MSR_IA32_SYSENTER_EIP:
2284                 data = vmcs_read32(GUEST_SYSENTER_EIP);
2285                 break;
2286         case MSR_IA32_SYSENTER_ESP:
2287                 data = vmcs_read32(GUEST_SYSENTER_ESP);
2288                 break;
2289         case MSR_IA32_MC0_CTL:
2290         case MSR_IA32_MCG_STATUS:
2291         case MSR_IA32_MCG_CAP:
2292         case MSR_IA32_MC0_MISC:
2293         case MSR_IA32_MC0_MISC+4:
2294         case MSR_IA32_MC0_MISC+8:
2295         case MSR_IA32_MC0_MISC+12:
2296         case MSR_IA32_MC0_MISC+16:
2297         case MSR_IA32_UCODE_REV:
2298                 /* MTRR registers */
2299         case 0xfe:
2300         case 0x200 ... 0x2ff:
2301                 data = 0;
2302                 break;
2303         case MSR_IA32_APICBASE:
2304                 data = vcpu->apic_base;
2305                 break;
2306         default:
2307                 if (msr) {
2308                         data = msr->data;
2309                         break;
2310                 }
2311                 printk("litevm: unhandled rdmsr: %x\n", ecx);
2312                 inject_gp(vcpu);
2313                 return 1;
2314         }
2315
2316         /* FIXME: handling of bits 32:63 of rax, rdx */
2317         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2318         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2319         skip_emulated_instruction(vcpu);
2320         return 1;
2321 }
2322
2323 #ifdef __x86_64__
2324
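/*
 * Guest write to MSR_EFER: reject reserved bits and LME toggles while paging
 * is enabled (both inject #GP), keep the shadowed LMA bit, and mirror the
 * result into the guest's saved MSR area.
 */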
2325 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2326 {
2327         struct vmx_msr_entry *msr;
2328
2329         if (efer & EFER_RESERVED_BITS) {
2330                 printd("set_efer: 0x%llx #GP, reserved bits\n",
2331                        efer);
2332                 inject_gp(vcpu);
2333                 return;
2334         }
2335
2336         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2337                 printd("set_efer: #GP, change LME while paging\n");
2338                 inject_gp(vcpu);
2339                 return;
2340         }
2341
2342         efer &= ~EFER_LMA;
2343         efer |= vcpu->shadow_efer & EFER_LMA;
2344
2345         vcpu->shadow_efer = efer;
2346
2347         msr = find_msr_entry(vcpu, MSR_EFER);
2348
2349         if (!(efer & EFER_LMA))
2350             efer &= ~EFER_LME;
2351         msr->data = efer;
2352         skip_emulated_instruction(vcpu);
2353 }
2354
2355 #endif
2356
2357 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2358
2359 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2360 {
2361         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2362         struct vmx_msr_entry *msr;
2363         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2364                 | ((uint64_t)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2365
2366 #ifdef LITEVM_DEBUG
2367         if (guest_cpl() != 0) {
2368                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2369                 inject_gp(vcpu);
2370                 return 1;
2371         }
2372 #endif
2373
2374         switch (ecx) {
2375 #ifdef __x86_64__
2376         case MSR_FS_BASE:
2377                 vmcs_writel(GUEST_FS_BASE, data);
2378                 break;
2379         case MSR_GS_BASE:
2380                 vmcs_writel(GUEST_GS_BASE, data);
2381                 break;
2382 #endif
2383         case MSR_IA32_SYSENTER_CS:
2384                 vmcs_write32(GUEST_SYSENTER_CS, data);
2385                 break;
2386         case MSR_IA32_SYSENTER_EIP:
2387                 vmcs_write32(GUEST_SYSENTER_EIP, data);
2388                 break;
2389         case MSR_IA32_SYSENTER_ESP:
2390                 vmcs_write32(GUEST_SYSENTER_ESP, data);
2391                 break;
2392 #ifdef __x86_64__
2393         case MSR_EFER:
2394                 set_efer(vcpu, data);
2395                 return 1;
2396         case MSR_IA32_MC0_STATUS:
2397                 printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
2398                             , __FUNCTION__, data);
2399                 break;
2400 #endif
2401         case MSR_IA32_TIME_STAMP_COUNTER: {
2402                 uint64_t tsc;
2403                 
2404                 tsc = read_tsc();
2405                 vmcs_write64(TSC_OFFSET, data - tsc);
2406                 break;
2407         }
2408         case MSR_IA32_UCODE_REV:
2409         case MSR_IA32_UCODE_WRITE:
2410         case 0x200 ... 0x2ff: /* MTRRs */
2411                 break;
2412         case MSR_IA32_APICBASE:
2413                 vcpu->apic_base = data;
2414                 break;
2415         default:
2416                 msr = find_msr_entry(vcpu, ecx);
2417                 if (msr) {
2418                         msr->data = data;
2419                         break;
2420                 }
2421                 printk("litevm: unhandled wrmsr: %x\n", ecx);
2422                 inject_gp(vcpu);
2423                 return 1;
2424         }
2425         skip_emulated_instruction(vcpu);
2426         return 1;
2427 }
2428
2429 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2430                                    struct litevm_run *litevm_run)
2431 {
2432         /* Turn off interrupt window reporting. */
2433         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2434                      vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2435                      & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2436         return 1;
2437 }
2438
2439 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2440 {
2441         skip_emulated_instruction(vcpu);
2442         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
2443                 return 1;
2444
2445         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2446         return 0;
2447 }
2448
2449 /*
2450  * The exit handlers return 1 if the exit was handled fully and guest execution
2451  * may resume.  Otherwise they fill in litevm_run to indicate what needs to be
2452  * done by userspace and return 0.
2453  */
2454 static int (*litevm_vmx_exit_handlers[])(struct litevm_vcpu *vcpu,
2455                                       struct litevm_run *litevm_run) = {
2456         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
2457         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
2458         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
2459         [EXIT_REASON_INVLPG]                  = handle_invlpg,
2460         [EXIT_REASON_CR_ACCESS]               = handle_cr,
2461         [EXIT_REASON_DR_ACCESS]               = handle_dr,
2462         [EXIT_REASON_CPUID]                   = handle_cpuid,
2463         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
2464         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
2465         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
2466         [EXIT_REASON_HLT]                     = handle_halt,
2467 };
2468
2469 static const int litevm_vmx_max_exit_handlers =
2470         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2471
2472 /*
2473  * The guest has exited.  See if we can fix it or if we need userspace
2474  * assistance.
2475  */
2476 static int litevm_handle_exit(struct litevm_run *litevm_run, struct litevm_vcpu *vcpu)
2477 {
2478         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2479         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2480
2481         if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
2482                                 exit_reason != EXIT_REASON_EXCEPTION_NMI )
2483                 printk("%s: unexpected, valid vectoring info and "
2484                        "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2485         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2486         if (exit_reason < litevm_vmx_max_exit_handlers
2487             && litevm_vmx_exit_handlers[exit_reason])
2488                 return litevm_vmx_exit_handlers[exit_reason](vcpu, litevm_run);
2489         else {
2490                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2491                 litevm_run->hw.hardware_exit_reason = exit_reason;
2492         }
2493         return 0;
2494 }
2495
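/*
 * Inject an external interrupt while the guest runs in (emulated) real mode:
 * fetch the vector from the IVT, push flags/cs/ip on the guest stack by
 * hand, clear IF/TF/AC, and point cs:ip at the handler, as a real-mode CPU
 * would.
 */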
2496 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2497 {
2498         uint16_t ent[2];
2499         uint16_t cs;
2500         uint16_t ip;
2501         unsigned long flags;
2502         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2503         uint16_t sp =  vmcs_readl(GUEST_RSP);
2504         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2505
2506         if (sp > ss_limit || sp < 6) {
2507                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2508                             __FUNCTION__,
2509                             vmcs_readl(GUEST_RSP),
2510                             vmcs_readl(GUEST_SS_BASE),
2511                             vmcs_read32(GUEST_SS_LIMIT));
2512                 return;
2513         }
2514
2515         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2516                                                                 sizeof(ent)) {
2517                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2518                 return;
2519         }
2520
2521         flags =  vmcs_readl(GUEST_RFLAGS);
2522         cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
2523         ip =  vmcs_readl(GUEST_RIP);
2524
2525
2526         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2527             litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2528             litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2529                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2530                 return;
2531         }
2532
2533         vmcs_writel(GUEST_RFLAGS, flags &
2534                     ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2535         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2536         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2537         vmcs_writel(GUEST_RIP, ent[0]);
2538         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2539 }
2540
2541 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2542 {
2543         int word_index = __ffs(vcpu->irq_summary);
2544         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2545         int irq = word_index * BITS_PER_LONG + bit_index;
2546
2547         /* We don't have clear_bit(), and it's not clear the akaros bitops
2548          * are suitable here, so clear the bits by hand.
2549          */
2550         vcpu->irq_pending[word_index] &= ~(1UL << bit_index);
2551         if (!vcpu->irq_pending[word_index])
2552                 vcpu->irq_summary &= ~(1UL << word_index);
2553
2554         if (vcpu->rmode.active) {
2555                 inject_rmode_irq(vcpu, irq);
2556                 return;
2557         }
2558         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2559                         irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2560 }
2561
2562 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2563 {
2564         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2565             && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2566                 /*
2567                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2568                  */
2569                 litevm_do_inject_irq(vcpu);
2570         else
2571                 /*
2572                  * Interrupts blocked.  Wait for unblock.
2573                  */
2574                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2575                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2576                              | CPU_BASED_VIRTUAL_INTR_PENDING);
2577 }
2578
2579 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2580 {
2581         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2582
2583 #warning "no debugging guests yet"
2584         assert(0);
2585 /*
2586         set_debugreg(dbg->bp[0], 0);
2587         set_debugreg(dbg->bp[1], 1);
2588         set_debugreg(dbg->bp[2], 2);
2589         set_debugreg(dbg->bp[3], 3);
2590 */
2591         if (dbg->singlestep) {
2592                 unsigned long flags;
2593
2594                 flags = vmcs_readl(GUEST_RFLAGS);
2595                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2596                 vmcs_writel(GUEST_RFLAGS, flags);
2597         }
2598 }
2599
2600 static void load_msrs(struct vmx_msr_entry *e, int n)
2601 {
2602         int i;
2603
2604         for (i = 0; i < n; ++i)
2605                 write_msr(e[i].index, e[i].data);
2606 }
2607
2608 static void save_msrs(struct vmx_msr_entry *e, int n)
2609 {
2610         int i;
2611
2612         for (i = 0; i < n; ++i)
2613                 e[i].data = read_msr(e[i].index);
2614 }
2615
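/*
 * Main entry for running a vcpu: save host fs/gs/ldt and MSR state, inject
 * any pending interrupt, enter the guest with vmlaunch/vmresume, then
 * restore host state and dispatch the exit.  Exits the in-kernel handlers
 * resolve loop straight back into the guest; everything else is reported
 * through litevm_run.  A hypothetical caller (handle_guest_io() is not part
 * of this file) might loop roughly like:
 *
 *      struct litevm_run run = { .vcpu = 0 };
 *      while (vm_run(litevm, &run) == 0 &&
 *             run.exit_reason == LITEVM_EXIT_IO)
 *              handle_guest_io(&run);
 */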
2616 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2617 {
2618         struct litevm_vcpu *vcpu;
2619         uint8_t fail;
2620         uint16_t fs_sel, gs_sel, ldt_sel;
2621         int fs_gs_ldt_reload_needed;
2622
2623         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2624                 error("vcpu is %d but must be in the range 0..%d\n",
2625                       litevm_run->vcpu, LITEVM_MAX_VCPUS - 1);
2626
2627         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2628         if (!vcpu)
2629                 error("vcpu_load failed");
2630
2631         if (litevm_run->emulated) {
2632                 skip_emulated_instruction(vcpu);
2633                 litevm_run->emulated = 0;
2634         }
2635
2636         if (litevm_run->mmio_completed) {
2637                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
2638                 vcpu->mmio_read_completed = 1;
2639         }
2640
2641         vcpu->mmio_needed = 0;
2642
2643 again:
2644         /*
2645          * Set host fs and gs selectors.  Unfortunately, the host-state checks
2646          * (SDM 22.2.3) do not allow segment selectors with cpl > 0 or ti == 1.
2647          */
2648         fs_sel = read_fs();
2649         gs_sel = read_gs();
2650         ldt_sel = read_ldt();
2651         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
2652         if (!fs_gs_ldt_reload_needed) {
2653                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2654                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2655         } else {
2656                 vmcs_write16(HOST_FS_SELECTOR, 0);
2657                 vmcs_write16(HOST_GS_SELECTOR, 0);
2658         }
2659
2660 #ifdef __x86_64__
2661         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2662         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2663 #endif
2664
2665         if (vcpu->irq_summary &&
2666             !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2667                 litevm_try_inject_irq(vcpu);
2668
2669         if (vcpu->guest_debug.enabled)
2670                 litevm_guest_debug_pre(vcpu);
2671
2672         fx_save(vcpu->host_fx_image);
2673         fx_restore(vcpu->guest_fx_image);
2674
2675         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
2676         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2677
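        /*
         * World switch.  Host GPRs are pushed on the stack and the current
         * rsp is written into the VMCS HOST_RSP field, the guest registers
         * are loaded from vcpu->regs, and we vmlaunch on the first entry or
         * vmresume afterwards (based on vcpu->launched in %1).  A VM exit
         * resumes at the litevm_vmx_return label (the VMCS host RIP is
         * expected to point there); the guest registers are then written
         * back and the host registers restored, with "fail" set via setbe
         * if the VM entry itself failed.
         */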
2678         asm (
2679                 /* Store host registers */
2680                 "pushf \n\t"
2681 #ifdef __x86_64__
2682                 "push %%rax; push %%rbx; push %%rdx;"
2683                 "push %%rsi; push %%rdi; push %%rbp;"
2684                 "push %%r8;  push %%r9;  push %%r10; push %%r11;"
2685                 "push %%r12; push %%r13; push %%r14; push %%r15;"
2686                 "push %%rcx \n\t"
2687                 "vmwrite %%rsp, %2 \n\t"
2688 #else
2689                 "pusha; push %%ecx \n\t"
2690                 "vmwrite %%esp, %2 \n\t"
2691 #endif
2692                 /* Check if vmlaunch or vmresume is needed */
2693                 "cmp $0, %1 \n\t"
2694                 /* Load guest registers.  Don't clobber flags. */
2695 #ifdef __x86_64__
2696                 "mov %c[cr2](%3), %%rax \n\t"
2697                 "mov %%rax, %%cr2 \n\t"
2698                 "mov %c[rax](%3), %%rax \n\t"
2699                 "mov %c[rbx](%3), %%rbx \n\t"
2700                 "mov %c[rdx](%3), %%rdx \n\t"
2701                 "mov %c[rsi](%3), %%rsi \n\t"
2702                 "mov %c[rdi](%3), %%rdi \n\t"
2703                 "mov %c[rbp](%3), %%rbp \n\t"
2704                 "mov %c[r8](%3),  %%r8  \n\t"
2705                 "mov %c[r9](%3),  %%r9  \n\t"
2706                 "mov %c[r10](%3), %%r10 \n\t"
2707                 "mov %c[r11](%3), %%r11 \n\t"
2708                 "mov %c[r12](%3), %%r12 \n\t"
2709                 "mov %c[r13](%3), %%r13 \n\t"
2710                 "mov %c[r14](%3), %%r14 \n\t"
2711                 "mov %c[r15](%3), %%r15 \n\t"
2712                 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
2713 #else
2714                 "mov %c[cr2](%3), %%eax \n\t"
2715                 "mov %%eax,   %%cr2 \n\t"
2716                 "mov %c[rax](%3), %%eax \n\t"
2717                 "mov %c[rbx](%3), %%ebx \n\t"
2718                 "mov %c[rdx](%3), %%edx \n\t"
2719                 "mov %c[rsi](%3), %%esi \n\t"
2720                 "mov %c[rdi](%3), %%edi \n\t"
2721                 "mov %c[rbp](%3), %%ebp \n\t"
2722                 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
2723 #endif
2724                 /* Enter guest mode */
2725                 "jne launched \n\t"
2726                 "vmlaunch \n\t"
2727                 "jmp litevm_vmx_return \n\t"
2728                 "launched: vmresume \n\t"
2729                 ".globl litevm_vmx_return \n\t"
2730                 "litevm_vmx_return: "
2731                 /* Save guest registers, load host registers, keep flags */
2732 #ifdef __x86_64__
2733                 "xchg %3,     0(%%rsp) \n\t"
2734                 "mov %%rax, %c[rax](%3) \n\t"
2735                 "mov %%rbx, %c[rbx](%3) \n\t"
2736                 "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
2737                 "mov %%rdx, %c[rdx](%3) \n\t"
2738                 "mov %%rsi, %c[rsi](%3) \n\t"
2739                 "mov %%rdi, %c[rdi](%3) \n\t"
2740                 "mov %%rbp, %c[rbp](%3) \n\t"
2741                 "mov %%r8,  %c[r8](%3) \n\t"
2742                 "mov %%r9,  %c[r9](%3) \n\t"
2743                 "mov %%r10, %c[r10](%3) \n\t"
2744                 "mov %%r11, %c[r11](%3) \n\t"
2745                 "mov %%r12, %c[r12](%3) \n\t"
2746                 "mov %%r13, %c[r13](%3) \n\t"
2747                 "mov %%r14, %c[r14](%3) \n\t"
2748                 "mov %%r15, %c[r15](%3) \n\t"
2749                 "mov %%cr2, %%rax   \n\t"
2750                 "mov %%rax, %c[cr2](%3) \n\t"
2751                 "mov 0(%%rsp), %3 \n\t"
2752
2753                 "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
2754                 "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
2755                 "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
2756                 "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
2757 #else
2758                 "xchg %3, 0(%%esp) \n\t"
2759                 "mov %%eax, %c[rax](%3) \n\t"
2760                 "mov %%ebx, %c[rbx](%3) \n\t"
2761                 "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
2762                 "mov %%edx, %c[rdx](%3) \n\t"
2763                 "mov %%esi, %c[rsi](%3) \n\t"
2764                 "mov %%edi, %c[rdi](%3) \n\t"
2765                 "mov %%ebp, %c[rbp](%3) \n\t"
2766                 "mov %%cr2, %%eax  \n\t"
2767                 "mov %%eax, %c[cr2](%3) \n\t"
2768                 "mov 0(%%esp), %3 \n\t"
2769
2770                 "pop %%ecx; popa \n\t"
2771 #endif
2772                 "setbe %0 \n\t"
2773                 "popf \n\t"
2774               : "=g" (fail)
2775               : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
2776                 "c"(vcpu),
2777                 [rax]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
2778                 [rbx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
2779                 [rcx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
2780                 [rdx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
2781                 [rsi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
2782                 [rdi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
2783                 [rbp]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
2784 #ifdef __x86_64__
2785                 [r8 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8 ])),
2786                 [r9 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9 ])),
2787                 [r10]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
2788                 [r11]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
2789                 [r12]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
2790                 [r13]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
2791                 [r14]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
2792                 [r15]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
2793 #endif
2794                 [cr2]"i"(offsetof(struct litevm_vcpu, cr2))
2795               : "cc", "memory" );
2796
2797         ++litevm_stat.exits;
2798         printk("vm_run exits\n");
2799         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2800         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
2801
2802         fx_save(vcpu->guest_fx_image);
2803         fx_restore(vcpu->host_fx_image);
2804
2805 #ifndef __x86_64__
2806         asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2807 #endif
2808
2809         litevm_run->exit_type = 0;
2810         if (fail) {
2811                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
2812                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
2813         } else {
2814                 if (fs_gs_ldt_reload_needed) {
2815                         load_ldt(ldt_sel);
2816                         load_fs(fs_sel);
2817                         /*
2818                          * If we have to reload gs, we must take care to
2819                          * preserve our gs base.
2820                          */
2821                         disable_irq();
2822                         load_gs(gs_sel);
2823 #ifdef __x86_64__
2824                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
2825 #endif
2826                         enable_irq();
2827
2828                         reload_tss();
2829                 }
2830                 vcpu->launched = 1;
2831                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
2832                 if (litevm_handle_exit(litevm_run, vcpu)) {
2833                         /* Give the scheduler a chance to reschedule. */
2834                         vcpu_put(vcpu);
2835 #warning "how to tell if signal is pending"
2836 /*
2837                         if (signal_pending(current)) {
2838                                 ++litevm_stat.signal_exits;
2839                                 return -EINTR;
2840                         }
2841 */
2842                         kthread_yield();
2843                         /* Cannot fail -  no vcpu unplug yet. */
2844                         vcpu_load(litevm, vcpu_slot(vcpu));
2845                         goto again;
2846                 }
2847         }
2848
2849         vcpu_put(vcpu);
2850         printk("vm_run returns\n");
2851         return 0;
2852 }
2853
2854 static int litevm_dev_ioctl_get_regs(struct litevm *litevm, struct litevm_regs *regs)
2855 {
2856         struct litevm_vcpu *vcpu;
2857
2858         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
2859                 return -EINVAL;
2860
2861         vcpu = vcpu_load(litevm, regs->vcpu);
2862         if (!vcpu)
2863                 return -ENOENT;
2864
2865         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2866         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2867         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2868         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2869         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2870         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2871         regs->rsp = vmcs_readl(GUEST_RSP);
2872         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2873 #ifdef __x86_64__
2874         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2875         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2876         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2877         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2878         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2879         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2880         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2881         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2882 #endif
2883
2884         regs->rip = vmcs_readl(GUEST_RIP);
2885         regs->rflags = vmcs_readl(GUEST_RFLAGS);
2886
2887         /*
2888          * Don't leak debug flags in case they were set for guest debugging
2889          */
2890         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2891                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2892
2893         vcpu_put(vcpu);
2894
2895         return 0;
2896 }
2897
2898 static int litevm_dev_ioctl_set_regs(struct litevm *litevm, struct litevm_regs *regs)
2899 {
2900         struct litevm_vcpu *vcpu;
2901
2902         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
2903                 return -EINVAL;
2904
2905         vcpu = vcpu_load(litevm, regs->vcpu);
2906         if (!vcpu)
2907                 return -ENOENT;
2908
2909         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2910         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2911         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2912         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2913         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2914         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2915         vmcs_writel(GUEST_RSP, regs->rsp);
2916         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2917 #ifdef __x86_64__
2918         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2919         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2920         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2921         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2922         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2923         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2924         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2925         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2926 #endif
2927
2928         vmcs_writel(GUEST_RIP, regs->rip);
2929         vmcs_writel(GUEST_RFLAGS, regs->rflags);
2930
2931         vcpu_put(vcpu);
2932
2933         return 0;
2934 }
2935
2936 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
2937 {
2938         struct litevm_vcpu *vcpu;
2939
2940         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
2941                 return -EINVAL;
2942         vcpu = vcpu_load(litevm, sregs->vcpu);
2943         if (!vcpu)
2944                 return -ENOENT;
2945
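/*
 * Unpack the VMCS access-rights byte layout (type, S, DPL, P, AVL, L, D/B,
 * G, unusable) into the flat per-segment fields handed back to userspace.
 */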
2946 #define get_segment(var, seg) \
2947         do { \
2948                 uint32_t ar; \
2949                 \
2950                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
2951                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
2952                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
2953                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
2954                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
2955                 sregs->var.type = ar & 15; \
2956                 sregs->var.s = (ar >> 4) & 1; \
2957                 sregs->var.dpl = (ar >> 5) & 3; \
2958                 sregs->var.present = (ar >> 7) & 1; \
2959                 sregs->var.avl = (ar >> 12) & 1; \
2960                 sregs->var.l = (ar >> 13) & 1; \
2961                 sregs->var.db = (ar >> 14) & 1; \
2962                 sregs->var.g = (ar >> 15) & 1; \
2963                 sregs->var.unusable = (ar >> 16) & 1; \
2964         } while (0)
2965
2966         get_segment(cs, CS);
2967         get_segment(ds, DS);
2968         get_segment(es, ES);
2969         get_segment(fs, FS);
2970         get_segment(gs, GS);
2971         get_segment(ss, SS);
2972
2973         get_segment(tr, TR);
2974         get_segment(ldt, LDTR);
2975 #undef get_segment
2976
2977 #define get_dtable(var, table) \
2978         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
2979                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
2980
2981         get_dtable(idt, IDTR);
2982         get_dtable(gdt, GDTR);
2983 #undef get_dtable
2984
2985         sregs->cr0 = guest_cr0();
2986         sregs->cr2 = vcpu->cr2;
2987         sregs->cr3 = vcpu->cr3;
2988         sregs->cr4 = guest_cr4();
2989         sregs->cr8 = vcpu->cr8;
2990         sregs->efer = vcpu->shadow_efer;
2991         sregs->apic_base = vcpu->apic_base;
2992
2993         sregs->pending_int = vcpu->irq_summary != 0;
2994
2995         vcpu_put(vcpu);
2996
2997         return 0;
2998 }
2999
3000 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
3001 {
3002         struct litevm_vcpu *vcpu;
3003         int mmu_reset_needed = 0;
3004
3005         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
3006                 return -EINVAL;
3007         vcpu = vcpu_load(litevm, sregs->vcpu);
3008         if (!vcpu)
3009                 return -ENOENT;
3010
3011 #define set_segment(var, seg) \
3012         do { \
3013                 uint32_t ar; \
3014                 \
3015                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
3016                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
3017                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
3018                 if (sregs->var.unusable) { \
3019                         ar = (1 << 16); \
3020                 } else { \
3021                         ar = (sregs->var.type & 15); \
3022                         ar |= (sregs->var.s & 1) << 4; \
3023                         ar |= (sregs->var.dpl & 3) << 5; \
3024                         ar |= (sregs->var.present & 1) << 7; \
3025                         ar |= (sregs->var.avl & 1) << 12; \
3026                         ar |= (sregs->var.l & 1) << 13; \
3027                         ar |= (sregs->var.db & 1) << 14; \
3028                         ar |= (sregs->var.g & 1) << 15; \
3029                 } \
3030                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
3031         } while (0)
3032
3033         set_segment(cs, CS);
3034         set_segment(ds, DS);
3035         set_segment(es, ES);
3036         set_segment(fs, FS);
3037         set_segment(gs, GS);
3038         set_segment(ss, SS);
3039
3040         set_segment(tr, TR);
3041
3042         set_segment(ldt, LDTR);
3043 #undef set_segment
3044
3045 #define set_dtable(var, table) \
3046         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
3047         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
3048
3049         set_dtable(idt, IDTR);
3050         set_dtable(gdt, GDTR);
3051 #undef set_dtable
3052
3053         vcpu->cr2 = sregs->cr2;
3054         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
3055         vcpu->cr3 = sregs->cr3;
3056
3057         vcpu->cr8 = sregs->cr8;
3058
3059         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3060 #ifdef __x86_64__
3061         __set_efer(vcpu, sregs->efer);
3062 #endif
3063         vcpu->apic_base = sregs->apic_base;
3064
3065         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3066         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3067         update_exception_bitmap(vcpu);
3068         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3069         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3070
3071         mmu_reset_needed |=  guest_cr4() != sregs->cr4;
3072         __set_cr4(vcpu, sregs->cr4);
3073
3074         if (mmu_reset_needed)
3075                 litevm_mmu_reset_context(vcpu);
3076         vcpu_put(vcpu);
3077
3078         return 0;
3079 }
3080
3081 /*
3082  * Translate a guest virtual address to a guest physical address.
3083  */
3084 static int litevm_dev_ioctl_translate(struct litevm *litevm, struct litevm_translation *tr)
3085 {
3086         unsigned long vaddr = tr->linear_address;
3087         struct litevm_vcpu *vcpu;
3088         gpa_t gpa;
3089
3090         vcpu = vcpu_load(litevm, tr->vcpu);
3091         if (!vcpu)
3092                 return -ENOENT;
3093         spin_lock_irqsave(&litevm->lock);
3094         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3095         tr->physical_address = gpa;
3096         tr->valid = gpa != UNMAPPED_GVA;
3097         tr->writeable = 1;
3098         tr->usermode = 0;
3099         spin_unlock_irqsave(&litevm->lock);
3100         vcpu_put(vcpu);
3101
3102         return 0;
3103 }
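/*
 * Example (hypothetical caller; 'guest_va' stands in for whatever guest
 * virtual address is being probed):
 *
 *      struct litevm_translation tr = {
 *              .linear_address = guest_va,
 *              .vcpu = 0,
 *      };
 *      if (litevm_dev_ioctl_translate(litevm, &tr) == 0 && tr.valid)
 *              printk("gva 0x%lx -> gpa 0x%llx\n", guest_va,
 *                     (unsigned long long)tr.physical_address);
 */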
3104
3105 #if 0
3106 static int litevm_dev_ioctl_interrupt(struct litevm *litevm, struct litevm_interrupt *irq)
3107 {
3108         struct litevm_vcpu *vcpu;
3109
3110         if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
3111                 return -EINVAL;
3112         if (irq->irq < 0 || irq->irq >= 256)
3113                 return -EINVAL;
3114         vcpu = vcpu_load(litevm, irq->vcpu);
3115         if (!vcpu)
3116                 return -ENOENT;
3117
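	/*
	 * irq_pending is a bitmap of pending interrupt vectors; irq_summary
	 * keeps one bit per word of irq_pending so later injection code can
	 * find a pending vector without scanning every word.
	 */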
3118         set_bit(irq->irq, vcpu->irq_pending);
3119         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
3120
3121         vcpu_put(vcpu);
3122
3123         return 0;
3124 }
3125 #endif
3126
3127 #if 0
3128 static int litevm_dev_ioctl_debug_guest(struct litevm *litevm,
3129                                      struct litevm_debug_guest *dbg)
3130 {
3131         struct litevm_vcpu *vcpu;
3132         unsigned long dr7 = 0x400;
3133         uint32_t exception_bitmap;
3134         int old_singlestep;
3135
3136         if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS)
3137                 return -EINVAL;
3138         vcpu = vcpu_load(litevm, dbg->vcpu);
3139         if (!vcpu)
3140                 return -ENOENT;
3141
3142         exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
3143         old_singlestep = vcpu->guest_debug.singlestep;
3144
3145         vcpu->guest_debug.enabled = dbg->enabled;
3146         if (vcpu->guest_debug.enabled) {
3147                 int i;
3148
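		/*
		 * DR7 layout, as used below: bit 9 (0x200) is GE ("exact"
		 * breakpoint matching), bit 2*i+1 is the global-enable bit for
		 * breakpoint i, and the condition/length field for breakpoint i
		 * starts at bit 16+4*i (00 = break on instruction execution).
		 */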
3149                 dr7 |= 0x200;  /* exact */
3150                 for (i = 0; i < 4; ++i) {
3151                         if (!dbg->breakpoints[i].enabled)
3152                                 continue;
3153                         vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
3154                         dr7 |= 2 << (i*2);    /* global enable */
3155                         dr7 |= 0 << (i*4+16); /* execution breakpoint */
3156                 }
3157
3158                 exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
3159
3160                 vcpu->guest_debug.singlestep = dbg->singlestep;
3161         } else {
3162                 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
3163                 vcpu->guest_debug.singlestep = 0;
3164         }
3165
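	/*
	 * If single-stepping was just switched off, clear TF (and RF) in the
	 * guest's RFLAGS so the guest does not keep taking debug traps.
	 */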
3166         if (old_singlestep && !vcpu->guest_debug.singlestep) {
3167                 unsigned long flags;
3168
3169                 flags = vmcs_readl(GUEST_RFLAGS);
3170                 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3171                 vmcs_writel(GUEST_RFLAGS, flags);
3172         }
3173
3174         vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
3175         vmcs_writel(GUEST_DR7, dr7);
3176
3177         vcpu_put(vcpu);
3178
3179         return 0;
3180 }
3181 #endif
3182
3183 #if 0
3184 long litevm_control(struct litevm *litevm, int command, unsigned long arg)
3185 {
3186         int r = -EINVAL;
3187
3188         switch (command) {
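	/*
	 * Each command follows the same pattern: copy the argument struct in
	 * from the caller, hand it to the matching litevm_dev_ioctl_* helper,
	 * and, for commands that return data, copy the struct back out.  A
	 * hypothetical caller would look roughly like:
	 *
	 *      struct litevm_run run = { .vcpu = 0 };
	 *      long ret = litevm_control(litevm, LITEVM_RUN,
	 *                                (unsigned long)&run);
	 *
	 * with the caveat that copy_{from,to}_user expect 'arg' to be a
	 * user-space pointer.
	 */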
3189         case LITEVM_CREATE_VCPU: {
3190                 r = create_vcpu(litevm, arg);
3191                 if (r)
3192                         goto out;
3193                 break;
3194         }
3195         case LITEVM_RUN: {
3196                 struct litevm_run litevm_run;
3197
3198                 r = -EFAULT;
3199                 if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run))
3200                         goto out;
3201                 r = litevm_dev_ioctl_run(litevm, &litevm_run);
3202                 if (r < 0)
3203                         goto out;
3204                 r = -EFAULT;
3205                 if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run))
3206                         goto out;
3207                 r = 0;
3208                 break;
3209         }
3210         case LITEVM_GET_REGS: {
3211                 struct litevm_regs litevm_regs;
3212
3213                 r = -EFAULT;
3214                 if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
3215                         goto out;
3216                 r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs);
3217                 if (r)
3218                         goto out;
3219                 r = -EFAULT;
3220                 if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs))
3221                         goto out;
3222                 r = 0;
3223                 break;
3224         }
3225         case LITEVM_SET_REGS: {
3226                 struct litevm_regs litevm_regs;
3227
3228                 r = -EFAULT;
3229                 if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
3230                         goto out;
3231                 r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs);
3232                 if (r)
3233                         goto out;
3234                 r = 0;
3235                 break;
3236         }
3237         case LITEVM_GET_SREGS: {
3238                 struct litevm_sregs litevm_sregs;
3239
3240                 r = -EFAULT;
3241                 if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3242                         goto out;
3243                 r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs);
3244                 if (r)
3245                         goto out;
3246                 r = -EFAULT;
3247                 if (copy_to_user((void *)arg, &litevm_sregs, sizeof litevm_sregs))
3248                         goto out;
3249                 r = 0;
3250                 break;
3251         }
3252         case LITEVM_SET_SREGS: {
3253                 struct litevm_sregs litevm_sregs;
3254
3255                 r = -EFAULT;
3256                 if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3257                         goto out;
3258                 r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs);
3259                 if (r)
3260                         goto out;
3261                 r = 0;
3262                 break;
3263         }
3264         case LITEVM_TRANSLATE: {
3265                 struct litevm_translation tr;
3266
3267                 r = -EFAULT;
3268                 if (copy_from_user(&tr, (void *)arg, sizeof tr))
3269                         goto out;
3270                 r = litevm_dev_ioctl_translate(litevm, &tr);
3271                 if (r)
3272                         goto out;
3273                 r = -EFAULT;
3274                 if (copy_to_user((void *)arg, &tr, sizeof tr))
3275                         goto out;
3276                 r = 0;
3277                 break;
3278         }
3279         case LITEVM_INTERRUPT: {
3280                 struct litevm_interrupt irq;
3281
3282                 r = -EFAULT;
3283                 if (copy_from_user(&irq, (void *)arg, sizeof irq))
3284                         goto out;
3285                 r = litevm_dev_ioctl_interrupt(litevm, &irq);
3286                 if (r)
3287                         goto out;
3288                 r = 0;
3289                 break;
3290         }
3291         case LITEVM_DEBUG_GUEST: {
3292                 struct litevm_debug_guest dbg;
3293
3294                 r = -EFAULT;
3295                 if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
3296                         goto out;
3297                 r = litevm_dev_ioctl_debug_guest(litevm, &dbg);
3298                 if (r)
3299                         goto out;
3300                 r = 0;
3301                 break;
3302         }
3303         case LITEVM_SET_MEMORY_REGION: {
3304                 struct litevm_memory_region litevm_mem;
3305
3306                 r = -EFAULT;
3307                 if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem))
3308                         goto out;
3309                 r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem);
3310                 if (r)
3311                         goto out;
3312                 break;
3313         }
3314         case LITEVM_GET_DIRTY_LOG: {
3315                 struct litevm_dirty_log log;
3316
3317                 r = -EFAULT;
3318                 if (copy_from_user(&log, (void *)arg, sizeof log))
3319                         goto out;
3320                 r = litevm_dev_ioctl_get_dirty_log(litevm, &log);
3321                 if (r)
3322                         goto out;
3323                 break;
3324         }
3325         default:
3326                 ;
3327         }
3328 out:
3329         return r;
3330 }
3331 #endif
3332
3333 #if 0
3334 static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3335 {
3336         struct litevm *litevm = vma->vm_file->private_data;
3337         struct litevm_memory_slot *slot;
3338         struct page *page;
3339
3340         slot = gfn_to_memslot(litevm, vmf->pgoff);
3341         if (!slot)
3342                 return VM_FAULT_SIGBUS;
3343         page = gfn_to_page(slot, vmf->pgoff);
3344         if (!page)
3345                 return VM_FAULT_SIGBUS;
3346
3347         get_page(page);
3348         vmf->page = page;
3349         return 0;
3350 }
3351 #endif
3352
3353 #if 0
3354 static int litevm_reboot(struct notifier_block *notifier, unsigned long val,
3355                        void *v)
3356 {
3357         panic("litevm_reboot");
3358         if (val == SYS_RESTART) {
3359                 /*
3360                  * Some (well, at least mine) BIOSes hang on reboot if
3361                  * in vmx root mode.
3362                  */
3363                 printk("litevm: exiting vmx mode\n");
3364                 handler_wrapper_t *w;
3365                 smp_call_function_all(litevm_disable, 0, &w);
3366                 smp_call_wait(w);
3367         }
3368         return NOTIFY_OK;
3370 }
3371 #endif
3372
3373 hpa_t bad_page_address;
3374
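/*
 * One-time VMX setup: verify the CPU advertises VT-x and that the BIOS has
 * not disabled it, build the VMCS descriptor, switch every core into VMX
 * operation via an IPI, and allocate the zeroed page whose physical address
 * is recorded in bad_page_address.
 */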
3375 int vmx_init(void)
3376 {
3377         handler_wrapper_t *w;
3378         int r = 0;
3379
3380         if (!cpu_has_litevm_support()) {
3381                 printk("litevm: no hardware support\n");
3382                 return -EOPNOTSUPP;
3383         }
3384         if (vmx_disabled_by_bios()) {
3385                 printk("litevm: disabled by bios\n");
3386                 return -EOPNOTSUPP;
3387         }
3388
3389         setup_vmcs_descriptor();
3390         smp_call_function_all(vm_enable, 0, &w);
3391         if (smp_call_wait(w)) {
3392                 printk("vmx_init: smp_call_wait failed. Expect a panic.\n");
3393         }
3394
3395         if ((bad_page_address = PADDR(kpage_zalloc_addr())) == 0ULL) {
3396                 r = -ENOMEM;
3397         }
3398
3399         return r;
3400 }
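/*
 * Example (hypothetical call site): vmx_init() returns 0 on success and a
 * negative errno otherwise, so an arch init path could do:
 *
 *      if (vmx_init() != 0)
 *              printk("vmx: hardware virtualization unavailable\n");
 */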
3401
3402 static void litevm_exit(void)
3403 {
3404         //free_litevm_area();
3405         //__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3406 }
3407
3408