1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
38 #define currentcpu (&per_cpu_info[core_id()])
39
40 struct litevm_stat litevm_stat;
41
42 static struct litevm_stats_debugfs_item {
43         const char *name;
44         uint32_t *data;
45 } debugfs_entries[] = {
46         { "pf_fixed", &litevm_stat.pf_fixed },
47         { "pf_guest", &litevm_stat.pf_guest },
48         { "tlb_flush", &litevm_stat.tlb_flush },
49         { "invlpg", &litevm_stat.invlpg },
50         { "exits", &litevm_stat.exits },
51         { "io_exits", &litevm_stat.io_exits },
52         { "mmio_exits", &litevm_stat.mmio_exits },
53         { "signal_exits", &litevm_stat.signal_exits },
54         { "irq_exits", &litevm_stat.irq_exits },
55         { 0, 0 }
56 };
57
58 static struct dentry *debugfs_dir;
59
60 static const uint32_t vmx_msr_index[] = {
61 #ifdef __x86_64__
62         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
63 #endif
64         MSR_EFER, // wtf? MSR_K6_STAR,
65 };
66 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
67
68 #ifdef __x86_64__
69 /*
70  * avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard
71  * VT MSR save/load mechanism (cpu bug AA24)
72  */
73 #define NR_BAD_MSRS 2
74 #else
75 #define NR_BAD_MSRS 0
76 #endif
77
78 #define TSS_IOPB_BASE_OFFSET 0x66
79 #define TSS_BASE_SIZE 0x68
80 #define TSS_IOPB_SIZE (65536 / 8)
81 #define TSS_REDIRECTION_SIZE (256 / 8)
82 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
83
84 #define MSR_IA32_VMX_BASIC_MSR                  0x480
85 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
86 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
87 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
88 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
89
90 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
91 #define LMSW_GUEST_MASK 0x0eULL
92 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
93 //#define CR4_VMXE 0x2000
94 #define CR8_RESEVED_BITS (~0x0fULL)
95 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
96
97 #ifdef __x86_64__
98 #define HOST_IS_64 1
99 #else
100 #define HOST_IS_64 0
101 #endif
102
103 /* Bit ops are not yet widely used in Akaros, and we're not sure where to put them. */
104 /**
105  * __ffs - find first set bit in word
106  * @word: The word to search
107  *
108  * Undefined if no bit exists, so code should check against 0 first.
109  */
110 static inline unsigned long __ffs(unsigned long word)
111 {
112         asm("rep; bsf %1,%0"
113                 : "=r" (word)
114                 : "rm" (word));
115         return word;
116 }
117
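/*
 * Look up @msr in the vcpu's guest MSR save area (filled in by
 * litevm_vcpu_setup()); returns 0 if the MSR is not tracked there.
 */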
118 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu, uint32_t msr)
119 {
120         int i;
121
122         for (i = 0; i < vcpu->nmsrs; ++i)
123                 if (vcpu->guest_msrs[i].index == msr)
124                         return &vcpu->guest_msrs[i];
125         return 0;
126 }
127
128 struct descriptor_table {
129         uint16_t limit;
130         unsigned long base;
131 } __attribute__((packed));
132
133 static void get_gdt(struct descriptor_table *table)
134 {
135         asm ("sgdt %0" : "=m"(*table));
136 }
137
138 static void get_idt(struct descriptor_table *table)
139 {
140         asm ("sidt %0" : "=m"(*table));
141 }
142
143 static uint16_t read_fs(void)
144 {
145         uint16_t seg;
146         asm ("mov %%fs, %0" : "=g"(seg));
147         return seg;
148 }
149
150 static uint16_t read_gs(void)
151 {
152         uint16_t seg;
153         asm ("mov %%gs, %0" : "=g"(seg));
154         return seg;
155 }
156
157 static uint16_t read_ldt(void)
158 {
159         uint16_t ldt;
160         asm ("sldt %0" : "=g"(ldt));
161         return ldt;
162 }
163
164 static void load_fs(uint16_t sel)
165 {
166         asm ("mov %0, %%fs" : : "g"(sel));
167 }
168
169 static void load_gs(uint16_t sel)
170 {
171         asm ("mov %0, %%gs" : : "g"(sel));
172 }
173
174 #ifndef load_ldt
175 static void load_ldt(uint16_t sel)
176 {
177         asm ("lldt %0" : : "g"(sel));
178 }
179 #endif
180
181 static void fx_save(void *image)
182 {
183         asm ("fxsave (%0)":: "r" (image));
184 }
185
186 static void fx_restore(void *image)
187 {
188         asm ("fxrstor (%0)":: "r" (image));
189 }
190
191 static void fpu_init(void)
192 {
193         asm ("finit");
194 }
195
196 struct segment_descriptor {
197         uint16_t limit_low;
198         uint16_t base_low;
199         uint8_t  base_mid;
200         uint8_t  type : 4;
201         uint8_t  system : 1;
202         uint8_t  dpl : 2;
203         uint8_t  present : 1;
204         uint8_t  limit_high : 4;
205         uint8_t  avl : 1;
206         uint8_t  long_mode : 1;
207         uint8_t  default_op : 1;
208         uint8_t  granularity : 1;
209         uint8_t  base_high;
210 } __attribute__((packed));
211
212 #ifdef __x86_64__
213 // LDT or TSS descriptor in the GDT. 16 bytes.
214 struct segment_descriptor_64 {
215         struct segment_descriptor s;
216         uint32_t base_higher;
217         uint32_t pad_zero;
218 };
219
220 #endif
221
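/*
 * Compute the linear base address of @selector by walking the GDT (or the
 * LDT, when the selector's TI bit is set).  VMX wants full base addresses
 * in the host-state area, so a selector alone is not enough.
 */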
222 static unsigned long segment_base(uint16_t selector)
223 {
224         struct descriptor_table gdt;
225         struct segment_descriptor *d;
226         unsigned long table_base;
227         typedef unsigned long ul;
228         unsigned long v;
229
230         asm ("sgdt %0" : "=m"(gdt));
231         table_base = gdt.base;
232
233         if (selector & 4) {           /* from ldt */
234                 uint16_t ldt_selector;
235
236                 asm ("sldt %0" : "=g"(ldt_selector));
237                 table_base = segment_base(ldt_selector);
238         }
239         d = (struct segment_descriptor *)(table_base + (selector & ~7));
240         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
241 #ifdef __x86_64__
242         if (d->system == 0
243             && (d->type == 2 || d->type == 9 || d->type == 11))
244                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
245 #endif
246         return v;
247 }
248
249 static unsigned long read_tr_base(void)
250 {
251         uint16_t tr;
252         asm ("str %0" : "=g"(tr));
253         return segment_base(tr);
254 }
255
256 static void reload_tss(void)
257 {
258 #ifndef __x86_64__
259
260         /*
261          * VT restores TR but not its size.  Useless.
262          */
263         struct descriptor_table gdt;
264         struct segment_descriptor *descs;
265
266         get_gdt(&gdt);
267         descs = (void *)gdt.base;
268         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
269         load_TR_desc();
270 #endif
271 }
272
273 static struct vmcs_descriptor {
274         int size;
275         int order;
276         uint32_t revision_id;
277 } vmcs_descriptor;
278
279 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
280 {
281         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
282         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
283 }
284
285
286
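/*
 * Copy @size bytes from guest-virtual @addr into @dest, one page at a time,
 * translating each page with gva_to_hpa().  Returns the number of bytes
 * actually copied; a failed translation ends the copy early.
 */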
287 int litevm_read_guest(struct litevm_vcpu *vcpu,
288                              gva_t addr,
289                              unsigned long size,
290                              void *dest)
291 {
292         unsigned char *host_buf = dest;
293         unsigned long req_size = size;
294
295         while (size) {
296                 hpa_t paddr;
297                 unsigned now;
298                 unsigned offset;
299                 hva_t guest_buf;
300
301                 paddr = gva_to_hpa(vcpu, addr);
302
303                 if (is_error_hpa(paddr))
304                         break;
305                 guest_buf = (hva_t)KADDR(paddr);
306                 offset = addr & ~PAGE_MASK;
307                 guest_buf |= offset;
308                 now = MIN(size, PAGE_SIZE - offset);
309                 memcpy(host_buf, (void*)guest_buf, now);
310                 host_buf += now;
311                 addr += now;
312                 size -= now;
313         }
314         return req_size - size;
315 }
316
317 int litevm_write_guest(struct litevm_vcpu *vcpu,
318                              gva_t addr,
319                              unsigned long size,
320                              void *data)
321 {
322         unsigned char *host_buf = data;
323         unsigned long req_size = size;
324
325         while (size) {
326                 hpa_t paddr;
327                 unsigned now;
328                 unsigned offset;
329                 hva_t guest_buf;
330
331                 paddr = gva_to_hpa(vcpu, addr);
332
333                 if (is_error_hpa(paddr))
334                         break;
335
336                 guest_buf = (hva_t)KADDR(paddr);
337                 offset = addr & ~PAGE_MASK;
338                 guest_buf |= offset;
339                 now = MIN(size, PAGE_SIZE - offset);
340                 memcpy((void*)guest_buf, host_buf, now);
341                 host_buf += now;
342                 addr += now;
343                 size -= now;
344         }
345         return req_size - size;
346 }
347
348 static void setup_vmcs_descriptor(void)
349 {
350         uint64_t msr;
351
352         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
353         vmcs_descriptor.size = (msr>>32) & 0x1fff;
354         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size>>PAGE_SHIFT);
355         vmcs_descriptor.revision_id = (uint32_t)msr;
356 }
357
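/*
 * VMCLEAR flushes any processor-cached state for this VMCS back to memory
 * and marks its launch state "clear", so it can later be made current with
 * VMPTRLD on this or another cpu.  setna records the failure flags (CF/ZF)
 * in @error.
 */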
358 static void vmcs_clear(struct vmcs *vmcs)
359 {
360         uint64_t phys_addr = PADDR(vmcs);
361         uint8_t error;
362
363         asm volatile ("vmclear %1; setna %0"
364                        : "=m"(error) : "m"(phys_addr) : "cc", "memory" );
365         if (error)
366                 printk("litevm: vmclear fail: %p/%llx\n",
367                        vmcs, phys_addr);
368 }
369
370 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
371 {
372         struct litevm_vcpu *vcpu = arg;
373         int cpu = core_id();
374         printd("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n", 
375                cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
376
377         if (vcpu->cpu == cpu)
378                 vmcs_clear(vcpu->vmcs);
379
380         if (currentcpu->vmcs == vcpu->vmcs)
381                 currentcpu->vmcs = NULL;
382 }
383
384 static int vcpu_slot(struct litevm_vcpu *vcpu)
385 {
386         return vcpu - vcpu->litevm->vcpus;
387 }
388
389 /*
390  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
391  * vcpu mutex is already taken.
392  */
393 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
394 {
395         uint64_t phys_addr = PADDR(vcpu->vmcs);
396         int cpu;
397         cpu = core_id();
398
399         if (vcpu->cpu != cpu) {
400                 handler_wrapper_t *w;
401                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
402                 smp_call_wait(w);
403                 vcpu->launched = 0;
404         }
405         if (currentcpu->vmcs != vcpu->vmcs) {
406                 uint8_t error;
407
408                 currentcpu->vmcs = vcpu->vmcs;
409                 asm volatile ("vmptrld %1; setna %0"
410                                : "=m"(error) : "m"(phys_addr) : "cc" );
411                 if (error)
412                         printk("litevm: vmptrld %p/%llx fail\n",
413                                vcpu->vmcs, phys_addr);
414         }
415
416         if (vcpu->cpu != cpu) {
417                 struct descriptor_table dt;
418                 unsigned long sysenter_esp;
419
420                 vcpu->cpu = cpu;
421                 /*
422                  * Linux uses per-cpu TSS and GDT, so set these when switching
423                  * processors.
424                  */
425                 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
426                 get_gdt(&dt);
427                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
428
429                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
430                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
431         }
432         return vcpu;
433 }
434
435 /*
436  * Switches to specified vcpu, until a matching vcpu_put()
437  */
438 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
439 {
440         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
441
442         qlock(&vcpu->mutex);
443         if (!vcpu->vmcs) {
444                 qunlock(&vcpu->mutex);
445                 return 0;
446         }
447         return __vcpu_load(vcpu);
448 }
449
450 static void vcpu_put(struct litevm_vcpu *vcpu)
451 {
452         //put_cpu();
453         qunlock(&vcpu->mutex);
454 }
455
456
457 static struct vmcs *alloc_vmcs_cpu(int cpu)
458 {
459         int node = node_id();
460         struct vmcs *vmcs;
461
462         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
463         if (!vmcs)
464                 return 0;
465         memset(vmcs, 0, vmcs_descriptor.size);
466         vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
467         return vmcs;
468 }
469
470 static struct vmcs *alloc_vmcs(void)
471 {
472         return alloc_vmcs_cpu(core_id());
473 }
474
475 static int cpu_has_litevm_support(void)
476 {
477         uint32_t ecx = cpuid_ecx(1);
478         return ecx & (1 << 5); /* CPUID.1:ECX.VMX[bit 5] -> VT */
479 }
480
481 static int vmx_disabled_by_bios(void)
482 {
483         uint64_t msr;
484
485         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
486         return (msr & 5) == 1; /* locked but not enabled */
487 }
488
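/*
 * Per-cpu VMX enable: set (and lock) the enable bits in
 * MSR_IA32_FEATURE_CONTROL if the BIOS left them clear, turn on CR4.VMXE,
 * then execute VMXON on this cpu's vmxarea region.
 */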
489 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
490 {
491         int cpu = hw_core_id();
492         uint64_t phys_addr = PADDR(&currentcpu->vmxarea);
493         uint64_t old;
494
495         old = read_msr(MSR_IA32_FEATURE_CONTROL);
496         if ((old & 5) == 0)
497                 /* enable and lock */
498                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
499         lcr4(rcr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
500         asm volatile ("vmxon %0" : : "m"(phys_addr) : "memory", "cc");
501 }
502
503 static void litevm_disable(void *garbage)
504 {
505         asm volatile ("vmxoff" : : : "cc");
506 }
507
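/*
 * Allocate and minimally initialize a struct litevm.  The vcpus only get a
 * VMCS later, in vmx_create_vcpu().
 */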
508 struct litevm *vmx_open(void)
509 {
510         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
511         int i;
512
513         if (!litevm)
514                 return 0;
515
516         spinlock_init(&litevm->lock);
517         LIST_INIT(&litevm->link);
518         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
519                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
520
521                 qlock_init(&vcpu->mutex);
522                 vcpu->mmu.root_hpa = INVALID_PAGE;
523                 LIST_INIT(&vcpu->link);
524         }
525         return litevm;
526 }
527
528 /*
529  * Free any memory in @free but not in @dont.
530  */
531 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
532                                   struct litevm_memory_slot *dont)
533 {
534         int i;
535
536         if (!dont || free->phys_mem != dont->phys_mem)
537                 if (free->phys_mem) {
538                         for (i = 0; i < free->npages; ++i){
539                                 page_t *page = free->phys_mem[i];
540                                 page_decref(page);
541                                 assert(page_is_free(page2ppn(page)));
542                         }
543                         kfree(free->phys_mem);
544                 }
545
546         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
547                 kfree(free->dirty_bitmap);
548
549         free->phys_mem = 0;
550         free->npages = 0;
551         free->dirty_bitmap = 0;
552 }
553
554 static void litevm_free_physmem(struct litevm *litevm)
555 {
556         int i;
557
558         for (i = 0; i < litevm->nmemslots; ++i)
559                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
560 }
561
562 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
563 {
564         if (vcpu->vmcs) {
565                 handler_wrapper_t *w;
566                 smp_call_function_all(__vcpu_clear, vcpu, &w);
567                 smp_call_wait(w);
568                 //free_vmcs(vcpu->vmcs);
569                 vcpu->vmcs = 0;
570         }
571 }
572
573 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
574 {
575         litevm_free_vmcs(vcpu);
576         litevm_mmu_destroy(vcpu);
577 }
578
579 static void litevm_free_vcpus(struct litevm *litevm)
580 {
581         unsigned int i;
582
583         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
584                 litevm_free_vcpu(&litevm->vcpus[i]);
585 }
586
587 static int litevm_dev_release(struct litevm *litevm)
588 {
589
590         litevm_free_vcpus(litevm);
591         litevm_free_physmem(litevm);
592         kfree(litevm);
593         return 0;
594 }
595
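/*
 * VMREAD/VMWRITE operate on whichever VMCS was made current by the last
 * VMPTRLD on this cpu, so callers must have the vcpu loaded (vcpu_load)
 * before touching VMCS fields.
 */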
596 unsigned long vmcs_readl(unsigned long field)
597 {
598         unsigned long value;
599
600         asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
601         return value;
602 }
603
604 void vmcs_writel(unsigned long field, unsigned long value)
605 {
606         uint8_t error;
607
608         asm volatile ("vmwrite %1, %2; setna %0"
609                        : "=g"(error) : "r"(value), "r"(field) : "cc" );
610         if (error)
611                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
612                        field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
613 }
614
615 static void vmcs_write16(unsigned long field, uint16_t value)
616 {
617         vmcs_writel(field, value);
618 }
619
620 static void vmcs_write64(unsigned long field, uint64_t value)
621 {
622 #ifdef __x86_64__
623         vmcs_writel(field, value);
624 #else
625         vmcs_writel(field, value);
626         asm volatile ("");
627         vmcs_writel(field+1, value >> 32);
628 #endif
629 }
630
631 static void inject_gp(struct litevm_vcpu *vcpu)
632 {
633         printd("inject_general_protection: rip 0x%lx\n",
634                vmcs_readl(GUEST_RIP));
635         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
636         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
637                      GP_VECTOR |
638                      INTR_TYPE_EXCEPTION |
639                      INTR_INFO_DELIEVER_CODE_MASK |
640                      INTR_INFO_VALID_MASK);
641 }
642
643 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
644 {
645         if (vcpu->rmode.active)
646                 vmcs_write32(EXCEPTION_BITMAP, ~0);
647         else
648                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
649 }
650
651 static void enter_pmode(struct litevm_vcpu *vcpu)
652 {
653         unsigned long flags;
654
655         vcpu->rmode.active = 0;
656
657         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
658         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
659         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
660
661         flags = vmcs_readl(GUEST_RFLAGS);
662         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
663         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
664         vmcs_writel(GUEST_RFLAGS, flags);
665
666         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
667                         (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK) );
668
669         update_exception_bitmap(vcpu);
670
671         #define FIX_PMODE_DATASEG(seg, save) {                          \
672                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
673                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
674                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
675                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
676         }
677
678         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
679         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
680         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
681         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
682         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
683
684         vmcs_write16(GUEST_CS_SELECTOR,
685                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
686         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
687 }
688
689 static int rmode_tss_base(struct litevm* litevm)
690 {
691         gfn_t base_gfn = litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
692         return base_gfn << PAGE_SHIFT;
693 }
694
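/*
 * VT cannot run a guest in real mode directly, so fake it with vm86: stash
 * the protected-mode TR state, point TR at the real-mode TSS built by
 * init_rmode_tss(), set EFLAGS.VM and IOPL, and give each segment the
 * base == selector << 4 layout that vm86 expects.
 */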
695 static void enter_rmode(struct litevm_vcpu *vcpu)
696 {
697         unsigned long flags;
698
699         vcpu->rmode.active = 1;
700
701         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
702         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
703
704         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
705         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
706
707         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
708         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
709
710         flags = vmcs_readl(GUEST_RFLAGS);
711         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
712
713         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
714
715         vmcs_writel(GUEST_RFLAGS, flags);
716         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
717         update_exception_bitmap(vcpu);
718
719         #define FIX_RMODE_SEG(seg, save) {                                 \
720                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
721                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
722                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
723                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
724         }
725
726         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
727         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
728
729         FIX_RMODE_SEG(ES, vcpu->rmode.es);
730         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
731         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
732         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
733         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
734 }
735
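/*
 * Build the three-page TSS used while the guest is in emulated real mode.
 * It lives in the last three pages of memslot 0: a zeroed TSS whose I/O
 * bitmap offset (at 0x66) points past the redirection map, followed by the
 * I/O bitmap itself with its final byte set to all ones (see
 * RMODE_TSS_SIZE).
 */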
736 static int init_rmode_tss(struct litevm* litevm)
737 {
738         struct page *p1, *p2, *p3;
739         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
740         char *page;
741
742         p1 = _gfn_to_page(litevm, fn++);
743         p2 = _gfn_to_page(litevm, fn++);
744         p3 = _gfn_to_page(litevm, fn);
745
746         if (!p1 || !p2 || !p3) {
747                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
748                 return 0;
749         }
750
751         page = page2kva(p1);
752         memset(page, 0, PAGE_SIZE);
753         *(uint16_t*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
754
755         page = page2kva(p2);
756         memset(page, 0, PAGE_SIZE);
757
758         page = page2kva(p3);
759         memset(page, 0, PAGE_SIZE);
760         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
761
762         return 1;
763 }
764
765 #ifdef __x86_64__
766
767 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
768 {
769         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
770
771         vcpu->shadow_efer = efer;
772         if (efer & EFER_LMA) {
773                 vmcs_write32(VM_ENTRY_CONTROLS,
774                                      vmcs_read32(VM_ENTRY_CONTROLS) |
775                                      VM_ENTRY_CONTROLS_IA32E_MASK);
776                 msr->data = efer;
777
778         } else {
779                 vmcs_write32(VM_ENTRY_CONTROLS,
780                                      vmcs_read32(VM_ENTRY_CONTROLS) &
781                                      ~VM_ENTRY_CONTROLS_IA32E_MASK);
782
783                 msr->data = efer & ~EFER_LME;
784         }
785 }
786
787 static void enter_lmode(struct litevm_vcpu *vcpu)
788 {
789         uint32_t guest_tr_ar;
790
791         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
792         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
793                 printd("%s: tss fixup for long mode. \n",
794                        __FUNCTION__);
795                 vmcs_write32(GUEST_TR_AR_BYTES,
796                              (guest_tr_ar & ~AR_TYPE_MASK)
797                              | AR_TYPE_BUSY_64_TSS);
798         }
799
800         vcpu->shadow_efer |= EFER_LMA;
801
802         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
803         vmcs_write32(VM_ENTRY_CONTROLS,
804                      vmcs_read32(VM_ENTRY_CONTROLS)
805                      | VM_ENTRY_CONTROLS_IA32E_MASK);
806 }
807
808 static void exit_lmode(struct litevm_vcpu *vcpu)
809 {
810         vcpu->shadow_efer &= ~EFER_LMA;
811
812         vmcs_write32(VM_ENTRY_CONTROLS,
813                      vmcs_read32(VM_ENTRY_CONTROLS)
814                      & ~VM_ENTRY_CONTROLS_IA32E_MASK);
815 }
816
817 #endif
818
819 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
820 {
821         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
822                 enter_pmode(vcpu);
823
824         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
825                 enter_rmode(vcpu);
826
827 #ifdef __x86_64__
828         if (vcpu->shadow_efer & EFER_LME) {
829                 if (!is_paging() && (cr0 & CR0_PG_MASK))
830                         enter_lmode(vcpu);
831                 if (is_paging() && !(cr0 & CR0_PG_MASK))
832                         exit_lmode(vcpu);
833         }
834 #endif
835
836         vmcs_writel(CR0_READ_SHADOW, cr0);
837         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
838 }
839
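/*
 * In PAE mode CR3 points at four 8-byte PDPTEs, and a #GP must be raised if
 * any present entry has reserved bits set.  Read the entries out of the
 * guest page referenced by CR3 and check them here.
 */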
840 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
841                                          unsigned long cr3)
842 {
843         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
844         unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
845         int i;
846         uint64_t pdpte;
847         uint64_t *pdpt;
848         struct litevm_memory_slot *memslot;
849
850         spin_lock(&vcpu->litevm->lock);
851         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
852         /* FIXME: !memslot - emulate? 0xff? */
853         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
854
855         for (i = 0; i < 4; ++i) {
856                 pdpte = pdpt[offset + i];
857                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
858                         break;
859         }
860
861         spin_unlock(&vcpu->litevm->lock);
862
863         return i != 4;
864 }
865
866 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
867 {
868         if (cr0 & CR0_RESEVED_BITS) {
869                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
870                        cr0, guest_cr0());
871                 inject_gp(vcpu);
872                 return;
873         }
874
875         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
876                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
877                 inject_gp(vcpu);
878                 return;
879         }
880
881         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
882                 printd("set_cr0: #GP, set PG flag "
883                        "and a clear PE flag\n");
884                 inject_gp(vcpu);
885                 return;
886         }
887
888         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
889 #ifdef __x86_64__
890                 if ((vcpu->shadow_efer & EFER_LME)) {
891                         uint32_t guest_cs_ar;
892                         if (!is_pae()) {
893                                 printd("set_cr0: #GP, start paging "
894                                        "in long mode while PAE is disabled\n");
895                                 inject_gp(vcpu);
896                                 return;
897                         }
898                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
899                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
900                                 printd("set_cr0: #GP, start paging "
901                                        "in long mode while CS.L == 1\n");
902                                 inject_gp(vcpu);
903                                 return;
904
905                         }
906                 } else
907 #endif
908                 if (is_pae() &&
909                             pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
910                         printd("set_cr0: #GP, pdptrs "
911                                "reserved bits\n");
912                         inject_gp(vcpu);
913                         return;
914                 }
915
916         }
917
918         __set_cr0(vcpu, cr0);
919         litevm_mmu_reset_context(vcpu);
920         return;
921 }
922
923 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
924 {
925         unsigned long cr0 = guest_cr0();
926
927         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
928                 enter_pmode(vcpu);
929                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
930
931         } else
932                 printd("lmsw: unexpected\n");
933
934         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
935                                 | (msw & LMSW_GUEST_MASK));
936 }
937
938 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
939 {
940         vmcs_writel(CR4_READ_SHADOW, cr4);
941         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
942                     LITEVM_RMODE_VM_CR4_ALWAYS_ON : LITEVM_PMODE_VM_CR4_ALWAYS_ON));
943 }
944
945 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
946 {
947         if (cr4 & CR4_RESEVED_BITS) {
948                 printd("set_cr4: #GP, reserved bits\n");
949                 inject_gp(vcpu);
950                 return;
951         }
952
953         if (is_long_mode()) {
954                 if (!(cr4 & CR4_PAE_MASK)) {
955                         printd("set_cr4: #GP, clearing PAE while "
956                                "in long mode\n");
957                         inject_gp(vcpu);
958                         return;
959                 }
960         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
961                    && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
962                 printd("set_cr4: #GP, pdptrs reserved bits\n");
963                 inject_gp(vcpu);
964         }
965
966         if (cr4 & CR4_VMXE_MASK) {
967                 printd("set_cr4: #GP, setting VMXE\n");
968                 inject_gp(vcpu);
969                 return;
970         }
971         __set_cr4(vcpu, cr4);
972         spin_lock(&vcpu->litevm->lock);
973         litevm_mmu_reset_context(vcpu);
974         spin_unlock(&vcpu->litevm->lock);
975 }
976
977 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
978 {
979         if (is_long_mode()) {
980                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
981                         printd("set_cr3: #GP, reserved bits\n");
982                         inject_gp(vcpu);
983                         return;
984                 }
985         } else {
986                 if (cr3 & CR3_RESEVED_BITS) {
987                         printd("set_cr3: #GP, reserved bits\n");
988                         inject_gp(vcpu);
989                         return;
990                 }
991                 if (is_paging() && is_pae() &&
992                     pdptrs_have_reserved_bits_set(vcpu, cr3)) {
993                         printd("set_cr3: #GP, pdptrs "
994                                "reserved bits\n");
995                         inject_gp(vcpu);
996                         return;
997                 }
998         }
999
1000         vcpu->cr3 = cr3;
1001         spin_lock(&vcpu->litevm->lock);
1002         vcpu->mmu.new_cr3(vcpu);
1003         spin_unlock(&vcpu->litevm->lock);
1004 }
1005
1006 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1007 {
1008         if ( cr8 & CR8_RESEVED_BITS) {
1009                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1010                 inject_gp(vcpu);
1011                 return;
1012         }
1013         vcpu->cr8 = cr8;
1014 }
1015
1016 static uint32_t get_rdx_init_val(void)
1017 {
1018         uint32_t val;
1019
1020         asm ("movl $1, %%eax \n\t"
1021              "movl %%eax, %0 \n\t" : "=g"(val) );
1022         return val;
1023
1024 }
1025
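/*
 * Capture a freshly finit'ed FPU image for the guest while preserving the
 * host's FPU state, and force the guest MXCSR to its reset value (0x1f80).
 */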
1026 static void fx_init(struct litevm_vcpu *vcpu)
1027 {
1028         struct __attribute__ ((__packed__)) fx_image_s {
1029                 uint16_t control; //fcw
1030                 uint16_t status; //fsw
1031                 uint16_t tag; // ftw
1032                 uint16_t opcode; //fop
1033                 uint64_t ip; // fpu ip
1034                 uint64_t operand;// fpu dp
1035                 uint32_t mxcsr;
1036                 uint32_t mxcsr_mask;
1037
1038         } *fx_image;
1039
1040         fx_save(vcpu->host_fx_image);
1041         fpu_init();
1042         fx_save(vcpu->guest_fx_image);
1043         fx_restore(vcpu->host_fx_image);
1044
1045         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1046         fx_image->mxcsr = 0x1f80;
1047         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1048                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1049 }
1050
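/*
 * The IA32_VMX_*_CTLS MSRs encode, in the low 32 bits, the control bits
 * that must be 1 and, in the high 32 bits, the bits that are allowed to be
 * 1.  Mask the requested value accordingly before writing the VMCS field,
 * so VM entry doesn't fail on an invalid control field.
 */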
1051 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field, uint32_t val)
1052 {
1053         uint32_t msr_high, msr_low;
1054         uint64_t msrval;
1055
1056         msrval = read_msr(msr);
1057         msr_low = msrval;
1058         msr_high = (msrval>>32);
1059
1060         val &= msr_high;
1061         val |= msr_low;
1062         vmcs_write32(vmcs_field, val);
1063 }
1064
1065 /*
1066  * Sets up the vmcs for emulated real mode.
1067  */
1068 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1069 {
1070 /* no op on x86_64 */
1071 #define asmlinkage
1072         extern asmlinkage void litevm_vmx_return(void);
1073         uint32_t host_sysenter_cs;
1074         uint32_t junk;
1075         uint64_t a;
1076         struct descriptor_table dt;
1077         int i;
1078         int ret;
1079         uint64_t tsc;
1080         int nr_good_msrs;
1081
1082
1083         if (!init_rmode_tss(vcpu->litevm)) {
1084                 error("vcpu_setup: init_rmode_tss failed");
1085         }
1086
1087         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1088         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1089         vcpu->cr8 = 0;
1090         vcpu->apic_base = 0xfee00000 |
1091                         /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1092                         MSR_IA32_APICBASE_ENABLE;
1093
1094         fx_init(vcpu);
1095
1096 #define SEG_SETUP(seg) do {                                     \
1097                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1098                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1099                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1100                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1101         } while (0)
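        /*
         * e.g. SEG_SETUP(DS) writes GUEST_DS_SELECTOR/BASE/LIMIT/AR_BYTES
         * with the flat 16-bit data-segment defaults (base 0, limit 0xffff,
         * access rights 0x93).
         */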
1102
1103         /*
1104          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1105          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1106          */
1107         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1108         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1109         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1110         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1111
1112         SEG_SETUP(DS);
1113         SEG_SETUP(ES);
1114         SEG_SETUP(FS);
1115         SEG_SETUP(GS);
1116         SEG_SETUP(SS);
1117
1118         vmcs_write16(GUEST_TR_SELECTOR, 0);
1119         vmcs_writel(GUEST_TR_BASE, 0);
1120         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1121         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1122
1123         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1124         vmcs_writel(GUEST_LDTR_BASE, 0);
1125         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1126         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1127
1128         vmcs_write32(GUEST_SYSENTER_CS, 0);
1129         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1130         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1131
1132         vmcs_writel(GUEST_RFLAGS, 0x02);
1133         vmcs_writel(GUEST_RIP, 0xfff0);
1134         vmcs_writel(GUEST_RSP, 0);
1135
1136         vmcs_writel(GUEST_CR3, 0);
1137
1138         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1139         vmcs_writel(GUEST_DR7, 0x400);
1140
1141         vmcs_writel(GUEST_GDTR_BASE, 0);
1142         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1143
1144         vmcs_writel(GUEST_IDTR_BASE, 0);
1145         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1146
1147         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1148         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1149         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1150
1151         /* I/O */
1152         vmcs_write64(IO_BITMAP_A, 0);
1153         vmcs_write64(IO_BITMAP_B, 0);
1154
1155         tsc = read_tsc();
1156         vmcs_write64(TSC_OFFSET, -tsc);
1157
1158         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1159
1160         /* Special registers */
1161         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1162
1163         /* Control */
1164         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
1165                                PIN_BASED_VM_EXEC_CONTROL,
1166                                PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
1167                                | PIN_BASED_NMI_EXITING   /* 20.6.1 */
1168                         );
1169         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
1170                                CPU_BASED_VM_EXEC_CONTROL,
1171                                CPU_BASED_HLT_EXITING         /* 20.6.2 */
1172                                | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
1173                                | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
1174                                | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
1175                                | CPU_BASED_INVDPG_EXITING
1176                                | CPU_BASED_MOV_DR_EXITING
1177                                | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
1178                         );
1179
1180         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1181         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1182         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1183         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1184
1185         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1186         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1187         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1188
1189 #warning "not setting selectors; do we need them?"
1190 #if 0
1191         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
1192         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1193         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1194 #endif
1195         vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
1196         vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
1197 #if 0
1198         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1199 #endif
1200 #ifdef __x86_64__
1201         a = read_msr(MSR_FS_BASE);
1202         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1203         a = read_msr(MSR_GS_BASE);
1204         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1205 #else
1206         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1207         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1208 #endif
1209
1210 #warning "Not setting HOST_TR_SELECTOR"
1211 #if 0
1212         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
1213 #endif
1214
1215         get_idt(&dt);
1216         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1217
1218
1219         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */
1220
1221         /* it's the HIGH 32 bits! */
1222         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1223         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1224         a = read_msr(MSR_IA32_SYSENTER_ESP);
1225         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
1226         a = read_msr(MSR_IA32_SYSENTER_EIP);
1227         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
1228
1229         ret = -ENOMEM;
1230         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1231         if (!vcpu->guest_msrs)
1232                 error("guest_msrs kmalloc failed");
1233         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1234         if (!vcpu->host_msrs)
1235                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1236
1237         for (i = 0; i < NR_VMX_MSR; ++i) {
1238                 uint32_t index = vmx_msr_index[i];
1239                 uint32_t data_low, data_high;
1240                 uint64_t data;
1241                 int j = vcpu->nmsrs;
1242
1243 #warning "need readmsr_safe"
1244 //              if (rdmsr_safe(index, &data_low, &data_high) < 0)
1245 //                      continue;
1246                 data = read_msr(index);
1247                 vcpu->host_msrs[j].index = index;
1248                 vcpu->host_msrs[j].reserved = 0;
1249                 vcpu->host_msrs[j].data = data;
1250                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1251                 ++vcpu->nmsrs;
1252         }
1253         printk("msrs: %d\n", vcpu->nmsrs);
1254
1255         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1256         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
1257                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1258         vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1259                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1260         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1261                     PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1262         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
1263                                (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */
1264         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1265         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
1266         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1267
1268
1269         /* 22.2.1, 20.8.1 */
1270         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
1271                                VM_ENTRY_CONTROLS, 0);
1272         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1273
1274         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1275         vmcs_writel(TPR_THRESHOLD, 0);
1276
1277         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1278         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1279
1280         __set_cr0(vcpu, 0x60000010); // enter rmode
1281         __set_cr4(vcpu, 0);
1282 #ifdef __x86_64__
1283         __set_efer(vcpu, 0);
1284 #endif
1285
1286         ret = litevm_mmu_init(vcpu);
1287
1288         return ret;
1289
1290 out_free_guest_msrs:
1291         kfree(vcpu->guest_msrs);
1292 out:
1293         return ret;
1294 }
1295
1296 /*
1297  * Sync the rsp and rip registers into the vcpu structure.  This allows
1298  * registers to be accessed by indexing vcpu->regs.
1299  */
1300 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1301 {
1302         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1303         vcpu->rip = vmcs_readl(GUEST_RIP);
1304 }
1305
1306 /*
1307  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1308  * modification.
1309  */
1310 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1311 {
1312         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1313         vmcs_writel(GUEST_RIP, vcpu->rip);
1314 }
1315
1316 /*
1317  * Creates some virtual cpus.  Good luck creating more than one.
1318  */
1319 int vmx_create_vcpu(struct litevm *litevm, int n)
1320 {
1321         ERRSTACK(1);
1322         int r;
1323         struct litevm_vcpu *vcpu;
1324         struct vmcs *vmcs;
1325         char *errstring = NULL;
1326
1327         if (n < 0 || n >= LITEVM_MAX_VCPUS)
1328                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n, LITEVM_MAX_VCPUS);
1329
1330         vcpu = &litevm->vcpus[n];
1331
1332         qlock(&vcpu->mutex);
1333
1334         if (vcpu->vmcs) {
1335                 qunlock(&vcpu->mutex);
1336                 error("VM already exists");
1337         }
1338
1339         /* I'm a bad person */
1340         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1341         uint64_t a = (uint64_t) vcpu->fx_buf;
1342         a += FX_IMAGE_ALIGN-1;
1343         a /= FX_IMAGE_ALIGN;
1344         a *= FX_IMAGE_ALIGN;
1345
1346         vcpu->host_fx_image = (char*)a;
1347         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1348
1349         vcpu->cpu = -1;  /* First load will set up TR */
1350         vcpu->litevm = litevm;
1351         vmcs = alloc_vmcs();
1352         if (!vmcs) {
1353                 errstring = "vmcs allocate failed";
1354                 qunlock(&vcpu->mutex);
1355                 goto out_free_vcpus;
1356         }
1357         vmcs_clear(vmcs);
1358         vcpu->vmcs = vmcs;
1359         vcpu->launched = 0;
1360
1361         __vcpu_load(vcpu);
1362
1363         if (waserror()){
1364                 /* we really need to fix waserror() */
1365                 poperror();
1366                 goto out_free_vcpus;
1367         }
1368
1369         r = litevm_vcpu_setup(vcpu);
1370
1371         vcpu_put(vcpu);
1372
1373         if (! r)
1374                 return 0;
1375
1376         errstring = "vcpu setup failed";
1377
1378 out_free_vcpus:
1379         printk("out_free_vcpus: life sucks\n");
1380         litevm_free_vcpu(vcpu);
1381         error(errstring);
1382 out:
1383         return r;
1384 }
1385
1386 /*
1387  * Allocate some memory and give it an address in the guest physical address
1388  * space.
1389  *
1390  * Discontiguous memory is allowed, mostly for framebuffers.
1391  */
1392 int vm_set_memory_region(struct litevm *litevm,
1393                                            struct litevm_memory_region *mem)
1394 {
1395         int r;
1396         gfn_t base_gfn;
1397         unsigned long npages;
1398         unsigned long i;
1399         struct litevm_memory_slot *memslot;
1400         struct litevm_memory_slot old, new;
1401         int memory_config_version;
1402         void *init_data = mem->init_data;
1403
1404         r = -EINVAL;
1405         /* General sanity checks */
1406         if (mem->memory_size & (PAGE_SIZE - 1))
1407                 goto out;
1408         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1409                 goto out;
1410         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1411                 goto out;
1412         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1413                 goto out;
1414
1415         memslot = &litevm->memslots[mem->slot];
1416         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1417         npages = mem->memory_size >> PAGE_SHIFT;
1418
1419         if (!npages)
1420                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1421
1422 raced:
1423         spin_lock(&litevm->lock);
1424
1425         memory_config_version = litevm->memory_config_version;
1426         new = old = *memslot;
1427
1428         new.base_gfn = base_gfn;
1429         new.npages = npages;
1430         new.flags = mem->flags;
1431
1432         /* Disallow changing a memory slot's size. */
1433         r = -EINVAL;
1434         if (npages && old.npages && npages != old.npages)
1435                 goto out_unlock;
1436
1437         /* Check for overlaps */
1438         r = -EEXIST;
1439         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1440                 struct litevm_memory_slot *s = &litevm->memslots[i];
1441
1442                 if (s == memslot)
1443                         continue;
1444                 if (!((base_gfn + npages <= s->base_gfn) ||
1445                       (base_gfn >= s->base_gfn + s->npages)))
1446                         goto out_unlock;
1447         }
1448         /*
1449          * Do memory allocations outside lock.  memory_config_version will
1450          * detect any races.
1451          */
1452         spin_unlock(&litevm->lock);
1453
1454         /* Deallocate if slot is being removed */
1455         if (!npages)
1456                 new.phys_mem = 0;
1457
1458         /* Free page dirty bitmap if unneeded */
1459         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1460                 new.dirty_bitmap = 0;
1461
1462         r = -ENOMEM;
1463
1464         /* Allocate if a slot is being created */
1465         if (npages && !new.phys_mem) {
1466                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1467
1468                 if (!new.phys_mem)
1469                         goto out_free;
1470
1471                 for (i = 0; i < npages; ++i) {
1472                         int ret;
1473                         ret = kpage_alloc(&new.phys_mem[i]);
1474                         if (ret != ESUCCESS)
1475                                 goto out_free;
1476                         if (init_data){
1477                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1478                                 init_data += PAGE_SIZE;
1479                         }
1480                 }
1481         }
1482
1483         /* Allocate page dirty bitmap if needed */
1484         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1485                 unsigned dirty_bytes;//ALIGN(npages, BITS_PER_LONG) / 8;
1486                 dirty_bytes = (((npages + BITS_PER_LONG-1)/BITS_PER_LONG)*BITS_PER_LONG)/8;
1487
1488                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1489                 if (!new.dirty_bitmap)
1490                         goto out_free;
1491         }
1492
1493         spin_lock(&litevm->lock);
1494
1495         if (memory_config_version != litevm->memory_config_version) {
1496                 spin_unlock(&litevm->lock);
1497                 litevm_free_physmem_slot(&new, &old);
1498                 goto raced;
1499         }
1500
1501         r = -EAGAIN;
1502         if (litevm->busy)
1503                 goto out_unlock;
1504
1505         if (mem->slot >= litevm->nmemslots)
1506                 litevm->nmemslots = mem->slot + 1;
1507
1508         *memslot = new;
1509         ++litevm->memory_config_version;
1510
1511         spin_unlock(&litevm->lock);
1512
1513         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1514                 struct litevm_vcpu *vcpu;
1515
1516                 vcpu = vcpu_load(litevm, i);
1517                 if (!vcpu)
1518                         continue;
1519                 litevm_mmu_reset_context(vcpu);
1520                 vcpu_put(vcpu);
1521         }
1522
1523         litevm_free_physmem_slot(&old, &new);
1524         return 0;
1525
1526 out_unlock:
1527         spin_unlock(&litevm->lock);
1528 out_free:
1529         litevm_free_physmem_slot(&new, &old);
1530 out:
1531         return r;
1532 }
1533
1534 #if 0
1535 /*
1536  * Get (and clear) the dirty memory log for a memory slot.
1537  */
1538 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1539                                        struct litevm_dirty_log *log)
1540 {
1541         struct litevm_memory_slot *memslot;
1542         int r, i;
1543         int n;
1544         unsigned long any = 0;
1545
1546         spin_lock(&litevm->lock);
1547
1548         /*
1549          * Prevent changes to guest memory configuration even while the lock
1550          * is not taken.
1551          */
1552         ++litevm->busy;
1553         spin_unlock(&litevm->lock);
1554         r = -EINVAL;
1555         if (log->slot >= LITEVM_MEMORY_SLOTS)
1556                 goto out;
1557
1558         memslot = &litevm->memslots[log->slot];
1559         r = -ENOENT;
1560         if (!memslot->dirty_bitmap)
1561                 goto out;
1562
1563         n = ALIGN(memslot->npages, 8) / 8;
1564
1565         for (i = 0; !any && i < n; ++i)
1566                 any = memslot->dirty_bitmap[i];
1567
1568         r = -EFAULT;
1569         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1570                 goto out;
1571
1572
1573         if (any) {
1574                 spin_lock(&litevm->lock);
1575                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1576                 spin_unlock(&litevm->lock);
1577                 memset(memslot->dirty_bitmap, 0, n);
1578                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1579                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1580
1581                         if (!vcpu)
1582                                 continue;
1583                         flush_guest_tlb(vcpu);
1584                         vcpu_put(vcpu);
1585                 }
1586         }
1587
1588         r = 0;
1589
1590 out:
1591         spin_lock(&litevm->lock);
1592         --litevm->busy;
1593         spin_unlock(&litevm->lock);
1594         return r;
1595 }
1596 #endif
1597
1598 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1599 {
1600         int i;
1601
1602         for (i = 0; i < litevm->nmemslots; ++i) {
1603                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1604
1605                 if (gfn >= memslot->base_gfn
1606                     && gfn < memslot->base_gfn + memslot->npages)
1607                         return memslot;
1608         }
1609         return 0;
1610 }
1611
1612 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1613 {
1614         int i;
1615         struct litevm_memory_slot *memslot = 0;
1616         unsigned long rel_gfn;
1617
1618         for (i = 0; i < litevm->nmemslots; ++i) {
1619                 memslot = &litevm->memslots[i];
1620
1621                 if (gfn >= memslot->base_gfn
1622                     && gfn < memslot->base_gfn + memslot->npages) {
1623
1624                         if (!memslot || !memslot->dirty_bitmap)
1625                                 return;
1626
1627                         rel_gfn = gfn - memslot->base_gfn;
1628
1629                         /* avoid RMW */
1630                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1631                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1632                         return;
1633                 }
1634         }
1635 }
1636
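/*
 * Advance the guest RIP past the instruction that caused the exit (its
 * length is in VM_EXIT_INSTRUCTION_LEN) and clear any STI/MOV-SS interrupt
 * shadow, since the instruction has now effectively executed.
 */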
1637 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1638 {
1639         unsigned long rip;
1640         uint32_t interruptibility;
1641
1642         rip = vmcs_readl(GUEST_RIP);
1643         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1644         vmcs_writel(GUEST_RIP, rip);
1645
1646         /*
1647          * We emulated an instruction, so temporary interrupt blocking
1648          * should be removed, if set.
1649          */
1650         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1651         if (interruptibility & 3)
1652                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
1653                              interruptibility & ~3);
1654 }
1655
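/*
 * x86_emulate callbacks.  The "std" variants copy through guest memory that
 * is backed by memslot pages; the "emulated" variants fall back to recording
 * a pending MMIO request in the vcpu (mmio_needed, mmio_phys_addr,
 * mmio_size) to be completed outside the emulator.
 */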
1656 static int emulator_read_std(unsigned long addr,
1657                              unsigned long *val,
1658                              unsigned int bytes,
1659                              struct x86_emulate_ctxt *ctxt)
1660 {
1661         struct litevm_vcpu *vcpu = ctxt->vcpu;
1662         void *data = val;
1663
1664         while (bytes) {
1665                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1666                 unsigned offset = addr & (PAGE_SIZE-1);
1667                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ? 
1668                         bytes : (unsigned)PAGE_SIZE - offset;
1669                 unsigned long pfn;
1670                 struct litevm_memory_slot *memslot;
1671                 void *page;
1672
1673                 if (gpa == UNMAPPED_GVA)
1674                         return X86EMUL_PROPAGATE_FAULT;
1675                 pfn = gpa >> PAGE_SHIFT;
1676                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1677                 if (!memslot)
1678                         return X86EMUL_UNHANDLEABLE;
1679                 page = page2kva(gfn_to_page(memslot, pfn));
1680
1681                 memcpy(data, page + offset, tocopy);
1682
1683                 bytes -= tocopy;
1684                 data += tocopy;
1685                 addr += tocopy;
1686         }
1687
1688         return X86EMUL_CONTINUE;
1689 }
1690
1691 static int emulator_write_std(unsigned long addr,
1692                               unsigned long val,
1693                               unsigned int bytes,
1694                               struct x86_emulate_ctxt *ctxt)
1695 {
1696         printk("emulator_write_std: addr %lx n %d\n",
1697                addr, bytes);
1698         return X86EMUL_UNHANDLEABLE;
1699 }
1700
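/*
 * Emulator callback for reads that may hit MMIO: consume a previously
 * completed MMIO read if one is pending, fall back to ordinary guest memory,
 * and otherwise record an MMIO read request for userspace to complete.
 */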
1701 static int emulator_read_emulated(unsigned long addr,
1702                                   unsigned long *val,
1703                                   unsigned int bytes,
1704                                   struct x86_emulate_ctxt *ctxt)
1705 {
1706         struct litevm_vcpu *vcpu = ctxt->vcpu;
1707
1708         if (vcpu->mmio_read_completed) {
1709                 memcpy(val, vcpu->mmio_data, bytes);
1710                 vcpu->mmio_read_completed = 0;
1711                 return X86EMUL_CONTINUE;
1712         } else if (emulator_read_std(addr, val, bytes, ctxt)
1713                    == X86EMUL_CONTINUE)
1714                 return X86EMUL_CONTINUE;
1715         else {
1716                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1717                 if (gpa == UNMAPPED_GVA)
1718                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
1719                 vcpu->mmio_needed = 1;
1720                 vcpu->mmio_phys_addr = gpa;
1721                 vcpu->mmio_size = bytes;
1722                 vcpu->mmio_is_write = 0;
1723
1724                 return X86EMUL_UNHANDLEABLE;
1725         }
1726 }
1727
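/*
 * Emulator callback for writes: record the access as an MMIO write request
 * for userspace to perform.
 */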
1728 static int emulator_write_emulated(unsigned long addr,
1729                                    unsigned long val,
1730                                    unsigned int bytes,
1731                                    struct x86_emulate_ctxt *ctxt)
1732 {
1733         struct litevm_vcpu *vcpu = ctxt->vcpu;
1734         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1735
1736         if (gpa == UNMAPPED_GVA)
1737                 return X86EMUL_PROPAGATE_FAULT;
1738
1739         vcpu->mmio_needed = 1;
1740         vcpu->mmio_phys_addr = gpa;
1741         vcpu->mmio_size = bytes;
1742         vcpu->mmio_is_write = 1;
1743         memcpy(vcpu->mmio_data, &val, bytes);
1744
1745         return X86EMUL_CONTINUE;
1746 }
1747
1748 static int emulator_cmpxchg_emulated(unsigned long addr,
1749                                      unsigned long old,
1750                                      unsigned long new,
1751                                      unsigned int bytes,
1752                                      struct x86_emulate_ctxt *ctxt)
1753 {
1754         static int reported;
1755
1756         if (!reported) {
1757                 reported = 1;
1758                 printk("litevm: emulating exchange as write\n");
1759         }
1760         return emulator_write_emulated(addr, new, bytes, ctxt);
1761 }
1762
1763 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1764 {
1765         static int reported;
1766         uint8_t opcodes[4];
1767         unsigned long rip = vmcs_readl(GUEST_RIP);
1768         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
1769
1770         if (reported)
1771                 return;
1772
1773         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1774
1775         printk("emulation failed but !mmio_needed?"
1776                " rip %lx %02x %02x %02x %02x\n",
1777                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1778         reported = 1;
1779 }
1780
1781 struct x86_emulate_ops emulate_ops = {
1782         .read_std            = emulator_read_std,
1783         .write_std           = emulator_write_std,
1784         .read_emulated       = emulator_read_emulated,
1785         .write_emulated      = emulator_write_emulated,
1786         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1787 };
1788
1789 enum emulation_result {
1790         EMULATE_DONE,       /* no further processing */
1791         EMULATE_DO_MMIO,      /* litevm_run filled with mmio request */
1792         EMULATE_FAIL,         /* can't emulate this instruction */
1793 };
1794
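/*
 * Feed the current guest instruction to the x86 emulator.  Returns
 * EMULATE_DONE on success, EMULATE_DO_MMIO once litevm_run holds an MMIO
 * request, or EMULATE_FAIL if the instruction cannot be emulated.
 */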
1795 static int emulate_instruction(struct litevm_vcpu *vcpu,
1796                                struct litevm_run *run,
1797                                unsigned long cr2,
1798                                uint16_t error_code)
1799 {
1800         struct x86_emulate_ctxt emulate_ctxt;
1801         int r;
1802         uint32_t cs_ar;
1803
1804         vcpu_load_rsp_rip(vcpu);
1805
1806         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1807
1808         emulate_ctxt.vcpu = vcpu;
1809         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
1810         emulate_ctxt.cr2 = cr2;
1811         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1812                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
1813                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
1814                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1815
1816         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1817                 emulate_ctxt.cs_base = 0;
1818                 emulate_ctxt.ds_base = 0;
1819                 emulate_ctxt.es_base = 0;
1820                 emulate_ctxt.ss_base = 0;
1821                 emulate_ctxt.gs_base = 0;
1822                 emulate_ctxt.fs_base = 0;
1823         } else {
1824                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
1825                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
1826                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
1827                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
1828                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
1829                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
1830         }
1831
1832         vcpu->mmio_is_write = 0;
1833         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1834
1835         if ((r || vcpu->mmio_is_write) && run) {
1836                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1837                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1838                 run->mmio.len = vcpu->mmio_size;
1839                 run->mmio.is_write = vcpu->mmio_is_write;
1840         }
1841
1842         if (r) {
1843                 if (!vcpu->mmio_needed) {
1844                         report_emulation_failure(&emulate_ctxt);
1845                         return EMULATE_FAIL;
1846                 }
1847                 return EMULATE_DO_MMIO;
1848         }
1849
1850         vcpu_put_rsp_rip(vcpu);
1851         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
1852
1853         if (vcpu->mmio_is_write)
1854                 return EMULATE_DO_MMIO;
1855
1856         return EMULATE_DONE;
1857 }
1858
1859 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
1860 {
1861         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1862 }
1863
1864 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
1865 {
1866         vmcs_writel(GUEST_GDTR_BASE, base);
1867         vmcs_write32(GUEST_GDTR_LIMIT, limit);
1868 }
1869
1870 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
1871 {
1872         vmcs_writel(GUEST_IDTR_BASE, base);
1873         vmcs_write32(GUEST_IDTR_LIMIT, limit);
1874 }
1875
1876 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
1877                    unsigned long *rflags)
1878 {
1879         lmsw(vcpu, msw);
1880         *rflags = vmcs_readl(GUEST_RFLAGS);
1881 }
1882
1883 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
1884 {
1885         switch (cr) {
1886         case 0:
1887                 return guest_cr0();
1888         case 2:
1889                 return vcpu->cr2;
1890         case 3:
1891                 return vcpu->cr3;
1892         case 4:
1893                 return guest_cr4();
1894         default:
1895                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1896                 return 0;
1897         }
1898 }
1899
1900 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
1901                      unsigned long *rflags)
1902 {
1903         switch (cr) {
1904         case 0:
1905                 set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
1906                 *rflags = vmcs_readl(GUEST_RFLAGS);
1907                 break;
1908         case 2:
1909                 vcpu->cr2 = val;
1910                 break;
1911         case 3:
1912                 set_cr3(vcpu, val);
1913                 break;
1914         case 4:
1915                 set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
1916                 break;
1917         default:
1918                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1919         }
1920 }
1921
1922 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
1923                                   int vec, uint32_t err_code)
1924 {
1925         if (!vcpu->rmode.active)
1926                 return 0;
1927
1928         if (vec == GP_VECTOR && err_code == 0)
1929                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
1930                         return 1;
1931         return 0;
1932 }
1933
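/*
 * VM-exit handler for exceptions and NMIs: requeue any external interrupt
 * that was being delivered, service guest page faults through the shadow MMU
 * (emulating the faulting instruction when needed), and hand everything else
 * back to userspace.
 */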
1934 static int handle_exception(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
1935 {
1936         uint32_t intr_info, error_code;
1937         unsigned long cr2, rip;
1938         uint32_t vect_info;
1939         enum emulation_result er;
1940
1941         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1942         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1943
1944         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1945                                                 !is_page_fault(intr_info)) {
1946                 printk("%s: unexpected, vectoring info 0x%x "
1947                        "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1948         }
1949
1950         if (is_external_interrupt(vect_info)) {
1951                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1952                 SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_pending), irq);
1953                 SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_summary), irq / BITS_PER_LONG);
1954         }
1955
1956         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
1957                 asm ("int $2");
1958                 return 1;
1959         }
1960         error_code = 0;
1961         rip = vmcs_readl(GUEST_RIP);
1962         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1963                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1964         if (is_page_fault(intr_info)) {
1965                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1966
1967                 spin_lock(&vcpu->litevm->lock);
1968                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
1969                         spin_unlock(&vcpu->litevm->lock);
1970                         return 1;
1971                 }
1972
1973                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
1974                 spin_unlock(&vcpu->litevm->lock);
1975
1976                 switch (er) {
1977                 case EMULATE_DONE:
1978                         return 1;
1979                 case EMULATE_DO_MMIO:
1980                         ++litevm_stat.mmio_exits;
1981                         litevm_run->exit_reason = LITEVM_EXIT_MMIO;
1982                         return 0;
1983                 case EMULATE_FAIL:
1984                         vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
1985                         break;
1986                 default:
1987                         assert(0);
1988                 }
1989         }
1990
1991         if (vcpu->rmode.active &&
1992             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1993                                                                 error_code))
1994                 return 1;
1995
1996         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1997                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
1998                 return 0;
1999         }
2000         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2001         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2002         litevm_run->ex.error_code = error_code;
2003         return 0;
2004 }
2005
2006 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2007                                      struct litevm_run *litevm_run)
2008 {
2009         ++litevm_stat.irq_exits;
2010         return 1;
2011 }
2012
2013
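/*
 * Decode the prefix bytes of the string I/O instruction at the guest RIP to
 * find the effective address size, then return the repeat count from RCX
 * masked to that width.  Returns 1 on success, 0 if the instruction could not
 * be decoded.
 */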
2014 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t *count)
2015 {
2016         uint64_t inst;
2017         gva_t rip;
2018         int countr_size;
2019         int i, n;
2020
2021         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2022                 countr_size = 2;
2023         } else {
2024                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2025
2026                 countr_size = (cs_ar & AR_L_MASK) ? 8:
2027                               (cs_ar & AR_DB_MASK) ? 4: 2;
2028         }
2029
2030         rip =  vmcs_readl(GUEST_RIP);
2031         if (countr_size != 8)
2032                 rip += vmcs_readl(GUEST_CS_BASE);
2033
2034         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2035
2036         for (i = 0; i < n; i++) {
2037                 switch (((uint8_t*)&inst)[i]) {
2038                 case 0xf0:
2039                 case 0xf2:
2040                 case 0xf3:
2041                 case 0x2e:
2042                 case 0x36:
2043                 case 0x3e:
2044                 case 0x26:
2045                 case 0x64:
2046                 case 0x65:
2047                 case 0x66:
2048                         break;
2049                 case 0x67:
2050                         countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
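                        /* fall through */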
2051                 default:
2052                         goto done;
2053                 }
2054         }
2055         return 0;
2056 done:
2057         countr_size *= 8;
2058         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2059         return 1;
2060 }
2061
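/*
 * VM-exit handler for IN/OUT: decode the exit qualification into an I/O
 * request (port, size, direction, string/rep details) for userspace.
 */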
2062 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2063 {
2064         uint64_t exit_qualification;
2065
2066         ++litevm_stat.io_exits;
2067         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2068         litevm_run->exit_reason = LITEVM_EXIT_IO;
2069         if (exit_qualification & 8)
2070                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2071         else
2072                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2073         litevm_run->io.size = (exit_qualification & 7) + 1;
2074         litevm_run->io.string = (exit_qualification & 16) != 0;
2075         litevm_run->io.string_down
2076                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2077         litevm_run->io.rep = (exit_qualification & 32) != 0;
2078         litevm_run->io.port = exit_qualification >> 16;
2079         if (litevm_run->io.string) {
2080                 if (!get_io_count(vcpu, &litevm_run->io.count))
2081                         return 1;
2082                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2083         } else
2084                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
2085         return 0;
2086 }
2087
2088 static int handle_invlpg(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2089 {
2090         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2091         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2092         spin_lock(&vcpu->litevm->lock);
2093         vcpu->mmu.inval_page(vcpu, address);
2094         spin_unlock(&vcpu->litevm->lock);
2095         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2096         return 1;
2097 }
2098
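/*
 * VM-exit handler for control register accesses: emulate mov to CR0, CR3, CR4
 * and CR8, mov from CR3 and CR8, and lmsw; anything else is punted back to
 * userspace.
 */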
2099 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2100 {
2101         uint64_t exit_qualification;
2102         int cr;
2103         int reg;
2104
2105 #ifdef LITEVM_DEBUG
2106         if (guest_cpl() != 0) {
2107                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2108                 inject_gp(vcpu);
2109                 return 1;
2110         }
2111 #endif
2112
2113         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2114         cr = exit_qualification & 15;
2115         reg = (exit_qualification >> 8) & 15;
2116         switch ((exit_qualification >> 4) & 3) {
2117         case 0: /* mov to cr */
2118                 switch (cr) {
2119                 case 0:
2120                         vcpu_load_rsp_rip(vcpu);
2121                         set_cr0(vcpu, vcpu->regs[reg]);
2122                         skip_emulated_instruction(vcpu);
2123                         return 1;
2124                 case 3:
2125                         vcpu_load_rsp_rip(vcpu);
2126                         set_cr3(vcpu, vcpu->regs[reg]);
2127                         skip_emulated_instruction(vcpu);
2128                         return 1;
2129                 case 4:
2130                         vcpu_load_rsp_rip(vcpu);
2131                         set_cr4(vcpu, vcpu->regs[reg]);
2132                         skip_emulated_instruction(vcpu);
2133                         return 1;
2134                 case 8:
2135                         vcpu_load_rsp_rip(vcpu);
2136                         set_cr8(vcpu, vcpu->regs[reg]);
2137                         skip_emulated_instruction(vcpu);
2138                         return 1;
2139                 }
2140                 break;
2141         case 1: /*mov from cr*/
2142                 switch (cr) {
2143                 case 3:
2144                         vcpu_load_rsp_rip(vcpu);
2145                         vcpu->regs[reg] = vcpu->cr3;
2146                         vcpu_put_rsp_rip(vcpu);
2147                         skip_emulated_instruction(vcpu);
2148                         return 1;
2149                 case 8:
2150                         printd("handle_cr: read CR8 "
2151                                "cpu erratum AA15\n");
2152                         vcpu_load_rsp_rip(vcpu);
2153                         vcpu->regs[reg] = vcpu->cr8;
2154                         vcpu_put_rsp_rip(vcpu);
2155                         skip_emulated_instruction(vcpu);
2156                         return 1;
2157                 }
2158                 break;
2159         case 3: /* lmsw */
2160                 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2161
2162                 skip_emulated_instruction(vcpu);
2163                 return 1;
2164         default:
2165                 break;
2166         }
2167         litevm_run->exit_reason = 0;
2168         printk("litevm: unhandled control register: op %d cr %d\n",
2169                (int)(exit_qualification >> 4) & 3, cr);
2170         return 0;
2171 }
2172
2173 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2174 {
2175         uint64_t exit_qualification;
2176         unsigned long val;
2177         int dr, reg;
2178
2179         /*
2180          * FIXME: this code assumes the host is debugging the guest.
2181          *        need to deal with guest debugging itself too.
2182          */
2183         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2184         dr = exit_qualification & 7;
2185         reg = (exit_qualification >> 8) & 15;
2186         vcpu_load_rsp_rip(vcpu);
2187         if (exit_qualification & 16) {
2188                 /* mov from dr */
2189                 switch (dr) {
2190                 case 6:
2191                         val = 0xffff0ff0;
2192                         break;
2193                 case 7:
2194                         val = 0x400;
2195                         break;
2196                 default:
2197                         val = 0;
2198                 }
2199                 vcpu->regs[reg] = val;
2200         } else {
2201                 /* mov to dr */
2202         }
2203         vcpu_put_rsp_rip(vcpu);
2204         skip_emulated_instruction(vcpu);
2205         return 1;
2206 }
2207
2208 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2209 {
2210         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2211         return 0;
2212 }
2213
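/*
 * VM-exit handler for rdmsr: satisfy the read from the VMCS, the vcpu's saved
 * MSR array or the virtual APIC base, returning the value in EDX:EAX.
 */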
2214 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2215 {
2216         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2217         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2218         uint64_t data;
2219
2220 #ifdef LITEVM_DEBUG
2221         if (guest_cpl() != 0) {
2222                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2223                 inject_gp(vcpu);
2224                 return 1;
2225         }
2226 #endif
2227
2228         switch (ecx) {
2229 #ifdef __x86_64__
2230         case MSR_FS_BASE:
2231                 data = vmcs_readl(GUEST_FS_BASE);
2232                 break;
2233         case MSR_GS_BASE:
2234                 data = vmcs_readl(GUEST_GS_BASE);
2235                 break;
2236 #endif
2237         case MSR_IA32_SYSENTER_CS:
2238                 data = vmcs_read32(GUEST_SYSENTER_CS);
2239                 break;
2240         case MSR_IA32_SYSENTER_EIP:
2241                 data = vmcs_read32(GUEST_SYSENTER_EIP);
2242                 break;
2243         case MSR_IA32_SYSENTER_ESP:
2244                 data = vmcs_read32(GUEST_SYSENTER_ESP);
2245                 break;
2246         case MSR_IA32_MC0_CTL:
2247         case MSR_IA32_MCG_STATUS:
2248         case MSR_IA32_MCG_CAP:
2249         case MSR_IA32_MC0_MISC:
2250         case MSR_IA32_MC0_MISC+4:
2251         case MSR_IA32_MC0_MISC+8:
2252         case MSR_IA32_MC0_MISC+12:
2253         case MSR_IA32_MC0_MISC+16:
2254         case MSR_IA32_UCODE_REV:
2255                 /* MTRR registers */
2256         case 0xfe:
2257         case 0x200 ... 0x2ff:
2258                 data = 0;
2259                 break;
2260         case MSR_IA32_APICBASE:
2261                 data = vcpu->apic_base;
2262                 break;
2263         default:
2264                 if (msr) {
2265                         data = msr->data;
2266                         break;
2267                 }
2268                 printk("litevm: unhandled rdmsr: %x\n", ecx);
2269                 inject_gp(vcpu);
2270                 return 1;
2271         }
2272
2273         /* FIXME: handling of bits 32:63 of rax, rdx */
2274         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2275         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2276         skip_emulated_instruction(vcpu);
2277         return 1;
2278 }
2279
2280 #ifdef __x86_64__
2281
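/*
 * Emulate a guest write to MSR_EFER: refuse writes to reserved bits and LME
 * changes while paging is enabled, and preserve the current LMA setting.
 */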
2282 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2283 {
2284         struct vmx_msr_entry *msr;
2285
2286         if (efer & EFER_RESERVED_BITS) {
2287                 printd("set_efer: 0x%llx #GP, reserved bits\n",
2288                        efer);
2289                 inject_gp(vcpu);
2290                 return;
2291         }
2292
2293         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2294                 printd("set_efer: #GP, change LME while paging\n");
2295                 inject_gp(vcpu);
2296                 return;
2297         }
2298
2299         efer &= ~EFER_LMA;
2300         efer |= vcpu->shadow_efer & EFER_LMA;
2301
2302         vcpu->shadow_efer = efer;
2303
2304         msr = find_msr_entry(vcpu, MSR_EFER);
2305
2306         if (!(efer & EFER_LMA))
2307             efer &= ~EFER_LME;
2308         msr->data = efer;
2309         skip_emulated_instruction(vcpu);
2310 }
2311
2312 #endif
2313
2314 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2315
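/*
 * VM-exit handler for wrmsr: route the EDX:EAX value to the VMCS, the TSC
 * offset, the virtual APIC base or the vcpu's saved MSR array.
 */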
2316 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2317 {
2318         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2319         struct vmx_msr_entry *msr;
2320         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2321                 | ((uint64_t)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2322
2323 #ifdef LITEVM_DEBUG
2324         if (guest_cpl() != 0) {
2325                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2326                 inject_gp(vcpu);
2327                 return 1;
2328         }
2329 #endif
2330
2331         switch (ecx) {
2332 #ifdef __x86_64__
2333         case MSR_FS_BASE:
2334                 vmcs_writel(GUEST_FS_BASE, data);
2335                 break;
2336         case MSR_GS_BASE:
2337                 vmcs_writel(GUEST_GS_BASE, data);
2338                 break;
2339 #endif
2340         case MSR_IA32_SYSENTER_CS:
2341                 vmcs_write32(GUEST_SYSENTER_CS, data);
2342                 break;
2343         case MSR_IA32_SYSENTER_EIP:
2344                 vmcs_write32(GUEST_SYSENTER_EIP, data);
2345                 break;
2346         case MSR_IA32_SYSENTER_ESP:
2347                 vmcs_write32(GUEST_SYSENTER_ESP, data);
2348                 break;
2349 #ifdef __x86_64__
2350         case MSR_EFER:
2351                 set_efer(vcpu, data);
2352                 return 1;
2353         case MSR_IA32_MC0_STATUS:
2354                 printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
2355                             , __FUNCTION__, data);
2356                 break;
2357 #endif
2358         case MSR_IA32_TIME_STAMP_COUNTER: {
2359                 uint64_t tsc;
2360                 
2361                 tsc = read_tsc();
2362                 vmcs_write64(TSC_OFFSET, data - tsc);
2363                 break;
2364         }
2365         case MSR_IA32_UCODE_REV:
2366         case MSR_IA32_UCODE_WRITE:
2367         case 0x200 ... 0x2ff: /* MTRRs */
2368                 break;
2369         case MSR_IA32_APICBASE:
2370                 vcpu->apic_base = data;
2371                 break;
2372         default:
2373                 msr = find_msr_entry(vcpu, ecx);
2374                 if (msr) {
2375                         msr->data = data;
2376                         break;
2377                 }
2378                 printk("litevm: unhandled wrmsr: %x\n", ecx);
2379                 inject_gp(vcpu);
2380                 return 1;
2381         }
2382         skip_emulated_instruction(vcpu);
2383         return 1;
2384 }
2385
2386 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2387                                    struct litevm_run *litevm_run)
2388 {
2389         /* Turn off interrupt window reporting. */
2390         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2391                      vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2392                      & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2393         return 1;
2394 }
2395
2396 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2397 {
2398         skip_emulated_instruction(vcpu);
2399         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
2400                 return 1;
2401
2402         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2403         return 0;
2404 }
2405
2406 /*
2407  * The exit handlers return 1 if the exit was handled fully and guest execution
2408  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2409  * to be done by userspace and return 0.
2410  */
2411 static int (*litevm_vmx_exit_handlers[])(struct litevm_vcpu *vcpu,
2412                                       struct litevm_run *litevm_run) = {
2413         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
2414         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
2415         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
2416         [EXIT_REASON_INVLPG]                  = handle_invlpg,
2417         [EXIT_REASON_CR_ACCESS]               = handle_cr,
2418         [EXIT_REASON_DR_ACCESS]               = handle_dr,
2419         [EXIT_REASON_CPUID]                   = handle_cpuid,
2420         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
2421         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
2422         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
2423         [EXIT_REASON_HLT]                     = handle_halt,
2424 };
2425
2426 static const int litevm_vmx_max_exit_handlers =
2427         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2428
2429 /*
2430  * The guest has exited.  See if we can fix it or if we need userspace
2431  * assistance.
2432  */
2433 static int litevm_handle_exit(struct litevm_run *litevm_run, struct litevm_vcpu *vcpu)
2434 {
2435         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2436         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2437
2438         if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
2439                                 exit_reason != EXIT_REASON_EXCEPTION_NMI )
2440                 printk("%s: unexpected, valid vectoring info and "
2441                        "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2442         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2443         if (exit_reason < litevm_vmx_max_exit_handlers
2444             && litevm_vmx_exit_handlers[exit_reason])
2445                 return litevm_vmx_exit_handlers[exit_reason](vcpu, litevm_run);
2446         else {
2447                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2448                 litevm_run->hw.hardware_exit_reason = exit_reason;
2449         }
2450         return 0;
2451 }
2452
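/*
 * Inject an interrupt into a real-mode guest by hand: push FLAGS, CS and IP
 * onto the guest stack and vector through the real-mode IVT entry for @irq.
 */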
2453 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2454 {
2455         uint16_t ent[2];
2456         uint16_t cs;
2457         uint16_t ip;
2458         unsigned long flags;
2459         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2460         uint16_t sp =  vmcs_readl(GUEST_RSP);
2461         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2462
2463         if (sp > ss_limit || sp < 6) {
2464                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2465                             __FUNCTION__,
2466                             vmcs_readl(GUEST_RSP),
2467                             vmcs_readl(GUEST_SS_BASE),
2468                             vmcs_read32(GUEST_SS_LIMIT));
2469                 return;
2470         }
2471
2472         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2473                                                                 sizeof(ent)) {
2474                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2475                 return;
2476         }
2477
2478         flags =  vmcs_readl(GUEST_RFLAGS);
2479         cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
2480         ip =  vmcs_readl(GUEST_RIP);
2481
2482
2483         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2484             litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2485             litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2486                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2487                 return;
2488         }
2489
2490         vmcs_writel(GUEST_RFLAGS, flags &
2491                     ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2492         vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
2493         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2494         vmcs_writel(GUEST_RIP, ent[0]);
2495         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2496 }
2497
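/*
 * Deliver the lowest-numbered pending interrupt: clear it from the pending
 * bitmaps, then either inject it via the real-mode path or program the
 * VM-entry interruption-information field.
 */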
2498 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2499 {
2500         int word_index = __ffs(vcpu->irq_summary);
2501         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2502         int irq = word_index * BITS_PER_LONG + bit_index;
2503
2504         /* don't have clear_bit and I'm not sure the akaros
2505          * bitops are really going to work.
2506          */
2507         vcpu->irq_pending[word_index] &= ~(1UL << bit_index);
2508         if (!vcpu->irq_pending[word_index])
2509                 vcpu->irq_summary &= ~(1UL << word_index);
2510
2511         if (vcpu->rmode.active) {
2512                 inject_rmode_irq(vcpu, irq);
2513                 return;
2514         }
2515         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2516                         irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2517 }
2518
2519 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2520 {
2521         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2522             && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2523                 /*
2524                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2525                  */
2526                 litevm_do_inject_irq(vcpu);
2527         else
2528                 /*
2529                  * Interrupts blocked.  Wait for unblock.
2530                  */
2531                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2532                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2533                              | CPU_BASED_VIRTUAL_INTR_PENDING);
2534 }
2535
2536 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2537 {
2538         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2539
2540 #warning "no debugging guests yet"
2541         assert(0);
2542 /*
2543         set_debugreg(dbg->bp[0], 0);
2544         set_debugreg(dbg->bp[1], 1);
2545         set_debugreg(dbg->bp[2], 2);
2546         set_debugreg(dbg->bp[3], 3);
2547 */
2548         if (dbg->singlestep) {
2549                 unsigned long flags;
2550
2551                 flags = vmcs_readl(GUEST_RFLAGS);
2552                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2553                 vmcs_writel(GUEST_RFLAGS, flags);
2554         }
2555 }
2556
2557 static void load_msrs(struct vmx_msr_entry *e, int n)
2558 {
2559         int i;
2560
2561         for (i = 0; i < n; ++i)
2562                 write_msr(e[i].index, e[i].data);
2563 }
2564
2565 static void save_msrs(struct vmx_msr_entry *e, int n)
2566 {
2567         int i;
2568
2569         for (i = 0; i < n; ++i)
2570                 e[i].data = read_msr(e[i].index);
2571 }
2572
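/*
 * Enter the guest and keep running it until a VM exit needs userspace
 * attention.  Host segment, MSR and FPU state is saved and restored around
 * each vmlaunch/vmresume.
 */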
2573 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2574 {
2575         struct litevm_vcpu *vcpu;
2576         uint8_t fail;
2577         uint16_t fs_sel, gs_sel, ldt_sel;
2578         int fs_gs_ldt_reload_needed;
2579
2580         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2581                 return -EINVAL;
2582
2583         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2584         if (!vcpu)
2585                 return -ENOENT;
2586
2587         if (litevm_run->emulated) {
2588                 skip_emulated_instruction(vcpu);
2589                 litevm_run->emulated = 0;
2590         }
2591
2592         if (litevm_run->mmio_completed) {
2593                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
2594                 vcpu->mmio_read_completed = 1;
2595         }
2596
2597         vcpu->mmio_needed = 0;
2598
2599 again:
2600         /*
2601          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2602          * allow segment selectors with cpl > 0 or ti == 1.
2603          */
2604         fs_sel = read_fs();
2605         gs_sel = read_gs();
2606         ldt_sel = read_ldt();
2607         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
2608         if (!fs_gs_ldt_reload_needed) {
2609                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2610                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2611         } else {
2612                 vmcs_write16(HOST_FS_SELECTOR, 0);
2613                 vmcs_write16(HOST_GS_SELECTOR, 0);
2614         }
2615
2616 #ifdef __x86_64__
2617         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2618         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2619 #endif
2620
2621         if (vcpu->irq_summary &&
2622             !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2623                 litevm_try_inject_irq(vcpu);
2624
2625         if (vcpu->guest_debug.enabled)
2626                 litevm_guest_debug_pre(vcpu);
2627
2628         fx_save(vcpu->host_fx_image);
2629         fx_restore(vcpu->guest_fx_image);
2630
2631         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
2632         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2633
2634         asm (
2635                 /* Store host registers */
2636                 "pushf \n\t"
2637 #ifdef __x86_64__
2638                 "push %%rax; push %%rbx; push %%rdx;"
2639                 "push %%rsi; push %%rdi; push %%rbp;"
2640                 "push %%r8;  push %%r9;  push %%r10; push %%r11;"
2641                 "push %%r12; push %%r13; push %%r14; push %%r15;"
2642                 "push %%rcx \n\t"
2643                 "vmwrite %%rsp, %2 \n\t"
2644 #else
2645                 "pusha; push %%ecx \n\t"
2646                 "vmwrite %%esp, %2 \n\t"
2647 #endif
2648                 /* Check if vmlaunch or vmresume is needed */
2649                 "cmp $0, %1 \n\t"
2650                 /* Load guest registers.  Don't clobber flags. */
2651 #ifdef __x86_64__
2652                 "mov %c[cr2](%3), %%rax \n\t"
2653                 "mov %%rax, %%cr2 \n\t"
2654                 "mov %c[rax](%3), %%rax \n\t"
2655                 "mov %c[rbx](%3), %%rbx \n\t"
2656                 "mov %c[rdx](%3), %%rdx \n\t"
2657                 "mov %c[rsi](%3), %%rsi \n\t"
2658                 "mov %c[rdi](%3), %%rdi \n\t"
2659                 "mov %c[rbp](%3), %%rbp \n\t"
2660                 "mov %c[r8](%3),  %%r8  \n\t"
2661                 "mov %c[r9](%3),  %%r9  \n\t"
2662                 "mov %c[r10](%3), %%r10 \n\t"
2663                 "mov %c[r11](%3), %%r11 \n\t"
2664                 "mov %c[r12](%3), %%r12 \n\t"
2665                 "mov %c[r13](%3), %%r13 \n\t"
2666                 "mov %c[r14](%3), %%r14 \n\t"
2667                 "mov %c[r15](%3), %%r15 \n\t"
2668                 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
2669 #else
2670                 "mov %c[cr2](%3), %%eax \n\t"
2671                 "mov %%eax,   %%cr2 \n\t"
2672                 "mov %c[rax](%3), %%eax \n\t"
2673                 "mov %c[rbx](%3), %%ebx \n\t"
2674                 "mov %c[rdx](%3), %%edx \n\t"
2675                 "mov %c[rsi](%3), %%esi \n\t"
2676                 "mov %c[rdi](%3), %%edi \n\t"
2677                 "mov %c[rbp](%3), %%ebp \n\t"
2678                 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
2679 #endif
2680                 /* Enter guest mode */
2681                 "jne launched \n\t"
2682                 "vmlaunch \n\t"
2683                 "jmp litevm_vmx_return \n\t"
2684                 "launched: vmresume \n\t"
2685                 ".globl litevm_vmx_return \n\t"
2686                 "litevm_vmx_return: "
2687                 /* Save guest registers, load host registers, keep flags */
2688 #ifdef __x86_64__
2689                 "xchg %3,     0(%%rsp) \n\t"
2690                 "mov %%rax, %c[rax](%3) \n\t"
2691                 "mov %%rbx, %c[rbx](%3) \n\t"
2692                 "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
2693                 "mov %%rdx, %c[rdx](%3) \n\t"
2694                 "mov %%rsi, %c[rsi](%3) \n\t"
2695                 "mov %%rdi, %c[rdi](%3) \n\t"
2696                 "mov %%rbp, %c[rbp](%3) \n\t"
2697                 "mov %%r8,  %c[r8](%3) \n\t"
2698                 "mov %%r9,  %c[r9](%3) \n\t"
2699                 "mov %%r10, %c[r10](%3) \n\t"
2700                 "mov %%r11, %c[r11](%3) \n\t"
2701                 "mov %%r12, %c[r12](%3) \n\t"
2702                 "mov %%r13, %c[r13](%3) \n\t"
2703                 "mov %%r14, %c[r14](%3) \n\t"
2704                 "mov %%r15, %c[r15](%3) \n\t"
2705                 "mov %%cr2, %%rax   \n\t"
2706                 "mov %%rax, %c[cr2](%3) \n\t"
2707                 "mov 0(%%rsp), %3 \n\t"
2708
2709                 "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
2710                 "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
2711                 "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
2712                 "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
2713 #else
2714                 "xchg %3, 0(%%esp) \n\t"
2715                 "mov %%eax, %c[rax](%3) \n\t"
2716                 "mov %%ebx, %c[rbx](%3) \n\t"
2717                 "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
2718                 "mov %%edx, %c[rdx](%3) \n\t"
2719                 "mov %%esi, %c[rsi](%3) \n\t"
2720                 "mov %%edi, %c[rdi](%3) \n\t"
2721                 "mov %%ebp, %c[rbp](%3) \n\t"
2722                 "mov %%cr2, %%eax  \n\t"
2723                 "mov %%eax, %c[cr2](%3) \n\t"
2724                 "mov 0(%%esp), %3 \n\t"
2725
2726                 "pop %%ecx; popa \n\t"
2727 #endif
2728                 "setbe %0 \n\t"
2729                 "popf \n\t"
2730               : "=g" (fail)
2731               : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
2732                 "c"(vcpu),
2733                 [rax]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
2734                 [rbx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
2735                 [rcx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
2736                 [rdx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
2737                 [rsi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
2738                 [rdi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
2739                 [rbp]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
2740 #ifdef __x86_64__
2741                 [r8 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8 ])),
2742                 [r9 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9 ])),
2743                 [r10]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
2744                 [r11]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
2745                 [r12]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
2746                 [r13]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
2747                 [r14]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
2748                 [r15]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
2749 #endif
2750                 [cr2]"i"(offsetof(struct litevm_vcpu, cr2))
2751               : "cc", "memory" );
2752
2753         ++litevm_stat.exits;
2754
2755         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2756         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
2757
2758         fx_save(vcpu->guest_fx_image);
2759         fx_restore(vcpu->host_fx_image);
2760
2761 #ifndef __x86_64__
2762         asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2763 #endif
2764
2765         litevm_run->exit_type = 0;
2766         if (fail) {
2767                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
2768                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
2769         } else {
2770                 if (fs_gs_ldt_reload_needed) {
2771                         load_ldt(ldt_sel);
2772                         load_fs(fs_sel);
2773                         /*
2774                          * If we have to reload gs, we must take care to
2775                          * preserve our gs base.
2776                          */
2777                         disable_irq();
2778                         load_gs(gs_sel);
2779 #ifdef __x86_64__
2780                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
2781 #endif
2782                         enable_irq();
2783
2784                         reload_tss();
2785                 }
2786                 vcpu->launched = 1;
2787                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
2788                 if (litevm_handle_exit(litevm_run, vcpu)) {
2789                         /* Give scheduler a chance to reschedule. */
2790                         vcpu_put(vcpu);
2791 #warning "how to tell if signal is pending"
2792 /*
2793                         if (signal_pending(current)) {
2794                                 ++litevm_stat.signal_exits;
2795                                 return -EINTR;
2796                         }
2797 */
2798                         kthread_yield();
2799                         /* Cannot fail -  no vcpu unplug yet. */
2800                         vcpu_load(litevm, vcpu_slot(vcpu));
2801                         goto again;
2802                 }
2803         }
2804
2805         vcpu_put(vcpu);
2806         return 0;
2807 }
2808
2809 static int litevm_dev_ioctl_get_regs(struct litevm *litevm, struct litevm_regs *regs)
2810 {
2811         struct litevm_vcpu *vcpu;
2812
2813         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
2814                 return -EINVAL;
2815
2816         vcpu = vcpu_load(litevm, regs->vcpu);
2817         if (!vcpu)
2818                 return -ENOENT;
2819
2820         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2821         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2822         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2823         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2824         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2825         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2826         regs->rsp = vmcs_readl(GUEST_RSP);
2827         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2828 #ifdef __x86_64__
2829         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2830         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2831         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2832         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2833         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2834         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2835         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2836         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2837 #endif
2838
2839         regs->rip = vmcs_readl(GUEST_RIP);
2840         regs->rflags = vmcs_readl(GUEST_RFLAGS);
2841
2842         /*
2843          * Don't leak debug flags in case they were set for guest debugging
2844          */
2845         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2846                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2847
2848         vcpu_put(vcpu);
2849
2850         return 0;
2851 }
2852
2853 static int litevm_dev_ioctl_set_regs(struct litevm *litevm, struct litevm_regs *regs)
2854 {
2855         struct litevm_vcpu *vcpu;
2856
2857         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
2858                 return -EINVAL;
2859
2860         vcpu = vcpu_load(litevm, regs->vcpu);
2861         if (!vcpu)
2862                 return -ENOENT;
2863
2864         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2865         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2866         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2867         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2868         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2869         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2870         vmcs_writel(GUEST_RSP, regs->rsp);
2871         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2872 #ifdef __x86_64__
2873         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2874         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2875         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2876         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2877         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2878         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2879         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2880         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2881 #endif
2882
2883         vmcs_writel(GUEST_RIP, regs->rip);
2884         vmcs_writel(GUEST_RFLAGS, regs->rflags);
2885
2886         vcpu_put(vcpu);
2887
2888         return 0;
2889 }
2890
2891 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
2892 {
2893         struct litevm_vcpu *vcpu;
2894
2895         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
2896                 return -EINVAL;
2897         vcpu = vcpu_load(litevm, sregs->vcpu);
2898         if (!vcpu)
2899                 return -ENOENT;
2900
2901 #define get_segment(var, seg) \
2902         do { \
2903                 uint32_t ar; \
2904                 \
2905                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
2906                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
2907                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
2908                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
2909                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
2910                 sregs->var.type = ar & 15; \
2911                 sregs->var.s = (ar >> 4) & 1; \
2912                 sregs->var.dpl = (ar >> 5) & 3; \
2913                 sregs->var.present = (ar >> 7) & 1; \
2914                 sregs->var.avl = (ar >> 12) & 1; \
2915                 sregs->var.l = (ar >> 13) & 1; \
2916                 sregs->var.db = (ar >> 14) & 1; \
2917                 sregs->var.g = (ar >> 15) & 1; \
2918                 sregs->var.unusable = (ar >> 16) & 1; \
2919         } while (0);
2920
2921         get_segment(cs, CS);
2922         get_segment(ds, DS);
2923         get_segment(es, ES);
2924         get_segment(fs, FS);
2925         get_segment(gs, GS);
2926         get_segment(ss, SS);
2927
2928         get_segment(tr, TR);
2929         get_segment(ldt, LDTR);
2930 #undef get_segment
2931
2932 #define get_dtable(var, table) \
2933         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
2934                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
2935
2936         get_dtable(idt, IDTR);
2937         get_dtable(gdt, GDTR);
2938 #undef get_dtable
2939
2940         sregs->cr0 = guest_cr0();
2941         sregs->cr2 = vcpu->cr2;
2942         sregs->cr3 = vcpu->cr3;
2943         sregs->cr4 = guest_cr4();
2944         sregs->cr8 = vcpu->cr8;
2945         sregs->efer = vcpu->shadow_efer;
2946         sregs->apic_base = vcpu->apic_base;
2947
2948         sregs->pending_int = vcpu->irq_summary != 0;
2949
2950         vcpu_put(vcpu);
2951
2952         return 0;
2953 }
2954
2955 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
2956 {
2957         struct litevm_vcpu *vcpu;
2958         int mmu_reset_needed = 0;
2959
2960         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
2961                 return -EINVAL;
2962         vcpu = vcpu_load(litevm, sregs->vcpu);
2963         if (!vcpu)
2964                 return -ENOENT;
2965
2966 #define set_segment(var, seg) \
2967         do { \
2968                 uint32_t ar; \
2969                 \
2970                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
2971                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
2972                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
2973                 if (sregs->var.unusable) { \
2974                         ar = (1 << 16); \
2975                 } else { \
2976                         ar = (sregs->var.type & 15); \
2977                         ar |= (sregs->var.s & 1) << 4; \
2978                         ar |= (sregs->var.dpl & 3) << 5; \
2979                         ar |= (sregs->var.present & 1) << 7; \
2980                         ar |= (sregs->var.avl & 1) << 12; \
2981                         ar |= (sregs->var.l & 1) << 13; \
2982                         ar |= (sregs->var.db & 1) << 14; \
2983                         ar |= (sregs->var.g & 1) << 15; \
2984                 } \
2985                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
2986         } while (0);
2987
2988         set_segment(cs, CS);
2989         set_segment(ds, DS);
2990         set_segment(es, ES);
2991         set_segment(fs, FS);
2992         set_segment(gs, GS);
2993         set_segment(ss, SS);
2994
2995         set_segment(tr, TR);
2996
2997         set_segment(ldt, LDTR);
2998 #undef set_segment
2999
3000 #define set_dtable(var, table) \
3001         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
3002         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
3003
3004         set_dtable(idt, IDTR);
3005         set_dtable(gdt, GDTR);
3006 #undef set_dtable
3007
3008         vcpu->cr2 = sregs->cr2;
3009         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
3010         vcpu->cr3 = sregs->cr3;
3011
3012         vcpu->cr8 = sregs->cr8;
3013
3014         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3015 #ifdef __x86_64__
3016         __set_efer(vcpu, sregs->efer);
3017 #endif
3018         vcpu->apic_base = sregs->apic_base;
3019
3020         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3021         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3022         update_exception_bitmap(vcpu);
3023         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3024         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3025
3026         mmu_reset_needed |=  guest_cr4() != sregs->cr4;
3027         __set_cr4(vcpu, sregs->cr4);
3028
3029         if (mmu_reset_needed)
3030                 litevm_mmu_reset_context(vcpu);
3031         vcpu_put(vcpu);
3032
3033         return 0;
3034 }
3035
3036 /*
3037  * Translate a guest virtual address to a guest physical address.
3038  */
3039 static int litevm_dev_ioctl_translate(struct litevm *litevm, struct litevm_translation *tr)
3040 {
3041         unsigned long vaddr = tr->linear_address;
3042         struct litevm_vcpu *vcpu;
3043         gpa_t gpa;
3044
3045         vcpu = vcpu_load(litevm, tr->vcpu);
3046         if (!vcpu)
3047                 return -ENOENT;
3048         spin_lock(&litevm->lock);
3049         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3050         tr->physical_address = gpa;
3051         tr->valid = gpa != UNMAPPED_GVA;
3052         tr->writeable = 1;
3053         tr->usermode = 0;
3054         spin_unlock(&litevm->lock);
3055         vcpu_put(vcpu);
3056
3057         return 0;
3058 }
3059
3060 #if 0
3061 static int litevm_dev_ioctl_interrupt(struct litevm *litevm, struct litevm_interrupt *irq)
3062 {
3063         struct litevm_vcpu *vcpu;
3064
3065         if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
3066                 return -EINVAL;
3067         if (irq->irq < 0 || irq->irq >= 256)
3068                 return -EINVAL;
3069         vcpu = vcpu_load(litevm, irq->vcpu);
3070         if (!vcpu)
3071                 return -ENOENT;
3072
3073         set_bit(irq->irq, vcpu->irq_pending);
3074         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
3075
3076         vcpu_put(vcpu);
3077
3078         return 0;
3079 }
3080 #endif
3081
3082 #if 0
3083 static int litevm_dev_ioctl_debug_guest(struct litevm *litevm,
3084                                      struct litevm_debug_guest *dbg)
3085 {
3086         struct litevm_vcpu *vcpu;
3087         unsigned long dr7 = 0x400;
3088         uint32_t exception_bitmap;
3089         int old_singlestep;
3090
3091         if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS)
3092                 return -EINVAL;
3093         vcpu = vcpu_load(litevm, dbg->vcpu);
3094         if (!vcpu)
3095                 return -ENOENT;
3096
3097         exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
3098         old_singlestep = vcpu->guest_debug.singlestep;
3099
3100         vcpu->guest_debug.enabled = dbg->enabled;
3101         if (vcpu->guest_debug.enabled) {
3102                 int i;
3103
3104                 dr7 |= 0x200;  /* exact */
3105                 for (i = 0; i < 4; ++i) {
3106                         if (!dbg->breakpoints[i].enabled)
3107                                 continue;
3108                         vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
3109                         dr7 |= 2 << (i*2);    /* global enable */
3110                         dr7 |= 0 << (i*4+16); /* execution breakpoint */
3111                 }
3112
3113                 exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
3114
3115                 vcpu->guest_debug.singlestep = dbg->singlestep;
3116         } else {
3117                 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
3118                 vcpu->guest_debug.singlestep = 0;
3119         }
3120
3121         if (old_singlestep && !vcpu->guest_debug.singlestep) {
3122                 unsigned long flags;
3123
3124                 flags = vmcs_readl(GUEST_RFLAGS);
3125                 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3126                 vmcs_writel(GUEST_RFLAGS, flags);
3127         }
3128
3129         vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
3130         vmcs_writel(GUEST_DR7, dr7);
3131
3132         vcpu_put(vcpu);
3133
3134         return 0;
3135 }
3136 #endif
3137
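/* Top-level command dispatcher, currently compiled out: each command copies
 * its argument struct in from user space, calls the matching
 * litevm_dev_ioctl_*() helper, and, where the struct is also an output
 * (e.g. LITEVM_RUN, LITEVM_GET_REGS, LITEVM_TRANSLATE), copies it back out. */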
3138 #if 0
3139 long litevm_control(struct litevm *litevm, int command, unsigned long arg)
3140 {
3141         int r = -EINVAL;
3142
3143         switch (command) {
3144         case LITEVM_CREATE_VCPU: {
3145                 r = create_vcpu(litevm, arg);
3146                 if (r)
3147                         goto out;
3148                 break;
3149         }
3150         case LITEVM_RUN: {
3151                 struct litevm_run litevm_run;
3152
3153                 r = -EFAULT;
3154                 if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run))
3155                         goto out;
3156                 r = litevm_dev_ioctl_run(litevm, &litevm_run);
3157                 if (r < 0)
3158                         goto out;
3159                 r = -EFAULT;
3160                 if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run))
3161                         goto out;
3162                 r = 0;
3163                 break;
3164         }
3165         case LITEVM_GET_REGS: {
3166                 struct litevm_regs litevm_regs;
3167
3168                 r = -EFAULT;
3169                 if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
3170                         goto out;
3171                 r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs);
3172                 if (r)
3173                         goto out;
3174                 r = -EFAULT;
3175                 if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs))
3176                         goto out;
3177                 r = 0;
3178                 break;
3179         }
3180         case LITEVM_SET_REGS: {
3181                 struct litevm_regs litevm_regs;
3182
3183                 r = -EFAULT;
3184                 if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
3185                         goto out;
3186                 r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs);
3187                 if (r)
3188                         goto out;
3189                 r = 0;
3190                 break;
3191         }
3192         case LITEVM_GET_SREGS: {
3193                 struct litevm_sregs litevm_sregs;
3194
3195                 r = -EFAULT;
3196                 if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3197                         goto out;
3198                 r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs);
3199                 if (r)
3200                         goto out;
3201                 r = -EFAULT;
3202                 if (copy_to_user((void *)arg, &litevm_sregs, sizeof litevm_sregs))
3203                         goto out;
3204                 r = 0;
3205                 break;
3206         }
3207         case LITEVM_SET_SREGS: {
3208                 struct litevm_sregs litevm_sregs;
3209
3210                 r = -EFAULT;
3211                 if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3212                         goto out;
3213                 r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs);
3214                 if (r)
3215                         goto out;
3216                 r = 0;
3217                 break;
3218         }
3219         case LITEVM_TRANSLATE: {
3220                 struct litevm_translation tr;
3221
3222                 r = -EFAULT;
3223                 if (copy_from_user(&tr, (void *)arg, sizeof tr))
3224                         goto out;
3225                 r = litevm_dev_ioctl_translate(litevm, &tr);
3226                 if (r)
3227                         goto out;
3228                 r = -EFAULT;
3229                 if (copy_to_user((void *)arg, &tr, sizeof tr))
3230                         goto out;
3231                 r = 0;
3232                 break;
3233         }
3234         case LITEVM_INTERRUPT: {
3235                 struct litevm_interrupt irq;
3236
3237                 r = -EFAULT;
3238                 if (copy_from_user(&irq, (void *)arg, sizeof irq))
3239                         goto out;
3240                 r = litevm_dev_ioctl_interrupt(litevm, &irq);
3241                 if (r)
3242                         goto out;
3243                 r = 0;
3244                 break;
3245         }
3246         case LITEVM_DEBUG_GUEST: {
3247                 struct litevm_debug_guest dbg;
3248
3249                 r = -EFAULT;
3250                 if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
3251                         goto out;
3252                 r = litevm_dev_ioctl_debug_guest(litevm, &dbg);
3253                 if (r)
3254                         goto out;
3255                 r = 0;
3256                 break;
3257         }
3258         case LITEVM_SET_MEMORY_REGION: {
3259                 struct litevm_memory_region litevm_mem;
3260
3261                 r = -EFAULT;
3262                 if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem))
3263                         goto out;
3264                 r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem);
3265                 if (r)
3266                         goto out;
3267                 break;
3268         }
3269         case LITEVM_GET_DIRTY_LOG: {
3270                 struct litevm_dirty_log log;
3271
3272                 r = -EFAULT;
3273                 if (copy_from_user(&log, (void *)arg, sizeof log))
3274                         goto out;
3275                 r = litevm_dev_ioctl_get_dirty_log(litevm, &log);
3276                 if (r)
3277                         goto out;
3278                 break;
3279         }
3280         default:
3281                 ;
3282         }
3283 out:
3284         return r;
3285 }
3286 #endif
3287
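/* mmap fault handler, currently compiled out (it still uses the Linux-style
 * vm_area_struct/vm_fault types): it resolves the faulting page offset to a
 * memory slot and hands back the backing page for that guest frame. */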
3288 #if 0
3289 static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3290 {
3291         struct litevm *litevm = vma->vm_file->private_data;
3292         struct litevm_memory_slot *slot;
3293         struct page *page;
3294
3295         slot = gfn_to_memslot(litevm, vmf->pgoff);
3296         if (!slot)
3297                 return VM_FAULT_SIGBUS;
3298         page = gfn_to_page(slot, vmf->pgoff);
3299         if (!page)
3300                 return VM_FAULT_SIGBUS;
3301
3302         get_page(page);
3303         vmf->page = page;
3304         return 0;
3305 }
3306 #endif
3307
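/* Reboot notifier, currently compiled out and stubbed with a panic: on
 * SYS_RESTART it would force every core out of VMX root mode, since some
 * BIOSes hang on reboot if the CPU is still in VMX operation. */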
3308 #if 0
3309 static int litevm_reboot(struct notifier_block *notifier, unsigned long val,
3310                        void *v)
3311 {
3312         panic("litevm_reboot");
3313         if (val == SYS_RESTART) {
3314                 /*
3315                  * Some (well, at least mine) BIOSes hang on reboot if
3316                  * in vmx root mode.
3317                  */
3318                 printk("litevm: exiting vmx mode\n");
3319                 handler_wrapper_t *w;
3320                 smp_call_function_all(litevm_disable, 0, &w);
3321                 smp_call_wait(w);
3322         }
3323         return NOTIFY_OK;
3325 }
3326 #endif
3327
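/* Physical address of a zeroed "bad page"; it appears to serve as a backing
 * frame when a guest physical address has no real mapping. */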
3328 hpa_t bad_page_address;
3329
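/* Bring up VMX support: verify hardware and BIOS support, build the VMCS
 * descriptor, enable VMX on every core via smp_call_function_all(), and
 * allocate the shared bad page.  Returns 0 on success or a negative errno. */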
3330 int vmx_init(void)
3331 {
3332         handler_wrapper_t *w;
3333         int r = 0;
3334
3335         if (!cpu_has_litevm_support()) {
3336                 printk("litevm: no hardware support\n");
3337                 return -EOPNOTSUPP;
3338         }
3339         if (vmx_disabled_by_bios()) {
3340                 printk("litevm: disabled by bios\n");
3341                 return -EOPNOTSUPP;
3342         }
3343
3344         setup_vmcs_descriptor();
3345         smp_call_function_all(vm_enable, 0, &w);
3346         if (smp_call_wait(w)){
3347                 printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
3348         }
3349
3350         if ((bad_page_address = PADDR(kpage_zalloc_addr())) == 0ULL) {
3351                 r = -ENOMEM;
3352         }
3353
3354         return r;
3355 }
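
/* Illustrative only: a sketch of calling vmx_init() from a boot-time hook,
 * assuming all cores are already online so smp_call_function_all() can reach
 * them.  The hook name below is hypothetical. */
#if 0
void example_boot_enable_vmx(void)
{
        int r = vmx_init();

        if (r)
                printk("vmx_init failed (%d); VM support disabled\n", r);
}
#endif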
3356
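/* Teardown is not implemented yet; freeing the litevm area and the bad page
 * (the commented-out calls below) remains a TODO. */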
3357 static void litevm_exit(void)
3358 {
3359         //free_litevm_area();
3360         //__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3361 }
3362
3363