1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #include <kmalloc.h>
17 #include <string.h>
18 #include <stdio.h>
19 #include <assert.h>
20 #include <error.h>
21 #include <pmap.h>
22 #include <sys/queue.h>
23 #include <smp.h>
24 #include <kref.h>
25 #include <atomic.h>
26 #include <alarm.h>
27 #include <event.h>
28 #include <umem.h>
29 #include <devalarm.h>
30 #include <arch/types.h>
31 #include <arch/vm.h>
32 #include <arch/emulate.h>
33 #include <arch/vmdebug.h>
34 #include <arch/msr-index.h>
35
36 #define currentcpu (&per_cpu_info[core_id()])
37
38 struct litevm_stat litevm_stat;
39
40 static struct litevm_stats_debugfs_item {
41         const char *name;
42         uint32_t *data;
43 } debugfs_entries[] = {
44         { "pf_fixed", &litevm_stat.pf_fixed },
45         { "pf_guest", &litevm_stat.pf_guest },
46         { "tlb_flush", &litevm_stat.tlb_flush },
47         { "invlpg", &litevm_stat.invlpg },
48         { "exits", &litevm_stat.exits },
49         { "io_exits", &litevm_stat.io_exits },
50         { "mmio_exits", &litevm_stat.mmio_exits },
51         { "signal_exits", &litevm_stat.signal_exits },
52         { "irq_exits", &litevm_stat.irq_exits },
53         { 0, 0 }
54 };
55
56 static struct dentry *debugfs_dir;
57
58 static const uint32_t vmx_msr_index[] = {
59 #ifdef __x86_64__
60         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
61 #endif
62         MSR_EFER, // MSR_K6_STAR, present in the original KVM list, is omitted here
63 };
64 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
65
66 #ifdef __x86_64__
67 /*
68  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
69  * save/load mechanism; this works around CPU erratum AA24.
70  */
71 #define NR_BAD_MSRS 2
72 #else
73 #define NR_BAD_MSRS 0
74 #endif
75
76 #define TSS_IOPB_BASE_OFFSET 0x66
77 #define TSS_BASE_SIZE 0x68
78 #define TSS_IOPB_SIZE (65536 / 8)
79 #define TSS_REDIRECTION_SIZE (256 / 8)
80 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
81
82 #define MSR_IA32_VMX_BASIC_MSR                  0x480
83 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
84 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
85 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
86 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
87
88 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
89 #define LMSW_GUEST_MASK 0x0eULL
90 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
91 //#define CR4_VMXE 0x2000
92 #define CR8_RESEVED_BITS (~0x0fULL)
93 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
94
95 #ifdef __x86_64__
96 #define HOST_IS_64 1
97 #else
98 #define HOST_IS_64 0
99 #endif
100
101 /* Bit ops are not yet widely used in Akaros, and we are not sure where to put them. */
102 /**
103  * __ffs - find first set bit in word
104  * @word: The word to search
105  *
106  * Undefined if no bit exists, so code should check against 0 first.
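 *
 * Example: __ffs(0x18) == 3, since bit 3 is the lowest set bit.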
107  */
108 static inline unsigned long __ffs(unsigned long word)
109 {
110         asm("rep; bsf %1,%0"
111                 : "=r" (word)
112                 : "rm" (word));
113         return word;
114 }
115
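/* Return the guest's saved copy of MSR 'msr' from the list built up in
 * litevm_vcpu_setup(), or 0 if that MSR is not tracked for this vcpu. */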
116 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu, uint32_t msr)
117 {
118         int i;
119
120         for (i = 0; i < vcpu->nmsrs; ++i)
121                 if (vcpu->guest_msrs[i].index == msr)
122                         return &vcpu->guest_msrs[i];
123         return 0;
124 }
125
126 struct descriptor_table {
127         uint16_t limit;
128         unsigned long base;
129 } __attribute__((packed));
130
131 static void get_gdt(struct descriptor_table *table)
132 {
133         asm ("sgdt %0" : "=m"(*table));
134 }
135
136 static void get_idt(struct descriptor_table *table)
137 {
138         asm ("sidt %0" : "=m"(*table));
139 }
140
141 static uint16_t read_fs(void)
142 {
143         uint16_t seg;
144         asm ("mov %%fs, %0" : "=g"(seg));
145         return seg;
146 }
147
148 static uint16_t read_gs(void)
149 {
150         uint16_t seg;
151         asm ("mov %%gs, %0" : "=g"(seg));
152         return seg;
153 }
154
155 static uint16_t read_ldt(void)
156 {
157         uint16_t ldt;
158         asm ("sldt %0" : "=g"(ldt));
159         return ldt;
160 }
161
162 static void load_fs(uint16_t sel)
163 {
164         asm ("mov %0, %%fs" : : "g"(sel));
165 }
166
167 static void load_gs(uint16_t sel)
168 {
169         asm ("mov %0, %%gs" : : "g"(sel));
170 }
171
172 #ifndef load_ldt
173 static void load_ldt(uint16_t sel)
174 {
175         asm ("lldt %0" : : "g"(sel));
176 }
177 #endif
178
179 static void fx_save(void *image)
180 {
181         asm ("fxsave (%0)":: "r" (image));
182 }
183
184 static void fx_restore(void *image)
185 {
186         asm ("fxrstor (%0)":: "r" (image));
187 }
188
189 static void fpu_init(void)
190 {
191         asm ("finit");
192 }
193
194 struct segment_descriptor {
195         uint16_t limit_low;
196         uint16_t base_low;
197         uint8_t  base_mid;
198         uint8_t  type : 4;
199         uint8_t  system : 1;
200         uint8_t  dpl : 2;
201         uint8_t  present : 1;
202         uint8_t  limit_high : 4;
203         uint8_t  avl : 1;
204         uint8_t  long_mode : 1;
205         uint8_t  default_op : 1;
206         uint8_t  granularity : 1;
207         uint8_t  base_high;
208 } __attribute__((packed));
209
210 #ifdef __x86_64__
211 // LDT or TSS descriptor in the GDT. 16 bytes.
212 struct segment_descriptor_64 {
213         struct segment_descriptor s;
214         uint32_t base_higher;
215         uint32_t pad_zero;
216 };
217
218 #endif
219
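/* Compute the linear base address of the segment named by 'selector' by
 * reading its descriptor from the GDT (or the LDT, for selectors with the TI
 * bit set).  VMX wants raw base addresses, e.g. HOST_TR_BASE, not selectors. */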
220 static unsigned long segment_base(uint16_t selector)
221 {
222         struct descriptor_table gdt;
223         struct segment_descriptor *d;
224         unsigned long table_base;
225         typedef unsigned long ul;
226         unsigned long v;
227
228         asm ("sgdt %0" : "=m"(gdt));
229         table_base = gdt.base;
230
231         if (selector & 4) {           /* from ldt */
232                 uint16_t ldt_selector;
233
234                 asm ("sldt %0" : "=g"(ldt_selector));
235                 table_base = segment_base(ldt_selector);
236         }
237         d = (struct segment_descriptor *)(table_base + (selector & ~7));
238         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
239 #ifdef __x86_64__
240         if (d->system == 0
241             && (d->type == 2 || d->type == 9 || d->type == 11))
242                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
243 #endif
244         return v;
245 }
246
247 static unsigned long read_tr_base(void)
248 {
249         uint16_t tr;
250         asm ("str %0" : "=g"(tr));
251         return segment_base(tr);
252 }
253
254 static void reload_tss(void)
255 {
256 #ifndef __x86_64__
257
258         /*
259          * VT restores TR but not its size.  Useless.
260          */
261         struct descriptor_table gdt;
262         struct segment_descriptor *descs;
263
264         get_gdt(&gdt);
265         descs = (void *)gdt.base;
266         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
267         load_TR_desc();
268 #endif
269 }
270
271 static struct vmcs_descriptor {
272         int size;
273         int order;
274         uint32_t revision_id;
275 } vmcs_descriptor;
276
277 #if 0
278 #ifdef __x86_64__
279 static unsigned long read_msr(unsigned long msr)
280 {
281         uint64_t value;
282
283         rdmsrl(msr, value);
284         return value;
285 }
286 #endif
287 #endif
288 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
289 {
290         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
291         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
292 }
293
294
295
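/* Copy 'size' bytes from guest virtual address 'addr' into 'dest', walking the
 * guest page tables one page at a time via gva_to_hpa().  Returns the number
 * of bytes actually copied, which is short if a translation fails. */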
296 int litevm_read_guest(struct litevm_vcpu *vcpu,
297                              gva_t addr,
298                              unsigned long size,
299                              void *dest)
300 {
301         unsigned char *host_buf = dest;
302         unsigned long req_size = size;
303
304         while (size) {
305                 hpa_t paddr;
306                 unsigned now;
307                 unsigned offset;
308                 hva_t guest_buf;
309
310                 paddr = gva_to_hpa(vcpu, addr);
311
312                 if (is_error_hpa(paddr))
313                         break;
314                 guest_buf = (hva_t)KADDR(paddr);
315                 offset = addr & ~PAGE_MASK;
316                 guest_buf |= offset;
317                 now = MIN(size, PAGE_SIZE - offset);
318                 memcpy(host_buf, (void*)guest_buf, now);
319                 host_buf += now;
320                 addr += now;
321                 size -= now;
322         }
323         return req_size - size;
324 }
325
326 int litevm_write_guest(struct litevm_vcpu *vcpu,
327                              gva_t addr,
328                              unsigned long size,
329                              void *data)
330 {
331         unsigned char *host_buf = data;
332         unsigned long req_size = size;
333
334         while (size) {
335                 hpa_t paddr;
336                 unsigned now;
337                 unsigned offset;
338                 hva_t guest_buf;
339
340                 paddr = gva_to_hpa(vcpu, addr);
341
342                 if (is_error_hpa(paddr))
343                         break;
344
345                 guest_buf = (hva_t)KADDR(paddr);
346                 offset = addr & ~PAGE_MASK;
347                 guest_buf |= offset;
348                 now = MIN(size, PAGE_SIZE - offset);
349                 memcpy((void*)guest_buf, host_buf, now);
350                 host_buf += now;
351                 addr += now;
352                 size -= now;
353         }
354         return req_size - size;
355 }
356
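/* Cache the VMCS region parameters from the IA32_VMX_BASIC MSR: the region
 * size (bits 44:32) and the revision identifier (bits 30:0) that must be
 * written into the first word of every VMCS and VMXON region. */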
357 static void setup_vmcs_descriptor(void)
358 {
359         uint64_t msr;
360
361         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
362         vmcs_descriptor.size = (msr>>32) & 0x1fff;
363         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size>>PAGE_SHIFT);
364         vmcs_descriptor.revision_id = (uint32_t)msr;
365 }
366
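/* VMCLEAR the given VMCS: flush its cached state back to memory and mark it
 * inactive and not-current, so it can later be loaded on any cpu via VMPTRLD. */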
367 static void vmcs_clear(struct vmcs *vmcs)
368 {
369         uint64_t phys_addr = PADDR(vmcs);
370         uint8_t error;
371
372         asm volatile ("vmclear %1; setna %0"
373                        : "=m"(error) : "m"(phys_addr) : "cc", "memory" );
374         if (error)
375                 printk("litevm: vmclear fail: %p/%llx\n",
376                        vmcs, phys_addr);
377 }
378
379 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
380 {
381         struct litevm_vcpu *vcpu = arg;
382         int cpu = core_id();
383         printd("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n", 
384                cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
385
386         if (vcpu->cpu == cpu)
387                 vmcs_clear(vcpu->vmcs);
388
389         if (currentcpu->vmcs == vcpu->vmcs)
390                 currentcpu->vmcs = NULL;
391 }
392
393 static int vcpu_slot(struct litevm_vcpu *vcpu)
394 {
395         return vcpu - vcpu->litevm->vcpus;
396 }
397
398 /*
399  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
400  * vcpu mutex is already taken.
401  */
402 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
403 {
404         uint64_t phys_addr = PADDR(vcpu->vmcs);
405         int cpu;
406         cpu = core_id();
407
408         if (vcpu->cpu != cpu) {
409                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, smp_call_wait);
410                 vcpu->launched = 0;
411         }
412         if (currentcpu->vmcs != vcpu->vmcs) {
413                 uint8_t error;
414
415                 currentcpu->vmcs = vcpu->vmcs;
416                 asm volatile ("vmptrld %1; setna %0"
417                                : "=m"(error) : "m"(phys_addr) : "cc" );
418                 if (error)
419                         printk("litevm: vmptrld %p/%llx fail\n",
420                                vcpu->vmcs, phys_addr);
421         }
422
423         if (vcpu->cpu != cpu) {
424                 struct descriptor_table dt;
425                 unsigned long sysenter_esp;
426
427                 vcpu->cpu = cpu;
428                 /*
429                  * Linux uses per-cpu TSS and GDT, so set these when switching
430                  * processors.
431                  */
432                 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
433                 get_gdt(&dt);
434                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
435
436                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
437                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
438         }
439         return vcpu;
440 }
441
442 /*
443  * Switches to specified vcpu, until a matching vcpu_put()
444  */
445 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
446 {
447         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
448
449         qlock(&vcpu->mutex);
450         if (!vcpu->vmcs) {
451                 qunlock(&vcpu->mutex);
452                 return 0;
453         }
454         return __vcpu_load(vcpu);
455 }
456
457 static void vcpu_put(struct litevm_vcpu *vcpu)
458 {
459         put_cpu();
460         qunlock(&vcpu->mutex);
461 }
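/*
 * A minimal usage sketch (the error value is illustrative, not part of this
 * file): callers bracket VMCS accesses with vcpu_load()/vcpu_put().
 *
 *      struct litevm_vcpu *vcpu = vcpu_load(litevm, slot);
 *      if (!vcpu)
 *              return -EINVAL;
 *      ... read/modify the current VMCS with vmcs_readl()/vmcs_writel() ...
 *      vcpu_put(vcpu);
 */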
462
463
464 static struct vmcs *alloc_vmcs_cpu(int cpu)
465 {
466         int node = node_id();
467         struct vmcs *vmcs;
468
469         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
470         if (!vmcs)
471                 return 0;
472         memset(vmcs, 0, vmcs_descriptor.size);
473         vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
474         return vmcs;
475 }
476
477 static struct vmcs *alloc_vmcs(void)
478 {
479         return alloc_vmcs_cpu(core_id());
480 }
481
482 static int cpu_has_litevm_support(void)
483 {
484         uint32_t ecx = cpuid_ecx(1);
485         return ecx & (1 << 5); /* CPUID.1:ECX.VMX[bit 5] -> VT */
486 }
487
488 static int alloc_litevm_area(void)
489 {
490 #if 0
491         int cpu;
492         int i;
493
494         /* no longer needed. We put the vmxarea into the cpu info. */
495         /* leave here for now. */
496         for_each_online_cpu(cpu) {
497                 struct vmcs *vmcs;
498
499                 vmcs = alloc_vmcs_cpu(cpu);
500                 if (!vmcs) {
501                         free_litevm_area();
502                         return -ENOMEM;
503                 }
504
505                 per_cpu(vmxarea, cpu) = vmcs;
506         }
507 #endif
508         return 0;
509 }
510
511 static int vmx_disabled_by_bios(void)
512 {
513         uint64_t msr;
514
515         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
516         return (msr & 5) == 1; /* locked but not enabled */
517 }
518
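/* Per-cpu VMX bring-up: set (and lock) the enable bits in
 * IA32_FEATURE_CONTROL if the BIOS left them clear, turn on CR4.VMXE, and
 * execute VMXON with this core's vmxarea region. */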
519 static void litevm_enable(struct hw_trapframe *hw_tf, void *garbage)
520 {
521         int cpu = hw_core_id();
522         uint64_t phys_addr = PADDR(&currentcpu->vmxarea);
523         uint64_t old;
524
525         old = read_msr(MSR_IA32_FEATURE_CONTROL);
526         if ((old & 5) == 0)
527                 /* enable and lock */
528                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
529         lcr4(rcr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
530         asm volatile ("vmxon %0" : : "m"(phys_addr) : "memory", "cc");
531 }
532
533 static void litevm_disable(void *garbage)
534 {
535         asm volatile ("vmxoff" : : : "cc");
536 }
537
538 static int litevm_dev_open(struct inode *inode, struct file *filp)
539 {
540         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
541         int i;
542
543         if (!litevm)
544                 return -ENOMEM;
545
546         spinlock_init(&litevm->lock);
547         LIST_INIT(&litevm->link);
548         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
549                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
550
551                 qlock_init(&vcpu->mutex);
552                 vcpu->mmu.root_hpa = INVALID_PAGE;
553                 LIST_INIT(&vcpu->link);
554         }
555 #warning "filp->private_data --> c->aux?"
556 //      filp->private_data = litevm;
557         return 0;
558 }
559
560 /*
561  * Free any memory in @free but not in @dont.
562  */
563 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
564                                   struct litevm_memory_slot *dont)
565 {
566         int i;
567
568         if (!dont || free->phys_mem != dont->phys_mem)
569                 if (free->phys_mem) {
570                         for (i = 0; i < free->npages; ++i) {
571                                 /* drop our reference on each page backing this slot */
572                                 if (free->phys_mem[i])
573                                         page_decref(free->phys_mem[i]);
574                         }
575                         kfree(free->phys_mem);
576                 }
577
578         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
579                 kfree(free->dirty_bitmap);
580
581         free->phys_mem = 0;
582         free->npages = 0;
583         free->dirty_bitmap = 0;
584 }
585
586 static void litevm_free_physmem(struct litevm *litevm)
587 {
588         int i;
589
590         for (i = 0; i < litevm->nmemslots; ++i)
591                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
592 }
593
594 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
595 {
596         if (vcpu->vmcs) {
597                 smp_call_function_all(__vcpu_clear, vcpu, smp_call_wait);
598                 //free_vmcs(vcpu->vmcs);
599                 vcpu->vmcs = 0;
600         }
601 }
602
603 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
604 {
605         litevm_free_vmcs(vcpu);
606         litevm_mmu_destroy(vcpu);
607 }
608
609 static void litevm_free_vcpus(struct litevm *litevm)
610 {
611         unsigned int i;
612
613         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
614                 litevm_free_vcpu(&litevm->vcpus[i]);
615 }
616
617 static int litevm_dev_release(struct litevm *litevm)
618 {
619
620         litevm_free_vcpus(litevm);
621         litevm_free_physmem(litevm);
622         kfree(litevm);
623         return 0;
624 }
625
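/* Field accessors for the current VMCS; thin wrappers around the VMREAD and
 * VMWRITE instructions.  A VMCS must have been made current with VMPTRLD
 * (see __vcpu_load()) before any of these are used. */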
626 unsigned long vmcs_readl(unsigned long field)
627 {
628         unsigned long value;
629
630         asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
631         return value;
632 }
633
634 void vmcs_writel(unsigned long field, unsigned long value)
635 {
636         uint8_t error;
637
638         asm volatile ("vmwrite %1, %2; setna %0"
639                        : "=g"(error) : "r"(value), "r"(field) : "cc" );
640         if (error)
641                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
642                        field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
643 }
644
645 static void vmcs_write16(unsigned long field, uint16_t value)
646 {
647         vmcs_writel(field, value);
648 }
649
650 static void vmcs_write64(unsigned long field, uint64_t value)
651 {
652 #ifdef __x86_64__
653         vmcs_writel(field, value);
654 #else
655         vmcs_writel(field, value);
656         asm volatile ("");
657         vmcs_writel(field+1, value >> 32);
658 #endif
659 }
660
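/* Queue a general-protection fault for the guest: program the VM-entry
 * interruption-information field with the #GP vector, exception type, a
 * (zero) error code, and the valid bit, to be delivered on the next entry. */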
661 static void inject_gp(struct litevm_vcpu *vcpu)
662 {
663         printd("inject_general_protection: rip 0x%lx\n",
664                vmcs_readl(GUEST_RIP));
665         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
666         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
667                      GP_VECTOR |
668                      INTR_TYPE_EXCEPTION |
669                      INTR_INFO_DELIEVER_CODE_MASK |
670                      INTR_INFO_VALID_MASK);
671 }
672
673 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
674 {
675         if (vcpu->rmode.active)
676                 vmcs_write32(EXCEPTION_BITMAP, ~0);
677         else
678                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
679 }
680
681 static void enter_pmode(struct litevm_vcpu *vcpu)
682 {
683         unsigned long flags;
684
685         vcpu->rmode.active = 0;
686
687         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
688         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
689         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
690
691         flags = vmcs_readl(GUEST_RFLAGS);
692         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
693         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
694         vmcs_writel(GUEST_RFLAGS, flags);
695
696         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
697                         (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK) );
698
699         update_exception_bitmap(vcpu);
700
701         #define FIX_PMODE_DATASEG(seg, save) {                          \
702                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
703                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
704                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
705                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
706         }
707
708         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
709         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
710         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
711         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
712         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
713
714         vmcs_write16(GUEST_CS_SELECTOR,
715                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
716         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
717 }
718
719 static int rmode_tss_base(struct litevm* litevm)
720 {
721         gfn_t base_gfn = litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
722         return base_gfn << PAGE_SHIFT;
723 }
724
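/* VT (without unrestricted-guest support) cannot run a guest in real mode, so
 * real mode is faked with virtual-8086 mode: force VM and IOPL in RFLAGS,
 * point TR at a scratch TSS at the top of memory slot 0, and give each segment
 * the base == selector << 4, 64K limit, AR 0xf3 shape that vm86 expects. */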
725 static void enter_rmode(struct litevm_vcpu *vcpu)
726 {
727         unsigned long flags;
728
729         vcpu->rmode.active = 1;
730
731         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
732         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
733
734         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
735         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
736
737         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
738         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
739
740         flags = vmcs_readl(GUEST_RFLAGS);
741         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
742
743         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
744
745         vmcs_writel(GUEST_RFLAGS, flags);
746         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
747         update_exception_bitmap(vcpu);
748
749         #define FIX_RMODE_SEG(seg, save) {                                 \
750                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
751                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
752                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
753                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
754         }
755
756         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
757         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
758
759         FIX_RMODE_SEG(ES, vcpu->rmode.es);
760         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
761         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
762         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
763         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
764 }
765
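/* Build a minimal real-mode TSS in the three pages reserved at the top of
 * memory slot 0: zero them, point the I/O-bitmap base (offset 0x66) just past
 * the interrupt-redirection map, and set the terminating byte after the bitmap. */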
766 static int init_rmode_tss(struct litevm* litevm)
767 {
768         struct page *p1, *p2, *p3;
769         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
770         char *page;
771
772         p1 = _gfn_to_page(litevm, fn++);
773         p2 = _gfn_to_page(litevm, fn++);
774         p3 = _gfn_to_page(litevm, fn);
775
776         if (!p1 || !p2 || !p3) {
777                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
778                 return 0;
779         }
780
781         page = page2kva(p1);
782         memset(page, 0, PAGE_SIZE);
783         *(uint16_t*)(page + TSS_IOPB_BASE_OFFSET) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
784
785         page = page2kva(p2);
786         memset(page, 0, PAGE_SIZE);
787
788         page = page2kva(p3);
789         memset(page, 0, PAGE_SIZE);
790         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
791
792         return 1;
793 }
794
795 #ifdef __x86_64__
796
797 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
798 {
799         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
800
801         vcpu->shadow_efer = efer;
802         if (efer & EFER_LMA) {
803                 vmcs_write32(VM_ENTRY_CONTROLS,
804                                      vmcs_read32(VM_ENTRY_CONTROLS) |
805                                      VM_ENTRY_CONTROLS_IA32E_MASK);
806                 msr->data = efer;
807
808         } else {
809                 vmcs_write32(VM_ENTRY_CONTROLS,
810                                      vmcs_read32(VM_ENTRY_CONTROLS) &
811                                      ~VM_ENTRY_CONTROLS_IA32E_MASK);
812
813                 msr->data = efer & ~EFER_LME;
814         }
815 }
816
817 static void enter_lmode(struct litevm_vcpu *vcpu)
818 {
819         uint32_t guest_tr_ar;
820
821         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
822         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
823                 printd("%s: tss fixup for long mode. \n",
824                        __FUNCTION__);
825                 vmcs_write32(GUEST_TR_AR_BYTES,
826                              (guest_tr_ar & ~AR_TYPE_MASK)
827                              | AR_TYPE_BUSY_64_TSS);
828         }
829
830         vcpu->shadow_efer |= EFER_LMA;
831
832         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
833         vmcs_write32(VM_ENTRY_CONTROLS,
834                      vmcs_read32(VM_ENTRY_CONTROLS)
835                      | VM_ENTRY_CONTROLS_IA32E_MASK);
836 }
837
838 static void exit_lmode(struct litevm_vcpu *vcpu)
839 {
840         vcpu->shadow_efer &= ~EFER_LMA;
841
842         vmcs_write32(VM_ENTRY_CONTROLS,
843                      vmcs_read32(VM_ENTRY_CONTROLS)
844                      & ~VM_ENTRY_CONTROLS_IA32E_MASK);
845 }
846
847 #endif
848
849 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
850 {
851         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
852                 enter_pmode(vcpu);
853
854         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
855                 enter_rmode(vcpu);
856
857 #ifdef __x86_64__
858         if (vcpu->shadow_efer & EFER_LME) {
859                 if (!is_paging() && (cr0 & CR0_PG_MASK))
860                         enter_lmode(vcpu);
861                 if (is_paging() && !(cr0 & CR0_PG_MASK))
862                         exit_lmode(vcpu);
863         }
864 #endif
865
866         vmcs_writel(CR0_READ_SHADOW, cr0);
867         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
868 }
869
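/* With PAE paging, CR3 points at four page-directory-pointer-table entries;
 * loads of CR0/CR3/CR4 must fault if any present PDPTE has reserved bits set.
 * Returns nonzero if the guest's PDPTEs fail that check. */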
870 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
871                                          unsigned long cr3)
872 {
873         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
874         unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
875         int i;
876         uint64_t pdpte;
877         uint64_t *pdpt;
878         struct litevm_memory_slot *memslot;
879
880         spin_lock(&vcpu->litevm->lock);
881         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
882         /* FIXME: !memslot - emulate? 0xff? */
883         pdpt = KADDR(gfn_to_page(memslot, pdpt_gfn));
884
885         for (i = 0; i < 4; ++i) {
886                 pdpte = pdpt[offset + i];
887                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
888                         break;
889         }
890
891         spin_unlock(&vcpu->litevm->lock);
892
893         return i != 4;
894 }
895
896 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
897 {
898         if (cr0 & CR0_RESEVED_BITS) {
899                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
900                        cr0, guest_cr0());
901                 inject_gp(vcpu);
902                 return;
903         }
904
905         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
906                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
907                 inject_gp(vcpu);
908                 return;
909         }
910
911         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
912                 printd("set_cr0: #GP, set PG flag "
913                        "and a clear PE flag\n");
914                 inject_gp(vcpu);
915                 return;
916         }
917
918         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
919 #ifdef __x86_64__
920                 if ((vcpu->shadow_efer & EFER_LME)) {
921                         uint32_t guest_cs_ar;
922                         if (!is_pae()) {
923                                 printd("set_cr0: #GP, start paging "
924                                        "in long mode while PAE is disabled\n");
925                                 inject_gp(vcpu);
926                                 return;
927                         }
928                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
929                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
930                                 printd("set_cr0: #GP, start paging "
931                                        "in long mode while CS.L == 1\n");
932                                 inject_gp(vcpu);
933                                 return;
934
935                         }
936                 } else
937 #endif
938                 if (is_pae() &&
939                             pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
940                         printd("set_cr0: #GP, pdptrs "
941                                "reserved bits\n");
942                         inject_gp(vcpu);
943                         return;
944                 }
945
946         }
947
948         __set_cr0(vcpu, cr0);
949         litevm_mmu_reset_context(vcpu);
950         return;
951 }
952
953 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
954 {
955         unsigned long cr0 = guest_cr0();
956
957         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
958                 enter_pmode(vcpu);
959                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
960
961         } else
962                 printd("lmsw: unexpected\n");
963
964         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
965                                 | (msw & LMSW_GUEST_MASK));
966 }
967
968 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
969 {
970         vmcs_writel(CR4_READ_SHADOW, cr4);
971         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
972                     LITEVM_RMODE_VM_CR4_ALWAYS_ON : LITEVM_PMODE_VM_CR4_ALWAYS_ON));
973 }
974
975 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
976 {
977         if (cr4 & CR4_RESEVED_BITS) {
978                 printd("set_cr4: #GP, reserved bits\n");
979                 inject_gp(vcpu);
980                 return;
981         }
982
983         if (is_long_mode()) {
984                 if (!(cr4 & CR4_PAE_MASK)) {
985                         printd("set_cr4: #GP, clearing PAE while "
986                                "in long mode\n");
987                         inject_gp(vcpu);
988                         return;
989                 }
990         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
991                    && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
992                 printd("set_cr4: #GP, pdptrs reserved bits\n");
993                 inject_gp(vcpu);
994                 return;
995         }
995
996         if (cr4 & CR4_VMXE_MASK) {
997                 printd("set_cr4: #GP, setting VMXE\n");
998                 inject_gp(vcpu);
999                 return;
1000         }
1001         __set_cr4(vcpu, cr4);
1002         spin_lock(&vcpu->litevm->lock);
1003         litevm_mmu_reset_context(vcpu);
1004         spin_unlock(&vcpu->litevm->lock);
1005 }
1006
1007 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1008 {
1009         if (is_long_mode()) {
1010                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
1011                         printd("set_cr3: #GP, reserved bits\n");
1012                         inject_gp(vcpu);
1013                         return;
1014                 }
1015         } else {
1016                 if (cr3 & CR3_RESEVED_BITS) {
1017                         printd("set_cr3: #GP, reserved bits\n");
1018                         inject_gp(vcpu);
1019                         return;
1020                 }
1021                 if (is_paging() && is_pae() &&
1022                     pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1023                         printd("set_cr3: #GP, pdptrs "
1024                                "reserved bits\n");
1025                         inject_gp(vcpu);
1026                         return;
1027                 }
1028         }
1029
1030         vcpu->cr3 = cr3;
1031         spin_lock(&vcpu->litevm->lock);
1032         vcpu->mmu.new_cr3(vcpu);
1033         spin_unlock(&vcpu->litevm->lock);
1034 }
1035
1036 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1037 {
1038         if ( cr8 & CR8_RESEVED_BITS) {
1039                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1040                 inject_gp(vcpu);
1041                 return;
1042         }
1043         vcpu->cr8 = cr8;
1044 }
1045
1046 static uint32_t get_rdx_init_val(void)
1047 {
1048         uint32_t val;
1049
1050         asm ("movl $1, %%eax \n\t"
1051              "movl %%eax, %0 \n\t" : "=g"(val) );
1052         return val;
1053
1054 }
1055
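/* Give the guest a freshly initialized FPU/SSE state: save the host image,
 * run FINIT to get the architectural defaults, snapshot that as the guest
 * image (with MXCSR forced to its reset value 0x1f80), then restore the host. */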
1056 static void fx_init(struct litevm_vcpu *vcpu)
1057 {
1058         struct __attribute__ ((__packed__)) fx_image_s {
1059                 uint16_t control; //fcw
1060                 uint16_t status; //fsw
1061                 uint16_t tag; // ftw
1062                 uint16_t opcode; //fop
1063                 uint64_t ip; // fpu ip
1064                 uint64_t operand;// fpu dp
1065                 uint32_t mxcsr;
1066                 uint32_t mxcsr_mask;
1067
1068         } *fx_image;
1069
1070         fx_save(vcpu->host_fx_image);
1071         fpu_init();
1072         fx_save(vcpu->guest_fx_image);
1073         fx_restore(vcpu->host_fx_image);
1074
1075         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1076         fx_image->mxcsr = 0x1f80;
1077         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1078                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1079 }
1080
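/* Write a VMCS control field while honoring the fixed bits reported by the
 * corresponding VMX capability MSR: the MSR's low 32 bits are the bits that
 * must be 1, and its high 32 bits are the bits that are allowed to be 1. */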
1081 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field, uint32_t val)
1082 {
1083         uint32_t msr_high, msr_low;
1084         uint64_t msrval;
1085
1086         msrval = read_msr(msr);
1087         msr_low = msrval;
1088         msr_high = (msrval>>32);
1089
1090         val &= msr_high;
1091         val |= msr_low;
1092         vmcs_write32(vmcs_field, val);
1093 }
1094
1095 /*
1096  * Sets up the vmcs for emulated real mode.
1097  */
1098 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1099 {
1100 /* no op on x86_64 */
1101 #define asmlinkage
1102         extern asmlinkage void litevm_vmx_return(void);
1103         uint32_t host_sysenter_cs;
1104         uint32_t junk;
1105         uint64_t a;
1106         struct descriptor_table dt;
1107         int i;
1108         int ret;
1109         uint64_t tsc;
1110         int nr_good_msrs;
1111
1112
1113         if (!init_rmode_tss(vcpu->litevm)) {
1114                 ret = 0;
1115                 goto out;
1116         }
1117
1118         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1119         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1120         vcpu->cr8 = 0;
1121         vcpu->apic_base = 0xfee00000 |
1122                         /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1123                         MSR_IA32_APICBASE_ENABLE;
1124
1125         fx_init(vcpu);
1126
1127 #define SEG_SETUP(seg) do {                                     \
1128                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1129                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1130                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1131                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1132         } while (0)
1133
1134         /*
1135          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1136          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1137          */
1138         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1139         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1140         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1141         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1142
1143         SEG_SETUP(DS);
1144         SEG_SETUP(ES);
1145         SEG_SETUP(FS);
1146         SEG_SETUP(GS);
1147         SEG_SETUP(SS);
1148
1149         vmcs_write16(GUEST_TR_SELECTOR, 0);
1150         vmcs_writel(GUEST_TR_BASE, 0);
1151         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1152         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1153
1154         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1155         vmcs_writel(GUEST_LDTR_BASE, 0);
1156         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1157         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1158
1159         vmcs_write32(GUEST_SYSENTER_CS, 0);
1160         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1161         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1162
1163         vmcs_writel(GUEST_RFLAGS, 0x02);
1164         vmcs_writel(GUEST_RIP, 0xfff0);
1165         vmcs_writel(GUEST_RSP, 0);
1166
1167         vmcs_writel(GUEST_CR3, 0);
1168
1169         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1170         vmcs_writel(GUEST_DR7, 0x400);
1171
1172         vmcs_writel(GUEST_GDTR_BASE, 0);
1173         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1174
1175         vmcs_writel(GUEST_IDTR_BASE, 0);
1176         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1177
1178         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1179         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1180         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1181
1182         /* I/O */
1183         vmcs_write64(IO_BITMAP_A, 0);
1184         vmcs_write64(IO_BITMAP_B, 0);
1185
1186         tsc = read_tsc();
1187         vmcs_write64(TSC_OFFSET, -tsc);
1188
1189         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1190
1191         /* Special registers */
1192         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1193
1194         /* Control */
1195         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
1196                                PIN_BASED_VM_EXEC_CONTROL,
1197                                PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
1198                                | PIN_BASED_NMI_EXITING   /* 20.6.1 */
1199                         );
1200         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
1201                                CPU_BASED_VM_EXEC_CONTROL,
1202                                CPU_BASED_HLT_EXITING         /* 20.6.2 */
1203                                | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
1204                                | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
1205                                | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
1206                                | CPU_BASED_INVDPG_EXITING
1207                                | CPU_BASED_MOV_DR_EXITING
1208                                | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
1209                         );
1210
1211         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1212         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1213         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1214         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1215
1216         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1217         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1218         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1219
1220 #warning "not setting selectors; do we need them?"
1221 #if 0
1222         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
1223         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1224         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1225 #endif
1226         vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
1227         vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
1228 #if 0
1229         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1230 #endif
1231 #ifdef __x86_64__
1232         a = read_msr(MSR_FS_BASE);
1233         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1234         a = read_msr(MSR_GS_BASE);
1235         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1236 #else
1237         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1238         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1239 #endif
1240
1241 #warning "Not setting HOST_TR_SELECTOR"
1242 #if 0
1243         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
1244 #endif
1245
1246         get_idt(&dt);
1247         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1248
1249
1250         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */
1251
1252         /* the CS selector lives in the low 32 bits of this MSR */
1253         host_sysenter_cs = (uint32_t)read_msr(MSR_IA32_SYSENTER_CS);
1254         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1255         a = read_msr(MSR_IA32_SYSENTER_ESP);
1256         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
1257         a = read_msr(MSR_IA32_SYSENTER_EIP);
1258         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
1259
1260         ret = -ENOMEM;
1261         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1262         if (!vcpu->guest_msrs)
1263                 goto out;
1264         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1265         if (!vcpu->host_msrs)
1266                 goto out_free_guest_msrs;
1267
1268         for (i = 0; i < NR_VMX_MSR; ++i) {
1269                 uint32_t index = vmx_msr_index[i];
1270                 uint32_t data_low, data_high;
1271                 uint64_t data;
1272                 int j = vcpu->nmsrs;
1273
1274 #warning "need readmsr_safe"
1275 //              if (rdmsr_safe(index, &data_low, &data_high) < 0)
1276 //                      continue;
1277                 data = read_msr(index);
1278                 vcpu->host_msrs[j].index = index;
1279                 vcpu->host_msrs[j].reserved = 0;
1280                 vcpu->host_msrs[j].data = data;
1281                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1282                 ++vcpu->nmsrs;
1283         }
1284         printk("msrs: %d\n", vcpu->nmsrs);
1285
1286         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1287         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
1288                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1289         vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1290                     PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1291         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1292                     PADDR(vcpu->host_msrs + NR_BAD_MSRS));
1293         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
1294                                (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */
1295         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1296         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
1297         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1298
1299
1300         /* 22.2.1, 20.8.1 */
1301         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
1302                                VM_ENTRY_CONTROLS, 0);
1303         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1304
1305         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1306         vmcs_writel(TPR_THRESHOLD, 0);
1307
1308         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1309         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1310
1311         __set_cr0(vcpu, 0x60000010); // enter rmode
1312         __set_cr4(vcpu, 0);
1313 #ifdef __x86_64__
1314         __set_efer(vcpu, 0);
1315 #endif
1316
1317         ret = litevm_mmu_init(vcpu);
1318
1319         return ret;
1320
1321 out_free_guest_msrs:
1322         kfree(vcpu->guest_msrs);
1323 out:
1324         return ret;
1325 }
1326
1327 /*
1328  * Sync the rsp and rip registers into the vcpu structure.  This allows
1329  * registers to be accessed by indexing vcpu->regs.
1330  */
1331 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1332 {
1333         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1334         vcpu->rip = vmcs_readl(GUEST_RIP);
1335 }
1336
1337 /*
1338  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1339  * modification.
1340  */
1341 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1342 {
1343         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1344         vmcs_writel(GUEST_RIP, vcpu->rip);
1345 }
1346
1347 /*
1348  * Creates some virtual cpus.  Good luck creating more than one.
1349  */
1350 static int litevm_dev_ioctl_create_vcpu(struct litevm *litevm, int n)
1351 {
1352         int r;
1353         struct litevm_vcpu *vcpu;
1354         struct vmcs *vmcs;
1355
1356         r = -EINVAL;
1357         if (n < 0 || n >= LITEVM_MAX_VCPUS)
1358                 goto out;
1359
1360         vcpu = &litevm->vcpus[n];
1361
1362         qlock(&vcpu->mutex);
1363
1364         if (vcpu->vmcs) {
1365                 qunlock(&vcpu->mutex);
1366                 return -EEXIST;
1367         }
1368
1369         /* Round fx_buf up to FX_IMAGE_ALIGN by hand, i.e. the effect of: */
1370         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1371         uint64_t a = (uint64_t) vcpu->fx_buf;
1372         a += FX_IMAGE_ALIGN-1;
1373         a /= FX_IMAGE_ALIGN;
1374         a *= FX_IMAGE_ALIGN;
1375
1376         vcpu->host_fx_image = (char*)a;
1377         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1378
1379         vcpu->cpu = -1;  /* First load will set up TR */
1380         vcpu->litevm = litevm;
1381         vmcs = alloc_vmcs();
1382         if (!vmcs) {
1383                 qunlock(&vcpu->mutex);
1384                 goto out_free_vcpus;
1385         }
1386         vmcs_clear(vmcs);
1387         vcpu->vmcs = vmcs;
1388         vcpu->launched = 0;
1389
1390         __vcpu_load(vcpu);
1391
1392         r = litevm_vcpu_setup(vcpu);
1393
1394         vcpu_put(vcpu);
1395
1396         if (r < 0)
1397                 goto out_free_vcpus;
1398
1399         return 0;
1400
1401 out_free_vcpus:
1402         litevm_free_vcpu(vcpu);
1403 out:
1404         return r;
1405 }
1406
1407 /*
1408  * Allocate some memory and give it an address in the guest physical address
1409  * space.
1410  *
1411  * Discontiguous memory is allowed, mostly for framebuffers.
1412  */
1413 static int litevm_dev_ioctl_set_memory_region(struct litevm *litevm,
1414                                            struct litevm_memory_region *mem)
1415 {
1416         int r;
1417         gfn_t base_gfn;
1418         unsigned long npages;
1419         unsigned long i;
1420         struct litevm_memory_slot *memslot;
1421         struct litevm_memory_slot old, new;
1422         int memory_config_version;
1423
1424         r = -EINVAL;
1425         /* General sanity checks */
1426         if (mem->memory_size & (PAGE_SIZE - 1))
1427                 goto out;
1428         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1429                 goto out;
1430         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1431                 goto out;
1432         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1433                 goto out;
1434
1435         memslot = &litevm->memslots[mem->slot];
1436         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1437         npages = mem->memory_size >> PAGE_SHIFT;
1438
1439         if (!npages)
1440                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1441
1442 raced:
1443         spin_lock(&litevm->lock);
1444
1445         memory_config_version = litevm->memory_config_version;
1446         new = old = *memslot;
1447
1448         new.base_gfn = base_gfn;
1449         new.npages = npages;
1450         new.flags = mem->flags;
1451
1452         /* Disallow changing a memory slot's size. */
1453         r = -EINVAL;
1454         if (npages && old.npages && npages != old.npages)
1455                 goto out_unlock;
1456
1457         /* Check for overlaps */
1458         r = -EEXIST;
1459         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1460                 struct litevm_memory_slot *s = &litevm->memslots[i];
1461
1462                 if (s == memslot)
1463                         continue;
1464                 if (!((base_gfn + npages <= s->base_gfn) ||
1465                       (base_gfn >= s->base_gfn + s->npages)))
1466                         goto out_unlock;
1467         }
1468         /*
1469          * Do memory allocations outside lock.  memory_config_version will
1470          * detect any races.
1471          */
1472         spin_unlock(&litevm->lock);
1473
1474         /* Deallocate if slot is being removed */
1475         if (!npages)
1476                 new.phys_mem = 0;
1477
1478         /* Free page dirty bitmap if unneeded */
1479         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1480                 new.dirty_bitmap = 0;
1481
1482         r = -ENOMEM;
1483
1484         /* Allocate if a slot is being created */
1485         if (npages && !new.phys_mem) {
1486                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1487
1488                 if (!new.phys_mem)
1489                         goto out_free;
1490
1491                 for (i = 0; i < npages; ++i) {
1492                         new.phys_mem[i] = kpage_zalloc_addr();
1493                         if (!new.phys_mem[i])
1494                                 goto out_free;
1495                 }
1496         }
1497
1498         /* Allocate page dirty bitmap if needed */
1499         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1500                 unsigned dirty_bytes;//ALIGN(npages, BITS_PER_LONG) / 8;
1501                 dirty_bytes = (((npages + BITS_PER_LONG-1)/BITS_PER_LONG)*BITS_PER_LONG)/8;
1502
1503                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1504                 if (!new.dirty_bitmap)
1505                         goto out_free;
1506         }
1507
1508         spin_lock(&litevm->lock);
1509
1510         if (memory_config_version != litevm->memory_config_version) {
1511                 spin_unlock(&litevm->lock);
1512                 litevm_free_physmem_slot(&new, &old);
1513                 goto raced;
1514         }
1515
1516         r = -EAGAIN;
1517         if (litevm->busy)
1518                 goto out_unlock;
1519
1520         if (mem->slot >= litevm->nmemslots)
1521                 litevm->nmemslots = mem->slot + 1;
1522
1523         *memslot = new;
1524         ++litevm->memory_config_version;
1525
1526         spin_unlock(&litevm->lock);
1527
1528         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1529                 struct litevm_vcpu *vcpu;
1530
1531                 vcpu = vcpu_load(litevm, i);
1532                 if (!vcpu)
1533                         continue;
1534                 litevm_mmu_reset_context(vcpu);
1535                 vcpu_put(vcpu);
1536         }
1537
1538         litevm_free_physmem_slot(&old, &new);
1539         return 0;
1540
1541 out_unlock:
1542         spin_unlock(&litevm->lock);
1543 out_free:
1544         litevm_free_physmem_slot(&new, &old);
1545 out:
1546         return r;
1547 }
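/*
 * A caller-side sketch (values are illustrative only): describe a chunk of
 * guest physical memory with a struct litevm_memory_region and register it
 * with the routine above.
 *
 *      struct litevm_memory_region mem = {
 *              .slot = 0,
 *              .flags = LITEVM_MEM_LOG_DIRTY_PAGES,
 *              .guest_phys_addr = 0,
 *              .memory_size = 64 * PAGE_SIZE,
 *      };
 *      if (litevm_dev_ioctl_set_memory_region(litevm, &mem) < 0)
 *              ... report the error ...
 */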
1548
1549 #if 0
1550 /*
1551  * Get (and clear) the dirty memory log for a memory slot.
1552  */
1553 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1554                                        struct litevm_dirty_log *log)
1555 {
1556         struct litevm_memory_slot *memslot;
1557         int r, i;
1558         int n;
1559         unsigned long any = 0;
1560
1561         spin_lock(&litevm->lock);
1562
1563         /*
1564          * Prevent changes to guest memory configuration even while the lock
1565          * is not taken.
1566          */
1567         ++litevm->busy;
1568         spin_unlock(&litevm->lock);
1569         r = -EINVAL;
1570         if (log->slot >= LITEVM_MEMORY_SLOTS)
1571                 goto out;
1572
1573         memslot = &litevm->memslots[log->slot];
1574         r = -ENOENT;
1575         if (!memslot->dirty_bitmap)
1576                 goto out;
1577
1578         n = ALIGN(memslot->npages, 8) / 8;
1579
1580         for (i = 0; !any && i < n; ++i)
1581                 any = memslot->dirty_bitmap[i];
1582
1583         r = -EFAULT;
1584         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1585                 goto out;
1586
1587
1588         if (any) {
1589                 spin_lock(&litevm->lock);
1590                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1591                 spin_unlock(&litevm->lock);
1592                 memset(memslot->dirty_bitmap, 0, n);
1593                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1594                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1595
1596                         if (!vcpu)
1597                                 continue;
1598                         flush_guest_tlb(vcpu);
1599                         vcpu_put(vcpu);
1600                 }
1601         }
1602
1603         r = 0;
1604
1605 out:
1606         spin_lock(&litevm->lock);
1607         --litevm->busy;
1608         spin_unlock(&litevm->lock);
1609         return r;
1610 }
1611 #endif
1612
1613 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1614 {
1615         int i;
1616
1617         for (i = 0; i < litevm->nmemslots; ++i) {
1618                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1619
1620                 if (gfn >= memslot->base_gfn
1621                     && gfn < memslot->base_gfn + memslot->npages)
1622                         return memslot;
1623         }
1624         return 0;
1625 }
1626
1627 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1628 {
1629         int i;
1630         struct litevm_memory_slot *memslot = 0;
1631         unsigned long rel_gfn;
1632
1633         for (i = 0; i < litevm->nmemslots; ++i) {
1634                 memslot = &litevm->memslots[i];
1635
1636                 if (gfn >= memslot->base_gfn
1637                     && gfn < memslot->base_gfn + memslot->npages) {
1638
1639                         if (!memslot || !memslot->dirty_bitmap)
1640                                 return;
1641
1642                         rel_gfn = gfn - memslot->base_gfn;
1643
1644                         /* avoid RMW */
1645                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1646                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1647                         return;
1648                 }
1649         }
1650 }
1651
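/* Advance the guest RIP past the instruction we just emulated (using the
 * exit's instruction-length field) and drop any STI/MOV-SS interrupt shadow,
 * since that blocking only applies to the instruction that created it. */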
1652 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1653 {
1654         unsigned long rip;
1655         uint32_t interruptibility;
1656
1657         rip = vmcs_readl(GUEST_RIP);
1658         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1659         vmcs_writel(GUEST_RIP, rip);
1660
1661         /*
1662          * We emulated an instruction, so temporary interrupt blocking
1663          * should be removed, if set.
1664          */
1665         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1666         if (interruptibility & 3)
1667                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
1668                              interruptibility & ~3);
1669 }
1670
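/* Callbacks handed to the x86 instruction emulator.  The *_std variants copy
 * ordinary guest memory, translating through vcpu->mmu.gva_to_gpa(); the
 * *_emulated variants fall back to MMIO by recording the access in the
 * vcpu->mmio_* fields so it can be completed outside the emulator. */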
1671 static int emulator_read_std(unsigned long addr,
1672                              unsigned long *val,
1673                              unsigned int bytes,
1674                              struct x86_emulate_ctxt *ctxt)
1675 {
1676         struct litevm_vcpu *vcpu = ctxt->vcpu;
1677         void *data = val;
1678
1679         while (bytes) {
1680                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1681                 unsigned offset = addr & (PAGE_SIZE-1);
1682                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ? 
1683                         bytes : (unsigned)PAGE_SIZE - offset;
1684                 unsigned long pfn;
1685                 struct litevm_memory_slot *memslot;
1686                 void *page;
1687
1688                 if (gpa == UNMAPPED_GVA)
1689                         return X86EMUL_PROPAGATE_FAULT;
1690                 pfn = gpa >> PAGE_SHIFT;
1691                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1692                 if (!memslot)
1693                         return X86EMUL_UNHANDLEABLE;
1694                 page = KADDR(gfn_to_page(memslot, pfn));
1695
1696                 memcpy(data, page + offset, tocopy);
1697
1698                 bytes -= tocopy;
1699                 data += tocopy;
1700                 addr += tocopy;
1701         }
1702
1703         return X86EMUL_CONTINUE;
1704 }
1705
1706 static int emulator_write_std(unsigned long addr,
1707                               unsigned long val,
1708                               unsigned int bytes,
1709                               struct x86_emulate_ctxt *ctxt)
1710 {
1711         printk("emulator_write_std: addr %lx n %d\n",
1712                addr, bytes);
1713         return X86EMUL_UNHANDLEABLE;
1714 }
1715
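/*
 * emulator_read_emulated()/emulator_write_emulated() - accesses that
 * cannot be satisfied from guest RAM become an MMIO request: the gpa,
 * size, direction and (for writes) data are stashed in the vcpu and
 * mmio_needed is set.  litevm_dev_ioctl_run() later copies the request
 * into litevm_run->mmio for userspace; a completed MMIO read is fed
 * back through mmio_data/mmio_read_completed on the next run.
 */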
1716 static int emulator_read_emulated(unsigned long addr,
1717                                   unsigned long *val,
1718                                   unsigned int bytes,
1719                                   struct x86_emulate_ctxt *ctxt)
1720 {
1721         struct litevm_vcpu *vcpu = ctxt->vcpu;
1722
1723         if (vcpu->mmio_read_completed) {
1724                 memcpy(val, vcpu->mmio_data, bytes);
1725                 vcpu->mmio_read_completed = 0;
1726                 return X86EMUL_CONTINUE;
1727         } else if (emulator_read_std(addr, val, bytes, ctxt)
1728                    == X86EMUL_CONTINUE)
1729                 return X86EMUL_CONTINUE;
1730         else {
1731                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1732                 if (gpa == UNMAPPED_GVA)
1733                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
1734                 vcpu->mmio_needed = 1;
1735                 vcpu->mmio_phys_addr = gpa;
1736                 vcpu->mmio_size = bytes;
1737                 vcpu->mmio_is_write = 0;
1738
1739                 return X86EMUL_UNHANDLEABLE;
1740         }
1741 }
1742
1743 static int emulator_write_emulated(unsigned long addr,
1744                                    unsigned long val,
1745                                    unsigned int bytes,
1746                                    struct x86_emulate_ctxt *ctxt)
1747 {
1748         struct litevm_vcpu *vcpu = ctxt->vcpu;
1749         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1750
1751         if (gpa == UNMAPPED_GVA)
1752                 return X86EMUL_PROPAGATE_FAULT;
1753
1754         vcpu->mmio_needed = 1;
1755         vcpu->mmio_phys_addr = gpa;
1756         vcpu->mmio_size = bytes;
1757         vcpu->mmio_is_write = 1;
1758         memcpy(vcpu->mmio_data, &val, bytes);
1759
1760         return X86EMUL_CONTINUE;
1761 }
1762
1763 static int emulator_cmpxchg_emulated(unsigned long addr,
1764                                      unsigned long old,
1765                                      unsigned long new,
1766                                      unsigned int bytes,
1767                                      struct x86_emulate_ctxt *ctxt)
1768 {
1769         static int reported;
1770
1771         if (!reported) {
1772                 reported = 1;
1773                 printk("litevm: emulating exchange as write\n");
1774         }
1775         return emulator_write_emulated(addr, new, bytes, ctxt);
1776 }
1777
1778 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1779 {
1780         static int reported;
1781         uint8_t opcodes[4];
1782         unsigned long rip = vmcs_readl(GUEST_RIP);
1783         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
1784
1785         if (reported)
1786                 return;
1787
1788         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1789
1790         printk("emulation failed but !mmio_needed?"
1791                " rip %lx %02x %02x %02x %02x\n",
1792                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1793         reported = 1;
1794 }
1795
1796 struct x86_emulate_ops emulate_ops = {
1797         .read_std            = emulator_read_std,
1798         .write_std           = emulator_write_std,
1799         .read_emulated       = emulator_read_emulated,
1800         .write_emulated      = emulator_write_emulated,
1801         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1802 };
1803
1804 enum emulation_result {
1805         EMULATE_DONE,       /* no further processing */
1806         EMULATE_DO_MMIO,      /* litevm_run filled with mmio request */
1807         EMULATE_FAIL,         /* can't emulate this instruction */
1808 };
1809
1810 static int emulate_instruction(struct litevm_vcpu *vcpu,
1811                                struct litevm_run *run,
1812                                unsigned long cr2,
1813                                uint16_t error_code)
1814 {
1815         struct x86_emulate_ctxt emulate_ctxt;
1816         int r;
1817         uint32_t cs_ar;
1818
1819         vcpu_load_rsp_rip(vcpu);
1820
1821         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1822
1823         emulate_ctxt.vcpu = vcpu;
1824         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
1825         emulate_ctxt.cr2 = cr2;
1826         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1827                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
1828                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
1829                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1830
1831         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1832                 emulate_ctxt.cs_base = 0;
1833                 emulate_ctxt.ds_base = 0;
1834                 emulate_ctxt.es_base = 0;
1835                 emulate_ctxt.ss_base = 0;
1836                 emulate_ctxt.gs_base = 0;
1837                 emulate_ctxt.fs_base = 0;
1838         } else {
1839                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
1840                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
1841                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
1842                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
1843                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
1844                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
1845         }
1846
1847         vcpu->mmio_is_write = 0;
1848         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1849
1850         if ((r || vcpu->mmio_is_write) && run) {
1851                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1852                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1853                 run->mmio.len = vcpu->mmio_size;
1854                 run->mmio.is_write = vcpu->mmio_is_write;
1855         }
1856
1857         if (r) {
1858                 if (!vcpu->mmio_needed) {
1859                         report_emulation_failure(&emulate_ctxt);
1860                         return EMULATE_FAIL;
1861                 }
1862                 return EMULATE_DO_MMIO;
1863         }
1864
1865         vcpu_put_rsp_rip(vcpu);
1866         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
1867
1868         if (vcpu->mmio_is_write)
1869                 return EMULATE_DO_MMIO;
1870
1871         return EMULATE_DONE;
1872 }
1873
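/*
 * mk_cr_64() - splice a 32-bit value written by the guest into the low
 * half of the current 64-bit control register, preserving the upper
 * bits.
 */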
1874 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
1875 {
1876         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1877 }
1878
1879 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
1880 {
1881         vmcs_writel(GUEST_GDTR_BASE, base);
1882         vmcs_write32(GUEST_GDTR_LIMIT, limit);
1883 }
1884
1885 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
1886 {
1887         vmcs_writel(GUEST_IDTR_BASE, base);
1888         vmcs_write32(GUEST_IDTR_LIMIT, limit);
1889 }
1890
1891 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
1892                    unsigned long *rflags)
1893 {
1894         lmsw(vcpu, msw);
1895         *rflags = vmcs_readl(GUEST_RFLAGS);
1896 }
1897
1898 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
1899 {
1900         switch (cr) {
1901         case 0:
1902                 return guest_cr0();
1903         case 2:
1904                 return vcpu->cr2;
1905         case 3:
1906                 return vcpu->cr3;
1907         case 4:
1908                 return guest_cr4();
1909         default:
1910                 vcpu_printf(vcpu, "%s: unexpected cr %d\n", __FUNCTION__, cr);
1911                 return 0;
1912         }
1913 }
1914
1915 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
1916                      unsigned long *rflags)
1917 {
1918         switch (cr) {
1919         case 0:
1920                 set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
1921                 *rflags = vmcs_readl(GUEST_RFLAGS);
1922                 break;
1923         case 2:
1924                 vcpu->cr2 = val;
1925                 break;
1926         case 3:
1927                 set_cr3(vcpu, val);
1928                 break;
1929         case 4:
1930                 set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
1931                 break;
1932         default:
1933                 vcpu_printf(vcpu, "%s: unexpected cr %d\n", __FUNCTION__, cr);
1934         }
1935 }
1936
1937 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
1938                                   int vec, uint32_t err_code)
1939 {
1940         if (!vcpu->rmode.active)
1941                 return 0;
1942
1943         if (vec == GP_VECTOR && err_code == 0)
1944                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
1945                         return 1;
1946         return 0;
1947 }
1948
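/*
 * handle_exception() - #PF exits go to the shadow MMU first and, if the
 * fault turns out to be an emulated access, to the instruction emulator
 * (possibly becoming an MMIO exit).  An interrupt that was being
 * delivered when the fault hit is re-queued.  Real-mode #GP(0) is
 * retried through the emulator; #DB is reported to userspace as
 * LITEVM_EXIT_DEBUG and anything else as LITEVM_EXIT_EXCEPTION.
 */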
1949 static int handle_exception(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
1950 {
1951         uint32_t intr_info, error_code;
1952         unsigned long cr2, rip;
1953         uint32_t vect_info;
1954         enum emulation_result er;
1955
1956         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1957         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1958
1959         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1960                                                 !is_page_fault(intr_info)) {
1961                 printk("%s: unexpected, vectoring info 0x%x "
1962                        "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1963         }
1964
1965         if (is_external_interrupt(vect_info)) {
1966                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1967                 SET_BITMASK_BIT_ATOMIC(vcpu->irq_pending, irq);
1968                 SET_BITMASK_BIT_ATOMIC(&vcpu->irq_summary, irq / BITS_PER_LONG);
1969         }
1970
1971         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
1972                 asm ("int $2");
1973                 return 1;
1974         }
1975         error_code = 0;
1976         rip = vmcs_readl(GUEST_RIP);
1977         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1978                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1979         if (is_page_fault(intr_info)) {
1980                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1981
1982                 spin_lock(&vcpu->litevm->lock);
1983                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
1984                         spin_unlock(&vcpu->litevm->lock);
1985                         return 1;
1986                 }
1987
1988                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
1989                 spin_unlock(&vcpu->litevm->lock);
1990
1991                 switch (er) {
1992                 case EMULATE_DONE:
1993                         return 1;
1994                 case EMULATE_DO_MMIO:
1995                         ++litevm_stat.mmio_exits;
1996                         litevm_run->exit_reason = LITEVM_EXIT_MMIO;
1997                         return 0;
1998                 case EMULATE_FAIL:
1999                         vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2000                         break;
2001                 default:
2002                         assert(0);
2003                 }
2004         }
2005
2006         if (vcpu->rmode.active &&
2007             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2008                                                                 error_code))
2009                 return 1;
2010
2011         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
2012                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2013                 return 0;
2014         }
2015         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2016         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2017         litevm_run->ex.error_code = error_code;
2018         return 0;
2019 }
2020
2021 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2022                                      struct litevm_run *litevm_run)
2023 {
2024         ++litevm_stat.irq_exits;
2025         return 1;
2026 }
2027
2028
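/*
 * get_io_count() - for a string I/O exit, work out the repeat count.
 * The address size comes from CS.L/CS.D (16-bit in virtual-8086 mode);
 * legacy prefixes before the opcode are skipped and a 0x67 prefix
 * toggles the size.  The count is RCX masked to that width.
 */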
2029 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t *count)
2030 {
2031         uint64_t inst;
2032         gva_t rip;
2033         int countr_size;
2034         int i, n;
2035
2036         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2037                 countr_size = 2;
2038         } else {
2039                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2040
2041                 countr_size = (cs_ar & AR_L_MASK) ? 8:
2042                               (cs_ar & AR_DB_MASK) ? 4: 2;
2043         }
2044
2045         rip =  vmcs_readl(GUEST_RIP);
2046         if (countr_size != 8)
2047                 rip += vmcs_readl(GUEST_CS_BASE);
2048
2049         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2050
2051         for (i = 0; i < n; i++) {
2052                 switch (((uint8_t*)&inst)[i]) {
2053                 case 0xf0:
2054                 case 0xf2:
2055                 case 0xf3:
2056                 case 0x2e:
2057                 case 0x36:
2058                 case 0x3e:
2059                 case 0x26:
2060                 case 0x64:
2061                 case 0x65:
2062                 case 0x66:
2063                         break;
2064                 case 0x67:      /* address-size prefix; falls through */
2065                         countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
2066                 default:
2067                         goto done;
2068                 }
2069         }
2070         return 0;
2071 done:
2072         countr_size *= 8;
2073         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2074         return 1;
2075 }
2076
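/*
 * handle_io() - decode the I/O exit qualification (bits 0-2: size - 1,
 * bit 3: direction, bit 4: string, bit 5: rep, bits 16-31: port) into a
 * LITEVM_EXIT_IO request for userspace.  String operations also report
 * the guest linear address and repeat count; non-string operations
 * report the value in RAX.
 */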
2077 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2078 {
2079         uint64_t exit_qualification;
2080
2081         ++litevm_stat.io_exits;
2082         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2083         litevm_run->exit_reason = LITEVM_EXIT_IO;
2084         if (exit_qualification & 8)
2085                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2086         else
2087                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2088         litevm_run->io.size = (exit_qualification & 7) + 1;
2089         litevm_run->io.string = (exit_qualification & 16) != 0;
2090         litevm_run->io.string_down
2091                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2092         litevm_run->io.rep = (exit_qualification & 32) != 0;
2093         litevm_run->io.port = exit_qualification >> 16;
2094         if (litevm_run->io.string) {
2095                 if (!get_io_count(vcpu, &litevm_run->io.count))
2096                         return 1;
2097                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2098         } else
2099                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
2100         return 0;
2101 }
2102
2103 static int handle_invlpg(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2104 {
2105         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2106         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2107         spin_lock(&vcpu->litevm->lock);
2108         vcpu->mmu.inval_page(vcpu, address);
2109         spin_unlock(&vcpu->litevm->lock);
2110         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2111         return 1;
2112 }
2113
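/*
 * handle_cr() - the exit qualification encodes the control register
 * (bits 0-3), the access type (bits 4-5: mov-to, mov-from or lmsw) and
 * the general register involved (bits 8-11).  Writes funnel through
 * set_cr0/3/4/8 so the shadow state stays consistent.
 */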
2114 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2115 {
2116         uint64_t exit_qualification;
2117         int cr;
2118         int reg;
2119
2120 #ifdef LITEVM_DEBUG
2121         if (guest_cpl() != 0) {
2122                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2123                 inject_gp(vcpu);
2124                 return 1;
2125         }
2126 #endif
2127
2128         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2129         cr = exit_qualification & 15;
2130         reg = (exit_qualification >> 8) & 15;
2131         switch ((exit_qualification >> 4) & 3) {
2132         case 0: /* mov to cr */
2133                 switch (cr) {
2134                 case 0:
2135                         vcpu_load_rsp_rip(vcpu);
2136                         set_cr0(vcpu, vcpu->regs[reg]);
2137                         skip_emulated_instruction(vcpu);
2138                         return 1;
2139                 case 3:
2140                         vcpu_load_rsp_rip(vcpu);
2141                         set_cr3(vcpu, vcpu->regs[reg]);
2142                         skip_emulated_instruction(vcpu);
2143                         return 1;
2144                 case 4:
2145                         vcpu_load_rsp_rip(vcpu);
2146                         set_cr4(vcpu, vcpu->regs[reg]);
2147                         skip_emulated_instruction(vcpu);
2148                         return 1;
2149                 case 8:
2150                         vcpu_load_rsp_rip(vcpu);
2151                         set_cr8(vcpu, vcpu->regs[reg]);
2152                         skip_emulated_instruction(vcpu);
2153                         return 1;
2154                 }
2155                 break;
2156         case 1: /*mov from cr*/
2157                 switch (cr) {
2158                 case 3:
2159                         vcpu_load_rsp_rip(vcpu);
2160                         vcpu->regs[reg] = vcpu->cr3;
2161                         vcpu_put_rsp_rip(vcpu);
2162                         skip_emulated_instruction(vcpu);
2163                         return 1;
2164                 case 8:
2165                         printd("handle_cr: read CR8 "
2166                                "cpu erratum AA15\n");
2167                         vcpu_load_rsp_rip(vcpu);
2168                         vcpu->regs[reg] = vcpu->cr8;
2169                         vcpu_put_rsp_rip(vcpu);
2170                         skip_emulated_instruction(vcpu);
2171                         return 1;
2172                 }
2173                 break;
2174         case 3: /* lmsw */
2175                 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2176
2177                 skip_emulated_instruction(vcpu);
2178                 return 1;
2179         default:
2180                 break;
2181         }
2182         litevm_run->exit_reason = 0;
2183         printk("litevm: unhandled control register: op %d cr %d\n",
2184                (int)(exit_qualification >> 4) & 3, cr);
2185         return 0;
2186 }
2187
2188 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2189 {
2190         uint64_t exit_qualification;
2191         unsigned long val;
2192         int dr, reg;
2193
2194         /*
2195          * FIXME: this code assumes the host is debugging the guest.
2196          *        need to deal with guest debugging itself too.
2197          */
2198         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2199         dr = exit_qualification & 7;
2200         reg = (exit_qualification >> 8) & 15;
2201         vcpu_load_rsp_rip(vcpu);
2202         if (exit_qualification & 16) {
2203                 /* mov from dr */
2204                 switch (dr) {
2205                 case 6:
2206                         val = 0xffff0ff0;
2207                         break;
2208                 case 7:
2209                         val = 0x400;
2210                         break;
2211                 default:
2212                         val = 0;
2213                 }
2214                 vcpu->regs[reg] = val;
2215         } else {
2216                 /* mov to dr */
2217         }
2218         vcpu_put_rsp_rip(vcpu);
2219         skip_emulated_instruction(vcpu);
2220         return 1;
2221 }
2222
2223 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2224 {
2225         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2226         return 0;
2227 }
2228
2229 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2230 {
2231         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2232         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2233         uint64_t data;
2234
2235 #ifdef LITEVM_DEBUG
2236         if (guest_cpl() != 0) {
2237                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2238                 inject_gp(vcpu);
2239                 return 1;
2240         }
2241 #endif
2242
2243         switch (ecx) {
2244 #ifdef __x86_64__
2245         case MSR_FS_BASE:
2246                 data = vmcs_readl(GUEST_FS_BASE);
2247                 break;
2248         case MSR_GS_BASE:
2249                 data = vmcs_readl(GUEST_GS_BASE);
2250                 break;
2251 #endif
2252         case MSR_IA32_SYSENTER_CS:
2253                 data = vmcs_read32(GUEST_SYSENTER_CS);
2254                 break;
2255         case MSR_IA32_SYSENTER_EIP:
2256                 data = vmcs_read32(GUEST_SYSENTER_EIP);
2257                 break;
2258         case MSR_IA32_SYSENTER_ESP:
2259                 data = vmcs_read32(GUEST_SYSENTER_ESP);
2260                 break;
2261         case MSR_IA32_MC0_CTL:
2262         case MSR_IA32_MCG_STATUS:
2263         case MSR_IA32_MCG_CAP:
2264         case MSR_IA32_MC0_MISC:
2265         case MSR_IA32_MC0_MISC+4:
2266         case MSR_IA32_MC0_MISC+8:
2267         case MSR_IA32_MC0_MISC+12:
2268         case MSR_IA32_MC0_MISC+16:
2269         case MSR_IA32_UCODE_REV:
2270                 /* MTRR registers */
2271         case 0xfe:
2272         case 0x200 ... 0x2ff:
2273                 data = 0;
2274                 break;
2275         case MSR_IA32_APICBASE:
2276                 data = vcpu->apic_base;
2277                 break;
2278         default:
2279                 if (msr) {
2280                         data = msr->data;
2281                         break;
2282                 }
2283                 printk("litevm: unhandled rdmsr: %x\n", ecx);
2284                 inject_gp(vcpu);
2285                 return 1;
2286         }
2287
2288         /* FIXME: handling of bits 32:63 of rax, rdx */
2289         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2290         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2291         skip_emulated_instruction(vcpu);
2292         return 1;
2293 }
2294
2295 #ifdef __x86_64__
2296
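/*
 * set_efer() - guest WRMSR to IA32_EFER.  Writes to reserved bits and
 * LME flips while paging is enabled inject #GP; LMA is always taken
 * from the current shadow value rather than the guest-supplied one,
 * and the result is mirrored into the guest MSR save area.
 */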
2297 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2298 {
2299         struct vmx_msr_entry *msr;
2300
2301         if (efer & EFER_RESERVED_BITS) {
2302                 printd("set_efer: 0x%llx #GP, reserved bits\n",
2303                        efer);
2304                 inject_gp(vcpu);
2305                 return;
2306         }
2307
2308         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2309                 printd("set_efer: #GP, change LME while paging\n");
2310                 inject_gp(vcpu);
2311                 return;
2312         }
2313
2314         efer &= ~EFER_LMA;
2315         efer |= vcpu->shadow_efer & EFER_LMA;
2316
2317         vcpu->shadow_efer = efer;
2318
2319         msr = find_msr_entry(vcpu, MSR_EFER);
2320
2321         if (!(efer & EFER_LMA))
2322             efer &= ~EFER_LME;
2323         msr->data = efer;
2324         skip_emulated_instruction(vcpu);
2325 }
2326
2327 #endif
2328
2329 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2330
2331 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2332 {
2333         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2334         struct vmx_msr_entry *msr;
2335         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2336                 | ((uint64_t)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2337
2338 #ifdef LITEVM_DEBUG
2339         if (guest_cpl() != 0) {
2340                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2341                 inject_gp(vcpu);
2342                 return 1;
2343         }
2344 #endif
2345
2346         switch (ecx) {
2347 #ifdef __x86_64__
2348         case MSR_FS_BASE:
2349                 vmcs_writel(GUEST_FS_BASE, data);
2350                 break;
2351         case MSR_GS_BASE:
2352                 vmcs_writel(GUEST_GS_BASE, data);
2353                 break;
2354 #endif
2355         case MSR_IA32_SYSENTER_CS:
2356                 vmcs_write32(GUEST_SYSENTER_CS, data);
2357                 break;
2358         case MSR_IA32_SYSENTER_EIP:
2359                 vmcs_write32(GUEST_SYSENTER_EIP, data);
2360                 break;
2361         case MSR_IA32_SYSENTER_ESP:
2362                 vmcs_write32(GUEST_SYSENTER_ESP, data);
2363                 break;
2364 #ifdef __x86_64__
2365         case MSR_EFER:
2366                 set_efer(vcpu, data);
2367                 return 1;
2368         case MSR_IA32_MC0_STATUS:
2369                 printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
2370                             , __FUNCTION__, data);
2371                 break;
2372 #endif
2373         case MSR_IA32_TIME_STAMP_COUNTER: {
2374                 uint64_t tsc;
2375                 
2376                 tsc = read_tsc();
2377                 vmcs_write64(TSC_OFFSET, data - tsc);
2378                 break;
2379         }
2380         case MSR_IA32_UCODE_REV:
2381         case MSR_IA32_UCODE_WRITE:
2382         case 0x200 ... 0x2ff: /* MTRRs */
2383                 break;
2384         case MSR_IA32_APICBASE:
2385                 vcpu->apic_base = data;
2386                 break;
2387         default:
2388                 msr = find_msr_entry(vcpu, ecx);
2389                 if (msr) {
2390                         msr->data = data;
2391                         break;
2392                 }
2393                 printk("litevm: unhandled wrmsr: %x\n", ecx);
2394                 inject_gp(vcpu);
2395                 return 1;
2396         }
2397         skip_emulated_instruction(vcpu);
2398         return 1;
2399 }
2400
2401 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2402                                    struct litevm_run *litevm_run)
2403 {
2404         /* Turn off interrupt window reporting. */
2405         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2406                      vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2407                      & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2408         return 1;
2409 }
2410
2411 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2412 {
2413         skip_emulated_instruction(vcpu);
2414         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
2415                 return 1;
2416
2417         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2418         return 0;
2419 }
2420
2421 /*
2422  * The exit handlers return 1 if the exit was handled fully and guest execution
2423  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2424  * to be done to userspace and return 0.
2425  */
2426 static int (*litevm_vmx_exit_handlers[])(struct litevm_vcpu *vcpu,
2427                                       struct litevm_run *litevm_run) = {
2428         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
2429         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
2430         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
2431         [EXIT_REASON_INVLPG]                  = handle_invlpg,
2432         [EXIT_REASON_CR_ACCESS]               = handle_cr,
2433         [EXIT_REASON_DR_ACCESS]               = handle_dr,
2434         [EXIT_REASON_CPUID]                   = handle_cpuid,
2435         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
2436         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
2437         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
2438         [EXIT_REASON_HLT]                     = handle_halt,
2439 };
2440
2441 static const int litevm_vmx_max_exit_handlers =
2442         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2443
2444 /*
2445  * The guest has exited.  See if we can fix it or if we need userspace
2446  * assistance.
2447  */
2448 static int litevm_handle_exit(struct litevm_run *litevm_run, struct litevm_vcpu *vcpu)
2449 {
2450         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2451         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2452
2453         if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
2454                                 exit_reason != EXIT_REASON_EXCEPTION_NMI )
2455                 printk("%s: unexpected, valid vectoring info and "
2456                        "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2457         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2458         if (exit_reason < litevm_vmx_max_exit_handlers
2459             && litevm_vmx_exit_handlers[exit_reason])
2460                 return litevm_vmx_exit_handlers[exit_reason](vcpu, litevm_run);
2461         else {
2462                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2463                 litevm_run->hw.hardware_exit_reason = exit_reason;
2464         }
2465         return 0;
2466 }
2467
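/*
 * inject_rmode_irq() - deliver an external interrupt to a real-mode
 * guest by hand: push FLAGS, CS and IP onto the guest stack, clear
 * IF/TF/AC, and load CS:IP from the interrupt-vector-table entry at
 * irq * 4.
 */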
2468 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2469 {
2470         uint16_t ent[2];
2471         uint16_t cs;
2472         uint16_t ip;
2473         unsigned long flags;
2474         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2475         uint16_t sp = vmcs_readl(GUEST_RSP);
2476         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2477
2478         if (sp > ss_limit || sp < 6) {
2479                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2480                             __FUNCTION__,
2481                             vmcs_readl(GUEST_RSP),
2482                             vmcs_readl(GUEST_SS_BASE),
2483                             vmcs_read32(GUEST_SS_LIMIT));
2484                 return;
2485         }
2486
2487         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2488                                                                 sizeof(ent)) {
2489                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2490                 return;
2491         }
2492
2493         flags = vmcs_readl(GUEST_RFLAGS);
2494         cs = vmcs_readl(GUEST_CS_BASE) >> 4;
2495         ip = vmcs_readl(GUEST_RIP);
2496
2497
2498         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2499             litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2500             litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2501                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2502                 return;
2503         }
2504
2505         vmcs_writel(GUEST_RFLAGS, flags &
2506                     ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2507         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2508         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2509         vmcs_writel(GUEST_RIP, ent[0]);
2510         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2511 }
2512
2513 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2514 {
2515 #warning "fix me; needs ffs and talk to barret about bit ops"
2516 #if 0
2517         int word_index = __ffs(vcpu->irq_summary);
2518         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2519         int irq = word_index * BITS_PER_LONG + bit_index;
2520
2521         clear_bit(bit_index, &vcpu->irq_pending[word_index]);
2522         if (!vcpu->irq_pending[word_index])
2523                 clear_bit(word_index, &vcpu->irq_summary);
2524
2525         if (vcpu->rmode.active) {
2526                 inject_rmode_irq(vcpu, irq);
2527                 return;
2528         }
2529         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2530                         irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2531 #endif
2532 }
2533
2534 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2535 {
2536         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2537             && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2538                 /*
2539                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2540                  */
2541                 litevm_do_inject_irq(vcpu);
2542         else
2543                 /*
2544                  * Interrupts blocked.  Wait for unblock.
2545                  */
2546                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2547                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2548                              | CPU_BASED_VIRTUAL_INTR_PENDING);
2549 }
2550
2551 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2552 {
2553         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2554
2555 #warning "no debugging guests yet"
2556         assert(0);
2557 /*
2558         set_debugreg(dbg->bp[0], 0);
2559         set_debugreg(dbg->bp[1], 1);
2560         set_debugreg(dbg->bp[2], 2);
2561         set_debugreg(dbg->bp[3], 3);
2562 */
2563         if (dbg->singlestep) {
2564                 unsigned long flags;
2565
2566                 flags = vmcs_readl(GUEST_RFLAGS);
2567                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2568                 vmcs_writel(GUEST_RFLAGS, flags);
2569         }
2570 }
2571
2572 static void load_msrs(struct vmx_msr_entry *e, int n)
2573 {
2574         int i;
2575
2576         for (i = 0; i < n; ++i)
2577                 write_msr(e[i].index, e[i].data);
2578 }
2579
2580 static void save_msrs(struct vmx_msr_entry *e, int n)
2581 {
2582         int i;
2583
2584         for (i = 0; i < n; ++i)
2585                 e[i].data = read_msr(e[i].index);
2586 }
2587
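/*
 * litevm_dev_ioctl_run() - the main vcpu run loop.  Per iteration it
 * saves the host fs/gs/ldt selectors (zeroing the VMCS copies when they
 * cannot legally be held there), injects a pending interrupt if the
 * guest can take one, swaps FPU and MSR state, vmlaunches/vmresumes,
 * and dispatches the exit.  Exits handled in the kernel loop back to
 * 'again'; anything else returns with litevm_run filled in.  A
 * userspace driver would typically: ioctl(LITEVM_RUN), inspect
 * exit_reason, satisfy any io/mmio request, set mmio_completed, and run
 * again.
 */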
2588 static int litevm_dev_ioctl_run(struct litevm *litevm, struct litevm_run *litevm_run)
2589 {
2590         struct litevm_vcpu *vcpu;
2591         uint8_t fail;
2592         uint16_t fs_sel, gs_sel, ldt_sel;
2593         int fs_gs_ldt_reload_needed;
2594
2595         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2596                 return -EINVAL;
2597
2598         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2599         if (!vcpu)
2600                 return -ENOENT;
2601
2602         if (litevm_run->emulated) {
2603                 skip_emulated_instruction(vcpu);
2604                 litevm_run->emulated = 0;
2605         }
2606
2607         if (litevm_run->mmio_completed) {
2608                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
2609                 vcpu->mmio_read_completed = 1;
2610         }
2611
2612         vcpu->mmio_needed = 0;
2613
2614 again:
2615         /*
2616          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2617          * allow segment selectors with cpl > 0 or ti == 1.
2618          */
2619         fs_sel = read_fs();
2620         gs_sel = read_gs();
2621         ldt_sel = read_ldt();
2622         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
2623         if (!fs_gs_ldt_reload_needed) {
2624                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2625                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2626         } else {
2627                 vmcs_write16(HOST_FS_SELECTOR, 0);
2628                 vmcs_write16(HOST_GS_SELECTOR, 0);
2629         }
2630
2631 #ifdef __x86_64__
2632         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2633         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2634 #endif
2635
2636         if (vcpu->irq_summary &&
2637             !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2638                 litevm_try_inject_irq(vcpu);
2639
2640         if (vcpu->guest_debug.enabled)
2641                 litevm_guest_debug_pre(vcpu);
2642
2643         fx_save(vcpu->host_fx_image);
2644         fx_restore(vcpu->guest_fx_image);
2645
2646         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
2647         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2648
2649         asm (
2650                 /* Store host registers */
2651                 "pushf \n\t"
2652 #ifdef __x86_64__
2653                 "push %%rax; push %%rbx; push %%rdx;"
2654                 "push %%rsi; push %%rdi; push %%rbp;"
2655                 "push %%r8;  push %%r9;  push %%r10; push %%r11;"
2656                 "push %%r12; push %%r13; push %%r14; push %%r15;"
2657                 "push %%rcx \n\t"
2658                 "vmwrite %%rsp, %2 \n\t"
2659 #else
2660                 "pusha; push %%ecx \n\t"
2661                 "vmwrite %%esp, %2 \n\t"
2662 #endif
2663                 /* Check if vmlaunch or vmresume is needed */
2664                 "cmp $0, %1 \n\t"
2665                 /* Load guest registers.  Don't clobber flags. */
2666 #ifdef __x86_64__
2667                 "mov %c[cr2](%3), %%rax \n\t"
2668                 "mov %%rax, %%cr2 \n\t"
2669                 "mov %c[rax](%3), %%rax \n\t"
2670                 "mov %c[rbx](%3), %%rbx \n\t"
2671                 "mov %c[rdx](%3), %%rdx \n\t"
2672                 "mov %c[rsi](%3), %%rsi \n\t"
2673                 "mov %c[rdi](%3), %%rdi \n\t"
2674                 "mov %c[rbp](%3), %%rbp \n\t"
2675                 "mov %c[r8](%3),  %%r8  \n\t"
2676                 "mov %c[r9](%3),  %%r9  \n\t"
2677                 "mov %c[r10](%3), %%r10 \n\t"
2678                 "mov %c[r11](%3), %%r11 \n\t"
2679                 "mov %c[r12](%3), %%r12 \n\t"
2680                 "mov %c[r13](%3), %%r13 \n\t"
2681                 "mov %c[r14](%3), %%r14 \n\t"
2682                 "mov %c[r15](%3), %%r15 \n\t"
2683                 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
2684 #else
2685                 "mov %c[cr2](%3), %%eax \n\t"
2686                 "mov %%eax,   %%cr2 \n\t"
2687                 "mov %c[rax](%3), %%eax \n\t"
2688                 "mov %c[rbx](%3), %%ebx \n\t"
2689                 "mov %c[rdx](%3), %%edx \n\t"
2690                 "mov %c[rsi](%3), %%esi \n\t"
2691                 "mov %c[rdi](%3), %%edi \n\t"
2692                 "mov %c[rbp](%3), %%ebp \n\t"
2693                 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
2694 #endif
2695                 /* Enter guest mode */
2696                 "jne launched \n\t"
2697                 "vmlaunch \n\t"
2698                 "jmp litevm_vmx_return \n\t"
2699                 "launched: vmresume \n\t"
2700                 ".globl litevm_vmx_return \n\t"
2701                 "litevm_vmx_return: "
2702                 /* Save guest registers, load host registers, keep flags */
2703 #ifdef __x86_64__
2704                 "xchg %3,     0(%%rsp) \n\t"
2705                 "mov %%rax, %c[rax](%3) \n\t"
2706                 "mov %%rbx, %c[rbx](%3) \n\t"
2707                 "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
2708                 "mov %%rdx, %c[rdx](%3) \n\t"
2709                 "mov %%rsi, %c[rsi](%3) \n\t"
2710                 "mov %%rdi, %c[rdi](%3) \n\t"
2711                 "mov %%rbp, %c[rbp](%3) \n\t"
2712                 "mov %%r8,  %c[r8](%3) \n\t"
2713                 "mov %%r9,  %c[r9](%3) \n\t"
2714                 "mov %%r10, %c[r10](%3) \n\t"
2715                 "mov %%r11, %c[r11](%3) \n\t"
2716                 "mov %%r12, %c[r12](%3) \n\t"
2717                 "mov %%r13, %c[r13](%3) \n\t"
2718                 "mov %%r14, %c[r14](%3) \n\t"
2719                 "mov %%r15, %c[r15](%3) \n\t"
2720                 "mov %%cr2, %%rax   \n\t"
2721                 "mov %%rax, %c[cr2](%3) \n\t"
2722                 "mov 0(%%rsp), %3 \n\t"
2723
2724                 "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
2725                 "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
2726                 "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
2727                 "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
2728 #else
2729                 "xchg %3, 0(%%esp) \n\t"
2730                 "mov %%eax, %c[rax](%3) \n\t"
2731                 "mov %%ebx, %c[rbx](%3) \n\t"
2732                 "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
2733                 "mov %%edx, %c[rdx](%3) \n\t"
2734                 "mov %%esi, %c[rsi](%3) \n\t"
2735                 "mov %%edi, %c[rdi](%3) \n\t"
2736                 "mov %%ebp, %c[rbp](%3) \n\t"
2737                 "mov %%cr2, %%eax  \n\t"
2738                 "mov %%eax, %c[cr2](%3) \n\t"
2739                 "mov 0(%%esp), %3 \n\t"
2740
2741                 "pop %%ecx; popa \n\t"
2742 #endif
2743                 "setbe %0 \n\t"
2744                 "popf \n\t"
2745               : "=g" (fail)
2746               : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
2747                 "c"(vcpu),
2748                 [rax]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
2749                 [rbx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
2750                 [rcx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
2751                 [rdx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
2752                 [rsi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
2753                 [rdi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
2754                 [rbp]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
2755 #ifdef __x86_64__
2756                 [r8 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8 ])),
2757                 [r9 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9 ])),
2758                 [r10]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
2759                 [r11]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
2760                 [r12]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
2761                 [r13]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
2762                 [r14]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
2763                 [r15]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
2764 #endif
2765                 [cr2]"i"(offsetof(struct litevm_vcpu, cr2))
2766               : "cc", "memory" );
2767
2768         ++litevm_stat.exits;
2769
2770         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2771         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
2772
2773         fx_save(vcpu->guest_fx_image);
2774         fx_restore(vcpu->host_fx_image);
2775
2776 #ifndef __x86_64__
2777         asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2778 #endif
2779
2780         litevm_run->exit_type = 0;
2781         if (fail) {
2782                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
2783                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
2784         } else {
2785                 if (fs_gs_ldt_reload_needed) {
2786                         load_ldt(ldt_sel);
2787                         load_fs(fs_sel);
2788                         /*
2789                          * If we have to reload gs, we must take care to
2790                          * preserve our gs base.
2791                          */
2792                         disable_irq();
2793                         load_gs(gs_sel);
2794 #ifdef __x86_64__
2795                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
2796 #endif
2797                         enable_irq();
2798
2799                         reload_tss();
2800                 }
2801                 vcpu->launched = 1;
2802                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
2803                 if (litevm_handle_exit(litevm_run, vcpu)) {
2804                         /* Give the scheduler a chance to reschedule. */
2805                         vcpu_put(vcpu);
2806 #warning "how to tell if signal is pending"
2807 /*
2808                         if (signal_pending(current)) {
2809                                 ++litevm_stat.signal_exits;
2810                                 return -EINTR;
2811                         }
2812 */
2813                         kthread_yield();
2814                         /* Cannot fail -  no vcpu unplug yet. */
2815                         vcpu_load(litevm, vcpu_slot(vcpu));
2816                         goto again;
2817                 }
2818         }
2819
2820         vcpu_put(vcpu);
2821         return 0;
2822 }
2823
2824 static int litevm_dev_ioctl_get_regs(struct litevm *litevm, struct litevm_regs *regs)
2825 {
2826         struct litevm_vcpu *vcpu;
2827
2828         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
2829                 return -EINVAL;
2830
2831         vcpu = vcpu_load(litevm, regs->vcpu);
2832         if (!vcpu)
2833                 return -ENOENT;
2834
2835         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2836         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2837         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2838         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2839         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2840         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2841         regs->rsp = vmcs_readl(GUEST_RSP);
2842         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2843 #ifdef __x86_64__
2844         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2845         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2846         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2847         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2848         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2849         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2850         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2851         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2852 #endif
2853
2854         regs->rip = vmcs_readl(GUEST_RIP);
2855         regs->rflags = vmcs_readl(GUEST_RFLAGS);
2856
2857         /*
2858          * Don't leak debug flags in case they were set for guest debugging
2859          */
2860         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2861                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2862
2863         vcpu_put(vcpu);
2864
2865         return 0;
2866 }
2867
2868 static int litevm_dev_ioctl_set_regs(struct litevm *litevm, struct litevm_regs *regs)
2869 {
2870         struct litevm_vcpu *vcpu;
2871
2872         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
2873                 return -EINVAL;
2874
2875         vcpu = vcpu_load(litevm, regs->vcpu);
2876         if (!vcpu)
2877                 return -ENOENT;
2878
2879         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2880         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2881         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2882         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2883         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2884         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2885         vmcs_writel(GUEST_RSP, regs->rsp);
2886         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2887 #ifdef __x86_64__
2888         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2889         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2890         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2891         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2892         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2893         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2894         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2895         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2896 #endif
2897
2898         vmcs_writel(GUEST_RIP, regs->rip);
2899         vmcs_writel(GUEST_RFLAGS, regs->rflags);
2900
2901         vcpu_put(vcpu);
2902
2903         return 0;
2904 }
2905
2906 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
2907 {
2908         struct litevm_vcpu *vcpu;
2909
2910         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
2911                 return -EINVAL;
2912         vcpu = vcpu_load(litevm, sregs->vcpu);
2913         if (!vcpu)
2914                 return -ENOENT;
2915
2916 #define get_segment(var, seg) \
2917         do { \
2918                 uint32_t ar; \
2919                 \
2920                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
2921                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
2922                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
2923                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
2924                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
2925                 sregs->var.type = ar & 15; \
2926                 sregs->var.s = (ar >> 4) & 1; \
2927                 sregs->var.dpl = (ar >> 5) & 3; \
2928                 sregs->var.present = (ar >> 7) & 1; \
2929                 sregs->var.avl = (ar >> 12) & 1; \
2930                 sregs->var.l = (ar >> 13) & 1; \
2931                 sregs->var.db = (ar >> 14) & 1; \
2932                 sregs->var.g = (ar >> 15) & 1; \
2933                 sregs->var.unusable = (ar >> 16) & 1; \
2934         } while (0)
2935
2936         get_segment(cs, CS);
2937         get_segment(ds, DS);
2938         get_segment(es, ES);
2939         get_segment(fs, FS);
2940         get_segment(gs, GS);
2941         get_segment(ss, SS);
2942
2943         get_segment(tr, TR);
2944         get_segment(ldt, LDTR);
2945 #undef get_segment
2946
2947 #define get_dtable(var, table) \
2948         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
2949                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
2950
2951         get_dtable(idt, IDTR);
2952         get_dtable(gdt, GDTR);
2953 #undef get_dtable
2954
2955         sregs->cr0 = guest_cr0();
2956         sregs->cr2 = vcpu->cr2;
2957         sregs->cr3 = vcpu->cr3;
2958         sregs->cr4 = guest_cr4();
2959         sregs->cr8 = vcpu->cr8;
2960         sregs->efer = vcpu->shadow_efer;
2961         sregs->apic_base = vcpu->apic_base;
2962
2963         sregs->pending_int = vcpu->irq_summary != 0;
2964
2965         vcpu_put(vcpu);
2966
2967         return 0;
2968 }
2969
2970 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
2971 {
2972         struct litevm_vcpu *vcpu;
2973         int mmu_reset_needed = 0;
2974
2975         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
2976                 return -EINVAL;
2977         vcpu = vcpu_load(litevm, sregs->vcpu);
2978         if (!vcpu)
2979                 return -ENOENT;
2980
2981 #define set_segment(var, seg) \
2982         do { \
2983                 uint32_t ar; \
2984                 \
2985                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
2986                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
2987                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
2988                 if (sregs->var.unusable) { \
2989                         ar = (1 << 16); \
2990                 } else { \
2991                         ar = (sregs->var.type & 15); \
2992                         ar |= (sregs->var.s & 1) << 4; \
2993                         ar |= (sregs->var.dpl & 3) << 5; \
2994                         ar |= (sregs->var.present & 1) << 7; \
2995                         ar |= (sregs->var.avl & 1) << 12; \
2996                         ar |= (sregs->var.l & 1) << 13; \
2997                         ar |= (sregs->var.db & 1) << 14; \
2998                         ar |= (sregs->var.g & 1) << 15; \
2999                 } \
3000                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
3001         } while (0)
3002
3003         set_segment(cs, CS);
3004         set_segment(ds, DS);
3005         set_segment(es, ES);
3006         set_segment(fs, FS);
3007         set_segment(gs, GS);
3008         set_segment(ss, SS);
3009
3010         set_segment(tr, TR);
3011
3012         set_segment(ldt, LDTR);
3013 #undef set_segment
3014
3015 #define set_dtable(var, table) \
3016         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
3017         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
3018
3019         set_dtable(idt, IDTR);
3020         set_dtable(gdt, GDTR);
3021 #undef set_dtable
3022
3023         vcpu->cr2 = sregs->cr2;
3024         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
3025         vcpu->cr3 = sregs->cr3;
3026
3027         vcpu->cr8 = sregs->cr8;
3028
3029         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3030 #ifdef __x86_64__
3031         __set_efer(vcpu, sregs->efer);
3032 #endif
3033         vcpu->apic_base = sregs->apic_base;
3034
3035         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3036         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3037         update_exception_bitmap(vcpu);
3038         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3039         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3040
3041         mmu_reset_needed |=  guest_cr4() != sregs->cr4;
3042         __set_cr4(vcpu, sregs->cr4);
3043
3044         if (mmu_reset_needed)
3045                 litevm_mmu_reset_context(vcpu);
3046         vcpu_put(vcpu);
3047
3048         return 0;
3049 }
3050
3051 /*
3052  * Translate a guest virtual address to a guest physical address.
3053  */
3054 static int litevm_dev_ioctl_translate(struct litevm *litevm, struct litevm_translation *tr)
3055 {
3056         unsigned long vaddr = tr->linear_address;
3057         struct litevm_vcpu *vcpu;
3058         gpa_t gpa;
3059
3060         vcpu = vcpu_load(litevm, tr->vcpu);
3061         if (!vcpu)
3062                 return -ENOENT;
3063         spin_lock(&litevm->lock);
3064         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3065         tr->physical_address = gpa;
3066         tr->valid = gpa != UNMAPPED_GVA;
3067         tr->writeable = 1;
3068         tr->usermode = 0;
3069         spin_unlock(&litevm->lock);
3070         vcpu_put(vcpu);
3071
3072         return 0;
3073 }
3074
3075 #if 0
3076 static int litevm_dev_ioctl_interrupt(struct litevm *litevm, struct litevm_interrupt *irq)
3077 {
3078         struct litevm_vcpu *vcpu;
3079
3080         if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
3081                 return -EINVAL;
3082         if (irq->irq < 0 || irq->irq >= 256)
3083                 return -EINVAL;
3084         vcpu = vcpu_load(litevm, irq->vcpu);
3085         if (!vcpu)
3086                 return -ENOENT;
3087
3088         set_bit(irq->irq, vcpu->irq_pending);
3089         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
3090
3091         vcpu_put(vcpu);
3092
3093         return 0;
3094 }
3095 #endif
3096
3097 #if 0
3098 static int litevm_dev_ioctl_debug_guest(struct litevm *litevm,
3099                                      struct litevm_debug_guest *dbg)
3100 {
3101         struct litevm_vcpu *vcpu;
3102         unsigned long dr7 = 0x400;
3103         uint32_t exception_bitmap;
3104         int old_singlestep;
3105
3106         if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS)
3107                 return -EINVAL;
3108         vcpu = vcpu_load(litevm, dbg->vcpu);
3109         if (!vcpu)
3110                 return -ENOENT;
3111
3112         exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
3113         old_singlestep = vcpu->guest_debug.singlestep;
3114
3115         vcpu->guest_debug.enabled = dbg->enabled;
3116         if (vcpu->guest_debug.enabled) {
3117                 int i;
3118
3119                 dr7 |= 0x200;  /* exact */
3120                 for (i = 0; i < 4; ++i) {
3121                         if (!dbg->breakpoints[i].enabled)
3122                                 continue;
3123                         vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
3124                         dr7 |= 2 << (i*2);    /* global enable */
3125                         dr7 |= 0 << (i*4+16); /* execution breakpoint */
3126                 }
3127
3128                 exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
3129
3130                 vcpu->guest_debug.singlestep = dbg->singlestep;
3131         } else {
3132                 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
3133                 vcpu->guest_debug.singlestep = 0;
3134         }
3135
	if (old_singlestep && !vcpu->guest_debug.singlestep) {
		unsigned long flags;

		flags = vmcs_readl(GUEST_RFLAGS);
		flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
		vmcs_writel(GUEST_RFLAGS, flags);
	}

	vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
	vmcs_writel(GUEST_DR7, dr7);

	vcpu_put(vcpu);

	return 0;
}
#endif

#if 0
static long litevm_dev_ioctl(struct file *filp,
			     unsigned int ioctl, unsigned long arg)
{
	struct litevm *litevm = filp->private_data;
	int r = -EINVAL;

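	/* Every command follows the same pattern: copy the argument struct in
	 * from user space, dispatch to the handler, and (for query commands)
	 * copy the possibly updated struct back out. */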
	switch (ioctl) {
	case LITEVM_CREATE_VCPU: {
		r = litevm_dev_ioctl_create_vcpu(litevm, arg);
		if (r)
			goto out;
		break;
	}
	case LITEVM_RUN: {
		struct litevm_run litevm_run;

		r = -EFAULT;
		if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run))
			goto out;
		r = litevm_dev_ioctl_run(litevm, &litevm_run);
		if (r < 0)
			goto out;
		r = -EFAULT;
		if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run))
			goto out;
		r = 0;
		break;
	}
	case LITEVM_GET_REGS: {
		struct litevm_regs litevm_regs;

		r = -EFAULT;
		if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
			goto out;
		r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs))
			goto out;
		r = 0;
		break;
	}
	case LITEVM_SET_REGS: {
		struct litevm_regs litevm_regs;

		r = -EFAULT;
		if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
			goto out;
		r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case LITEVM_GET_SREGS: {
		struct litevm_sregs litevm_sregs;

		r = -EFAULT;
		if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
			goto out;
		r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user((void *)arg, &litevm_sregs, sizeof litevm_sregs))
			goto out;
		r = 0;
		break;
	}
	case LITEVM_SET_SREGS: {
		struct litevm_sregs litevm_sregs;

		r = -EFAULT;
		if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
			goto out;
		r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case LITEVM_TRANSLATE: {
		struct litevm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, (void *)arg, sizeof tr))
			goto out;
		r = litevm_dev_ioctl_translate(litevm, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user((void *)arg, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case LITEVM_INTERRUPT: {
		struct litevm_interrupt irq;

		r = -EFAULT;
		if (copy_from_user(&irq, (void *)arg, sizeof irq))
			goto out;
		r = litevm_dev_ioctl_interrupt(litevm, &irq);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case LITEVM_DEBUG_GUEST: {
		struct litevm_debug_guest dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
			goto out;
		r = litevm_dev_ioctl_debug_guest(litevm, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case LITEVM_SET_MEMORY_REGION: {
		struct litevm_memory_region litevm_mem;

		r = -EFAULT;
		if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem))
			goto out;
		r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem);
		if (r)
			goto out;
		break;
	}
	case LITEVM_GET_DIRTY_LOG: {
		struct litevm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, (void *)arg, sizeof log))
			goto out;
		r = litevm_dev_ioctl_get_dirty_log(litevm, &log);
		if (r)
			goto out;
		break;
	}
	default:
		;
	}
out:
	return r;
}
#endif

#if 0
static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct litevm *litevm = vma->vm_file->private_data;
	struct litevm_memory_slot *slot;
	struct page *page;

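	/* vmf->pgoff is the faulting guest frame number: look up the memory
	 * slot that backs it and hand the corresponding host page back to
	 * the fault handler. */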
	slot = gfn_to_memslot(litevm, vmf->pgoff);
	if (!slot)
		return VM_FAULT_SIGBUS;
	page = gfn_to_page(slot, vmf->pgoff);
	if (!page)
		return VM_FAULT_SIGBUS;

	get_page(page);
	vmf->page = page;
	return 0;
}
#endif

#if 0
static int litevm_reboot(struct notifier_block *notifier, unsigned long val,
			 void *v)
{
	panic("litevm_reboot");
	if (val == SYS_RESTART) {
		/*
		 * Some BIOSes hang on reboot if the CPU is left in vmx root
		 * mode, so drop out of it before restarting.
		 */
		printk("litevm: exiting vmx mode\n");
		smp_call_function_all(litevm_disable, 0, smp_call_wait);
	}
	return NOTIFY_OK;
}
#endif

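/*
 * Physical address of a zeroed page handed out for guest frames that are not
 * backed by any registered memory slot.
 */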
hpa_t bad_page_address;

int litevm_init(void)
{
	static struct page *bad_page;
	int r = 0;

	if (!cpu_has_litevm_support()) {
		printk("litevm: no hardware support\n");
		return -EOPNOTSUPP;
	}
	if (vmx_disabled_by_bios()) {
		printk("litevm: disabled by bios\n");
		return -EOPNOTSUPP;
	}

	setup_vmcs_descriptor();
	r = alloc_litevm_area();
	if (r)
		goto out;
	smp_call_function_all(litevm_enable, 0, smp_call_wait);
	if ((bad_page = kpage_zalloc_addr()) == NULL) {
		r = -ENOMEM;
		goto out_free;
	}

	bad_page_address = page2ppn(bad_page) << PAGE_SHIFT;
	memset(page2kva(bad_page), 0, PAGE_SIZE);

	return r;

out_free:
#warning "free_litevm_area() not implemented; do we need it here?"
//	free_litevm_area();
out:
	return r;
}

static void litevm_exit(void)
{
	//free_litevm_area();
	//__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
}
