Add flags that VMs need; many other things need them as well.
[akaros.git] / kern / arch / x86 / vm.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #include <kmalloc.h>
17 #include <string.h>
18 #include <stdio.h>
19 #include <assert.h>
20 #include <error.h>
21 #include <pmap.h>
22 #include <sys/queue.h>
23 #include <smp.h>
24 #include <kref.h>
25 #include <atomic.h>
26 #include <alarm.h>
27 #include <event.h>
28 #include <umem.h>
29 #include <devalarm.h>
30 #include <arch/types.h>
31 #include <arch/vm.h>
32 #include <arch/emulate.h>
33 #include <arch/vmdebug.h>
34 #include <arch/msr-index.h>
35
36 struct litevm_stat litevm_stat;
37
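/*
 * Name/counter pairs exposing the litevm_stat fields (debugfs-style),
 * terminated by a null entry.
 */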
38 static struct litevm_stats_debugfs_item {
39         const char *name;
40         uint32_t *data;
41 } debugfs_entries[] = {
42         { "pf_fixed", &litevm_stat.pf_fixed },
43         { "pf_guest", &litevm_stat.pf_guest },
44         { "tlb_flush", &litevm_stat.tlb_flush },
45         { "invlpg", &litevm_stat.invlpg },
46         { "exits", &litevm_stat.exits },
47         { "io_exits", &litevm_stat.io_exits },
48         { "mmio_exits", &litevm_stat.mmio_exits },
49         { "signal_exits", &litevm_stat.signal_exits },
50         { "irq_exits", &litevm_stat.irq_exits },
51         { 0, 0 }
52 };
53
54 static struct dentry *debugfs_dir;
55
56 static const uint32_t vmx_msr_index[] = {
57 #ifdef __x86_64__
58         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
59 #endif
60         MSR_EFER, // wtf? MSR_K6_STAR,
61 };
62 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
63
64 #ifdef __x86_64__
65 /*
66  * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
67  * mechanism (CPU bug AA24).
68  */
69 #define NR_BAD_MSRS 2
70 #else
71 #define NR_BAD_MSRS 0
72 #endif
73
74 #define TSS_IOPB_BASE_OFFSET 0x66
75 #define TSS_BASE_SIZE 0x68
76 #define TSS_IOPB_SIZE (65536 / 8)
77 #define TSS_REDIRECTION_SIZE (256 / 8)
78 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
79
80 #define MSR_IA32_VMX_BASIC_MSR                  0x480
81 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
82 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
83 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
84 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
85
86 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
87 #define LMSW_GUEST_MASK 0x0eULL
88 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
89 //#define CR4_VMXE 0x2000
90 #define CR8_RESEVED_BITS (~0x0fULL)
91 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
92
93 #ifdef __x86_64__
94 #define HOST_IS_64 1
95 #else
96 #define HOST_IS_64 0
97 #endif
98
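/* Return the guest save slot for @msr, or 0 if this vcpu does not shadow it. */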
99 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu, uint32_t msr)
100 {
101         int i;
102
103         for (i = 0; i < vcpu->nmsrs; ++i)
104                 if (vcpu->guest_msrs[i].index == msr)
105                         return &vcpu->guest_msrs[i];
106         return 0;
107 }
108
109 struct descriptor_table {
110         uint16_t limit;
111         unsigned long base;
112 } __attribute__((packed));
113
114 static void get_gdt(struct descriptor_table *table)
115 {
116         asm ("sgdt %0" : "=m"(*table));
117 }
118
119 static void get_idt(struct descriptor_table *table)
120 {
121         asm ("sidt %0" : "=m"(*table));
122 }
123
124 static uint16_t read_fs(void)
125 {
126         uint16_t seg;
127         asm ("mov %%fs, %0" : "=g"(seg));
128         return seg;
129 }
130
131 static uint16_t read_gs(void)
132 {
133         uint16_t seg;
134         asm ("mov %%gs, %0" : "=g"(seg));
135         return seg;
136 }
137
138 static uint16_t read_ldt(void)
139 {
140         uint16_t ldt;
141         asm ("sldt %0" : "=g"(ldt));
142         return ldt;
143 }
144
145 static void load_fs(uint16_t sel)
146 {
147         asm ("mov %0, %%fs" : : "g"(sel));
148 }
149
150 static void load_gs(uint16_t sel)
151 {
152         asm ("mov %0, %%gs" : : "g"(sel));
153 }
154
155 #ifndef load_ldt
156 static void load_ldt(uint16_t sel)
157 {
158         asm ("lldt %0" : : "g"(sel));
159 }
160 #endif
161
162 static void fx_save(void *image)
163 {
164         asm ("fxsave (%0)":: "r" (image));
165 }
166
167 static void fx_restore(void *image)
168 {
169         asm ("fxrstor (%0)":: "r" (image));
170 }
171
172 static void fpu_init(void)
173 {
174         asm ("finit");
175 }
176
177 struct segment_descriptor {
178         uint16_t limit_low;
179         uint16_t base_low;
180         uint8_t  base_mid;
181         uint8_t  type : 4;
182         uint8_t  system : 1;
183         uint8_t  dpl : 2;
184         uint8_t  present : 1;
185         uint8_t  limit_high : 4;
186         uint8_t  avl : 1;
187         uint8_t  long_mode : 1;
188         uint8_t  default_op : 1;
189         uint8_t  granularity : 1;
190         uint8_t  base_high;
191 } __attribute__((packed));
192
193 #ifdef __x86_64__
194 // LDT or TSS descriptor in the GDT. 16 bytes.
195 struct segment_descriptor_64 {
196         struct segment_descriptor s;
197         uint32_t base_higher;
198         uint32_t pad_zero;
199 };
200
201 #endif
202
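/*
 * Recover the linear base address hidden in a segment descriptor: walk the
 * GDT (or, for selectors with the TI bit set, the LDT) and glue the scattered
 * base fields back together.  On x86_64, system descriptors (LDT/TSS) are 16
 * bytes wide and carry an extra 32 bits of base address.
 */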
203 static unsigned long segment_base(uint16_t selector)
204 {
205         struct descriptor_table gdt;
206         struct segment_descriptor *d;
207         unsigned long table_base;
208         typedef unsigned long ul;
209         unsigned long v;
210
211         asm ("sgdt %0" : "=m"(gdt));
212         table_base = gdt.base;
213
214         if (selector & 4) {           /* from ldt */
215                 uint16_t ldt_selector;
216
217                 asm ("sldt %0" : "=g"(ldt_selector));
218                 table_base = segment_base(ldt_selector);
219         }
220         d = (struct segment_descriptor *)(table_base + (selector & ~7));
221         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
222 #ifdef __x86_64__
223         if (d->system == 0
224             && (d->type == 2 || d->type == 9 || d->type == 11))
225                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
226 #endif
227         return v;
228 }
229
230 static unsigned long read_tr_base(void)
231 {
232         uint16_t tr;
233         asm ("str %0" : "=g"(tr));
234         return segment_base(tr);
235 }
236
237 static void reload_tss(void)
238 {
239 #ifndef __x86_64__
240
241         /*
242          * VT restores TR but not its size.  Useless.
243          */
244         struct descriptor_table gdt;
245         struct segment_descriptor *descs;
246
247         get_gdt(&gdt);
248         descs = (void *)gdt.base;
249         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
250         load_TR_desc();
251 #endif
252 }
253
254 static struct vmcs_descriptor {
255         int size;
256         int order;
257         uint32_t revision_id;
258 } vmcs_descriptor;
259
260 #if 0
261 #ifdef __x86_64__
262 static unsigned long read_msr(unsigned long msr)
263 {
264         uint64_t value;
265
266         rdmsrl(msr, value);
267         return value;
268 }
269 #endif
270 #endif
271 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
272 {
273         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
274         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
275 }
276
277
278
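/*
 * Copy between a guest-virtual range and a kernel buffer one page at a time,
 * translating each page with gva_to_hpa().  Both helpers return the number of
 * bytes actually copied, which comes up short if an unmapped guest page is
 * hit.  (The kmap_atomic() mapping is still stubbed out in the read path; see
 * the #warning below.)
 */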
279 int litevm_read_guest(struct litevm_vcpu *vcpu,
280                              gva_t addr,
281                              unsigned long size,
282                              void *dest)
283 {
284         unsigned char *host_buf = dest;
285         unsigned long req_size = size;
286
287         while (size) {
288                 hpa_t paddr;
289                 unsigned now;
290                 unsigned offset;
291                 hva_t guest_buf;
292
293                 paddr = gva_to_hpa(vcpu, addr);
294
295                 if (is_error_hpa(paddr))
296                         break;
297 #warning "kmap_atomic"
298                 guest_buf = NULL; //(hva_t)kmap_atomic(
299                 //      pfn_to_page(paddr >> PAGE_SHIFT));
300                 offset = addr & ~PAGE_MASK;
301                 guest_buf |= offset;
302                 now = MIN(size, PAGE_SIZE - offset);
303                 memcpy(host_buf, (void*)guest_buf, now);
304                 host_buf += now;
305                 addr += now;
306                 size -= now;
307 #warning "kunmap_atomic"
308 //              kunmap_atomic((void *)(guest_buf & PAGE_MASK));
309         }
310         return req_size - size;
311 }
312
313 int litevm_write_guest(struct litevm_vcpu *vcpu,
314                              gva_t addr,
315                              unsigned long size,
316                              void *data)
317 {
318         unsigned char *host_buf = data;
319         unsigned long req_size = size;
320
321         while (size) {
322                 hpa_t paddr;
323                 unsigned now;
324                 unsigned offset;
325                 hva_t guest_buf;
326
327                 paddr = gva_to_hpa(vcpu, addr);
328
329                 if (is_error_hpa(paddr))
330                         break;
331
332                 guest_buf = (hva_t)kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT));
333                 offset = addr & ~PAGE_MASK;
334                 guest_buf |= offset;
335                 now = MIN(size, PAGE_SIZE - offset);
336                 memcpy((void*)guest_buf, host_buf, now);
337                 host_buf += now;
338                 addr += now;
339                 size -= now;
340                 //kunmap_atomic((void *)(guest_buf & PAGE_MASK));
341         }
342         return req_size - size;
343 }
344
345 static void setup_vmcs_descriptor(void)
346 {
347         uint64_t msr;
348
349         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
350         vmcs_descriptor.size = (msr>>32) & 0x1fff;
351         vmcs_descriptor.order = get_order(vmcs_descriptor.size);
352         vmcs_descriptor.revision_id = (uint32_t)msr;
353 }
354
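/*
 * VMCLEAR takes the *physical* address of the VMCS region; "setna" captures
 * the failure flags so a bad pointer can be reported.
 */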
355 static void vmcs_clear(struct vmcs *vmcs)
356 {
357 #warning "__pa"
358         uint64_t phys_addr = 0; //__pa(vmcs);
359         uint8_t error;
360
361         asm volatile ("vmclear %1; setna %0"
362                        : "=m"(error) : "m"(phys_addr) : "cc", "memory" );
363         if (error)
364                 printk("litevm: vmclear fail: %p/%llx\n",
365                        vmcs, phys_addr);
366 }
367
368 static void __vcpu_clear(void *arg)
369 {
370         struct litevm_vcpu *vcpu = arg;
371 #warning "smp_processor_id"
372         int cpu = 0 ; //smp_processor_id();
373
374         if (vcpu->cpu == cpu)
375                 vmcs_clear(vcpu->vmcs);
376 #warning "per cpu"
377 /*
378         if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
379         per_cpu(current_vmcs, cpu) = 0;*/
380 }
381
382 static int vcpu_slot(struct litevm_vcpu *vcpu)
383 {
384         return vcpu - vcpu->litevm->vcpus;
385 }
386
387 /*
388  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
389  * vcpu mutex is already taken.
390  */
391 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
392 {
393 #warning "__pa"
394         uint64_t phys_addr = 0; //__pa(vcpu->vmcs);
395         int cpu;
396         cpu = core_id();
397
398         if (vcpu->cpu != cpu) {
399 #warning "smp_call_function"
400 //              smp_call_function(__vcpu_clear, vcpu, 1);
401                 vcpu->launched = 0;
402         }
403 #warning "per cpu"
404         if (current->vmcs != vcpu->vmcs) {
405                 uint8_t error;
406
407                 current->vmcs = vcpu->vmcs;
408                 asm volatile ("vmptrld %1; setna %0"
409                                : "=m"(error) : "m"(phys_addr) : "cc" );
410                 if (error)
411                         printk("litevm: vmptrld %p/%llx fail\n",
412                                vcpu->vmcs, phys_addr);
413         }
414
415         if (vcpu->cpu != cpu) {
416                 struct descriptor_table dt;
417                 unsigned long sysenter_esp;
418
419                 vcpu->cpu = cpu;
420                 /*
421                  * Linux uses per-cpu TSS and GDT, so set these when switching
422                  * processors.
423                  */
424                 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
425                 get_gdt(&dt);
426                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
427
428                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
429                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
430         }
431         return vcpu;
432 }
433
434 /*
435  * Switches to specified vcpu, until a matching vcpu_put()
436  */
437 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
438 {
439         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
440
441         mutex_lock(&vcpu->mutex);
442         if (unlikely(!vcpu->vmcs)) {
443                 mutex_unlock(&vcpu->mutex);
444                 return 0;
445         }
446         return __vcpu_load(vcpu);
447 }
448
449 static void vcpu_put(struct litevm_vcpu *vcpu)
450 {
451         put_cpu();
452         mutex_unlock(&vcpu->mutex);
453 }
454
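/*
 * Typical pairing (sketch, for illustration only):
 *
 *      struct litevm_vcpu *vcpu = vcpu_load(litevm, slot);
 *      if (vcpu) {
 *              ... read or write VMCS state, e.g. vmcs_readl(GUEST_RIP) ...
 *              vcpu_put(vcpu);
 *      }
 *
 * vcpu_load() returns 0 if the requested slot has no VMCS yet.
 */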
455
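/*
 * A VMCS region must be allocated at the size reported by IA32_VMX_BASIC
 * (see setup_vmcs_descriptor()) and stamped with that MSR's revision id
 * before VMPTRLD will accept it.
 */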
456 static struct vmcs *alloc_vmcs_cpu(int cpu)
457 {
458         int node = cpu_to_node(cpu);
459         struct page *pages;
460         struct vmcs *vmcs;
461
462         pages = alloc_pages_node(node, KMALLOC_WAIT, vmcs_descriptor.order);
463         if (!pages)
464                 return 0;
465         vmcs = page_address(pages);
466         memset(vmcs, 0, vmcs_descriptor.size);
467         vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
468         return vmcs;
469 }
470
471 static struct vmcs *alloc_vmcs(void)
472 {
473         return alloc_vmcs_cpu(smp_processor_id());
474 }
475
476 static void free_vmcs(struct vmcs *vmcs)
477 {
478         free_pages((unsigned long)vmcs, vmcs_descriptor.order);
479 }
480
481 static __init int cpu_has_litevm_support(void)
482 {
483         unsigned long ecx = cpuid_ecx(1);
484         return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
485 }
486
487 static void free_litevm_area(void)
488 {
489         int cpu;
490
491 //      for_each_online_cpu(cpu)
492 //              free_vmcs(per_cpu(vmxarea, cpu));
493 }
494
495 static __init int alloc_litevm_area(void)
496 {
497         int cpu;
498
499         for_each_online_cpu(cpu) {
500                 struct vmcs *vmcs;
501
502                 vmcs = alloc_vmcs_cpu(cpu);
503                 if (!vmcs) {
504                         free_litevm_area();
505                         return -ENOMEM;
506                 }
507
508                 per_cpu(vmxarea, cpu) = vmcs;
509         }
510         return 0;
511 }
512
513 static __init int vmx_disabled_by_bios(void)
514 {
515         uint64_t msr;
516
517         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
518         return (msr & 5) == 1; /* locked but not enabled */
519 }
520
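/*
 * Per-cpu VMX enable: set the lock + VMX-enable bits in IA32_FEATURE_CONTROL
 * if the BIOS left them clear, turn on CR4.VMXE, then VMXON this cpu's
 * vmxarea region.
 */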
521 static __init void litevm_enable(void *garbage)
522 {
523         int cpu = raw_smp_processor_id();
524         uint64_t phys_addr = __pa(per_cpu(vmxarea, cpu));
525         uint64_t old;
526
527         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
528         if ((old & 5) == 0)
529                 /* enable and lock */
530                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5);
531         write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
532         asm volatile ("vmxon %0" : : "m"(phys_addr) : "memory", "cc");
533 }
534
535 static void litevm_disable(void *garbage)
536 {
537         asm volatile ("vmxoff" : : : "cc");
538 }
539
540 static int litevm_dev_open(struct inode *inode, struct file *filp)
541 {
542         struct litevm *litevm = kzalloc(sizeof(struct litevm), KMALLOC_WAIT);
543         int i;
544
545         if (!litevm)
546                 return -ENOMEM;
547
548         spin_lock_init(&litevm->lock);
549         INIT_LIST_HEAD(&litevm->active_mmu_pages);
550         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
551                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
552
553                 mutex_init(&vcpu->mutex);
554                 vcpu->mmu.root_hpa = INVALID_PAGE;
555                 INIT_LIST_HEAD(&vcpu->free_pages);
556         }
557         filp->private_data = litevm;
558         return 0;
559 }
560
561 /*
562  * Free any memory in @free but not in @dont.
563  */
564 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
565                                   struct litevm_memory_slot *dont)
566 {
567         int i;
568
569         if (!dont || free->phys_mem != dont->phys_mem)
570                 if (free->phys_mem) {
571                         for (i = 0; i < free->npages; ++i)
572                                 __free_page(free->phys_mem[i]);
573                         vfree(free->phys_mem);
574                 }
575
576         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
577                 vfree(free->dirty_bitmap);
578
579         free->phys_mem = 0;
580         free->npages = 0;
581         free->dirty_bitmap = 0;
582 }
583
584 static void litevm_free_physmem(struct litevm *litevm)
585 {
586         int i;
587
588         for (i = 0; i < litevm->nmemslots; ++i)
589                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
590 }
591
592 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
593 {
594         if (vcpu->vmcs) {
595                 on_each_cpu(__vcpu_clear, vcpu, 1);
596                 free_vmcs(vcpu->vmcs);
597                 vcpu->vmcs = 0;
598         }
599 }
600
601 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
602 {
603         litevm_free_vmcs(vcpu);
604         litevm_mmu_destroy(vcpu);
605 }
606
607 static void litevm_free_vcpus(struct litevm *litevm)
608 {
609         unsigned int i;
610
611         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
612                 litevm_free_vcpu(&litevm->vcpus[i]);
613 }
614
615 static int litevm_dev_release(struct inode *inode, struct file *filp)
616 {
617         struct litevm *litevm = filp->private_data;
618
619         litevm_free_vcpus(litevm);
620         litevm_free_physmem(litevm);
621         kfree(litevm);
622         return 0;
623 }
624
625 unsigned long vmcs_readl(unsigned long field)
626 {
627         unsigned long value;
628
629         asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
630         return value;
631 }
632
633 void vmcs_writel(unsigned long field, unsigned long value)
634 {
635         uint8_t error;
636
637         asm volatile ("vmwrite %1, %2; setna %0"
638                        : "=g"(error) : "r"(value), "r"(field) : "cc" );
639         if (error)
640                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
641                        field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
642 }
643
644 static void vmcs_write16(unsigned long field, uint16_t value)
645 {
646         vmcs_writel(field, value);
647 }
648
649 static void vmcs_write64(unsigned long field, uint64_t value)
650 {
651 #ifdef __x86_64__
652         vmcs_writel(field, value);
653 #else
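        /*
         * 32-bit host: a 64-bit VMCS field is written as two 32-bit halves;
         * the high half lives at encoding field + 1.
         */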
654         vmcs_writel(field, value);
655         asm volatile ("");
656         vmcs_writel(field+1, value >> 32);
657 #endif
658 }
659
660 static void inject_gp(struct litevm_vcpu *vcpu)
661 {
662         printd("inject_general_protection: rip 0x%lx\n",
663                vmcs_readl(GUEST_RIP));
664         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
665         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
666                      GP_VECTOR |
667                      INTR_TYPE_EXCEPTION |
668                      INTR_INFO_DELIEVER_CODE_MASK |
669                      INTR_INFO_VALID_MASK);
670 }
671
672 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
673 {
674         if (vcpu->rmode.active)
675                 vmcs_write32(EXCEPTION_BITMAP, ~0);
676         else
677                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
678 }
679
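/*
 * Leave emulated real mode: restore the TR state saved by enter_rmode(),
 * clear VM and IOPL in RFLAGS (restoring the saved IOPL), and reload
 * zero-based data segments with AR byte 0x93 (present, writable data).
 */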
680 static void enter_pmode(struct litevm_vcpu *vcpu)
681 {
682         unsigned long flags;
683
684         vcpu->rmode.active = 0;
685
686         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
687         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
688         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
689
690         flags = vmcs_readl(GUEST_RFLAGS);
691         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
692         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
693         vmcs_writel(GUEST_RFLAGS, flags);
694
695         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
696                         (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK) );
697
698         update_exception_bitmap(vcpu);
699
700         #define FIX_PMODE_DATASEG(seg, save) {                          \
701                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
702                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
703                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
704                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
705         }
706
707         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
708         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
709         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
710         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
711         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
712
713         vmcs_write16(GUEST_CS_SELECTOR,
714                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
715         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
716 }
717
718 static int rmode_tss_base(struct litevm* litevm)
719 {
720         gfn_t base_gfn = litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
721         return base_gfn << PAGE_SHIFT;
722 }
723
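/*
 * Enter emulated real mode (vm86): stash the protected-mode TR, point the
 * guest TR at the fake TSS in the top pages of memory slot 0, set VM and
 * IOPL in RFLAGS, and derive real-mode selectors from the current segment
 * bases (selector = base >> 4).
 */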
724 static void enter_rmode(struct litevm_vcpu *vcpu)
725 {
726         unsigned long flags;
727
728         vcpu->rmode.active = 1;
729
730         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
731         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
732
733         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
734         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
735
736         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
737         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
738
739         flags = vmcs_readl(GUEST_RFLAGS);
740         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
741
742         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
743
744         vmcs_writel(GUEST_RFLAGS, flags);
745         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
746         update_exception_bitmap(vcpu);
747
748         #define FIX_RMODE_SEG(seg, save) {                                 \
749                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
750                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
751                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
752                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
753         }
754
755         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
756         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
757
758         FIX_RMODE_SEG(ES, vcpu->rmode.es);
759         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
760         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
761         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
762         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
763 }
764
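/*
 * Populate the three-page fake TSS used for vm86: zero the pages, point the
 * I/O bitmap base (offset 0x66) past the interrupt redirection map, and
 * terminate the I/O bitmap with an all-ones byte.
 */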
765 static int init_rmode_tss(struct litevm* litevm)
766 {
767         struct page *p1, *p2, *p3;
768         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
769         char *page;
770
771         p1 = _gfn_to_page(litevm, fn++);
772         p2 = _gfn_to_page(litevm, fn++);
773         p3 = _gfn_to_page(litevm, fn);
774
775         if (!p1 || !p2 || !p3) {
776                 litevm_printf(litevm,"%s: gfn_to_page failed\n", __FUNCTION__);
777                 return 0;
778         }
779
780         page = kmap_atomic(p1);
781         memset(page, 0, PAGE_SIZE);
782         *(uint16_t*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
783         kunmap_atomic(page);
784
785         page = kmap_atomic(p2);
786         memset(page, 0, PAGE_SIZE);
787         kunmap_atomic(page);
788
789         page = kmap_atomic(p3);
790         memset(page, 0, PAGE_SIZE);
791         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
792         kunmap_atomic(page);
793
794         return 1;
795 }
796
797 #ifdef __x86_64__
798
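/*
 * Mirror the guest's EFER: when LMA is requested, set the VM-entry IA-32e
 * mode control so the next entry switches the guest into long mode;
 * otherwise clear it and strip LME from the value loaded on entry.
 */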
799 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
800 {
801         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
802
803         vcpu->shadow_efer = efer;
804         if (efer & EFER_LMA) {
805                 vmcs_write32(VM_ENTRY_CONTROLS,
806                                      vmcs_read32(VM_ENTRY_CONTROLS) |
807                                      VM_ENTRY_CONTROLS_IA32E_MASK);
808                 msr->data = efer;
809
810         } else {
811                 vmcs_write32(VM_ENTRY_CONTROLS,
812                                      vmcs_read32(VM_ENTRY_CONTROLS) &
813                                      ~VM_ENTRY_CONTROLS_IA32E_MASK);
814
815                 msr->data = efer & ~EFER_LME;
816         }
817 }
818
819 static void enter_lmode(struct litevm_vcpu *vcpu)
820 {
821         uint32_t guest_tr_ar;
822
823         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
824         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
825                 printd("%s: tss fixup for long mode. \n",
826                        __FUNCTION__);
827                 vmcs_write32(GUEST_TR_AR_BYTES,
828                              (guest_tr_ar & ~AR_TYPE_MASK)
829                              | AR_TYPE_BUSY_64_TSS);
830         }
831
832         vcpu->shadow_efer |= EFER_LMA;
833
834         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
835         vmcs_write32(VM_ENTRY_CONTROLS,
836                      vmcs_read32(VM_ENTRY_CONTROLS)
837                      | VM_ENTRY_CONTROLS_IA32E_MASK);
838 }
839
840 static void exit_lmode(struct litevm_vcpu *vcpu)
841 {
842         vcpu->shadow_efer &= ~EFER_LMA;
843
844         vmcs_write32(VM_ENTRY_CONTROLS,
845                      vmcs_read32(VM_ENTRY_CONTROLS)
846                      & ~VM_ENTRY_CONTROLS_IA32E_MASK);
847 }
848
849 #endif
850
851 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
852 {
853         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
854                 enter_pmode(vcpu);
855
856         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
857                 enter_rmode(vcpu);
858
859 #ifdef __x86_64__
860         if (vcpu->shadow_efer & EFER_LME) {
861                 if (!is_paging() && (cr0 & CR0_PG_MASK))
862                         enter_lmode(vcpu);
863                 if (is_paging() && !(cr0 & CR0_PG_MASK))
864                         exit_lmode(vcpu);
865         }
866 #endif
867
868         vmcs_writel(CR0_READ_SHADOW, cr0);
869         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
870 }
871
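/*
 * In PAE mode CR3 points at four page-directory-pointer entries; reject a
 * CR3 whose PDPTEs are present but have reserved bits set.
 */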
872 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
873                                          unsigned long cr3)
874 {
875         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
876         unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
877         int i;
878         uint64_t pdpte;
879         uint64_t *pdpt;
880         struct litevm_memory_slot *memslot;
881
882         spin_lock(&vcpu->litevm->lock);
883         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
884         /* FIXME: !memslot - emulate? 0xff? */
885         pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn));
886
887         for (i = 0; i < 4; ++i) {
888                 pdpte = pdpt[offset + i];
889                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
890                         break;
891         }
892
893         kunmap_atomic(pdpt);
894         spin_unlock(&vcpu->litevm->lock);
895
896         return i != 4;
897 }
898
899 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
900 {
901         if (cr0 & CR0_RESEVED_BITS) {
902                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
903                        cr0, guest_cr0());
904                 inject_gp(vcpu);
905                 return;
906         }
907
908         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
909                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
910                 inject_gp(vcpu);
911                 return;
912         }
913
914         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
915                 printd("set_cr0: #GP, PG flag set "
916                        "with PE flag clear\n");
917                 inject_gp(vcpu);
918                 return;
919         }
920
921         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
922 #ifdef __x86_64__
923                 if ((vcpu->shadow_efer & EFER_LME)) {
924                         uint32_t guest_cs_ar;
925                         if (!is_pae()) {
926                                 printd("set_cr0: #GP, start paging "
927                                        "in long mode while PAE is disabled\n");
928                                 inject_gp(vcpu);
929                                 return;
930                         }
931                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
932                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
933                                 printd("set_cr0: #GP, start paging "
934                                        "in long mode while CS.L == 1\n");
935                                 inject_gp(vcpu);
936                                 return;
937
938                         }
939                 } else
940 #endif
941                 if (is_pae() &&
942                             pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
943                         printd("set_cr0: #GP, pdptrs "
944                                "reserved bits\n");
945                         inject_gp(vcpu);
946                         return;
947                 }
948
949         }
950
951         __set_cr0(vcpu, cr0);
952         litevm_mmu_reset_context(vcpu);
953         return;
954 }
955
956 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
957 {
958         unsigned long cr0 = guest_cr0();
959
960         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
961                 enter_pmode(vcpu);
962                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
963
964         } else
965                 printd("lmsw: unexpected\n");
966
967         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
968                                 | (msw & LMSW_GUEST_MASK));
969 }
970
971 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
972 {
973         vmcs_writel(CR4_READ_SHADOW, cr4);
974         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
975                     LITEVM_RMODE_VM_CR4_ALWAYS_ON : LITEVM_PMODE_VM_CR4_ALWAYS_ON));
976 }
977
978 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
979 {
980         if (cr4 & CR4_RESEVED_BITS) {
981                 printd("set_cr4: #GP, reserved bits\n");
982                 inject_gp(vcpu);
983                 return;
984         }
985
986         if (is_long_mode()) {
987                 if (!(cr4 & CR4_PAE_MASK)) {
988                         printd("set_cr4: #GP, clearing PAE while "
989                                "in long mode\n");
990                         inject_gp(vcpu);
991                         return;
992                 }
993         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
994                    && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
995                 printd("set_cr4: #GP, pdptrs reserved bits\n");
996                 inject_gp(vcpu);
997         }
998
999         if (cr4 & CR4_VMXE_MASK) {
1000                 printd("set_cr4: #GP, setting VMXE\n");
1001                 inject_gp(vcpu);
1002                 return;
1003         }
1004         __set_cr4(vcpu, cr4);
1005         spin_lock(&vcpu->litevm->lock);
1006         litevm_mmu_reset_context(vcpu);
1007         spin_unlock(&vcpu->litevm->lock);
1008 }
1009
1010 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1011 {
1012         if (is_long_mode()) {
1013                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
1014                         printd("set_cr3: #GP, reserved bits\n");
1015                         inject_gp(vcpu);
1016                         return;
1017                 }
1018         } else {
1019                 if (cr3 & CR3_RESEVED_BITS) {
1020                         printd("set_cr3: #GP, reserved bits\n");
1021                         inject_gp(vcpu);
1022                         return;
1023                 }
1024                 if (is_paging() && is_pae() &&
1025                     pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1026                         printd("set_cr3: #GP, pdptrs "
1027                                "reserved bits\n");
1028                         inject_gp(vcpu);
1029                         return;
1030                 }
1031         }
1032
1033         vcpu->cr3 = cr3;
1034         spin_lock(&vcpu->litevm->lock);
1035         vcpu->mmu.new_cr3(vcpu);
1036         spin_unlock(&vcpu->litevm->lock);
1037 }
1038
1039 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1040 {
1041         if ( cr8 & CR8_RESEVED_BITS) {
1042                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1043                 inject_gp(vcpu);
1044                 return;
1045         }
1046         vcpu->cr8 = cr8;
1047 }
1048
1049 static uint32_t get_rdx_init_val(void)
1050 {
1051         uint32_t val;
1052
1053         asm ("movl $1, %%eax \n\t"
1054              "movl %%eax, %0 \n\t" : "=g"(val) );
1055         return val;
1056
1057 }
1058
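/*
 * Build a clean FXSAVE image for the guest: save the host FPU state, FINIT,
 * snapshot that pristine state as the guest image (forcing MXCSR to its
 * reset value 0x1f80), then restore the host state.
 */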
1059 static void fx_init(struct litevm_vcpu *vcpu)
1060 {
1061         struct __attribute__ ((__packed__)) fx_image_s {
1062                 uint16_t control; //fcw
1063                 uint16_t status; //fsw
1064                 uint16_t tag; // ftw
1065                 uint16_t opcode; //fop
1066                 uint64_t ip; // fpu ip
1067                 uint64_t operand;// fpu dp
1068                 uint32_t mxcsr;
1069                 uint32_t mxcsr_mask;
1070
1071         } *fx_image;
1072
1073         fx_save(vcpu->host_fx_image);
1074         fpu_init();
1075         fx_save(vcpu->guest_fx_image);
1076         fx_restore(vcpu->host_fx_image);
1077
1078         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1079         fx_image->mxcsr = 0x1f80;
1080         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1081                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1082 }
1083
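/*
 * The VMX capability MSRs report allowed control settings: the low dword
 * holds bits that must be 1, the high dword bits that may be 1.  Force the
 * former on and mask the requested value with the latter before writing.
 */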
1084 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field, uint32_t val)
1085 {
1086         uint32_t msr_high, msr_low;
1087
1088         rdmsr(msr, msr_low, msr_high);
1089
1090         val &= msr_high;
1091         val |= msr_low;
1092         vmcs_write32(vmcs_field, val);
1093 }
1094
1095 /*
1096  * Sets up the vmcs for emulated real mode.
1097  */
1098 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1099 {
1100         extern asmlinkage void litevm_vmx_return(void);
1101         uint32_t host_sysenter_cs;
1102         uint32_t junk;
1103         unsigned long a;
1104         struct descriptor_table dt;
1105         int i;
1106         int ret;
1107         uint64_t tsc;
1108         int nr_good_msrs;
1109
1110
1111         if (!init_rmode_tss(vcpu->litevm)) {
1112                 ret = 0;
1113                 goto out;
1114         }
1115
1116         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1117         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1118         vcpu->cr8 = 0;
1119         vcpu->apic_base = 0xfee00000 |
1120                         /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1121                         MSR_IA32_APICBASE_ENABLE;
1122
1123         fx_init(vcpu);
1124
1125 #define SEG_SETUP(seg) do {                                     \
1126                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1127                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1128                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1129                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1130         } while (0)
1131
1132         /*
1133          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1134          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1135          */
1136         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1137         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1138         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1139         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1140
1141         SEG_SETUP(DS);
1142         SEG_SETUP(ES);
1143         SEG_SETUP(FS);
1144         SEG_SETUP(GS);
1145         SEG_SETUP(SS);
1146
1147         vmcs_write16(GUEST_TR_SELECTOR, 0);
1148         vmcs_writel(GUEST_TR_BASE, 0);
1149         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1150         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1151
1152         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1153         vmcs_writel(GUEST_LDTR_BASE, 0);
1154         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1155         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1156
1157         vmcs_write32(GUEST_SYSENTER_CS, 0);
1158         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1159         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1160
1161         vmcs_writel(GUEST_RFLAGS, 0x02);
1162         vmcs_writel(GUEST_RIP, 0xfff0);
1163         vmcs_writel(GUEST_RSP, 0);
1164
1165         vmcs_writel(GUEST_CR3, 0);
1166
1167         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1168         vmcs_writel(GUEST_DR7, 0x400);
1169
1170         vmcs_writel(GUEST_GDTR_BASE, 0);
1171         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1172
1173         vmcs_writel(GUEST_IDTR_BASE, 0);
1174         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1175
1176         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1177         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1178         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1179
1180         /* I/O */
1181         vmcs_write64(IO_BITMAP_A, 0);
1182         vmcs_write64(IO_BITMAP_B, 0);
1183
1184         rdtscll(tsc);
1185         vmcs_write64(TSC_OFFSET, -tsc);
1186
1187         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1188
1189         /* Special registers */
1190         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1191
1192         /* Control */
1193         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
1194                                PIN_BASED_VM_EXEC_CONTROL,
1195                                PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
1196                                | PIN_BASED_NMI_EXITING   /* 20.6.1 */
1197                         );
1198         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
1199                                CPU_BASED_VM_EXEC_CONTROL,
1200                                CPU_BASED_HLT_EXITING         /* 20.6.2 */
1201                                | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
1202                                | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
1203                                | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
1204                                | CPU_BASED_INVDPG_EXITING
1205                                | CPU_BASED_MOV_DR_EXITING
1206                                | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
1207                         );
1208
1209         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1210         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1211         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1212         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1213
1214         vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
1215         vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
1216         vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
1217
1218         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
1219         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1220         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1221         vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
1222         vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
1223         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1224 #ifdef __x86_64__
1225         rdmsrl(MSR_FS_BASE, a);
1226         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1227         rdmsrl(MSR_GS_BASE, a);
1228         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1229 #else
1230         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1231         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1232 #endif
1233
1234         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
1235
1236         get_idt(&dt);
1237         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1238
1239
1240         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */
1241
1242         rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1243         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1244         rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1245         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
1246         rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1247         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
1248
1249         ret = -ENOMEM;
1250         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1251         if (!vcpu->guest_msrs)
1252                 goto out;
1253         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1254         if (!vcpu->host_msrs)
1255                 goto out_free_guest_msrs;
1256
1257         for (i = 0; i < NR_VMX_MSR; ++i) {
1258                 uint32_t index = vmx_msr_index[i];
1259                 uint32_t data_low, data_high;
1260                 uint64_t data;
1261                 int j = vcpu->nmsrs;
1262
1263                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1264                         continue;
1265                 data = data_low | ((uint64_t)data_high << 32);
1266                 vcpu->host_msrs[j].index = index;
1267                 vcpu->host_msrs[j].reserved = 0;
1268                 vcpu->host_msrs[j].data = data;
1269                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1270                 ++vcpu->nmsrs;
1271         }
1272         printk("msrs: %d\n", vcpu->nmsrs);
1273
1274         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1275         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
1276                     virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
1277         vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1278                     virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
1279         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1280                     virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS));
1281         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
1282                                (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */
1283         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1284         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
1285         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1286
1287
1288         /* 22.2.1, 20.8.1 */
1289         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
1290                                VM_ENTRY_CONTROLS, 0);
1291         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1292
1293         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1294         vmcs_writel(TPR_THRESHOLD, 0);
1295
1296         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1297         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1298
1299         __set_cr0(vcpu, 0x60000010); // enter rmode
1300         __set_cr4(vcpu, 0);
1301 #ifdef __x86_64__
1302         __set_efer(vcpu, 0);
1303 #endif
1304
1305         ret = litevm_mmu_init(vcpu);
1306
1307         return ret;
1308
1309 out_free_guest_msrs:
1310         kfree(vcpu->guest_msrs);
1311 out:
1312         return ret;
1313 }
1314
1315 /*
1316  * Sync the rsp and rip registers into the vcpu structure.  This allows
1317  * registers to be accessed by indexing vcpu->regs.
1318  */
1319 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1320 {
1321         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1322         vcpu->rip = vmcs_readl(GUEST_RIP);
1323 }
1324
1325 /*
1326  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1327  * modification.
1328  */
1329 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1330 {
1331         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1332         vmcs_writel(GUEST_RIP, vcpu->rip);
1333 }
1334
1335 /*
1336  * Creates some virtual cpus.  Good luck creating more than one.
1337  */
1338 static int litevm_dev_ioctl_create_vcpu(struct litevm *litevm, int n)
1339 {
1340         int r;
1341         struct litevm_vcpu *vcpu;
1342         struct vmcs *vmcs;
1343
1344         r = -EINVAL;
1345         if (n < 0 || n >= LITEVM_MAX_VCPUS)
1346                 goto out;
1347
1348         vcpu = &litevm->vcpus[n];
1349
1350         mutex_lock(&vcpu->mutex);
1351
1352         if (vcpu->vmcs) {
1353                 mutex_unlock(&vcpu->mutex);
1354                 return -EEXIST;
1355         }
1356
1357         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
1358                                            FX_IMAGE_ALIGN);
1359         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1360
1361         vcpu->cpu = -1;  /* First load will set up TR */
1362         vcpu->litevm = litevm;
1363         vmcs = alloc_vmcs();
1364         if (!vmcs) {
1365                 mutex_unlock(&vcpu->mutex);
1366                 goto out_free_vcpus;
1367         }
1368         vmcs_clear(vmcs);
1369         vcpu->vmcs = vmcs;
1370         vcpu->launched = 0;
1371
1372         __vcpu_load(vcpu);
1373
1374         r = litevm_vcpu_setup(vcpu);
1375
1376         vcpu_put(vcpu);
1377
1378         if (r < 0)
1379                 goto out_free_vcpus;
1380
1381         return 0;
1382
1383 out_free_vcpus:
1384         litevm_free_vcpu(vcpu);
1385 out:
1386         return r;
1387 }
1388
1389 /*
1390  * Allocate some memory and give it an address in the guest physical address
1391  * space.
1392  *
1393  * Discontiguous memory is allowed, mostly for framebuffers.
1394  */
1395 static int litevm_dev_ioctl_set_memory_region(struct litevm *litevm,
1396                                            struct litevm_memory_region *mem)
1397 {
1398         int r;
1399         gfn_t base_gfn;
1400         unsigned long npages;
1401         unsigned long i;
1402         struct litevm_memory_slot *memslot;
1403         struct litevm_memory_slot old, new;
1404         int memory_config_version;
1405
1406         r = -EINVAL;
1407         /* General sanity checks */
1408         if (mem->memory_size & (PAGE_SIZE - 1))
1409                 goto out;
1410         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1411                 goto out;
1412         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1413                 goto out;
1414         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1415                 goto out;
1416
1417         memslot = &litevm->memslots[mem->slot];
1418         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1419         npages = mem->memory_size >> PAGE_SHIFT;
1420
1421         if (!npages)
1422                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1423
1424 raced:
1425         spin_lock(&litevm->lock);
1426
1427         memory_config_version = litevm->memory_config_version;
1428         new = old = *memslot;
1429
1430         new.base_gfn = base_gfn;
1431         new.npages = npages;
1432         new.flags = mem->flags;
1433
1434         /* Disallow changing a memory slot's size. */
1435         r = -EINVAL;
1436         if (npages && old.npages && npages != old.npages)
1437                 goto out_unlock;
1438
1439         /* Check for overlaps */
1440         r = -EEXIST;
1441         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1442                 struct litevm_memory_slot *s = &litevm->memslots[i];
1443
1444                 if (s == memslot)
1445                         continue;
1446                 if (!((base_gfn + npages <= s->base_gfn) ||
1447                       (base_gfn >= s->base_gfn + s->npages)))
1448                         goto out_unlock;
1449         }
1450         /*
1451          * Do memory allocations outside lock.  memory_config_version will
1452          * detect any races.
1453          */
1454         spin_unlock(&litevm->lock);
1455
1456         /* Deallocate if slot is being removed */
1457         if (!npages)
1458                 new.phys_mem = 0;
1459
1460         /* Free page dirty bitmap if unneeded */
1461         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1462                 new.dirty_bitmap = 0;
1463
1464         r = -ENOMEM;
1465
1466         /* Allocate if a slot is being created */
1467         if (npages && !new.phys_mem) {
1468                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
1469
1470                 if (!new.phys_mem)
1471                         goto out_free;
1472
1473                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
1474                 for (i = 0; i < npages; ++i) {
1475                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER);
1476                         if (!new.phys_mem[i])
1477                                 goto out_free;
1478                 }
1479         }
1480
1481         /* Allocate page dirty bitmap if needed */
1482         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1483                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
1484
1485                 new.dirty_bitmap = vmalloc(dirty_bytes);
1486                 if (!new.dirty_bitmap)
1487                         goto out_free;
1488                 memset(new.dirty_bitmap, 0, dirty_bytes);
1489         }
1490
1491         spin_lock(&litevm->lock);
1492
1493         if (memory_config_version != litevm->memory_config_version) {
1494                 spin_unlock(&litevm->lock);
1495                 litevm_free_physmem_slot(&new, &old);
1496                 goto raced;
1497         }
1498
1499         r = -EAGAIN;
1500         if (litevm->busy)
1501                 goto out_unlock;
1502
1503         if (mem->slot >= litevm->nmemslots)
1504                 litevm->nmemslots = mem->slot + 1;
1505
1506         *memslot = new;
1507         ++litevm->memory_config_version;
1508
1509         spin_unlock(&litevm->lock);
1510
1511         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1512                 struct litevm_vcpu *vcpu;
1513
1514                 vcpu = vcpu_load(litevm, i);
1515                 if (!vcpu)
1516                         continue;
1517                 litevm_mmu_reset_context(vcpu);
1518                 vcpu_put(vcpu);
1519         }
1520
1521         litevm_free_physmem_slot(&old, &new);
1522         return 0;
1523
1524 out_unlock:
1525         spin_unlock(&litevm->lock);
1526 out_free:
1527         litevm_free_physmem_slot(&new, &old);
1528 out:
1529         return r;
1530 }
1531
1532 /*
1533  * Get (and clear) the dirty memory log for a memory slot.
1534  */
1535 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1536                                        struct litevm_dirty_log *log)
1537 {
1538         struct litevm_memory_slot *memslot;
1539         int r, i;
1540         int n;
1541         unsigned long any = 0;
1542
1543         spin_lock(&litevm->lock);
1544
1545         /*
1546          * Prevent changes to guest memory configuration even while the lock
1547          * is not taken.
1548          */
1549         ++litevm->busy;
1550         spin_unlock(&litevm->lock);
1551         r = -EINVAL;
1552         if (log->slot >= LITEVM_MEMORY_SLOTS)
1553                 goto out;
1554
1555         memslot = &litevm->memslots[log->slot];
1556         r = -ENOENT;
1557         if (!memslot->dirty_bitmap)
1558                 goto out;
1559
1560         n = ALIGN(memslot->npages, 8) / 8;
1561
1562         for (i = 0; !any && i < n; ++i)
1563                 any = memslot->dirty_bitmap[i];
1564
1565         r = -EFAULT;
1566         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1567                 goto out;
1568
1569
1570         if (any) {
1571                 spin_lock(&litevm->lock);
1572                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1573                 spin_unlock(&litevm->lock);
1574                 memset(memslot->dirty_bitmap, 0, n);
1575                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1576                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1577
1578                         if (!vcpu)
1579                                 continue;
1580                         flush_guest_tlb(vcpu);
1581                         vcpu_put(vcpu);
1582                 }
1583         }
1584
1585         r = 0;
1586
1587 out:
1588         spin_lock(&litevm->lock);
1589         --litevm->busy;
1590         spin_unlock(&litevm->lock);
1591         return r;
1592 }
1593
1594 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1595 {
1596         int i;
1597
1598         for (i = 0; i < litevm->nmemslots; ++i) {
1599                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1600
1601                 if (gfn >= memslot->base_gfn
1602                     && gfn < memslot->base_gfn + memslot->npages)
1603                         return memslot;
1604         }
1605         return 0;
1606 }
1607
1608 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1609 {
1610         int i;
1611         struct litevm_memory_slot *memslot = 0;
1612         unsigned long rel_gfn;
1613
1614         for (i = 0; i < litevm->nmemslots; ++i) {
1615                 memslot = &litevm->memslots[i];
1616
1617                 if (gfn >= memslot->base_gfn
1618                     && gfn < memslot->base_gfn + memslot->npages) {
1619
1620                         if (!memslot || !memslot->dirty_bitmap)
1621                                 return;
1622
1623                         rel_gfn = gfn - memslot->base_gfn;
1624
1625                         /* avoid RMW */
1626                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1627                                 set_bit(rel_gfn, memslot->dirty_bitmap);
1628                         return;
1629                 }
1630         }
1631 }
1632
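/*
 * Advance the guest RIP past the instruction we just emulated and clear any
 * STI/MOV-SS interruptibility shadow (the low two bits of the
 * interruptibility field), since that instruction has now completed.
 */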
1633 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1634 {
1635         unsigned long rip;
1636         uint32_t interruptibility;
1637
1638         rip = vmcs_readl(GUEST_RIP);
1639         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1640         vmcs_writel(GUEST_RIP, rip);
1641
1642         /*
1643          * We emulated an instruction, so temporary interrupt blocking
1644          * should be removed, if set.
1645          */
1646         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1647         if (interruptibility & 3)
1648                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
1649                              interruptibility & ~3);
1650 }
1651
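/*
 * Callbacks for the x86 instruction emulator.  emulator_read_std() pulls
 * ordinary guest memory through the MMU's gva_to_gpa(); the *_emulated
 * variants record unhandled accesses in the vcpu's mmio_* fields so the
 * MMIO exit can be completed later, and cmpxchg is (noisily) degraded to a
 * plain write.
 */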
1652 static int emulator_read_std(unsigned long addr,
1653                              unsigned long *val,
1654                              unsigned int bytes,
1655                              struct x86_emulate_ctxt *ctxt)
1656 {
1657         struct litevm_vcpu *vcpu = ctxt->vcpu;
1658         void *data = val;
1659
1660         while (bytes) {
1661                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1662                 unsigned offset = addr & (PAGE_SIZE-1);
1663                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1664                 unsigned long pfn;
1665                 struct litevm_memory_slot *memslot;
1666                 void *page;
1667
1668                 if (gpa == UNMAPPED_GVA)
1669                         return X86EMUL_PROPAGATE_FAULT;
1670                 pfn = gpa >> PAGE_SHIFT;
1671                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1672                 if (!memslot)
1673                         return X86EMUL_UNHANDLEABLE;
1674                 page = kmap_atomic(gfn_to_page(memslot, pfn));
1675
1676                 memcpy(data, page + offset, tocopy);
1677
1678                 kunmap_atomic(page);
1679
1680                 bytes -= tocopy;
1681                 data += tocopy;
1682                 addr += tocopy;
1683         }
1684
1685         return X86EMUL_CONTINUE;
1686 }
1687
1688 static int emulator_write_std(unsigned long addr,
1689                               unsigned long val,
1690                               unsigned int bytes,
1691                               struct x86_emulate_ctxt *ctxt)
1692 {
1693         printk("emulator_write_std: addr %lx n %d\n",
1694                addr, bytes);
1695         return X86EMUL_UNHANDLEABLE;
1696 }
1697
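/*
 * Emulator read callback: satisfy the read from a completed MMIO reply or
 * from ordinary guest memory, otherwise queue an MMIO request for userspace.
 */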
1698 static int emulator_read_emulated(unsigned long addr,
1699                                   unsigned long *val,
1700                                   unsigned int bytes,
1701                                   struct x86_emulate_ctxt *ctxt)
1702 {
1703         struct litevm_vcpu *vcpu = ctxt->vcpu;
1704
1705         if (vcpu->mmio_read_completed) {
1706                 memcpy(val, vcpu->mmio_data, bytes);
1707                 vcpu->mmio_read_completed = 0;
1708                 return X86EMUL_CONTINUE;
1709         } else if (emulator_read_std(addr, val, bytes, ctxt)
1710                    == X86EMUL_CONTINUE)
1711                 return X86EMUL_CONTINUE;
1712         else {
1713                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1714                 if (gpa == UNMAPPED_GVA)
1715                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
1716                 vcpu->mmio_needed = 1;
1717                 vcpu->mmio_phys_addr = gpa;
1718                 vcpu->mmio_size = bytes;
1719                 vcpu->mmio_is_write = 0;
1720
1721                 return X86EMUL_UNHANDLEABLE;
1722         }
1723 }
1724
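/*
 * Emulator write callback: every write is turned into an MMIO request for
 * userspace to complete.
 */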
1725 static int emulator_write_emulated(unsigned long addr,
1726                                    unsigned long val,
1727                                    unsigned int bytes,
1728                                    struct x86_emulate_ctxt *ctxt)
1729 {
1730         struct litevm_vcpu *vcpu = ctxt->vcpu;
1731         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1732
1733         if (gpa == UNMAPPED_GVA)
1734                 return X86EMUL_PROPAGATE_FAULT;
1735
1736         vcpu->mmio_needed = 1;
1737         vcpu->mmio_phys_addr = gpa;
1738         vcpu->mmio_size = bytes;
1739         vcpu->mmio_is_write = 1;
1740         memcpy(vcpu->mmio_data, &val, bytes);
1741
1742         return X86EMUL_CONTINUE;
1743 }
1744
1745 static int emulator_cmpxchg_emulated(unsigned long addr,
1746                                      unsigned long old,
1747                                      unsigned long new,
1748                                      unsigned int bytes,
1749                                      struct x86_emulate_ctxt *ctxt)
1750 {
1751         static int reported;
1752
1753         if (!reported) {
1754                 reported = 1;
1755                 printk(KERN_WARNING "litevm: emulating exchange as write\n");
1756         }
1757         return emulator_write_emulated(addr, new, bytes, ctxt);
1758 }
1759
1760 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1761 {
1762         static int reported;
1763         uint8_t opcodes[4];
1764         unsigned long rip = vmcs_readl(GUEST_RIP);
1765         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
1766
1767         if (reported)
1768                 return;
1769
1770         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1771
1772         printk("emulation failed but !mmio_needed?"
1773                " rip %lx %02x %02x %02x %02x\n",
1774                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1775         reported = 1;
1776 }
1777
1778 struct x86_emulate_ops emulate_ops = {
1779         .read_std            = emulator_read_std,
1780         .write_std           = emulator_write_std,
1781         .read_emulated       = emulator_read_emulated,
1782         .write_emulated      = emulator_write_emulated,
1783         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1784 };
1785
1786 enum emulation_result {
1787         EMULATE_DONE,       /* no further processing */
1788         EMULATE_DO_MMIO,      /* litevm_run filled with mmio request */
1789         EMULATE_FAIL,         /* can't emulate this instruction */
1790 };
1791
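/*
 * Run the instruction emulator on the instruction at the current guest RIP.
 * When EMULATE_DO_MMIO is returned, the litevm_run mmio fields describe the
 * pending access.
 */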
1792 static int emulate_instruction(struct litevm_vcpu *vcpu,
1793                                struct litevm_run *run,
1794                                unsigned long cr2,
1795                                uint16_t error_code)
1796 {
1797         struct x86_emulate_ctxt emulate_ctxt;
1798         int r;
1799         uint32_t cs_ar;
1800
1801         vcpu_load_rsp_rip(vcpu);
1802
1803         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1804
1805         emulate_ctxt.vcpu = vcpu;
1806         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
1807         emulate_ctxt.cr2 = cr2;
1808         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1809                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
1810                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
1811                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1812
1813         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1814                 emulate_ctxt.cs_base = 0;
1815                 emulate_ctxt.ds_base = 0;
1816                 emulate_ctxt.es_base = 0;
1817                 emulate_ctxt.ss_base = 0;
1818                 emulate_ctxt.gs_base = 0;
1819                 emulate_ctxt.fs_base = 0;
1820         } else {
1821                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
1822                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
1823                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
1824                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
1825                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
1826                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
1827         }
1828
1829         vcpu->mmio_is_write = 0;
1830         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1831
1832         if ((r || vcpu->mmio_is_write) && run) {
1833                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1834                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1835                 run->mmio.len = vcpu->mmio_size;
1836                 run->mmio.is_write = vcpu->mmio_is_write;
1837         }
1838
1839         if (r) {
1840                 if (!vcpu->mmio_needed) {
1841                         report_emulation_failure(&emulate_ctxt);
1842                         return EMULATE_FAIL;
1843                 }
1844                 return EMULATE_DO_MMIO;
1845         }
1846
1847         vcpu_put_rsp_rip(vcpu);
1848         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
1849
1850         if (vcpu->mmio_is_write)
1851                 return EMULATE_DO_MMIO;
1852
1853         return EMULATE_DONE;
1854 }
1855
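/*
 * Replace the low 32 bits of a control register value, preserving the upper
 * half, e.g. mk_cr_64(0x1234567880050033ULL, 0x80050031) yields
 * 0x1234567880050031ULL.
 */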
1856 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
1857 {
1858         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1859 }
1860
1861 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
1862 {
1863         vmcs_writel(GUEST_GDTR_BASE, base);
1864         vmcs_write32(GUEST_GDTR_LIMIT, limit);
1865 }
1866
1867 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
1868 {
1869         vmcs_writel(GUEST_IDTR_BASE, base);
1870         vmcs_write32(GUEST_IDTR_LIMIT, limit);
1871 }
1872
1873 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
1874                    unsigned long *rflags)
1875 {
1876         lmsw(vcpu, msw);
1877         *rflags = vmcs_readl(GUEST_RFLAGS);
1878 }
1879
1880 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
1881 {
1882         switch (cr) {
1883         case 0:
1884                 return guest_cr0();
1885         case 2:
1886                 return vcpu->cr2;
1887         case 3:
1888                 return vcpu->cr3;
1889         case 4:
1890                 return guest_cr4();
1891         default:
1892                 vcpu_printf(vcpu, "%s: unexpected cr %d\n", __FUNCTION__, cr);
1893                 return 0;
1894         }
1895 }
1896
1897 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
1898                      unsigned long *rflags)
1899 {
1900         switch (cr) {
1901         case 0:
1902                 set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
1903                 *rflags = vmcs_readl(GUEST_RFLAGS);
1904                 break;
1905         case 2:
1906                 vcpu->cr2 = val;
1907                 break;
1908         case 3:
1909                 set_cr3(vcpu, val);
1910                 break;
1911         case 4:
1912                 set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
1913                 break;
1914         default:
1915                 vcpu_printf(vcpu, "%s: unexpected cr %d\n", __FUNCTION__, cr);
1916         }
1917 }
1918
1919 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
1920                                   int vec, uint32_t err_code)
1921 {
1922         if (!vcpu->rmode.active)
1923                 return 0;
1924
1925         if (vec == GP_VECTOR && err_code == 0)
1926                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
1927                         return 1;
1928         return 0;
1929 }
1930
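/*
 * Exit handler for exceptions and NMIs: re-queue external interrupts that
 * were being delivered, service guest page faults through the MMU (falling
 * back to the emulator), and report anything else to userspace.
 */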
1931 static int handle_exception(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
1932 {
1933         uint32_t intr_info, error_code;
1934         unsigned long cr2, rip;
1935         uint32_t vect_info;
1936         enum emulation_result er;
1937
1938         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1939         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1940
1941         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1942                                                 !is_page_fault(intr_info)) {
1943                 printk("%s: unexpected, vectoring info 0x%x "
1944                        "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1945         }
1946
1947         if (is_external_interrupt(vect_info)) {
1948                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1949                 set_bit(irq, vcpu->irq_pending);
1950                 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
1951         }
1952
1953         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
1954                 asm ("int $2");
1955                 return 1;
1956         }
1957         error_code = 0;
1958         rip = vmcs_readl(GUEST_RIP);
1959         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1960                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1961         if (is_page_fault(intr_info)) {
1962                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1963
1964                 spin_lock(&vcpu->litevm->lock);
1965                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
1966                         spin_unlock(&vcpu->litevm->lock);
1967                         return 1;
1968                 }
1969
1970                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
1971                 spin_unlock(&vcpu->litevm->lock);
1972
1973                 switch (er) {
1974                 case EMULATE_DONE:
1975                         return 1;
1976                 case EMULATE_DO_MMIO:
1977                         ++litevm_stat.mmio_exits;
1978                         litevm_run->exit_reason = LITEVM_EXIT_MMIO;
1979                         return 0;
1980                 case EMULATE_FAIL:
1981                         vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
1982                         break;
1983                 default:
1984                         BUG();
1985                 }
1986         }
1987
1988         if (vcpu->rmode.active &&
1989             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1990                                                                 error_code))
1991                 return 1;
1992
1993         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1994                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
1995                 return 0;
1996         }
1997         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
1998         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1999         litevm_run->ex.error_code = error_code;
2000         return 0;
2001 }
2002
2003 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2004                                      struct litevm_run *litevm_run)
2005 {
2006         ++litevm_stat.irq_exits;
2007         return 1;
2008 }
2009
2010
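/*
 * For a string I/O instruction, derive the repeat count from RCX, masked to
 * the effective address size found by scanning the instruction's prefixes.
 * Returns 1 and fills *count on success, 0 if the bytes could not be decoded.
 */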
2011 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t *count)
2012 {
2013         uint64_t inst;
2014         gva_t rip;
2015         int countr_size;
2016         int i, n;
2017
2018         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2019                 countr_size = 2;
2020         } else {
2021                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2022
2023                 countr_size = (cs_ar & AR_L_MASK) ? 8:
2024                               (cs_ar & AR_DB_MASK) ? 4: 2;
2025         }
2026
2027         rip =  vmcs_readl(GUEST_RIP);
2028         if (countr_size != 8)
2029                 rip += vmcs_readl(GUEST_CS_BASE);
2030
2031         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2032
2033         for (i = 0; i < n; i++) {
2034                 switch (((uint8_t*)&inst)[i]) {
2035                 case 0xf0:
2036                 case 0xf2:
2037                 case 0xf3:
2038                 case 0x2e:
2039                 case 0x36:
2040                 case 0x3e:
2041                 case 0x26:
2042                 case 0x64:
2043                 case 0x65:
2044                 case 0x66:
2045                         break;
2046                 case 0x67:
2047                         countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
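                        /* address-size prefix: adjust the count size, then fall through */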
2048                 default:
2049                         goto done;
2050                 }
2051         }
2052         return 0;
2053 done:
2054         countr_size *= 8;
2055         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2056         return 1;
2057 }
2058
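/*
 * Exit handler for IN/OUT: decode the exit qualification into a litevm_run
 * I/O request and hand it to userspace.
 */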
2059 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2060 {
2061         uint64_t exit_qualification;
2062
2063         ++litevm_stat.io_exits;
2064         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2065         litevm_run->exit_reason = LITEVM_EXIT_IO;
2066         if (exit_qualification & 8)
2067                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2068         else
2069                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2070         litevm_run->io.size = (exit_qualification & 7) + 1;
2071         litevm_run->io.string = (exit_qualification & 16) != 0;
2072         litevm_run->io.string_down
2073                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2074         litevm_run->io.rep = (exit_qualification & 32) != 0;
2075         litevm_run->io.port = exit_qualification >> 16;
2076         if (litevm_run->io.string) {
2077                 if (!get_io_count(vcpu, &litevm_run->io.count))
2078                         return 1;
2079                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2080         } else
2081                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
2082         return 0;
2083 }
2084
2085 static int handle_invlpg(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2086 {
2087         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2088         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2089         spin_lock(&vcpu->litevm->lock);
2090         vcpu->mmu.inval_page(vcpu, address);
2091         spin_unlock(&vcpu->litevm->lock);
2092         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2093         return 1;
2094 }
2095
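/*
 * Exit handler for control-register accesses that VT leaves to us: mov to
 * CR0/CR3/CR4/CR8, mov from CR3/CR8, and lmsw.
 */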
2096 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2097 {
2098         uint64_t exit_qualification;
2099         int cr;
2100         int reg;
2101
2102 #ifdef LITEVM_DEBUG
2103         if (guest_cpl() != 0) {
2104                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2105                 inject_gp(vcpu);
2106                 return 1;
2107         }
2108 #endif
2109
2110         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2111         cr = exit_qualification & 15;
2112         reg = (exit_qualification >> 8) & 15;
2113         switch ((exit_qualification >> 4) & 3) {
2114         case 0: /* mov to cr */
2115                 switch (cr) {
2116                 case 0:
2117                         vcpu_load_rsp_rip(vcpu);
2118                         set_cr0(vcpu, vcpu->regs[reg]);
2119                         skip_emulated_instruction(vcpu);
2120                         return 1;
2121                 case 3:
2122                         vcpu_load_rsp_rip(vcpu);
2123                         set_cr3(vcpu, vcpu->regs[reg]);
2124                         skip_emulated_instruction(vcpu);
2125                         return 1;
2126                 case 4:
2127                         vcpu_load_rsp_rip(vcpu);
2128                         set_cr4(vcpu, vcpu->regs[reg]);
2129                         skip_emulated_instruction(vcpu);
2130                         return 1;
2131                 case 8:
2132                         vcpu_load_rsp_rip(vcpu);
2133                         set_cr8(vcpu, vcpu->regs[reg]);
2134                         skip_emulated_instruction(vcpu);
2135                         return 1;
2136                 }
2137                 break;
2138         case 1: /*mov from cr*/
2139                 switch (cr) {
2140                 case 3:
2141                         vcpu_load_rsp_rip(vcpu);
2142                         vcpu->regs[reg] = vcpu->cr3;
2143                         vcpu_put_rsp_rip(vcpu);
2144                         skip_emulated_instruction(vcpu);
2145                         return 1;
2146                 case 8:
2147                         printd("handle_cr: read CR8 "
2148                                "cpu erratum AA15\n");
2149                         vcpu_load_rsp_rip(vcpu);
2150                         vcpu->regs[reg] = vcpu->cr8;
2151                         vcpu_put_rsp_rip(vcpu);
2152                         skip_emulated_instruction(vcpu);
2153                         return 1;
2154                 }
2155                 break;
2156         case 3: /* lmsw */
2157                 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2158
2159                 skip_emulated_instruction(vcpu);
2160                 return 1;
2161         default:
2162                 break;
2163         }
2164         litevm_run->exit_reason = 0;
2165         printk("litevm: unhandled control register: op %d cr %d\n",
2166                (int)(exit_qualification >> 4) & 3, cr);
2167         return 0;
2168 }
2169
2170 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2171 {
2172         uint64_t exit_qualification;
2173         unsigned long val;
2174         int dr, reg;
2175
2176         /*
2177          * FIXME: this code assumes the host is debugging the guest.
2178          *        need to deal with guest debugging itself too.
2179          */
2180         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2181         dr = exit_qualification & 7;
2182         reg = (exit_qualification >> 8) & 15;
2183         vcpu_load_rsp_rip(vcpu);
2184         if (exit_qualification & 16) {
2185                 /* mov from dr */
2186                 switch (dr) {
2187                 case 6:
2188                         val = 0xffff0ff0;
2189                         break;
2190                 case 7:
2191                         val = 0x400;
2192                         break;
2193                 default:
2194                         val = 0;
2195                 }
2196                 vcpu->regs[reg] = val;
2197         } else {
2198                 /* mov to dr */
2199         }
2200         vcpu_put_rsp_rip(vcpu);
2201         skip_emulated_instruction(vcpu);
2202         return 1;
2203 }
2204
2205 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2206 {
2207         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2208         return 0;
2209 }
2210
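/*
 * Exit handler for rdmsr: satisfy the read from the VMCS or the vcpu's
 * shadow MSR array; unknown MSRs inject #GP into the guest.
 */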
2211 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2212 {
2213         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2214         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2215         uint64_t data;
2216
2217 #ifdef LITEVM_DEBUG
2218         if (guest_cpl() != 0) {
2219                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2220                 inject_gp(vcpu);
2221                 return 1;
2222         }
2223 #endif
2224
2225         switch (ecx) {
2226 #ifdef __x86_64__
2227         case MSR_FS_BASE:
2228                 data = vmcs_readl(GUEST_FS_BASE);
2229                 break;
2230         case MSR_GS_BASE:
2231                 data = vmcs_readl(GUEST_GS_BASE);
2232                 break;
2233 #endif
2234         case MSR_IA32_SYSENTER_CS:
2235                 data = vmcs_read32(GUEST_SYSENTER_CS);
2236                 break;
2237         case MSR_IA32_SYSENTER_EIP:
2238                 data = vmcs_read32(GUEST_SYSENTER_EIP);
2239                 break;
2240         case MSR_IA32_SYSENTER_ESP:
2241                 data = vmcs_read32(GUEST_SYSENTER_ESP);
2242                 break;
2243         case MSR_IA32_MC0_CTL:
2244         case MSR_IA32_MCG_STATUS:
2245         case MSR_IA32_MCG_CAP:
2246         case MSR_IA32_MC0_MISC:
2247         case MSR_IA32_MC0_MISC+4:
2248         case MSR_IA32_MC0_MISC+8:
2249         case MSR_IA32_MC0_MISC+12:
2250         case MSR_IA32_MC0_MISC+16:
2251         case MSR_IA32_UCODE_REV:
2252                 /* MTRR registers */
2253         case 0xfe:
2254         case 0x200 ... 0x2ff:
2255                 data = 0;
2256                 break;
2257         case MSR_IA32_APICBASE:
2258                 data = vcpu->apic_base;
2259                 break;
2260         default:
2261                 if (msr) {
2262                         data = msr->data;
2263                         break;
2264                 }
2265                 printk("litevm: unhandled rdmsr: %x\n", ecx);
2266                 inject_gp(vcpu);
2267                 return 1;
2268         }
2269
2270         /* FIXME: handling of bits 32:63 of rax, rdx */
2271         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2272         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2273         skip_emulated_instruction(vcpu);
2274         return 1;
2275 }
2276
2277 #ifdef __x86_64__
2278
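/*
 * Validate and install a new guest EFER: reserved bits or toggling LME while
 * paging injects #GP; LMA is carried over from the current shadow value.
 */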
2279 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2280 {
2281         struct vmx_msr_entry *msr;
2282
2283         if (efer & EFER_RESERVED_BITS) {
2284                 printd("set_efer: 0x%llx #GP, reserved bits\n",
2285                        efer);
2286                 inject_gp(vcpu);
2287                 return;
2288         }
2289
2290         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2291                 printd("set_efer: #GP, change LME while paging\n");
2292                 inject_gp(vcpu);
2293                 return;
2294         }
2295
2296         efer &= ~EFER_LMA;
2297         efer |= vcpu->shadow_efer & EFER_LMA;
2298
2299         vcpu->shadow_efer = efer;
2300
2301         msr = find_msr_entry(vcpu, MSR_EFER);
2302
2303         if (!(efer & EFER_LMA))
2304             efer &= ~EFER_LME;
2305         msr->data = efer;
2306         skip_emulated_instruction(vcpu);
2307 }
2308
2309 #endif
2310
2311 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2312
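/*
 * Exit handler for wrmsr: mirror the write into the VMCS, the TSC offset or
 * the vcpu's shadow MSR array; unknown MSRs inject #GP into the guest.
 */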
2313 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2314 {
2315         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2316         struct vmx_msr_entry *msr;
2317         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2318                 | ((uint64_t)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2319
2320 #ifdef LITEVM_DEBUG
2321         if (guest_cpl() != 0) {
2322                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2323                 inject_gp(vcpu);
2324                 return 1;
2325         }
2326 #endif
2327
2328         switch (ecx) {
2329 #ifdef __x86_64__
2330         case MSR_FS_BASE:
2331                 vmcs_writel(GUEST_FS_BASE, data);
2332                 break;
2333         case MSR_GS_BASE:
2334                 vmcs_writel(GUEST_GS_BASE, data);
2335                 break;
2336 #endif
2337         case MSR_IA32_SYSENTER_CS:
2338                 vmcs_write32(GUEST_SYSENTER_CS, data);
2339                 break;
2340         case MSR_IA32_SYSENTER_EIP:
2341                 vmcs_write32(GUEST_SYSENTER_EIP, data);
2342                 break;
2343         case MSR_IA32_SYSENTER_ESP:
2344                 vmcs_write32(GUEST_SYSENTER_ESP, data);
2345                 break;
2346 #ifdef __x86_64__
2347         case MSR_EFER:
2348                 set_efer(vcpu, data);
2349                 return 1;
2350         case MSR_IA32_MC0_STATUS:
2351                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
2352                             , __FUNCTION__, data);
2353                 break;
2354 #endif
2355         case MSR_IA32_TIME_STAMP_COUNTER: {
2356                 uint64_t tsc;
2357
2358                 rdtscll(tsc);
2359                 vmcs_write64(TSC_OFFSET, data - tsc);
2360                 break;
2361         }
2362         case MSR_IA32_UCODE_REV:
2363         case MSR_IA32_UCODE_WRITE:
2364         case 0x200 ... 0x2ff: /* MTRRs */
2365                 break;
2366         case MSR_IA32_APICBASE:
2367                 vcpu->apic_base = data;
2368                 break;
2369         default:
2370                 msr = find_msr_entry(vcpu, ecx);
2371                 if (msr) {
2372                         msr->data = data;
2373                         break;
2374                 }
2375                 printk("litevm: unhandled wrmsr: %x\n", ecx);
2376                 inject_gp(vcpu);
2377                 return 1;
2378         }
2379         skip_emulated_instruction(vcpu);
2380         return 1;
2381 }
2382
2383 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2384                                    struct litevm_run *litevm_run)
2385 {
2386         /* Turn off interrupt window reporting. */
2387         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2388                      vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2389                      & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2390         return 1;
2391 }
2392
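/*
 * hlt: resume the guest immediately if an unmasked interrupt is already
 * pending, otherwise let userspace idle the vcpu.
 */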
2393 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2394 {
2395         skip_emulated_instruction(vcpu);
2396         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
2397                 return 1;
2398
2399         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2400         return 0;
2401 }
2402
2403 /*
2404  * The exit handlers return 1 if the exit was handled fully and guest execution
2405  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2406  * to be done to userspace and return 0.
2407  */
2408 static int (*litevm_vmx_exit_handlers[])(struct litevm_vcpu *vcpu,
2409                                       struct litevm_run *litevm_run) = {
2410         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
2411         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
2412         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
2413         [EXIT_REASON_INVLPG]                  = handle_invlpg,
2414         [EXIT_REASON_CR_ACCESS]               = handle_cr,
2415         [EXIT_REASON_DR_ACCESS]               = handle_dr,
2416         [EXIT_REASON_CPUID]                   = handle_cpuid,
2417         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
2418         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
2419         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
2420         [EXIT_REASON_HLT]                     = handle_halt,
2421 };
2422
2423 static const int litevm_vmx_max_exit_handlers =
2424         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2425
2426 /*
2427  * The guest has exited.  See if we can fix it or if we need userspace
2428  * assistance.
2429  */
2430 static int litevm_handle_exit(struct litevm_run *litevm_run, struct litevm_vcpu *vcpu)
2431 {
2432         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2433         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2434
2435         if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
2436                                 exit_reason != EXIT_REASON_EXCEPTION_NMI )
2437                 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2438                        "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2439         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2440         if (exit_reason < litevm_vmx_max_exit_handlers
2441             && litevm_vmx_exit_handlers[exit_reason])
2442                 return litevm_vmx_exit_handlers[exit_reason](vcpu, litevm_run);
2443         else {
2444                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2445                 litevm_run->hw.hardware_exit_reason = exit_reason;
2446         }
2447         return 0;
2448 }
2449
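/*
 * Deliver an interrupt to a real-mode guest by hand: push FLAGS, CS and IP
 * on the guest stack and vector through the IVT entry for @irq.
 */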
2450 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2451 {
2452         uint16_t ent[2];
2453         uint16_t cs;
2454         uint16_t ip;
2455         unsigned long flags;
2456         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2457         uint16_t sp =  vmcs_readl(GUEST_RSP);
2458         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2459
2460         if (sp > ss_limit || sp < 6) {  /* need room to push flags, cs and ip */
2461                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2462                             __FUNCTION__,
2463                             vmcs_readl(GUEST_RSP),
2464                             vmcs_readl(GUEST_SS_BASE),
2465                             vmcs_read32(GUEST_SS_LIMIT));
2466                 return;
2467         }
2468
2469         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2470                                                                 sizeof(ent)) {
2471                 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2472                 return;
2473         }
2474
2475         flags =  vmcs_readl(GUEST_RFLAGS);
2476         cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
2477         ip =  vmcs_readl(GUEST_RIP);
2478
2479
2480         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2481             litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2482             litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2483                 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2484                 return;
2485         }
2486
2487         vmcs_writel(GUEST_RFLAGS, flags &
2488                     ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2489         vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
2490         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2491         vmcs_writel(GUEST_RIP, ent[0]);
2492         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2493 }
2494
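/*
 * Take the lowest-numbered pending interrupt off the vcpu's queue and inject
 * it, either through the VM-entry interruption field or, in real mode, by
 * emulating the IVT dispatch.
 */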
2495 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2496 {
2497         int word_index = __ffs(vcpu->irq_summary);
2498         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2499         int irq = word_index * BITS_PER_LONG + bit_index;
2500
2501         clear_bit(bit_index, &vcpu->irq_pending[word_index]);
2502         if (!vcpu->irq_pending[word_index])
2503                 clear_bit(word_index, &vcpu->irq_summary);
2504
2505         if (vcpu->rmode.active) {
2506                 inject_rmode_irq(vcpu, irq);
2507                 return;
2508         }
2509         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2510                         irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2511 }
2512
2513 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2514 {
2515         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2516             && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2517                 /*
2518                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2519                  */
2520                 litevm_do_inject_irq(vcpu);
2521         else
2522                 /*
2523                  * Interrupts blocked.  Wait for unblock.
2524                  */
2525                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2526                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2527                              | CPU_BASED_VIRTUAL_INTR_PENDING);
2528 }
2529
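/*
 * Load the guest-debug breakpoints into the hardware debug registers and,
 * when single-stepping, set TF and RF in the guest RFLAGS before entry.
 */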
2530 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2531 {
2532         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2533
2534         set_debugreg(dbg->bp[0], 0);
2535         set_debugreg(dbg->bp[1], 1);
2536         set_debugreg(dbg->bp[2], 2);
2537         set_debugreg(dbg->bp[3], 3);
2538
2539         if (dbg->singlestep) {
2540                 unsigned long flags;
2541
2542                 flags = vmcs_readl(GUEST_RFLAGS);
2543                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2544                 vmcs_writel(GUEST_RFLAGS, flags);
2545         }
2546 }
2547
2548 static void load_msrs(struct vmx_msr_entry *e, int n)
2549 {
2550         int i;
2551
2552         for (i = 0; i < n; ++i)
2553                 wrmsrl(e[i].index, e[i].data);
2554 }
2555
2556 static void save_msrs(struct vmx_msr_entry *e, int n)
2557 {
2558         int i;
2559
2560         for (i = 0; i < n; ++i)
2561                 rdmsrl(e[i].index, e[i].data);
2562 }
2563
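/*
 * Run a vcpu: save host state, enter the guest with vmlaunch/vmresume, and
 * dispatch the resulting exit, looping until the exit needs userspace
 * attention.
 */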
2564 static int litevm_dev_ioctl_run(struct litevm *litevm, struct litevm_run *litevm_run)
2565 {
2566         struct litevm_vcpu *vcpu;
2567         uint8_t fail;
2568         uint16_t fs_sel, gs_sel, ldt_sel;
2569         int fs_gs_ldt_reload_needed;
2570
2571         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2572                 return -EINVAL;
2573
2574         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2575         if (!vcpu)
2576                 return -ENOENT;
2577
2578         if (litevm_run->emulated) {
2579                 skip_emulated_instruction(vcpu);
2580                 litevm_run->emulated = 0;
2581         }
2582
2583         if (litevm_run->mmio_completed) {
2584                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
2585                 vcpu->mmio_read_completed = 1;
2586         }
2587
2588         vcpu->mmio_needed = 0;
2589
2590 again:
2591         /*
2592          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2593          * allow segment selectors with cpl > 0 or ti == 1.
2594          */
2595         fs_sel = read_fs();
2596         gs_sel = read_gs();
2597         ldt_sel = read_ldt();
2598         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
2599         if (!fs_gs_ldt_reload_needed) {
2600                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2601                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2602         } else {
2603                 vmcs_write16(HOST_FS_SELECTOR, 0);
2604                 vmcs_write16(HOST_GS_SELECTOR, 0);
2605         }
2606
2607 #ifdef __x86_64__
2608         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2609         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2610 #endif
2611
2612         if (vcpu->irq_summary &&
2613             !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2614                 litevm_try_inject_irq(vcpu);
2615
2616         if (vcpu->guest_debug.enabled)
2617                 litevm_guest_debug_pre(vcpu);
2618
2619         fx_save(vcpu->host_fx_image);
2620         fx_restore(vcpu->guest_fx_image);
2621
2622         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
2623         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2624
2625         asm (
2626                 /* Store host registers */
2627                 "pushf \n\t"
2628 #ifdef __x86_64__
2629                 "push %%rax; push %%rbx; push %%rdx;"
2630                 "push %%rsi; push %%rdi; push %%rbp;"
2631                 "push %%r8;  push %%r9;  push %%r10; push %%r11;"
2632                 "push %%r12; push %%r13; push %%r14; push %%r15;"
2633                 "push %%rcx \n\t"
2634                 "vmwrite %%rsp, %2 \n\t"
2635 #else
2636                 "pusha; push %%ecx \n\t"
2637                 "vmwrite %%esp, %2 \n\t"
2638 #endif
2639                 /* Check if vmlaunch or vmresume is needed */
2640                 "cmp $0, %1 \n\t"
2641                 /* Load guest registers.  Don't clobber flags. */
2642 #ifdef __x86_64__
2643                 "mov %c[cr2](%3), %%rax \n\t"
2644                 "mov %%rax, %%cr2 \n\t"
2645                 "mov %c[rax](%3), %%rax \n\t"
2646                 "mov %c[rbx](%3), %%rbx \n\t"
2647                 "mov %c[rdx](%3), %%rdx \n\t"
2648                 "mov %c[rsi](%3), %%rsi \n\t"
2649                 "mov %c[rdi](%3), %%rdi \n\t"
2650                 "mov %c[rbp](%3), %%rbp \n\t"
2651                 "mov %c[r8](%3),  %%r8  \n\t"
2652                 "mov %c[r9](%3),  %%r9  \n\t"
2653                 "mov %c[r10](%3), %%r10 \n\t"
2654                 "mov %c[r11](%3), %%r11 \n\t"
2655                 "mov %c[r12](%3), %%r12 \n\t"
2656                 "mov %c[r13](%3), %%r13 \n\t"
2657                 "mov %c[r14](%3), %%r14 \n\t"
2658                 "mov %c[r15](%3), %%r15 \n\t"
2659                 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
2660 #else
2661                 "mov %c[cr2](%3), %%eax \n\t"
2662                 "mov %%eax,   %%cr2 \n\t"
2663                 "mov %c[rax](%3), %%eax \n\t"
2664                 "mov %c[rbx](%3), %%ebx \n\t"
2665                 "mov %c[rdx](%3), %%edx \n\t"
2666                 "mov %c[rsi](%3), %%esi \n\t"
2667                 "mov %c[rdi](%3), %%edi \n\t"
2668                 "mov %c[rbp](%3), %%ebp \n\t"
2669                 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
2670 #endif
2671                 /* Enter guest mode */
2672                 "jne launched \n\t"
2673                 "vmlaunch \n\t"
2674                 "jmp litevm_vmx_return \n\t"
2675                 "launched: vmresume \n\t"
2676                 ".globl litevm_vmx_return \n\t"
2677                 "litevm_vmx_return: "
2678                 /* Save guest registers, load host registers, keep flags */
2679 #ifdef __x86_64__
2680                 "xchg %3,     0(%%rsp) \n\t"
2681                 "mov %%rax, %c[rax](%3) \n\t"
2682                 "mov %%rbx, %c[rbx](%3) \n\t"
2683                 "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
2684                 "mov %%rdx, %c[rdx](%3) \n\t"
2685                 "mov %%rsi, %c[rsi](%3) \n\t"
2686                 "mov %%rdi, %c[rdi](%3) \n\t"
2687                 "mov %%rbp, %c[rbp](%3) \n\t"
2688                 "mov %%r8,  %c[r8](%3) \n\t"
2689                 "mov %%r9,  %c[r9](%3) \n\t"
2690                 "mov %%r10, %c[r10](%3) \n\t"
2691                 "mov %%r11, %c[r11](%3) \n\t"
2692                 "mov %%r12, %c[r12](%3) \n\t"
2693                 "mov %%r13, %c[r13](%3) \n\t"
2694                 "mov %%r14, %c[r14](%3) \n\t"
2695                 "mov %%r15, %c[r15](%3) \n\t"
2696                 "mov %%cr2, %%rax   \n\t"
2697                 "mov %%rax, %c[cr2](%3) \n\t"
2698                 "mov 0(%%rsp), %3 \n\t"
2699
2700                 "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
2701                 "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
2702                 "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
2703                 "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
2704 #else
2705                 "xchg %3, 0(%%esp) \n\t"
2706                 "mov %%eax, %c[rax](%3) \n\t"
2707                 "mov %%ebx, %c[rbx](%3) \n\t"
2708                 "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
2709                 "mov %%edx, %c[rdx](%3) \n\t"
2710                 "mov %%esi, %c[rsi](%3) \n\t"
2711                 "mov %%edi, %c[rdi](%3) \n\t"
2712                 "mov %%ebp, %c[rbp](%3) \n\t"
2713                 "mov %%cr2, %%eax  \n\t"
2714                 "mov %%eax, %c[cr2](%3) \n\t"
2715                 "mov 0(%%esp), %3 \n\t"
2716
2717                 "pop %%ecx; popa \n\t"
2718 #endif
2719                 "setbe %0 \n\t"
2720                 "popf \n\t"
2721               : "=g" (fail)
2722               : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
2723                 "c"(vcpu),
2724                 [rax]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
2725                 [rbx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
2726                 [rcx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
2727                 [rdx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
2728                 [rsi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
2729                 [rdi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
2730                 [rbp]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
2731 #ifdef __x86_64__
2732                 [r8 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8 ])),
2733                 [r9 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9 ])),
2734                 [r10]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
2735                 [r11]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
2736                 [r12]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
2737                 [r13]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
2738                 [r14]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
2739                 [r15]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
2740 #endif
2741                 [cr2]"i"(offsetof(struct litevm_vcpu, cr2))
2742               : "cc", "memory" );
2743
2744         ++litevm_stat.exits;
2745
2746         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2747         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
2748
2749         fx_save(vcpu->guest_fx_image);
2750         fx_restore(vcpu->host_fx_image);
2751
2752 #ifndef __x86_64__
2753         asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2754 #endif
2755
2756         litevm_run->exit_type = 0;
2757         if (fail) {
2758                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
2759                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
2760         } else {
2761                 if (fs_gs_ldt_reload_needed) {
2762                         load_ldt(ldt_sel);
2763                         load_fs(fs_sel);
2764                         /*
2765                          * If we have to reload gs, we must take care to
2766                          * preserve our gs base.
2767                          */
2768                         local_irq_disable();
2769                         load_gs(gs_sel);
2770 #ifdef __x86_64__
2771                         wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
2772 #endif
2773                         local_irq_enable();
2774
2775                         reload_tss();
2776                 }
2777                 vcpu->launched = 1;
2778                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
2779                 if (litevm_handle_exit(litevm_run, vcpu)) {
2780                         /* Give scheduler a chance to reschedule. */
2781                         vcpu_put(vcpu);
2782                         if (signal_pending(current)) {
2783                                 ++litevm_stat.signal_exits;
2784                                 return -EINTR;
2785                         }
2786                         cond_resched();
2787                         /* Cannot fail - no vcpu unplug yet. */
2788                         vcpu_load(litevm, vcpu_slot(vcpu));
2789                         goto again;
2790                 }
2791         }
2792
2793         vcpu_put(vcpu);
2794         return 0;
2795 }
2796
2797 static int litevm_dev_ioctl_get_regs(struct litevm *litevm, struct litevm_regs *regs)
2798 {
2799         struct litevm_vcpu *vcpu;
2800
2801         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
2802                 return -EINVAL;
2803
2804         vcpu = vcpu_load(litevm, regs->vcpu);
2805         if (!vcpu)
2806                 return -ENOENT;
2807
2808         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2809         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2810         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2811         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2812         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2813         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2814         regs->rsp = vmcs_readl(GUEST_RSP);
2815         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2816 #ifdef __x86_64__
2817         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2818         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2819         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2820         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2821         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2822         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2823         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2824         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2825 #endif
2826
2827         regs->rip = vmcs_readl(GUEST_RIP);
2828         regs->rflags = vmcs_readl(GUEST_RFLAGS);
2829
2830         /*
2831          * Don't leak debug flags in case they were set for guest debugging
2832          */
2833         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2834                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2835
2836         vcpu_put(vcpu);
2837
2838         return 0;
2839 }
2840
2841 static int litevm_dev_ioctl_set_regs(struct litevm *litevm, struct litevm_regs *regs)
2842 {
2843         struct litevm_vcpu *vcpu;
2844
2845         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS)
2846                 return -EINVAL;
2847
2848         vcpu = vcpu_load(litevm, regs->vcpu);
2849         if (!vcpu)
2850                 return -ENOENT;
2851
2852         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2853         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2854         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2855         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2856         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2857         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2858         vmcs_writel(GUEST_RSP, regs->rsp);
2859         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2860 #ifdef __x86_64__
2861         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2862         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2863         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2864         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2865         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2866         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2867         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2868         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2869 #endif
2870
2871         vmcs_writel(GUEST_RIP, regs->rip);
2872         vmcs_writel(GUEST_RFLAGS, regs->rflags);
2873
2874         vcpu_put(vcpu);
2875
2876         return 0;
2877 }
2878
2879 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
2880 {
2881         struct litevm_vcpu *vcpu;
2882
2883         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
2884                 return -EINVAL;
2885         vcpu = vcpu_load(litevm, sregs->vcpu);
2886         if (!vcpu)
2887                 return -ENOENT;
2888
2889 #define get_segment(var, seg) \
2890         do { \
2891                 uint32_t ar; \
2892                 \
2893                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
2894                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
2895                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
2896                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
2897                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
2898                 sregs->var.type = ar & 15; \
2899                 sregs->var.s = (ar >> 4) & 1; \
2900                 sregs->var.dpl = (ar >> 5) & 3; \
2901                 sregs->var.present = (ar >> 7) & 1; \
2902                 sregs->var.avl = (ar >> 12) & 1; \
2903                 sregs->var.l = (ar >> 13) & 1; \
2904                 sregs->var.db = (ar >> 14) & 1; \
2905                 sregs->var.g = (ar >> 15) & 1; \
2906                 sregs->var.unusable = (ar >> 16) & 1; \
2907         } while (0)
2908
2909         get_segment(cs, CS);
2910         get_segment(ds, DS);
2911         get_segment(es, ES);
2912         get_segment(fs, FS);
2913         get_segment(gs, GS);
2914         get_segment(ss, SS);
2915
2916         get_segment(tr, TR);
2917         get_segment(ldt, LDTR);
2918 #undef get_segment
2919
2920 #define get_dtable(var, table) \
2921         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
2922                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
2923
2924         get_dtable(idt, IDTR);
2925         get_dtable(gdt, GDTR);
2926 #undef get_dtable
2927
2928         sregs->cr0 = guest_cr0();
2929         sregs->cr2 = vcpu->cr2;
2930         sregs->cr3 = vcpu->cr3;
2931         sregs->cr4 = guest_cr4();
2932         sregs->cr8 = vcpu->cr8;
2933         sregs->efer = vcpu->shadow_efer;
2934         sregs->apic_base = vcpu->apic_base;
2935
2936         sregs->pending_int = vcpu->irq_summary != 0;
2937
2938         vcpu_put(vcpu);
2939
2940         return 0;
2941 }
2942
2943 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm, struct litevm_sregs *sregs)
2944 {
2945         struct litevm_vcpu *vcpu;
2946         int mmu_reset_needed = 0;
2947
2948         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS)
2949                 return -EINVAL;
2950         vcpu = vcpu_load(litevm, sregs->vcpu);
2951         if (!vcpu)
2952                 return -ENOENT;
2953
2954 #define set_segment(var, seg) \
2955         do { \
2956                 uint32_t ar; \
2957                 \
2958                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
2959                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
2960                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
2961                 if (sregs->var.unusable) { \
2962                         ar = (1 << 16); \
2963                 } else { \
2964                         ar = (sregs->var.type & 15); \
2965                         ar |= (sregs->var.s & 1) << 4; \
2966                         ar |= (sregs->var.dpl & 3) << 5; \
2967                         ar |= (sregs->var.present & 1) << 7; \
2968                         ar |= (sregs->var.avl & 1) << 12; \
2969                         ar |= (sregs->var.l & 1) << 13; \
2970                         ar |= (sregs->var.db & 1) << 14; \
2971                         ar |= (sregs->var.g & 1) << 15; \
2972                 } \
2973                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
2974         } while (0)
2975
2976         set_segment(cs, CS);
2977         set_segment(ds, DS);
2978         set_segment(es, ES);
2979         set_segment(fs, FS);
2980         set_segment(gs, GS);
2981         set_segment(ss, SS);
2982
2983         set_segment(tr, TR);
2984
2985         set_segment(ldt, LDTR);
2986 #undef set_segment
2987
2988 #define set_dtable(var, table) \
2989         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
2990         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
2991
2992         set_dtable(idt, IDTR);
2993         set_dtable(gdt, GDTR);
2994 #undef set_dtable
2995
2996         vcpu->cr2 = sregs->cr2;
2997         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2998         vcpu->cr3 = sregs->cr3;
2999
3000         vcpu->cr8 = sregs->cr8;
3001
3002         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3003 #ifdef __x86_64__
3004         __set_efer(vcpu, sregs->efer);
3005 #endif
3006         vcpu->apic_base = sregs->apic_base;
3007
3008         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3009         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3010         update_exception_bitmap(vcpu);
3011         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3012         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3013
3014         mmu_reset_needed |=  guest_cr4() != sregs->cr4;
3015         __set_cr4(vcpu, sregs->cr4);
3016
3017         if (mmu_reset_needed)
3018                 litevm_mmu_reset_context(vcpu);
3019         vcpu_put(vcpu);
3020
3021         return 0;
3022 }
3023
3024 /*
3025  * Translate a guest virtual address to a guest physical address.
3026  */
3027 static int litevm_dev_ioctl_translate(struct litevm *litevm, struct litevm_translation *tr)
3028 {
3029         unsigned long vaddr = tr->linear_address;
3030         struct litevm_vcpu *vcpu;
3031         gpa_t gpa;
3032
3033         vcpu = vcpu_load(litevm, tr->vcpu);
3034         if (!vcpu)
3035                 return -ENOENT;
3036         spin_lock(&litevm->lock);
3037         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3038         tr->physical_address = gpa;
3039         tr->valid = gpa != UNMAPPED_GVA;
3040         tr->writeable = 1;
3041         tr->usermode = 0;
3042         spin_unlock(&litevm->lock);
3043         vcpu_put(vcpu);
3044
3045         return 0;
3046 }
3047
3048 static int litevm_dev_ioctl_interrupt(struct litevm *litevm, struct litevm_interrupt *irq)
3049 {
3050         struct litevm_vcpu *vcpu;
3051
3052         if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
3053                 return -EINVAL;
3054         if (irq->irq < 0 || irq->irq >= 256)
3055                 return -EINVAL;
3056         vcpu = vcpu_load(litevm, irq->vcpu);
3057         if (!vcpu)
3058                 return -ENOENT;
3059
3060         set_bit(irq->irq, vcpu->irq_pending);
3061         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
3062
3063         vcpu_put(vcpu);
3064
3065         return 0;
3066 }
3067
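/*
 * Program the guest's hardware breakpoints and single-step state from a
 * LITEVM_DEBUG_GUEST request.
 */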
3068 static int litevm_dev_ioctl_debug_guest(struct litevm *litevm,
3069                                      struct litevm_debug_guest *dbg)
3070 {
3071         struct litevm_vcpu *vcpu;
3072         unsigned long dr7 = 0x400;
3073         uint32_t exception_bitmap;
3074         int old_singlestep;
3075
3076         if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS)
3077                 return -EINVAL;
3078         vcpu = vcpu_load(litevm, dbg->vcpu);
3079         if (!vcpu)
3080                 return -ENOENT;
3081
3082         exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
3083         old_singlestep = vcpu->guest_debug.singlestep;
3084
3085         vcpu->guest_debug.enabled = dbg->enabled;
3086         if (vcpu->guest_debug.enabled) {
3087                 int i;
3088
3089                 dr7 |= 0x200;  /* exact */
3090                 for (i = 0; i < 4; ++i) {
3091                         if (!dbg->breakpoints[i].enabled)
3092                                 continue;
3093                         vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
3094                         dr7 |= 2 << (i*2);    /* global enable */
3095                         dr7 |= 0 << (i*4+16); /* execution breakpoint */
3096                 }
3097
3098                 exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
3099
3100                 vcpu->guest_debug.singlestep = dbg->singlestep;
3101         } else {
3102                 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
3103                 vcpu->guest_debug.singlestep = 0;
3104         }
3105
3106         if (old_singlestep && !vcpu->guest_debug.singlestep) {
3107                 unsigned long flags;
3108
3109                 flags = vmcs_readl(GUEST_RFLAGS);
3110                 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3111                 vmcs_writel(GUEST_RFLAGS, flags);
3112         }
3113
3114         vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
3115         vmcs_writel(GUEST_DR7, dr7);
3116
3117         vcpu_put(vcpu);
3118
3119         return 0;
3120 }
3121
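/*
 * Per-VM ioctl dispatcher: unpack each request from userspace, call the
 * matching handler, and copy any results back out.
 *
 * A minimal userspace sketch (vm_fd and handle_guest_io are hypothetical,
 * shown only to illustrate the calling convention):
 *
 *      struct litevm_run run = { .vcpu = 0 };
 *      if (ioctl(vm_fd, LITEVM_RUN, &run) == 0 &&
 *          run.exit_reason == LITEVM_EXIT_IO)
 *              handle_guest_io(&run);
 */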
3122 static long litevm_dev_ioctl(struct file *filp,
3123                           unsigned int ioctl, unsigned long arg)
3124 {
3125         struct litevm *litevm = filp->private_data;
3126         int r = -EINVAL;
3127
3128         switch (ioctl) {
3129         case LITEVM_CREATE_VCPU: {
3130                 r = litevm_dev_ioctl_create_vcpu(litevm, arg);
3131                 if (r)
3132                         goto out;
3133                 break;
3134         }
3135         case LITEVM_RUN: {
3136                 struct litevm_run litevm_run;
3137
3138                 r = -EFAULT;
3139                 if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run))
3140                         goto out;
3141                 r = litevm_dev_ioctl_run(litevm, &litevm_run);
3142                 if (r < 0)
3143                         goto out;
3144                 r = -EFAULT;
3145                 if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run))
3146                         goto out;
3147                 r = 0;
3148                 break;
3149         }
3150         case LITEVM_GET_REGS: {
3151                 struct litevm_regs litevm_regs;
3152
3153                 r = -EFAULT;
3154                 if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
3155                         goto out;
3156                 r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs);
3157                 if (r)
3158                         goto out;
3159                 r = -EFAULT;
3160                 if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs))
3161                         goto out;
3162                 r = 0;
3163                 break;
3164         }
3165         case LITEVM_SET_REGS: {
3166                 struct litevm_regs litevm_regs;
3167
3168                 r = -EFAULT;
3169                 if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs))
3170                         goto out;
3171                 r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs);
3172                 if (r)
3173                         goto out;
3174                 r = 0;
3175                 break;
3176         }
3177         case LITEVM_GET_SREGS: {
3178                 struct litevm_sregs litevm_sregs;
3179
3180                 r = -EFAULT;
3181                 if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3182                         goto out;
3183                 r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs);
3184                 if (r)
3185                         goto out;
3186                 r = -EFAULT;
3187                 if (copy_to_user((void *)arg, &litevm_sregs, sizeof litevm_sregs))
3188                         goto out;
3189                 r = 0;
3190                 break;
3191         }
3192         case LITEVM_SET_SREGS: {
3193                 struct litevm_sregs litevm_sregs;
3194
3195                 r = -EFAULT;
3196                 if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
3197                         goto out;
3198                 r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs);
3199                 if (r)
3200                         goto out;
3201                 r = 0;
3202                 break;
3203         }
3204         case LITEVM_TRANSLATE: {
3205                 struct litevm_translation tr;
3206
3207                 r = -EFAULT;
3208                 if (copy_from_user(&tr, (void *)arg, sizeof tr))
3209                         goto out;
3210                 r = litevm_dev_ioctl_translate(litevm, &tr);
3211                 if (r)
3212                         goto out;
3213                 r = -EFAULT;
3214                 if (copy_to_user((void *)arg, &tr, sizeof tr))
3215                         goto out;
3216                 r = 0;
3217                 break;
3218         }
3219         case LITEVM_INTERRUPT: {
3220                 struct litevm_interrupt irq;
3221
3222                 r = -EFAULT;
3223                 if (copy_from_user(&irq, (void *)arg, sizeof irq))
3224                         goto out;
3225                 r = litevm_dev_ioctl_interrupt(litevm, &irq);
3226                 if (r)
3227                         goto out;
3228                 r = 0;
3229                 break;
3230         }
3231         case LITEVM_DEBUG_GUEST: {
3232                 struct litevm_debug_guest dbg;
3233
3234                 r = -EFAULT;
3235                 if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
3236                         goto out;
3237                 r = litevm_dev_ioctl_debug_guest(litevm, &dbg);
3238                 if (r)
3239                         goto out;
3240                 r = 0;
3241                 break;
3242         }
3243         case LITEVM_SET_MEMORY_REGION: {
3244                 struct litevm_memory_region litevm_mem;
3245
3246                 r = -EFAULT;
3247                 if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem))
3248                         goto out;
3249                 r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem);
3250                 if (r)
3251                         goto out;
3252                 break;
3253         }
3254         case LITEVM_GET_DIRTY_LOG: {
3255                 struct litevm_dirty_log log;
3256
3257                 r = -EFAULT;
3258                 if (copy_from_user(&log, (void *)arg, sizeof log))
3259                         goto out;
3260                 r = litevm_dev_ioctl_get_dirty_log(litevm, &log);
3261                 if (r)
3262                         goto out;
3263                 break;
3264         }
3265         default:
3266                 ;       /* unknown ioctl: r stays -EINVAL */
3267         }
3268 out:
3269         return r;
3270 }
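/*
 * Hypothetical userspace sketch (not part of this file), assuming the VM is
 * exposed to user space as a file descriptor "vmfd" and the ioctl numbers
 * above are visible there; injecting vector 32 into vcpu 0 would then look
 * roughly like:
 *
 *      struct litevm_interrupt irq = { .vcpu = 0, .irq = 32 };
 *      if (ioctl(vmfd, LITEVM_INTERRUPT, &irq) < 0)
 *              perror("LITEVM_INTERRUPT");
 */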
3271
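/*
 * Disabled: an mmap fault handler that would back a userspace mapping of the
 * VM with guest frames via gfn_to_memslot()/gfn_to_page().
 */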
3272 #if 0
3273 static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3274 {
3275         struct litevm *litevm = vma->vm_file->private_data;
3276         struct litevm_memory_slot *slot;
3277         struct page *page;
3278
3279         slot = gfn_to_memslot(litevm, vmf->pgoff);
3280         if (!slot)
3281                 return VM_FAULT_SIGBUS;
3282         page = gfn_to_page(slot, vmf->pgoff);
3283         if (!page)
3284                 return VM_FAULT_SIGBUS;
3285
3286         get_page(page);
3287         vmf->page = page;
3288         return 0;
3289 }
3290 #endif
3291
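/*
 * Reboot notifier.  For now it just panics; the disabled code below would
 * leave VMX root mode before a restart, since some BIOSes hang if a CPU is
 * still in VMX root mode when the machine reboots.
 */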
3292 static int litevm_reboot(struct notifier_block *notifier, unsigned long val,
3293                        void *v)
3294 {
3295         panic("litevm_reboot");
3296 #if 0
3297         if (val == SYS_RESTART) {
3298                 /*
3299                  * Some (well, at least mine) BIOSes hang on reboot if
3300                  * in vmx root mode.
3301                  */
3302                 printk("litevm: exiting vmx mode\n");
3303                 on_each_cpu(litevm_disable, 0, 1);
3304         }
3305         return NOTIFY_OK;
3306 #endif
3307         return 0;
3308 }
3309
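/*
 * Physical address of a single zeroed page allocated at init time;
 * presumably handed out in place of guest frames that have no backing
 * memory.
 */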
3310 hpa_t bad_page_address;
3311
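/*
 * Module initialization: set up the VMCS descriptor, allocate the per-cpu
 * VMX areas, and reserve the zeroed "bad page".  The hardware-support/BIOS
 * checks and the per-cpu VMXON enable are still disabled in this port (see
 * the #warning below).
 */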
3312 static int litevm_init(void)
3313 {
3314         static struct page *bad_page;
3315         int r = 0;
3316
3317 #if 0
3318         if (!cpu_has_litevm_support()) {
3319                 printk("litevm: no hardware support\n");
3320                 return -EOPNOTSUPP;
3321         }
3322         if (vmx_disabled_by_bios()) {
3323                 printk("litevm: disabled by bios\n");
3324                 return -EOPNOTSUPP;
3325         }
3326 #endif
3327
3328         setup_vmcs_descriptor();
3329         r = alloc_litevm_area();
3330         if (r)
3331                 goto out;
3332 #warning "on each cpu ..."
3333 //      on_each_cpu(litevm_enable, 0, 1);
3334
3335         if ((bad_page = alloc_page(KMALLOC_WAIT)) == NULL) {
3336                 r = -ENOMEM;
3337                 goto out_free;
3338         }
3339
3340         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3341         memset(__va(bad_page_address), 0, PAGE_SIZE);
3342
3343         return r;
3344
3345 out_free:
3346         free_litevm_area();
3347 out:
3348         return r;
3349 }
3350
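/*
 * Module teardown: release the per-cpu VMX areas and free the bad page.
 */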
3351 static void litevm_exit(void)
3352 {
3353         free_litevm_area();
3354         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3355 }
3356
3357