1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  */
14
15 #define DEBUG
16 #define LITEVM_DEBUG
17
18 #include <kmalloc.h>
19 #include <string.h>
20 #include <stdio.h>
21 #include <assert.h>
22 #include <error.h>
23 #include <pmap.h>
24 #include <sys/queue.h>
25 #include <smp.h>
26 #include <kref.h>
27 #include <atomic.h>
28 #include <alarm.h>
29 #include <event.h>
30 #include <umem.h>
31 #include <devalarm.h>
32 #include <arch/types.h>
33 #include <arch/vm.h>
34 #include <arch/emulate.h>
35 #include <arch/vmdebug.h>
36 #include <arch/msr-index.h>
37
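/* Shorthand for this core's per_cpu_info entry; the VMX code keeps its
 * per-core state (current VMCS, VMXON region) there. */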
38 #define currentcpu (&per_cpu_info[core_id()])
39
40 struct litevm_stat litevm_stat;
41
42 static struct litevm_stats_debugfs_item {
43         const char *name;
44         uint32_t *data;
} debugfs_entries[] = {
	{"pf_fixed", &litevm_stat.pf_fixed},
	{"pf_guest", &litevm_stat.pf_guest},
	{"tlb_flush", &litevm_stat.tlb_flush},
	{"invlpg", &litevm_stat.invlpg},
	{"exits", &litevm_stat.exits},
	{"io_exits", &litevm_stat.io_exits},
	{"mmio_exits", &litevm_stat.mmio_exits},
	{"signal_exits", &litevm_stat.signal_exits},
	{"irq_exits", &litevm_stat.irq_exits},
	{0, 0}
};
58
59 static struct dentry *debugfs_dir;
60
61 static const uint32_t vmx_msr_index[] = {
62 #ifdef __x86_64__
63         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
64 #endif
65         MSR_EFER,       // wtf? MSR_K6_STAR,
66 };
67
68 #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
69
70 #ifdef __x86_64__
71 /*
72  * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
73  * mechanism (cpu bug AA24)
74  */
75 #define NR_BAD_MSRS 2
76 #else
77 #define NR_BAD_MSRS 0
78 #endif
79
80 #define TSS_IOPB_BASE_OFFSET 0x66
81 #define TSS_BASE_SIZE 0x68
82 #define TSS_IOPB_SIZE (65536 / 8)
83 #define TSS_REDIRECTION_SIZE (256 / 8)
84 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
85
86 #define MSR_IA32_VMX_BASIC_MSR                  0x480
87 #define MSR_IA32_VMX_PINBASED_CTLS_MSR          0x481
88 #define MSR_IA32_VMX_PROCBASED_CTLS_MSR         0x482
89 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
90 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
91
92 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
93 #define LMSW_GUEST_MASK 0x0eULL
94 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
95 //#define CR4_VMXE 0x2000
96 #define CR8_RESEVED_BITS (~0x0fULL)
97 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
98
99 #ifdef __x86_64__
100 #define HOST_IS_64 1
101 #else
102 #define HOST_IS_64 0
103 #endif
104
105 /* bit ops not yet widely used in akaros and we're not sure where to put them. */
106 /**
107  * __ffs - find first set bit in word
108  * @word: The word to search
109  *
110  * Undefined if no bit exists, so code should check against 0 first.
111  */
112 static inline unsigned long __ffs(unsigned long word)
113 {
114         print_func_entry();
115 asm("rep; bsf %1,%0":"=r"(word)
116 :               "rm"(word));
117         print_func_exit();
118         return word;
119 }
120
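/* Return the guest MSR save/load slot for @msr, or 0 if this vcpu does not
 * track that MSR. */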
121 static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu,
122                                                                                         uint32_t msr)
123 {
124         print_func_entry();
125         int i;
126
127         for (i = 0; i < vcpu->nmsrs; ++i)
128                 if (vcpu->guest_msrs[i].index == msr) {
129                         print_func_exit();
130                         return &vcpu->guest_msrs[i];
131                 }
132         print_func_exit();
133         return 0;
134 }
135
136 struct descriptor_table {
137         uint16_t limit;
138         unsigned long base;
139 } __attribute__ ((packed));
140
141 static void get_gdt(struct descriptor_table *table)
142 {
143         print_func_entry();
144 asm("sgdt %0":"=m"(*table));
145         print_func_exit();
146 }
147
148 static void get_idt(struct descriptor_table *table)
149 {
150         print_func_entry();
151 asm("sidt %0":"=m"(*table));
152         print_func_exit();
153 }
154
155 static uint16_t read_fs(void)
156 {
157         print_func_entry();
158         uint16_t seg;
159 asm("mov %%fs, %0":"=g"(seg));
160         print_func_exit();
161         return seg;
162 }
163
164 static uint16_t read_gs(void)
165 {
166         print_func_entry();
167         uint16_t seg;
168 asm("mov %%gs, %0":"=g"(seg));
169         print_func_exit();
170         return seg;
171 }
172
173 static uint16_t read_ldt(void)
174 {
175         print_func_entry();
176         uint16_t ldt;
177 asm("sldt %0":"=g"(ldt));
178         print_func_exit();
179         return ldt;
180 }
181
182 static void load_fs(uint16_t sel)
183 {
184         print_func_entry();
185 asm("mov %0, %%fs": :"g"(sel));
186         print_func_exit();
187 }
188
189 static void load_gs(uint16_t sel)
190 {
191         print_func_entry();
192 asm("mov %0, %%gs": :"g"(sel));
193         print_func_exit();
194 }
195
196 #ifndef load_ldt
197 static void load_ldt(uint16_t sel)
198 {
199         print_func_entry();
200 asm("lldt %0": :"g"(sel));
201         print_func_exit();
202 }
203 #endif
204
205 static void fx_save(void *image)
206 {
207         print_func_entry();
208         asm("fxsave (%0)"::"r"(image));
209         print_func_exit();
210 }
211
212 static void fx_restore(void *image)
213 {
214         print_func_entry();
215         asm("fxrstor (%0)"::"r"(image));
216         print_func_exit();
217 }
218
219 static void fpu_init(void)
220 {
221         print_func_entry();
222         asm("finit");
223         print_func_exit();
224 }
225
226 struct segment_descriptor {
227         uint16_t limit_low;
228         uint16_t base_low;
229         uint8_t base_mid;
230         uint8_t type:4;
231         uint8_t system:1;
232         uint8_t dpl:2;
233         uint8_t present:1;
234         uint8_t limit_high:4;
235         uint8_t avl:1;
236         uint8_t long_mode:1;
237         uint8_t default_op:1;
238         uint8_t granularity:1;
239         uint8_t base_high;
240 } __attribute__ ((packed));
241
242 #ifdef __x86_64__
243 // LDT or TSS descriptor in the GDT. 16 bytes.
244 struct segment_descriptor_64 {
245         struct segment_descriptor s;
246         uint32_t base_higher;
247         uint32_t pad_zero;
248 };
249
250 #endif
251
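/* Compute the linear base address of @selector by reading its descriptor out
 * of the live GDT (or the current LDT when the TI bit is set), the same walk
 * the hardware performs on a segment load. */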
252 static unsigned long segment_base(uint16_t selector)
253 {
254         print_func_entry();
255         struct descriptor_table gdt;
256         struct segment_descriptor *d;
257         unsigned long table_base;
258         typedef unsigned long ul;
259         unsigned long v;
260
261 asm("sgdt %0":"=m"(gdt));
262         table_base = gdt.base;
263
264         if (selector & 4) {     /* from ldt */
265                 uint16_t ldt_selector;
266
267 asm("sldt %0":"=g"(ldt_selector));
268                 table_base = segment_base(ldt_selector);
269         }
270         d = (struct segment_descriptor *)(table_base + (selector & ~7));
271         v = d->base_low | ((ul) d->base_mid << 16) | ((ul) d->base_high << 24);
272 #ifdef __x86_64__
273         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
274                 v |= ((ul) ((struct segment_descriptor_64 *)d)->base_higher) << 32;
275 #endif
276         print_func_exit();
277         return v;
278 }
279
280 static unsigned long read_tr_base(void)
281 {
282         print_func_entry();
283         uint16_t tr;
284 asm("str %0":"=g"(tr));
285         print_func_exit();
286         return segment_base(tr);
287 }
288
289 static void reload_tss(void)
290 {
291         print_func_entry();
292 #ifndef __x86_64__
293
294         /*
295          * VT restores TR but not its size.  Useless.
296          */
297         struct descriptor_table gdt;
298         struct segment_descriptor *descs;
299
300         get_gdt(&gdt);
301         descs = (void *)gdt.base;
302         descs[GDT_ENTRY_TSS].type = 9;  /* available TSS */
303         load_TR_desc();
304 #endif
305         print_func_exit();
306 }
307
308 static struct vmcs_descriptor {
309         int size;
310         int order;
311         uint32_t revision_id;
312 } vmcs_descriptor;
313
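/* Translate a guest frame number to its backing host page, or 0 if no memory
 * slot covers @gfn. */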
314 static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
315 {
316         print_func_entry();
317         struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
318         print_func_exit();
319         return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
320 }
321
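/* Copy up to @size bytes from guest-virtual @addr into @dest, translating one
 * page at a time; returns the number of bytes copied, which comes up short if
 * a translation fails. */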
322 int litevm_read_guest(struct litevm_vcpu *vcpu,
323                                           gva_t addr, unsigned long size, void *dest)
324 {
325         print_func_entry();
326         unsigned char *host_buf = dest;
327         unsigned long req_size = size;
328
329         while (size) {
330                 hpa_t paddr;
331                 unsigned now;
332                 unsigned offset;
333                 hva_t guest_buf;
334
335                 paddr = gva_to_hpa(vcpu, addr);
336
337                 if (is_error_hpa(paddr))
338                         break;
339                 guest_buf = (hva_t) KADDR(paddr);
340                 offset = addr & ~PAGE_MASK;
341                 guest_buf |= offset;
342                 now = MIN(size, PAGE_SIZE - offset);
343                 memcpy(host_buf, (void *)guest_buf, now);
344                 host_buf += now;
345                 addr += now;
346                 size -= now;
347         }
348         print_func_exit();
349         return req_size - size;
350 }
351
352 int litevm_write_guest(struct litevm_vcpu *vcpu,
353                                            gva_t addr, unsigned long size, void *data)
354 {
355         print_func_entry();
356         unsigned char *host_buf = data;
357         unsigned long req_size = size;
358
359         while (size) {
360                 hpa_t paddr;
361                 unsigned now;
362                 unsigned offset;
363                 hva_t guest_buf;
364
365                 paddr = gva_to_hpa(vcpu, addr);
366
367                 if (is_error_hpa(paddr))
368                         break;
369
370                 guest_buf = (hva_t) KADDR(paddr);
371                 offset = addr & ~PAGE_MASK;
372                 guest_buf |= offset;
373                 now = MIN(size, PAGE_SIZE - offset);
374                 memcpy((void *)guest_buf, host_buf, now);
375                 host_buf += now;
376                 addr += now;
377                 size -= now;
378         }
379         print_func_exit();
380         return req_size - size;
381 }
382
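/* Cache the VMCS size, allocation order, and revision id reported by the
 * IA32_VMX_BASIC capability MSR; later VMCS and VMXON allocations use these. */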
383 static void setup_vmcs_descriptor(void)
384 {
385         print_func_entry();
386         uint64_t msr;
387
388         msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
389         vmcs_descriptor.size = (msr >> 32) & 0x1fff;
390         vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size >> PAGE_SHIFT);
391         vmcs_descriptor.revision_id = (uint32_t) msr;
	printk("setup_vmcs_descriptor: msr 0x%llx, size 0x%x order 0x%x id 0x%x\n",
		   msr, vmcs_descriptor.size, vmcs_descriptor.order,
		   vmcs_descriptor.revision_id);
395         print_func_exit();
396 };
397
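/* VMCLEAR the given VMCS: flush its cached state to memory and mark it
 * inactive so it can be loaded on another core. */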
398 static void vmcs_clear(struct vmcs *vmcs)
399 {
400         print_func_entry();
401         uint64_t phys_addr = PADDR(vmcs);
402         uint8_t error;
403         printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
404         asm volatile ("vmclear %1; setna %0":"=m" (error):"m"(phys_addr):"cc",
405                                   "memory");
406         if (error)
407                 printk("litevm: vmclear fail: %p/%llx\n", vmcs, phys_addr);
408         print_func_exit();
409 }
410
411 static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
412 {
413         print_func_entry();
414         struct litevm_vcpu *vcpu = arg;
415         int cpu = core_id();
	printd("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n",
		   cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
419
420         if (vcpu->cpu == cpu)
421                 vmcs_clear(vcpu->vmcs);
422
423         if (currentcpu->vmcs == vcpu->vmcs)
424                 currentcpu->vmcs = NULL;
425         print_func_exit();
426 }
427
428 static int vcpu_slot(struct litevm_vcpu *vcpu)
429 {
430         print_func_entry();
431         print_func_exit();
432         return vcpu - vcpu->litevm->vcpus;
433 }
434
435 /*
436  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
437  * vcpu mutex is already taken.
438  */
439 static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
440 {
441         print_func_entry();
442         uint64_t phys_addr = PADDR(vcpu->vmcs);
443         int cpu;
444         cpu = core_id();
445
446         if (vcpu->cpu != cpu) {
447                 handler_wrapper_t *w;
448                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
449                 smp_call_wait(w);
450                 vcpu->launched = 0;
451         }
452         if (currentcpu->vmcs != vcpu->vmcs) {
453                 uint8_t error;
454
455                 currentcpu->vmcs = vcpu->vmcs;
456                 asm volatile ("vmptrld %1; setna %0":"=m" (error):"m"(phys_addr):"cc");
457                 if (error) {
458                         printk("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
459                         error("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
460                 }
461         }
462
463         if (vcpu->cpu != cpu) {
464                 struct descriptor_table dt;
465                 unsigned long sysenter_esp;
466
467                 vcpu->cpu = cpu;
468                 /*
469                  * Linux uses per-cpu TSS and GDT, so set these when switching
470                  * processors.
471                  */
472                 vmcs_writel(HOST_TR_BASE, read_tr_base());      /* 22.2.4 */
473                 get_gdt(&dt);
474                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
475
476                 sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
477                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);      /* 22.2.3 */
478         }
479         print_func_exit();
480         return vcpu;
481 }
482
483 /*
484  * Switches to specified vcpu, until a matching vcpu_put()
485  */
486 static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
487 {
488         print_func_entry();
489         struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
490
491         printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
492
493         qlock(&vcpu->mutex);
494         if (!vcpu->vmcs) {
495                 qunlock(&vcpu->mutex);
496                 error("vcpu->vmcs is NULL");
497         }
498         print_func_exit();
499         return __vcpu_load(vcpu);
500 }
501
502 static void vcpu_put(struct litevm_vcpu *vcpu)
503 {
504         print_func_entry();
505         //put_cpu();
506         qunlock(&vcpu->mutex);
507         print_func_exit();
508 }
509
510 static struct vmcs *alloc_vmcs_cpu(int cpu)
511 {
512         print_func_entry();
513         int node = node_id();
514         struct vmcs *vmcs;
515
516         vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
	if (!vmcs) {
518                 print_func_exit();
519                 return 0;
520         }
521         memset(vmcs, 0, vmcs_descriptor.size);
522         vmcs->revision_id = vmcs_descriptor.revision_id;        /* vmcs revision id */
523         print_func_exit();
524         return vmcs;
525 }
526
527 static struct vmcs *alloc_vmcs(void)
528 {
529         struct vmcs *ret;
530         print_func_entry();
531         ret = alloc_vmcs_cpu(core_id());
532         print_func_exit();
533         return ret;
534 }
535
536 static int cpu_has_litevm_support(void)
537 {
538         print_func_entry();
539         uint32_t ecx = cpuid_ecx(1);
540         print_func_exit();
	return ecx & (1 << 5);	/* CPUID.1:ECX.VMX[bit 5] -> VT */
542 }
543
544 static int vmx_disabled_by_bios(void)
545 {
546         print_func_entry();
547         uint64_t msr;
548
549         msr = read_msr(MSR_IA32_FEATURE_CONTROL);
550         print_func_exit();
551         return (msr & 5) == 1;  /* locked but not enabled */
552 }
553
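/* Per-core VMX bring-up, presumably run on each core via an smp call: allocate
 * and initialize this core's VMXON region, set the lock/enable bits in
 * IA32_FEATURE_CONTROL, turn on CR4.VMXE and CR0.NE, force the A20 gate open,
 * and finally execute VMXON. */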
554 static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
555 {
556         print_func_entry();
557         int cpu = hw_core_id();
558         uint64_t phys_addr;
559         uint64_t old;
560         uint64_t status = 0;
561         currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
562                                                                                           KMALLOC_WAIT);
563         if (!currentcpu->vmxarea)
564                 return;
565         memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
566         currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
567         phys_addr = PADDR(currentcpu->vmxarea);
568         printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
569                    currentcpu->vmxarea, (void *)phys_addr);
570         if (phys_addr & 0xfff) {
571                 printk("fix vmxarea alignment!");
572         }
573         printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
574         old = read_msr(MSR_IA32_FEATURE_CONTROL);
575         printk("%d: vm_enable, old is %d\n", core_id(), old);
576         if ((old & 5) == 0) {
577                 /* enable and lock */
578                 write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
579                 old = read_msr(MSR_IA32_FEATURE_CONTROL);
580                 printk("%d:vm_enable, tried to set 5, old is %d\n", core_id(), old);
581         }
582         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
583         lcr4(rcr4() | CR4_VMXE);        /* FIXME: not cpu hotplug safe */
584         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
585         printk("%d:cr0 is %x\n", core_id(), rcr0());
586         lcr0(rcr0() | 0x20);
587         printk("%d:cr0 is %x\n", core_id(), rcr0());
588         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
589         outb(0x92, inb(0x92) | 2);
590         printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
591         asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:":"=m" (status):"m"
592                                   (phys_addr):"memory", "cc");
593         printk("%d:vmxon status is %d\n", core_id(), status);
594         printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
595         if (!status) {
596                 printk("%d:vm_enable: status says fail\n", core_id());
597         }
598         print_func_exit();
599 }
600
601 static void litevm_disable(void *garbage)
602 {
603         print_func_entry();
604         asm volatile ("vmxoff":::"cc");
605         print_func_exit();
606 }
607
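/* Allocate and initialize a fresh litevm instance: zeroed state, a per-VM
 * lock, and per-vcpu qlocks with invalid MMU roots.  The caller owns the
 * returned pointer. */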
608 struct litevm *vmx_open(void)
609 {
610         print_func_entry();
611         struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
612         int i;
613
614         if (!litevm) {
615                 printk("NO LITEVM! MAKES NO SENSE!\n");
616                 error("litevm alloc failed");
617                 print_func_exit();
618                 return 0;
619         }
620
621         spinlock_init_irqsave(&litevm->lock);
622         LIST_INIT(&litevm->link);
623         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
624                 struct litevm_vcpu *vcpu = &litevm->vcpus[i];
625
626                 qlock_init(&vcpu->mutex);
627                 vcpu->mmu.root_hpa = INVALID_PAGE;
628                 LIST_INIT(&vcpu->link);
629         }
630         printk("vmx_open: busy %d\n", litevm->busy);
631         printk("return %p\n", litevm);
632         print_func_exit();
633         return litevm;
634 }
635
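/* Rough intended call sequence (a sketch, not a committed API): a caller gets
 * a VM with vmx_open(), registers guest RAM with vm_set_memory_region(),
 * creates vcpus with vmx_create_vcpu(), and tears everything down with
 * litevm_dev_release(). */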
636 /*
637  * Free any memory in @free but not in @dont.
638  */
639 static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
640                                                                          struct litevm_memory_slot *dont)
641 {
642         print_func_entry();
643         int i;
644
645         if (!dont || free->phys_mem != dont->phys_mem)
646                 if (free->phys_mem) {
647                         for (i = 0; i < free->npages; ++i) {
648                                 page_t *page = free->phys_mem[i];
649                                 page_decref(page);
650                                 assert(page_is_free(page2ppn(page)));
651                         }
652                         kfree(free->phys_mem);
653                 }
654
655         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
656                 kfree(free->dirty_bitmap);
657
658         free->phys_mem = 0;
659         free->npages = 0;
660         free->dirty_bitmap = 0;
661         print_func_exit();
662 }
663
664 static void litevm_free_physmem(struct litevm *litevm)
665 {
666         print_func_entry();
667         int i;
668
669         for (i = 0; i < litevm->nmemslots; ++i)
670                 litevm_free_physmem_slot(&litevm->memslots[i], 0);
671         print_func_exit();
672 }
673
674 static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
675 {
676         print_func_entry();
677         if (vcpu->vmcs) {
678                 handler_wrapper_t *w;
679                 smp_call_function_all(__vcpu_clear, vcpu, &w);
680                 smp_call_wait(w);
681                 //free_vmcs(vcpu->vmcs);
682                 vcpu->vmcs = 0;
683         }
684         print_func_exit();
685 }
686
687 static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
688 {
689         print_func_entry();
690         litevm_free_vmcs(vcpu);
691         litevm_mmu_destroy(vcpu);
692         print_func_exit();
693 }
694
695 static void litevm_free_vcpus(struct litevm *litevm)
696 {
697         print_func_entry();
698         unsigned int i;
699
700         for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
701                 litevm_free_vcpu(&litevm->vcpus[i]);
702         print_func_exit();
703 }
704
705 static int litevm_dev_release(struct litevm *litevm)
706 {
707         print_func_entry();
708
709         litevm_free_vcpus(litevm);
710         litevm_free_physmem(litevm);
711         kfree(litevm);
712         print_func_exit();
713         return 0;
714 }
715
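/* Raw VMREAD/VMWRITE accessors for the currently loaded VMCS; the sized write
 * helpers below are thin wrappers around vmcs_writel(). */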
716 unsigned long vmcs_readl(unsigned long field)
717 {
718         print_func_entry();
719         unsigned long value;
720
721         asm volatile ("vmread %1, %0":"=g" (value):"r"(field):"cc");
722         print_func_exit();
723         return value;
724 }
725
726 void vmcs_writel(unsigned long field, unsigned long value)
727 {
728         print_func_entry();
729         uint8_t error;
730
731         asm volatile ("vmwrite %1, %2; setna %0":"=g" (error):"r"(value),
732                                   "r"(field):"cc");
733         if (error)
734                 printk("vmwrite error: reg %lx value %lx (err %d)\n",
735                            field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
736         print_func_exit();
737 }
738
739 static void vmcs_write16(unsigned long field, uint16_t value)
740 {
741         print_func_entry();
742         vmcs_writel(field, value);
743         print_func_exit();
744 }
745
746 static void vmcs_write64(unsigned long field, uint64_t value)
747 {
748         print_func_entry();
749 #ifdef __x86_64__
750         vmcs_writel(field, value);
751 #else
752         vmcs_writel(field, value);
753         asm volatile ("");
754         vmcs_writel(field + 1, value >> 32);
755 #endif
756         print_func_exit();
757 }
758
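/* Queue a #GP(0) for the guest by filling in the VM-entry interruption
 * information fields; it is delivered on the next VM entry. */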
759 static void inject_gp(struct litevm_vcpu *vcpu)
760 {
761         print_func_entry();
762         printd("inject_general_protection: rip 0x%lx\n", vmcs_readl(GUEST_RIP));
763         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
764         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
765                                  GP_VECTOR |
766                                  INTR_TYPE_EXCEPTION |
767                                  INTR_INFO_DELIEVER_CODE_MASK | INTR_INFO_VALID_MASK);
768         print_func_exit();
769 }
770
771 static void update_exception_bitmap(struct litevm_vcpu *vcpu)
772 {
773         print_func_entry();
774         if (vcpu->rmode.active)
775                 vmcs_write32(EXCEPTION_BITMAP, ~0);
776         else
777                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
778         print_func_exit();
779 }
780
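/* Leave emulated real mode: restore the TR state saved by enter_rmode(), clear
 * the VM and IOPL bits in RFLAGS, reload flat data-segment attributes, and
 * shrink the exception bitmap back to page faults only. */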
781 static void enter_pmode(struct litevm_vcpu *vcpu)
782 {
783         print_func_entry();
784         unsigned long flags;
785
786         vcpu->rmode.active = 0;
787
788         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
789         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
790         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
791
792         flags = vmcs_readl(GUEST_RFLAGS);
793         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
794         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
795         vmcs_writel(GUEST_RFLAGS, flags);
796
797         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
798                                 (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK));
799
800         update_exception_bitmap(vcpu);
801
802 #define FIX_PMODE_DATASEG(seg, save) {                          \
803                         vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
804                         vmcs_writel(GUEST_##seg##_BASE, 0);             \
805                         vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
806                         vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
807         }
808
809         FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
810         FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
811         FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
812         FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
813         FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
814
815         vmcs_write16(GUEST_CS_SELECTOR,
816                                  vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
817         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
818         print_func_exit();
819 }
820
821 static int rmode_tss_base(struct litevm *litevm)
822 {
823         print_func_entry();
824         gfn_t base_gfn =
825                 litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
826         print_func_exit();
827         return base_gfn << PAGE_SHIFT;
828 }
829
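/* Enter emulated real mode: this VT setup has no unrestricted guest, so real
 * mode runs as vm86 with a scratch TSS parked in the top pages of memory
 * slot 0 (see rmode_tss_base() above), IOPL raised, and every exception
 * intercepted. */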
830 static void enter_rmode(struct litevm_vcpu *vcpu)
831 {
832         print_func_entry();
833         unsigned long flags;
834
835         vcpu->rmode.active = 1;
836
837         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
838         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
839
840         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
841         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
842
843         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
844         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
845
846         flags = vmcs_readl(GUEST_RFLAGS);
847         vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
848
849         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
850
851         vmcs_writel(GUEST_RFLAGS, flags);
852         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
853         update_exception_bitmap(vcpu);
854
855 #define FIX_RMODE_SEG(seg, save) {                                 \
856                 vmcs_write16(GUEST_##seg##_SELECTOR,                       \
857                                         vmcs_readl(GUEST_##seg##_BASE) >> 4); \
858                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);                 \
859                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);                \
860         }
861
862         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
863         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
864
865         FIX_RMODE_SEG(ES, vcpu->rmode.es);
866         FIX_RMODE_SEG(DS, vcpu->rmode.ds);
867         FIX_RMODE_SEG(SS, vcpu->rmode.ss);
868         FIX_RMODE_SEG(GS, vcpu->rmode.gs);
869         FIX_RMODE_SEG(FS, vcpu->rmode.fs);
870         print_func_exit();
871 }
872
873 static int init_rmode_tss(struct litevm *litevm)
874 {
875         print_func_entry();
876         struct page *p1, *p2, *p3;
877         gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
878         char *page;
879
880         p1 = _gfn_to_page(litevm, fn++);
881         p2 = _gfn_to_page(litevm, fn++);
882         p3 = _gfn_to_page(litevm, fn);
883
884         if (!p1 || !p2 || !p3) {
885                 printk("%s: gfn_to_page failed\n", __FUNCTION__);
886                 print_func_exit();
887                 return 0;
888         }
889
890         page = page2kva(p1);
891         memset(page, 0, PAGE_SIZE);
892         *(uint16_t *) (page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
893
894         page = page2kva(p2);
895         memset(page, 0, PAGE_SIZE);
896
897         page = page2kva(p3);
898         memset(page, 0, PAGE_SIZE);
899         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
900
901         print_func_exit();
902         return 1;
903 }
904
905 #ifdef __x86_64__
906
907 static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
908 {
909         print_func_entry();
910         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
911
912         vcpu->shadow_efer = efer;
913         if (efer & EFER_LMA) {
914                 vmcs_write32(VM_ENTRY_CONTROLS,
915                                          vmcs_read32(VM_ENTRY_CONTROLS) |
916                                          VM_ENTRY_CONTROLS_IA32E_MASK);
917                 msr->data = efer;
918
919         } else {
920                 vmcs_write32(VM_ENTRY_CONTROLS,
921                                          vmcs_read32(VM_ENTRY_CONTROLS) &
922                                          ~VM_ENTRY_CONTROLS_IA32E_MASK);
923
924                 msr->data = efer & ~EFER_LME;
925         }
926         print_func_exit();
927 }
928
929 static void enter_lmode(struct litevm_vcpu *vcpu)
930 {
931         print_func_entry();
932         uint32_t guest_tr_ar;
933
934         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
935         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
936                 printd("%s: tss fixup for long mode. \n", __FUNCTION__);
937                 vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
938                                          | AR_TYPE_BUSY_64_TSS);
939         }
940
941         vcpu->shadow_efer |= EFER_LMA;
942
943         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
944         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
945                                  | VM_ENTRY_CONTROLS_IA32E_MASK);
946         print_func_exit();
947 }
948
949 static void exit_lmode(struct litevm_vcpu *vcpu)
950 {
951         print_func_entry();
952         vcpu->shadow_efer &= ~EFER_LMA;
953
954         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
955                                  & ~VM_ENTRY_CONTROLS_IA32E_MASK);
956         print_func_exit();
957 }
958
959 #endif
960
961 static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
962 {
963         print_func_entry();
964         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
965                 enter_pmode(vcpu);
966
967         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
968                 enter_rmode(vcpu);
969
970 #ifdef __x86_64__
971         if (vcpu->shadow_efer & EFER_LME) {
972                 if (!is_paging() && (cr0 & CR0_PG_MASK))
973                         enter_lmode(vcpu);
974                 if (is_paging() && !(cr0 & CR0_PG_MASK))
975                         exit_lmode(vcpu);
976         }
977 #endif
978
979         vmcs_writel(CR0_READ_SHADOW, cr0);
980         vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
981         print_func_exit();
982 }
983
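/* For PAE guests: load the four PDPTEs that @cr3 points at and check them for
 * reserved-bit violations, mimicking the check the CPU performs when CR3 is
 * loaded in PAE mode. */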
984 static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
985                                                                                  unsigned long cr3)
986 {
987         print_func_entry();
988         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
989         unsigned offset = (cr3 & (PAGE_SIZE - 1)) >> 5;
990         int i;
991         uint64_t pdpte;
992         uint64_t *pdpt;
993         struct litevm_memory_slot *memslot;
994
995         spin_lock_irqsave(&vcpu->litevm->lock);
996         memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
997         /* FIXME: !memslot - emulate? 0xff? */
998         pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
999
1000         for (i = 0; i < 4; ++i) {
1001                 pdpte = pdpt[offset + i];
1002                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
1003                         break;
1004         }
1005
1006         spin_unlock(&vcpu->litevm->lock);
1007
1008         print_func_exit();
1009         return i != 4;
1010 }
1011
1012 static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
1013 {
1014         print_func_entry();
1015         if (cr0 & CR0_RESEVED_BITS) {
1016                 printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, guest_cr0());
1017                 inject_gp(vcpu);
1018                 print_func_exit();
1019                 return;
1020         }
1021
1022         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
1023                 printd("set_cr0: #GP, CD == 0 && NW == 1\n");
1024                 inject_gp(vcpu);
1025                 print_func_exit();
1026                 return;
1027         }
1028
1029         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
1030                 printd("set_cr0: #GP, set PG flag " "and a clear PE flag\n");
1031                 inject_gp(vcpu);
1032                 print_func_exit();
1033                 return;
1034         }
1035
1036         if (!is_paging() && (cr0 & CR0_PG_MASK)) {
1037 #ifdef __x86_64__
1038                 if ((vcpu->shadow_efer & EFER_LME)) {
1039                         uint32_t guest_cs_ar;
1040                         if (!is_pae()) {
1041                                 printd("set_cr0: #GP, start paging "
1042                                            "in long mode while PAE is disabled\n");
1043                                 inject_gp(vcpu);
1044                                 print_func_exit();
1045                                 return;
1046                         }
1047                         guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1048                         if (guest_cs_ar & SEGMENT_AR_L_MASK) {
1049                                 printd("set_cr0: #GP, start paging "
1050                                            "in long mode while CS.L == 1\n");
1051                                 inject_gp(vcpu);
1052                                 print_func_exit();
1053                                 return;
1054
1055                         }
1056                 } else
1057 #endif
1058                 if (is_pae() && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1059                         printd("set_cr0: #GP, pdptrs " "reserved bits\n");
1060                         inject_gp(vcpu);
1061                         print_func_exit();
1062                         return;
1063                 }
1064
1065         }
1066
1067         __set_cr0(vcpu, cr0);
1068         litevm_mmu_reset_context(vcpu);
1069         print_func_exit();
1070         return;
1071 }
1072
1073 static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
1074 {
1075         print_func_entry();
1076         unsigned long cr0 = guest_cr0();
1077
1078         if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
1079                 enter_pmode(vcpu);
1080                 vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
1081
1082         } else
1083                 printd("lmsw: unexpected\n");
1084
1085         vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
1086                                 | (msw & LMSW_GUEST_MASK));
1087         print_func_exit();
1088 }
1089
1090 static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1091 {
1092         print_func_entry();
1093         vmcs_writel(CR4_READ_SHADOW, cr4);
1094         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1095                                                                   LITEVM_RMODE_VM_CR4_ALWAYS_ON :
1096                                                                   LITEVM_PMODE_VM_CR4_ALWAYS_ON));
1097         print_func_exit();
1098 }
1099
1100 static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
1101 {
1102         print_func_entry();
1103         if (cr4 & CR4_RESEVED_BITS) {
1104                 printd("set_cr4: #GP, reserved bits\n");
1105                 inject_gp(vcpu);
1106                 print_func_exit();
1107                 return;
1108         }
1109
1110         if (is_long_mode()) {
1111                 if (!(cr4 & CR4_PAE_MASK)) {
1112                         printd("set_cr4: #GP, clearing PAE while " "in long mode\n");
1113                         inject_gp(vcpu);
1114                         print_func_exit();
1115                         return;
1116                 }
1117         } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
1118                            && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
1119                 printd("set_cr4: #GP, pdptrs reserved bits\n");
1120                 inject_gp(vcpu);
1121         }
1122
1123         if (cr4 & CR4_VMXE_MASK) {
1124                 printd("set_cr4: #GP, setting VMXE\n");
1125                 inject_gp(vcpu);
1126                 print_func_exit();
1127                 return;
1128         }
1129         __set_cr4(vcpu, cr4);
1130         spin_lock_irqsave(&vcpu->litevm->lock);
1131         litevm_mmu_reset_context(vcpu);
1132         spin_unlock(&vcpu->litevm->lock);
1133         print_func_exit();
1134 }
1135
1136 static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
1137 {
1138         print_func_entry();
1139         if (is_long_mode()) {
1140                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
1141                         printd("set_cr3: #GP, reserved bits\n");
1142                         inject_gp(vcpu);
1143                         print_func_exit();
1144                         return;
1145                 }
1146         } else {
1147                 if (cr3 & CR3_RESEVED_BITS) {
1148                         printd("set_cr3: #GP, reserved bits\n");
1149                         inject_gp(vcpu);
1150                         print_func_exit();
1151                         return;
1152                 }
1153                 if (is_paging() && is_pae() && pdptrs_have_reserved_bits_set(vcpu, cr3)) {
1154                         printd("set_cr3: #GP, pdptrs " "reserved bits\n");
1155                         inject_gp(vcpu);
1156                         print_func_exit();
1157                         return;
1158                 }
1159         }
1160
1161         vcpu->cr3 = cr3;
1162         spin_lock_irqsave(&vcpu->litevm->lock);
1163         vcpu->mmu.new_cr3(vcpu);
1164         spin_unlock(&vcpu->litevm->lock);
1165         print_func_exit();
1166 }
1167
1168 static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
1169 {
1170         print_func_entry();
1171         if (cr8 & CR8_RESEVED_BITS) {
1172                 printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
1173                 inject_gp(vcpu);
1174                 print_func_exit();
1175                 return;
1176         }
1177         vcpu->cr8 = cr8;
1178         print_func_exit();
1179 }
1180
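/* Value placed in RDX at vcpu reset.  Real hardware reports the processor
 * signature (family/model/stepping) here; this stub simply produces 1. */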
1181 static uint32_t get_rdx_init_val(void)
1182 {
1183         print_func_entry();
1184         uint32_t val;
1185
1186 asm("movl $1, %%eax \n\t" "movl %%eax, %0 \n\t":"=g"(val));
1187         print_func_exit();
1188         return val;
1189
1190 }
1191
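/* Give the guest a clean FPU image: save the host FPU state, run FINIT and
 * capture the freshly initialized state as the guest image, then restore the
 * host state and set the guest's default MXCSR. */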
1192 static void fx_init(struct litevm_vcpu *vcpu)
1193 {
1194         print_func_entry();
1195         struct __attribute__ ((__packed__)) fx_image_s {
1196                 uint16_t control;               //fcw
1197                 uint16_t status;                //fsw
1198                 uint16_t tag;                   // ftw
1199                 uint16_t opcode;                //fop
1200                 uint64_t ip;                    // fpu ip
1201                 uint64_t operand;               // fpu dp
1202                 uint32_t mxcsr;
1203                 uint32_t mxcsr_mask;
1204
1205         } *fx_image;
1206
1207         fx_save(vcpu->host_fx_image);
1208         fpu_init();
1209         fx_save(vcpu->guest_fx_image);
1210         fx_restore(vcpu->host_fx_image);
1211
1212         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
1213         fx_image->mxcsr = 0x1f80;
1214         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
1215                    0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
1216         print_func_exit();
1217 }
1218
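/* Write a VMCS control field while respecting the fixed bits advertised by the
 * corresponding VMX capability MSR: the MSR's low 32 bits are ORed in
 * (must-be-one bits) and its high 32 bits mask the result (may-be-one bits). */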
1219 static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field,
1220                                                                    uint32_t val)
1221 {
1222         print_func_entry();
1223         uint32_t msr_high, msr_low;
1224         uint64_t msrval;
1225
1226         msrval = read_msr(msr);
1227         msr_low = msrval;
1228         msr_high = (msrval >> 32);
1229
1230         val &= msr_high;
1231         val |= msr_low;
1232         vmcs_write32(vmcs_field, val);
1233         print_func_exit();
1234 }
1235
1236 /*
1237  * Sets up the vmcs for emulated real mode.
1238  */
1239 static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
1240 {
1241         print_func_entry();
1242 /* no op on x86_64 */
1243 #define asmlinkage
1244         extern asmlinkage void litevm_vmx_return(void);
1245         uint32_t host_sysenter_cs;
1246         uint32_t junk;
1247         uint64_t a;
1248         struct descriptor_table dt;
1249         int i;
1250         int ret;
1251         uint64_t tsc;
1252         int nr_good_msrs;
1253
1254         if (!init_rmode_tss(vcpu->litevm)) {
1255                 error("vcpu_setup: init_rmode_tss failed");
1256         }
1257
1258         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1259         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1260         vcpu->cr8 = 0;
1261         vcpu->apic_base = 0xfee00000 |
1262                 /*for vcpu 0 */ MSR_IA32_APICBASE_BSP |
1263                 MSR_IA32_APICBASE_ENABLE;
1264
1265         fx_init(vcpu);
1266
1267 #define SEG_SETUP(seg) do {                                     \
1268                 vmcs_write16(GUEST_##seg##_SELECTOR, 0);        \
1269                 vmcs_writel(GUEST_##seg##_BASE, 0);             \
1270                 vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);      \
1271                 vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);     \
1272         } while (0)
1273
1274         /*
1275          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1276          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1277          */
1278         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1279         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1280         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1281         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1282
1283         SEG_SETUP(DS);
1284         SEG_SETUP(ES);
1285         SEG_SETUP(FS);
1286         SEG_SETUP(GS);
1287         SEG_SETUP(SS);
1288
1289         vmcs_write16(GUEST_TR_SELECTOR, 0);
1290         vmcs_writel(GUEST_TR_BASE, 0);
1291         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1292         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1293
1294         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1295         vmcs_writel(GUEST_LDTR_BASE, 0);
1296         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1297         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1298
1299         vmcs_write32(GUEST_SYSENTER_CS, 0);
1300         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1301         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1302
1303         vmcs_writel(GUEST_RFLAGS, 0x02);
1304         vmcs_writel(GUEST_RIP, 0xfff0);
1305         vmcs_writel(GUEST_RSP, 0);
1306
1307         vmcs_writel(GUEST_CR3, 0);
1308
1309         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1310         vmcs_writel(GUEST_DR7, 0x400);
1311
1312         vmcs_writel(GUEST_GDTR_BASE, 0);
1313         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1314
1315         vmcs_writel(GUEST_IDTR_BASE, 0);
1316         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1317
1318         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1319         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1320         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1321
1322         /* I/O */
1323         vmcs_write64(IO_BITMAP_A, 0);
1324         vmcs_write64(IO_BITMAP_B, 0);
1325
1326         tsc = read_tsc();
1327         vmcs_write64(TSC_OFFSET, -tsc);
1328
1329         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1330
1331         /* Special registers */
1332         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1333
1334         /* Control */
	vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
			       PIN_BASED_VM_EXEC_CONTROL,
			       PIN_BASED_EXT_INTR_MASK		/* 20.6.1 */
			       | PIN_BASED_NMI_EXITING);	/* 20.6.1 */
	vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
			       CPU_BASED_VM_EXEC_CONTROL,
			       CPU_BASED_HLT_EXITING		/* 20.6.2 */
			       | CPU_BASED_CR8_LOAD_EXITING	/* 20.6.2 */
			       | CPU_BASED_CR8_STORE_EXITING	/* 20.6.2 */
			       | CPU_BASED_UNCOND_IO_EXITING	/* 20.6.2 */
			       | CPU_BASED_INVDPG_EXITING
			       | CPU_BASED_MOV_DR_EXITING
			       | CPU_BASED_USE_TSC_OFFSETING);	/* 21.3 */
1344
1345         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1346         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1347         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1348         vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1349
1350         vmcs_writel(HOST_CR0, rcr0());  /* 22.2.3 */
1351         vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
1352         vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3  FIXME: shadow tables */
1353
1354 #warning "not setting selectors; do we need them?"
1355 #if 0
1356         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);    /* 22.2.4 */
1357         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1358         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1359 #endif
1360         vmcs_write16(HOST_FS_SELECTOR, read_fs());      /* 22.2.4 */
1361         vmcs_write16(HOST_GS_SELECTOR, read_gs());      /* 22.2.4 */
1362 #if 0
1363         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);    /* 22.2.4 */
1364 #endif
1365 #ifdef __x86_64__
1366         a = read_msr(MSR_FS_BASE);
1367         vmcs_writel(HOST_FS_BASE, a);   /* 22.2.4 */
1368         a = read_msr(MSR_GS_BASE);
1369         vmcs_writel(HOST_GS_BASE, a);   /* 22.2.4 */
1370 #else
1371         vmcs_writel(HOST_FS_BASE, 0);   /* 22.2.4 */
1372         vmcs_writel(HOST_GS_BASE, 0);   /* 22.2.4 */
1373 #endif
1374
1375 #warning "Not setting HOST_TR_SELECTOR"
1376 #if 0
1377         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS * 8);      /* 22.2.4 */
1378 #endif
1379
1380         get_idt(&dt);
1381         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1382
1383         vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return);        /* 22.2.5 */
1384
1385         /* it's the HIGH 32 bits! */
1386         host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
1387         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1388         a = read_msr(MSR_IA32_SYSENTER_ESP);
1389         vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1390         a = read_msr(MSR_IA32_SYSENTER_EIP);
1391         vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1392
1393         ret = -ENOMEM;
1394         vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1395         if (!vcpu->guest_msrs)
1396                 error("guest_msrs kmalloc failed");
1397         vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
1398         if (!vcpu->host_msrs)
1399                 error("vcpu->host_msrs kmalloc failed -- storage leaked");
1400
1401         for (i = 0; i < NR_VMX_MSR; ++i) {
1402                 uint32_t index = vmx_msr_index[i];
1403                 uint32_t data_low, data_high;
1404                 uint64_t data;
1405                 int j = vcpu->nmsrs;
1406
1407 #warning "need readmsr_safe"
1408 //      if (rdmsr_safe(index, &data_low, &data_high) < 0)
1409 //          continue;
1410                 data = read_msr(index);
1411                 vcpu->host_msrs[j].index = index;
1412                 vcpu->host_msrs[j].reserved = 0;
1413                 vcpu->host_msrs[j].data = data;
1414                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1415                 ++vcpu->nmsrs;
1416         }
1417         printk("msrs: %d\n", vcpu->nmsrs);
1418
1419         nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1420         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1421         vmcs_writel(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
1422         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->host_msrs + NR_BAD_MSRS));
	vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
			       (HOST_IS_64 << 9));	/* 22.2.1, 20.7.1 */
1424         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs);    /* 22.2.2 */
1425         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);     /* 22.2.2 */
1426         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs);    /* 22.2.2 */
1427
1428         /* 22.2.1, 20.8.1 */
1429         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CONTROLS, 0);
1430         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
1431
1432         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1433         vmcs_writel(TPR_THRESHOLD, 0);
1434
1435         vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
1436         vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
1437
1438         __set_cr0(vcpu, 0x60000010);    // enter rmode
1439         __set_cr4(vcpu, 0);
1440 #ifdef __x86_64__
1441         __set_efer(vcpu, 0);
1442 #endif
1443
1444         ret = litevm_mmu_init(vcpu);
1445
1446         print_func_exit();
1447         return ret;
1448
1449 out_free_guest_msrs:
1450         kfree(vcpu->guest_msrs);
1451 out:
1452         return ret;
1453 }
1454
1455 /*
1456  * Sync the rsp and rip registers into the vcpu structure.  This allows
1457  * registers to be accessed by indexing vcpu->regs.
1458  */
1459 static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
1460 {
1461         print_func_entry();
1462         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1463         vcpu->rip = vmcs_readl(GUEST_RIP);
1464         print_func_exit();
1465 }
1466
1467 /*
1468  * Syncs rsp and rip back into the vmcs.  Should be called after possible
1469  * modification.
1470  */
1471 static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
1472 {
1473         print_func_entry();
1474         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
1475         vmcs_writel(GUEST_RIP, vcpu->rip);
1476         print_func_exit();
1477 }
1478
1479 /*
1480  * Creates some virtual cpus.  Good luck creating more than one.
1481  */
1482 int vmx_create_vcpu(struct litevm *litevm, int n)
1483 {
1484         print_func_entry();
1485         ERRSTACK(1);
1486         int r;
1487         struct litevm_vcpu *vcpu;
1488         struct vmcs *vmcs;
1489         char *errstring = NULL;
1490
1491         if (n < 0 || n >= LITEVM_MAX_VCPUS) {
1492                 printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1493                            LITEVM_MAX_VCPUS);
1494                 error("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
1495                           LITEVM_MAX_VCPUS);
1496         }
1497
1498         vcpu = &litevm->vcpus[n];
1499
1500         qlock(&vcpu->mutex);
1501
1502         if (vcpu->vmcs) {
1503                 qunlock(&vcpu->mutex);
1504                 printk("VM already exists\n");
1505                 error("VM already exists");
1506         }
1507
1508         /* I'm a bad person */
1509         //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
1510         uint64_t a = (uint64_t) vcpu->fx_buf;
1511         a += FX_IMAGE_ALIGN - 1;
1512         a /= FX_IMAGE_ALIGN;
1513         a *= FX_IMAGE_ALIGN;
1514
1515         vcpu->host_fx_image = (char *)a;
1516         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
1517
1518         vcpu->cpu = -1; /* First load will set up TR */
1519         vcpu->litevm = litevm;
1520
1521         vmcs = alloc_vmcs();
1522         if (!vmcs) {
1523                 errstring = "vmcs allocate failed";
1524                 printk("%s\n", errstring);
1525                 qunlock(&vcpu->mutex);
1526                 goto out_free_vcpus;
1527         }
1528         vmcs_clear(vmcs);
1529         printk("after vmcs_clear\n");
1530         vcpu->vmcs = vmcs;
1531         vcpu->launched = 0;
1532         printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
1533         error("before vcpu_load");
1534         __vcpu_load(vcpu);
1535
1536         printk("PAST vcpu_load\n");
1537 #warning unmatched waserror!
1538         if (waserror()) {
1539                 /* we really need to fix waserror() */
1540                 poperror();
1541                 goto out_free_vcpus;
1542         }
1543
1544         r = litevm_vcpu_setup(vcpu);
1545
1546         vcpu_put(vcpu);
1547
1548         printk("r is %d\n", r);
1549
1550         if (!r) {
1551
1552                 print_func_exit();
1553                 return 0;
1554         }
1555
	errstring = "vcpu setup failed";
1557
1558 out_free_vcpus:
1559         printk("out_free_vcpus: life sucks\n");
1560         litevm_free_vcpu(vcpu);
1561         error(errstring);
1562 out:
1563         print_func_exit();
1564         return r;
1565 }
1566
1567 /*
1568  * Allocate some memory and give it an address in the guest physical address
1569  * space.
1570  *
1571  * Discontiguous memory is allowed, mostly for framebuffers.
1572  */
1573 int vm_set_memory_region(struct litevm *litevm,
1574                                                  struct litevm_memory_region *mem)
1575 {
1576         print_func_entry();
1577         ERRSTACK(2);
1578         int r;
1579         gfn_t base_gfn;
1580         unsigned long npages;
1581         unsigned long i;
1582         struct litevm_memory_slot *memslot;
1583         struct litevm_memory_slot old, new;
1584         int memory_config_version;
1585         void *init_data = mem->init_data;
1586         int pass = 1;
1587
1588         printk("litevm %p\n", litevm);
1589         /* should not happen but ... */
1590         if (!litevm)
1591                 error("NULL litevm in %s", __func__);
1592
1593         if (!mem)
1594                 error("NULL mem in %s", __func__);
1595
1596         if (litevm->busy)
1597                 error("litevm->busy is set! 0x%x\n", litevm->busy);
1598         r = -EINVAL;
1599         /* General sanity checks */
1600         if (mem->memory_size & (PAGE_SIZE - 1))
1601                 error("mem->memory_size %lld is not page-aligned", mem->memory_size);
1602         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1603                 error("guest_phys_addr 0x%llx is not page-aligned",
1604                           mem->guest_phys_addr);
1605         if (mem->slot >= LITEVM_MEMORY_SLOTS)
1606                 error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
1607         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1608                 error("0x%x + 0x%x is < 0x%x",
1609                           mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
1610
1611         memslot = &litevm->memslots[mem->slot];
1612         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1613         npages = mem->memory_size >> PAGE_SHIFT;
1614
1615         if (!npages)
1616                 mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
1617
1618         /* This is actually a very tricky loop.  The use of error() is a
1619          * bit dangerous here, so we don't use it much.  Consider a rewrite.
1620          * It would be nice if akaros could do the allocation of a bunch of
1621          * pages for us.
1622          */
1623 raced:
1624         printk("raced: pass %d\n", pass);
1625         spin_lock_irqsave(&litevm->lock);
1626         printk("locked\n");
1627
1628         if (waserror()) {
1629                 spin_unlock(&litevm->lock);
1630                 nexterror();
1631         }
1632
1633         memory_config_version = litevm->memory_config_version;
1634         new = old = *memslot;
1635
1636         new.base_gfn = base_gfn;
1637         new.npages = npages;
1638         new.flags = mem->flags;
1639
1640         /* Disallow changing a memory slot's size. */
1641         r = -EINVAL;
1642         if (npages && old.npages && npages != old.npages)
1643                 error("npages is %d, old.npages is %d, can't change",
1644                           npages, old.npages);
1645
1646         /* Check for overlaps */
1647         r = -EEXIST;
1648         for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
1649                 struct litevm_memory_slot *s = &litevm->memslots[i];
1650
1651                 if (s == memslot)
1652                         continue;
1653                 if (!((base_gfn + npages <= s->base_gfn) ||
1654                           (base_gfn >= s->base_gfn + s->npages)))
1655                         error("Overlap");
1656         }
1657         /*
1658          * Do memory allocations outside lock.  memory_config_version will
1659          * detect any races.
1660          */
1661         spin_unlock(&litevm->lock);
1662         printk("unlocked\n");
1663         poperror();
1664
1665         /* Deallocate if slot is being removed */
1666         if (!npages)
1667                 new.phys_mem = 0;
1668
1669         /* Free page dirty bitmap if unneeded */
1670         if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
1671                 new.dirty_bitmap = 0;
1672
1673         r = -ENOMEM;
1674
1675         /* Allocate if a slot is being created */
1676         if (npages && !new.phys_mem) {
1677                 new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
1678
1679                 if (!new.phys_mem)
1680                         goto out_free;
1681
1682                 for (i = 0; i < npages; ++i) {
1683                         int ret;
1684                         ret = kpage_alloc(&new.phys_mem[i]);
1685                         if (ret != ESUCCESS)
1686                                 goto out_free;
1687                         if (init_data) {
1688                                 printk("init data memcpy(%p,%p,4096);\n",
1689                                            page2kva(new.phys_mem[i]), init_data);
1690                                 memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
1691                                 init_data += PAGE_SIZE;
1692                         }
1693                 }
1694         }
1695
1696         /* Allocate page dirty bitmap if needed */
1697         if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1698                 unsigned dirty_bytes;   //ALIGN(npages, BITS_PER_LONG) / 8;
1699                 dirty_bytes =
1700                         (((npages + BITS_PER_LONG -
1701                            1) / BITS_PER_LONG) * BITS_PER_LONG) / 8;
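		/* e.g. with BITS_PER_LONG == 64, npages == 100 rounds up to
		 * 128 bits of bitmap, so dirty_bytes == 16 */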
1702
1703                 new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
1704                 if (!new.dirty_bitmap) {
1705                         printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
1706                         goto out_free;
1707                 }
1708         }
1709
1710         spin_lock_irqsave(&litevm->lock);
1711         printk("locked\n");
1712         if (memory_config_version != litevm->memory_config_version) {
1713                 spin_unlock(&litevm->lock);
1714                 printk("unlocked, try again\n");
1715                 litevm_free_physmem_slot(&new, &old);
1716                 goto raced;
1717         }
1718
1719         r = -EAGAIN;
1720         if (litevm->busy) {
1721                 printk("BUSY!\n");
1722                 goto out_unlock;
1723         }
1724
1725         if (mem->slot >= litevm->nmemslots)
1726                 litevm->nmemslots = mem->slot + 1;
1727
1728         *memslot = new;
1729         ++litevm->memory_config_version;
1730
1731         spin_unlock(&litevm->lock);
1732         printk("unlocked\n");
1733         for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1734                 struct litevm_vcpu *vcpu;
1735
1736                 vcpu = vcpu_load(litevm, i);
1737                 if (!vcpu)
1738                         continue;
1739                 litevm_mmu_reset_context(vcpu);
1740                 vcpu_put(vcpu);
1741         }
1742
1743         litevm_free_physmem_slot(&old, &new);
1744         print_func_exit();
1745         return 0;
1746
1747 out_unlock:
1748         spin_unlock(&litevm->lock);
1749         printk("out_unlock\n");
1750 out_free:
1751         printk("out_free\n");
1752         litevm_free_physmem_slot(&new, &old);
1753 out:
1754         printk("vm_set_memory_region: return %d\n", r);
1755         print_func_exit();
1756         return r;
1757 }
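
/*
 * Illustrative only: a minimal sketch of how a caller might use
 * vm_set_memory_region() above to back the first 16 MiB of guest-physical
 * memory.  The slot number and size are assumptions for the example, not
 * values taken from anywhere else in this file.
 */
#if 0
static void example_set_low_memory(struct litevm *litevm)
{
	struct litevm_memory_region mem;

	memset(&mem, 0, sizeof(mem));
	mem.slot = 0;                           /* assumed free slot */
	mem.guest_phys_addr = 0;                /* must be page-aligned */
	mem.memory_size = 16 * 1024 * 1024;     /* must be page-aligned */
	mem.flags = 0;                          /* no dirty-page logging */
	mem.init_data = 0;                      /* no initial contents to copy in */

	/* vm_set_memory_region() raises error() on bad arguments and returns a
	 * negative value on allocation failure, so callers typically wrap this
	 * in waserror()/poperror(). */
	vm_set_memory_region(litevm, &mem);
}
#endif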
1758
1759 #if 0
1760 /*
1761  * Get (and clear) the dirty memory log for a memory slot.
1762  */
1763 static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
1764                                                                                   struct litevm_dirty_log *log)
1765 {
1766         struct litevm_memory_slot *memslot;
1767         int r, i;
1768         int n;
1769         unsigned long any = 0;
1770
1771         spin_lock_irqsave(&litevm->lock);
1772
1773         /*
1774          * Prevent changes to guest memory configuration even while the lock
1775          * is not taken.
1776          */
1777         ++litevm->busy;
1778         spin_unlock(&litevm->lock);
1779         r = -EINVAL;
1780         if (log->slot >= LITEVM_MEMORY_SLOTS)
1781                 goto out;
1782
1783         memslot = &litevm->memslots[log->slot];
1784         r = -ENOENT;
1785         if (!memslot->dirty_bitmap)
1786                 goto out;
1787
1788         n = ALIGN(memslot->npages, 8) / 8;
1789
1790         for (i = 0; !any && i < n; ++i)
1791                 any = memslot->dirty_bitmap[i];
1792
1793         r = -EFAULT;
1794         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1795                 goto out;
1796
1797         if (any) {
1798                 spin_lock_irqsave(&litevm->lock);
1799                 litevm_mmu_slot_remove_write_access(litevm, log->slot);
1800                 spin_unlock(&litevm->lock);
1801                 memset(memslot->dirty_bitmap, 0, n);
1802                 for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
1803                         struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
1804
1805                         if (!vcpu)
1806                                 continue;
1807                         flush_guest_tlb(vcpu);
1808                         vcpu_put(vcpu);
1809                 }
1810         }
1811
1812         r = 0;
1813
1814 out:
1815         spin_lock_irqsave(&litevm->lock);
1816         --litevm->busy;
1817         spin_unlock(&litevm->lock);
1818         return r;
1819 }
1820 #endif
1821
1822 struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
1823 {
1824         print_func_entry();
1825         int i;
1826
1827         for (i = 0; i < litevm->nmemslots; ++i) {
1828                 struct litevm_memory_slot *memslot = &litevm->memslots[i];
1829
1830                 if (gfn >= memslot->base_gfn
1831                         && gfn < memslot->base_gfn + memslot->npages) {
1832                         print_func_exit();
1833                         return memslot;
1834                 }
1835         }
1836         print_func_exit();
1837         return 0;
1838 }
1839
1840 void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
1841 {
1842         print_func_entry();
1843         int i;
1844         struct litevm_memory_slot *memslot = 0;
1845         unsigned long rel_gfn;
1846
1847         for (i = 0; i < litevm->nmemslots; ++i) {
1848                 memslot = &litevm->memslots[i];
1849
1850                 if (gfn >= memslot->base_gfn
1851                         && gfn < memslot->base_gfn + memslot->npages) {
1852
1853                         if (!memslot || !memslot->dirty_bitmap) {
1854                                 print_func_exit();
1855                                 return;
1856                         }
1857
1858                         rel_gfn = gfn - memslot->base_gfn;
1859
1860                         /* avoid RMW */
1861                         if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
1862                                 SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
1863                         print_func_exit();
1864                         return;
1865                 }
1866         }
1867         print_func_exit();
1868 }
1869
1870 static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
1871 {
1872         print_func_entry();
1873         unsigned long rip;
1874         uint32_t interruptibility;
1875
1876         rip = vmcs_readl(GUEST_RIP);
1877         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1878         vmcs_writel(GUEST_RIP, rip);
1879
1880         /*
1881          * We emulated an instruction, so temporary interrupt blocking
1882          * should be removed, if set.
1883          */
1884         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1885         if (interruptibility & 3)
1886                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3);
1887         print_func_exit();
1888 }
1889
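/*
 * emulator_read_std() below copies page by page: e.g. an 8-byte read that
 * starts 4 bytes before a page boundary takes two passes through the copy
 * loop, 4 bytes from the end of the first page and then 4 bytes from the
 * start of the next, each translated through gva_to_gpa() separately.
 */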
1890 static int emulator_read_std(unsigned long addr,
1891                                                          unsigned long *val,
1892                                                          unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1893 {
1894         print_func_entry();
1895         struct litevm_vcpu *vcpu = ctxt->vcpu;
1896         void *data = val;
1897
1898         while (bytes) {
1899                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1900                 unsigned offset = addr & (PAGE_SIZE - 1);
1901                 unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
1902                         bytes : (unsigned)PAGE_SIZE - offset;
1903                 unsigned long pfn;
1904                 struct litevm_memory_slot *memslot;
1905                 void *page;
1906
1907                 if (gpa == UNMAPPED_GVA) {
1908                         print_func_exit();
1909                         return X86EMUL_PROPAGATE_FAULT;
1910                 }
1911                 pfn = gpa >> PAGE_SHIFT;
1912                 memslot = gfn_to_memslot(vcpu->litevm, pfn);
1913                 if (!memslot) {
1914                         print_func_exit();
1915                         return X86EMUL_UNHANDLEABLE;
1916                 }
1917                 page = page2kva(gfn_to_page(memslot, pfn));
1918
1919                 memcpy(data, page + offset, tocopy);
1920
1921                 bytes -= tocopy;
1922                 data += tocopy;
1923                 addr += tocopy;
1924         }
1925
1926         print_func_exit();
1927         return X86EMUL_CONTINUE;
1928 }
1929
1930 static int emulator_write_std(unsigned long addr,
1931                                                           unsigned long val,
1932                                                           unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1933 {
1934         print_func_entry();
1935         printk("emulator_write_std: addr %lx n %d\n", addr, bytes);
1936         print_func_exit();
1937         return X86EMUL_UNHANDLEABLE;
1938 }
1939
1940 static int emulator_read_emulated(unsigned long addr,
1941                                                                   unsigned long *val,
1942                                                                   unsigned int bytes,
1943                                                                   struct x86_emulate_ctxt *ctxt)
1944 {
1945         print_func_entry();
1946         struct litevm_vcpu *vcpu = ctxt->vcpu;
1947
1948         if (vcpu->mmio_read_completed) {
1949                 memcpy(val, vcpu->mmio_data, bytes);
1950                 vcpu->mmio_read_completed = 0;
1951                 print_func_exit();
1952                 return X86EMUL_CONTINUE;
1953         } else if (emulator_read_std(addr, val, bytes, ctxt)
1954                            == X86EMUL_CONTINUE) {
1955                 print_func_exit();
1956                 return X86EMUL_CONTINUE;
1957         } else {
1958                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1959                 if (gpa == UNMAPPED_GVA) {
1960                         print_func_exit();
1961                         vcpu_printf(vcpu, "not present\n");
1962                         return X86EMUL_PROPAGATE_FAULT;
1962                 }
1963                 vcpu->mmio_needed = 1;
1964                 vcpu->mmio_phys_addr = gpa;
1965                 vcpu->mmio_size = bytes;
1966                 vcpu->mmio_is_write = 0;
1967
1968                 print_func_exit();
1969                 return X86EMUL_UNHANDLEABLE;
1970         }
1971 }
1972
1973 static int emulator_write_emulated(unsigned long addr,
1974                                                                    unsigned long val,
1975                                                                    unsigned int bytes,
1976                                                                    struct x86_emulate_ctxt *ctxt)
1977 {
1978         print_func_entry();
1979         struct litevm_vcpu *vcpu = ctxt->vcpu;
1980         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1981
1982         if (gpa == UNMAPPED_GVA) {
1983                 print_func_exit();
1984                 return X86EMUL_PROPAGATE_FAULT;
1985         }
1986
1987         vcpu->mmio_needed = 1;
1988         vcpu->mmio_phys_addr = gpa;
1989         vcpu->mmio_size = bytes;
1990         vcpu->mmio_is_write = 1;
1991         memcpy(vcpu->mmio_data, &val, bytes);
1992
1993         print_func_exit();
1994         return X86EMUL_CONTINUE;
1995 }
1996
1997 static int emulator_cmpxchg_emulated(unsigned long addr,
1998                                                                          unsigned long old,
1999                                                                          unsigned long new,
2000                                                                          unsigned int bytes,
2001                                                                          struct x86_emulate_ctxt *ctxt)
2002 {
2003         print_func_entry();
2004         static int reported;
2005
2006         if (!reported) {
2007                 reported = 1;
2008                 printk("litevm: emulating exchange as write\n");
2009         }
2010         print_func_exit();
2011         return emulator_write_emulated(addr, new, bytes, ctxt);
2012 }
2013
2014 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
2015 {
2016         print_func_entry();
2017         static int reported;
2018         uint8_t opcodes[4];
2019         unsigned long rip = vmcs_readl(GUEST_RIP);
2020         unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
2021
2022         if (reported) {
2023                 print_func_exit();
2024                 return;
2025         }
2026
2027         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
2028
2029         printk("emulation failed but !mmio_needed?"
2030                    " rip %lx %02x %02x %02x %02x\n",
2031                    rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2032         reported = 1;
2033         print_func_exit();
2034 }
2035
2036 struct x86_emulate_ops emulate_ops = {
2037         .read_std = emulator_read_std,
2038         .write_std = emulator_write_std,
2039         .read_emulated = emulator_read_emulated,
2040         .write_emulated = emulator_write_emulated,
2041         .cmpxchg_emulated = emulator_cmpxchg_emulated,
2042 };
2043
2044 enum emulation_result {
2045         EMULATE_DONE,                           /* no further processing */
2046         EMULATE_DO_MMIO,                        /* litevm_run filled with mmio request */
2047         EMULATE_FAIL,                           /* can't emulate this instruction */
2048 };
2049
2050 static int emulate_instruction(struct litevm_vcpu *vcpu,
2051                                                            struct litevm_run *run,
2052                                                            unsigned long cr2, uint16_t error_code)
2053 {
2054         print_func_entry();
2055         struct x86_emulate_ctxt emulate_ctxt;
2056         int r;
2057         uint32_t cs_ar;
2058
2059         vcpu_load_rsp_rip(vcpu);
2060
2061         cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2062
2063         emulate_ctxt.vcpu = vcpu;
2064         emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
2065         emulate_ctxt.cr2 = cr2;
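	/* Pick the emulation mode: the VM86 flag wins; otherwise CS.L selects
	 * 64-bit mode and CS.D selects 32-bit vs 16-bit protected mode. */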
2066         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
2067                 ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
2068                 ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
2069                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2070
2071         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2072                 emulate_ctxt.cs_base = 0;
2073                 emulate_ctxt.ds_base = 0;
2074                 emulate_ctxt.es_base = 0;
2075                 emulate_ctxt.ss_base = 0;
2076                 emulate_ctxt.gs_base = 0;
2077                 emulate_ctxt.fs_base = 0;
2078         } else {
2079                 emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
2080                 emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
2081                 emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
2082                 emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
2083                 emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
2084                 emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
2085         }
2086
2087         vcpu->mmio_is_write = 0;
2088         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
2089
2090         if ((r || vcpu->mmio_is_write) && run) {
2091                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2092                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2093                 run->mmio.len = vcpu->mmio_size;
2094                 run->mmio.is_write = vcpu->mmio_is_write;
2095         }
2096
2097         if (r) {
2098                 if (!vcpu->mmio_needed) {
2099                         report_emulation_failure(&emulate_ctxt);
2100                         print_func_exit();
2101                         return EMULATE_FAIL;
2102                 }
2103                 print_func_exit();
2104                 return EMULATE_DO_MMIO;
2105         }
2106
2107         vcpu_put_rsp_rip(vcpu);
2108         vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
2109
2110         if (vcpu->mmio_is_write) {
2111                 print_func_exit();
2112                 return EMULATE_DO_MMIO;
2113         }
2114
2115         print_func_exit();
2116         return EMULATE_DONE;
2117 }
2118
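/*
 * Keep the upper half of a 64-bit control register and splice in the 32-bit
 * value the guest wrote, e.g.
 * mk_cr_64(0xffffffff00000001ULL, 0x33) == 0xffffffff00000033ULL.
 */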
2119 static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
2120 {
2121         print_func_entry();
2122         print_func_exit();
2123         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2124 }
2125
2126 void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2127 {
2128         print_func_entry();
2129         vmcs_writel(GUEST_GDTR_BASE, base);
2130         vmcs_write32(GUEST_GDTR_LIMIT, limit);
2131         print_func_exit();
2132 }
2133
2134 void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
2135 {
2136         print_func_entry();
2137         vmcs_writel(GUEST_IDTR_BASE, base);
2138         vmcs_write32(GUEST_IDTR_LIMIT, limit);
2139         print_func_exit();
2140 }
2141
2142 void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
2143                                    unsigned long *rflags)
2144 {
2145         print_func_entry();
2146         lmsw(vcpu, msw);
2147         *rflags = vmcs_readl(GUEST_RFLAGS);
2148         print_func_exit();
2149 }
2150
2151 unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
2152 {
2153         print_func_entry();
2154         switch (cr) {
2155                 case 0:
2156                         print_func_exit();
2157                         return guest_cr0();
2158                 case 2:
2159                         print_func_exit();
2160                         return vcpu->cr2;
2161                 case 3:
2162                         print_func_exit();
2163                         return vcpu->cr3;
2164                 case 4:
2165                         print_func_exit();
2166                         return guest_cr4();
2167                 default:
2168                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2169                         print_func_exit();
2170                         return 0;
2171         }
2172 }
2173
2174 void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
2175                                          unsigned long *rflags)
2176 {
2177         print_func_entry();
2178         switch (cr) {
2179                 case 0:
2180                         set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
2181                         *rflags = vmcs_readl(GUEST_RFLAGS);
2182                         break;
2183                 case 2:
2184                         vcpu->cr2 = val;
2185                         break;
2186                 case 3:
2187                         set_cr3(vcpu, val);
2188                         break;
2189                 case 4:
2190                         set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
2191                         break;
2192                 default:
2193                         vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2194         }
2195         print_func_exit();
2196 }
2197
2198 static int handle_rmode_exception(struct litevm_vcpu *vcpu,
2199                                                                   int vec, uint32_t err_code)
2200 {
2201         print_func_entry();
2202         if (!vcpu->rmode.active) {
2203                 print_func_exit();
2204                 return 0;
2205         }
2206
2207         if (vec == GP_VECTOR && err_code == 0)
2208                 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
2209                         print_func_exit();
2210                         return 1;
2211                 }
2212         print_func_exit();
2213         return 0;
2214 }
2215
2216 static int handle_exception(struct litevm_vcpu *vcpu,
2217                                                         struct litevm_run *litevm_run)
2218 {
2219         print_func_entry();
2220         uint32_t intr_info, error_code;
2221         unsigned long cr2, rip;
2222         uint32_t vect_info;
2223         enum emulation_result er;
2224
2225         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2226         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2227
2228         if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) {
2229                 printk("%s: unexpected, vectoring info 0x%x "
2230                            "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
2231         }
2232
2233         if (is_external_interrupt(vect_info)) {
2234                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2235                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_pending), irq);
2236                 SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_summary),
2237                                                            irq / BITS_PER_LONG);
2238         }
2239
2240         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) {  /* nmi */
2241                 asm("int $2");
2242                 print_func_exit();
2243                 return 1;
2244         }
2245         error_code = 0;
2246         rip = vmcs_readl(GUEST_RIP);
2247         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
2248                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2249         if (is_page_fault(intr_info)) {
2250                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2251
2252                 spin_lock_irqsave(&vcpu->litevm->lock);
2253                 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
2254                         spin_unlock(&vcpu->litevm->lock);
2255                         print_func_exit();
2256                         return 1;
2257                 }
2258
2259                 er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
2260                 spin_unlock(&vcpu->litevm->lock);
2261
2262                 switch (er) {
2263                         case EMULATE_DONE:
2264                                 print_func_exit();
2265                                 return 1;
2266                         case EMULATE_DO_MMIO:
2267                                 ++litevm_stat.mmio_exits;
2268                                 litevm_run->exit_reason = LITEVM_EXIT_MMIO;
2269                                 print_func_exit();
2270                                 return 0;
2271                         case EMULATE_FAIL:
2272                                 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
2273                                 break;
2274                         default:
2275                                 assert(0);
2276                 }
2277         }
2278
2279         if (vcpu->rmode.active &&
2280                 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2281                                                            error_code)) {
2282                 print_func_exit();
2283                 return 1;
2284         }
2285
2286         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
2287                 (INTR_TYPE_EXCEPTION | 1)) {
2288                 litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
2289                 print_func_exit();
2290                 return 0;
2291         }
2292         litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
2293         litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2294         litevm_run->ex.error_code = error_code;
2295         print_func_exit();
2296         return 0;
2297 }
2298
2299 static int handle_external_interrupt(struct litevm_vcpu *vcpu,
2300                                                                          struct litevm_run *litevm_run)
2301 {
2302         print_func_entry();
2303         ++litevm_stat.irq_exits;
2304         print_func_exit();
2305         return 1;
2306 }
2307
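/*
 * Worked example for get_io_count() below (an assumed guest instruction, not
 * one exercised elsewhere in this file): for "rep outsb" in 32-bit protected
 * mode the count lives in ECX (countr_size == 4); with an 0x67 address-size
 * prefix it drops to CX (countr_size == 2), so only the low 16 bits of the
 * saved RCX are used as the repeat count.
 */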
2308 static int get_io_count(struct litevm_vcpu *vcpu, uint64_t * count)
2309 {
2310         print_func_entry();
2311         uint64_t inst;
2312         gva_t rip;
2313         int countr_size;
2314         int i, n;
2315
2316         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
2317                 countr_size = 2;
2318         } else {
2319                 uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
2320
2321                 countr_size = (cs_ar & AR_L_MASK) ? 8 : (cs_ar & AR_DB_MASK) ? 4 : 2;
2322         }
2323
2324         rip = vmcs_readl(GUEST_RIP);
2325         if (countr_size != 8)
2326                 rip += vmcs_readl(GUEST_CS_BASE);
2327
2328         n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
2329
2330         for (i = 0; i < n; i++) {
2331                 switch (((uint8_t *) & inst)[i]) {
2332                         case 0xf0:
2333                         case 0xf2:
2334                         case 0xf3:
2335                         case 0x2e:
2336                         case 0x36:
2337                         case 0x3e:
2338                         case 0x26:
2339                         case 0x64:
2340                         case 0x65:
2341                         case 0x66:
2342                                 break;
2343                         case 0x67:
2344                                 countr_size = (countr_size == 2) ? 4 : (countr_size >> 1);
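				/* address-size prefix noted; fall through to done */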
2345                         default:
2346                                 goto done;
2347                 }
2348         }
2349         print_func_exit();
2350         return 0;
2351 done:
2352         countr_size *= 8;
2353         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
2354         print_func_exit();
2355         return 1;
2356 }
2357
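/*
 * Worked example for handle_io() below (assumed guest instruction): "inb
 * (%dx), %al" with DX == 0x3f8 exits with an exit qualification of
 * 0x03f80008: size field 0 (one byte), direction bit 3 set (IN), no
 * string/rep bits, and port 0x3f8 in bits 16..31, so userspace sees a
 * one-byte, non-string IN on port 0x3f8.
 */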
2358 static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2359 {
2360         print_func_entry();
2361         uint64_t exit_qualification;
2362
2363         ++litevm_stat.io_exits;
2364         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2365         litevm_run->exit_reason = LITEVM_EXIT_IO;
2366         if (exit_qualification & 8)
2367                 litevm_run->io.direction = LITEVM_EXIT_IO_IN;
2368         else
2369                 litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
2370         litevm_run->io.size = (exit_qualification & 7) + 1;
2371         litevm_run->io.string = (exit_qualification & 16) != 0;
2372         litevm_run->io.string_down
2373                 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2374         litevm_run->io.rep = (exit_qualification & 32) != 0;
2375         litevm_run->io.port = exit_qualification >> 16;
2376         if (litevm_run->io.string) {
2377                 if (!get_io_count(vcpu, &litevm_run->io.count)) {
2378                         print_func_exit();
2379                         return 1;
2380                 }
2381                 litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
2382         } else
2383                 litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX];       /* rax */
2384         print_func_exit();
2385         return 0;
2386 }
2387
2388 static int handle_invlpg(struct litevm_vcpu *vcpu,
2389                                                  struct litevm_run *litevm_run)
2390 {
2391         print_func_entry();
2392         uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
2393         int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2394         spin_lock_irqsave(&vcpu->litevm->lock);
2395         vcpu->mmu.inval_page(vcpu, address);
2396         spin_unlock(&vcpu->litevm->lock);
2397         vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
2398         print_func_exit();
2399         return 1;
2400 }
2401
2402 static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2403 {
2404         print_func_entry();
2405         uint64_t exit_qualification;
2406         int cr;
2407         int reg;
2408
2409 #ifdef LITEVM_DEBUG
2410         if (guest_cpl() != 0) {
2411                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2412                 inject_gp(vcpu);
2413                 print_func_exit();
2414                 return 1;
2415         }
2416 #endif
2417
2418         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2419         cr = exit_qualification & 15;
2420         reg = (exit_qualification >> 8) & 15;
2421         switch ((exit_qualification >> 4) & 3) {
2422                 case 0: /* mov to cr */
2423                         switch (cr) {
2424                                 case 0:
2425                                         vcpu_load_rsp_rip(vcpu);
2426                                         set_cr0(vcpu, vcpu->regs[reg]);
2427                                         skip_emulated_instruction(vcpu);
2428                                         print_func_exit();
2429                                         return 1;
2430                                 case 3:
2431                                         vcpu_load_rsp_rip(vcpu);
2432                                         set_cr3(vcpu, vcpu->regs[reg]);
2433                                         skip_emulated_instruction(vcpu);
2434                                         print_func_exit();
2435                                         return 1;
2436                                 case 4:
2437                                         vcpu_load_rsp_rip(vcpu);
2438                                         set_cr4(vcpu, vcpu->regs[reg]);
2439                                         skip_emulated_instruction(vcpu);
2440                                         print_func_exit();
2441                                         return 1;
2442                                 case 8:
2443                                         vcpu_load_rsp_rip(vcpu);
2444                                         set_cr8(vcpu, vcpu->regs[reg]);
2445                                         skip_emulated_instruction(vcpu);
2446                                         print_func_exit();
2447                                         return 1;
2448                         };
2449                         break;
2450                 case 1: /*mov from cr */
2451                         switch (cr) {
2452                                 case 3:
2453                                         vcpu_load_rsp_rip(vcpu);
2454                                         vcpu->regs[reg] = vcpu->cr3;
2455                                         vcpu_put_rsp_rip(vcpu);
2456                                         skip_emulated_instruction(vcpu);
2457                                         print_func_exit();
2458                                         return 1;
2459                                 case 8:
2460                                         printd("handle_cr: read CR8 " "cpu erratum AA15\n");
2461                                         vcpu_load_rsp_rip(vcpu);
2462                                         vcpu->regs[reg] = vcpu->cr8;
2463                                         vcpu_put_rsp_rip(vcpu);
2464                                         skip_emulated_instruction(vcpu);
2465                                         print_func_exit();
2466                                         return 1;
2467                         }
2468                         break;
2469                 case 3: /* lmsw */
2470                         lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2471
2472                         skip_emulated_instruction(vcpu);
2473                         print_func_exit();
2474                         return 1;
2475                 default:
2476                         break;
2477         }
2478         litevm_run->exit_reason = 0;
2479         printk("litevm: unhandled control register: op %d cr %d\n",
2480                    (int)(exit_qualification >> 4) & 3, cr);
2481         print_func_exit();
2482         return 0;
2483 }
2484
2485 static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2486 {
2487         print_func_entry();
2488         uint64_t exit_qualification;
2489         unsigned long val;
2490         int dr, reg;
2491
2492         /*
2493          * FIXME: this code assumes the host is debugging the guest.
2494          *        need to deal with guest debugging itself too.
2495          */
2496         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2497         dr = exit_qualification & 7;
2498         reg = (exit_qualification >> 8) & 15;
2499         vcpu_load_rsp_rip(vcpu);
2500         if (exit_qualification & 16) {
2501                 /* mov from dr */
2502                 switch (dr) {
2503                         case 6:
2504                                 val = 0xffff0ff0;
2505                                 break;
2506                         case 7:
2507                                 val = 0x400;
2508                                 break;
2509                         default:
2510                                 val = 0;
2511                 }
2512                 vcpu->regs[reg] = val;
2513         } else {
2514                 /* mov to dr */
2515         }
2516         vcpu_put_rsp_rip(vcpu);
2517         skip_emulated_instruction(vcpu);
2518         print_func_exit();
2519         return 1;
2520 }
2521
2522 static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2523 {
2524         print_func_entry();
2525         litevm_run->exit_reason = LITEVM_EXIT_CPUID;
2526         print_func_exit();
2527         return 0;
2528 }
2529
2530 static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2531 {
2532         print_func_entry();
2533         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2534         struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
2535         uint64_t data;
2536
2537         if (guest_cpl() != 0) {
2538                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2539                 inject_gp(vcpu);
2540                 print_func_exit();
2541                 return 1;
2542         }
2543
2544         switch (ecx) {
2545                 case MSR_FS_BASE:
2546                         data = vmcs_readl(GUEST_FS_BASE);
2547                         break;
2548                 case MSR_GS_BASE:
2549                         data = vmcs_readl(GUEST_GS_BASE);
2550                         break;
2551                 case MSR_IA32_SYSENTER_CS:
2552                         data = vmcs_read32(GUEST_SYSENTER_CS);
2553                         break;
2554                 case MSR_IA32_SYSENTER_EIP:
2555                         data = vmcs_read32(GUEST_SYSENTER_EIP);
2556                         break;
2557                 case MSR_IA32_SYSENTER_ESP:
2558                         data = vmcs_read32(GUEST_SYSENTER_ESP);
2559                         break;
2560                 case MSR_IA32_MC0_CTL:
2561                 case MSR_IA32_MCG_STATUS:
2562                 case MSR_IA32_MCG_CAP:
2563                 case MSR_IA32_MC0_MISC:
2564                 case MSR_IA32_MC0_MISC + 4:
2565                 case MSR_IA32_MC0_MISC + 8:
2566                 case MSR_IA32_MC0_MISC + 12:
2567                 case MSR_IA32_MC0_MISC + 16:
2568                 case MSR_IA32_UCODE_REV:
2569                         /* MTRR registers */
2570                 case 0xfe:
2571                 case 0x200 ... 0x2ff:
2572                         data = 0;
2573                         break;
2574                 case MSR_IA32_APICBASE:
2575                         data = vcpu->apic_base;
2576                         break;
2577                 default:
2578                         if (msr) {
2579                                 data = msr->data;
2580                                 break;
2581                         }
2582                         printk("litevm: unhandled rdmsr: %x\n", ecx);
2583                         inject_gp(vcpu);
2584                         print_func_exit();
2585                         return 1;
2586         }
2587
2588         /* FIXME: handling of bits 32:63 of rax, rdx */
2589         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2590         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2591         skip_emulated_instruction(vcpu);
2592         print_func_exit();
2593         return 1;
2594 }
2595
2596 #ifdef __x86_64__
2597
2598 static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
2599 {
2600         print_func_entry();
2601         struct vmx_msr_entry *msr;
2602
2603         if (efer & EFER_RESERVED_BITS) {
2604                 printd("set_efer: 0x%llx #GP, reserved bits\n", efer);
2605                 inject_gp(vcpu);
2606                 print_func_exit();
2607                 return;
2608         }
2609
2610         if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
2611                 printd("set_efer: #GP, change LME while paging\n");
2612                 inject_gp(vcpu);
2613                 print_func_exit();
2614                 return;
2615         }
2616
2617         efer &= ~EFER_LMA;
2618         efer |= vcpu->shadow_efer & EFER_LMA;
2619
2620         vcpu->shadow_efer = efer;
2621
2622         msr = find_msr_entry(vcpu, MSR_EFER);
2623
2624         if (!(efer & EFER_LMA))
2625                 efer &= ~EFER_LME;
2626         msr->data = efer;
2627         skip_emulated_instruction(vcpu);
2628         print_func_exit();
2629 }
2630
2631 #endif
2632
2633 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
2634
2635 static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2636 {
2637         print_func_entry();
2638         uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
2639         struct vmx_msr_entry *msr;
2640         uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2641                 | ((uint64_t) (vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2642
2643         if (guest_cpl() != 0) {
2644                 vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
2645                 inject_gp(vcpu);
2646                 print_func_exit();
2647                 return 1;
2648         }
2649
2650         switch (ecx) {
2651                 case MSR_FS_BASE:
2652                         vmcs_writel(GUEST_FS_BASE, data);
2653                         break;
2654                 case MSR_GS_BASE:
2655                         vmcs_writel(GUEST_GS_BASE, data);
2656                         break;
2657                 case MSR_IA32_SYSENTER_CS:
2658                         vmcs_write32(GUEST_SYSENTER_CS, data);
2659                         break;
2660                 case MSR_IA32_SYSENTER_EIP:
2661                         vmcs_write32(GUEST_SYSENTER_EIP, data);
2662                         break;
2663                 case MSR_IA32_SYSENTER_ESP:
2664                         vmcs_write32(GUEST_SYSENTER_ESP, data);
2665                         break;
2666                 case MSR_EFER:
2667                         set_efer(vcpu, data);
2668                         print_func_exit();
2669                         return 1;
2670                 case MSR_IA32_MC0_STATUS:
2671                         printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data);
2672                         break;
2673                 case MSR_IA32_TIME_STAMP_COUNTER:{
2674                                 uint64_t tsc;
2675
2676                                 tsc = read_tsc();
2677                                 vmcs_write64(TSC_OFFSET, data - tsc);
2678                                 break;
2679                         }
2680                 case MSR_IA32_UCODE_REV:
2681                 case MSR_IA32_UCODE_WRITE:
2682                 case 0x200 ... 0x2ff:   /* MTRRs */
2683                         break;
2684                 case MSR_IA32_APICBASE:
2685                         vcpu->apic_base = data;
2686                         break;
2687                 default:
2688                         msr = find_msr_entry(vcpu, ecx);
2689                         if (msr) {
2690                                 msr->data = data;
2691                                 break;
2692                         }
2693                         printk("litevm: unhandled wrmsr: %x\n", ecx);
2694                         inject_gp(vcpu);
2695                         print_func_exit();
2696                         return 1;
2697         }
2698         skip_emulated_instruction(vcpu);
2699         print_func_exit();
2700         return 1;
2701 }
2702
2703 static int handle_interrupt_window(struct litevm_vcpu *vcpu,
2704                                                                    struct litevm_run *litevm_run)
2705 {
2706         print_func_entry();
2707         /* Turn off interrupt window reporting. */
2708         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2709                                  vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2710                                  & ~CPU_BASED_VIRTUAL_INTR_PENDING);
2711         print_func_exit();
2712         return 1;
2713 }
2714
2715 static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
2716 {
2717         print_func_entry();
2718         skip_emulated_instruction(vcpu);
2719         if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
2720                 print_func_exit();
2721                 return 1;
2722         }
2723
2724         litevm_run->exit_reason = LITEVM_EXIT_HLT;
2725         print_func_exit();
2726         return 0;
2727 }
2728
2729 /*
2730  * The exit handlers return 1 if the exit was handled fully and guest execution
2731  * may resume.  Otherwise they set the litevm_run parameter to indicate what needs
2732  * to be done to userspace and return 0.
2733  */
2734 static int (*litevm_vmx_exit_handlers[]) (struct litevm_vcpu * vcpu,
2735                                           struct litevm_run * litevm_run) = {
2736         [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2737         [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2738         [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2739         [EXIT_REASON_INVLPG] = handle_invlpg,
2740         [EXIT_REASON_CR_ACCESS] = handle_cr,
2741         [EXIT_REASON_DR_ACCESS] = handle_dr,
2742         [EXIT_REASON_CPUID] = handle_cpuid,
2743         [EXIT_REASON_MSR_READ] = handle_rdmsr,
2744         [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2745         [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2746         [EXIT_REASON_HLT] = handle_halt,};
2747
2748 static const int litevm_vmx_max_exit_handlers =
2749         sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
2750
2751 /*
2752  * The guest has exited.  See if we can fix it or if we need userspace
2753  * assistance.
2754  */
2755 static int litevm_handle_exit(struct litevm_run *litevm_run,
2756                                                           struct litevm_vcpu *vcpu)
2757 {
2758         print_func_entry();
2759         uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2760         uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
2761
2762         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2763                 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2764                 printk("%s: unexpected, valid vectoring info and "
2765                            "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2766         litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2767         if (exit_reason < litevm_vmx_max_exit_handlers
2768                 && litevm_vmx_exit_handlers[exit_reason]) {
2769                 print_func_exit();
2770                 return litevm_vmx_exit_handlers[exit_reason] (vcpu, litevm_run);
2771         } else {
2772                 litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
2773                 litevm_run->hw.hardware_exit_reason = exit_reason;
2774         }
2775         print_func_exit();
2776         return 0;
2777 }
2778
2779 static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
2780 {
2781         print_func_entry();
2782         uint16_t ent[2];
2783         uint16_t cs;
2784         uint16_t ip;
2785         unsigned long flags;
2786         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
2787         uint16_t sp = vmcs_readl(GUEST_RSP);
2788         uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
2789
2790         if (sp > ss_limit || ((sp - 6) > sp)) {
2791                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
2792                                         __FUNCTION__,
2793                                         vmcs_readl(GUEST_RSP),
2794                                         vmcs_readl(GUEST_SS_BASE), vmcs_read32(GUEST_SS_LIMIT));
2795                 print_func_exit();
2796                 return;
2797         }
2798
2799         if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
2800                 sizeof(ent)) {
2801                 //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
2802                 print_func_exit();
2803                 return;
2804         }
2805
2806         flags = vmcs_readl(GUEST_RFLAGS);
2807         cs = vmcs_readl(GUEST_CS_BASE) >> 4;
2808         ip = vmcs_readl(GUEST_RIP);
2809
2810         if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
2811                 litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
2812                 litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
2813                 //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
2814                 print_func_exit();
2815                 return;
2816         }
2817
2818         vmcs_writel(GUEST_RFLAGS, flags &
2819                                 ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
2820         vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
2821         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
2822         vmcs_writel(GUEST_RIP, ent[0]);
2823         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
2824         print_func_exit();
2825 }
2826
2827 static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
2828 {
2829         print_func_entry();
2830         int word_index = __ffs(vcpu->irq_summary);
2831         int bit_index = __ffs(vcpu->irq_pending[word_index]);
2832         int irq = word_index * BITS_PER_LONG + bit_index;
2833
2834         /* We don't have clear_bit() and we're not sure the akaros bitops
2835          * really work, so clear by hand (see the sketch after this
2836          * function); 1UL keeps the shifts defined past bit 31. */
2837         vcpu->irq_pending[word_index] &= ~(1UL << bit_index);
2838         if (!vcpu->irq_pending[word_index])
2839                 vcpu->irq_summary &= ~(1UL << word_index);
2840
2841         if (vcpu->rmode.active) {
2842                 inject_rmode_irq(vcpu, irq);
2843                 print_func_exit();
2844                 return;
2845         }
2846         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2847                                  irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2848         print_func_exit();
2849 }
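
/*
 * A minimal sketch of the clear_bit() helper alluded to above; this is not an
 * existing akaros API, just what the open-coded clears in
 * litevm_do_inject_irq() would look like if factored out.
 */
#if 0
static inline void litevm_clear_bit(unsigned long *word, int bit)
{
	*word &= ~(1UL << bit);
}
#endif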
2850
2851 static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
2852 {
2853         print_func_entry();
2854         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
2855                 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
2856                 /*
2857                  * Interrupts enabled, and not blocked by sti or mov ss. Good.
2858                  */
2859                 litevm_do_inject_irq(vcpu);
2860         else
2861                 /*
2862                  * Interrupts blocked.  Wait for unblock.
2863                  */
2864                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2865                                          vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
2866                                          | CPU_BASED_VIRTUAL_INTR_PENDING);
2867         print_func_exit();
2868 }
2869
2870 static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
2871 {
2872         print_func_entry();
2873         struct litevm_guest_debug *dbg = &vcpu->guest_debug;
2874
2875 #warning "no debugging guests yet"
2876         assert(0);
2877 /*
2878         set_debugreg(dbg->bp[0], 0);
2879         set_debugreg(dbg->bp[1], 1);
2880         set_debugreg(dbg->bp[2], 2);
2881         set_debugreg(dbg->bp[3], 3);
2882 */
2883         if (dbg->singlestep) {
2884                 unsigned long flags;
2885
2886                 flags = vmcs_readl(GUEST_RFLAGS);
2887                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2888                 vmcs_writel(GUEST_RFLAGS, flags);
2889         }
2890         print_func_exit();
2891 }
2892
2893 static void load_msrs(struct vmx_msr_entry *e, int n)
2894 {
2895         print_func_entry();
2896         int i;
2897
2898         for (i = 0; i < n; ++i)
2899                 write_msr(e[i].index, e[i].data);
2900         print_func_exit();
2901 }
2902
2903 static void save_msrs(struct vmx_msr_entry *e, int n)
2904 {
2905         print_func_entry();
2906         int i;
2907
2908         for (i = 0; i < n; ++i)
2909                 e[i].data = read_msr(e[i].index);
2910         print_func_exit();
2911 }
2912
2913 int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
2914 {
2915         print_func_entry();
2916         struct litevm_vcpu *vcpu;
2917         uint8_t fail;
2918         uint16_t fs_sel, gs_sel, ldt_sel;
2919         int fs_gs_ldt_reload_needed;
2920
2921         if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
2922                 error("vcpu is %d but must be in the range %d..%d\n",
2923                           litevm_run->vcpu, LITEVM_MAX_VCPUS);
2924
2925         vcpu = vcpu_load(litevm, litevm_run->vcpu);
2926         if (!vcpu)
2927                 error("vcpu_load failed");
2928
2929         if (litevm_run->emulated) {
2930                 skip_emulated_instruction(vcpu);
2931                 litevm_run->emulated = 0;
2932         }
2933
2934         if (litevm_run->mmio_completed) {
2935                 memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
2936                 vcpu->mmio_read_completed = 1;
2937         }
2938
2939         vcpu->mmio_needed = 0;
2940
2941 again:
2942         /*
2943          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2944          * allow segment selectors with cpl > 0 or ti == 1.
2945          */
2946         fs_sel = read_fs();
2947         gs_sel = read_gs();
2948         ldt_sel = read_ldt();
2949         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
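	/* The low three bits of a selector are RPL (two bits) and TI, so a
	 * nonzero value here means the selector cannot be kept in the VMCS
	 * host-state area and must be reloaded by hand after the exit. */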
2950         if (!fs_gs_ldt_reload_needed) {
2951                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2952                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2953         } else {
2954                 vmcs_write16(HOST_FS_SELECTOR, 0);
2955                 vmcs_write16(HOST_GS_SELECTOR, 0);
2956         }
2957
2958 #ifdef __x86_64__
2959         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2960         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2961 #endif
2962
2963         if (vcpu->irq_summary &&
2964                 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2965                 litevm_try_inject_irq(vcpu);
2966
2967         if (vcpu->guest_debug.enabled)
2968                 litevm_guest_debug_pre(vcpu);
2969
2970         fx_save(vcpu->host_fx_image);
2971         fx_restore(vcpu->guest_fx_image);
2972
2973         save_msrs(vcpu->host_msrs, vcpu->nmsrs);
2974         load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2975
2976         asm(
2977                    /* Store host registers */
2978                    "pushf \n\t"
2979 #ifdef __x86_64__
2980                    "push %%rax; push %%rbx; push %%rdx;"
2981                    "push %%rsi; push %%rdi; push %%rbp;"
2982                    "push %%r8;  push %%r9;  push %%r10; push %%r11;"
2983                    "push %%r12; push %%r13; push %%r14; push %%r15;"
2984                    "push %%rcx \n\t" "vmwrite %%rsp, %2 \n\t"
2985 #else
2986                    "pusha; push %%ecx \n\t" "vmwrite %%esp, %2 \n\t"
2987 #endif
2988                    /* Check if vmlaunch or vmresume is needed */
2989                    "cmp $0, %1 \n\t"
2990                    /* Load guest registers.  Don't clobber flags. */
2991 #ifdef __x86_64__
2992                    "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%3), %%rax \n\t" "mov %c[rbx](%3), %%rbx \n\t" "mov %c[rdx](%3), %%rdx \n\t" "mov %c[rsi](%3), %%rsi \n\t" "mov %c[rdi](%3), %%rdi \n\t" "mov %c[rbp](%3), %%rbp \n\t" "mov %c[r8](%3),  %%r8  \n\t" "mov %c[r9](%3),  %%r9  \n\t" "mov %c[r10](%3), %%r10 \n\t" "mov %c[r11](%3), %%r11 \n\t" "mov %c[r12](%3), %%r12 \n\t" "mov %c[r13](%3), %%r13 \n\t" "mov %c[r14](%3), %%r14 \n\t" "mov %c[r15](%3), %%r15 \n\t" "mov %c[rcx](%3), %%rcx \n\t"      /* kills %3 (rcx) */
2993 #else
2994                    "mov %c[cr2](%3), %%eax \n\t" "mov %%eax,   %%cr2 \n\t" "mov %c[rax](%3), %%eax \n\t" "mov %c[rbx](%3), %%ebx \n\t" "mov %c[rdx](%3), %%edx \n\t" "mov %c[rsi](%3), %%esi \n\t" "mov %c[rdi](%3), %%edi \n\t" "mov %c[rbp](%3), %%ebp \n\t" "mov %c[rcx](%3), %%ecx \n\t"    /* kills %3 (ecx) */
2995 #endif
2996                    /* Enter guest mode */
2997                    "jne launched \n\t"
2998                    "vmlaunch \n\t"
2999                    "jmp litevm_vmx_return \n\t"
3000                    "launched: vmresume \n\t"
3001                    ".globl litevm_vmx_return \n\t" "litevm_vmx_return: "
3002                    /* Save guest registers, load host registers, keep flags */
3003 #ifdef __x86_64__
3004                    "xchg %3,     0(%%rsp) \n\t"
3005                    "mov %%rax, %c[rax](%3) \n\t"
3006                    "mov %%rbx, %c[rbx](%3) \n\t"
3007                    "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
3008                    "mov %%rdx, %c[rdx](%3) \n\t"
3009                    "mov %%rsi, %c[rsi](%3) \n\t"
3010                    "mov %%rdi, %c[rdi](%3) \n\t"
3011                    "mov %%rbp, %c[rbp](%3) \n\t"
3012                    "mov %%r8,  %c[r8](%3) \n\t"
3013                    "mov %%r9,  %c[r9](%3) \n\t"
3014                    "mov %%r10, %c[r10](%3) \n\t"
3015                    "mov %%r11, %c[r11](%3) \n\t"
3016                    "mov %%r12, %c[r12](%3) \n\t"
3017                    "mov %%r13, %c[r13](%3) \n\t"
3018                    "mov %%r14, %c[r14](%3) \n\t"
3019                    "mov %%r15, %c[r15](%3) \n\t"
3020                    "mov %%cr2, %%rax   \n\t"
3021                    "mov %%rax, %c[cr2](%3) \n\t"
3022                    "mov 0(%%rsp), %3 \n\t"
3023                    "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
3024                    "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
3025                    "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
3026                    "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
3027 #else
3028                    "xchg %3, 0(%%esp) \n\t"
3029                    "mov %%eax, %c[rax](%3) \n\t"
3030                    "mov %%ebx, %c[rbx](%3) \n\t"
3031                    "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
3032                    "mov %%edx, %c[rdx](%3) \n\t"
3033                    "mov %%esi, %c[rsi](%3) \n\t"
3034                    "mov %%edi, %c[rdi](%3) \n\t"
3035                    "mov %%ebp, %c[rbp](%3) \n\t"
3036                    "mov %%cr2, %%eax  \n\t"
3037                    "mov %%eax, %c[cr2](%3) \n\t"
3038                    "mov 0(%%esp), %3 \n\t" "pop %%ecx; popa \n\t"
3039 #endif
3040                    "setbe %0 \n\t"
                        "popf \n\t"
                        : "=g"(fail)
3041                    : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
3042                    "c"(vcpu),
3043                    [rax] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
3044                    [rbx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
3045                    [rcx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
3046                    [rdx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
3047                    [rsi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
3048                    [rdi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
3049                    [rbp] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
3050 #ifdef __x86_64__
3051                    [r8] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8])),
3052                    [r9] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9])),
3053                    [r10] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
3054                    [r11] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
3055                    [r12] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
3056                    [r13] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
3057                    [r14] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
3058                    [r15] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
3059 #endif
3060                    [cr2] "i"(offsetof(struct litevm_vcpu, cr2))
3061                    :"cc", "memory");
3062
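        /* Back in the host: count the exit, then switch the hand-loaded MSRs
         * and the FPU state from guest back to host. */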
3063         ++litevm_stat.exits;
3064         printk("vm_run exits\n");
3065         save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
3066         load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
3067
3068         fx_save(vcpu->guest_fx_image);
3069         fx_restore(vcpu->host_fx_image);
3070
3071 #ifndef __x86_64__
3072         asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3073 #endif
3074
3075         litevm_run->exit_type = 0;
3076         if (fail) {
3077                 litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
3078                 litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
3079         } else {
3080                 if (fs_gs_ldt_reload_needed) {
3081                         load_ldt(ldt_sel);
3082                         load_fs(fs_sel);
3083                         /*
3084                          * If we have to reload gs, we must take care to
3085                          * preserve our gs base.
3086                          */
3087                         disable_irq();
3088                         load_gs(gs_sel);
3089 #ifdef __x86_64__
3090                         write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
3091 #endif
3092                         enable_irq();
3093
3094                         reload_tss();
3095                 }
3096                 vcpu->launched = 1;
3097                 litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
3098                 if (litevm_handle_exit(litevm_run, vcpu)) {
3099                         /* Give scheduler a chance to reschedule. */
3100                         vcpu_put(vcpu);
3101 #warning "how to tell if signal is pending"
3102 /*
3103                         if (signal_pending(current)) {
3104                                 ++litevm_stat.signal_exits;
3105                                 return -EINTR;
3106                         }
3107 */
3108                         kthread_yield();
3109                         /* Cannot fail -  no vcpu unplug yet. */
3110                         vcpu_load(litevm, vcpu_slot(vcpu));
3111                         goto again;
3112                 }
3113         }
3114
3115         vcpu_put(vcpu);
3116         printk("vm_run returns\n");
3117         print_func_exit();
3118         return 0;
3119 }
3120
3121 static int litevm_dev_ioctl_get_regs(struct litevm *litevm,
3122                                                                          struct litevm_regs *regs)
3123 {
3124         print_func_entry();
3125         struct litevm_vcpu *vcpu;
3126
3127         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3128                 print_func_exit();
3129                 return -EINVAL;
3130         }
3131
3132         vcpu = vcpu_load(litevm, regs->vcpu);
3133         if (!vcpu) {
3134                 print_func_exit();
3135                 return -ENOENT;
3136         }
3137
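        /* The general-purpose registers are cached in vcpu->regs on exit;
         * RSP, RIP, and RFLAGS live in the VMCS and are read directly. */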
3138         regs->rax = vcpu->regs[VCPU_REGS_RAX];
3139         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
3140         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
3141         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
3142         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
3143         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
3144         regs->rsp = vmcs_readl(GUEST_RSP);
3145         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
3146 #ifdef __x86_64__
3147         regs->r8 = vcpu->regs[VCPU_REGS_R8];
3148         regs->r9 = vcpu->regs[VCPU_REGS_R9];
3149         regs->r10 = vcpu->regs[VCPU_REGS_R10];
3150         regs->r11 = vcpu->regs[VCPU_REGS_R11];
3151         regs->r12 = vcpu->regs[VCPU_REGS_R12];
3152         regs->r13 = vcpu->regs[VCPU_REGS_R13];
3153         regs->r14 = vcpu->regs[VCPU_REGS_R14];
3154         regs->r15 = vcpu->regs[VCPU_REGS_R15];
3155 #endif
3156
3157         regs->rip = vmcs_readl(GUEST_RIP);
3158         regs->rflags = vmcs_readl(GUEST_RFLAGS);
3159
3160         /*
3161          * Don't leak debug flags in case they were set for guest debugging
3162          */
3163         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3164                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3165
3166         vcpu_put(vcpu);
3167
3168         print_func_exit();
3169         return 0;
3170 }
3171
3172 static int litevm_dev_ioctl_set_regs(struct litevm *litevm,
3173                                                                          struct litevm_regs *regs)
3174 {
3175         print_func_entry();
3176         struct litevm_vcpu *vcpu;
3177
3178         if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
3179                 print_func_exit();
3180                 return -EINVAL;
3181         }
3182
3183         vcpu = vcpu_load(litevm, regs->vcpu);
3184         if (!vcpu) {
3185                 print_func_exit();
3186                 return -ENOENT;
3187         }
3188
3189         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
3190         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
3191         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
3192         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
3193         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
3194         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
3195         vmcs_writel(GUEST_RSP, regs->rsp);
3196         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
3197 #ifdef __x86_64__
3198         vcpu->regs[VCPU_REGS_R8] = regs->r8;
3199         vcpu->regs[VCPU_REGS_R9] = regs->r9;
3200         vcpu->regs[VCPU_REGS_R10] = regs->r10;
3201         vcpu->regs[VCPU_REGS_R11] = regs->r11;
3202         vcpu->regs[VCPU_REGS_R12] = regs->r12;
3203         vcpu->regs[VCPU_REGS_R13] = regs->r13;
3204         vcpu->regs[VCPU_REGS_R14] = regs->r14;
3205         vcpu->regs[VCPU_REGS_R15] = regs->r15;
3206 #endif
3207
3208         vmcs_writel(GUEST_RIP, regs->rip);
3209         vmcs_writel(GUEST_RFLAGS, regs->rflags);
3210
3211         vcpu_put(vcpu);
3212
3213         print_func_exit();
3214         return 0;
3215 }
3216
3217 static int litevm_dev_ioctl_get_sregs(struct litevm *litevm,
3218                                                                           struct litevm_sregs *sregs)
3219 {
3220         print_func_entry();
3221         struct litevm_vcpu *vcpu;
3222
3223         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3224                 print_func_exit();
3225                 return -EINVAL;
3226         }
3227         vcpu = vcpu_load(litevm, sregs->vcpu);
3228         if (!vcpu) {
3229                 print_func_exit();
3230                 return -ENOENT;
3231         }
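        /* Read a guest segment out of the VMCS and unpack the VMX
         * access-rights byte (type/s/dpl/present/avl/l/db/g) into the
         * litevm_sregs segment descriptor fields. */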
3232 #define get_segment(var, seg) \
3233         do { \
3234                 uint32_t ar; \
3235                 \
3236                 sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
3237                 sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
3238                 sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
3239                 ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
3240                 if (ar & AR_UNUSABLE_MASK) ar = 0; \
3241                 sregs->var.type = ar & 15; \
3242                 sregs->var.s = (ar >> 4) & 1; \
3243                 sregs->var.dpl = (ar >> 5) & 3; \
3244                 sregs->var.present = (ar >> 7) & 1; \
3245                 sregs->var.avl = (ar >> 12) & 1; \
3246                 sregs->var.l = (ar >> 13) & 1; \
3247                 sregs->var.db = (ar >> 14) & 1; \
3248                 sregs->var.g = (ar >> 15) & 1; \
3249                 sregs->var.unusable = (ar >> 16) & 1; \
3250         } while (0)
3251
3252         get_segment(cs, CS);
3253         get_segment(ds, DS);
3254         get_segment(es, ES);
3255         get_segment(fs, FS);
3256         get_segment(gs, GS);
3257         get_segment(ss, SS);
3258
3259         get_segment(tr, TR);
3260         get_segment(ldt, LDTR);
3261 #undef get_segment
3262
3263 #define get_dtable(var, table) \
3264         sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
3265                 sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
3266
3267         get_dtable(idt, IDTR);
3268         get_dtable(gdt, GDTR);
3269 #undef get_dtable
3270
3271         sregs->cr0 = guest_cr0();
3272         sregs->cr2 = vcpu->cr2;
3273         sregs->cr3 = vcpu->cr3;
3274         sregs->cr4 = guest_cr4();
3275         sregs->cr8 = vcpu->cr8;
3276         sregs->efer = vcpu->shadow_efer;
3277         sregs->apic_base = vcpu->apic_base;
3278
3279         sregs->pending_int = vcpu->irq_summary != 0;
3280
3281         vcpu_put(vcpu);
3282
3283         print_func_exit();
3284         return 0;
3285 }
3286
3287 static int litevm_dev_ioctl_set_sregs(struct litevm *litevm,
3288                                                                           struct litevm_sregs *sregs)
3289 {
3290         print_func_entry();
3291         struct litevm_vcpu *vcpu;
3292         int mmu_reset_needed = 0;
3293
3294         if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
3295                 print_func_exit();
3296                 return -EINVAL;
3297         }
3298         vcpu = vcpu_load(litevm, sregs->vcpu);
3299         if (!vcpu) {
3300                 print_func_exit();
3301                 return -ENOENT;
3302         }
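        /* Write a segment descriptor from litevm_sregs into the VMCS,
         * repacking the fields into the VMX access-rights format; an
         * unusable segment is written with only the unusable bit set. */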
3303 #define set_segment(var, seg) \
3304         do { \
3305                 uint32_t ar; \
3306                 \
3307                 vmcs_writel(GUEST_##seg##_BASE, sregs->var.base);  \
3308                 vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
3309                 vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
3310                 if (sregs->var.unusable) { \
3311                         ar = (1 << 16); \
3312                 } else { \
3313                         ar = (sregs->var.type & 15); \
3314                         ar |= (sregs->var.s & 1) << 4; \
3315                         ar |= (sregs->var.dpl & 3) << 5; \
3316                         ar |= (sregs->var.present & 1) << 7; \
3317                         ar |= (sregs->var.avl & 1) << 12; \
3318                         ar |= (sregs->var.l & 1) << 13; \
3319                         ar |= (sregs->var.db & 1) << 14; \
3320                         ar |= (sregs->var.g & 1) << 15; \
3321                 } \
3322                 vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
3323         } while (0)
3324
3325         set_segment(cs, CS);
3326         set_segment(ds, DS);
3327         set_segment(es, ES);
3328         set_segment(fs, FS);
3329         set_segment(gs, GS);
3330         set_segment(ss, SS);
3331
3332         set_segment(tr, TR);
3333
3334         set_segment(ldt, LDTR);
3335 #undef set_segment
3336
3337 #define set_dtable(var, table) \
3338         vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
3339         vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
3340
3341         set_dtable(idt, IDTR);
3342         set_dtable(gdt, GDTR);
3343 #undef set_dtable
3344
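        /* Changes to CR0, CR3, CR4, or EFER invalidate the shadow page
         * tables; track whether any changed and rebuild the MMU context
         * at the end. */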
3345         vcpu->cr2 = sregs->cr2;
3346         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
3347         vcpu->cr3 = sregs->cr3;
3348
3349         vcpu->cr8 = sregs->cr8;
3350
3351         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
3352 #ifdef __x86_64__
3353         __set_efer(vcpu, sregs->efer);
3354 #endif
3355         vcpu->apic_base = sregs->apic_base;
3356
3357         mmu_reset_needed |= guest_cr0() != sregs->cr0;
3358         vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
3359         update_exception_bitmap(vcpu);
3360         vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
3361         vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
3362
3363         mmu_reset_needed |= guest_cr4() != sregs->cr4;
3364         __set_cr4(vcpu, sregs->cr4);
3365
3366         if (mmu_reset_needed)
3367                 litevm_mmu_reset_context(vcpu);
3368         vcpu_put(vcpu);
3369
3370         print_func_exit();
3371         return 0;
3372 }
3373
3374 /*
3375  * Translate a guest virtual address to a guest physical address.
3376  */
3377 static int litevm_dev_ioctl_translate(struct litevm *litevm,
3378                                                                           struct litevm_translation *tr)
3379 {
3380         print_func_entry();
3381         unsigned long vaddr = tr->linear_address;
3382         struct litevm_vcpu *vcpu;
3383         gpa_t gpa;
3384
3385         vcpu = vcpu_load(litevm, tr->vcpu);
3386         if (!vcpu) {
3387                 print_func_exit();
3388                 return -ENOENT;
3389         }
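        /* Translate through the vcpu's current MMU context while holding
         * the VM lock; UNMAPPED_GVA marks an untranslatable address. */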
3390         spin_lock_irqsave(&litevm->lock);
3391         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
3392         tr->physical_address = gpa;
3393         tr->valid = gpa != UNMAPPED_GVA;
3394         tr->writeable = 1;
3395         tr->usermode = 0;
3396         spin_unlock_irqsave(&litevm->lock);
3397         vcpu_put(vcpu);
3398
3399         print_func_exit();
3400         return 0;
3401 }
3402
3403 #if 0
3404 static int litevm_dev_ioctl_interrupt(struct litevm *litevm,
3405                                                                           struct litevm_interrupt *irq)
3406 {
3407         struct litevm_vcpu *vcpu;
3408
3409         if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
3410                 return -EINVAL;
3411         if (irq->irq < 0 || irq->irq >= 256)
3412                 return -EINVAL;
3413         vcpu = vcpu_load(litevm, irq->vcpu);
3414         if (!vcpu)