/**
 *  vmx.c - The Intel VT-x driver for Dune
 *
 * This file is derived from Linux KVM VT-x support.
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Original Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This modified version is simpler because it avoids the following
 * features that are not requirements for Dune:
 *  * Real-mode emulation
 *  * Nested VT-x support
 *  * I/O hardware emulation
 *  * Any of the more esoteric X86 features and registers
 *  * KVM-specific functionality
 *
 * In essence we provide only the minimum functionality needed to run
 * a process in vmx non-root mode rather than the full hardware emulation
 * needed to support an entire OS.
 *
 * This driver is a research prototype and as such has the following
 * limitations:
 *
 * FIXME: Backward compatibility is currently a non-goal, and only recent
 * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
 * driver.
 *
 * FIXME: Eventually we should handle concurrent users of VT-x more
 * gracefully instead of requiring exclusive access. This would allow
 * Dune to interoperate with KVM and other HV solutions.
 *
 * FIXME: We need to support hotplugged physical CPUs.
 *
 * Authors:
 *   Adam Belay   <abelay@stanford.edu>
 */

/* Basic flow.
 * Yep, it's confusing. This is in part because the vmcs is used twice, for two
 * different things.  You're left with the feeling that they got part way
 * through and realized they had to have one for
 *
 * 1) your CPU is going to be capable of running VMs, and you need state for that.
 *
 * 2) you're about to start a guest, and you need state for that.
 *
 * So there is 'get the cpu set up to be able to run VMs' stuff, and 'now
 * let's start a guest' stuff.  In Akaros, CPUs will always be set up
 * to run a VM if that is possible. Processes can flip themselves into
 * a VM and that will require another VMCS.
 *
 * So: at kernel startup time, the SMP boot stuff calls
 * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
 * in the case of this file is intel_vmm_init. That runs code
 * that sets up state for ALL sockets, based on the capabilities of
 * the socket it runs on. If any cpu supports vmx, it assumes they all
 * do. That's a realistic assumption. So the call_function_all is kind
 * of stupid, really; it could just see what's on the current cpu and
 * assume it's on all. HOWEVER: there are systems in the wild that
 * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
 * might as well allow for the chance that we'll only allow VMMCPs on a
 * subset (not implemented yet, however).  So: probe all CPUs, get a
 * count of how many support VMX and, for now, assume they all do
 * anyway.
 *
 * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
 * which contains all the naughty-bits settings for all the cpus that can run a VM.
 * Realistically, all VMX-capable cpus in a system will have identical configurations.
 * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same configuration.
 *
 * Configure the msr_bitmap. This is the bitmap of MSRs which the
 * guest can manipulate.  Currently, we only allow GS and FS base.
 *
 * Reserve bit 0 in the vpid bitmap, as guests cannot use it.
 *
 * Set up what we call the vmxarea. The vmxarea is per-cpu, not
 * per-guest. Once set up, it is left alone.  The ONLY thing we set in
 * there is the revision id. The vmxarea is page-sized per cpu and
 * page-aligned. Note that it can be smaller, but why bother? We know
 * the max size and alignment, and it's convenient.
 *
 * Now that it is set up, enable vmx on all cpus. This involves
 * testing VMXE in cr4, to see if we've been here before (TODO: delete
 * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
 * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
 * instruction), and syncing vpids and epts.  Now the CPU is ready
 * to host guests.
 *
 * Setting up a guest.
 * We divide this into two things: vmm_proc_init and vm_run.
 * Currently, on Intel, vmm_proc_init does nothing.
 *
 * vm_run is really complicated. It is called with a coreid, rip, rsp,
 * cr3, and flags.  On intel, it calls vmx_launch. vmx_launch is set
 * up for a few test cases. If rip is 1, it sets the guest rip to
 * a function which will deref 0 and should exit with a failure. If rip is 0,
 * it runs an infinite loop in the guest.
 *
 * The sequence of operations:
 *   create a vcpu
 *   while (1) {
 *           get a vcpu
 *           disable irqs (required or you can't enter the VM)
 *           vmx_run_vcpu()
 *           enable irqs
 *           manage the vm exit
 *   }
 *
 * get a vcpu
 * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we
 * want, vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
 *
 * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call
 * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one
 * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
 * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done.
 *
 * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
 * of inline assembly with embedded CPP crap. I suspect we'll want to
 * un-inline it someday, but maybe not.  It's called with a vcpu
 * struct from which it loads guest state, and to which it stores
 * non-virtualized host state. It issues a vmlaunch or vmresume
 * instruction depending, and on return, it evaluates whether the
 * launch/resume had an error in that operation. Note this is NOT the
 * same as an error while in the virtual machine; this is an error in
 * startup due to misconfiguration. Depending on what is returned, it's
 * either a failed vm startup or an exit for any of many reasons.
 *
 */
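
/* A rough road map of the call graph described above, using the entry points
 * defined later in this file (a sketch for orientation, not a spec):
 *
 *   intel_vmm_init()          boot time: probe_cpu_vmx(), setup_vmcs_config(),
 *                             allocate the global msr_bitmap, ept_init()
 *   intel_vmm_pcpu_init()     per core: setup_vmxarea(), vmx_enable() (vmxon)
 *   vmx_create_vcpu(p)        per guest core: vmx_alloc_vmcs(), vmx_setup_vmcs(),
 *                             vmx_setup_initial_guest_state()
 *   vmx_launch(rip, rsp, cr3) the run loop: vmx_get_cpu(), disable_irq(),
 *                             vmx_run_vcpu(), enable_irq(), handle the exit,
 *                             vmx_put_cpu(), repeat until shutdown.
 */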

/* Basically: only rename those globals that might conflict
 * with existing names; leave all else the same.
 * This code is more modern than the other code, yet it still
 * seems well encapsulated.
 */
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <sys/queue.h>
#include <smp.h>
#include <kref.h>
#include <atomic.h>
#include <alarm.h>
#include <event.h>
#include <umem.h>
#include <bitops.h>
#include <arch/types.h>
#include <syscall.h>

#include "vmx.h"
#include "../vmm.h"

#include "cpufeature.h"

#define currentcpu (&per_cpu_info[core_id()])

/*
 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const uint32_t vmx_msr_index[] = {
#ifdef CONFIG_X86_64
        MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)

static unsigned long *msr_bitmap;

struct vmx_capability vmx_capability;
struct vmcs_config vmcs_config;

void ept_flush(uint64_t eptp)
{
        ept_sync_context(eptp);
}

static void vmcs_clear(struct vmcs *vmcs)
{
        uint64_t phys_addr = PADDR(vmcs);
        uint8_t error;

        asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
                      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
                      : "cc", "memory");
        if (error)
                printk("vmclear fail: %p/%llx\n",
                       vmcs, phys_addr);
}

static void vmcs_load(struct vmcs *vmcs)
{
        uint64_t phys_addr = PADDR(vmcs);
        uint8_t error;

        asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
                      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
                      : "cc", "memory");
        if (error)
                printk("vmptrld %p/%llx failed\n",
                       vmcs, phys_addr);
}

/* Returns the paddr of the current CPU's VMCS region, or -1 if none. */
static physaddr_t vmcs_get_current(void)
{
        physaddr_t vmcs_paddr;
        /* RAX contains the addr of the location to store the VMCS pointer.  The
         * compiler doesn't know the ASM will deref that pointer, hence the =m */
        asm volatile (ASM_VMX_VMPTRST_RAX : "=m"(vmcs_paddr) : "a"(&vmcs_paddr));
        return vmcs_paddr;
}

__always_inline unsigned long vmcs_readl(unsigned long field)
{
        unsigned long value;

        asm volatile (ASM_VMX_VMREAD_RDX_RAX
                      : "=a"(value) : "d"(field) : "cc");
        return value;
}

__always_inline uint16_t vmcs_read16(unsigned long field)
{
        return vmcs_readl(field);
}

static __always_inline uint32_t vmcs_read32(unsigned long field)
{
        return vmcs_readl(field);
}

static __always_inline uint64_t vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
        return vmcs_readl(field);
#else
        return vmcs_readl(field) | ((uint64_t)vmcs_readl(field+1) << 32);
#endif
}

void vmwrite_error(unsigned long field, unsigned long value)
{
        printk("vmwrite error: reg %lx value %lx (err %d)\n",
               field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

void vmcs_writel(unsigned long field, unsigned long value)
{
        uint8_t error;

        asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
                      : "=q"(error) : "a"(value), "d"(field) : "cc");
        if (error)
                vmwrite_error(field, value);
}

static void vmcs_write16(unsigned long field, uint16_t value)
{
        vmcs_writel(field, value);
}

static void vmcs_write32(unsigned long field, uint32_t value)
{
        vmcs_writel(field, value);
}

static void vmcs_write64(unsigned long field, uint64_t value)
{
        vmcs_writel(field, value);
}

static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt,
                               uint32_t msr, uint32_t *result)
{
        uint32_t vmx_msr_low, vmx_msr_high;
        uint32_t ctl = ctl_min | ctl_opt;
        uint64_t vmx_msr = read_msr(msr);
        vmx_msr_low = vmx_msr;
        vmx_msr_high = vmx_msr >> 32;

        ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
        ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

        /* Ensure minimum (required) set of control bits are supported. */
        if (ctl_min & ~ctl) {
                return -EIO;
        }

        *result = ctl;
        return 0;
}
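
/* Usage sketch (hypothetical min/opt values, just to illustrate the contract):
 * the low half of the capability MSR is "must be one", the high half is
 * "allowed to be one".  Required bits missing from the allowed set fail the
 * call; optional bits the CPU can't do are silently dropped.
 *
 *     uint32_t ctrl;
 *     uint32_t min = CPU_BASED_HLT_EXITING;
 *     uint32_t opt = CPU_BASED_USE_MSR_BITMAPS;
 *
 *     if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &ctrl) < 0)
 *             return;     // a required control is not supported
 *     // ctrl now holds min, any supported opt bits, and all must-be-one bits.
 */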

static bool allow_1_setting(uint32_t msr, uint32_t ctl)
{
        uint32_t vmx_msr_low, vmx_msr_high;

        rdmsr(msr, vmx_msr_low, vmx_msr_high);
        return vmx_msr_high & ctl;
}

static void setup_vmcs_config(void *p)
{
        int *ret = p;
        struct vmcs_config *vmcs_conf = &vmcs_config;
        uint32_t vmx_msr_low, vmx_msr_high;
        uint32_t min, opt, min2, opt2;
        uint32_t _pin_based_exec_control = 0;
        uint32_t _cpu_based_exec_control = 0;
        uint32_t _cpu_based_2nd_exec_control = 0;
        uint32_t _vmexit_control = 0;
        uint32_t _vmentry_control = 0;

        *ret = -EIO;
        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
        opt = PIN_BASED_VIRTUAL_NMIS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0) {
                return;
        }

        min =
              CPU_BASED_CR8_LOAD_EXITING |
              CPU_BASED_CR8_STORE_EXITING |
              CPU_BASED_CR3_LOAD_EXITING |
              CPU_BASED_CR3_STORE_EXITING |
              CPU_BASED_MOV_DR_EXITING |
              CPU_BASED_USE_TSC_OFFSETING |
              CPU_BASED_MWAIT_EXITING |
              CPU_BASED_MONITOR_EXITING |
              CPU_BASED_INVLPG_EXITING;

        min |= CPU_BASED_HLT_EXITING;

        opt = CPU_BASED_TPR_SHADOW |
              CPU_BASED_USE_MSR_BITMAPS |
              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
                                &_cpu_based_exec_control) < 0) {
                return;
        }

        if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
                                           ~CPU_BASED_CR8_STORE_EXITING;

        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
                min2 = SECONDARY_EXEC_ENABLE_EPT |
                       SECONDARY_EXEC_UNRESTRICTED_GUEST;
                opt2 = SECONDARY_EXEC_WBINVD_EXITING |
                       SECONDARY_EXEC_RDTSCP |
                       SECONDARY_EXEC_ENABLE_INVPCID;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0) {
                        return;
                }
        }

        if (!(_cpu_based_2nd_exec_control &
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;

        if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
                /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
                   is enabled */
                _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
                                             CPU_BASED_CR3_STORE_EXITING |
                                             CPU_BASED_INVLPG_EXITING);
                rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
                      vmx_capability.ept, vmx_capability.vpid);
        }

        min = 0;

        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;

//      opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
        opt = 0;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0) {
                return;
        }

        min = 0;
//      opt = VM_ENTRY_LOAD_IA32_PAT;
        opt = 0;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
                                &_vmentry_control) < 0) {
                return;
        }

        rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);

        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
        if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) {
                return;
        }

        /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
        if (vmx_msr_high & (1u<<16)) {
                printk("64-bit CPUs always have VMX_BASIC_MSR[48]==0. FAILS!\n");
                return;
        }

        /* Require Write-Back (WB) memory type for VMCS accesses. */
        if (((vmx_msr_high >> 18) & 15) != 6) {
                printk("NO WB!\n");
                return;
        }

        vmcs_conf->size = vmx_msr_high & 0x1fff;
        vmcs_conf->order = LOG2_UP(nr_pages(vmcs_config.size));
        vmcs_conf->revision_id = vmx_msr_low;
        printk("vmcs_conf size %d order %d rev %d\n",
               vmcs_conf->size, vmcs_conf->order,
               vmcs_conf->revision_id);

        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
        vmcs_conf->vmexit_ctrl         = _vmexit_control;
        vmcs_conf->vmentry_ctrl        = _vmentry_control;

        vmx_capability.has_load_efer =
                allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
                                VM_ENTRY_LOAD_IA32_EFER)
                && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
                                   VM_EXIT_LOAD_IA32_EFER);

        /* Now that we've done all the setup we can do, verify
         * that we have all the capabilities we need. These tests
         * are done last presumably because all the work done above
         * affects some of them.
         */

        if (!vmx_capability.has_load_efer) {
                printk("CPU lacks ability to load EFER register\n");
                return;
        }

        *ret = 0;
}

static struct vmcs *__vmx_alloc_vmcs(int node)
{
        struct vmcs *vmcs;

        vmcs = get_cont_pages_node(node, vmcs_config.order, KMALLOC_WAIT);
        if (!vmcs)
                return 0;
        memset(vmcs, 0, vmcs_config.size);
        vmcs->revision_id = vmcs_config.revision_id;    /* vmcs revision id */
        printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
        return vmcs;
}

/**
 * vmx_alloc_vmcs - allocates a VMCS region
 *
 * NOTE: Assumes the new region will be used by the current CPU.
 *
 * Returns a valid VMCS region.
 */
static struct vmcs *vmx_alloc_vmcs(void)
{
        return __vmx_alloc_vmcs(node_id());
}

/**
 * vmx_free_vmcs - frees a VMCS region (currently a no-op; the free is
 * commented out below)
 */
static void vmx_free_vmcs(struct vmcs *vmcs)
{
        //free_pages((unsigned long)vmcs, vmcs_config.order);
}

/*
 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
 * will not change in the lifetime of the guest.
 * Note that host-state that does change is set elsewhere. E.g., host-state
 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
 */
static void vmx_setup_constant_host_state(void)
{
        uint32_t low32, high32;
        unsigned long tmpl;
        pseudodesc_t dt;

        vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);  /* 22.2.3 */
        vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
        vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */

        vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
        vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
        vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
        vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
        vmcs_write16(HOST_TR_SELECTOR, GD_TSS);  /* 22.2.4 */

        native_store_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.pd_base);   /* 22.2.4 */

        asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
        vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */

        rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
        vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
        rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
        vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */

        rdmsr(MSR_EFER, low32, high32);
        vmcs_write32(HOST_IA32_EFER, low32);

        if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
                rdmsr(MSR_IA32_CR_PAT, low32, high32);
                vmcs_write64(HOST_IA32_PAT, low32 | ((uint64_t) high32 << 32));
        }

        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */

        /* TODO: This (at least gs) is per cpu */
        rdmsrl(MSR_FS_BASE, tmpl);
        vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
        rdmsrl(MSR_GS_BASE, tmpl);
        vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
}

static inline uint16_t vmx_read_ldt(void)
{
        uint16_t ldt;
        asm("sldt %0" : "=g"(ldt));
        return ldt;
}

static unsigned long segment_base(uint16_t selector)
{
        pseudodesc_t *gdt = &currentcpu->host_gdt;
        struct desc_struct *d;
        unsigned long table_base;
        unsigned long v;

        if (!(selector & ~3)) {
                return 0;
        }

        table_base = gdt->pd_base;

        if (selector & 4) {           /* from ldt */
                uint16_t ldt_selector = vmx_read_ldt();

                if (!(ldt_selector & ~3)) {
                        return 0;
                }

                table_base = segment_base(ldt_selector);
        }
        d = (struct desc_struct *)(table_base + (selector & ~7));
        v = get_desc_base(d);
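        /* Note (added for clarity): in long mode, LDT and TSS descriptors
         * (type 2 = LDT, 9 = available 64-bit TSS, 11 = busy 64-bit TSS) are
         * expanded to 16 bytes, so bits 63:32 of the base live in the second
         * half of the descriptor (base3 below). */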
#ifdef CONFIG_X86_64
        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
        return v;
}

static inline unsigned long vmx_read_tr_base(void)
{
        uint16_t tr;
        asm("str %0" : "=g"(tr));
        return segment_base(tr);
}

static void __vmx_setup_cpu(void)
{
        pseudodesc_t *gdt = &currentcpu->host_gdt;
        unsigned long sysenter_esp;
        unsigned long tmpl;

        /*
         * Linux uses per-cpu TSS and GDT, so set these when switching
         * processors.
         */
        vmcs_writel(HOST_TR_BASE, vmx_read_tr_base()); /* 22.2.4 */
        vmcs_writel(HOST_GDTR_BASE, gdt->pd_base);   /* 22.2.4 */

        rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
        vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */

        rdmsrl(MSR_FS_BASE, tmpl);
        vmcs_writel(HOST_FS_BASE, tmpl); /* 22.2.4 */
        rdmsrl(MSR_GS_BASE, tmpl);
        vmcs_writel(HOST_GS_BASE, tmpl); /* 22.2.4 */
}

/**
 * vmx_get_cpu - called before using a cpu
 * @vcpu: VCPU that will be loaded.
 *
 * Disables preemption. Call vmx_put_cpu() when finished.
 */
static void vmx_get_cpu(struct vmx_vcpu *vcpu)
{
        int cur_cpu = core_id();
        handler_wrapper_t *w;

        if (currentcpu->local_vcpu)
                panic("get_cpu: currentcpu->localvcpu was non-NULL");
        if (currentcpu->local_vcpu != vcpu) {
                currentcpu->local_vcpu = vcpu;

                if (vcpu->cpu != cur_cpu) {
                        if (vcpu->cpu >= 0) {
                                panic("vcpu->cpu is not -1, it's %d\n", vcpu->cpu);
                        } else
                                vmcs_clear(vcpu->vmcs);

                        ept_sync_context(vcpu_get_eptp(vcpu));

                        vcpu->launched = 0;
                        vmcs_load(vcpu->vmcs);
                        __vmx_setup_cpu();
                        vcpu->cpu = cur_cpu;
                } else {
                        vmcs_load(vcpu->vmcs);
                }
        }
}

/**
 * vmx_put_cpu - called after using a cpu
 * @vcpu: VCPU that was loaded.
 */
static void vmx_put_cpu(struct vmx_vcpu *vcpu)
{
        if (core_id() != vcpu->cpu)
                panic("%s: core_id() %d != vcpu->cpu %d\n",
                      __func__, core_id(), vcpu->cpu);

        if (currentcpu->local_vcpu != vcpu)
                panic("vmx_put_cpu: asked to clear something not ours");

        ept_sync_context(vcpu_get_eptp(vcpu));
        vmcs_clear(vcpu->vmcs);
        vcpu->cpu = -1;
        currentcpu->local_vcpu = NULL;
        //put_cpu();
}

static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr)
{
        struct vmx_vcpu *vcpu = ptr;

        ept_sync_context(vcpu_get_eptp(vcpu));
}

struct sync_addr_args {
        struct vmx_vcpu *vcpu;
        gpa_t gpa;
};

static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr)
{
        struct sync_addr_args *args = ptr;

//      ept_sync_individual_addr(

}

/**
 * vmx_ept_sync_vcpu - used to evict everything in the EPT
 * @vcpu: the vcpu
 */
void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu)
{
        handler_wrapper_t *w;

        smp_call_function_single(vcpu->cpu,
                __vmx_sync_helper, (void *) vcpu, &w);

        if (smp_call_wait(w)) {
                printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
        }
}

/**
 * vmx_ept_sync_individual_addr - used to evict an individual address
 * @vcpu: the vcpu
 * @gpa: the guest-physical address
 */
void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa)
{
        struct sync_addr_args args;
        handler_wrapper_t *w;

        args.vcpu = vcpu;
        args.gpa = gpa;

        smp_call_function_single(vcpu->cpu,
                                 __vmx_sync_individual_addr_helper, (void *) &args, &w);

        if (smp_call_wait(w)) {
                printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
        }
}

/**
 * vmx_dump_cpu - prints the CPU state
 * @vcpu: VCPU to print
 */
static void vmx_dump_cpu(struct vmx_vcpu *vcpu)
{
        unsigned long flags;

        vmx_get_cpu(vcpu);
        vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
        vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
        flags = vmcs_readl(GUEST_RFLAGS);
        vmx_put_cpu(vcpu);

        printk("--- Begin VCPU Dump ---\n");
        printk("CPU %d VPID %d\n", vcpu->cpu, 0);
        printk("RIP 0x%016lx RFLAGS 0x%08lx\n",
               vcpu->regs.tf_rip, flags);
        printk("RAX 0x%016lx RCX 0x%016lx\n",
                vcpu->regs.tf_rax, vcpu->regs.tf_rcx);
        printk("RDX 0x%016lx RBX 0x%016lx\n",
                vcpu->regs.tf_rdx, vcpu->regs.tf_rbx);
        printk("RSP 0x%016lx RBP 0x%016lx\n",
                vcpu->regs.tf_rsp, vcpu->regs.tf_rbp);
        printk("RSI 0x%016lx RDI 0x%016lx\n",
                vcpu->regs.tf_rsi, vcpu->regs.tf_rdi);
        printk("R8  0x%016lx R9  0x%016lx\n",
                vcpu->regs.tf_r8, vcpu->regs.tf_r9);
        printk("R10 0x%016lx R11 0x%016lx\n",
                vcpu->regs.tf_r10, vcpu->regs.tf_r11);
        printk("R12 0x%016lx R13 0x%016lx\n",
                vcpu->regs.tf_r12, vcpu->regs.tf_r13);
        printk("R14 0x%016lx R15 0x%016lx\n",
                vcpu->regs.tf_r14, vcpu->regs.tf_r15);
        printk("--- End VCPU Dump ---\n");
}

uint64_t construct_eptp(physaddr_t root_hpa)
{
        uint64_t eptp;

        /* set WB memory and 4 levels of walk.  we checked these in ept_init */
        eptp = VMX_EPT_MEM_TYPE_WB |
               (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT);
        if (cpu_has_vmx_ept_ad_bits())
                eptp |= VMX_EPT_AD_ENABLE_BIT;
        eptp |= (root_hpa & PAGE_MASK);

        return eptp;
}
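
/* For reference (a worked example following the SDM's EPTP layout, not
 * verified on any particular machine): bits 2:0 hold the memory type (6 = WB),
 * bits 5:3 hold the page-walk length minus one (3 for a 4-level walk), and
 * bit 6 enables accessed/dirty tracking.  So for a root table at hpa
 * 0x12345000, construct_eptp() yields 0x1234501e, or 0x1234505e with A/D
 * bits enabled. */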

/**
 * vmx_setup_initial_guest_state - configures the initial state of guest registers
 */
static void vmx_setup_initial_guest_state(void)
{
        unsigned long tmpl;
        unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
                            X86_CR4_PGE | X86_CR4_OSFXSR;
        uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
#if 0
        do we need it
        if (boot_cpu_has(X86_FEATURE_PCID))
                cr4 |= X86_CR4_PCIDE;
        if (boot_cpu_has(X86_FEATURE_OSXSAVE))
                cr4 |= X86_CR4_OSXSAVE;
#endif
        /* we almost certainly have this */
        /* we'll go sour if we don't. */
        if (1) //boot_cpu_has(X86_FEATURE_FSGSBASE))
                cr4 |= X86_CR4_RDWRGSFS;

        /* configure control and data registers */
        vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
                               X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
        vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
                                     X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
        vmcs_writel(GUEST_CR3, rcr3());
        vmcs_writel(GUEST_CR4, cr4);
        vmcs_writel(CR4_READ_SHADOW, cr4);
        vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
                                     EFER_SCE | EFER_FFXSR);
        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_writel(GUEST_GDTR_LIMIT, 0);
        vmcs_writel(GUEST_IDTR_BASE, 0);
        vmcs_writel(GUEST_IDTR_LIMIT, 0);
        vmcs_writel(GUEST_RIP, 0xdeadbeef);
        vmcs_writel(GUEST_RSP, 0xdeadbeef);
        vmcs_writel(GUEST_RFLAGS, 0x02);
        vmcs_writel(GUEST_DR7, 0);

        /* guest segment bases */
        vmcs_writel(GUEST_CS_BASE, 0);
        vmcs_writel(GUEST_DS_BASE, 0);
        vmcs_writel(GUEST_ES_BASE, 0);
        vmcs_writel(GUEST_GS_BASE, 0);
        vmcs_writel(GUEST_SS_BASE, 0);
        rdmsrl(MSR_FS_BASE, tmpl);
        vmcs_writel(GUEST_FS_BASE, tmpl);

        /* guest segment access rights */
        vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
        vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
        vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
        vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
        vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
        vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
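        /* Decoding the magic values above (per the VMCS access-rights format;
         * added as a reading aid): 0xA09B = present, DPL 0, S=1, type 0xB
         * (execute/read code, accessed), L=1 (64-bit), G=1; 0xA093 is the
         * same but type 0x3 (read/write data, accessed). */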

        /* guest segment limits */
        vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
        vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
        vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
        vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
        vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
        vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);

        /* configure segment selectors */
        vmcs_write16(GUEST_CS_SELECTOR, 0);
        vmcs_write16(GUEST_DS_SELECTOR, 0);
        vmcs_write16(GUEST_ES_SELECTOR, 0);
        vmcs_write16(GUEST_FS_SELECTOR, 0);
        vmcs_write16(GUEST_GS_SELECTOR, 0);
        vmcs_write16(GUEST_SS_SELECTOR, 0);
        vmcs_write16(GUEST_TR_SELECTOR, 0);

        /* guest LDTR */
        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
        vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
        vmcs_writel(GUEST_LDTR_BASE, 0);
        vmcs_writel(GUEST_LDTR_LIMIT, 0);

        /* guest TSS */
        vmcs_writel(GUEST_TR_BASE, 0);
        vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
        vmcs_writel(GUEST_TR_LIMIT, 0xff);

        /* initialize sysenter */
        vmcs_write32(GUEST_SYSENTER_CS, 0);
        vmcs_writel(GUEST_SYSENTER_ESP, 0);
        vmcs_writel(GUEST_SYSENTER_EIP, 0);

        /* other random initialization */
        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
}

static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
{
        int f = sizeof(unsigned long);
        /*
         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
         * have the write-low and read-high bitmap offsets the wrong way round.
         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
         */
        if (msr <= 0x1fff) {
                __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
                __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
                msr &= 0x1fff;
                __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
                __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
        }
}
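
/* Worked example (for orientation; offsets as in the function above):
 * MSR_GS_BASE is 0xc0000101, so it falls in the "high" range.  Masking with
 * 0x1fff leaves bit index 0x101, which gets cleared in the read-high bitmap
 * at byte offset 0x400 and in the write-high bitmap at offset 0xc00, letting
 * the guest read and write GS.base without a VM exit. */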

static void setup_msr(struct vmx_vcpu *vcpu)
{
        int set[] = { MSR_LSTAR };
        struct vmx_msr_entry *e;
        int sz = sizeof(set) / sizeof(*set);
        int i;

        //BUILD_BUG_ON(sz > NR_AUTOLOAD_MSRS);

        vcpu->msr_autoload.nr = sz;

        /* XXX enable only MSRs in set */
        vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));

        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vcpu->msr_autoload.nr);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vcpu->msr_autoload.nr);

        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.host));
        vmcs_write64(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->msr_autoload.guest));
        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->msr_autoload.guest));

        for (i = 0; i < sz; i++) {
                uint64_t val;

                e = &vcpu->msr_autoload.host[i];
                e->index = set[i];
                __vmx_disable_intercept_for_msr(msr_bitmap, e->index);
                rdmsrl(e->index, val);
                e->value = val;

                e = &vcpu->msr_autoload.guest[i];
                e->index = set[i];
                e->value = 0xDEADBEEF;
        }
}
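
/* A note on the autoload lists set up above (VMX MSR load/store semantics,
 * summarized here as a reading aid): on VM exit the CPU stores the guest's
 * values of the listed MSRs into msr_autoload.guest and loads the host's
 * values from msr_autoload.host; on VM entry it loads the guest's values
 * back from msr_autoload.guest.  Here only MSR_LSTAR is handled this way,
 * seeded with 0xDEADBEEF until the guest writes it. */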

/**
 *  vmx_setup_vmcs - configures the vmcs with starting parameters
 */
static void vmx_setup_vmcs(struct vmx_vcpu *vcpu)
{
        vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
                vmcs_config.pin_based_exec_ctrl);

        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
                vmcs_config.cpu_based_exec_ctrl);

        if (cpu_has_secondary_exec_ctrls()) {
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
                             vmcs_config.cpu_based_2nd_exec_ctrl);
        }

        vmcs_write64(EPT_POINTER, vcpu_get_eptp(vcpu));

        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

        setup_msr(vcpu);
#if 0
        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
                uint32_t msr_low, msr_high;
                uint64_t host_pat;
                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
                host_pat = msr_low | ((uint64_t) msr_high << 32);
                /* Write the default value follow host pat */
                vmcs_write64(GUEST_IA32_PAT, host_pat);
                /* Keep arch.pat sync with GUEST_IA32_PAT */
                vmx->vcpu.arch.pat = host_pat;
        }
#endif
#if 0
        for (int i = 0; i < NR_VMX_MSR; ++i) {
                uint32_t index = vmx_msr_index[i];
                uint32_t data_low, data_high;
                int j = vmx->nmsrs;
                // TODO we should have read/writemsr_safe
#if 0
                if (rdmsr_safe(index, &data_low, &data_high) < 0)
                        continue;
                if (wrmsr_safe(index, data_low, data_high) < 0)
                        continue;
#endif
                vmx->guest_msrs[j].index = i;
                vmx->guest_msrs[j].data = 0;
                vmx->guest_msrs[j].mask = -1ull;
                ++vmx->nmsrs;
        }
#endif

        vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;

        vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
        vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);

        vmcs_writel(CR0_GUEST_HOST_MASK, ~0ul);
        vmcs_writel(CR4_GUEST_HOST_MASK, ~0ul);

        //kvm_write_tsc(&vmx->vcpu, 0);
        vmcs_writel(TSC_OFFSET, 0);

        vmx_setup_constant_host_state();
}

/**
 * vmx_create_vcpu - allocates and initializes a new virtual cpu
 *
 * Returns: A new VCPU structure
 */
struct vmx_vcpu *vmx_create_vcpu(struct proc *p)
{
        struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
        if (!vcpu) {
                return NULL;
        }

        memset(vcpu, 0, sizeof(*vcpu));

        vcpu->proc = p; /* uncounted (weak) reference */
        vcpu->vmcs = vmx_alloc_vmcs();
        printd("%d: vcpu->vmcs is %p\n", core_id(), vcpu->vmcs);
        if (!vcpu->vmcs)
                goto fail_vmcs;

        vcpu->cpu = -1;

        vmx_get_cpu(vcpu);
        vmx_setup_vmcs(vcpu);
        vmx_setup_initial_guest_state();
        vmx_put_cpu(vcpu);

        return vcpu;

fail_vmcs:
        kfree(vcpu);
        return NULL;
}

/**
 * vmx_destroy_vcpu - destroys and frees an existing virtual cpu
 * @vcpu: the VCPU to destroy
 */
void vmx_destroy_vcpu(struct vmx_vcpu *vcpu)
{
        vmx_free_vmcs(vcpu->vmcs);
        kfree(vcpu);
}

/**
 * vmx_current_vcpu - returns a pointer to the vcpu for the current task.
 *
 * In the contexts where this is used the vcpu pointer should never be NULL.
 */
static inline struct vmx_vcpu *vmx_current_vcpu(void)
{
        struct vmx_vcpu *vcpu = currentcpu->local_vcpu;
        if (!vcpu)
                panic("Core has no vcpu!");
        return vcpu;
}

/**
 * vmx_run_vcpu - launches the CPU into non-root mode
 * We ONLY support 64-bit guests.
 * @vcpu: the vmx instance to launch
 */
static int vmx_run_vcpu(struct vmx_vcpu *vcpu)
{
        asm(
                /* Store host registers */
                "push %%rdx; push %%rbp;"
                "push %%rcx \n\t" /* placeholder for guest rcx */
                "push %%rcx \n\t"
                "cmp %%rsp, %c[host_rsp](%0) \n\t"
                "je 1f \n\t"
                "mov %%rsp, %c[host_rsp](%0) \n\t"
                ASM_VMX_VMWRITE_RSP_RDX "\n\t"
                "1: \n\t"
                /* Reload cr2 if changed */
                "mov %c[cr2](%0), %%rax \n\t"
                "mov %%cr2, %%rdx \n\t"
                "cmp %%rax, %%rdx \n\t"
                "je 2f \n\t"
                "mov %%rax, %%cr2 \n\t"
                "2: \n\t"
                /* Check if vmlaunch or vmresume is needed */
                "cmpl $0, %c[launched](%0) \n\t"
                /* Load guest registers.  Don't clobber flags. */
                "mov %c[rax](%0), %%rax \n\t"
                "mov %c[rbx](%0), %%rbx \n\t"
                "mov %c[rdx](%0), %%rdx \n\t"
                "mov %c[rsi](%0), %%rsi \n\t"
                "mov %c[rdi](%0), %%rdi \n\t"
                "mov %c[rbp](%0), %%rbp \n\t"
                "mov %c[r8](%0),  %%r8  \n\t"
                "mov %c[r9](%0),  %%r9  \n\t"
                "mov %c[r10](%0), %%r10 \n\t"
                "mov %c[r11](%0), %%r11 \n\t"
                "mov %c[r12](%0), %%r12 \n\t"
                "mov %c[r13](%0), %%r13 \n\t"
                "mov %c[r14](%0), %%r14 \n\t"
                "mov %c[r15](%0), %%r15 \n\t"
                "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (ecx) */

                /* Enter guest mode */
                "jne .Llaunched \n\t"
                ASM_VMX_VMLAUNCH "\n\t"
                "jmp .Lkvm_vmx_return \n\t"
                ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
                ".Lkvm_vmx_return: "
                /* Save guest registers, load host registers, keep flags */
                "mov %0, %c[wordsize](%%rsp) \n\t"
                "pop %0 \n\t"
                "mov %%rax, %c[rax](%0) \n\t"
                "mov %%rbx, %c[rbx](%0) \n\t"
                "popq %c[rcx](%0) \n\t"
                "mov %%rdx, %c[rdx](%0) \n\t"
                "mov %%rsi, %c[rsi](%0) \n\t"
                "mov %%rdi, %c[rdi](%0) \n\t"
                "mov %%rbp, %c[rbp](%0) \n\t"
                "mov %%r8,  %c[r8](%0) \n\t"
                "mov %%r9,  %c[r9](%0) \n\t"
                "mov %%r10, %c[r10](%0) \n\t"
                "mov %%r11, %c[r11](%0) \n\t"
                "mov %%r12, %c[r12](%0) \n\t"
                "mov %%r13, %c[r13](%0) \n\t"
                "mov %%r14, %c[r14](%0) \n\t"
                "mov %%r15, %c[r15](%0) \n\t"
                "mov %%rax, %%r10 \n\t"
                "mov %%rdx, %%r11 \n\t"

                "mov %%cr2, %%rax   \n\t"
                "mov %%rax, %c[cr2](%0) \n\t"

                "pop  %%rbp; pop  %%rdx \n\t"
                "setbe %c[fail](%0) \n\t"
                "mov $" STRINGIFY(GD_UD) ", %%rax \n\t"
                "mov %%rax, %%ds \n\t"
                "mov %%rax, %%es \n\t"
              : : "c"(vcpu), "d"((unsigned long)HOST_RSP),
                [launched]"i"(offsetof(struct vmx_vcpu, launched)),
                [fail]"i"(offsetof(struct vmx_vcpu, fail)),
                [host_rsp]"i"(offsetof(struct vmx_vcpu, host_rsp)),
                [rax]"i"(offsetof(struct vmx_vcpu, regs.tf_rax)),
                [rbx]"i"(offsetof(struct vmx_vcpu, regs.tf_rbx)),
                [rcx]"i"(offsetof(struct vmx_vcpu, regs.tf_rcx)),
                [rdx]"i"(offsetof(struct vmx_vcpu, regs.tf_rdx)),
                [rsi]"i"(offsetof(struct vmx_vcpu, regs.tf_rsi)),
                [rdi]"i"(offsetof(struct vmx_vcpu, regs.tf_rdi)),
                [rbp]"i"(offsetof(struct vmx_vcpu, regs.tf_rbp)),
                [r8]"i"(offsetof(struct vmx_vcpu, regs.tf_r8)),
                [r9]"i"(offsetof(struct vmx_vcpu, regs.tf_r9)),
                [r10]"i"(offsetof(struct vmx_vcpu, regs.tf_r10)),
                [r11]"i"(offsetof(struct vmx_vcpu, regs.tf_r11)),
                [r12]"i"(offsetof(struct vmx_vcpu, regs.tf_r12)),
                [r13]"i"(offsetof(struct vmx_vcpu, regs.tf_r13)),
                [r14]"i"(offsetof(struct vmx_vcpu, regs.tf_r14)),
                [r15]"i"(offsetof(struct vmx_vcpu, regs.tf_r15)),
                [cr2]"i"(offsetof(struct vmx_vcpu, cr2)),
                [wordsize]"i"(sizeof(unsigned long))
              : "cc", "memory"
                , "rax", "rbx", "rdi", "rsi"
                , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
        );

        vcpu->regs.tf_rip = vmcs_readl(GUEST_RIP);
        vcpu->regs.tf_rsp = vmcs_readl(GUEST_RSP);
        printk("RETURN. ip %016lx sp %016lx cr2 %016lx\n",
               vcpu->regs.tf_rip, vcpu->regs.tf_rsp, vcpu->cr2);
        /* FIXME: do we need to set up other flags? */
        vcpu->regs.tf_rflags = (vmcs_readl(GUEST_RFLAGS) & 0xFF) |
                      X86_EFLAGS_IF | 0x2;

        vcpu->regs.tf_cs = GD_UT;
        vcpu->regs.tf_ss = GD_UD;

        vcpu->launched = 1;

        if (vcpu->fail) {
                printk("failure detected (err %x)\n",
                       vmcs_read32(VM_INSTRUCTION_ERROR));
                return VMX_EXIT_REASONS_FAILED_VMENTRY;
        }

        return vmcs_read32(VM_EXIT_REASON);

#if 0
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        vmx_complete_atomic_exit(vmx);
        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);
#endif
}

static void vmx_step_instruction(void)
{
        vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
                               vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
}

static int vmx_handle_ept_violation(struct vmx_vcpu *vcpu)
{
        unsigned long gva, gpa;
        int exit_qual, ret = -1;
        page_t *page;

        vmx_get_cpu(vcpu);
        exit_qual = vmcs_read32(EXIT_QUALIFICATION);
        gva = vmcs_readl(GUEST_LINEAR_ADDRESS);
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        printk("ept: gva %016lx, gpa %016lx\n", gva, gpa);

        vmx_put_cpu(vcpu);

        // this is a total hack, for testing things.
        // note that we only care about the gpa, and the
        // gpa is our process virtual address.
        // Confused yet?
        page = page_lookup(current->env_pgdir, (void *)gpa, NULL);
        printk("Lookup %p returns %p\n", gpa, page);
        if (page) {
                uint64_t hpa = page2pa(page);
                printk("hpa for %p is %p\n", gpa, hpa);
                ret = vmx_do_ept_fault(vcpu->proc->env_pgdir.epte, gpa, hpa, exit_qual);
                printk("vmx_do_ept_fault returns %d\n", ret);
        }

        if (ret) {
                printk("page fault failure "
                       "GPA: 0x%lx, GVA: 0x%lx\n",
                       gpa, gva);
                vmx_dump_cpu(vcpu);
        }

        return ret;
}

static void vmx_handle_cpuid(struct vmx_vcpu *vcpu)
{
        unsigned int eax, ebx, ecx, edx;

        eax = vcpu->regs.tf_rax;
        ecx = vcpu->regs.tf_rcx;
        cpuid(0, 2, &eax, &ebx, &ecx, &edx);
        vcpu->regs.tf_rax = eax;
        vcpu->regs.tf_rbx = ebx;
        vcpu->regs.tf_rcx = ecx;
        vcpu->regs.tf_rdx = edx;
}

static int vmx_handle_nmi_exception(struct vmx_vcpu *vcpu)
{
        uint32_t intr_info;

        vmx_get_cpu(vcpu);
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
        vmx_put_cpu(vcpu);

        printk("vmx (vcpu %p): got an exception\n", vcpu);
        printk("vmx (vcpu %p): pid %d\n", vcpu, vcpu->proc->pid);
        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
                return 0;
        }

        printk("unhandled nmi, intr_info %x\n", intr_info);
        return -EIO;
}

static void noop(void) {
        __asm__ __volatile__ ("1: jmp 1b");
}

static void fail(void) {
        __asm__ __volatile__ ("movq $0xdeadbeef, %rbx; movq 0, %rax");
}

static unsigned long stack[512];
/**
 * vmx_launch - the main loop for a VMX Dune process
 * @rip, @rsp, @cr3: the initial guest register state
 */
int vmx_launch(uint64_t rip, uint64_t rsp, uint64_t cr3)
{
        int ret;
        struct vmx_vcpu *vcpu;
        int i = 0;
        int errors = 0;

        if (rip < 4096) {
                // testing.
                switch(rip) {
                default:
                        rip = (uint64_t)noop + 4;
                        break;
                case 1:
                        rip = (uint64_t)fail + 4;
                        break;
                }
        }

        if (cr3 == 0) {
                cr3 = rcr3();
        }

        /* sanity checking.  -- later
        ret = ept_check_page(ept, rip);
        if (ret) {
                printk("0x%x is not mapped in the ept!\n", rip);
                errors++;
        }
        ret = ept_check_page(ept, rsp);
        if (ret) {
                printk("0x%x is not mapped in the ept!\n", rsp);
                errors++;
        }
        */
        if (errors) {
                return -EINVAL;
        }

        printk("RUNNING: %s: rip %p rsp %p cr3 %p \n",
               __func__, rip, rsp, cr3);
        /* TODO: dirty hack til we have VMM contexts */
        vcpu = current->vmm.guest_pcores[0];
        if (!vcpu) {
                printk("Failed to get a CPU!\n");
                return -ENOMEM;
        }

        vmx_get_cpu(vcpu);
        vmcs_writel(GUEST_RIP, rip);
        vmcs_writel(GUEST_RSP, rsp);
        vmcs_writel(GUEST_CR3, cr3);
        vmx_put_cpu(vcpu);

        vcpu->ret_code = -1;

        while (1) {
                vmx_get_cpu(vcpu);

                // TODO: manage the fpu when we restart.

                // TODO: see if we need to exit before we go much further.
                disable_irq();
                ret = vmx_run_vcpu(vcpu);
                enable_irq();
                vmx_put_cpu(vcpu);

                if (ret == EXIT_REASON_VMCALL) {
                        vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
                        printk("system call! WTF\n");
                } else if (ret == EXIT_REASON_CPUID)
                        vmx_handle_cpuid(vcpu);
                else if (ret == EXIT_REASON_EPT_VIOLATION) {
                        if (vmx_handle_ept_violation(vcpu))
                                vcpu->shutdown = SHUTDOWN_EPT_VIOLATION;
                } else if (ret == EXIT_REASON_EXCEPTION_NMI) {
                        if (vmx_handle_nmi_exception(vcpu))
                                vcpu->shutdown = SHUTDOWN_NMI_EXCEPTION;
                } else if (ret == EXIT_REASON_EXTERNAL_INTERRUPT) {
                        printk("External interrupt\n");
                } else {
                        printk("unhandled exit: reason %x, exit qualification %x\n",
                               ret, vmcs_read32(EXIT_QUALIFICATION));
                        vmx_dump_cpu(vcpu);
                        vcpu->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
                }

                /* TODO: we can't just return and relaunch the VMCS, in case we blocked.
                 * similar to how proc_restartcore/smp_idle only restart the pcpui
                 * cur_ctx, we need to do the same, via the VMCS resume business. */

                if (vcpu->shutdown)
                        break;
        }

        printk("RETURN. ip %016lx sp %016lx\n",
                vcpu->regs.tf_rip, vcpu->regs.tf_rsp);

        /*
         * Return both the reason for the shutdown and a status value.
         * The exit() and exit_group() system calls only need 8 bits for
         * the status but we allow 16 bits in case we might want to
         * return more information for one of the other shutdown reasons.
         */
        ret = (vcpu->shutdown << 16) | (vcpu->ret_code & 0xffff);

        return ret;
}

/**
 * __vmx_enable - low-level enable of VMX mode on the current CPU
 * @vmxon_buf: an opaque buffer for use as the VMXON region
 */
static int __vmx_enable(struct vmcs *vmxon_buf)
{
        uint64_t phys_addr = PADDR(vmxon_buf);
        uint64_t old, test_bits;

        if (rcr4() & X86_CR4_VMXE) {
                panic("Should never have this happen");
                return -EBUSY;
        }

        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);

        test_bits = FEATURE_CONTROL_LOCKED;
        test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

        if (0) // tboot_enabled())
                test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;

        if ((old & test_bits) != test_bits) {
                /* If it's locked, then trying to set it will cause a GPF.
                 * No Dune for you!
                 */
                if (old & FEATURE_CONTROL_LOCKED) {
                        printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
                        return -1;
                }

                /* enable and lock */
                write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
        }
        lcr4(rcr4() | X86_CR4_VMXE);

        __vmxon(phys_addr);
        vpid_sync_vcpu_global();        /* good idea, even if we aren't using vpids */
        ept_sync_global();

        return 0;
}

/**
 * vmx_enable - enables VMX mode on the current CPU
 *
 * Sets up necessary state for enable (e.g. a scratchpad for VMXON.)
 */
static void vmx_enable(void)
{
        struct vmcs *vmxon_buf = currentcpu->vmxarea;
        int ret;

        ret = __vmx_enable(vmxon_buf);
        if (ret)
                goto failed;

        currentcpu->vmx_enabled = 1;
        // TODO: do we need this?
        store_gdt(&currentcpu->host_gdt);

        printk("VMX enabled on CPU %d\n", core_id());
        return;

failed:
        printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
}

/**
 * vmx_disable - disables VMX mode on the current CPU
 */
static void vmx_disable(void *unused)
{
        if (currentcpu->vmx_enabled) {
                __vmxoff();
                lcr4(rcr4() & ~X86_CR4_VMXE);
                currentcpu->vmx_enabled = 0;
        }
}

/* Probe the cpu to see if it can do vmx.
 * Returns TRUE if it can, FALSE otherwise.
 */
static bool probe_cpu_vmx(void)
{
        /* The best way to test this code is:
         * wrmsr -p <cpu> 0x3a 1
         * This will lock vmx off; then modprobe dune.
         * Frequently, however, systems have all 0x3a registers set to 5,
         * meaning testing is impossible, as vmx cannot be disabled.
         * We have to simulate it being unavailable in most cases.
         * The 'test' variable provides an easy way to simulate
         * unavailability of vmx on some, none, or all cpus.
         */
        if (!cpu_has_vmx()) {
                printk("Machine does not support VT-x\n");
                return FALSE;
        } else {
                printk("Machine supports VT-x\n");
                return TRUE;
        }
}

static void setup_vmxarea(void)
{
        struct vmcs *vmxon_buf;

        printd("Set up vmxarea for cpu %d\n", core_id());
        vmxon_buf = __vmx_alloc_vmcs(node_id());
        if (!vmxon_buf) {
                printk("setup_vmxarea failed on node %d\n", core_id());
                return;
        }
        currentcpu->vmxarea = vmxon_buf;
}

static int ept_init(void)
{
        if (!cpu_has_vmx_ept()) {
                printk("VMX doesn't support EPT!\n");
                return -1;
        }
        if (!cpu_has_vmx_eptp_writeback()) {
                printk("VMX EPT doesn't support WB memory!\n");
                return -1;
        }
        if (!cpu_has_vmx_ept_4levels()) {
                printk("VMX EPT doesn't support 4 level walks!\n");
                return -1;
        }
        switch (arch_max_jumbo_page_shift()) {
                case PML3_SHIFT:
                        if (!cpu_has_vmx_ept_1g_page()) {
                                printk("VMX EPT doesn't support 1 GB pages!\n");
                                return -1;
                        }
                        break;
                case PML2_SHIFT:
                        if (!cpu_has_vmx_ept_2m_page()) {
                                printk("VMX EPT doesn't support 2 MB pages!\n");
                                return -1;
                        }
                        break;
                default:
                        printk("Unexpected jumbo page size %d\n",
                               arch_max_jumbo_page_shift());
                        return -1;
        }
        if (!cpu_has_vmx_ept_ad_bits()) {
                printk("VMX EPT doesn't support accessed/dirty!\n");
                /* TODO: set the pmap_ops accordingly */
        }
        if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
                printk("VMX EPT can't invalidate PTEs/TLBs!\n");
                return -1;
        }

        return 0;
}

/**
 * intel_vmm_init sets up physical core data areas that are required to run a vm at all.
 * These data areas are not connected to a specific user process in any way. Instead,
 * they are in some sense externalizing what would otherwise be a very large ball of
 * state that would be inside the CPU.
 */
int intel_vmm_init(void)
{
        int r, cpu, ret;

        if (!probe_cpu_vmx()) {
                return -EOPNOTSUPP;
        }

        setup_vmcs_config(&ret);

        if (ret) {
                printk("setup_vmcs_config failed: %d\n", ret);
                return ret;
        }

        msr_bitmap = (unsigned long *)kpage_zalloc_addr();
        if (!msr_bitmap) {
                printk("Could not allocate msr_bitmap\n");
                return -ENOMEM;
        }
        /* FIXME: do we need APIC virtualization (flexpriority?) */

        memset(msr_bitmap, 0xff, PAGE_SIZE);
        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
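        /* All-ones means "intercept every MSR access"; the two calls above
         * then punch read/write pass-through holes for FS.base and GS.base,
         * matching the note in the header comment that those are the only
         * MSRs the guest may touch directly. */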

        if ((ret = ept_init())) {
                printk("EPT init failed, %d\n", ret);
                return ret;
        }
        printk("VMX setup succeeded\n");
        return 0;
}

int intel_vmm_pcpu_init(void)
{
        setup_vmxarea();
        vmx_enable();
        return 0;
}