VMM: Remove unused code (XCC)
[akaros.git] / kern / arch / x86 / vmm / vmm.c
/* Copyright 2015 Google Inc.
 *
 * See LICENSE for details.
 */

/* We're not going to fall into the trap of only compiling support
 * for AMD OR Intel for an image. It all gets compiled in, and which
 * one you use depends on cpuinfo, not a compile-time switch. That's
 * proven to be the best strategy.  Conditionally compiling in support
 * is the path to hell.
 */
#include <assert.h>
#include <pmap.h>
#include <smp.h>
#include <kmalloc.h>

#include <ros/vmm.h>
#include "intel/vmx.h"
#include "vmm.h"
#include <trap.h>
#include <umem.h>

/* TODO: have better cpuid info storage and checks */
bool x86_supports_vmx = FALSE;

static void vmmcp_posted_handler(struct hw_trapframe *hw_tf, void *data);

/* Figure out what kind of CPU we are on, and if it supports any reasonable
 * virtualization. For now, if we're not some sort of newer intel, don't
 * bother. This does all cores. Again, note, we make these decisions at runtime,
 * to avoid getting into the problems that compile-time decisions can cause.
 * At this point, of course, it's still all intel.
 */
void vmm_init(void)
{
        int ret;
        /* Check first for intel capabilities. This is hence two back-to-back
         * implementation-dependent checks. That's ok, it's all msr dependent.
         */
        ret = intel_vmm_init();
        if (!ret) {
                printd("intel_vmm_init worked\n");

                // Register I_VMMCP_POSTED IRQ
                //register_irq(I_VMMCP_POSTED, vmmcp_posted_handler, NULL,
                //              MKBUS(BusLAPIC, 0, 0, 0));
                x86_supports_vmx = TRUE;
                return;
        }

        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_init failed, ret %d\n", ret);
        return;
}

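/* Stub handler for posted-interrupt IRQs; for now it just logs that it ran. */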
static void vmmcp_posted_handler(struct hw_trapframe *hw_tf, void *data)
{
        printk("%s\n", __func__);
}

void vmm_pcpu_init(void)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        pcpui->guest_pcoreid = -1;
        if (!x86_supports_vmx)
                return;
        if (!intel_vmm_pcpu_init()) {
                printd("vmm_pcpu_init worked\n");
                return;
        }
        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_pcpu_init failed\n");
}

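/* Posts an interrupt notification to the guest pcore described by v.  On
 * Intel, this hands off to vmx_interrupt_notify(); AMD is unsupported, so the
 * call fails with -1. */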
int vm_post_interrupt(struct vmctl *v)
{
        int vmx_interrupt_notify(struct vmctl *v);
        if (current->vmm.amd) {
                return -1;
        } else {
                return vmx_interrupt_notify(v);
        }
        return -1;
}

/* Initializes a process to run virtual machine contexts, returning the number
 * initialized, optionally setting errno */
int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores,
                    struct vmm_gpcore_init *u_gpcis, int flags)
{
        struct vmm *vmm = &p->vmm;
        unsigned int i;
        struct vmm_gpcore_init gpci;

        if (flags & ~VMM_ALL_FLAGS) {
                set_errstr("%s: flags is 0x%lx, VMM_ALL_FLAGS is 0x%lx\n", __func__,
                           flags, VMM_ALL_FLAGS);
                set_errno(EINVAL);
                return 0;
        }
        vmm->flags = flags;
        if (!x86_supports_vmx) {
                set_errno(ENODEV);
                return 0;
        }
        qlock(&vmm->qlock);
        if (vmm->vmmcp) {
                set_errno(EINVAL);
                qunlock(&vmm->qlock);
                return 0;
        }
        /* Set this early, so cleanup checks the gpc array */
        vmm->vmmcp = TRUE;
        nr_guest_pcores = MIN(nr_guest_pcores, num_cores);
        vmm->amd = 0;
        vmm->guest_pcores = kzmalloc(sizeof(void*) * nr_guest_pcores, KMALLOC_WAIT);
        for (i = 0; i < nr_guest_pcores; i++) {
                if (copy_from_user(&gpci, &u_gpcis[i],
                                   sizeof(struct vmm_gpcore_init))) {
                        set_error(EINVAL, "Bad pointer %p for gps", u_gpcis);
                        break;
                }
                vmm->guest_pcores[i] = vmx_create_vcpu(p, &gpci);
                /* If we failed, we'll clean it up when the process dies */
                if (!vmm->guest_pcores[i]) {
                        set_errno(ENOMEM);
                        break;
                }
        }
        vmm->nr_guest_pcores = i;
        for (int i = 0; i < VMM_VMEXIT_NR_TYPES; i++)
                vmm->vmexits[i] = 0;
        qunlock(&vmm->qlock);
        return i;
}

/* Has no concurrency protection - only call this when you know you have the
 * only ref to vmm.  For instance, from __proc_free, where there is only one ref
 * to the proc (and thus proc.vmm). */
void __vmm_struct_cleanup(struct proc *p)
{
        struct vmm *vmm = &p->vmm;
        if (!vmm->vmmcp)
                return;
        for (int i = 0; i < vmm->nr_guest_pcores; i++) {
                if (vmm->guest_pcores[i])
                        vmx_destroy_vcpu(vmm->guest_pcores[i]);
        }
        kfree(vmm->guest_pcores);
        ept_flush(p->env_pgdir.eptp);
        vmm->vmmcp = FALSE;
}

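/* Returns the guest pcore for guest_pcoreid, or 0 if the id is out of range. */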
struct vmx_vcpu *lookup_guest_pcore(struct proc *p, int guest_pcoreid)
{
        /* nr_guest_pcores is written once at setup and never changed */
        if (guest_pcoreid >= p->vmm.nr_guest_pcores)
                return 0;
        return p->vmm.guest_pcores[guest_pcoreid];
}

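/* Claims the guest pcore for this physical core and loads its VMCS.  gpc->cpu
 * records which physical core currently owns the gpc; it is set and cleared
 * under p->vmm.lock, so a gpc can be loaded on at most one core at a time.
 * Returns 0 if the gpc doesn't exist or is already loaded elsewhere. */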
struct vmx_vcpu *load_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct vmx_vcpu *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc)
                return 0;
        assert(pcpui->guest_pcoreid == -1);
        spin_lock(&p->vmm.lock);
        if (gpc->cpu != -1) {
                spin_unlock(&p->vmm.lock);
                return 0;
        }
        gpc->cpu = core_id();
        spin_unlock(&p->vmm.lock);
        /* We've got dibs on the gpc; we don't need to hold the lock any longer. */
        pcpui->guest_pcoreid = guest_pcoreid;
        ept_sync_context(vcpu_get_eptp(gpc));
        vmx_load_guest_pcore(gpc);
        return gpc;
}

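/* Unloads the guest pcore from this physical core and releases ownership, so
 * another core may load it. */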
void unload_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct vmx_vcpu *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        assert(gpc);
        spin_lock(&p->vmm.lock);
        assert(gpc->cpu != -1);
        ept_sync_context(vcpu_get_eptp(gpc));
        vmx_unload_guest_pcore(gpc);
        gpc->cpu = -1;
        /* As soon as we unlock, this gpc can be started on another core */
        spin_unlock(&p->vmm.lock);
        pcpui->guest_pcoreid = -1;
}

/* Emulated MSR.  For now, an MSR value and a pointer to a helper that
 * performs the requested operation.
 */
struct emmsr {
        uint32_t reg;
        char *name;
        bool (*f)(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                  uint64_t *rax, uint32_t opcode);
        bool written;
        uint32_t edx, eax;
};

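/* Handler convention: rcx points at the guest's MSR number, rdx and rax point
 * at the high and low halves of the MSR value, and opcode is VMM_MSR_EMU_READ
 * for reads (anything else is treated as a write).  Returning FALSE means the
 * access was not emulated. */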
static bool emsr_miscenable(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                            uint64_t *rax, uint32_t opcode);
static bool emsr_mustmatch(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                           uint64_t *rax, uint32_t opcode);
static bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                          uint64_t *rax, uint32_t opcode);
static bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                          uint64_t *rax, uint32_t opcode);
static bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                           uint64_t *rax, uint32_t opcode);
static bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode);
static bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                               uint64_t *rax, uint32_t opcode);

struct emmsr emmsrs[] = {
        {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
        {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
        {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
        {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
        {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
        {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
        {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
        {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
         emsr_fakewrite},
        {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
         emsr_fakewrite},
        {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
        {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
        {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
        {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
        {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
        {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},

        // grumble.
        {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
        {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
        // louder.
        {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
        // aaaaaahhhhhhhhhhhhhhhhhhhhh
        {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
        {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
        {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_ok},
        // unsafe.
        {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase},

        // mostly harmless.
        {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
        {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},

        // TBD
        {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite},
};

/* This may be the only register that needs special handling.  If there are
 * others, we might want to extend the emmsr struct.
 */
bool emsr_miscenable(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                     uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;

        rdmsr(msr->reg, eax, edx);
        /* we just let them read the misc msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
                *rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
        }
        printk("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
               msr->name, (uint32_t) *rdx, (uint32_t) *rax, edx, eax);
        return FALSE;
}

/* TODO: this looks like a copy-paste for the read side.  What's the purpose of
 * mustmatch?  No one even uses it. */
bool emsr_mustmatch(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;

        rdmsr(msr->reg, eax, edx);
        /* we just let them read the msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
        }
        printk("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
               msr->name, (uint32_t) *rdx, (uint32_t) *rax, edx, eax);
        return FALSE;
}

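/* Reads pass through to the real MSR (the one named in rcx); writes fail. */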
bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                   uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;

        rdmsr((uint32_t) *rcx, eax, edx);
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

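/* Reads always return 0; writes fail. */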
bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                   uint64_t *rax, uint32_t opcode)
{
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = 0;
                *rdx = 0;
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

/* Pretend to write it, but don't write it. */
bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;

        if (!msr->written) {
                rdmsr(msr->reg, eax, edx);
        } else {
                edx = msr->edx;
                eax = msr->eax;
        }
        /* Reads return the shadowed value, or the real MSR if never written. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
                msr->edx = *rdx;
                msr->eax = *rax;
                msr->written = TRUE;
        }
        return TRUE;
}

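/* Passes both reads and writes through to the real MSR. */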
bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
             uint64_t *rax, uint32_t opcode)
{
        if (opcode == VMM_MSR_EMU_READ) {
                rdmsr(msr->reg, *rax, *rdx);
        } else {
                uint64_t val = (uint64_t) *rdx << 32 | *rax;

                write_msr(msr->reg, val);
        }
        return TRUE;
}

/* Pretend to write the apic base, but don't actually write it.  Reads return
 * whatever the guest last wrote, or a canned default. */
bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                        uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;

        if (!msr->written) {
                //rdmsr(msr->reg, eax, edx);
                /* TODO: tightly coupled to the addr in vmrunkernel.  We want this func
                 * to return the val that vmrunkernel put into the VMCS. */
                eax = 0xfee00900;
                edx = 0;
        } else {
                edx = msr->edx;
                eax = msr->eax;
        }
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
                msr->edx = *rdx;
                msr->eax = *rax;
                msr->written = TRUE;
        }
        return TRUE;
}

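/* Dispatches an emulated MSR access: rcx holds the MSR number, and rdx:rax
 * hold (or receive) the high:low halves of the value.  Returns FALSE if we
 * don't emulate that MSR, in which case the caller decides how to fail the
 * access.  A rough sketch of the intended use (the real caller lives in the
 * vmx exit path; the register-save names below are illustrative only, not the
 * actual struct fields):
 *
 *	if (!vmm_emulate_msr(&regs->rcx, &regs->rdx, &regs->rax,
 *	                     VMM_MSR_EMU_READ))
 *		// e.g., inject a #GP or report the unhandled exit to userspace
 */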
bool vmm_emulate_msr(uint64_t *rcx, uint64_t *rdx, uint64_t *rax, int op)
{
        for (int i = 0; i < ARRAY_SIZE(emmsrs); i++) {
                if (emmsrs[i].reg != *rcx)
                        continue;
                return emmsrs[i].f(&emmsrs[i], rcx, rdx, rax, op);
        }
        return FALSE;
}