1aeb16fbd4d1877cf59e6ea1e1b358701de99b31
[akaros.git] / kern / arch / x86 / vmm / vmm.c
/* Copyright 2015 Google Inc.
 *
 * See LICENSE for details.
 */

/* We're not going to fall into the trap of only compiling support
 * for AMD OR Intel for an image. It all gets compiled in, and which
 * one you use depends on cpuinfo, not a compile-time switch. That's
 * proven to be the best strategy. Conditionally compiling in support
 * is the path to hell.
 */
#include <assert.h>
#include <pmap.h>
#include <smp.h>
#include <kmalloc.h>

#include <ros/vmm.h>
#include "intel/vmx.h"
#include "vmm.h"
#include <trap.h>
#include <umem.h>

#include <arch/x86.h>
#include <ros/procinfo.h>

/* TODO: have better cpuid info storage and checks */
bool x86_supports_vmx = FALSE;

/* Figure out what kind of CPU we are on, and if it supports any reasonable
 * virtualization. For now, if we're not some sort of newer Intel, don't
 * bother. This does all cores. Again, note that we make these decisions at
 * runtime, to avoid getting into the problems that compile-time decisions can
 * cause. At this point, of course, it's still all Intel.
 */
void vmm_init(void)
{
        int ret;

        /* Check first for Intel capabilities. This is hence two back-to-back
         * implementation-dependent checks. That's OK; it's all MSR dependent.
         */
        ret = intel_vmm_init();
        if (!ret) {
                x86_supports_vmx = TRUE;
                return;
        }

        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_init failed, ret %d\n", ret);
        return;
}

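/* Per-core VMM init, called on each core.  Marks this core as not running a
 * guest pcore, then does the Intel per-core setup if the CPU supports VMX. */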
void vmm_pcpu_init(void)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        pcpui->guest_pcoreid = -1;
        if (!x86_supports_vmx)
                return;
        if (!intel_vmm_pcpu_init()) {
                printd("vmm_pcpu_init worked\n");
                return;
        }
        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_pcpu_init failed\n");
}

/* Initializes a process to run virtual machine contexts, returning the number
 * initialized, throwing on error. */
int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores,
                    struct vmm_gpcore_init *u_gpcis)
{
        ERRSTACK(1);
        struct vmm *vmm = &p->vmm;
        struct vmm_gpcore_init gpci;

        if (!x86_supports_vmx)
                error(ENODEV, "This CPU does not support VMX");
        qlock(&vmm->qlock);
        if (waserror()) {
                qunlock(&vmm->qlock);
                nexterror();
        }

        /* TODO: just use an atomic test instead of all this locking stuff? */
        if (vmm->vmmcp)
                error(EAGAIN, "We're already running a vmmcp?");
        /* Set this early, so cleanup checks the gpc array */
        vmm->vmmcp = TRUE;
        vmm->amd = 0;
        vmx_setup_vmx_vmm(&vmm->vmx);
        nr_guest_pcores = MIN(nr_guest_pcores, num_cores);
        vmm->guest_pcores = kzmalloc(sizeof(void *) * nr_guest_pcores, MEM_WAIT);
        if (!vmm->guest_pcores)
                error(ENOMEM, "Allocation of vmm->guest_pcores failed");

        for (int i = 0; i < nr_guest_pcores; i++) {
                if (copy_from_user(&gpci, &u_gpcis[i], sizeof(struct vmm_gpcore_init)))
                        error(EINVAL, "Bad pointer %p for gps", u_gpcis);
                vmm->guest_pcores[i] = create_guest_pcore(p, &gpci);
                vmm->nr_guest_pcores = i + 1;
        }
        for (int i = 0; i < VMM_VMEXIT_NR_TYPES; i++)
                vmm->vmexits[i] = 0;
        qunlock(&vmm->qlock);
        poperror();
        return vmm->nr_guest_pcores;
}

/* Has no concurrency protection - only call this when you know you have the
 * only ref to vmm.  For instance, from __proc_free, where there is only one ref
 * to the proc (and thus proc.vmm). */
void __vmm_struct_cleanup(struct proc *p)
{
        struct vmm *vmm = &p->vmm;

        if (!vmm->vmmcp)
                return;
        for (int i = 0; i < vmm->nr_guest_pcores; i++) {
                if (vmm->guest_pcores[i])
                        destroy_guest_pcore(vmm->guest_pcores[i]);
        }
        kfree(vmm->guest_pcores);
        ept_flush(p->env_pgdir.eptp);
        vmm->vmmcp = FALSE;
}

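/* Pokes the physical core that is currently running guest_pcoreid, if any,
 * with an I_POKE_GUEST IPI.  This is best effort: the gpc can load or unload
 * at any moment, and returning 0 does not guarantee the poke landed. */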
int vmm_poke_guest(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        int pcoreid;

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc) {
                set_error(ENOENT, "Bad guest_pcoreid %d", guest_pcoreid);
                return -1;
        }
        /* We're doing an unlocked peek; it could change immediately.  This is a
         * best effort service. */
        pcoreid = ACCESS_ONCE(gpc->cpu);
        if (pcoreid == -1) {
                /* So we know that we'll miss the poke for the posted IRQ.  We could
                 * return an error.  However, error handling for this case isn't
                 * particularly helpful (yet).  The absence of the error does not mean
                 * the IRQ was posted.  We'll still return 0, meaning "the user didn't
                 * mess up; we tried." */
                return 0;
        }
        send_ipi(pcoreid, I_POKE_GUEST);
        return 0;
}

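/* Returns the guest_pcore for guest_pcoreid, or 0 if the id is out of range. */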
struct guest_pcore *lookup_guest_pcore(struct proc *p, int guest_pcoreid)
{
        /* nr_guest_pcores is written once at setup and never changed */
        if (guest_pcoreid >= p->vmm.nr_guest_pcores)
                return 0;
        return p->vmm.guest_pcores[guest_pcoreid];
}

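/* Claims guest_pcoreid for this physical core and loads its VMCS and MSR
 * state.  Returns 0 if the gpc doesn't exist or is already loaded on another
 * core. */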
struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc)
                return 0;
        assert(pcpui->guest_pcoreid == -1);
        spin_lock(&p->vmm.lock);
        if (gpc->cpu != -1) {
                spin_unlock(&p->vmm.lock);
                return 0;
        }
        gpc->cpu = core_id();
        spin_unlock(&p->vmm.lock);
        /* We've got dibs on the gpc; we don't need to hold the lock any longer. */
        pcpui->guest_pcoreid = guest_pcoreid;
        vmx_load_guest_pcore(gpc);
        /* Load guest's xcr0 */
        lxcr0(gpc->xcr0);

        /* Manual MSR save/restore */
        write_kern_gsbase(gpc->msr_kern_gs_base);
        if (gpc->msr_star != AKAROS_MSR_STAR)
                write_msr(MSR_STAR, gpc->msr_star);
        if (gpc->msr_lstar != AKAROS_MSR_LSTAR)
                write_msr(MSR_LSTAR, gpc->msr_lstar);
        if (gpc->msr_sfmask != AKAROS_MSR_SFMASK)
                write_msr(MSR_SFMASK, gpc->msr_sfmask);

        return gpc;
}

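/* Saves the guest pcore's VMCS and MSR state, restores Akaros's defaults, and
 * releases the gpc so another core can load it. */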
void unload_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        assert(gpc);
        spin_lock(&p->vmm.lock);
        assert(gpc->cpu != -1);
        vmx_unload_guest_pcore(gpc);
        gpc->cpu = -1;

        /* Save guest's xcr0 and restore Akaros's default. */
        gpc->xcr0 = rxcr0();
        lxcr0(__proc_global_info.x86_default_xcr0);

        /* We manage these MSRs manually. */
        gpc->msr_kern_gs_base = read_kern_gsbase();
        gpc->msr_star = read_msr(MSR_STAR);
        gpc->msr_lstar = read_msr(MSR_LSTAR);
        gpc->msr_sfmask = read_msr(MSR_SFMASK);

        write_kern_gsbase((uint64_t)pcpui);
        if (gpc->msr_star != AKAROS_MSR_STAR)
                write_msr(MSR_STAR, AKAROS_MSR_STAR);
        if (gpc->msr_lstar != AKAROS_MSR_LSTAR)
                write_msr(MSR_LSTAR, AKAROS_MSR_LSTAR);
        if (gpc->msr_sfmask != AKAROS_MSR_SFMASK)
                write_msr(MSR_SFMASK, AKAROS_MSR_SFMASK);

        /* As soon as we unlock, this gpc can be started on another core */
        spin_unlock(&p->vmm.lock);
        pcpui->guest_pcoreid = -1;
}

/* Emulated MSR.  For now, an MSR value and a pointer to a helper that
 * performs the requested operation.
 */
struct emmsr {
        uint32_t reg;
        char *name;
        bool (*f)(struct emmsr *msr, struct vm_trapframe *vm_tf,
                  uint32_t opcode);
        bool written;
        uint32_t edx, eax;
};

static bool emsr_miscenable(struct emmsr *msr, struct vm_trapframe *vm_tf,
                            uint32_t opcode);
static bool emsr_readonly(struct emmsr *msr, struct vm_trapframe *vm_tf,
                          uint32_t opcode);
static bool emsr_readzero(struct emmsr *msr, struct vm_trapframe *vm_tf,
                          uint32_t opcode);
static bool emsr_fakewrite(struct emmsr *msr, struct vm_trapframe *vm_tf,
                           uint32_t opcode);
static bool emsr_ok(struct emmsr *msr, struct vm_trapframe *vm_tf,
                    uint32_t opcode);
static bool emsr_fake_apicbase(struct emmsr *msr, struct vm_trapframe *vm_tf,
                               uint32_t opcode);

struct emmsr emmsrs[] = {
        {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
        {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
        {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
        {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
        {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
        {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
        {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
        {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
         emsr_fakewrite},
        {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
         emsr_fakewrite},
        {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
        {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
        {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
        {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
        {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
        {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},

        // grumble.
        {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
        {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
        // louder.
        {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
        // aaaaaahhhhhhhhhhhhhhhhhhhhh
        {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
        {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
        {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_readzero},
        // unsafe.
        {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase},

        // mostly harmless.
        {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
        {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},
        {MSR_IA32_MCG_CAP, "MSR_IA32_MCG_CAP", emsr_readzero},
        {MSR_IA32_DEBUGCTLMSR, "MSR_IA32_DEBUGCTLMSR", emsr_fakewrite},

        // TBD
        {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite},
};

/* This may be the only register that needs special handling.
 * If there are others, then we might want to extend the emmsr struct.
 */
bool emsr_miscenable(struct emmsr *msr, struct vm_trapframe *vm_tf,
                     uint32_t opcode)
{
        uint64_t val;
        uint32_t eax, edx;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        eax = low32(val);
        eax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
        edx = high32(val);
        /* we just let them read the misc msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = eax;
                vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) vm_tf->tf_rax == eax)
                    && ((uint32_t) vm_tf->tf_rdx == edx))
                        return TRUE;
        }
        printk("%s: Wanted to write 0x%x%x, but could not; value was 0x%x%x\n",
               msr->name, (uint32_t) vm_tf->tf_rdx, (uint32_t) vm_tf->tf_rax,
               edx, eax);
        return FALSE;
}

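/* Lets the guest read the real MSR value; attempted writes fail. */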
bool emsr_readonly(struct emmsr *msr, struct vm_trapframe *vm_tf,
                   uint32_t opcode)
{
        uint64_t val;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = low32(val);
                vm_tf->tf_rdx = high32(val);
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

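/* Reads return zero; attempted writes fail. */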
bool emsr_readzero(struct emmsr *msr, struct vm_trapframe *vm_tf,
                   uint32_t opcode)
{
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = 0;
                vm_tf->tf_rdx = 0;
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

/* pretend to write it, but don't write it. */
bool emsr_fakewrite(struct emmsr *msr, struct vm_trapframe *vm_tf,
                    uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (!msr->written) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
                eax = low32(val);
                edx = high32(val);
        } else {
                eax = msr->eax;
                edx = msr->edx;
        }
        /* Reads return the saved (or real) value; writes only update our copy. */
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = eax;
                vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
                msr->edx = vm_tf->tf_rdx;
                msr->eax = vm_tf->tf_rax;
                msr->written = TRUE;
        }
        return TRUE;
}

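/* Passes reads and writes straight through to the real MSR. */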
bool emsr_ok(struct emmsr *msr, struct vm_trapframe *vm_tf,
             uint32_t opcode)
{
        uint64_t val;

        if (opcode == VMM_MSR_EMU_READ) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
                vm_tf->tf_rax = low32(val);
                vm_tf->tf_rdx = high32(val);
        } else {
                val = (vm_tf->tf_rdx << 32) | (vm_tf->tf_rax & 0xffffffff);
                if (write_msr_safe(msr->reg, val))
                        return FALSE;
        }
        return TRUE;
}

/* pretend to write it, but don't write it. */
bool emsr_fake_apicbase(struct emmsr *msr, struct vm_trapframe *vm_tf,
                        uint32_t opcode)
{
        uint32_t eax, edx;

        if (!msr->written) {
                /* TODO: tightly coupled to the addr in vmrunkernel.  We want this func
                 * to return the val that vmrunkernel put into the VMCS. */
                eax = 0xfee00d00;
                if (vm_tf->tf_guest_pcoreid != 0) {
                        // Remove BSP bit if not core 0
                        eax = 0xfee00c00;
                }
                edx = 0;
        } else {
                edx = msr->edx;
                eax = msr->eax;
        }
        /* Reads return our saved (or default) value; writes only update our copy. */
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = eax;
                vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) vm_tf->tf_rax == eax)
                    && ((uint32_t) vm_tf->tf_rdx == edx))
                        return TRUE;
                msr->edx = vm_tf->tf_rdx;
                msr->eax = vm_tf->tf_rax;
                msr->written = TRUE;
        }
        return TRUE;
}

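/* Dispatches a guest rdmsr/wrmsr exit (MSR number in tf_rcx) to the matching
 * emulation handler.  Returns FALSE if we don't emulate that MSR or the
 * handler failed. */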
bool vmm_emulate_msr(struct vm_trapframe *vm_tf, int op)
{
        for (int i = 0; i < ARRAY_SIZE(emmsrs); i++) {
                if (emmsrs[i].reg != vm_tf->tf_rcx)
                        continue;
                return emmsrs[i].f(&emmsrs[i], vm_tf, op);
        }
        return FALSE;
}