VMM: Use safe MSR accessors
[akaros.git] / kern / arch / x86 / vmm / vmm.c
/* Copyright 2015 Google Inc.
 *
 * See LICENSE for details.
 */

/* We're not going to fall into the trap of only compiling support
 * for AMD OR Intel for an image. It all gets compiled in, and which
 * one you use depends on cpuinfo, not a compile-time
 * switch. That's proven to be the best strategy.  Conditionally
 * compiling in support is the path to hell.
 */
#include <assert.h>
#include <pmap.h>
#include <smp.h>
#include <kmalloc.h>

#include <ros/vmm.h>
#include "intel/vmx.h"
#include "vmm.h"
#include <trap.h>
#include <umem.h>

#include <arch/x86.h>
#include <ros/procinfo.h>


/* TODO: have better cpuid info storage and checks */
bool x86_supports_vmx = FALSE;

/* Figure out what kind of CPU we are on, and if it supports any reasonable
 * virtualization. For now, if we're not some sort of newer intel, don't
 * bother. This does all cores. Again, note, we make these decisions at
 * runtime, to avoid the problems that compile-time decisions can cause.
 * At this point, of course, it's still all intel.
 */
void vmm_init(void)
{
        int ret;
        /* Check first for intel capabilities. This is hence two back-to-back
         * implementation-dependent checks. That's ok, it's all MSR dependent.
         */
        ret = intel_vmm_init();
        if (!ret) {
                x86_supports_vmx = TRUE;
                return;
        }

        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_init failed, ret %d\n", ret);
        return;
}

void vmm_pcpu_init(void)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        pcpui->guest_pcoreid = -1;
        if (!x86_supports_vmx)
                return;
        if (!intel_vmm_pcpu_init()) {
                printd("vmm_pcpu_init worked\n");
                return;
        }
        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_pcpu_init failed\n");
}

/* Initializes a process to run virtual machine contexts, returning the number
 * initialized, optionally setting errno */
int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores,
                    struct vmm_gpcore_init *u_gpcis, int flags)
{
        struct vmm *vmm = &p->vmm;
        unsigned int i;
        struct vmm_gpcore_init gpci;

        if (flags & ~VMM_ALL_FLAGS) {
                set_errstr("%s: flags is 0x%lx, VMM_ALL_FLAGS is 0x%lx\n", __func__,
                           flags, VMM_ALL_FLAGS);
                set_errno(EINVAL);
                return 0;
        }
        vmm->flags = flags;
        if (!x86_supports_vmx) {
                set_errno(ENODEV);
                return 0;
        }
        qlock(&vmm->qlock);
        if (vmm->vmmcp) {
                set_errno(EINVAL);
                qunlock(&vmm->qlock);
                return 0;
        }
        /* Set this early, so cleanup checks the gpc array */
        vmm->vmmcp = TRUE;
        nr_guest_pcores = MIN(nr_guest_pcores, num_cores);
        vmm->amd = 0;
        vmm->guest_pcores = kzmalloc(sizeof(void *) * nr_guest_pcores,
                                     MEM_WAIT);
        for (i = 0; i < nr_guest_pcores; i++) {
                if (copy_from_user(&gpci, &u_gpcis[i],
                                   sizeof(struct vmm_gpcore_init))) {
                        set_error(EINVAL, "Bad pointer %p for gps", u_gpcis);
                        break;
                }
                vmm->guest_pcores[i] = create_guest_pcore(p, &gpci);
                /* If we failed, we'll clean it up when the process dies */
                if (!vmm->guest_pcores[i]) {
                        set_errno(ENOMEM);
                        break;
                }
        }
        vmm->nr_guest_pcores = i;
        for (int i = 0; i < VMM_VMEXIT_NR_TYPES; i++)
                vmm->vmexits[i] = 0;
        qunlock(&vmm->qlock);
        return i;
}
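
/* Illustrative sketch (not part of this file): a caller, e.g. the syscall path
 * that backs a VMM launcher, would typically compare the return value against
 * the requested count and consult errno on a shortfall.  The variable names
 * below are hypothetical.
 *
 *      int ret = vmm_struct_init(p, nr_gpcs, u_gpcis, flags);
 *
 *      if (ret != nr_gpcs) {
 *              // Partial init: 'ret' guest pcores exist; errno says why we stopped.
 *              return -1;
 *      }
 */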

/* Has no concurrency protection - only call this when you know you have the
 * only ref to vmm.  For instance, from __proc_free, where there is only one ref
 * to the proc (and thus proc.vmm). */
void __vmm_struct_cleanup(struct proc *p)
{
        struct vmm *vmm = &p->vmm;

        if (!vmm->vmmcp)
                return;
        for (int i = 0; i < vmm->nr_guest_pcores; i++) {
                if (vmm->guest_pcores[i])
                        destroy_guest_pcore(vmm->guest_pcores[i]);
        }
        kfree(vmm->guest_pcores);
        ept_flush(p->env_pgdir.eptp);
        vmm->vmmcp = FALSE;
}

int vmm_poke_guest(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        int pcoreid;

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc) {
                set_error(ENOENT, "Bad guest_pcoreid %d", guest_pcoreid);
                return -1;
        }
        /* We're doing an unlocked peek; it could change immediately.  This is a
         * best effort service. */
        pcoreid = ACCESS_ONCE(gpc->cpu);
        if (pcoreid == -1) {
                /* So we know that we'll miss the poke for the posted IRQ.  We could
                 * return an error.  However, error handling for this case isn't
                 * particularly helpful (yet).  The absence of the error does not mean
                 * the IRQ was posted.  We'll still return 0, meaning "the user didn't
                 * mess up; we tried." */
                return 0;
        }
        send_ipi(pcoreid, I_POKE_CORE);
        return 0;
}
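
/* Illustrative sketch (assumption, not in this file): given the best-effort
 * contract above, a caller that must guarantee delivery would re-check the
 * posted-interrupt state or simply re-poke later, rather than trusting the
 * return value.  'gpcoreid' below is hypothetical.
 *
 *      if (vmm_poke_guest(p, gpcoreid) < 0)
 *              return -1;      // bad gpcoreid: caller error
 *      // 0 only means "we tried"; the IRQ may still need another poke later.
 */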

struct guest_pcore *lookup_guest_pcore(struct proc *p, int guest_pcoreid)
{
        /* nr_guest_pcores is written once at setup and never changed */
        if (guest_pcoreid < 0 || guest_pcoreid >= p->vmm.nr_guest_pcores)
                return 0;
        return p->vmm.guest_pcores[guest_pcoreid];
}

struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc)
                return 0;
        assert(pcpui->guest_pcoreid == -1);
        spin_lock(&p->vmm.lock);
        if (gpc->cpu != -1) {
                spin_unlock(&p->vmm.lock);
                return 0;
        }
        gpc->cpu = core_id();
        spin_unlock(&p->vmm.lock);
        /* We've got dibs on the gpc; we don't need to hold the lock any longer. */
        pcpui->guest_pcoreid = guest_pcoreid;
        ept_sync_context(gpc_get_eptp(gpc));
        vmx_load_guest_pcore(gpc);
        /* Load guest's xcr0 */
        lxcr0(gpc->xcr0);
        return gpc;
}

void unload_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        assert(gpc);
        spin_lock(&p->vmm.lock);
        assert(gpc->cpu != -1);
        ept_sync_context(gpc_get_eptp(gpc));
        vmx_unload_guest_pcore(gpc);
        gpc->cpu = -1;

        /* Save guest's xcr0 and restore Akaros's default. */
        gpc->xcr0 = rxcr0();
        lxcr0(__proc_global_info.x86_default_xcr0);

        /* As soon as we unlock, this gpc can be started on another core */
        spin_unlock(&p->vmm.lock);
        pcpui->guest_pcoreid = -1;
}
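
/* Illustrative sketch (assumption, not in this file): the load/unload pair is
 * meant to bracket a guest run on the current physical core.  gpc->cpu is the
 * "dibs" flag: load takes it under p->vmm.lock, unload releases it.  The run
 * function name below is hypothetical.
 *
 *      gpc = load_guest_pcore(p, gpcoreid);
 *      if (!gpc)
 *              return -1;              // already loaded elsewhere, or bad id
 *      run_guest(gpc);                 // e.g. the vmenter/vmresume loop
 *      unload_guest_pcore(p, gpcoreid);
 */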

/* Emulated MSR.  For now: an MSR value and a pointer to a helper that
 * performs the requested operation. */
struct emmsr {
        uint32_t reg;
        char *name;
        bool (*f)(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                  uint64_t *rax, uint32_t opcode);
        bool written;
        uint32_t edx, eax;
};
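
/* Handler calling convention (a summary inferred from the emulation code
 * below, not a separate guarantee): *rcx holds the MSR index, *rdx:*rax hold
 * the high:low 32 bits of the value, and 'opcode' is VMM_MSR_EMU_READ or
 * VMM_MSR_EMU_WRITE.  A handler reassembles and splits values like this:
 *
 *      uint64_t val = (*rdx << 32) | (*rax & 0xffffffff);      // on WRMSR
 *      split_msr_val(val, &edx, &eax);                         // high, low
 */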

static bool emsr_miscenable(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                            uint64_t *rax, uint32_t opcode);
static bool emsr_mustmatch(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                           uint64_t *rax, uint32_t opcode);
static bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                          uint64_t *rax, uint32_t opcode);
static bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                          uint64_t *rax, uint32_t opcode);
static bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                           uint64_t *rax, uint32_t opcode);
static bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode);
static bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                               uint64_t *rax, uint32_t opcode);

struct emmsr emmsrs[] = {
        {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
        {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
        {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
        {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
        {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
        {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
        {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
        {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
         emsr_fakewrite},
        {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
         emsr_fakewrite},
        {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
        {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
        {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
        {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
        {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
        {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},

        // grumble.
        {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
        {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
        // louder.
        {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
        // aaaaaahhhhhhhhhhhhhhhhhhhhh
        {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
        {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
        {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_ok},
        // unsafe.
        {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase},

        // mostly harmless.
        {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
        {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},

        // TBD
        {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite},
};
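
/* To emulate another MSR, add a row with the register, its printable name, and
 * whichever handler's policy fits: emsr_ok passes through with the safe
 * accessors, emsr_fakewrite latches writes without touching hardware, and
 * emsr_readzero / emsr_readonly refuse writes.  A hypothetical example, not
 * present in this table:
 *
 *      {MSR_PLATFORM_INFO, "MSR_PLATFORM_INFO", emsr_readonly},
 */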

/* This may be the only register that needs special handling.  If there are
 * others, then we might want to extend the emmsr struct. */
bool emsr_miscenable(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                     uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        split_msr_val(val, &edx, &eax);
        /* We just let them read the misc MSR for now, but always report PEBS
         * as unavailable. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
                *rdx = edx;
                return TRUE;
        } else {
                /* If they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
        }
        printk("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
               msr->name, (uint32_t) *rdx, (uint32_t) *rax, edx, eax);
        return FALSE;
}

/* TODO: this looks like a copy-paste of the read side.  What's the purpose of
 * mustmatch?  No one even uses it. */
bool emsr_mustmatch(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        split_msr_val(val, &edx, &eax);
        /* Reads just return the current hardware value. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* If they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
        }
        printk("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
               msr->name, (uint32_t) *rdx, (uint32_t) *rax, edx, eax);
        return FALSE;
}

bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                   uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        split_msr_val(val, &edx, &eax);
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                   uint64_t *rax, uint32_t opcode)
{
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = 0;
                *rdx = 0;
                return TRUE;
        }

        printk("%s: Tried to write a read-as-zero register\n", msr->name);
        return FALSE;
}

/* Pretend to write it, but don't actually write the hardware MSR.  Writes are
 * latched in the emmsr, so later reads see them. */
bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (!msr->written) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
                split_msr_val(val, &edx, &eax);
        } else {
                edx = msr->edx;
                eax = msr->eax;
        }
        /* Reads return the hardware value until a write has been latched. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* If they are writing what is already there, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
                msr->edx = *rdx;
                msr->eax = *rax;
                msr->written = TRUE;
        }
        return TRUE;
}

bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
             uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (opcode == VMM_MSR_EMU_READ) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
                split_msr_val(val, &edx, &eax);
                *rax = eax;
                *rdx = edx;
        } else {
                val = (*rdx << 32) | (*rax & 0xffffffff);
                if (write_msr_safe(msr->reg, val))
                        return FALSE;
        }
        return TRUE;
}

/* Pretend to write the APIC base, but never touch the hardware MSR. */
bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                        uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;

        if (!msr->written) {
                /* TODO: tightly coupled to the addr in vmrunkernel.  We want this func
                 * to return the val that vmrunkernel put into the VMCS. */
                eax = 0xfee00900;
                edx = 0;
        } else {
                edx = msr->edx;
                eax = msr->eax;
        }
        /* Reads return the faked value. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* If they are writing what is already there, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
                msr->edx = *rdx;
                msr->eax = *rax;
                msr->written = TRUE;
        }
        return TRUE;
}

bool vmm_emulate_msr(uint64_t *rcx, uint64_t *rdx, uint64_t *rax, int op)
{
        for (int i = 0; i < ARRAY_SIZE(emmsrs); i++) {
                if (emmsrs[i].reg != *rcx)
                        continue;
                return emmsrs[i].f(&emmsrs[i], rcx, rdx, rax, op);
        }
        return FALSE;
}
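
/* Illustrative sketch (assumption: the actual exit-handler plumbing lives in
 * the VMX code, not in this file).  On an RDMSR/WRMSR vmexit, a handler would
 * copy the guest's rcx/rdx/rax into locals, call vmm_emulate_msr(), inject a
 * #GP if it returns FALSE, and otherwise write the (possibly updated)
 * registers back and advance the guest RIP.  The helper names below are
 * hypothetical.
 *
 *      uint64_t rcx = gpc_get_reg(gpc, REG_RCX);
 *      uint64_t rdx = gpc_get_reg(gpc, REG_RDX);
 *      uint64_t rax = gpc_get_reg(gpc, REG_RAX);
 *
 *      if (!vmm_emulate_msr(&rcx, &rdx, &rax, VMM_MSR_EMU_READ))
 *              inject_gp(gpc);         // unknown or refused MSR
 *      else
 *              gpc_set_regs(gpc, rcx, rdx, rax);
 */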