VMM: Manually save/restore certain registers [1/2]
[akaros.git] kern/arch/x86/vmm/vmm.c
/* Copyright 2015 Google Inc.
 *
 * See LICENSE for details.
 */

/* We're not going to fall into the trap of only compiling support
 * for AMD OR Intel for an image. It all gets compiled in, and which
 * one you use depends on cpuinfo, not a compile-time
 * switch. That's proven to be the best strategy.  Conditionally
 * compiling in support is the path to hell.
 */
#include <assert.h>
#include <pmap.h>
#include <smp.h>
#include <kmalloc.h>

#include <ros/vmm.h>
#include "intel/vmx.h"
#include "vmm.h"
#include <trap.h>
#include <umem.h>

#include <arch/x86.h>
#include <ros/procinfo.h>

/* TODO: have better cpuid info storage and checks */
bool x86_supports_vmx = FALSE;

/* Figure out what kind of CPU we are on, and if it supports any reasonable
 * virtualization. For now, if we're not some sort of newer intel, don't
 * bother. This does all cores. Again, note, we make these decisions at runtime,
 * to avoid getting into the problems that compile-time decisions can cause.
 * At this point, of course, it's still all intel.
 */
void vmm_init(void)
{
        int ret;
        /* Check first for intel capabilities. This is hence two back-to-back
         * implementation-dependent checks. That's ok, it's all msr dependent.
         */
        ret = intel_vmm_init();
        if (!ret) {
                x86_supports_vmx = TRUE;
                return;
        }

        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_init failed, ret %d\n", ret);
        return;
}

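/* Per-core setup: mark the core as not running a guest pcore and, if VMX is
 * supported, do the Intel per-core init. */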
void vmm_pcpu_init(void)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        pcpui->guest_pcoreid = -1;
        if (!x86_supports_vmx)
                return;
        if (!intel_vmm_pcpu_init()) {
                printd("vmm_pcpu_init worked\n");
                return;
        }
        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_pcpu_init failed\n");
}

/* Initializes a process to run virtual machine contexts, returning the number
 * initialized, throwing on error. */
int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores,
                    struct vmm_gpcore_init *u_gpcis, int flags)
{
        ERRSTACK(1);
        struct vmm *vmm = &p->vmm;
        struct vmm_gpcore_init gpci;

        if (flags & ~VMM_ALL_FLAGS)
                error(EINVAL, "%s: flags is 0x%lx, VMM_ALL_FLAGS is 0x%lx\n", __func__,
                      flags, VMM_ALL_FLAGS);
        vmm->flags = flags;
        if (!x86_supports_vmx)
                error(ENODEV, "This CPU does not support VMX");
        qlock(&vmm->qlock);
        if (waserror()) {
                qunlock(&vmm->qlock);
                nexterror();
        }

        /* TODO: just use an atomic test instead of all this locking stuff? */
        if (vmm->vmmcp)
                error(EAGAIN, "We're already running a vmmcp?");
        /* Set this early, so cleanup checks the gpc array */
        vmm->vmmcp = TRUE;
        nr_guest_pcores = MIN(nr_guest_pcores, num_cores);
        vmm->amd = 0;
        vmm->guest_pcores = kzmalloc(sizeof(void *) * nr_guest_pcores, MEM_WAIT);
        if (!vmm->guest_pcores)
                error(ENOMEM, "Allocation of vmm->guest_pcores failed");

        for (int i = 0; i < nr_guest_pcores; i++) {
                if (copy_from_user(&gpci, &u_gpcis[i], sizeof(struct vmm_gpcore_init)))
                        error(EINVAL, "Bad pointer %p for gps", u_gpcis);
                vmm->guest_pcores[i] = create_guest_pcore(p, &gpci);
                vmm->nr_guest_pcores = i + 1;
        }
        for (int i = 0; i < VMM_VMEXIT_NR_TYPES; i++)
                vmm->vmexits[i] = 0;
        qunlock(&vmm->qlock);
        poperror();
        return vmm->nr_guest_pcores;
}

/* Has no concurrency protection - only call this when you know you have the
 * only ref to vmm.  For instance, from __proc_free, where there is only one ref
 * to the proc (and thus proc.vmm). */
void __vmm_struct_cleanup(struct proc *p)
{
        struct vmm *vmm = &p->vmm;

        if (!vmm->vmmcp)
                return;
        for (int i = 0; i < vmm->nr_guest_pcores; i++) {
                if (vmm->guest_pcores[i])
                        destroy_guest_pcore(vmm->guest_pcores[i]);
        }
        kfree(vmm->guest_pcores);
        ept_flush(p->env_pgdir.eptp);
        vmm->vmmcp = FALSE;
}

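/* Pokes the physical core currently running guest_pcoreid, if any.  Returns -1
 * with an error set if the guest pcore doesn't exist.  Otherwise returns 0,
 * even if the gpc wasn't loaded anywhere and no IPI was sent (best effort). */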
int vmm_poke_guest(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        int pcoreid;

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc) {
                set_error(ENOENT, "Bad guest_pcoreid %d", guest_pcoreid);
                return -1;
        }
        /* We're doing an unlocked peek; it could change immediately.  This is a
         * best effort service. */
        pcoreid = ACCESS_ONCE(gpc->cpu);
        if (pcoreid == -1) {
                /* So we know that we'll miss the poke for the posted IRQ.  We could
                 * return an error.  However, error handling for this case isn't
                 * particularly helpful (yet).  The absence of the error does not mean
                 * the IRQ was posted.  We'll still return 0, meaning "the user didn't
                 * mess up; we tried." */
                return 0;
        }
        send_ipi(pcoreid, I_POKE_CORE);
        return 0;
}

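/* Returns the guest pcore for guest_pcoreid, or 0 if the id is out of range. */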
struct guest_pcore *lookup_guest_pcore(struct proc *p, int guest_pcoreid)
{
        /* nr_guest_pcores is written once at setup and never changed */
        if (guest_pcoreid >= p->vmm.nr_guest_pcores)
                return 0;
        return p->vmm.guest_pcores[guest_pcoreid];
}

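/* Claims the guest pcore for this core and loads its context: the VMCS, the
 * guest's xcr0, and the manually managed MSRs.  Returns 0 if the gpc doesn't
 * exist or is already loaded on some core. */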
struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc)
                return 0;
        assert(pcpui->guest_pcoreid == -1);
        spin_lock(&p->vmm.lock);
        if (gpc->cpu != -1) {
                spin_unlock(&p->vmm.lock);
                return 0;
        }
        gpc->cpu = core_id();
        spin_unlock(&p->vmm.lock);
        /* We've got dibs on the gpc; we don't need to hold the lock any longer. */
        pcpui->guest_pcoreid = guest_pcoreid;
        ept_sync_context(gpc_get_eptp(gpc));
        vmx_load_guest_pcore(gpc);
        /* Load guest's xcr0 */
        lxcr0(gpc->xcr0);

        /* Manual MSR save/restore */
        write_kern_gsbase(gpc->msr_kern_gs_base);
        if (gpc->msr_star != AKAROS_MSR_STAR)
                write_msr(MSR_STAR, gpc->msr_star);
        if (gpc->msr_lstar != AKAROS_MSR_LSTAR)
                write_msr(MSR_LSTAR, gpc->msr_lstar);
        if (gpc->msr_sfmask != AKAROS_MSR_SFMASK)
                write_msr(MSR_SFMASK, gpc->msr_sfmask);

        return gpc;
}

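/* Unloads the guest pcore from this core: saves its VMCS state, xcr0, and
 * manually managed MSRs, then restores Akaros's defaults.  Once the lock is
 * dropped, the gpc can be loaded on another core. */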
void unload_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        assert(gpc);
        spin_lock(&p->vmm.lock);
        assert(gpc->cpu != -1);
        ept_sync_context(gpc_get_eptp(gpc));
        vmx_unload_guest_pcore(gpc);
        gpc->cpu = -1;

        /* Save guest's xcr0 and restore Akaros's default. */
        gpc->xcr0 = rxcr0();
        lxcr0(__proc_global_info.x86_default_xcr0);

        /* We manage these MSRs manually. */
        gpc->msr_kern_gs_base = read_kern_gsbase();
        gpc->msr_star = read_msr(MSR_STAR);
        gpc->msr_lstar = read_msr(MSR_LSTAR);
        gpc->msr_sfmask = read_msr(MSR_SFMASK);

        write_kern_gsbase((uint64_t)pcpui);
        if (gpc->msr_star != AKAROS_MSR_STAR)
                write_msr(MSR_STAR, AKAROS_MSR_STAR);
        if (gpc->msr_lstar != AKAROS_MSR_LSTAR)
                write_msr(MSR_LSTAR, AKAROS_MSR_LSTAR);
        if (gpc->msr_sfmask != AKAROS_MSR_SFMASK)
                write_msr(MSR_SFMASK, AKAROS_MSR_SFMASK);

        /* As soon as we unlock, this gpc can be started on another core */
        spin_unlock(&p->vmm.lock);
        pcpui->guest_pcoreid = -1;
}

/* emulated msr. For now, an msr value and a pointer to a helper that
 * performs the requested operation.
 */
struct emmsr {
        uint32_t reg;
        char *name;
        bool (*f)(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                  uint64_t *rax, uint32_t opcode);
        bool written;
        uint32_t edx, eax;
};

static bool emsr_miscenable(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                            uint64_t *rax, uint32_t opcode);
static bool emsr_mustmatch(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                           uint64_t *rax, uint32_t opcode);
static bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                          uint64_t *rax, uint32_t opcode);
static bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                          uint64_t *rax, uint32_t opcode);
static bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                           uint64_t *rax, uint32_t opcode);
static bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode);
static bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                               uint64_t *rax, uint32_t opcode);

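/* The MSRs we emulate; vmm_emulate_msr() matches on emmsr.reg and dispatches
 * to the helper in each entry. */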
struct emmsr emmsrs[] = {
        {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
        {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
        {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
        {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
        {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
        {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
        {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
        {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
         emsr_fakewrite},
        {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
         emsr_fakewrite},
        {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
        {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
        {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
        {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
        {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
        {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},

        // grumble.
        {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
        {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
        // louder.
        {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
        // aaaaaahhhhhhhhhhhhhhhhhhhhh
        {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
        {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
        {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_readzero},
        // unsafe.
        {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase},

        // mostly harmless.
        {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
        {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},
        {MSR_IA32_MCG_CAP, "MSR_IA32_MCG_CAP", emsr_readzero},
        {MSR_IA32_DEBUGCTLMSR, "MSR_IA32_DEBUGCTLMSR", emsr_fakewrite},

        // TBD
        {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite},
};

/* This may be the only register that needs special handling.
 * If there are others, we might want to extend the emmsr struct.
 */
bool emsr_miscenable(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                     uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        split_msr_val(val, &edx, &eax);
        /* we just let them read the misc msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
                *rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                eax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
        }
        printk("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
               msr->name, (uint32_t) *rdx, (uint32_t) *rax, edx, eax);
        return FALSE;
}

/* TODO: this looks like a copy-paste of the read side.  What's the purpose of
 * mustmatch?  No one even uses it. */
bool emsr_mustmatch(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        split_msr_val(val, &edx, &eax);
        /* we just let them read the msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
        }
        printk("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
               msr->name, (uint32_t) *rdx, (uint32_t) *rax, edx, eax);
        return FALSE;
}

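/* Passes reads through to the underlying MSR; rejects writes. */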
bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                   uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        split_msr_val(val, &edx, &eax);
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

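/* Reads as zero; rejects writes. */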
bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                   uint64_t *rax, uint32_t opcode)
{
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = 0;
                *rdx = 0;
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

/* pretend to write it, but don't write it. */
bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                    uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (!msr->written) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
                split_msr_val(val, &edx, &eax);
        } else {
                edx = msr->edx;
                eax = msr->eax;
        }
        /* we just let them read the msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
                msr->edx = *rdx;
                msr->eax = *rax;
                msr->written = TRUE;
        }
        return TRUE;
}

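/* Passes both reads and writes through to the underlying MSR. */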
bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
             uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (opcode == VMM_MSR_EMU_READ) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
                split_msr_val(val, &edx, &eax);
                *rax = eax;
                *rdx = edx;
        } else {
                val = (*rdx << 32) | (*rax & 0xffffffff);
                if (write_msr_safe(msr->reg, val))
                        return FALSE;
        }
        return TRUE;
}

/* Pretend to write the APIC base MSR, but never write the real one. */
bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
                        uint64_t *rax, uint32_t opcode)
{
        uint32_t eax, edx;

        if (!msr->written) {
                /* TODO: tightly coupled to the addr in vmrunkernel.  We want this func
                 * to return the val that vmrunkernel put into the VMCS. */
                eax = 0xfee00900;
                edx = 0;
        } else {
                edx = msr->edx;
                eax = msr->eax;
        }
        /* we just let them read the msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
                *rax = eax;
                *rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
                        return TRUE;
                msr->edx = *rdx;
                msr->eax = *rax;
                msr->written = TRUE;
        }
        return TRUE;
}

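/* Emulates the rdmsr/wrmsr (selected by op) for the MSR number in *rcx, using
 * the emmsrs table.  Returns FALSE if we don't emulate that MSR or the helper
 * fails. */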
bool vmm_emulate_msr(uint64_t *rcx, uint64_t *rdx, uint64_t *rax, int op)
{
        for (int i = 0; i < ARRAY_SIZE(emmsrs); i++) {
                if (emmsrs[i].reg != *rcx)
                        continue;
                return emmsrs[i].f(&emmsrs[i], rcx, rdx, rax, op);
        }
        return FALSE;
}