[akaros.git] / user/vmm/vmexit.c
/* Copyright (c) 2015-2016 Google Inc.
 * See LICENSE for details. */

#include <parlib/common.h>
#include <vmm/virtio.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/mmio.h>
#include <vmm/vmm.h>
#include <parlib/arch/trap.h>
#include <parlib/bitmask.h>
#include <parlib/stdio.h>
#include <stdlib.h>

static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
{
        return GET_BITMASK_BIT(gpci->posted_irq_desc,
                               VMX_POSTED_OUTSTANDING_NOTIF);
}

/* Returns true if the hardware will trigger an IRQ for the guest.  These
 * virtual IRQs are only processed under certain situations, like vmentry, and
 * posted IRQs.  See 'Evaluation of Pending Virtual Interrupts' in the SDM. */
static bool virtual_irq_is_pending(struct guest_thread *gth)
{
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
        uint8_t rvi, vppr;

        /* Currently, the lower 4 bits are various ways to block IRQs, e.g.
         * blocking by STI.  The other bits must be 0.  Presumably any new
         * bits are types of IRQ blocking. */
        if (gth_to_vmtf(gth)->tf_intrinfo1)
                return false;
        vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
        rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
        return (rvi & 0xf0) > (vppr & 0xf0);
}
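
/* Worked example of the check above: with RVI = 0x51 (vector 0x51 recognized
 * but not yet delivered) and VPPR = 0x40, the priority classes are 0x50 and
 * 0x40, so the pending vector outranks the processor priority and the
 * hardware will deliver it at the next opportunity (e.g. vmentry).  If VPPR
 * were 0x50 or higher, the comparison fails and the IRQ stays pending. */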

/* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
 * vmm_interrupt_guest(). */
static void sleep_til_irq(struct guest_thread *gth)
{
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

        /* The invariant is that if an IRQ is posted, but not delivered, we will
         * not sleep.  Anyone who posts an IRQ must signal after setting it.
         * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
         * posting, we'll need to revisit this.  For more details, see the notes
         * in the kernel IPI-IRQ fast path.
         *
         * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
         * possible that the hardware attempted to post the interrupt.  In SDM
         * parlance, the processor could have "recognized" the virtual IRQ, but
         * not delivered it yet.  This could happen if the guest had executed
         * "sti", but not "hlt" yet.  The IRQ was posted and recognized, but not
         * delivered ("sti blocking").  Then the guest executes "hlt", and
         * vmexits.  OUTSTANDING_NOTIF will be clear in this case.  RVI should
         * be set - at least to the vector we just sent, but possibly to a
         * greater vector if multiple were sent.  RVI should only be cleared
         * after virtual IRQs were actually delivered.  So checking
         * OUTSTANDING_NOTIF and RVI should suffice.
         *
         * Note that when we see a notif or pending virtual IRQ, we don't
         * actually deliver the IRQ, we'll just restart the guest and the
         * hardware will deliver the virtual IRQ at the appropriate time.
         *
         * The more traditional race here is if the halt starts concurrently
         * with the post; that's why we sync with the mutex to make sure there
         * is an ordering between the actual halt (this function) and the
         * posting. */
        uth_mutex_lock(gth->halt_mtx);
        while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
                uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
        uth_mutex_unlock(gth->halt_mtx);
}
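
/* Illustrative only: a minimal sketch of the ordering on the posting side that
 * the invariant above depends on, assuming parlib's SET_BITMASK_BIT_ATOMIC()
 * and uth_cond_var_broadcast().  The real poster is vmm_interrupt_guest(),
 * which lives elsewhere in the VMM and additionally pokes the kernel so the
 * hardware can deliver the posted IRQ; the helper name below is hypothetical.
 */
#if 0
static void example_post_and_wake(struct guest_thread *gth, int vector)
{
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

        /* Post: set the vector and OUTSTANDING_NOTIF in the posted IRQ
         * descriptor before waking anyone. */
        SET_BITMASK_BIT_ATOMIC(gpci->posted_irq_desc, vector);
        SET_BITMASK_BIT_ATOMIC(gpci->posted_irq_desc,
                               VMX_POSTED_OUTSTANDING_NOTIF);
        /* Signal: wake any halted gpcore under the mutex, so a sleeper either
         * saw the bit in its while loop or receives this wakeup. */
        uth_mutex_lock(gth->halt_mtx);
        uth_cond_var_broadcast(gth->halt_cv);
        uth_mutex_unlock(gth->halt_mtx);
}
#endif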

enum {
        CPUID_0B_LEVEL_SMT = 0,
        CPUID_0B_LEVEL_CORE
};

static bool handle_cpuid(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);
        uint32_t eax = vm_tf->tf_rax;
        uint32_t ecx = vm_tf->tf_rcx;

        if (!vmm_user_handles_cpuid(eax, ecx)) {
                fprintf(stderr, "got an unexpected cpuid 0x%x:%x\n", eax, ecx);
                return false;
        }

        switch (eax) {
        case 0x0b: {
                uint32_t level = vm_tf->tf_rcx & 0x0F;

                vm_tf->tf_rcx = level;
                vm_tf->tf_rdx = gth->gpc_id;
                if (level == CPUID_0B_LEVEL_SMT) {
                        vm_tf->tf_rax = 0;
                        vm_tf->tf_rbx = 1;
                        vm_tf->tf_rcx |= ((level + 1) << 8);
                }
                if (level == CPUID_0B_LEVEL_CORE) {
                        uint32_t shift = LOG2_UP(vm->nr_gpcs);

                        if (shift > 0x1F)
                                shift = 0x1F;
                        vm_tf->tf_rax = shift;
                        vm_tf->tf_rbx = vm->nr_gpcs;
                        vm_tf->tf_rcx |= ((level + 1) << 8);
                }
                break;
        }
        default:
                fprintf(stderr, "got an unhandled cpuid 0x%x:%x\n", eax, ecx);
                return false;
        }

        /* cpuid is a 2-byte opcode (0f a2). */
        vm_tf->tf_rip += 2;
        return true;
}
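
/* Worked example of leaf 0x0b for a hypothetical VM with nr_gpcs = 4, queried
 * from guest pcore 2: the SMT level (ecx = 0) returns eax = 0, ebx = 1,
 * ecx = 0x0100, edx = 2 (one thread per core), and the CORE level (ecx = 1)
 * returns eax = LOG2_UP(4) = 2, ebx = 4, ecx = 0x0201, edx = 2.  edx is the
 * x2APIC ID, which we report as the guest pcore id. */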

static int ept_mem_access(struct guest_thread *gth, uintptr_t gpa,
                          unsigned long *regp, size_t size, bool store)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);

        /* TODO use helpers for some of these addr checks.  the fee/fec ones
         * might be wrong too.
         *
         * Also consider adding checks to make sure the entire access (gpa +
         * size) is within the page / segment / region. */
        for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
                if (vm->virtio_mmio_devices[i] == NULL)
                        continue;
                if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
                        continue;
                /* TODO: can the guest cause us to spawn off infinite threads?
                 */
                /* TODO: regp often gets cast in virtio_mmio_wr, but not always.
                 * We probably don't need this assert or the u32* cast below. */
                assert(size <= 4);
                if (store)
                        virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa,
                                       size, (uint32_t *)regp);
                else
                        *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i],
                                               gpa, size);
                return 0;
        }
        if (PG_ADDR(gpa) == 0xfec00000) {
                do_ioapic(gth, gpa, regp, store);
        } else if (PG_ADDR(gpa) == 0) {
                memmove(regp, &vm->low4k[gpa], size);
        } else {
                fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
                fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
                        vm_tf->tf_exit_reason);
                fprintf(stderr, "Returning 0xffffffff\n");
                showstatus(stderr, gth);
                /* Just fill the whole register for now. */
                *regp = (uint64_t) -1;
                return -1;
        }
        return 0;
}

static bool handle_ept_fault(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        int advance;
        int ret;
        uint8_t insn[VMM_MAX_INSN_SZ];

        if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
                ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0,
                                  0, 0);
                if (ret <= 0)
                        panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
                              ret);
                return TRUE;
        }
        /* TODO: consider pushing the PF stuff into fetch, since rippa has a
         * bunch of callers.  Though we'll still need to know to return true to
         * restart the guest, instead of killing them with e.g. a failed emu. */
        if (fetch_insn(gth, insn)) {
                /* We were unable to translate RIP due to an ept fault */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_PAGE_FAULT;
                return true;
        }
        ret = emulate_mem_insn(gth, insn, ept_mem_access, &advance);
        if (ret < 0) {
                fprintf(stderr, "emulate failed!\n");
                return false;
        }

        vm_tf->tf_rip += advance;
        return true;
}

static bool handle_vmcall_printc(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        uint8_t byte;

        byte = vm_tf->tf_rdi;
        printf("%c", byte);
        fflush(stdout);
        return TRUE;
}

static bool handle_vmcall_smpboot(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct vm_trapframe *vm_tf_ap;
        struct virtual_machine *vm = gth_to_vm(gth);
        int cur_pcores = vm->up_gpcs;

        /* Check if we're guest pcore 0. Only the BSP is allowed to start APs.
         */
        if (vm_tf->tf_guest_pcoreid != 0) {
                fprintf(stderr,
                        "Only guest pcore 0 is allowed to start APs. core was %ld\n",
                        vm_tf->tf_guest_pcoreid);
                return FALSE;
        }

        /* Check if we've reached the maximum, if yes, blow out. */
        if (vm->nr_gpcs == cur_pcores) {
                fprintf(stderr,
                        "guest tried to start up too many cores. max was %ld, current up %ld\n",
                        vm->nr_gpcs, cur_pcores);
                return FALSE;
        }

        /* Start up secondary core. */
        vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
        /* We use the BSP's CR3 for now. This should be fine because they
         * change it later anyway. */
        vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
        vm_tf_ap->tf_rip = vm_tf->tf_rdi;
        vm_tf_ap->tf_rsp = vm_tf->tf_rsi;
        vm_tf_ap->tf_rflags = FL_RSVD_1;

        vm->up_gpcs++;

        start_guest_thread(gpcid_to_gth(vm, cur_pcores));

        return TRUE;
}
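
/* Illustrative summary of the convention implemented above: the BSP issues a
 * vmcall with rax = AKAROS_VMCALL_SMPBOOT, rdi = the AP's entry RIP, and
 * rsi = the AP's initial RSP.  The AP starts on the BSP's CR3 with RFLAGS set
 * to FL_RSVD_1, and up_gpcs tracks how many gpcores have been started. */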

static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        vm_tf->tf_rax = get_tsc_freq() / 1000;
        return TRUE;
}

static bool handle_vmcall(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);
        bool retval = FALSE;

        if (vm->vmcall)
                return vm->vmcall(gth, vm_tf);

        switch (vm_tf->tf_rax) {
        case AKAROS_VMCALL_PRINTC:
                retval = handle_vmcall_printc(gth);
                break;
        case AKAROS_VMCALL_SMPBOOT:
                retval = handle_vmcall_smpboot(gth);
                break;
        case AKAROS_VMCALL_GET_TSCFREQ:
                retval = handle_vmcall_get_tscfreq(gth);
                break;
        case AKAROS_VMCALL_TRACE_TF:
                /* rax carries the vmcall number, so the traced rax value
                 * arrives in r11 (presumably stashed there by the guest's
                 * tracing convention); r11 itself is reported as 0xdeadbeef. */
                trace_printf("  rax  0x%016lx\n",      vm_tf->tf_r11);
                trace_printf("  rbx  0x%016lx\n",      vm_tf->tf_rbx);
                trace_printf("  rcx  0x%016lx\n",      vm_tf->tf_rcx);
                trace_printf("  rdx  0x%016lx\n",      vm_tf->tf_rdx);
                trace_printf("  rbp  0x%016lx\n",      vm_tf->tf_rbp);
                trace_printf("  rsi  0x%016lx\n",      vm_tf->tf_rsi);
                trace_printf("  rdi  0x%016lx\n",      vm_tf->tf_rdi);
                trace_printf("  r8   0x%016lx\n",      vm_tf->tf_r8);
                trace_printf("  r9   0x%016lx\n",      vm_tf->tf_r9);
                trace_printf("  r10  0x%016lx\n",      vm_tf->tf_r10);
                trace_printf("  r11  0x%016lx\n",      0xdeadbeef);
                trace_printf("  r12  0x%016lx\n",      vm_tf->tf_r12);
                trace_printf("  r13  0x%016lx\n",      vm_tf->tf_r13);
                trace_printf("  r14  0x%016lx\n",      vm_tf->tf_r14);
                trace_printf("  r15  0x%016lx\n",      vm_tf->tf_r15);
                trace_printf("  rip  0x%016lx\n",      vm_tf->tf_rip);
                trace_printf("  rflg 0x%016lx\n",      vm_tf->tf_rflags);
                trace_printf("  rsp  0x%016lx\n",      vm_tf->tf_rsp);
                trace_printf("  cr2  0x%016lx\n",      vm_tf->tf_cr2);
                trace_printf("  cr3  0x%016lx\n",      vm_tf->tf_cr3);
                trace_printf("Gpcore 0x%08x\n",        vm_tf->tf_guest_pcoreid);
                trace_printf("Flags  0x%08x\n",        vm_tf->tf_flags);
                trace_printf("Inject 0x%08x\n",        vm_tf->tf_trap_inject);
                trace_printf("ExitRs 0x%08x\n",        vm_tf->tf_exit_reason);
                trace_printf("ExitQl 0x%08x\n",        vm_tf->tf_exit_qual);
                trace_printf("Intr1  0x%016lx\n",      vm_tf->tf_intrinfo1);
                trace_printf("Intr2  0x%016lx\n",      vm_tf->tf_intrinfo2);
                trace_printf("GIntr  0x----%04x\n",
                             vm_tf->tf_guest_intr_status);
                trace_printf("GVA    0x%016lx\n",      vm_tf->tf_guest_va);
                trace_printf("GPA    0x%016lx\n",      vm_tf->tf_guest_pa);
                retval = true;
                break;
        case AKAROS_VMCALL_SHUTDOWN:
                exit(0);
        }

        /* vmcall is a 3-byte opcode (0f 01 c1); advance past it on success. */
        if (retval)
                vm_tf->tf_rip += 3;

        return retval;
}
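
/* Illustrative only: roughly how a guest could issue the printc vmcall under
 * the convention handled above (rax = vmcall number, rdi = first argument).
 * This is a sketch, not Akaros's actual guest-side vmcall helpers. */
#if 0
static void example_guest_putc(char c)
{
        uint64_t nr = AKAROS_VMCALL_PRINTC;

        asm volatile("vmcall"
                     : "+a" (nr)
                     : "D" ((uint64_t)c)
                     : "memory");
        /* The host advances RIP past the 3-byte vmcall opcode, so the guest
         * resumes here. */
}
#endif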

static bool handle_io(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        int ret = io(gth);

        if (ret < 0)
                return FALSE;
        if (ret == VM_PAGE_FAULT) {
                /* We were unable to translate RIP due to an ept fault */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_PAGE_FAULT;
        }
        return TRUE;
}

static bool handle_msr(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
                /* Use event injection through vmctl to send a general
                 * protection fault.  vmctl.interrupt gets written to the
                 * VM-Entry Interruption-Information Field by vmx. */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_GP_FAULT;
        } else {
                /* rdmsr (0f 32) and wrmsr (0f 30) are both 2-byte opcodes. */
                vm_tf->tf_rip += 2;
        }
        return TRUE;
}

static bool handle_apic_access(struct guest_thread *gth)
{
        int advance;
        uint8_t insn[VMM_MAX_INSN_SZ];
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        if (fetch_insn(gth, insn)) {
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_PAGE_FAULT;
                return true;
        }
        if (emulate_mem_insn(gth, insn, __apic_access, &advance))
                return FALSE;
        vm_tf->tf_rip += advance;
        return TRUE;
}

static bool handle_halt(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);

        if (vm->halt_exit)
                return FALSE;
        /* It's possible the guest disabled IRQs and halted, perhaps waiting on
         * an NMI or something.  If we need to support that, we can change this.
         */
        sleep_til_irq(gth);
        /* hlt is a 1-byte opcode (f4). */
        vm_tf->tf_rip += 1;
        return TRUE;
}

/* The guest is told (via cpuid) that there is no monitor/mwait.  Callers of
 * mwait are paravirtualized halts.
 *
 * We don't support monitor/mwait in software, so if they tried to mwait
 * without break-on-interrupt and with interrupts disabled, they'll never
 * wake up.  So we'll always break on interrupt. */
static bool handle_mwait(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        sleep_til_irq(gth);
        /* mwait is a 3-byte opcode (0f 01 c9). */
        vm_tf->tf_rip += 3;
        return TRUE;
}

/* Is this a vmm specific thing?  or generic?
 *
 * what do we do when we want to kill the vm?  what are our other options? */
bool handle_vmexit(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        switch (vm_tf->tf_exit_reason) {
        case EXIT_REASON_CPUID:
                return handle_cpuid(gth);
        case EXIT_REASON_EPT_VIOLATION:
                return handle_ept_fault(gth);
        case EXIT_REASON_VMCALL:
                return handle_vmcall(gth);
        case EXIT_REASON_IO_INSTRUCTION:
                return handle_io(gth);
        case EXIT_REASON_MSR_WRITE:
        case EXIT_REASON_MSR_READ:
                return handle_msr(gth);
        case EXIT_REASON_APIC_ACCESS:
                return handle_apic_access(gth);
        case EXIT_REASON_HLT:
                return handle_halt(gth);
        case EXIT_REASON_MWAIT_INSTRUCTION:
                return handle_mwait(gth);
        case EXIT_REASON_EXTERNAL_INTERRUPT:
        case EXIT_REASON_APIC_WRITE:
                /* TODO: just ignore these? */
                return TRUE;
        default:
                return FALSE;
        }
}
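
/* Illustrative only: a minimal sketch of how a caller might consume
 * handle_vmexit()'s return value.  Every name here other than handle_vmexit()
 * and showstatus() is hypothetical; Akaros's real VMM scheduler drives vmexits
 * differently. */
#if 0
static void example_vmexit_loop(struct guest_thread *gth)
{
        while (1) {
                example_run_guest(gth);         /* hypothetical: enter guest */
                if (!handle_vmexit(gth)) {
                        /* Unhandled exit: dump guest state and give up. */
                        showstatus(stderr, gth);
                        exit(-1);
                }
        }
}
#endif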