/* user/vmm/vmexit.c
 *
 * Copyright (c) 2015-2016 Google Inc.
 * See LICENSE for details. */

#include <parlib/common.h>
#include <vmm/virtio.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/mmio.h>
#include <vmm/vmm.h>
#include <parlib/arch/trap.h>
#include <parlib/bitmask.h>
#include <parlib/stdio.h>

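/* Returns TRUE if the outstanding-notification bit is set in this guest
 * pcore's posted-IRQ descriptor, i.e. an IRQ was posted but not yet delivered
 * by the hardware. */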
static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
{
	return GET_BITMASK_BIT(gpci->posted_irq_desc,
			       VMX_POSTED_OUTSTANDING_NOTIF);
}

/* Returns true if the hardware will trigger an IRQ for the guest.  These
 * virtual IRQs are only processed under certain situations, like vmentry, and
 * posted IRQs.  See 'Evaluation of Pending Virtual Interrupts' in the SDM. */
static bool virtual_irq_is_pending(struct guest_thread *gth)
{
	struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
	uint8_t rvi, vppr;

	/* Currently, the lower 4 bits are various ways to block IRQs, e.g.
	 * blocking by STI.  The other bits must be 0.  Presumably any new
	 * bits are types of IRQ blocking. */
	if (gth_to_vmtf(gth)->tf_intrinfo1)
		return false;
	/* VPPR is the virtual processor priority register, at offset 0xa0 in
	 * the virtual APIC page.  A virtual IRQ is delivered only if the
	 * priority class of the highest requested vector (RVI) exceeds
	 * VPPR's class. */
	vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
	rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
	return (rvi & 0xf0) > (vppr & 0xf0);
}

/* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
 * vmm_interrupt_guest(). */
static void sleep_til_irq(struct guest_thread *gth)
{
	struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

	/* The invariant is that if an IRQ is posted, but not delivered, we will
	 * not sleep.  Anyone who posts an IRQ must signal after setting it.
	 * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
	 * posting, we'll need to revisit this.  For more details, see the notes
	 * in the kernel IPI-IRQ fast path.
	 *
	 * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
	 * possible that the hardware attempted to post the interrupt.  In SDM
	 * parlance, the processor could have "recognized" the virtual IRQ, but
	 * not delivered it yet.  This could happen if the guest had executed
	 * "sti", but not "hlt" yet.  The IRQ was posted and recognized, but not
	 * delivered ("sti blocking").  Then the guest executes "hlt", and
	 * vmexits.  OUTSTANDING_NOTIF will be clear in this case.  RVI should
	 * be set - at least to the vector we just sent, but possibly to a
	 * greater vector if multiple were sent.  RVI should only be cleared
	 * after virtual IRQs were actually delivered.  So checking
	 * OUTSTANDING_NOTIF and RVI should suffice.
	 *
	 * Note that when we see a notif or pending virtual IRQ, we don't
	 * actually deliver the IRQ, we'll just restart the guest and the
	 * hardware will deliver the virtual IRQ at the appropriate time.
	 *
	 * The more traditional race here is if the halt starts concurrently
	 * with the post; that's why we sync with the mutex to make sure there
	 * is an ordering between the actual halt (this function) and the
	 * posting. */
	uth_mutex_lock(gth->halt_mtx);
	while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
		uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
	uth_mutex_unlock(gth->halt_mtx);
}

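/* Sub-leaf indices for CPUID leaf 0x0B (extended topology enumeration).  The
 * level type reported back in ECX[15:8] is the index + 1: 1 = SMT, 2 = Core. */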
enum {
	CPUID_0B_LEVEL_SMT = 0,
	CPUID_0B_LEVEL_CORE
};

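/* Emulates CPUID leaf 0x0B (extended topology enumeration) for the guest: the
 * SMT level reports one thread per core, the core level reports nr_gpcs cores,
 * and rdx carries the guest pcore id as the x2APIC ID.  Any other leaf is not
 * handled here and returns FALSE.  cpuid is a 2-byte instruction, hence the
 * rip += 2. */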
static bool handle_cpuid(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	uint32_t level = vm_tf->tf_rcx & 0x0F;

	if (vm_tf->tf_rax != 0x0B)
		return FALSE;

	vm_tf->tf_rip += 2;
	vm_tf->tf_rax = 0;
	vm_tf->tf_rbx = 0;
	vm_tf->tf_rcx = level;
	vm_tf->tf_rdx = gth->gpc_id;
	if (level == CPUID_0B_LEVEL_SMT) {
		vm_tf->tf_rax = 0;
		vm_tf->tf_rbx = 1;
		vm_tf->tf_rcx |= ((level + 1) << 8);
	}
	if (level == CPUID_0B_LEVEL_CORE) {
		uint32_t shift = LOG2_UP(vm->nr_gpcs);

		if (shift > 0x1F)
			shift = 0x1F;
		vm_tf->tf_rax = shift;
		vm_tf->tf_rbx = vm->nr_gpcs;
		vm_tf->tf_rcx |= ((level + 1) << 8);
	}

	return TRUE;
}

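/* Handles an EPT violation: either populates the backing VMR page, or decodes
 * the faulting instruction and emulates the MMIO access (virtio-mmio devices,
 * the IOAPIC page at 0xfec00000, or the low 4K page).  Returns FALSE if the
 * access cannot be handled. */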
static bool handle_ept_fault(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	uint64_t gpa, *regp;
	uint8_t regx;
	int store, size;
	int advance;
	int ret;

	if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
		ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0,
				  0, 0);
		if (ret <= 0)
			panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
			      ret);
		return TRUE;
	}
	ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);

	if (ret < 0)
		return FALSE;
	if (ret == VM_PAGE_FAULT) {
		/* We were unable to translate RIP due to an ept fault */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
				      | VM_TRAP_ERROR_CODE
				      | VM_TRAP_HARDWARE
				      | HW_TRAP_PAGE_FAULT;
		return TRUE;
	}

	assert(size >= 0);
	/* TODO use helpers for some of these addr checks.  the fee/fec ones
	 * might be wrong too. */
	for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
		if (vm->virtio_mmio_devices[i] == NULL)
			continue;
		if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
			continue;
		/* TODO: can the guest cause us to spawn off infinite threads?
		 */
		if (store)
			virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa,
				       size, (uint32_t *)regp);
		else
			*regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i],
					       gpa, size);
		vm_tf->tf_rip += advance;
		return TRUE;
	}
	if (PG_ADDR(gpa) == 0xfec00000) {
		do_ioapic(gth, gpa, regx, regp, store);
	} else if (PG_ADDR(gpa) == 0) {
		memmove(regp, &vm->low4k[gpa], size);
	} else {
		fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
		fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
			vm_tf->tf_exit_reason);
		fprintf(stderr, "Returning 0xffffffff\n");
		showstatus(stderr, gth);
		/* Just fill the whole register for now. */
		*regp = (uint64_t) -1;
		return FALSE;
	}
	vm_tf->tf_rip += advance;
	return TRUE;
}

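/* VMCALL_PRINTC: prints the character passed in rdi to stdout.  The extra '%'
 * printed after each newline presumably tags the line as output from the
 * guest's vmcall console. */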
static bool handle_vmcall_printc(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	uint8_t byte;

	byte = vm_tf->tf_rdi;
	printf("%c", byte);
	if (byte == '\n')
		printf("%c", '%');
	fflush(stdout);
	return TRUE;
}

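/* VMCALL_SMPBOOT: starts the next guest pcore (AP).  Only the BSP (guest
 * pcore 0) may issue this call.  The AP starts with the BSP's cr3, with its
 * rip taken from rdi and its rsp from rsi. */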
static bool handle_vmcall_smpboot(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct vm_trapframe *vm_tf_ap;
	struct virtual_machine *vm = gth_to_vm(gth);
	int cur_pcores = vm->up_gpcs;

	/* Check if we're guest pcore 0. Only the BSP is allowed to start APs.
	 */
	if (vm_tf->tf_guest_pcoreid != 0) {
		fprintf(stderr,
			"Only guest pcore 0 is allowed to start APs. core was %ld\n",
			vm_tf->tf_guest_pcoreid);
		return FALSE;
	}

	/* Check if we've reached the maximum, if yes, blow out. */
	if (vm->nr_gpcs == cur_pcores) {
		fprintf(stderr,
			"guest tried to start up too many cores. max was %ld, current up %ld\n",
			vm->nr_gpcs, cur_pcores);
		return FALSE;
	}

	/* Start up secondary core. */
	vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
	/* We use the BSP's CR3 for now. This should be fine because they
	 * change it later anyway. */
	vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;

	/* Starting RIP is passed in via rdi. */
	vm_tf_ap->tf_rip = vm_tf->tf_rdi;

	/* Starting RSP is passed in via rsi. */
	vm_tf_ap->tf_rsp = vm_tf->tf_rsi;

	vm->up_gpcs++;

	start_guest_thread(gpcid_to_gth(vm, cur_pcores));

	return TRUE;
}

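/* VMCALL_GET_TSCFREQ: returns the TSC frequency in rax, presumably in KHz
 * given the divide by 1000. */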
static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	vm_tf->tf_rax = get_tsc_freq() / 1000;
	return TRUE;
}

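/* Dispatches a vmcall from the guest, based on the vmcall number in rax.  A
 * VM-registered vmcall handler, if any, takes precedence.  On success we
 * advance rip past the 3-byte vmcall instruction. */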
static bool handle_vmcall(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	bool retval = FALSE;

	if (vm->vmcall)
		return vm->vmcall(gth, vm_tf);

	switch (vm_tf->tf_rax) {
	case VMCALL_PRINTC:
		retval = handle_vmcall_printc(gth);
		break;
	case VMCALL_SMPBOOT:
		retval = handle_vmcall_smpboot(gth);
		break;
	case VMCALL_GET_TSCFREQ:
		retval = handle_vmcall_get_tscfreq(gth);
		break;
	case VMCALL_TRACE_TF:
		/* rax holds the vmcall number at this point; the guest's
		 * tracing helper presumably stashed its original rax in r11,
		 * which is why r11 itself is reported as 0xdeadbeef. */
		trace_printf("  rax  0x%016lx\n",      vm_tf->tf_r11);
		trace_printf("  rbx  0x%016lx\n",      vm_tf->tf_rbx);
		trace_printf("  rcx  0x%016lx\n",      vm_tf->tf_rcx);
		trace_printf("  rdx  0x%016lx\n",      vm_tf->tf_rdx);
		trace_printf("  rbp  0x%016lx\n",      vm_tf->tf_rbp);
		trace_printf("  rsi  0x%016lx\n",      vm_tf->tf_rsi);
		trace_printf("  rdi  0x%016lx\n",      vm_tf->tf_rdi);
		trace_printf("  r8   0x%016lx\n",      vm_tf->tf_r8);
		trace_printf("  r9   0x%016lx\n",      vm_tf->tf_r9);
		trace_printf("  r10  0x%016lx\n",      vm_tf->tf_r10);
		trace_printf("  r11  0x%016lx\n",      0xdeadbeef);
		trace_printf("  r12  0x%016lx\n",      vm_tf->tf_r12);
		trace_printf("  r13  0x%016lx\n",      vm_tf->tf_r13);
		trace_printf("  r14  0x%016lx\n",      vm_tf->tf_r14);
		trace_printf("  r15  0x%016lx\n",      vm_tf->tf_r15);
		trace_printf("  rip  0x%016lx\n",      vm_tf->tf_rip);
		trace_printf("  rflg 0x%016lx\n",      vm_tf->tf_rflags);
		trace_printf("  rsp  0x%016lx\n",      vm_tf->tf_rsp);
		trace_printf("  cr2  0x%016lx\n",      vm_tf->tf_cr2);
		trace_printf("  cr3  0x%016lx\n",      vm_tf->tf_cr3);
		trace_printf("Gpcore 0x%08x\n",        vm_tf->tf_guest_pcoreid);
		trace_printf("Flags  0x%08x\n",        vm_tf->tf_flags);
		trace_printf("Inject 0x%08x\n",        vm_tf->tf_trap_inject);
		trace_printf("ExitRs 0x%08x\n",        vm_tf->tf_exit_reason);
		trace_printf("ExitQl 0x%08x\n",        vm_tf->tf_exit_qual);
		trace_printf("Intr1  0x%016lx\n",      vm_tf->tf_intrinfo1);
		trace_printf("Intr2  0x%016lx\n",      vm_tf->tf_intrinfo2);
		trace_printf("GIntr  0x----%04x\n",
			     vm_tf->tf_guest_intr_status);
		trace_printf("GVA    0x%016lx\n",      vm_tf->tf_guest_va);
		trace_printf("GPA    0x%016lx\n",      vm_tf->tf_guest_pa);
		retval = TRUE;
		break;
	}

	if (retval)
		vm_tf->tf_rip += 3;

	return retval;
}

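/* Handles an I/O port exit by emulating the instruction via io().  If the
 * instruction could not be translated due to an EPT fault, we inject a page
 * fault into the guest. */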
static bool handle_io(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	int ret = io(gth);

	if (ret < 0)
		return FALSE;
	if (ret == VM_PAGE_FAULT) {
		/* We were unable to translate RIP due to an ept fault */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
				      | VM_TRAP_ERROR_CODE
				      | VM_TRAP_HARDWARE
				      | HW_TRAP_PAGE_FAULT;
	}
	return TRUE;
}

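/* Handles rdmsr/wrmsr exits.  If msrio() can't emulate the access, we inject a
 * general protection fault; otherwise we skip the 2-byte rdmsr/wrmsr
 * instruction. */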
static bool handle_msr(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
		/* Use event injection through vmctl to send a general
		 * protection fault.  vmctl.interrupt gets written to the
		 * VM-Entry Interruption-Information Field by vmx. */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
				      | VM_TRAP_ERROR_CODE
				      | VM_TRAP_HARDWARE
				      | HW_TRAP_GP_FAULT;
	} else {
		vm_tf->tf_rip += 2;
	}
	return TRUE;
}

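/* Handles an APIC-access exit by decoding the faulting instruction and
 * emulating the access via __apic_access(). */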
static bool handle_apic_access(struct guest_thread *gth)
{
	uint64_t gpa, *regp;
	uint8_t regx;
	int store, size;
	int advance;
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
		return FALSE;
	if (__apic_access(gth, gpa, regx, regp, store))
		return FALSE;
	vm_tf->tf_rip += advance;
	return TRUE;
}

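/* Handles a hlt exit.  If the VM wants halt exits reflected, return FALSE;
 * otherwise block the guest thread until it has an IRQ pending, then skip the
 * 1-byte hlt instruction. */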
static bool handle_halt(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);

	if (vm->halt_exit)
		return FALSE;
	/* It's possible the guest disabled IRQs and halted, perhaps waiting on
	 * an NMI or something.  If we need to support that, we can change this.
	 */
	sleep_til_irq(gth);
	vm_tf->tf_rip += 1;
	return TRUE;
}

/* The guest is told (via cpuid) that there is no monitor/mwait.  Callers of
 * mwait are paravirtualized halts.
 *
 * We don't support monitor/mwait in software, so if they tried to mwait
 * without break-on-interrupt and with interrupts disabled, they'll never
 * wake up.  So we'll always break on interrupt. */
static bool handle_mwait(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	sleep_til_irq(gth);
	/* mwait is a 3-byte instruction. */
	vm_tf->tf_rip += 3;
	return TRUE;
}

/* Is this a vmm specific thing?  or generic?
 *
 * what do we do when we want to kill the vm?  what are our other options? */
bool handle_vmexit(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	switch (vm_tf->tf_exit_reason) {
	case EXIT_REASON_CPUID:
		return handle_cpuid(gth);
	case EXIT_REASON_EPT_VIOLATION:
		return handle_ept_fault(gth);
	case EXIT_REASON_VMCALL:
		return handle_vmcall(gth);
	case EXIT_REASON_IO_INSTRUCTION:
		return handle_io(gth);
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_MSR_READ:
		return handle_msr(gth);
	case EXIT_REASON_APIC_ACCESS:
		return handle_apic_access(gth);
	case EXIT_REASON_HLT:
		return handle_halt(gth);
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return handle_mwait(gth);
	case EXIT_REASON_EXTERNAL_INTERRUPT:
	case EXIT_REASON_APIC_WRITE:
		/* TODO: just ignore these? */
		return TRUE;
	default:
		fprintf(stderr,
			"VMM library: don't know how to handle exit %d\n",
			vm_tf->tf_exit_reason);
		fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
			vm_tf->tf_exit_reason);
		return FALSE;
	}
}