vmm: Add a vmcall for tracing the TF (XCC)
akaros.git: user/vmm/vmexit.c
/* Copyright (c) 2015-2016 Google Inc.
 * See LICENSE for details. */

#include <parlib/common.h>
#include <vmm/virtio.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/mmio.h>
#include <vmm/vmm.h>
#include <parlib/arch/trap.h>
#include <parlib/bitmask.h>
#include <stdio.h>

static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
{
	return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
}

/* Returns true if the hardware will trigger an IRQ for the guest.  These
 * virtual IRQs are only processed under certain situations, like vmentry, and
 * posted IRQs.  See 'Evaluation of Pending Virtual Interrupts' in the SDM. */
static bool virtual_irq_is_pending(struct guest_thread *gth)
{
	struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
	uint8_t rvi, vppr;

	/* Currently, the lower 4 bits are various ways to block IRQs, e.g. blocking
	 * by STI.  The other bits must be 0.  Presumably any new bits are types of
	 * IRQ blocking. */
	if (gth_to_vmtf(gth)->tf_intrinfo1)
		return false;
	vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
	rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
	/* Per the SDM, only the upper nibbles (priority classes) are compared: e.g.
	 * RVI 0x31 is pending over VPPR 0x20, but not over VPPR 0x30. */
	return (rvi & 0xf0) > (vppr & 0xf0);
}

/* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
 * vmm_interrupt_guest(). */
static void sleep_til_irq(struct guest_thread *gth)
{
	struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

	/* The invariant is that if an IRQ is posted, but not delivered, we will not
	 * sleep.  Anyone who posts an IRQ must signal after setting it.
	 * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
	 * posting, we'll need to revisit this.  For more details, see the notes in
	 * the kernel IPI-IRC fast path.
	 *
	 * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
	 * possible that the hardware attempted to post the interrupt.  In SDM
	 * parlance, the processor could have "recognized" the virtual IRQ, but not
	 * delivered it yet.  This could happen if the guest had executed "sti", but
	 * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
	 * ("sti blocking").  Then the guest executes "hlt", and vmexits.
	 * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
	 * least to the vector we just sent, but possibly to a greater vector if
	 * multiple were sent.  RVI should only be cleared after virtual IRQs were
	 * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
	 * suffice.
	 *
	 * Note that when we see a notif or pending virtual IRQ, we don't actually
	 * deliver the IRQ, we'll just restart the guest and the hardware will
	 * deliver the virtual IRQ at the appropriate time.
	 *
	 * The more traditional race here is if the halt starts concurrently with
	 * the post; that's why we sync with the mutex to make sure there is an
	 * ordering between the actual halt (this function) and the posting. */
	uth_mutex_lock(gth->halt_mtx);
	while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
		uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
	uth_mutex_unlock(gth->halt_mtx);
}
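
/* For reference, the poster side must use the same lock-then-signal shape that
 * sleep_til_irq() relies on.  This is only a sketch of that ordering, not the
 * actual vmm_interrupt_guest() (which also writes the vector into the PIR and
 * may send a posted-IRQ IPI); the parlib bitmask/cv calls are assumed here:
 *
 *	uth_mutex_lock(gth->halt_mtx);
 *	SET_BITMASK_BIT_ATOMIC(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
 *	uth_cond_var_signal(gth->halt_cv);
 *	uth_mutex_unlock(gth->halt_mtx);
 */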

enum {
	CPUID_0B_LEVEL_SMT = 0,
	CPUID_0B_LEVEL_CORE
};

/* Paravirtualizes CPUID leaf 0x0B (extended topology): report one thread per
 * core and nr_gpcs cores, with the gpcore id as the x2APIC id in edx.  Any
 * other leaf is left unhandled (returns FALSE). */
static bool handle_cpuid(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	uint32_t level = vm_tf->tf_rcx & 0x0F;

	if (vm_tf->tf_rax != 0x0B)
		return FALSE;

	/* cpuid is a two-byte instruction (0f a2). */
	vm_tf->tf_rip += 2;
	vm_tf->tf_rax = 0;
	vm_tf->tf_rbx = 0;
	vm_tf->tf_rcx = level;
	vm_tf->tf_rdx = gth->gpc_id;
	if (level == CPUID_0B_LEVEL_SMT) {
		vm_tf->tf_rax = 0;
		vm_tf->tf_rbx = 1;
		vm_tf->tf_rcx |= ((level + 1) << 8);
	}
	if (level == CPUID_0B_LEVEL_CORE) {
		uint32_t shift = LOG2_UP(vm->nr_gpcs);

		if (shift > 0x1F)
			shift = 0x1F;
		vm_tf->tf_rax = shift;
		vm_tf->tf_rbx = vm->nr_gpcs;
		vm_tf->tf_rcx |= ((level + 1) << 8);
	}

	return TRUE;
}
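
/* Worked example of the emulated leaf, assuming a hypothetical VM with
 * nr_gpcs == 8 and the caller running on gpcore 3:
 *
 *	cpuid(0x0B, ecx = 0): eax = 0 (shift), ebx = 1 thread per core,
 *	                      ecx = 0x0100 (level 0, type SMT),  edx = 3
 *	cpuid(0x0B, ecx = 1): eax = 3 (LOG2_UP(8)), ebx = 8 cores,
 *	                      ecx = 0x0201 (level 1, type Core), edx = 3
 *
 * i.e. the x2APIC id the guest sees is simply the gpcore id. */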

static bool handle_ept_fault(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	uint64_t gpa, *regp;
	uint8_t regx;
	int store, size;
	int advance;
	int ret;

	if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
		ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0, 0, 0);
		if (ret <= 0)
			panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
			      ret);
		return TRUE;
	}
	ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);

	if (ret < 0)
		return FALSE;
	if (ret == VM_PAGE_FAULT) {
		/* We were unable to translate RIP due to an ept fault */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
		                      | VM_TRAP_ERROR_CODE
		                      | VM_TRAP_HARDWARE
		                      | HW_TRAP_PAGE_FAULT;
		return TRUE;
	}

	assert(size >= 0);
	/* TODO use helpers for some of these addr checks.  the fee/fec ones might
	 * be wrong too. */
	for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
		if (vm->virtio_mmio_devices[i] == NULL)
			continue;
		if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
			continue;
		/* TODO: can the guest cause us to spawn off infinite threads? */
		if (store)
			virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
			               (uint32_t *)regp);
		else
			*regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
		vm_tf->tf_rip += advance;
		return TRUE;
	}
	if (PG_ADDR(gpa) == 0xfec00000) {
		do_ioapic(gth, gpa, regx, regp, store);
	} else if (PG_ADDR(gpa) == 0) {
		memmove(regp, &vm->low4k[gpa], size);
	} else {
		fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
		fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
		        vm_tf->tf_exit_reason);
		fprintf(stderr, "Returning 0xffffffff\n");
		showstatus(stderr, gth);
		/* Just fill the whole register for now. */
		*regp = (uint64_t) -1;
		return FALSE;
	}
	vm_tf->tf_rip += advance;
	return TRUE;
}

static bool handle_vmcall_printc(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	uint8_t byte;

	byte = vm_tf->tf_rdi;
	printf("%c", byte);
	if (byte == '\n')
		printf("%c", '%');
	fflush(stdout);
	return TRUE;
}
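
/* Guest-side sketch of the vmcall convention these handlers assume: the vmcall
 * number goes in rax and arguments in rdi/rsi.  A hypothetical printc helper
 * (not part of this file) would look roughly like:
 *
 *	static inline void vmcall_printc(uint8_t c)
 *	{
 *		asm volatile("vmcall"
 *		             : : "a" ((uint64_t)VMCALL_PRINTC), "D" ((uint64_t)c)
 *		             : "memory");
 *	}
 */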

static bool handle_vmcall_smpboot(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct vm_trapframe *vm_tf_ap;
	struct virtual_machine *vm = gth_to_vm(gth);
	int cur_pcores = vm->up_gpcs;

	/* Check if we're guest pcore 0.  Only the BSP is allowed to start APs. */
	if (vm_tf->tf_guest_pcoreid != 0) {
		fprintf(stderr,
		        "Only guest pcore 0 is allowed to start APs. core was %lu\n",
		        (unsigned long)vm_tf->tf_guest_pcoreid);
		return FALSE;
	}

	/* Check if we've reached the maximum; if so, bail out. */
	if (vm->nr_gpcs == cur_pcores) {
		fprintf(stderr,
		        "guest tried to start up too many cores. max was %lu, current up %d\n",
		        (unsigned long)vm->nr_gpcs, cur_pcores);
		return FALSE;
	}

	/* Start up the secondary core. */
	vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
	/* We use the BSP's CR3 for now.  This should be fine because the guest
	 * changes it later anyway. */
	vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;

	/* Starting RIP is passed in via rdi. */
	vm_tf_ap->tf_rip = vm_tf->tf_rdi;

	/* Starting RSP is passed in via rsi. */
	vm_tf_ap->tf_rsp = vm_tf->tf_rsi;

	vm->up_gpcs++;

	start_guest_thread(gpcid_to_gth(vm, cur_pcores));

	return TRUE;
}

/* Returns the TSC frequency to the guest in rax, scaled down by 1000. */
static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	vm_tf->tf_rax = get_tsc_freq() / 1000;
	return TRUE;
}
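
/* Guest-side sketch (hypothetical helper, matching the handler above: rax in
 * carries the vmcall number, rax out carries the scaled TSC frequency):
 *
 *	static inline uint64_t vmcall_get_tscfreq(void)
 *	{
 *		uint64_t freq = VMCALL_GET_TSCFREQ;
 *
 *		asm volatile("vmcall" : "+a" (freq) : : "memory");
 *		return freq;
 *	}
 */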

static bool handle_vmcall(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	bool retval = FALSE;

	if (vm->vmcall)
		return vm->vmcall(gth, vm_tf);

	switch (vm_tf->tf_rax) {
	case VMCALL_PRINTC:
		retval = handle_vmcall_printc(gth);
		break;
	case VMCALL_SMPBOOT:
		retval = handle_vmcall_smpboot(gth);
		break;
	case VMCALL_GET_TSCFREQ:
		retval = handle_vmcall_get_tscfreq(gth);
		break;
	case VMCALL_TRACE_TF:
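		/* Dump the guest's TF.  rax itself carried the vmcall number, so the
		 * guest is expected to stash its original rax in r11 before the
		 * vmcall; hence tf_r11 is printed as "rax" and r11 gets a 0xdeadbeef
		 * sentinel below. */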
		trace_printf("  rax  0x%016lx\n",      vm_tf->tf_r11);
		trace_printf("  rbx  0x%016lx\n",      vm_tf->tf_rbx);
		trace_printf("  rcx  0x%016lx\n",      vm_tf->tf_rcx);
		trace_printf("  rdx  0x%016lx\n",      vm_tf->tf_rdx);
		trace_printf("  rbp  0x%016lx\n",      vm_tf->tf_rbp);
		trace_printf("  rsi  0x%016lx\n",      vm_tf->tf_rsi);
		trace_printf("  rdi  0x%016lx\n",      vm_tf->tf_rdi);
		trace_printf("  r8   0x%016lx\n",      vm_tf->tf_r8);
		trace_printf("  r9   0x%016lx\n",      vm_tf->tf_r9);
		trace_printf("  r10  0x%016lx\n",      vm_tf->tf_r10);
		trace_printf("  r11  0x%016lx\n",      0xdeadbeefUL);
		trace_printf("  r12  0x%016lx\n",      vm_tf->tf_r12);
		trace_printf("  r13  0x%016lx\n",      vm_tf->tf_r13);
		trace_printf("  r14  0x%016lx\n",      vm_tf->tf_r14);
		trace_printf("  r15  0x%016lx\n",      vm_tf->tf_r15);
		trace_printf("  rip  0x%016lx\n",      vm_tf->tf_rip);
		trace_printf("  rflg 0x%016lx\n",      vm_tf->tf_rflags);
		trace_printf("  rsp  0x%016lx\n",      vm_tf->tf_rsp);
		trace_printf("  cr2  0x%016lx\n",      vm_tf->tf_cr2);
		trace_printf("  cr3  0x%016lx\n",      vm_tf->tf_cr3);
		trace_printf("Gpcore 0x%08x\n",        vm_tf->tf_guest_pcoreid);
		trace_printf("Flags  0x%08x\n",        vm_tf->tf_flags);
		trace_printf("Inject 0x%08x\n",        vm_tf->tf_trap_inject);
		trace_printf("ExitRs 0x%08x\n",        vm_tf->tf_exit_reason);
		trace_printf("ExitQl 0x%08x\n",        vm_tf->tf_exit_qual);
		trace_printf("Intr1  0x%016lx\n",      vm_tf->tf_intrinfo1);
		trace_printf("Intr2  0x%016lx\n",      vm_tf->tf_intrinfo2);
		trace_printf("GIntr  0x----%04x\n",    vm_tf->tf_guest_intr_status);
		trace_printf("GVA    0x%016lx\n",      vm_tf->tf_guest_va);
		trace_printf("GPA    0x%016lx\n",      vm_tf->tf_guest_pa);
		retval = TRUE;
		break;
	}

	/* vmcall is a three-byte instruction (0f 01 c1); skip it on success. */
	if (retval)
		vm_tf->tf_rip += 3;

	return retval;
}
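
/* Guest-side sketch for VMCALL_TRACE_TF (hypothetical helper, not part of this
 * file): stash rax in r11 so the handler above can report it, then issue the
 * vmcall with the number in rax:
 *
 *	static inline void vmcall_trace_tf(void)
 *	{
 *		asm volatile("movq %%rax, %%r11\n\t"
 *		             "movq %0, %%rax\n\t"
 *		             "vmcall"
 *		             : : "i" (VMCALL_TRACE_TF) : "rax", "r11", "memory");
 *	}
 */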

static bool handle_io(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	int ret = io(gth);

	if (ret < 0)
		return FALSE;
	if (ret == VM_PAGE_FAULT) {
		/* We were unable to translate RIP due to an ept fault */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
		                      | VM_TRAP_ERROR_CODE
		                      | VM_TRAP_HARDWARE
		                      | HW_TRAP_PAGE_FAULT;
	}
	return TRUE;
}

static bool handle_msr(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
		/* Use event injection through vmctl to send a general protection
		 * fault.  vmctl.interrupt gets written to the VM-Entry
		 * Interruption-Information Field by vmx. */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
		                      | VM_TRAP_ERROR_CODE
		                      | VM_TRAP_HARDWARE
		                      | HW_TRAP_GP_FAULT;
	} else {
		/* rdmsr and wrmsr are both two-byte instructions (0f 32 / 0f 30). */
		vm_tf->tf_rip += 2;
	}
	return TRUE;
}

static bool handle_apic_access(struct guest_thread *gth)
{
	uint64_t gpa, *regp;
	uint8_t regx;
	int store, size;
	int advance;
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
		return FALSE;
	if (__apic_access(gth, gpa, regx, regp, store))
		return FALSE;
	vm_tf->tf_rip += advance;
	return TRUE;
}

static bool handle_halt(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);

	if (vm->halt_exit)
		return FALSE;
	/* It's possible the guest disabled IRQs and halted, perhaps waiting on an
	 * NMI or something.  If we need to support that, we can change this. */
	sleep_til_irq(gth);
	/* hlt is a one-byte instruction (f4). */
	vm_tf->tf_rip += 1;
	return TRUE;
}

/* The guest is told (via cpuid) that there is no monitor/mwait.  Callers of
 * mwait are paravirtualized halts.
 *
 * We don't support monitor/mwait in software, so if they tried to mwait
 * without break-on-interrupt and with interrupts disabled, they'll never
 * wake up.  So we'll always break on interrupt. */
static bool handle_mwait(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	sleep_til_irq(gth);
	/* mwait is a three-byte instruction (0f 01 c9). */
	vm_tf->tf_rip += 3;
	return TRUE;
}

/* Is this a vmm specific thing?  or generic?
 *
 * what do we do when we want to kill the vm?  what are our other options? */
bool handle_vmexit(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	switch (vm_tf->tf_exit_reason) {
	case EXIT_REASON_CPUID:
		return handle_cpuid(gth);
	case EXIT_REASON_EPT_VIOLATION:
		return handle_ept_fault(gth);
	case EXIT_REASON_VMCALL:
		return handle_vmcall(gth);
	case EXIT_REASON_IO_INSTRUCTION:
		return handle_io(gth);
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_MSR_READ:
		return handle_msr(gth);
	case EXIT_REASON_APIC_ACCESS:
		return handle_apic_access(gth);
	case EXIT_REASON_HLT:
		return handle_halt(gth);
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return handle_mwait(gth);
	case EXIT_REASON_EXTERNAL_INTERRUPT:
	case EXIT_REASON_APIC_WRITE:
		/* TODO: just ignore these? */
		return TRUE;
	default:
		fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
		        vm_tf->tf_exit_reason);
		fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
		        vm_tf->tf_exit_reason);
		return FALSE;
	}
}