vmm: Check VPPR and interrupt-window-blocking with RVI
[akaros.git] / user / vmm / vmexit.c
/* Copyright (c) 2015-2016 Google Inc.
 * See LICENSE for details. */

#include <parlib/common.h>
#include <vmm/virtio.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/mmio.h>
#include <vmm/vmm.h>
#include <parlib/arch/trap.h>
#include <parlib/bitmask.h>
#include <stdio.h>

static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
{
        return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
}

/* Returns true if the hardware will trigger an IRQ for the guest.  These
 * virtual IRQs are only processed under certain situations, like vmentry, and
 * posted IRQs.  See 'Evaluation of Pending Virtual Interrupts' in the SDM. */
static bool virtual_irq_is_pending(struct guest_thread *gth)
{
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
        uint8_t rvi, vppr;

        /* Currently, the lower 4 bits are various ways to block IRQs, e.g.
         * blocking by STI.  The other bits must be 0.  Presumably any new bits
         * are types of IRQ blocking. */
        if (gth_to_vmtf(gth)->tf_intrinfo1)
                return false;
        vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
        rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
        return (rvi & 0xf0) > (vppr & 0xf0);
}
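
/* Worked example (illustrative, not from the original source): the comparison
 * above is on 16-vector priority classes.  If the guest's VPPR is 0x30 and RVI
 * is 0x33, both are in class 3, so (0x30 > 0x30) is false and the IRQ is not
 * yet deliverable.  If another post raises RVI to 0x41, class 4 > class 3, so
 * this returns true and the halter will be woken. */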

/* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
 * vmm_interrupt_guest(). */
static void sleep_til_irq(struct guest_thread *gth)
{
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

        /* The invariant is that if an IRQ is posted, but not delivered, we will not
         * sleep.  Anyone who posts an IRQ must signal after setting it.
         * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
         * posting, we'll need to revisit this.
         *
         * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
         * possible that the hardware attempted to post the interrupt.  In SDM
         * parlance, the processor could have "recognized" the virtual IRQ, but not
         * delivered it yet.  This could happen if the guest had executed "sti", but
         * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
         * ("sti blocking").  Then the guest executes "hlt", and vmexits.
         * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
         * least to the vector we just sent, but possibly to a greater vector if
         * multiple were sent.  RVI should only be cleared after virtual IRQs were
         * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
         * suffice.
         *
         * Note that when we see a notif or pending virtual IRQ, we don't actually
         * deliver the IRQ, we'll just restart the guest and the hardware will
         * deliver the virtual IRQ at the appropriate time.
         *
         * The more traditional race here is if the halt starts concurrently with
         * the post; that's why we sync with the mutex to make sure there is an
         * ordering between the actual halt (this function) and the posting. */
        uth_mutex_lock(gth->halt_mtx);
        while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
                uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
        uth_mutex_unlock(gth->halt_mtx);
}
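
/* For reference, a minimal sketch of the posting side that this function pairs
 * with.  The real logic lives in vmm_interrupt_guest(); the exact calls and
 * ordering below are only meant to illustrate the invariant described above,
 * not to reproduce that function:
 *
 *      SET_BITMASK_BIT(gpci->posted_irq_desc, vector);
 *      SET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
 *      uth_mutex_lock(gth->halt_mtx);
 *      uth_cond_var_broadcast(gth->halt_cv);
 *      uth_mutex_unlock(gth->halt_mtx);
 *
 * The post happens before the signal, so a halter that holds halt_mtx and
 * rechecks the notif / pending-IRQ state cannot miss a wakeup. */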

enum {
        CPUID_0B_LEVEL_SMT = 0,
        CPUID_0B_LEVEL_CORE
};

static bool handle_cpuid(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);
        uint32_t level = vm_tf->tf_rcx & 0x0F;

        if (vm_tf->tf_rax != 0x0B)
                return FALSE;

        /* cpuid is a two-byte instruction (0f a2); skip past it. */
        vm_tf->tf_rip += 2;
        vm_tf->tf_rax = 0;
        vm_tf->tf_rbx = 0;
        vm_tf->tf_rcx = level;
        vm_tf->tf_rdx = gth->gpc_id;
        if (level == CPUID_0B_LEVEL_SMT) {
                vm_tf->tf_rax = 0;
                vm_tf->tf_rbx = 1;
                vm_tf->tf_rcx |= ((level + 1) << 8);
        }
        if (level == CPUID_0B_LEVEL_CORE) {
                uint32_t shift = LOG2_UP(vm->nr_gpcs);

                if (shift > 0x1F)
                        shift = 0x1F;
                vm_tf->tf_rax = shift;
                vm_tf->tf_rbx = vm->nr_gpcs;
                vm_tf->tf_rcx |= ((level + 1) << 8);
        }

        return TRUE;
}
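
/* Worked example (illustrative): for a VM with nr_gpcs = 4, a guest on gpc 2
 * that executes cpuid with EAX = 0x0B sees, for ECX = 0 (SMT level), EAX = 0
 * (shift), EBX = 1 (one thread per core), ECX = 0 | (1 << 8), EDX = 2 (its
 * gpc id).  For ECX = 1 (core level), EAX = LOG2_UP(4) = 2, EBX = 4,
 * ECX = 1 | (2 << 8), EDX = 2. */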

static bool handle_ept_fault(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);
        uint64_t gpa, *regp;
        uint8_t regx;
        int store, size;
        int advance;
        int ret;

        if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
                ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0, 0, 0);
                if (ret <= 0)
                        panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
                              ret);
                return TRUE;
        }
        ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);

        if (ret < 0)
                return FALSE;
        if (ret == VM_PAGE_FAULT) {
                /* We were unable to translate RIP due to an ept fault */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_PAGE_FAULT;
                return TRUE;
        }

        assert(size >= 0);
        /* TODO use helpers for some of these addr checks.  the fee/fec ones might
         * be wrong too. */
        for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
                if (vm->virtio_mmio_devices[i] == NULL)
                        continue;
                if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
                        continue;
                /* TODO: can the guest cause us to spawn off infinite threads? */
                if (store)
                        virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
                                       (uint32_t *)regp);
                else
                        *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
                vm_tf->tf_rip += advance;
                return TRUE;
        }
        if (PG_ADDR(gpa) == 0xfec00000) {
                do_ioapic(gth, gpa, regx, regp, store);
        } else if (PG_ADDR(gpa) == 0) {
                memmove(regp, &vm->low4k[gpa], size);
        } else {
                fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
                fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
                        vm_tf->tf_exit_reason);
                fprintf(stderr, "Returning 0xffffffff\n");
                showstatus(stderr, gth);
                /* Just fill the whole register for now. */
                *regp = (uint64_t) -1;
                return FALSE;
        }
        vm_tf->tf_rip += advance;
        return TRUE;
}
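
/* Illustrative example (the device address is chosen by the VMM at setup, not
 * fixed here): if a virtio_mmio_device was registered at addr 0x100001000 and
 * the guest does a 4-byte load from gpa 0x100001070, PG_ADDR(gpa) matches the
 * device, store is false, so the value comes from virtio_mmio_rd() and RIP
 * advances past the faulting instruction. */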

static bool handle_vmcall_printc(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        uint8_t byte;

        byte = vm_tf->tf_rdi;
        printf("%c", byte);
        if (byte == '\n')
                printf("%c", '%');
        fflush(stdout);
        return TRUE;
}

static bool handle_vmcall_smpboot(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct vm_trapframe *vm_tf_ap;
        struct virtual_machine *vm = gth_to_vm(gth);
        int cur_pcores = vm->up_gpcs;

        /* Check if we're guest pcore 0.  Only the BSP is allowed to start APs. */
        if (vm_tf->tf_guest_pcoreid != 0) {
                fprintf(stderr,
                        "Only guest pcore 0 is allowed to start APs. core was %ld\n",
                        vm_tf->tf_guest_pcoreid);
                return FALSE;
        }

        /* Check if we've reached the maximum; if so, bail out. */
        if (vm->nr_gpcs == cur_pcores) {
                fprintf(stderr,
                        "guest tried to start up too many cores. max was %ld, current up %ld\n",
                        vm->nr_gpcs, cur_pcores);
                return FALSE;
        }

        /* Start up the secondary core. */
        vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
        /* We use the BSP's CR3 for now.  This should be fine because they
         * change it later anyway. */
        vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;

        /* Starting RIP is passed in via rdi. */
        vm_tf_ap->tf_rip = vm_tf->tf_rdi;

        /* Starting RSP is passed in via rsi. */
        vm_tf_ap->tf_rsp = vm_tf->tf_rsi;

        vm->up_gpcs++;

        start_guest_thread(gpcid_to_gth(vm, cur_pcores));

        return TRUE;
}
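
/* From the guest's point of view (an illustrative sketch, not guest code from
 * this repo; ap_entry_rip and ap_stack_top are placeholder names), the BSP
 * brings up the next AP roughly like this:
 *
 *      movq    $VMCALL_SMPBOOT, %rax
 *      movq    $ap_entry_rip, %rdi
 *      movq    $ap_stack_top, %rsi
 *      vmcall
 *
 * The AP starts at the given RIP/RSP with the BSP's CR3, and handle_vmcall()
 * advances the BSP's RIP past the 3-byte vmcall on success. */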

static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        /* Report the TSC frequency back to the guest, in KHz, via rax. */
        vm_tf->tf_rax = get_tsc_freq() / 1000;
        return TRUE;
}

static bool handle_vmcall(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);
        bool retval = FALSE;

        if (vm->vmcall)
                return vm->vmcall(gth, vm_tf);

        switch (vm_tf->tf_rax) {
        case VMCALL_PRINTC:
                retval = handle_vmcall_printc(gth);
                break;
        case VMCALL_SMPBOOT:
                retval = handle_vmcall_smpboot(gth);
                break;
        case VMCALL_GET_TSCFREQ:
                retval = handle_vmcall_get_tscfreq(gth);
                break;
        }

        /* On success, skip past the three-byte vmcall instruction (0f 01 c1). */
        if (retval)
                vm_tf->tf_rip += 3;

        return retval;
}

static bool handle_io(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        int ret = io(gth);

        if (ret < 0)
                return FALSE;
        if (ret == VM_PAGE_FAULT) {
                /* We were unable to translate RIP due to an ept fault */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_PAGE_FAULT;
        }
        return TRUE;
}

static bool handle_msr(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
                /* Use event injection through vmctl to send a general protection
                 * fault.  vmctl.interrupt gets written to the VM-Entry
                 * Interruption-Information Field by vmx. */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_GP_FAULT;
        } else {
                /* rdmsr and wrmsr are both two-byte instructions (0f 32 / 0f 30). */
                vm_tf->tf_rip += 2;
        }
        return TRUE;
}

static bool handle_apic_access(struct guest_thread *gth)
{
        uint64_t gpa, *regp;
        uint8_t regx;
        int store, size;
        int advance;
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
                return FALSE;
        if (__apic_access(gth, gpa, regx, regp, store))
                return FALSE;
        vm_tf->tf_rip += advance;
        return TRUE;
}

static bool handle_halt(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);

        if (vm->halt_exit)
                return FALSE;
        /* It's possible the guest disabled IRQs and halted, perhaps waiting on an
         * NMI or something.  If we need to support that, we can change this. */
        sleep_til_irq(gth);
        /* hlt is a one-byte instruction (f4). */
        vm_tf->tf_rip += 1;
        return TRUE;
}

/* The guest is told (via cpuid) that there is no monitor/mwait.  Callers of
 * mwait are paravirtualized halts.
 *
 * We don't support monitor/mwait in software, so if they tried to mwait
 * without break-on-interrupt and with interrupts disabled, they'll never
 * wake up.  So we'll always break on interrupt. */
static bool handle_mwait(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        sleep_til_irq(gth);
        /* mwait is a three-byte instruction (0f 01 c9). */
        vm_tf->tf_rip += 3;
        return TRUE;
}

/* Is this a vmm specific thing?  or generic?
 *
 * what do we do when we want to kill the vm?  what are our other options? */
bool handle_vmexit(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        switch (vm_tf->tf_exit_reason) {
        case EXIT_REASON_CPUID:
                return handle_cpuid(gth);
        case EXIT_REASON_EPT_VIOLATION:
                return handle_ept_fault(gth);
        case EXIT_REASON_VMCALL:
                return handle_vmcall(gth);
        case EXIT_REASON_IO_INSTRUCTION:
                return handle_io(gth);
        case EXIT_REASON_MSR_WRITE:
        case EXIT_REASON_MSR_READ:
                return handle_msr(gth);
        case EXIT_REASON_APIC_ACCESS:
                return handle_apic_access(gth);
        case EXIT_REASON_HLT:
                return handle_halt(gth);
        case EXIT_REASON_MWAIT_INSTRUCTION:
                return handle_mwait(gth);
        case EXIT_REASON_EXTERNAL_INTERRUPT:
        case EXIT_REASON_APIC_WRITE:
                /* TODO: just ignore these? */
                return TRUE;
        default:
                fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
                        vm_tf->tf_exit_reason);
                fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
                        vm_tf->tf_exit_reason);
                return FALSE;
        }
}