vmm: Let the VMM control mwait vmexits (XCC)
[akaros.git] / user / vmm / vmexit.c
1 /* Copyright (c) 2015-2016 Google Inc.
2  * See LICENSE for details. */
3
4 #include <parlib/common.h>
5 #include <vmm/virtio.h>
6 #include <vmm/virtio_mmio.h>
7 #include <vmm/virtio_ids.h>
8 #include <vmm/virtio_config.h>
9 #include <vmm/vmm.h>
10 #include <parlib/arch/trap.h>
11 #include <parlib/bitmask.h>
12 #include <stdio.h>
13
14 static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
15 {
16         return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
17 }
18
19 static bool rvi_is_set(struct guest_thread *gth)
20 {
21         uint8_t rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
22
23         return rvi != 0;
24 }
25
26 /* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
27  * vmm_interrupt_guest(). */
28 static void sleep_til_irq(struct guest_thread *gth)
29 {
30         struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
31
32         /* The invariant is that if an IRQ is posted, but not delivered, we will not
33          * sleep.  Anyone who posts an IRQ must signal after setting it.
34          * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
35          * posting, we'll need to revist this.
36          *
37          * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
38          * possible that the hardware attempted to post the interrupt.  In SDM
39          * parlance, the processor could have "recognized" the virtual IRQ, but not
40          * delivered it yet.  This could happen if the guest had executed "sti", but
41          * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
42          * ("sti blocking").  Then the guest executes "hlt", and vmexits.
43          * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
44          * least to the vector we just sent, but possibly to a greater vector if
45          * multiple were sent.  RVI should only be cleared after virtual IRQs were
46          * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
47          * suffice.
48          *
49          * Generally, we should also check GUEST_INTERRUPTIBILITY_INFO to see if
50          * there's some reason to not deliver the interrupt and check things like
51          * the VPPR (priority register).  But since we're emulating a halt, mwait,
52          * or something else that needs to be woken by an IRQ, we can ignore that
53          * and just wake them up.  Note that we won't actually deliver the IRQ,
54          * we'll just restart the guest and the hardware will deliver the virtual
55          * IRQ at the appropriate time.  So in the event that something weird
56          * happens, the halt/mwait just returns spuriously.
57          *
58          * The more traditional race here is if the halt starts concurrently with
59          * the post; that's why we sync with the mutex to make sure there is an
60          * ordering between the actual halt (this function) and the posting. */
61         uth_mutex_lock(gth->halt_mtx);
62         while (!(pir_notif_is_set(gpci) || rvi_is_set(gth)))
63                 uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
64         uth_mutex_unlock(gth->halt_mtx);
65 }
66
/* Level numbers (taken from the guest's RCX) that handle_cpuid() answers for
 * CPUID leaf 0x0B. */
enum {
	CPUID_0B_LEVEL_SMT = 0,
	CPUID_0B_LEVEL_CORE = 1,
};
71
72 static bool handle_cpuid(struct guest_thread *gth)
73 {
74         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
75         struct virtual_machine *vm = gth_to_vm(gth);
76         uint32_t level = vm_tf->tf_rcx & 0x0F;
77
78         if (vm_tf->tf_rax != 0x0B)
79                 return FALSE;
80
81         vm_tf->tf_rip += 2;
82         vm_tf->tf_rax = 0;
83         vm_tf->tf_rbx = 0;
84         vm_tf->tf_rcx = level;
85         vm_tf->tf_rdx = gth->gpc_id;
86         if (level == CPUID_0B_LEVEL_SMT) {
87                 vm_tf->tf_rax = 0;
88                 vm_tf->tf_rbx = 1;
89                 vm_tf->tf_rcx |= ((level + 1) << 8);
90         }
91         if (level == CPUID_0B_LEVEL_CORE) {
92                 uint32_t shift = LOG2_UP(vm->nr_gpcs);
93
94                 if (shift > 0x1F)
95                         shift = 0x1F;
96                 vm_tf->tf_rax = shift;
97                 vm_tf->tf_rbx = vm->nr_gpcs;
98                 vm_tf->tf_rcx |= ((level + 1) << 8);
99         }
100
101         return TRUE;
102 }
103
104 static bool handle_ept_fault(struct guest_thread *gth)
105 {
106         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
107         struct virtual_machine *vm = gth_to_vm(gth);
108         uint64_t gpa, *regp;
109         uint8_t regx;
110         int store, size;
111         int advance;
112         int ret;
113
114         if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
115                 ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0, 0, 0);
116                 if (ret <= 0)
117                         panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
118                               ret);
119                 return TRUE;
120         }
121         ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
122
123         if (ret < 0)
124                 return FALSE;
125         if (ret == VM_PAGE_FAULT) {
126                 /* We were unable to translate RIP due to an ept fault */
127                 vm_tf->tf_trap_inject = VM_TRAP_VALID
128                                       | VM_TRAP_ERROR_CODE
129                                       | VM_TRAP_HARDWARE
130                                       | HW_TRAP_PAGE_FAULT;
131                 return TRUE;
132         }
133
134         assert(size >= 0);
135         /* TODO use helpers for some of these addr checks.  the fee/fec ones might
136          * be wrong too. */
137         for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
138                 if (vm->virtio_mmio_devices[i] == NULL)
139                         continue;
140                 if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
141                         continue;
142                 /* TODO: can the guest cause us to spawn off infinite threads? */
143                 if (store)
144                         virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
145                                        (uint32_t *)regp);
146                 else
147                         *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
148                 vm_tf->tf_rip += advance;
149                 return TRUE;
150         }
151         if (PG_ADDR(gpa) == 0xfec00000) {
152                 do_ioapic(gth, gpa, regx, regp, store);
153         } else if (PG_ADDR(gpa) == 0) {
154                 memmove(regp, &vm->low4k[gpa], size);
155         } else {
156                 fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
157                 fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
158                                 vm_tf->tf_exit_reason);
159                 fprintf(stderr, "Returning 0xffffffff\n");
160                 showstatus(stderr, gth);
161                 /* Just fill the whole register for now. */
162                 *regp = (uint64_t) -1;
163                 return FALSE;
164         }
165         vm_tf->tf_rip += advance;
166         return TRUE;
167 }
168
169 static bool handle_vmcall_printc(struct guest_thread *gth)
170 {
171         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
172         uint8_t byte;
173
174         byte = vm_tf->tf_rdi;
175         printf("%c", byte);
176         if (byte == '\n')
177                 printf("%c", '%');
178         fflush(stdout);
179         return TRUE;
180 }
181
182 static bool handle_vmcall_smpboot(struct guest_thread *gth)
183 {
184         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
185         struct vm_trapframe *vm_tf_ap;
186         struct virtual_machine *vm = gth_to_vm(gth);
187         int cur_pcores = vm->up_gpcs;
188
189         /* Check if we're guest pcore 0. Only the BSP is allowed to start APs. */
190         if (vm_tf->tf_guest_pcoreid != 0) {
191                 fprintf(stderr,
192                         "Only guest pcore 0 is allowed to start APs. core was %ld\n",
193                         vm_tf->tf_guest_pcoreid);
194                 return FALSE;
195         }
196
197         /* Check if we've reached the maximum, if yes, blow out. */
198         if (vm->nr_gpcs == cur_pcores) {
199                 fprintf(stderr,
200                         "guest tried to start up too many cores. max was %ld, current up %ld\n",
201                         vm->nr_gpcs, cur_pcores);
202                 return FALSE;
203         }
204
205         /* Start up secondary core. */
206         vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
207         /* We use the BSP's CR3 for now. This should be fine because they
208          * change it later anyway. */
209         vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
210
211         /* Starting RIP is passed in via rdi. */
212         vm_tf_ap->tf_rip = vm_tf->tf_rdi;
213
214         /* Starting RSP is passed in via rsi. */
215         vm_tf_ap->tf_rsp = vm_tf->tf_rsi;
216
217         vm->up_gpcs++;
218
219         start_guest_thread(gpcid_to_gth(vm, cur_pcores));
220
221         return TRUE;
222 }
223
224 static bool handle_vmcall(struct guest_thread *gth)
225 {
226         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
227         struct virtual_machine *vm = gth_to_vm(gth);
228         bool retval = FALSE;
229
230         if (vm->vmcall)
231                 return vm->vmcall(gth, vm_tf);
232
233         switch (vm_tf->tf_rax) {
234                 case VMCALL_PRINTC:
235                         retval = handle_vmcall_printc(gth);
236                         break;
237                 case VMCALL_SMPBOOT:
238                         retval = handle_vmcall_smpboot(gth);
239                         break;
240         }
241
242         if (retval)
243                 vm_tf->tf_rip += 3;
244
245         return retval;
246 }
247
248 static bool handle_io(struct guest_thread *gth)
249 {
250         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
251         int ret = io(gth);
252
253         if (ret < 0)
254                 return FALSE;
255         if (ret == VM_PAGE_FAULT) {
256                 /* We were unable to translate RIP due to an ept fault */
257                 vm_tf->tf_trap_inject = VM_TRAP_VALID
258                                       | VM_TRAP_ERROR_CODE
259                                       | VM_TRAP_HARDWARE
260                                       | HW_TRAP_PAGE_FAULT;
261         }
262         return TRUE;
263 }
264
265 static bool handle_msr(struct guest_thread *gth)
266 {
267         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
268
269         if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
270                 /* Use event injection through vmctl to send a general protection fault
271                  * vmctl.interrupt gets written to the VM-Entry Interruption-Information
272                  * Field by vmx */
273                 vm_tf->tf_trap_inject = VM_TRAP_VALID
274                                       | VM_TRAP_ERROR_CODE
275                                       | VM_TRAP_HARDWARE
276                                       | HW_TRAP_GP_FAULT;
277         } else {
278                 vm_tf->tf_rip += 2;
279         }
280         return TRUE;
281 }
282
283 static bool handle_apic_access(struct guest_thread *gth)
284 {
285         uint64_t gpa, *regp;
286         uint8_t regx;
287         int store, size;
288         int advance;
289         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
290
291         if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
292                 return FALSE;
293         if (__apic_access(gth, gpa, regx, regp, store))
294                 return FALSE;
295         vm_tf->tf_rip += advance;
296         return TRUE;
297 }
298
299 static bool handle_halt(struct guest_thread *gth)
300 {
301         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
302         struct virtual_machine *vm = gth_to_vm(gth);
303
304         if (vm->halt_exit)
305                 return FALSE;
306         /* It's possible the guest disabled IRQs and halted, perhaps waiting on an
307          * NMI or something.  If we need to support that, we can change this.  */
308         sleep_til_irq(gth);
309         vm_tf->tf_rip += 1;
310         return TRUE;
311 }
312
313 /* The guest is told (via cpuid) that there is no monitor/mwait.  Callers of
314  * mwait are paravirtualized halts.
315  *
316  * We don't support monitor/mwait in software, so if they tried to mwait
317  * without break-on-interrupt and with interrupts disabled, they'll never
318  * wake up.  So we'll always break on interrupt. */
319 static bool handle_mwait(struct guest_thread *gth)
320 {
321         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
322         struct virtual_machine *vm = gth_to_vm(gth);
323
324         sleep_til_irq(gth);
325         vm_tf->tf_rip += 3;
326         return TRUE;
327 }
328
329 /* Is this a vmm specific thing?  or generic?
330  *
331  * what do we do when we want to kill the vm?  what are our other options? */
332 bool handle_vmexit(struct guest_thread *gth)
333 {
334         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
335
336         switch (vm_tf->tf_exit_reason) {
337         case EXIT_REASON_CPUID:
338                 return handle_cpuid(gth);
339         case EXIT_REASON_EPT_VIOLATION:
340                 return handle_ept_fault(gth);
341         case EXIT_REASON_VMCALL:
342                 return handle_vmcall(gth);
343         case EXIT_REASON_IO_INSTRUCTION:
344                 return handle_io(gth);
345         case EXIT_REASON_MSR_WRITE:
346         case EXIT_REASON_MSR_READ:
347                 return handle_msr(gth);
348         case EXIT_REASON_APIC_ACCESS:
349                 return handle_apic_access(gth);
350         case EXIT_REASON_HLT:
351                 return handle_halt(gth);
352         case EXIT_REASON_MWAIT_INSTRUCTION:
353                 return handle_mwait(gth);
354         case EXIT_REASON_EXTERNAL_INTERRUPT:
355         case EXIT_REASON_APIC_WRITE:
356                 /* TODO: just ignore these? */
357                 return TRUE;
358         default:
359                 fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
360                         vm_tf->tf_exit_reason);
361                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
362                         vm_tf->tf_exit_reason);
363                 return FALSE;
364         }
365 }