Remove MWAIT functionality from the VM guest.
[akaros.git] / user / vmm / vmexit.c
1 /* Copyright (c) 2015-2016 Google Inc.
2  * See LICENSE for details. */
3
4 #include <parlib/common.h>
5 #include <vmm/virtio.h>
6 #include <vmm/virtio_mmio.h>
7 #include <vmm/virtio_ids.h>
8 #include <vmm/virtio_config.h>
9 #include <vmm/vmm.h>
10 #include <parlib/arch/trap.h>
11 #include <parlib/bitmask.h>
12 #include <stdio.h>
13
14 static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
15 {
16         return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
17 }
18
19 static bool rvi_is_set(struct guest_thread *gth)
20 {
21         uint8_t rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
22
23         return rvi != 0;
24 }
25
26 /* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
27  * vmm_interrupt_guest(). */
28 static void sleep_til_irq(struct guest_thread *gth)
29 {
30         struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
31
32         /* The invariant is that if an IRQ is posted, but not delivered, we will not
33          * sleep.  Anyone who posts an IRQ must signal after setting it.
34          * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
35          * posting, we'll need to revist this.
36          *
37          * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
38          * possible that the hardware attempted to post the interrupt.  In SDM
39          * parlance, the processor could have "recognized" the virtual IRQ, but not
40          * delivered it yet.  This could happen if the guest had executed "sti", but
41          * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
42          * ("sti blocking").  Then the guest executes "hlt", and vmexits.
43          * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
44          * least to the vector we just sent, but possibly to a greater vector if
45          * multiple were sent.  RVI should only be cleared after virtual IRQs were
46          * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
47          * suffice.
48          *
49          * Generally, we should also check GUEST_INTERRUPTIBILITY_INFO to see if
50          * there's some reason to not deliver the interrupt and check things like
51          * the VPPR (priority register).  But since we're emulating a halt, mwait,
52          * or something else that needs to be woken by an IRQ, we can ignore that
53          * and just wake them up.  Note that we won't actually deliver the IRQ,
54          * we'll just restart the guest and the hardware will deliver the virtual
55          * IRQ at the appropriate time.  So in the event that something weird
56          * happens, the halt/mwait just returns spuriously.
57          *
58          * The more traditional race here is if the halt starts concurrently with
59          * the post; that's why we sync with the mutex to make sure there is an
60          * ordering between the actual halt (this function) and the posting. */
61         uth_mutex_lock(gth->halt_mtx);
62         while (!(pir_notif_is_set(gpci) || rvi_is_set(gth)))
63                 uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
64         uth_mutex_unlock(gth->halt_mtx);
65 }
66
/* Sub-leaf (level) indices of CPUID leaf 0x0B that handle_cpuid() answers. */
enum {
	CPUID_0B_LEVEL_SMT = 0,
	CPUID_0B_LEVEL_CORE = 1,
};
71
72 static bool handle_cpuid(struct guest_thread *gth)
73 {
74         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
75         struct virtual_machine *vm = gth_to_vm(gth);
76         uint32_t level = vm_tf->tf_rcx & 0x0F;
77
78         if (vm_tf->tf_rax != 0x0B)
79                 return FALSE;
80
81         vm_tf->tf_rip += 2;
82         vm_tf->tf_rax = 0;
83         vm_tf->tf_rbx = 0;
84         vm_tf->tf_rcx = level;
85         vm_tf->tf_rdx = gth->gpc_id;
86         if (level == CPUID_0B_LEVEL_SMT) {
87                 vm_tf->tf_rax = 0;
88                 vm_tf->tf_rbx = 1;
89                 vm_tf->tf_rcx |= ((level + 1) << 8);
90         }
91         if (level == CPUID_0B_LEVEL_CORE) {
92                 uint32_t shift = LOG2_UP(vm->nr_gpcs);
93
94                 if (shift > 0x1F)
95                         shift = 0x1F;
96                 vm_tf->tf_rax = shift;
97                 vm_tf->tf_rbx = vm->nr_gpcs;
98                 vm_tf->tf_rcx |= ((level + 1) << 8);
99         }
100
101         return TRUE;
102 }
103
104 static bool handle_ept_fault(struct guest_thread *gth)
105 {
106         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
107         struct virtual_machine *vm = gth_to_vm(gth);
108         uint64_t gpa, *regp;
109         uint8_t regx;
110         int store, size;
111         int advance;
112
113         int ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
114
115         if (ret < 0)
116                 return FALSE;
117         if (ret == VM_PAGE_FAULT) {
118                 /* We were unable to translate RIP due to an ept fault */
119                 vm_tf->tf_trap_inject = VM_TRAP_VALID
120                                       | VM_TRAP_ERROR_CODE
121                                       | VM_TRAP_HARDWARE
122                                       | HW_TRAP_PAGE_FAULT;
123                 return TRUE;
124         }
125
126         assert(size >= 0);
127         /* TODO use helpers for some of these addr checks.  the fee/fec ones might
128          * be wrong too. */
129         for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
130                 if (vm->virtio_mmio_devices[i] == NULL)
131                         continue;
132                 if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
133                         continue;
134                 /* TODO: can the guest cause us to spawn off infinite threads? */
135                 if (store)
136                         virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
137                                        (uint32_t *)regp);
138                 else
139                         *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
140                 vm_tf->tf_rip += advance;
141                 return TRUE;
142         }
143         if (PG_ADDR(gpa) == 0xfec00000) {
144                 do_ioapic(gth, gpa, regx, regp, store);
145         } else if (PG_ADDR(gpa) == 0) {
146                 memmove(regp, &vm->low4k[gpa], size);
147         } else {
148                 fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
149                 fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
150                                 vm_tf->tf_exit_reason);
151                 fprintf(stderr, "Returning 0xffffffff\n");
152                 showstatus(stderr, gth);
153                 /* Just fill the whole register for now. */
154                 *regp = (uint64_t) -1;
155                 return FALSE;
156         }
157         vm_tf->tf_rip += advance;
158         return TRUE;
159 }
160
161 static bool handle_vmcall_printc(struct guest_thread *gth)
162 {
163         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
164         uint8_t byte;
165
166         byte = vm_tf->tf_rdi;
167         printf("%c", byte);
168         if (byte == '\n')
169                 printf("%c", '%');
170         fflush(stdout);
171         return TRUE;
172 }
173
174 static bool handle_vmcall_smpboot(struct guest_thread *gth)
175 {
176         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
177         struct vm_trapframe *vm_tf_ap;
178         struct virtual_machine *vm = gth_to_vm(gth);
179         int cur_pcores = vm->up_gpcs;
180
181         /* Check if we're guest pcore 0. Only the BSP is allowed to start APs. */
182         if (vm_tf->tf_guest_pcoreid != 0) {
183                 fprintf(stderr,
184                         "Only guest pcore 0 is allowed to start APs. core was %ld\n",
185                         vm_tf->tf_guest_pcoreid);
186                 return FALSE;
187         }
188
189         /* Check if we've reached the maximum, if yes, blow out. */
190         if (vm->nr_gpcs == cur_pcores) {
191                 fprintf(stderr,
192                         "guest tried to start up too many cores. max was %ld, current up %ld\n",
193                         vm->nr_gpcs, cur_pcores);
194                 return FALSE;
195         }
196
197         /* Start up secondary core. */
198         vm_tf_ap = gth_to_vmtf(vm->gths[cur_pcores]);
199         /* We use the BSP's CR3 for now. This should be fine because they
200          * change it later anyway. */
201         vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
202
203         /* Starting RIP is passed in via rdi. */
204         vm_tf_ap->tf_rip = vm_tf->tf_rdi;
205
206         /* Starting RSP is passed in via rsi. */
207         vm_tf_ap->tf_rsp = vm_tf->tf_rsi;
208
209         vm->up_gpcs++;
210
211         start_guest_thread(vm->gths[cur_pcores]);
212
213         return TRUE;
214 }
215
216 static bool handle_vmcall(struct guest_thread *gth)
217 {
218         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
219         bool retval = FALSE;
220
221         if (gth->vmcall)
222                 return gth->vmcall(gth, vm_tf);
223
224         switch (vm_tf->tf_rax) {
225                 case VMCALL_PRINTC:
226                         retval = handle_vmcall_printc(gth);
227                         break;
228                 case VMCALL_SMPBOOT:
229                         retval = handle_vmcall_smpboot(gth);
230                         break;
231         }
232
233         if (retval)
234                 vm_tf->tf_rip += 3;
235
236         return retval;
237 }
238
239 static bool handle_io(struct guest_thread *gth)
240 {
241         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
242         int ret = io(gth);
243
244         if (ret < 0)
245                 return FALSE;
246         if (ret == VM_PAGE_FAULT) {
247                 /* We were unable to translate RIP due to an ept fault */
248                 vm_tf->tf_trap_inject = VM_TRAP_VALID
249                                       | VM_TRAP_ERROR_CODE
250                                       | VM_TRAP_HARDWARE
251                                       | HW_TRAP_PAGE_FAULT;
252         }
253         return TRUE;
254 }
255
256 static bool handle_msr(struct guest_thread *gth)
257 {
258         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
259
260         if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
261                 /* Use event injection through vmctl to send a general protection fault
262                  * vmctl.interrupt gets written to the VM-Entry Interruption-Information
263                  * Field by vmx */
264                 vm_tf->tf_trap_inject = VM_TRAP_VALID
265                                       | VM_TRAP_ERROR_CODE
266                                       | VM_TRAP_HARDWARE
267                                       | HW_TRAP_GP_FAULT;
268         } else {
269                 vm_tf->tf_rip += 2;
270         }
271         return TRUE;
272 }
273
274 static bool handle_apic_access(struct guest_thread *gth)
275 {
276         uint64_t gpa, *regp;
277         uint8_t regx;
278         int store, size;
279         int advance;
280         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
281
282         if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
283                 return FALSE;
284         if (__apic_access(gth, gpa, regx, regp, store))
285                 return FALSE;
286         vm_tf->tf_rip += advance;
287         return TRUE;
288 }
289
290 static bool handle_halt(struct guest_thread *gth)
291 {
292         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
293
294         if (gth->halt_exit)
295                 return FALSE;
296         /* It's possible the guest disabled IRQs and halted, perhaps waiting on an
297          * NMI or something.  If we need to support that, we can change this.  */
298         sleep_til_irq(gth);
299         vm_tf->tf_rip += 1;
300         return TRUE;
301 }
302
303 /* Is this a vmm specific thing?  or generic?
304  *
305  * what do we do when we want to kill the vm?  what are our other options? */
306 bool handle_vmexit(struct guest_thread *gth)
307 {
308         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
309
310         switch (vm_tf->tf_exit_reason) {
311         case EXIT_REASON_CPUID:
312                 return handle_cpuid(gth);
313         case EXIT_REASON_EPT_VIOLATION:
314                 return handle_ept_fault(gth);
315         case EXIT_REASON_VMCALL:
316                 return handle_vmcall(gth);
317         case EXIT_REASON_IO_INSTRUCTION:
318                 return handle_io(gth);
319         case EXIT_REASON_MSR_WRITE:
320         case EXIT_REASON_MSR_READ:
321                 return handle_msr(gth);
322         case EXIT_REASON_APIC_ACCESS:
323                 return handle_apic_access(gth);
324         case EXIT_REASON_HLT:
325                 return handle_halt(gth);
326         case EXIT_REASON_EXTERNAL_INTERRUPT:
327         case EXIT_REASON_APIC_WRITE:
328                 /* TODO: just ignore these? */
329                 return TRUE;
330         default:
331                 fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
332                         vm_tf->tf_exit_reason);
333                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
334                         vm_tf->tf_exit_reason);
335                 return FALSE;
336         }
337 }