Update to linuxemu syscall structure
[akaros.git] / user / vmm / vmexit.c
1 /* Copyright (c) 2015-2016 Google Inc.
2  * See LICENSE for details. */
3
4 #include <parlib/common.h>
5 #include <vmm/virtio.h>
6 #include <vmm/virtio_mmio.h>
7 #include <vmm/virtio_ids.h>
8 #include <vmm/virtio_config.h>
9 #include <vmm/vmm.h>
10 #include <parlib/arch/trap.h>
11 #include <parlib/bitmask.h>
12 #include <stdio.h>
13
14 static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
15 {
16         return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
17 }
18
19 static bool rvi_is_set(struct guest_thread *gth)
20 {
21         uint8_t rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
22
23         return rvi != 0;
24 }
25
26 /* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
27  * vmm_interrupt_guest(). */
28 static void sleep_til_irq(struct guest_thread *gth)
29 {
30         struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
31
32         /* The invariant is that if an IRQ is posted, but not delivered, we will not
33          * sleep.  Anyone who posts an IRQ must signal after setting it.
34          * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
35          * posting, we'll need to revist this.
36          *
37          * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
38          * possible that the hardware attempted to post the interrupt.  In SDM
39          * parlance, the processor could have "recognized" the virtual IRQ, but not
40          * delivered it yet.  This could happen if the guest had executed "sti", but
41          * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
42          * ("sti blocking").  Then the guest executes "hlt", and vmexits.
43          * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
44          * least to the vector we just sent, but possibly to a greater vector if
45          * multiple were sent.  RVI should only be cleared after virtual IRQs were
46          * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
47          * suffice.
48          *
49          * Generally, we should also check GUEST_INTERRUPTIBILITY_INFO to see if
50          * there's some reason to not deliver the interrupt and check things like
51          * the VPPR (priority register).  But since we're emulating a halt, mwait,
52          * or something else that needs to be woken by an IRQ, we can ignore that
53          * and just wake them up.  Note that we won't actually deliver the IRQ,
54          * we'll just restart the guest and the hardware will deliver the virtual
55          * IRQ at the appropriate time.  So in the event that something weird
56          * happens, the halt/mwait just returns spuriously.
57          *
58          * The more traditional race here is if the halt starts concurrently with
59          * the post; that's why we sync with the mutex to make sure there is an
60          * ordering between the actual halt (this function) and the posting. */
61         uth_mutex_lock(gth->halt_mtx);
62         while (!(pir_notif_is_set(gpci) || rvi_is_set(gth)))
63                 uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
64         uth_mutex_unlock(gth->halt_mtx);
65 }
66
/* Sub-leaf (level) numbers of CPUID leaf 0x0B that handle_cpuid() answers. */
enum {
        CPUID_0B_LEVEL_SMT = 0,
        CPUID_0B_LEVEL_CORE = 1,
};
71
72 static bool handle_cpuid(struct guest_thread *gth)
73 {
74         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
75         struct virtual_machine *vm = gth_to_vm(gth);
76         uint32_t level = vm_tf->tf_rcx & 0x0F;
77
78         if (vm_tf->tf_rax != 0x0B)
79                 return FALSE;
80
81         vm_tf->tf_rip += 2;
82         vm_tf->tf_rax = 0;
83         vm_tf->tf_rbx = 0;
84         vm_tf->tf_rcx = level;
85         vm_tf->tf_rdx = gth->gpc_id;
86         if (level == CPUID_0B_LEVEL_SMT) {
87                 vm_tf->tf_rax = 0;
88                 vm_tf->tf_rbx = 1;
89                 vm_tf->tf_rcx |= ((level + 1) << 8);
90         }
91         if (level == CPUID_0B_LEVEL_CORE) {
92                 uint32_t shift = LOG2_UP(vm->nr_gpcs);
93
94                 if (shift > 0x1F)
95                         shift = 0x1F;
96                 vm_tf->tf_rax = shift;
97                 vm_tf->tf_rbx = vm->nr_gpcs;
98                 vm_tf->tf_rcx |= ((level + 1) << 8);
99         }
100
101         return TRUE;
102 }
103
104 static bool handle_ept_fault(struct guest_thread *gth)
105 {
106         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
107         struct virtual_machine *vm = gth_to_vm(gth);
108         uint64_t gpa, *regp;
109         uint8_t regx;
110         int store, size;
111         int advance;
112         int ret;
113
114         if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
115                 ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0, 0, 0);
116                 if (ret <= 0)
117                         panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
118                               ret);
119                 return TRUE;
120         }
121         ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
122
123         if (ret < 0)
124                 return FALSE;
125         if (ret == VM_PAGE_FAULT) {
126                 /* We were unable to translate RIP due to an ept fault */
127                 vm_tf->tf_trap_inject = VM_TRAP_VALID
128                                       | VM_TRAP_ERROR_CODE
129                                       | VM_TRAP_HARDWARE
130                                       | HW_TRAP_PAGE_FAULT;
131                 return TRUE;
132         }
133
134         assert(size >= 0);
135         /* TODO use helpers for some of these addr checks.  the fee/fec ones might
136          * be wrong too. */
137         for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
138                 if (vm->virtio_mmio_devices[i] == NULL)
139                         continue;
140                 if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
141                         continue;
142                 /* TODO: can the guest cause us to spawn off infinite threads? */
143                 if (store)
144                         virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
145                                        (uint32_t *)regp);
146                 else
147                         *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
148                 vm_tf->tf_rip += advance;
149                 return TRUE;
150         }
151         if (PG_ADDR(gpa) == 0xfec00000) {
152                 do_ioapic(gth, gpa, regx, regp, store);
153         } else if (PG_ADDR(gpa) == 0) {
154                 memmove(regp, &vm->low4k[gpa], size);
155         } else {
156                 fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
157                 fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
158                                 vm_tf->tf_exit_reason);
159                 fprintf(stderr, "Returning 0xffffffff\n");
160                 showstatus(stderr, gth);
161                 /* Just fill the whole register for now. */
162                 *regp = (uint64_t) -1;
163                 return FALSE;
164         }
165         vm_tf->tf_rip += advance;
166         return TRUE;
167 }
168
169 static bool handle_vmcall_printc(struct guest_thread *gth)
170 {
171         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
172         uint8_t byte;
173
174         byte = vm_tf->tf_rdi;
175         printf("%c", byte);
176         if (byte == '\n')
177                 printf("%c", '%');
178         fflush(stdout);
179         return TRUE;
180 }
181
182 static bool handle_vmcall_smpboot(struct guest_thread *gth)
183 {
184         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
185         struct vm_trapframe *vm_tf_ap;
186         struct virtual_machine *vm = gth_to_vm(gth);
187         int cur_pcores = vm->up_gpcs;
188
189         /* Check if we're guest pcore 0. Only the BSP is allowed to start APs. */
190         if (vm_tf->tf_guest_pcoreid != 0) {
191                 fprintf(stderr,
192                         "Only guest pcore 0 is allowed to start APs. core was %ld\n",
193                         vm_tf->tf_guest_pcoreid);
194                 return FALSE;
195         }
196
197         /* Check if we've reached the maximum, if yes, blow out. */
198         if (vm->nr_gpcs == cur_pcores) {
199                 fprintf(stderr,
200                         "guest tried to start up too many cores. max was %ld, current up %ld\n",
201                         vm->nr_gpcs, cur_pcores);
202                 return FALSE;
203         }
204
205         /* Start up secondary core. */
206         vm_tf_ap = gth_to_vmtf(vm->gths[cur_pcores]);
207         /* We use the BSP's CR3 for now. This should be fine because they
208          * change it later anyway. */
209         vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
210
211         /* Starting RIP is passed in via rdi. */
212         vm_tf_ap->tf_rip = vm_tf->tf_rdi;
213
214         /* Starting RSP is passed in via rsi. */
215         vm_tf_ap->tf_rsp = vm_tf->tf_rsi;
216
217         vm->up_gpcs++;
218
219         start_guest_thread(vm->gths[cur_pcores]);
220
221         return TRUE;
222 }
223
224 static bool handle_vmcall(struct guest_thread *gth)
225 {
226         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
227         bool retval = FALSE;
228
229         if (gth->vmcall)
230                 return gth->vmcall(gth, vm_tf);
231
232         switch (vm_tf->tf_rax) {
233                 case VMCALL_PRINTC:
234                         retval = handle_vmcall_printc(gth);
235                         break;
236                 case VMCALL_SMPBOOT:
237                         retval = handle_vmcall_smpboot(gth);
238                         break;
239         }
240
241         if (retval)
242                 vm_tf->tf_rip += 3;
243
244         return retval;
245 }
246
247 static bool handle_io(struct guest_thread *gth)
248 {
249         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
250         int ret = io(gth);
251
252         if (ret < 0)
253                 return FALSE;
254         if (ret == VM_PAGE_FAULT) {
255                 /* We were unable to translate RIP due to an ept fault */
256                 vm_tf->tf_trap_inject = VM_TRAP_VALID
257                                       | VM_TRAP_ERROR_CODE
258                                       | VM_TRAP_HARDWARE
259                                       | HW_TRAP_PAGE_FAULT;
260         }
261         return TRUE;
262 }
263
264 static bool handle_msr(struct guest_thread *gth)
265 {
266         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
267
268         if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
269                 /* Use event injection through vmctl to send a general protection fault
270                  * vmctl.interrupt gets written to the VM-Entry Interruption-Information
271                  * Field by vmx */
272                 vm_tf->tf_trap_inject = VM_TRAP_VALID
273                                       | VM_TRAP_ERROR_CODE
274                                       | VM_TRAP_HARDWARE
275                                       | HW_TRAP_GP_FAULT;
276         } else {
277                 vm_tf->tf_rip += 2;
278         }
279         return TRUE;
280 }
281
282 static bool handle_apic_access(struct guest_thread *gth)
283 {
284         uint64_t gpa, *regp;
285         uint8_t regx;
286         int store, size;
287         int advance;
288         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
289
290         if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
291                 return FALSE;
292         if (__apic_access(gth, gpa, regx, regp, store))
293                 return FALSE;
294         vm_tf->tf_rip += advance;
295         return TRUE;
296 }
297
298 static bool handle_halt(struct guest_thread *gth)
299 {
300         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
301
302         if (gth->halt_exit)
303                 return FALSE;
304         /* It's possible the guest disabled IRQs and halted, perhaps waiting on an
305          * NMI or something.  If we need to support that, we can change this.  */
306         sleep_til_irq(gth);
307         vm_tf->tf_rip += 1;
308         return TRUE;
309 }
310
311 /* Is this a vmm specific thing?  or generic?
312  *
313  * what do we do when we want to kill the vm?  what are our other options? */
314 bool handle_vmexit(struct guest_thread *gth)
315 {
316         struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
317
318         switch (vm_tf->tf_exit_reason) {
319         case EXIT_REASON_CPUID:
320                 return handle_cpuid(gth);
321         case EXIT_REASON_EPT_VIOLATION:
322                 return handle_ept_fault(gth);
323         case EXIT_REASON_VMCALL:
324                 return handle_vmcall(gth);
325         case EXIT_REASON_IO_INSTRUCTION:
326                 return handle_io(gth);
327         case EXIT_REASON_MSR_WRITE:
328         case EXIT_REASON_MSR_READ:
329                 return handle_msr(gth);
330         case EXIT_REASON_APIC_ACCESS:
331                 return handle_apic_access(gth);
332         case EXIT_REASON_HLT:
333                 return handle_halt(gth);
334         case EXIT_REASON_EXTERNAL_INTERRUPT:
335         case EXIT_REASON_APIC_WRITE:
336                 /* TODO: just ignore these? */
337                 return TRUE;
338         default:
339                 fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
340                         vm_tf->tf_exit_reason);
341                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
342                         vm_tf->tf_exit_reason);
343                 return FALSE;
344         }
345 }