vmm: Export the TSC freq via a vmcall (XCC)
[akaros.git] / user / vmm / vmexit.c
/* Copyright (c) 2015-2016 Google Inc.
 * See LICENSE for details. */

#include <parlib/common.h>
#include <vmm/virtio.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/vmm.h>
#include <parlib/arch/trap.h>
#include <parlib/bitmask.h>
#include <stdio.h>

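/* Returns TRUE if the guest's posted interrupt descriptor has its
 * outstanding-notification bit set, i.e. an IRQ was posted to the guest but
 * has not been delivered yet. */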
static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
{
        return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
}

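/* RVI (Requesting Virtual Interrupt) is the low byte of the guest interrupt
 * status field; per the SDM it holds the vector of the highest-priority
 * virtual interrupt that has been recognized but not yet delivered. */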
static bool rvi_is_set(struct guest_thread *gth)
{
        uint8_t rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;

        return rvi != 0;
}

/* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
 * vmm_interrupt_guest(). */
static void sleep_til_irq(struct guest_thread *gth)
{
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

        /* The invariant is that if an IRQ is posted, but not delivered, we will not
         * sleep.  Anyone who posts an IRQ must signal after setting it.
         * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
         * posting, we'll need to revisit this.
         *
         * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
         * possible that the hardware attempted to post the interrupt.  In SDM
         * parlance, the processor could have "recognized" the virtual IRQ, but not
         * delivered it yet.  This could happen if the guest had executed "sti", but
         * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
         * ("sti blocking").  Then the guest executes "hlt", and vmexits.
         * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
         * least to the vector we just sent, but possibly to a greater vector if
         * multiple were sent.  RVI should only be cleared after virtual IRQs were
         * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
         * suffice.
         *
         * Generally, we should also check GUEST_INTERRUPTIBILITY_INFO to see if
         * there's some reason to not deliver the interrupt and check things like
         * the VPPR (priority register).  But since we're emulating a halt, mwait,
         * or something else that needs to be woken by an IRQ, we can ignore that
         * and just wake them up.  Note that we won't actually deliver the IRQ,
         * we'll just restart the guest and the hardware will deliver the virtual
         * IRQ at the appropriate time.  So in the event that something weird
         * happens, the halt/mwait just returns spuriously.
         *
         * The more traditional race here is if the halt starts concurrently with
         * the post; that's why we sync with the mutex to make sure there is an
         * ordering between the actual halt (this function) and the posting. */
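        /* For reference, the posting side has to do roughly the following (a
         * sketch; see vmm_interrupt_guest() for the authoritative version):
         *
         *      post the IRQ (PIR bit and OUTSTANDING_NOTIF);
         *      uth_mutex_lock(gth->halt_mtx);
         *      uth_cond_var_signal(gth->halt_cv);
         *      uth_mutex_unlock(gth->halt_mtx);
         */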
        uth_mutex_lock(gth->halt_mtx);
        while (!(pir_notif_is_set(gpci) || rvi_is_set(gth)))
                uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
        uth_mutex_unlock(gth->halt_mtx);
}

enum {
        CPUID_0B_LEVEL_SMT = 0,
        CPUID_0B_LEVEL_CORE
};

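/* CPUID leaf 0x0B is the extended topology enumeration leaf.  For each level
 * (subleaf selected by ECX), EAX[4:0] is the number of bits to shift the
 * x2APIC ID right to get the ID at the next level, EBX[15:0] is the number of
 * logical processors at this level, ECX[15:8] is the level type (1 = SMT,
 * 2 = core), and EDX is the x2APIC ID of the current logical processor.  We
 * report a topology of one thread per core and nr_gpcs cores. */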
static bool handle_cpuid(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);
        uint32_t level = vm_tf->tf_rcx & 0x0F;

        if (vm_tf->tf_rax != 0x0B)
                return FALSE;

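        /* cpuid is a two-byte instruction (0x0f 0xa2), so step past it. */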
        vm_tf->tf_rip += 2;
        vm_tf->tf_rax = 0;
        vm_tf->tf_rbx = 0;
        vm_tf->tf_rcx = level;
        vm_tf->tf_rdx = gth->gpc_id;
        if (level == CPUID_0B_LEVEL_SMT) {
                vm_tf->tf_rax = 0;
                vm_tf->tf_rbx = 1;
                vm_tf->tf_rcx |= ((level + 1) << 8);
        }
        if (level == CPUID_0B_LEVEL_CORE) {
                uint32_t shift = LOG2_UP(vm->nr_gpcs);

                if (shift > 0x1F)
                        shift = 0x1F;
                vm_tf->tf_rax = shift;
                vm_tf->tf_rbx = vm->nr_gpcs;
                vm_tf->tf_rcx |= ((level + 1) << 8);
        }

        return TRUE;
}

static bool handle_ept_fault(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);
        uint64_t gpa, *regp;
        uint8_t regx;
        int store, size;
        int advance;
        int ret;

        if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
                ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0, 0, 0);
                if (ret <= 0)
                        panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
                              ret);
                return TRUE;
        }
        ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);

        if (ret < 0)
                return FALSE;
        if (ret == VM_PAGE_FAULT) {
                /* We were unable to translate RIP due to an ept fault */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_PAGE_FAULT;
                return TRUE;
        }

        assert(size >= 0);
        /* TODO use helpers for some of these addr checks.  the fee/fec ones might
         * be wrong too. */
        for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
                if (vm->virtio_mmio_devices[i] == NULL)
                        continue;
                if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
                        continue;
                /* TODO: can the guest cause us to spawn off infinite threads? */
                if (store)
                        virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
                                       (uint32_t *)regp);
                else
                        *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
                vm_tf->tf_rip += advance;
                return TRUE;
        }
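        /* 0xfec00000 is the standard physical base address of the IOAPIC; the
         * page at guest-physical 0 is backed by the VMM's low4k buffer. */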
        if (PG_ADDR(gpa) == 0xfec00000) {
                do_ioapic(gth, gpa, regx, regp, store);
        } else if (PG_ADDR(gpa) == 0) {
                memmove(regp, &vm->low4k[gpa], size);
        } else {
                fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
                fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
                        vm_tf->tf_exit_reason);
                fprintf(stderr, "Returning 0xffffffff\n");
                showstatus(stderr, gth);
                /* Just fill the whole register for now. */
                *regp = (uint64_t) -1;
                return FALSE;
        }
        vm_tf->tf_rip += advance;
        return TRUE;
}

static bool handle_vmcall_printc(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        uint8_t byte;

        byte = vm_tf->tf_rdi;
        printf("%c", byte);
        if (byte == '\n')
                printf("%c", '%');
        fflush(stdout);
        return TRUE;
}

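/* VMCALL_SMPBOOT: the BSP passes the AP's entry RIP in RDI and its initial
 * RSP in RSI.  A guest-side call might look roughly like this (a sketch, not
 * the guest's actual wrapper; ap_entry_rip and ap_stack_top are placeholders):
 *
 *      asm volatile("vmcall"
 *                   :
 *                   : "a"(VMCALL_SMPBOOT), "D"(ap_entry_rip), "S"(ap_stack_top)
 *                   : "memory");
 */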
static bool handle_vmcall_smpboot(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct vm_trapframe *vm_tf_ap;
        struct virtual_machine *vm = gth_to_vm(gth);
        int cur_pcores = vm->up_gpcs;

        /* Check if we're guest pcore 0. Only the BSP is allowed to start APs. */
        if (vm_tf->tf_guest_pcoreid != 0) {
                fprintf(stderr,
                        "Only guest pcore 0 is allowed to start APs. core was %ld\n",
                        vm_tf->tf_guest_pcoreid);
                return FALSE;
        }

        /* Check if we've reached the maximum, if yes, blow out. */
        if (vm->nr_gpcs == cur_pcores) {
                fprintf(stderr,
                        "guest tried to start up too many cores. max was %d, current up %d\n",
                        vm->nr_gpcs, cur_pcores);
                return FALSE;
        }

        /* Start up secondary core. */
        vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
        /* We use the BSP's CR3 for now. This should be fine because they
         * change it later anyway. */
        vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;

        /* Starting RIP is passed in via rdi. */
        vm_tf_ap->tf_rip = vm_tf->tf_rdi;

        /* Starting RSP is passed in via rsi. */
        vm_tf_ap->tf_rsp = vm_tf->tf_rsi;

        vm->up_gpcs++;

        start_guest_thread(gpcid_to_gth(vm, cur_pcores));

        return TRUE;
}

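/* VMCALL_GET_TSCFREQ: returns the TSC frequency in KHz in RAX (assuming
 * get_tsc_freq() reports Hz, hence the divide by 1000 below).  A guest could
 * fetch it with something roughly like this (a sketch; 'khz' is a
 * placeholder):
 *
 *      uint64_t khz;
 *
 *      asm volatile("vmcall" : "=a"(khz) : "a"(VMCALL_GET_TSCFREQ));
 */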
static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        vm_tf->tf_rax = get_tsc_freq() / 1000;
        return TRUE;
}

static bool handle_vmcall(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);
        bool retval = FALSE;

        if (vm->vmcall)
                return vm->vmcall(gth, vm_tf);

        switch (vm_tf->tf_rax) {
        case VMCALL_PRINTC:
                retval = handle_vmcall_printc(gth);
                break;
        case VMCALL_SMPBOOT:
                retval = handle_vmcall_smpboot(gth);
                break;
        case VMCALL_GET_TSCFREQ:
                retval = handle_vmcall_get_tscfreq(gth);
                break;
        }

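        /* vmcall is a three-byte instruction (0x0f 0x01 0xc1); skip past it on
         * success. */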
        if (retval)
                vm_tf->tf_rip += 3;

        return retval;
}

static bool handle_io(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        int ret = io(gth);

        if (ret < 0)
                return FALSE;
        if (ret == VM_PAGE_FAULT) {
                /* We were unable to translate RIP due to an ept fault */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_PAGE_FAULT;
        }
        return TRUE;
}

static bool handle_msr(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
                /* Use event injection through vmctl to send a general protection
                 * fault.  vmctl.interrupt gets written to the VM-Entry
                 * Interruption-Information Field by vmx. */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
                                      | HW_TRAP_GP_FAULT;
        } else {
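                /* rdmsr and wrmsr are both two-byte instructions (0x0f 0x32 and
                 * 0x0f 0x30), so skip past whichever one trapped. */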
                vm_tf->tf_rip += 2;
        }
        return TRUE;
}

static bool handle_apic_access(struct guest_thread *gth)
{
        uint64_t gpa, *regp;
        uint8_t regx;
        int store, size;
        int advance;
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
                return FALSE;
        if (__apic_access(gth, gpa, regx, regp, store))
                return FALSE;
        vm_tf->tf_rip += advance;
        return TRUE;
}

static bool handle_halt(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        struct virtual_machine *vm = gth_to_vm(gth);

        if (vm->halt_exit)
                return FALSE;
        /* It's possible the guest disabled IRQs and halted, perhaps waiting on an
         * NMI or something.  If we need to support that, we can change this. */
        sleep_til_irq(gth);
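        /* hlt is a one-byte instruction (0xf4). */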
        vm_tf->tf_rip += 1;
        return TRUE;
}

/* The guest is told (via cpuid) that there is no monitor/mwait.  Callers of
 * mwait are paravirtualized halts.
 *
 * We don't support monitor/mwait in software, so if they tried to mwait
 * without break-on-interrupt and with interrupts disabled, they'll never
 * wake up.  So we'll always break on interrupt. */
static bool handle_mwait(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        sleep_til_irq(gth);
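        /* mwait is a three-byte instruction (0x0f 0x01 0xc9). */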
        vm_tf->tf_rip += 3;
        return TRUE;
}

/* Is this a vmm specific thing?  or generic?
 *
 * what do we do when we want to kill the vm?  what are our other options? */
bool handle_vmexit(struct guest_thread *gth)
{
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

        switch (vm_tf->tf_exit_reason) {
        case EXIT_REASON_CPUID:
                return handle_cpuid(gth);
        case EXIT_REASON_EPT_VIOLATION:
                return handle_ept_fault(gth);
        case EXIT_REASON_VMCALL:
                return handle_vmcall(gth);
        case EXIT_REASON_IO_INSTRUCTION:
                return handle_io(gth);
        case EXIT_REASON_MSR_WRITE:
        case EXIT_REASON_MSR_READ:
                return handle_msr(gth);
        case EXIT_REASON_APIC_ACCESS:
                return handle_apic_access(gth);
        case EXIT_REASON_HLT:
                return handle_halt(gth);
        case EXIT_REASON_MWAIT_INSTRUCTION:
                return handle_mwait(gth);
        case EXIT_REASON_EXTERNAL_INTERRUPT:
        case EXIT_REASON_APIC_WRITE:
                /* TODO: just ignore these? */
                return TRUE;
        default:
                fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
                        vm_tf->tf_exit_reason);
                fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
                        vm_tf->tf_exit_reason);
                return FALSE;
        }
}