Convert run_once() to parlib_run_once() (XCC)
[akaros.git] / user / vmm / vmexit.c
index 3a38361..fa61c80 100644 (file)
@@ -8,10 +8,98 @@
 #include <vmm/virtio_config.h>
 #include <vmm/vmm.h>
 #include <parlib/arch/trap.h>
+#include <parlib/bitmask.h>
 #include <stdio.h>
 
-/* TODO: need infrastructure to handle GPC wakeup properly */
-static bool consdata = FALSE;
+/* Returns TRUE if the outstanding-notification bit is set in the guest
+ * pcore's posted interrupt descriptor, i.e. an IRQ was posted but the
+ * notification has not been consumed yet. */
+static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
+{
+       return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
+}
+
+/* Returns TRUE if RVI (Requested Virtual Interrupt, the low byte of the
+ * VMCS guest interrupt status) is nonzero - a virtual IRQ was recognized
+ * by the processor but not yet delivered to the guest. */
+static bool rvi_is_set(struct guest_thread *gth)
+{
+       uint8_t rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
+
+       return rvi != 0;
+}
+
+/* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
+ * vmm_interrupt_guest(). */
+static void sleep_til_irq(struct guest_thread *gth)
+{
+       struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
+
+       /* The invariant is that if an IRQ is posted, but not delivered, we will not
+        * sleep.  Anyone who posts an IRQ must signal after setting it.
+        * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
+        * posting, we'll need to revisit this.
+        *
+        * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
+        * possible that the hardware attempted to post the interrupt.  In SDM
+        * parlance, the processor could have "recognized" the virtual IRQ, but not
+        * delivered it yet.  This could happen if the guest had executed "sti", but
+        * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
+        * ("sti blocking").  Then the guest executes "hlt", and vmexits.
+        * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
+        * least to the vector we just sent, but possibly to a greater vector if
+        * multiple were sent.  RVI should only be cleared after virtual IRQs were
+        * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
+        * suffice.
+        *
+        * Generally, we should also check GUEST_INTERRUPTIBILITY_INFO to see if
+        * there's some reason to not deliver the interrupt and check things like
+        * the VPPR (priority register).  But since we're emulating a halt, mwait,
+        * or something else that needs to be woken by an IRQ, we can ignore that
+        * and just wake them up.  Note that we won't actually deliver the IRQ,
+        * we'll just restart the guest and the hardware will deliver the virtual
+        * IRQ at the appropriate time.  So in the event that something weird
+        * happens, the halt/mwait just returns spuriously.
+        *
+        * The more traditional race here is if the halt starts concurrently with
+        * the post; that's why we sync with the mutex to make sure there is an
+        * ordering between the actual halt (this function) and the posting. */
+       uth_mutex_lock(gth->halt_mtx);
+       while (!(pir_notif_is_set(gpci) || rvi_is_set(gth)))
+               uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
+       uth_mutex_unlock(gth->halt_mtx);
+}
+
+/* Topology levels reported via CPUID leaf 0x0B (extended topology). */
+enum {
+               CPUID_0B_LEVEL_SMT = 0,
+               CPUID_0B_LEVEL_CORE
+};
+
+/* Emulates CPUID leaf 0x0B for the guest.  Returns FALSE for any other
+ * leaf (RAX != 0x0B), leaving the exit for someone else to handle.  On
+ * success, reports one thread per core (SMT level) and vm->nr_gpcs cores
+ * (core level), puts the guest pcore id in EDX, and advances RIP past
+ * the 2-byte cpuid instruction. */
+static bool handle_cpuid(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
+       uint32_t level = vm_tf->tf_rcx & 0x0F;
+
+       if (vm_tf->tf_rax != 0x0B)
+               return FALSE;
+
+       vm_tf->tf_rip += 2;
+       vm_tf->tf_rax = 0;
+       vm_tf->tf_rbx = 0;
+       vm_tf->tf_rcx = level;
+       vm_tf->tf_rdx = gth->gpc_id;
+       if (level == CPUID_0B_LEVEL_SMT) {
+               vm_tf->tf_rax = 0;
+               vm_tf->tf_rbx = 1;
+               vm_tf->tf_rcx |= ((level + 1) << 8);
+       }
+       if (level == CPUID_0B_LEVEL_CORE) {
+               /* EAX is the bit-shift to the next topology level's ID. */
+               uint32_t shift = LOG2_UP(vm->nr_gpcs);
+
+               if (shift > 0x1F)
+                       shift = 0x1F;
+               vm_tf->tf_rax = shift;
+               vm_tf->tf_rbx = vm->nr_gpcs;
+               vm_tf->tf_rcx |= ((level + 1) << 8);
+       }
+
+       return TRUE;
+}
 
 static bool handle_ept_fault(struct guest_thread *gth)
 {
@@ -22,28 +110,37 @@ static bool handle_ept_fault(struct guest_thread *gth)
        int store, size;
        int advance;
 
-       if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
+       int ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
+
+       if (ret < 0)
                return FALSE;
+       if (ret == VM_PAGE_FAULT) {
+               /* We were unable to translate RIP due to an ept fault */
+               vm_tf->tf_trap_inject = VM_TRAP_VALID
+                                     | VM_TRAP_ERROR_CODE
+                                     | VM_TRAP_HARDWARE
+                                     | HW_TRAP_PAGE_FAULT;
+               return TRUE;
+       }
+
+       assert(size >= 0);
        /* TODO use helpers for some of these addr checks.  the fee/fec ones might
         * be wrong too. */
-       if (PG_ADDR(gpa) == vm->virtio_mmio_base) {
+       for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
+               if (vm->virtio_mmio_devices[i] == NULL)
+                       continue;
+               if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
+                       continue;
                /* TODO: can the guest cause us to spawn off infinite threads? */
-               virtio_mmio(gth, gpa, regx, regp, store);
-               if (size < 0) {
-                       // TODO: It would be preferable for the decoder to return an
-                       //       unsigned value, so that we don't have to worry
-                       //       about this. I don't know if it's even possible for
-                       //       the width to be negative;
-                       VIRTIO_DRI_ERRX(cons_mmio_dev.vqdev,
-                           "Driver tried to access the device with a negative access width in the instruction?");
-               }
-               //fprintf(stderr, "RIP is 0x%x\n", vm_tf->tf_rip);
                if (store)
-                       virtio_mmio_wr(&cons_mmio_dev, gpa, size, (uint32_t *)regp);
+                       virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
+                                      (uint32_t *)regp);
                else
-                       *regp = virtio_mmio_rd(&cons_mmio_dev, gpa, size);
-
-       } else if (PG_ADDR(gpa) == 0xfec00000) {
+                       *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
+               vm_tf->tf_rip += advance;
+               return TRUE;
+       }
+       if (PG_ADDR(gpa) == 0xfec00000) {
                do_ioapic(gth, gpa, regx, regp, store);
        } else if (PG_ADDR(gpa) == 0) {
                memmove(regp, &vm->low4k[gpa], size);
@@ -53,7 +150,7 @@ static bool handle_ept_fault(struct guest_thread *gth)
                                vm_tf->tf_exit_reason);
                fprintf(stderr, "Returning 0xffffffff\n");
                showstatus(stderr, gth);
-               // Just fill the whole register for now.
+               /* Just fill the whole register for now. */
                *regp = (uint64_t) -1;
                return FALSE;
        }
@@ -61,7 +158,7 @@ static bool handle_ept_fault(struct guest_thread *gth)
        return TRUE;
 }
 
-static bool handle_vmcall(struct guest_thread *gth)
+static bool handle_vmcall_printc(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        uint8_t byte;
@@ -70,13 +167,89 @@ static bool handle_vmcall(struct guest_thread *gth)
        printf("%c", byte);
        if (byte == '\n')
                printf("%c", '%');
-       vm_tf->tf_rip += 3;
+       fflush(stdout);
        return TRUE;
 }
 
+/* Handles the SMP-boot vmcall from the guest BSP: starts up the next
+ * not-yet-running guest pcore.  The guest passes the AP's starting RIP in
+ * RDI and its starting RSP in RSI; the AP borrows the BSP's CR3 until the
+ * guest loads its own.  Returns FALSE (vmcall rejected) if the caller is
+ * not guest pcore 0 or all gpcs are already up.
+ *
+ * Note the error-path fprintf args are cast to long to match the %ld
+ * conversions; passing an int (cur_pcores) for %ld is undefined behavior
+ * in a variadic call. */
+static bool handle_vmcall_smpboot(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct vm_trapframe *vm_tf_ap;
+       struct virtual_machine *vm = gth_to_vm(gth);
+       int cur_pcores = vm->up_gpcs;
+
+       /* Check if we're guest pcore 0. Only the BSP is allowed to start APs. */
+       if (vm_tf->tf_guest_pcoreid != 0) {
+               fprintf(stderr,
+                       "Only guest pcore 0 is allowed to start APs. core was %ld\n",
+                       (long)vm_tf->tf_guest_pcoreid);
+               return FALSE;
+       }
+
+       /* Check if we've reached the maximum, if yes, blow out. */
+       if (vm->nr_gpcs == cur_pcores) {
+               fprintf(stderr,
+                       "guest tried to start up too many cores. max was %ld, current up %ld\n",
+                       (long)vm->nr_gpcs, (long)cur_pcores);
+               return FALSE;
+       }
+
+       /* Start up secondary core. */
+       vm_tf_ap = gth_to_vmtf(vm->gths[cur_pcores]);
+       /* We use the BSP's CR3 for now. This should be fine because they
+        * change it later anyway. */
+       vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
+
+       /* Starting RIP is passed in via rdi. */
+       vm_tf_ap->tf_rip = vm_tf->tf_rdi;
+
+       /* Starting RSP is passed in via rsi. */
+       vm_tf_ap->tf_rsp = vm_tf->tf_rsi;
+
+       vm->up_gpcs++;
+
+       start_guest_thread(vm->gths[cur_pcores]);
+
+       return TRUE;
+}
+
+/* Dispatches a vmcall exit.  A per-thread override (gth->vmcall) takes
+ * precedence; otherwise we dispatch on the vmcall number in RAX.  On a
+ * successfully handled call, RIP is advanced past the 3-byte vmcall
+ * instruction. */
+static bool handle_vmcall(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       bool retval = FALSE;
+
+       if (gth->vmcall)
+               return gth->vmcall(gth, vm_tf);
+
+       switch (vm_tf->tf_rax) {
+               case VMCALL_PRINTC:
+                       retval = handle_vmcall_printc(gth);
+                       break;
+               case VMCALL_SMPBOOT:
+                       retval = handle_vmcall_smpboot(gth);
+                       break;
+       }
+
+       /* Unknown or failed vmcalls leave retval FALSE; RIP is not advanced. */
+       if (retval)
+               vm_tf->tf_rip += 3;
+
+       return retval;
+}
+
+/* Handles an I/O instruction vmexit via io().  A negative return from
+ * io() is a hard failure.  VM_PAGE_FAULT means io() could not translate
+ * RIP, so we inject a hardware page fault into the guest and resume it. */
 static bool handle_io(struct guest_thread *gth)
 {
-       io(gth);
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       int ret = io(gth);
+
+       if (ret < 0)
+               return FALSE;
+       if (ret == VM_PAGE_FAULT) {
+               /* We were unable to translate RIP due to an ept fault */
+               vm_tf->tf_trap_inject = VM_TRAP_VALID
+                                     | VM_TRAP_ERROR_CODE
+                                     | VM_TRAP_HARDWARE
+                                     | HW_TRAP_PAGE_FAULT;
+       }
        return TRUE;
 }
 
@@ -84,7 +257,6 @@ static bool handle_msr(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
 
-       /* TODO: consider pushing the gth into msrio */
        if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
                /* Use event injection through vmctl to send a general protection fault
                 * vmctl.interrupt gets written to the VM-Entry Interruption-Information
@@ -119,8 +291,11 @@ static bool handle_halt(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
 
-       while (!consdata)
-               ;
+       if (gth->halt_exit)
+               return FALSE;
+       /* It's possible the guest disabled IRQs and halted, perhaps waiting on an
+        * NMI or something.  If we need to support that, we can change this.  */
+       sleep_til_irq(gth);
        vm_tf->tf_rip += 1;
        return TRUE;
 }
@@ -129,8 +304,10 @@ static bool handle_mwait(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
 
-       while (!consdata)
-               ;
+       /* TODO: we need to handle the actual monitor part of mwait.  This just
+        * implements the power management / halting.  Likewise, it's possible IRQs
+        * are disabled (as with halt). */
+       sleep_til_irq(gth);
        vm_tf->tf_rip += 3;
        return TRUE;
 }
@@ -143,6 +320,8 @@ bool handle_vmexit(struct guest_thread *gth)
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
 
        switch (vm_tf->tf_exit_reason) {
+       case EXIT_REASON_CPUID:
+               return handle_cpuid(gth);
        case EXIT_REASON_EPT_VIOLATION:
                return handle_ept_fault(gth);
        case EXIT_REASON_VMCALL:
@@ -163,7 +342,7 @@ bool handle_vmexit(struct guest_thread *gth)
                /* TODO: just ignore these? */
                return TRUE;
        default:
-               fprintf(stderr, "Don't know how to handle exit %d\n",
+               fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
                        vm_tf->tf_exit_reason);
                fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
                        vm_tf->tf_exit_reason);