parlib: have 2LS libraries #include parlib/stdio.h
[akaros.git] / user / vmm / vmexit.c
index fa61c80..0cbff79 100644 (file)
@@ -6,21 +6,34 @@
 #include <vmm/virtio_mmio.h>
 #include <vmm/virtio_ids.h>
 #include <vmm/virtio_config.h>
+#include <vmm/mmio.h>
 #include <vmm/vmm.h>
 #include <parlib/arch/trap.h>
 #include <parlib/bitmask.h>
-#include <stdio.h>
+#include <parlib/stdio.h>
 
 static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
 {
-       return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
+       return GET_BITMASK_BIT(gpci->posted_irq_desc,
+                              VMX_POSTED_OUTSTANDING_NOTIF);
 }
 
-static bool rvi_is_set(struct guest_thread *gth)
+/* Returns true if the hardware will trigger an IRQ for the guest.  These
+ * virtual IRQs are only processed at certain points, such as vmentry and
+ * posted-interrupt processing.  See 'Evaluation of Pending Virtual
+ * Interrupts' in the SDM. */
+static bool virtual_irq_is_pending(struct guest_thread *gth)
 {
-       uint8_t rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
-
-       return rvi != 0;
+       struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
+       uint8_t rvi, vppr;
+
+       /* Currently, the lower 4 bits are various ways to block IRQs, e.g.
+        * blocking by STI.  The other bits must be 0.  Presumably any new
+        * bits are types of IRQ blocking. */
+       if (gth_to_vmtf(gth)->tf_intrinfo1)
+               return false;
+       vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
+       rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
+       return (rvi & 0xf0) > (vppr & 0xf0);
 }
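
To make the priority check above concrete, here is a minimal standalone sketch with hypothetical values (0x31 and 0x20/0x30 are illustrative, not taken from this commit): only the upper nibbles of RVI and VPPR, i.e. their priority classes, are compared.

/* Sketch only; mirrors the comparison in virtual_irq_is_pending(). */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool irq_deliverable(uint8_t rvi, uint8_t vppr)
{
        return (rvi & 0xf0) > (vppr & 0xf0);
}

int main(void)
{
        assert(irq_deliverable(0x31, 0x20));    /* class 3 > class 2: pending */
        assert(!irq_deliverable(0x31, 0x30));   /* same class: masked by VPPR */
        return 0;
}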
 
 /* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
@@ -29,44 +42,41 @@ static void sleep_til_irq(struct guest_thread *gth)
 {
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
 
-       /* The invariant is that if an IRQ is posted, but not delivered, we will not
-        * sleep.  Anyone who posts an IRQ must signal after setting it.
+       /* The invariant is that if an IRQ is posted, but not delivered, we will
+        * not sleep.  Anyone who posts an IRQ must signal after setting it.
         * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
-        * posting, we'll need to revist this.
+        * posting, we'll need to revisit this.  For more details, see the notes
+        * in the kernel IPI-IRC fast path.
         *
         * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
         * possible that the hardware attempted to post the interrupt.  In SDM
-        * parlance, the processor could have "recognized" the virtual IRQ, but not
-        * delivered it yet.  This could happen if the guest had executed "sti", but
-        * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
-        * ("sti blocking").  Then the guest executes "hlt", and vmexits.
-        * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
-        * least to the vector we just sent, but possibly to a greater vector if
-        * multiple were sent.  RVI should only be cleared after virtual IRQs were
-        * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
-        * suffice.
+        * parlance, the processor could have "recognized" the virtual IRQ, but
+        * not delivered it yet.  This could happen if the guest had executed
+        * "sti", but not "hlt" yet.  The IRQ was posted and recognized, but not
+        * delivered ("sti blocking").  Then the guest executes "hlt", and
+        * vmexits.  OUTSTANDING_NOTIF will be clear in this case.  RVI should
+        * be set - at least to the vector we just sent, but possibly to a
+        * greater vector if multiple were sent.  RVI should only be cleared
+        * after virtual IRQs were actually delivered.  So checking
+        * OUTSTANDING_NOTIF and RVI should suffice.
         *
-        * Generally, we should also check GUEST_INTERRUPTIBILITY_INFO to see if
-        * there's some reason to not deliver the interrupt and check things like
-        * the VPPR (priority register).  But since we're emulating a halt, mwait,
-        * or something else that needs to be woken by an IRQ, we can ignore that
-        * and just wake them up.  Note that we won't actually deliver the IRQ,
-        * we'll just restart the guest and the hardware will deliver the virtual
-        * IRQ at the appropriate time.  So in the event that something weird
-        * happens, the halt/mwait just returns spuriously.
+        * Note that when we see a notif or pending virtual IRQ, we don't
+        * actually deliver the IRQ, we'll just restart the guest and the
+        * hardware will deliver the virtual IRQ at the appropriate time.
         *
-        * The more traditional race here is if the halt starts concurrently with
-        * the post; that's why we sync with the mutex to make sure there is an
-        * ordering between the actual halt (this function) and the posting. */
+        * The more traditional race here is if the halt starts concurrently
+        * with the post; that's why we sync with the mutex to make sure there
+        * is an ordering between the actual halt (this function) and the
+        * posting. */
        uth_mutex_lock(gth->halt_mtx);
-       while (!(pir_notif_is_set(gpci) || rvi_is_set(gth)))
+       while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
                uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
        uth_mutex_unlock(gth->halt_mtx);
 }
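
A minimal sketch of the poster side of this invariant, assuming only the parlib primitives already used here (SET_BITMASK_BIT_ATOMIC, uth_mutex_*, uth_cond_var_signal); the real path is vmm_interrupt_guest() plus the kernel's posted-IRQ handling, so treat this purely as an illustration of the required ordering: post first, then take the halt mutex and signal.

/* Hypothetical helper, not code from this commit. */
static void post_irq_and_wake(struct guest_thread *gth, uint8_t vector)
{
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

        /* Post: set the vector and the notification bit first. */
        SET_BITMASK_BIT_ATOMIC(gpci->posted_irq_desc, vector);
        SET_BITMASK_BIT_ATOMIC(gpci->posted_irq_desc,
                               VMX_POSTED_OUTSTANDING_NOTIF);
        /* Wake: sync on the same mutex sleep_til_irq() holds around its check,
         * so the halter either sees the notif bit or receives the signal. */
        uth_mutex_lock(gth->halt_mtx);
        uth_cond_var_signal(gth->halt_cv);
        uth_mutex_unlock(gth->halt_mtx);
}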
 
 enum {
-               CPUID_0B_LEVEL_SMT = 0,
-               CPUID_0B_LEVEL_CORE
+       CPUID_0B_LEVEL_SMT = 0,
+       CPUID_0B_LEVEL_CORE
 };
 
 static bool handle_cpuid(struct guest_thread *gth)
@@ -109,8 +119,17 @@ static bool handle_ept_fault(struct guest_thread *gth)
        uint8_t regx;
        int store, size;
        int advance;
-
-       int ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
+       int ret;
+
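+       /* The faulting guest-physical page is backed by a user VMR but not
+        * yet populated; ask the kernel to fault it in and resume the guest,
+        * which will retry the access. */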
+       if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
+               ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0,
+                                 0, 0);
+               if (ret <= 0)
+                       panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
+                             ret);
+               return TRUE;
+       }
+       ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
 
        if (ret < 0)
                return FALSE;
@@ -124,19 +143,21 @@ static bool handle_ept_fault(struct guest_thread *gth)
        }
 
        assert(size >= 0);
-       /* TODO use helpers for some of these addr checks.  the fee/fec ones might
-        * be wrong too. */
+       /* TODO use helpers for some of these addr checks.  the fee/fec ones
+        * might be wrong too. */
        for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
                if (vm->virtio_mmio_devices[i] == NULL)
                        continue;
                if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
                        continue;
-               /* TODO: can the guest cause us to spawn off infinite threads? */
+               /* TODO: can the guest cause us to spawn off infinite threads?
+                */
                if (store)
-                       virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
-                                      (uint32_t *)regp);
+                       virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa,
+                                      size, (uint32_t *)regp);
                else
-                       *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
+                       *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i],
+                                              gpa, size);
                vm_tf->tf_rip += advance;
                return TRUE;
        }
@@ -178,7 +199,8 @@ static bool handle_vmcall_smpboot(struct guest_thread *gth)
        struct virtual_machine *vm = gth_to_vm(gth);
        int cur_pcores = vm->up_gpcs;
 
-       /* Check if we're guest pcore 0. Only the BSP is allowed to start APs. */
+       /* Check if we're guest pcore 0. Only the BSP is allowed to start APs.
+        */
        if (vm_tf->tf_guest_pcoreid != 0) {
                fprintf(stderr,
                        "Only guest pcore 0 is allowed to start APs. core was %ld\n",
@@ -195,7 +217,7 @@ static bool handle_vmcall_smpboot(struct guest_thread *gth)
        }
 
        /* Start up secondary core. */
-       vm_tf_ap = gth_to_vmtf(vm->gths[cur_pcores]);
+       vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
        /* We use the BSP's CR3 for now. This should be fine because they
         * change it later anyway. */
        vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
@@ -208,26 +230,74 @@ static bool handle_vmcall_smpboot(struct guest_thread *gth)
 
        vm->up_gpcs++;
 
-       start_guest_thread(vm->gths[cur_pcores]);
+       start_guest_thread(gpcid_to_gth(vm, cur_pcores));
+
+       return TRUE;
+}
+
+static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
 
+       vm_tf->tf_rax = get_tsc_freq() / 1000;
        return TRUE;
 }
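
A guest-side sketch of how this vmcall would be consumed (an assumption about the calling convention, not code from this commit; it relies on VMCALL_GET_TSCFREQ from the vmm headers and <stdint.h>): the guest loads the vmcall number into rax, executes vmcall, and reads the TSC frequency in kHz back from rax.

/* Hypothetical guest helper. */
static inline uint64_t guest_tsc_khz(void)
{
        uint64_t ret = VMCALL_GET_TSCFREQ;

        /* rax carries the vmcall number in and the kHz value out. */
        asm volatile("vmcall" : "+a"(ret) : : "memory");
        return ret;
}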
 
 static bool handle_vmcall(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
        bool retval = FALSE;
 
-       if (gth->vmcall)
-               return gth->vmcall(gth, vm_tf);
+       if (vm->vmcall)
+               return vm->vmcall(gth, vm_tf);
 
        switch (vm_tf->tf_rax) {
-               case VMCALL_PRINTC:
-                       retval = handle_vmcall_printc(gth);
-                       break;
-               case VMCALL_SMPBOOT:
-                       retval = handle_vmcall_smpboot(gth);
-                       break;
+       case VMCALL_PRINTC:
+               retval = handle_vmcall_printc(gth);
+               break;
+       case VMCALL_SMPBOOT:
+               retval = handle_vmcall_smpboot(gth);
+               break;
+       case VMCALL_GET_TSCFREQ:
+               retval = handle_vmcall_get_tscfreq(gth);
+               break;
+       case VMCALL_TRACE_TF:
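+               /* rax holds the vmcall number by the time we get here; the
+                * tracing convention presumably stashes the guest's original
+                * rax in r11, so tf_r11 is reported under the rax label and
+                * r11 itself is printed as a poison value below. */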
+               trace_printf("  rax  0x%016lx\n",      vm_tf->tf_r11);
+               trace_printf("  rbx  0x%016lx\n",      vm_tf->tf_rbx);
+               trace_printf("  rcx  0x%016lx\n",      vm_tf->tf_rcx);
+               trace_printf("  rdx  0x%016lx\n",      vm_tf->tf_rdx);
+               trace_printf("  rbp  0x%016lx\n",      vm_tf->tf_rbp);
+               trace_printf("  rsi  0x%016lx\n",      vm_tf->tf_rsi);
+               trace_printf("  rdi  0x%016lx\n",      vm_tf->tf_rdi);
+               trace_printf("  r8   0x%016lx\n",      vm_tf->tf_r8);
+               trace_printf("  r9   0x%016lx\n",      vm_tf->tf_r9);
+               trace_printf("  r10  0x%016lx\n",      vm_tf->tf_r10);
+               trace_printf("  r11  0x%016lx\n",      0xdeadbeefUL);
+               trace_printf("  r12  0x%016lx\n",      vm_tf->tf_r12);
+               trace_printf("  r13  0x%016lx\n",      vm_tf->tf_r13);
+               trace_printf("  r14  0x%016lx\n",      vm_tf->tf_r14);
+               trace_printf("  r15  0x%016lx\n",      vm_tf->tf_r15);
+               trace_printf("  rip  0x%016lx\n",      vm_tf->tf_rip);
+               trace_printf("  rflg 0x%016lx\n",      vm_tf->tf_rflags);
+               trace_printf("  rsp  0x%016lx\n",      vm_tf->tf_rsp);
+               trace_printf("  cr2  0x%016lx\n",      vm_tf->tf_cr2);
+               trace_printf("  cr3  0x%016lx\n",      vm_tf->tf_cr3);
+               trace_printf("Gpcore 0x%08x\n",        vm_tf->tf_guest_pcoreid);
+               trace_printf("Flags  0x%08x\n",        vm_tf->tf_flags);
+               trace_printf("Inject 0x%08x\n",        vm_tf->tf_trap_inject);
+               trace_printf("ExitRs 0x%08x\n",        vm_tf->tf_exit_reason);
+               trace_printf("ExitQl 0x%08x\n",        vm_tf->tf_exit_qual);
+               trace_printf("Intr1  0x%016lx\n",      vm_tf->tf_intrinfo1);
+               trace_printf("Intr2  0x%016lx\n",      vm_tf->tf_intrinfo2);
+               trace_printf("GIntr  0x----%04x\n",
+                            vm_tf->tf_guest_intr_status);
+               trace_printf("GVA    0x%016lx\n",      vm_tf->tf_guest_va);
+               trace_printf("GPA    0x%016lx\n",      vm_tf->tf_guest_pa);
+               retval = TRUE;
+               break;
        }
 
        if (retval)
@@ -258,9 +328,9 @@ static bool handle_msr(struct guest_thread *gth)
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
 
        if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
-               /* Use event injection through vmctl to send a general protection fault
-                * vmctl.interrupt gets written to the VM-Entry Interruption-Information
-                * Field by vmx */
+               /* Use event injection through vmctl to send a general
+                * protection fault.  vmctl.interrupt gets written to the
+                * VM-Entry Interruption-Information Field by vmx. */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
@@ -290,23 +360,29 @@ static bool handle_apic_access(struct guest_thread *gth)
 static bool handle_halt(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
 
-       if (gth->halt_exit)
+       if (vm->halt_exit)
                return FALSE;
-       /* It's possible the guest disabled IRQs and halted, perhaps waiting on an
-        * NMI or something.  If we need to support that, we can change this.  */
+       /* It's possible the guest disabled IRQs and halted, perhaps waiting on
+        * an NMI or something.  If we need to support that, we can change this.
+        */
        sleep_til_irq(gth);
        vm_tf->tf_rip += 1;
        return TRUE;
 }
 
+/* The guest is told (via cpuid) that there is no monitor/mwait, so any
+ * mwait it executes is treated as a paravirtualized halt.
+ *
+ * We don't support monitor/mwait in software, so if they tried to mwait
+ * without break-on-interrupt and with interrupts disabled, they'll never
+ * wake up.  So we'll always break on interrupt. */
 static bool handle_mwait(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
 
-       /* TODO: we need to handle the actual monitor part of mwait.  This just
-        * implements the power management / halting.  Likewise, it's possible IRQs
-        * are disabled (as with halt). */
        sleep_til_irq(gth);
        vm_tf->tf_rip += 3;
        return TRUE;
@@ -342,7 +418,8 @@ bool handle_vmexit(struct guest_thread *gth)
                /* TODO: just ignore these? */
                return TRUE;
        default:
-               fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
+               fprintf(stderr,
+                       "VMM library: don't know how to handle exit %d\n",
                        vm_tf->tf_exit_reason);
                fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
                        vm_tf->tf_exit_reason);