parlib: have 2LS libraries #include parlib/stdio.h
diff --git a/user/vmm/vmexit.c b/user/vmm/vmexit.c
index 4404ec9..0cbff79 100644
@@ -6,21 +6,34 @@
 #include <vmm/virtio_mmio.h>
 #include <vmm/virtio_ids.h>
 #include <vmm/virtio_config.h>
+#include <vmm/mmio.h>
 #include <vmm/vmm.h>
 #include <parlib/arch/trap.h>
 #include <parlib/bitmask.h>
-#include <stdio.h>
+#include <parlib/stdio.h>
 
 static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
 {
-       return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
+       return GET_BITMASK_BIT(gpci->posted_irq_desc,
+                              VMX_POSTED_OUTSTANDING_NOTIF);
 }
 
-static bool rvi_is_set(struct guest_thread *gth)
+/* Returns true if the hardware will trigger an IRQ for the guest.  These
+ * virtual IRQs are only processed under certain situations, like vmentry, and
+ * posted IRQs.  See 'Evaluation of Pending Virtual Interrupts' in the SDM. */
+static bool virtual_irq_is_pending(struct guest_thread *gth)
 {
-       uint8_t rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
+       struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
+       uint8_t rvi, vppr;
 
-       return rvi != 0;
+       /* Currently, the lower 4 bits are various ways to block IRQs, e.g.
+        * blocking by STI.  The other bits must be 0.  Presumably any new
+        * bits are types of IRQ blocking. */
+       if (gth_to_vmtf(gth)->tf_intrinfo1)
+               return false;
+       vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
+       rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
+       return (rvi & 0xf0) > (vppr & 0xf0);
 }
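
For reference, virtual_irq_is_pending() above mirrors the SDM's "Evaluation of Pending Virtual Interrupts": only the priority classes (bits 7:4) of RVI and the guest's virtual PPR (vAPIC offset 0xa0) are compared.  A self-contained sketch of that comparison with sample values; the helper name and the values are illustrative, not part of this file:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same nibble comparison as virtual_irq_is_pending(), fed with sample values
 * instead of the live VMCS/vAPIC state. */
static bool vector_beats_vppr(uint8_t rvi, uint8_t vppr)
{
	return (rvi & 0xf0) > (vppr & 0xf0);
}

int main(void)
{
	/* RVI 0x31 vs VPPR 0x30: same priority class, nothing pending. */
	printf("%d\n", vector_beats_vppr(0x31, 0x30));	/* 0 */
	/* RVI 0x41 vs VPPR 0x30: higher class, an IRQ will be delivered. */
	printf("%d\n", vector_beats_vppr(0x41, 0x30));	/* 1 */
	return 0;
}
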
 
 /* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
@@ -29,41 +42,75 @@ static void sleep_til_irq(struct guest_thread *gth)
 {
        struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
 
-       /* The invariant is that if an IRQ is posted, but not delivered, we will not
-        * sleep.  Anyone who posts an IRQ must signal after setting it.
+       /* The invariant is that if an IRQ is posted, but not delivered, we will
+        * not sleep.  Anyone who posts an IRQ must signal after setting it.
         * vmm_interrupt_guest() does this.  If we use alternate sources of IRQ
-        * posting, we'll need to revist this.
+        * posting, we'll need to revisit this.  For more details, see the notes
+        * in the kernel IPI-IRC fast path.
         *
         * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
         * possible that the hardware attempted to post the interrupt.  In SDM
-        * parlance, the processor could have "recognized" the virtual IRQ, but not
-        * delivered it yet.  This could happen if the guest had executed "sti", but
-        * not "hlt" yet.  The IRQ was posted and recognized, but not delivered
-        * ("sti blocking").  Then the guest executes "hlt", and vmexits.
-        * OUTSTANDING_NOTIF will be clear in this case.  RVI should be set - at
-        * least to the vector we just sent, but possibly to a greater vector if
-        * multiple were sent.  RVI should only be cleared after virtual IRQs were
-        * actually delivered.  So checking OUTSTANDING_NOTIF and RVI should
-        * suffice.
+        * parlance, the processor could have "recognized" the virtual IRQ, but
+        * not delivered it yet.  This could happen if the guest had executed
+        * "sti", but not "hlt" yet.  The IRQ was posted and recognized, but not
+        * delivered ("sti blocking").  Then the guest executes "hlt", and
+        * vmexits.  OUTSTANDING_NOTIF will be clear in this case.  RVI should
+        * be set - at least to the vector we just sent, but possibly to a
+        * greater vector if multiple were sent.  RVI should only be cleared
+        * after virtual IRQs were actually delivered.  So checking
+        * OUTSTANDING_NOTIF and RVI should suffice.
         *
-        * Generally, we should also check GUEST_INTERRUPTIBILITY_INFO to see if
-        * there's some reason to not deliver the interrupt and check things like
-        * the VPPR (priority register).  But since we're emulating a halt, mwait,
-        * or something else that needs to be woken by an IRQ, we can ignore that
-        * and just wake them up.  Note that we won't actually deliver the IRQ,
-        * we'll just restart the guest and the hardware will deliver the virtual
-        * IRQ at the appropriate time.  So in the event that something weird
-        * happens, the halt/mwait just returns spuriously.
+        * Note that when we see a notif or pending virtual IRQ, we don't
+        * actually deliver the IRQ, we'll just restart the guest and the
+        * hardware will deliver the virtual IRQ at the appropriate time.
         *
-        * The more traditional race here is if the halt starts concurrently with
-        * the post; that's why we sync with the mutex to make sure there is an
-        * ordering between the actual halt (this function) and the posting. */
+        * The more traditional race here is if the halt starts concurrently
+        * with the post; that's why we sync with the mutex to make sure there
+        * is an ordering between the actual halt (this function) and the
+        * posting. */
        uth_mutex_lock(gth->halt_mtx);
-       while (!(pir_notif_is_set(gpci) || rvi_is_set(gth)))
+       while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
                uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
        uth_mutex_unlock(gth->halt_mtx);
 }
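
The sleeper above relies on the posting side doing things in the documented order: post the vector and OUTSTANDING_NOTIF first, then take halt_mtx and signal halt_cv, so the check-then-wait loop can never miss an IRQ.  A rough sketch of such a poster, assuming the SET_BITMASK_BIT_ATOMIC helper from parlib/bitmask.h; this is not the actual vmm_interrupt_guest(), which additionally pokes the guest core through the kernel:

/* Illustrative poster; the helper name is hypothetical. */
static void post_virq_and_wake(struct guest_thread *gth, int vector)
{
	struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

	/* Post before signaling, per the invariant described above. */
	SET_BITMASK_BIT_ATOMIC(gpci->posted_irq_desc, vector);
	SET_BITMASK_BIT_ATOMIC(gpci->posted_irq_desc,
			       VMX_POSTED_OUTSTANDING_NOTIF);
	uth_mutex_lock(gth->halt_mtx);
	uth_cond_var_broadcast(gth->halt_cv);
	uth_mutex_unlock(gth->halt_mtx);
}
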
 
+enum {
+       CPUID_0B_LEVEL_SMT = 0,
+       CPUID_0B_LEVEL_CORE
+};
+
+static bool handle_cpuid(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
+       uint32_t level = vm_tf->tf_rcx & 0x0F;
+
+       if (vm_tf->tf_rax != 0x0B)
+               return FALSE;
+
+       vm_tf->tf_rip += 2;
+       vm_tf->tf_rax = 0;
+       vm_tf->tf_rbx = 0;
+       vm_tf->tf_rcx = level;
+       vm_tf->tf_rdx = gth->gpc_id;
+       if (level == CPUID_0B_LEVEL_SMT) {
+               vm_tf->tf_rax = 0;
+               vm_tf->tf_rbx = 1;
+               vm_tf->tf_rcx |= ((level + 1) << 8);
+       }
+       if (level == CPUID_0B_LEVEL_CORE) {
+               uint32_t shift = LOG2_UP(vm->nr_gpcs);
+
+               if (shift > 0x1F)
+                       shift = 0x1F;
+               vm_tf->tf_rax = shift;
+               vm_tf->tf_rbx = vm->nr_gpcs;
+               vm_tf->tf_rcx |= ((level + 1) << 8);
+       }
+
+       return TRUE;
+}
+
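
handle_cpuid() above only fields CPUID leaf 0x0B (extended topology) and leaves every other leaf to the caller by returning FALSE.  Sub-leaf 0 advertises one thread per core, sub-leaf 1 advertises all nr_gpcs cores, and EDX always carries the caller's gpc_id as its x2APIC ID; e.g. with nr_gpcs = 4, sub-leaf 1 returns EAX = 2 (LOG2_UP(4)), EBX = 4, and ECX = 0x0201.  A guest-side sketch of consuming that leaf (plain CPUID inline asm; illustrative, not VMM code):

#include <stdint.h>
#include <stdio.h>

static void cpuid_0b(uint32_t subleaf, uint32_t *eax, uint32_t *ebx,
		     uint32_t *ecx, uint32_t *edx)
{
	asm volatile("cpuid"
		     : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
		     : "a"(0x0b), "c"(subleaf));
}

int main(void)
{
	uint32_t eax, ebx, ecx, edx;

	cpuid_0b(0, &eax, &ebx, &ecx, &edx);	/* SMT level */
	printf("threads per core %u, my x2apic id %u\n", ebx, edx);
	cpuid_0b(1, &eax, &ebx, &ecx, &edx);	/* core level */
	printf("x2apic id shift %u, nr vcpus %u\n", eax, ebx);
	return 0;
}
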
 static bool handle_ept_fault(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
@@ -72,8 +119,17 @@ static bool handle_ept_fault(struct guest_thread *gth)
        uint8_t regx;
        int store, size;
        int advance;
+       int ret;
 
-       int ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
+       if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
+               ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0,
+                                 0, 0);
+               if (ret <= 0)
+                       panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
+                             ret);
+               return TRUE;
+       }
+       ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
 
        if (ret < 0)
                return FALSE;
@@ -87,19 +143,21 @@ static bool handle_ept_fault(struct guest_thread *gth)
        }
 
        assert(size >= 0);
-       /* TODO use helpers for some of these addr checks.  the fee/fec ones might
-        * be wrong too. */
+       /* TODO use helpers for some of these addr checks.  the fee/fec ones
+        * might be wrong too. */
        for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
                if (vm->virtio_mmio_devices[i] == NULL)
                        continue;
                if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
                        continue;
-               /* TODO: can the guest cause us to spawn off infinite threads? */
+               /* TODO: can the guest cause us to spawn off infinite threads?
+                */
                if (store)
-                       virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size,
-                                      (uint32_t *)regp);
+                       virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa,
+                                      size, (uint32_t *)regp);
                else
-                       *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size);
+                       *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i],
+                                              gpa, size);
                vm_tf->tf_rip += advance;
                return TRUE;
        }
@@ -121,7 +179,7 @@ static bool handle_ept_fault(struct guest_thread *gth)
        return TRUE;
 }
 
-static bool handle_vmcall(struct guest_thread *gth)
+static bool handle_vmcall_printc(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
        uint8_t byte;
@@ -130,10 +188,124 @@ static bool handle_vmcall(struct guest_thread *gth)
        printf("%c", byte);
        if (byte == '\n')
                printf("%c", '%');
-       vm_tf->tf_rip += 3;
+       fflush(stdout);
+       return TRUE;
+}
+
+static bool handle_vmcall_smpboot(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct vm_trapframe *vm_tf_ap;
+       struct virtual_machine *vm = gth_to_vm(gth);
+       int cur_pcores = vm->up_gpcs;
+
+       /* Check if we're guest pcore 0. Only the BSP is allowed to start APs.
+        */
+       if (vm_tf->tf_guest_pcoreid != 0) {
+               fprintf(stderr,
+                       "Only guest pcore 0 is allowed to start APs. core was %ld\n",
+                       vm_tf->tf_guest_pcoreid);
+               return FALSE;
+       }
+
+       /* Check if we've reached the maximum, if yes, blow out. */
+       if (vm->nr_gpcs == cur_pcores) {
+               fprintf(stderr,
+                       "guest tried to start up too many cores. max was %ld, current up %ld\n",
+                       vm->nr_gpcs, cur_pcores);
+               return FALSE;
+       }
+
+       /* Start up secondary core. */
+       vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
+       /* We use the BSP's CR3 for now. This should be fine because they
+        * change it later anyway. */
+       vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
+
+       /* Starting RIP is passed in via rdi. */
+       vm_tf_ap->tf_rip = vm_tf->tf_rdi;
+
+       /* Starting RSP is passed in via rsi. */
+       vm_tf_ap->tf_rsp = vm_tf->tf_rsi;
+
+       vm->up_gpcs++;
+
+       start_guest_thread(gpcid_to_gth(vm, cur_pcores));
+
+       return TRUE;
+}
+
+static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+
+       vm_tf->tf_rax = get_tsc_freq() / 1000;
        return TRUE;
 }
 
+static bool handle_vmcall(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
+       bool retval = FALSE;
+
+       if (vm->vmcall)
+               return vm->vmcall(gth, vm_tf);
+
+       switch (vm_tf->tf_rax) {
+       case VMCALL_PRINTC:
+               retval = handle_vmcall_printc(gth);
+               break;
+       case VMCALL_SMPBOOT:
+               retval = handle_vmcall_smpboot(gth);
+               break;
+       case VMCALL_GET_TSCFREQ:
+               retval = handle_vmcall_get_tscfreq(gth);
+               break;
+       case VMCALL_TRACE_TF:
+               trace_printf("  rax  0x%016lx\n",      vm_tf->tf_r11);
+               trace_printf("  rbx  0x%016lx\n",      vm_tf->tf_rbx);
+               trace_printf("  rcx  0x%016lx\n",      vm_tf->tf_rcx);
+               trace_printf("  rdx  0x%016lx\n",      vm_tf->tf_rdx);
+               trace_printf("  rbp  0x%016lx\n",      vm_tf->tf_rbp);
+               trace_printf("  rsi  0x%016lx\n",      vm_tf->tf_rsi);
+               trace_printf("  rdi  0x%016lx\n",      vm_tf->tf_rdi);
+               trace_printf("  r8   0x%016lx\n",      vm_tf->tf_r8);
+               trace_printf("  r9   0x%016lx\n",      vm_tf->tf_r9);
+               trace_printf("  r10  0x%016lx\n",      vm_tf->tf_r10);
+               trace_printf("  r11  0x%016lx\n",      0xdeadbeef);
+               trace_printf("  r12  0x%016lx\n",      vm_tf->tf_r12);
+               trace_printf("  r13  0x%016lx\n",      vm_tf->tf_r13);
+               trace_printf("  r14  0x%016lx\n",      vm_tf->tf_r14);
+               trace_printf("  r15  0x%016lx\n",      vm_tf->tf_r15);
+               trace_printf("  rip  0x%016lx\n",      vm_tf->tf_rip);
+               trace_printf("  rflg 0x%016lx\n",      vm_tf->tf_rflags);
+               trace_printf("  rsp  0x%016lx\n",      vm_tf->tf_rsp);
+               trace_printf("  cr2  0x%016lx\n",      vm_tf->tf_cr2);
+               trace_printf("  cr3  0x%016lx\n",      vm_tf->tf_cr3);
+               trace_printf("Gpcore 0x%08x\n",        vm_tf->tf_guest_pcoreid);
+               trace_printf("Flags  0x%08x\n",        vm_tf->tf_flags);
+               trace_printf("Inject 0x%08x\n",        vm_tf->tf_trap_inject);
+               trace_printf("ExitRs 0x%08x\n",        vm_tf->tf_exit_reason);
+               trace_printf("ExitQl 0x%08x\n",        vm_tf->tf_exit_qual);
+               trace_printf("Intr1  0x%016lx\n",      vm_tf->tf_intrinfo1);
+               trace_printf("Intr2  0x%016lx\n",      vm_tf->tf_intrinfo2);
+               trace_printf("GIntr  0x----%04x\n",
+                            vm_tf->tf_guest_intr_status);
+               trace_printf("GVA    0x%016lx\n",      vm_tf->tf_guest_va);
+               trace_printf("GPA    0x%016lx\n",      vm_tf->tf_guest_pa);
+               retval = true;
+               break;
+       }
+
+       if (retval)
+               vm_tf->tf_rip += 3;
+
+       return retval;
+}
+
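
The dispatcher above defines a small ABI: the guest loads the call number into rax and arguments into rdi/rsi, and a call handled here resumes at rip + 3, i.e. just past the 3-byte vmcall instruction.  VMCALL_GET_TSCFREQ hands back the TSC frequency in kHz (get_tsc_freq() / 1000) in rax, and VMCALL_SMPBOOT takes the AP's starting rip and rsp, matching handle_vmcall_smpboot() above.  A minimal guest-side wrapper under those assumptions (the helper name is hypothetical and may differ from the actual Akaros guest code):

static inline long akaros_vmcall(long nr, long arg0, long arg1)
{
	long ret;

	/* rax: call number (and return value), rdi/rsi: arguments. */
	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "D"(arg0), "S"(arg1)
		     : "memory");
	return ret;
}

/* e.g.:  tsc_khz = akaros_vmcall(VMCALL_GET_TSCFREQ, 0, 0);
 *        akaros_vmcall(VMCALL_SMPBOOT, (long)ap_rip, (long)ap_rsp);  */
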
 static bool handle_io(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
@@ -155,11 +327,10 @@ static bool handle_msr(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
 
-       /* TODO: consider pushing the gth into msrio */
        if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
-               /* Use event injection through vmctl to send a general protection fault
-                * vmctl.interrupt gets written to the VM-Entry Interruption-Information
-                * Field by vmx */
+               /* Use event injection through vmctl to send a general
+                * protection fault.  vmctl.interrupt gets written to the
+                * VM-Entry Interruption-Information Field by vmx. */
                vm_tf->tf_trap_inject = VM_TRAP_VALID
                                      | VM_TRAP_ERROR_CODE
                                      | VM_TRAP_HARDWARE
@@ -189,23 +360,29 @@ static bool handle_apic_access(struct guest_thread *gth)
 static bool handle_halt(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
 
-       if (gth->halt_exit)
+       if (vm->halt_exit)
                return FALSE;
-       /* It's possible the guest disabled IRQs and halted, perhaps waiting on an
-        * NMI or something.  If we need to support that, we can change this.  */
+       /* It's possible the guest disabled IRQs and halted, perhaps waiting on
+        * an NMI or something.  If we need to support that, we can change this.
+        */
        sleep_til_irq(gth);
        vm_tf->tf_rip += 1;
        return TRUE;
 }
 
+/* The guest is told (via cpuid) that there is no monitor/mwait.  Callers of
+ * mwait are paravirtualized halts.
+ *
+ * We don't support monitor/mwait in software, so if they tried to mwait
+ * without break-on-interrupt and with interrupts disabled, they'll never
+ * wake up.  So we'll always break on interrupt. */
 static bool handle_mwait(struct guest_thread *gth)
 {
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
 
-       /* TODO: we need to handle the actual monitor part of mwait.  This just
-        * implements the power management / halting.  Likewise, it's possible IRQs
-        * are disabled (as with halt). */
        sleep_til_irq(gth);
        vm_tf->tf_rip += 3;
        return TRUE;
@@ -219,6 +396,8 @@ bool handle_vmexit(struct guest_thread *gth)
        struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
 
        switch (vm_tf->tf_exit_reason) {
+       case EXIT_REASON_CPUID:
+               return handle_cpuid(gth);
        case EXIT_REASON_EPT_VIOLATION:
                return handle_ept_fault(gth);
        case EXIT_REASON_VMCALL:
@@ -239,7 +418,8 @@ bool handle_vmexit(struct guest_thread *gth)
                /* TODO: just ignore these? */
                return TRUE;
        default:
-               fprintf(stderr, "Don't know how to handle exit %d\n",
+               fprintf(stderr,
+                       "VMM library: don't know how to handle exit %d\n",
                        vm_tf->tf_exit_reason);
                fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
                        vm_tf->tf_exit_reason);