vmm: Allow vmm.h to include vmx.h
[akaros.git] / kern / arch / x86 / vmm / vmm.c
index 0781544..8f0acc8 100644 (file)
@@ -66,13 +66,12 @@ void vmm_pcpu_init(void)
 }
 
 /* Initializes a process to run virtual machine contexts, returning the number
- * initialized, optionally setting errno */
+ * initialized, throwing on error. */
 int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores,
                     struct vmm_gpcore_init *u_gpcis, int flags)
 {
        ERRSTACK(1);
        struct vmm *vmm = &p->vmm;
-       unsigned int i;
        struct vmm_gpcore_init gpci;
 
        if (flags & ~VMM_ALL_FLAGS)
@@ -98,17 +97,17 @@ int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores,
        if (!vmm->guest_pcores)
                error(ENOMEM, "Allocation of vmm->guest_pcores failed");
 
-       for (i = 0; i < nr_guest_pcores; i++) {
+       for (int i = 0; i < nr_guest_pcores; i++) {
                if (copy_from_user(&gpci, &u_gpcis[i], sizeof(struct vmm_gpcore_init)))
                        error(EINVAL, "Bad pointer %p for gps", u_gpcis);
                vmm->guest_pcores[i] = create_guest_pcore(p, &gpci);
-               vmm->nr_guest_pcores = i;
+               vmm->nr_guest_pcores = i + 1;
        }
        for (int i = 0; i < VMM_VMEXIT_NR_TYPES; i++)
                vmm->vmexits[i] = 0;
        qunlock(&vmm->qlock);
        poperror();
-       return i;
+       return vmm->nr_guest_pcores;
 }
 
 /* Has no concurrency protection - only call this when you know you have the
@@ -149,7 +148,7 @@ int vmm_poke_guest(struct proc *p, int guest_pcoreid)
                 * mess up; we tried." */
                return 0;
        }
-       send_ipi(pcoreid, I_POKE_CORE);
+       send_ipi(pcoreid, I_POKE_GUEST);
        return 0;
 }
 
@@ -179,10 +178,19 @@ struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid)
        spin_unlock(&p->vmm.lock);
        /* We've got dibs on the gpc; we don't need to hold the lock any longer. */
        pcpui->guest_pcoreid = guest_pcoreid;
-       ept_sync_context(gpc_get_eptp(gpc));
        vmx_load_guest_pcore(gpc);
        /* Load guest's xcr0 */
        lxcr0(gpc->xcr0);
+
+       /* Manual MSR save/restore */
+       write_kern_gsbase(gpc->msr_kern_gs_base);
+       if (gpc->msr_star != AKAROS_MSR_STAR)
+               write_msr(MSR_STAR, gpc->msr_star);
+       if (gpc->msr_lstar != AKAROS_MSR_LSTAR)
+               write_msr(MSR_LSTAR, gpc->msr_lstar);
+       if (gpc->msr_sfmask != AKAROS_MSR_SFMASK)
+               write_msr(MSR_SFMASK, gpc->msr_sfmask);
+
        return gpc;
 }
 
@@ -195,7 +203,6 @@ void unload_guest_pcore(struct proc *p, int guest_pcoreid)
        assert(gpc);
        spin_lock(&p->vmm.lock);
        assert(gpc->cpu != -1);
-       ept_sync_context(gpc_get_eptp(gpc));
        vmx_unload_guest_pcore(gpc);
        gpc->cpu = -1;
 
@@ -203,6 +210,20 @@ void unload_guest_pcore(struct proc *p, int guest_pcoreid)
        gpc->xcr0 = rxcr0();
        lxcr0(__proc_global_info.x86_default_xcr0);
 
+       /* We manage these MSRs manually: save the guest's, restore ours. */
+       gpc->msr_kern_gs_base = read_kern_gsbase();
+       gpc->msr_star = read_msr(MSR_STAR);
+       gpc->msr_lstar = read_msr(MSR_LSTAR);
+       gpc->msr_sfmask = read_msr(MSR_SFMASK);
+
+       write_kern_gsbase((uint64_t)pcpui);
+       if (gpc->msr_star != AKAROS_MSR_STAR)
+               write_msr(MSR_STAR, AKAROS_MSR_STAR);
+       if (gpc->msr_lstar != AKAROS_MSR_LSTAR)
+               write_msr(MSR_LSTAR, AKAROS_MSR_LSTAR);
+       if (gpc->msr_sfmask != AKAROS_MSR_SFMASK)
+               write_msr(MSR_SFMASK, AKAROS_MSR_SFMASK);
+
        /* As soon as we unlock, this gpc can be started on another core */
        spin_unlock(&p->vmm.lock);
        pcpui->guest_pcoreid = -1;
@@ -214,26 +235,24 @@ void unload_guest_pcore(struct proc *p, int guest_pcoreid)
 struct emmsr {
        uint32_t reg;
        char *name;
-       bool (*f)(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                 uint64_t *rax, uint32_t opcode);
+       bool (*f)(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                 uint32_t opcode);
        bool written;
        uint32_t edx, eax;
 };
 
-static bool emsr_miscenable(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                            uint64_t *rax, uint32_t opcode);
-static bool emsr_mustmatch(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                           uint64_t *rax, uint32_t opcode);
-static bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                          uint64_t *rax, uint32_t opcode);
-static bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                          uint64_t *rax, uint32_t opcode);
-static bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                           uint64_t *rax, uint32_t opcode);
-static bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                    uint64_t *rax, uint32_t opcode);
-static bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                               uint64_t *rax, uint32_t opcode);
+static bool emsr_miscenable(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                            uint32_t opcode);
+static bool emsr_readonly(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                          uint32_t opcode);
+static bool emsr_readzero(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                          uint32_t opcode);
+static bool emsr_fakewrite(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                           uint32_t opcode);
+static bool emsr_ok(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                    uint32_t opcode);
+static bool emsr_fake_apicbase(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                               uint32_t opcode);
 
 struct emmsr emmsrs[] = {
        {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
@@ -269,14 +288,16 @@ struct emmsr emmsrs[] = {
        {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
        // aaaaaahhhhhhhhhhhhhhhhhhhhh
        {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
-       {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
-       {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_ok},
+       {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
+       {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_readzero},
        // unsafe.
        {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase},
 
        // mostly harmless.
        {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
        {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},
+       {MSR_IA32_MCG_CAP, "MSR_IA32_MCG_CAP", emsr_readzero},
+       {MSR_IA32_DEBUGCTLMSR, "MSR_IA32_DEBUGCTLMSR", emsr_fakewrite},
 
        // TBD
        {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite},
@@ -285,71 +306,44 @@ struct emmsr emmsrs[] = {
 /* this may be the only register that needs special handling.
  * If there others then we might want to extend the emmsr struct.
  */
-bool emsr_miscenable(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                     uint64_t *rax, uint32_t opcode)
+bool emsr_miscenable(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                     uint32_t opcode)
 {
-       uint32_t eax, edx;
        uint64_t val;
-
-       if (read_msr_safe(msr->reg, &val))
-               return FALSE;
-       split_msr_val(val, &edx, &eax);
-       /* we just let them read the misc msr for now. */
-       if (opcode == VMM_MSR_EMU_READ) {
-               *rax = eax;
-               *rax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
-               *rdx = edx;
-               return TRUE;
-       } else {
-               /* if they are writing what is already written, that's ok. */
-               if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
-                       return TRUE;
-       }
-       printk
-               ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
-                msr->name, (uint32_t) *rdx, (uint32_t) *rax, edx, eax);
-       return FALSE;
-}
-
-/* TODO: this looks like a copy-paste for the read side.  What's the purpose of
- * mustmatch?  No one even uses it. */
-bool emsr_mustmatch(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                    uint64_t *rax, uint32_t opcode)
-{
        uint32_t eax, edx;
-       uint64_t val;
 
        if (read_msr_safe(msr->reg, &val))
                return FALSE;
-       split_msr_val(val, &edx, &eax);
+       eax = low32(val);
+       eax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
+       edx = high32(val);
        /* we just let them read the misc msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
-               *rax = eax;
-               *rdx = edx;
+               vm_tf->tf_rax = eax;
+               vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
-               if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
+               if (((uint32_t) vm_tf->tf_rax == eax)
+                   && ((uint32_t) vm_tf->tf_rdx == edx))
                        return TRUE;
        }
-       printk
-               ("%s: Wanted to write 0x%x:0x%x, but could not; value was 0x%x:0x%x\n",
-                msr->name, (uint32_t) *rdx, (uint32_t) *rax, edx, eax);
+       printk("%s: Wanted to write 0x%x%08x, but could not; value was 0x%x%08x\n",
+              msr->name, (uint32_t) vm_tf->tf_rdx, (uint32_t) vm_tf->tf_rax,
+              edx, eax);
        return FALSE;
 }
 
-bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                   uint64_t *rax, uint32_t opcode)
+bool emsr_readonly(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                   uint32_t opcode)
 {
-       uint32_t eax, edx;
        uint64_t val;
 
        if (read_msr_safe(msr->reg, &val))
                return FALSE;
-       split_msr_val(val, &edx, &eax);
        if (opcode == VMM_MSR_EMU_READ) {
-               *rax = eax;
-               *rdx = edx;
+               vm_tf->tf_rax = low32(val);
+               vm_tf->tf_rdx = high32(val);
                return TRUE;
        }
 
@@ -357,12 +351,12 @@ bool emsr_readonly(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
        return FALSE;
 }
 
-bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                   uint64_t *rax, uint32_t opcode)
+bool emsr_readzero(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                   uint32_t opcode)
 {
        if (opcode == VMM_MSR_EMU_READ) {
-               *rax = 0;
-               *rdx = 0;
+               vm_tf->tf_rax = 0;
+               vm_tf->tf_rdx = 0;
                return TRUE;
        }
 
@@ -371,8 +365,8 @@ bool emsr_readzero(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
 }
 
 /* pretend to write it, but don't write it. */
-bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                    uint64_t *rax, uint32_t opcode)
+bool emsr_fakewrite(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                    uint32_t opcode)
 {
        uint32_t eax, edx;
        uint64_t val;
@@ -380,41 +374,37 @@ bool emsr_fakewrite(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
        if (!msr->written) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
-               split_msr_val(val, &edx, &eax);
+               eax = low32(val);
+               edx = high32(val);
        } else {
-               edx = msr->edx;
                eax = msr->eax;
+               edx = msr->edx;
        }
        /* we just let them read the misc msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
-               *rax = eax;
-               *rdx = edx;
+               vm_tf->tf_rax = eax;
+               vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
-               /* if they are writing what is already written, that's ok. */
-               if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
-                       return TRUE;
-               msr->edx = *rdx;
-               msr->eax = *rax;
+               msr->edx = vm_tf->tf_rdx;
+               msr->eax = vm_tf->tf_rax;
                msr->written = TRUE;
        }
        return TRUE;
 }
 
-bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-             uint64_t *rax, uint32_t opcode)
+bool emsr_ok(struct emmsr *msr, struct vm_trapframe *vm_tf,
+             uint32_t opcode)
 {
-       uint32_t eax, edx;
        uint64_t val;
 
        if (opcode == VMM_MSR_EMU_READ) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
-               split_msr_val(val, &edx, &eax);
-               *rax = eax;
-               *rdx = edx;
+               vm_tf->tf_rax = low32(val);
+               vm_tf->tf_rdx = high32(val);
        } else {
-               val = (*rdx << 32) | (*rax & 0xffffffff);
+               val = (vm_tf->tf_rdx << 32) | (vm_tf->tf_rax & 0xffffffff);
                if (write_msr_safe(msr->reg, val))
                        return FALSE;
        }
@@ -422,15 +412,19 @@ bool emsr_ok(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
 }
 
 /* pretend to write it, but don't write it. */
-bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
-                        uint64_t *rax, uint32_t opcode)
+bool emsr_fake_apicbase(struct emmsr *msr, struct vm_trapframe *vm_tf,
+                        uint32_t opcode)
 {
        uint32_t eax, edx;
 
        if (!msr->written) {
                /* TODO: tightly coupled to the addr in vmrunkernel.  We want this func
                 * to return the val that vmrunkernel put into the VMCS. */
-               eax = 0xfee00900;
+               eax = 0xfee00d00;
+               if (vm_tf->tf_guest_pcoreid != 0) {
+                       // Remove BSP bit if not core 0
+                       eax = 0xfee00c00;
+               }
                edx = 0;
        } else {
                edx = msr->edx;
@@ -438,26 +432,27 @@ bool emsr_fake_apicbase(struct emmsr *msr, uint64_t *rcx, uint64_t *rdx,
        }
        /* we just let them read the misc msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
-               *rax = eax;
-               *rdx = edx;
+               vm_tf->tf_rax = eax;
+               vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
-               if (((uint32_t) *rax == eax) && ((uint32_t) *rdx == edx))
+               if (((uint32_t) vm_tf->tf_rax == eax)
+                   && ((uint32_t) vm_tf->tf_rdx == edx))
-                       return 0;
+                       return TRUE;
-               msr->edx = *rdx;
-               msr->eax = *rax;
+               msr->edx = vm_tf->tf_rdx;
+               msr->eax = vm_tf->tf_rax;
                msr->written = TRUE;
        }
        return TRUE;
 }
 
-bool vmm_emulate_msr(uint64_t *rcx, uint64_t *rdx, uint64_t *rax, int op)
+bool vmm_emulate_msr(struct vm_trapframe *vm_tf, int op)
 {
        for (int i = 0; i < ARRAY_SIZE(emmsrs); i++) {
-               if (emmsrs[i].reg != *rcx)
+               if (emmsrs[i].reg != vm_tf->tf_rcx)
                        continue;
-               return emmsrs[i].f(&emmsrs[i], rcx, rdx, rax, op);
+               return emmsrs[i].f(&emmsrs[i], vm_tf, op);
        }
        return FALSE;
 }