VMM: Add kernel support for VM contexts [1/2]
author    Barret Rhoden <brho@cs.berkeley.edu>
          Tue, 2 Feb 2016 16:58:24 +0000 (11:58 -0500)
committer Barret Rhoden <brho@cs.berkeley.edu>
          Tue, 2 Feb 2016 22:43:52 +0000 (17:43 -0500)
The kernel now knows how to pop VM contexts and handle VM exits.

As of this commit, we're still using the old KVM loop: HOST_RIP on resume
still points into the old KVM loop, IPI injection still uses the vmctl, and
userspace does not yet ask the kernel to use VM contexts.

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
kern/arch/x86/process64.c
kern/arch/x86/trap.c
kern/arch/x86/trap64.h
kern/arch/x86/trapentry64.S
user/pthread/pthread.c

diff --git a/kern/arch/x86/process64.c b/kern/arch/x86/process64.c
index bf2cb4b..14cfcd6 100644
@@ -73,12 +73,113 @@ static void __attribute__((noreturn)) proc_pop_swtf(struct sw_trapframe *tf)
        panic("sysret failed");
 }
 
+/* If popping a VM TF fails for some reason, we need to reflect it back to the
+ * user.  It is possible that the reflection fails.  We still need to run
+ * something, and it's a lousy time to try something else, so we'll give them a
+ * TF that will probably fault right away and kill them. */
+static void __attribute__((noreturn)) handle_bad_vm_tf(struct vm_trapframe *tf)
+{
+       struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
+
+       tf->tf_exit_reason |= VMX_EXIT_REASONS_FAILED_VMENTRY;
+       tf->tf_flags |= VMCTX_FL_HAS_FAULT;
+       if (reflect_current_context()) {
+               printk("[kernel] Unable to reflect after a bad VM enter\n");
+               proc_init_ctx(pcpui->cur_ctx, 0, 0xcafebabe, 0, 0);
+       }
+       proc_pop_ctx(pcpui->cur_ctx);
+}
+
 static void __attribute__((noreturn)) proc_pop_vmtf(struct vm_trapframe *tf)
 {
-       /* This function probably will be able to fail internally.  If that happens,
-        * we'll just build a dummy SW TF and pop that instead. */
-       /* TODO: (VMCTX) */
-       panic("Not implemented");
+       struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
+       struct proc *p = pcpui->cur_proc;
+       struct vmx_vcpu *gpc;
+
+       if (x86_vmtf_is_partial(tf)) {
+               gpc = lookup_guest_pcore(p, tf->tf_guest_pcoreid);
+               assert(gpc);
+               assert(pcpui->guest_pcoreid == tf->tf_guest_pcoreid);
+       } else {
+               gpc = load_guest_pcore(p, tf->tf_guest_pcoreid);
+               if (!gpc) {
+                       tf->tf_exit_reason = EXIT_REASON_GUEST_IN_USE;
+                       handle_bad_vm_tf(tf);
+               }
+       }
+       vmcs_write(GUEST_RSP, tf->tf_rsp);
+       vmcs_write(GUEST_CR3, tf->tf_cr3);
+       vmcs_write(GUEST_RIP, tf->tf_rip);
+       vmcs_write(GUEST_RFLAGS, tf->tf_rflags);
+       /* cr2 is not part of the VMCS state; we need to save/restore it manually */
+       lcr2(tf->tf_cr2);
+       vmcs_write(VM_ENTRY_INTR_INFO_FIELD, tf->tf_trap_inject);
+       /* vmlaunch/resume can fail, so we need to be able to return from this.
+        * Thus we can't clobber rsp via the popq style of setting the registers.
+        * Likewise, we don't want to lose rbp via the clobber list.
+        *
+        * Partial contexts have already been launched, so we resume them. */
+       asm volatile ("testl $"STRINGIFY(VMCTX_FL_PARTIAL)", %c[flags](%0);"
+                     "pushq %%rbp;              "      /* save in case we fail */
+                     "movq %c[rbx](%0), %%rbx;  "
+                     "movq %c[rcx](%0), %%rcx;  "
+                     "movq %c[rdx](%0), %%rdx;  "
+                     "movq %c[rbp](%0), %%rbp;  "
+                     "movq %c[rsi](%0), %%rsi;  "
+                     "movq %c[rdi](%0), %%rdi;  "
+                     "movq %c[r8](%0),  %%r8;   "
+                     "movq %c[r9](%0),  %%r9;   "
+                     "movq %c[r10](%0), %%r10;  "
+                     "movq %c[r11](%0), %%r11;  "
+                     "movq %c[r12](%0), %%r12;  "
+                     "movq %c[r13](%0), %%r13;  "
+                     "movq %c[r14](%0), %%r14;  "
+                     "movq %c[r15](%0), %%r15;  "
+                     "movq %c[rax](%0), %%rax;  "      /* clobber our *tf last */
+                     "jnz 1f;                   "      /* jump if partial */
+                     ASM_VMX_VMLAUNCH";         "      /* non-partial gets launched */
+                     "jmp 2f;                   "
+                     "1: "ASM_VMX_VMRESUME";    "      /* partials get resumed */
+                     "2: popq %%rbp;            "      /* vmlaunch failed */
+                     :
+                     : "a" (tf),
+                       [rax]"i"(offsetof(struct vm_trapframe, tf_rax)),
+                       [rbx]"i"(offsetof(struct vm_trapframe, tf_rbx)),
+                       [rcx]"i"(offsetof(struct vm_trapframe, tf_rcx)),
+                       [rdx]"i"(offsetof(struct vm_trapframe, tf_rdx)),
+                       [rbp]"i"(offsetof(struct vm_trapframe, tf_rbp)),
+                       [rsi]"i"(offsetof(struct vm_trapframe, tf_rsi)),
+                       [rdi]"i"(offsetof(struct vm_trapframe, tf_rdi)),
+                        [r8]"i"(offsetof(struct vm_trapframe, tf_r8)),
+                        [r9]"i"(offsetof(struct vm_trapframe, tf_r9)),
+                       [r10]"i"(offsetof(struct vm_trapframe, tf_r10)),
+                       [r11]"i"(offsetof(struct vm_trapframe, tf_r11)),
+                       [r12]"i"(offsetof(struct vm_trapframe, tf_r12)),
+                       [r13]"i"(offsetof(struct vm_trapframe, tf_r13)),
+                       [r14]"i"(offsetof(struct vm_trapframe, tf_r14)),
+                       [r15]"i"(offsetof(struct vm_trapframe, tf_r15)),
+                       [flags]"i"(offsetof(struct vm_trapframe, tf_flags))
+                     : "cc", "memory", "rbx", "rcx", "rdx", "rsi", "rdi",
+                       "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
+       /* vmlaunch/resume failed.  It could be for a few reasons, including things
+        * like launching instead of resuming, not having a VMCS loaded, failing a
+        * host-state area check, etc.  Those are kernel problems.
+        *
+        * The user also might be able to trigger some of these failures.  For
+        * instance, rflags could be bad, or the trap_injection could be
+        * malformed.  We might catch that in secure_tf, or we could reflect
+        * those to the user.  Distinguishing between kernel and user mistakes
+        * might be a pain.
+        *
+        * For now, the plan is to just reflect everything back to the user and
+        * whitelist errors that are known to be kernel bugs.
+        *
+        * Also, we should always have a current, non-shadow VMCS loaded, so the
+        * failure should be VMfailValid: ZF will be 1 and we can read the error
+        * register. */
+       assert(read_flags() & FL_ZF);
+       tf->tf_exit_reason = EXIT_REASON_VMENTER_FAILED;
+       tf->tf_exit_qual = vmcs_read(VM_INSTRUCTION_ERROR);
+       handle_bad_vm_tf(tf);
 }
 
 void proc_pop_ctx(struct user_context *ctx)
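The tf_trap_inject value that proc_pop_vmtf writes into VM_ENTRY_INTR_INFO_FIELD
above uses the architectural VM-entry interruption-information format: vector in
bits 7:0, type in bits 10:8, valid in bit 31.  A small sketch of how a caller
might build one; the helper name is illustrative and not part of this commit:

/* Sketch: build a VM-entry interruption-information value.
 * Bits 7:0 = vector, bits 10:8 = type (0 = external interrupt,
 * 3 = hardware exception), bit 31 = valid.  Name is illustrative. */
static inline uint32_t vmtf_build_injection(uint8_t vector, uint8_t type)
{
	return (1U << 31) | ((uint32_t)type << 8) | vector;
}

/* e.g. to inject guest IRQ vector 32 as an external interrupt:
 *	tf->tf_trap_inject = vmtf_build_injection(32, 0); */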
diff --git a/kern/arch/x86/trap.c b/kern/arch/x86/trap.c
index 876c3b2..6cb72cb 100644
@@ -676,3 +676,191 @@ void send_ipi(uint32_t os_coreid, uint8_t vector)
        }
        __send_ipi(hw_coreid, vector);
 }
+
+/****************** VM exit handling ******************/
+
+static bool handle_vmexit_cpuid(struct vm_trapframe *tf)
+{
+       uint32_t eax, ebx, ecx, edx;
+
+       cpuid(tf->tf_rax, tf->tf_rcx, &eax, &ebx, &ecx, &edx);
+       tf->tf_rax = eax;
+       tf->tf_rbx = ebx;
+       tf->tf_rcx = ecx;
+       tf->tf_rdx = edx;
+       tf->tf_rip += 2;
+       return TRUE;
+}
+
+static bool handle_vmexit_ept_fault(struct vm_trapframe *tf)
+{
+       int prot = 0;
+       int ret;
+
+       prot |= tf->tf_exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
+       prot |= tf->tf_exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
+       prot |= tf->tf_exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
+       ret = handle_page_fault(current, tf->tf_guest_pa, prot);
+       if (ret) {
+               /* TODO: maybe put ret in the TF somewhere */
+               return FALSE;
+       }
+       return TRUE;
+}
+
+static bool handle_vmexit_nmi(struct vm_trapframe *tf)
+{
+       /* Sanity checks: make sure we really got an NMI.  Feel free to remove. */
+       assert((tf->tf_intrinfo2 & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR);
+       assert((tf->tf_intrinfo2 & INTR_INFO_VECTOR_MASK) == T_NMI);
+       /* Our NMI handler from trap.c won't run, but we don't need its
+        * lock-disabling stuff here anyway. */
+       extern bool mon_verbose_trace;
+
+       if (mon_verbose_trace) {
+               print_vmtrapframe(tf);
+               /* TODO: a backtrace of the guest would be nice here. */
+       }
+       printk("Core %d is at %p\n", core_id(), get_vmtf_pc(tf));
+       return TRUE;
+}
+
+bool handle_vmexit_msr(struct vm_trapframe *tf)
+{
+       bool ret;
+
+       ret = vmm_emulate_msr(&tf->tf_rcx, &tf->tf_rdx, &tf->tf_rax,
+                             (tf->tf_exit_reason == EXIT_REASON_MSR_READ
+                                                  ? VMM_MSR_EMU_READ : VMM_MSR_EMU_WRITE));
+       if (ret)
+               tf->tf_rip += 2;
+       return ret;
+}
+
+bool handle_vmexit_extirq(struct vm_trapframe *tf)
+{
+       struct hw_trapframe hw_tf;
+
+       /* For now, we just handle external IRQs.  I think guest traps should go to
+        * the guest, based on our vmctls. */
+       assert((tf->tf_intrinfo2 & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_EXT_INTR);
+       /* TODO: Our IRQ handlers all expect TFs.  Let's fake one.  A bunch of
+        * handlers (e.g. backtrace/perf) will probably be unhappy about a user TF
+        * that is really a VM, so this all needs work. */
+       hw_tf.tf_gsbase = 0;
+       hw_tf.tf_fsbase = 0;
+       hw_tf.tf_rax = tf->tf_rax;
+       hw_tf.tf_rbx = tf->tf_rbx;
+       hw_tf.tf_rcx = tf->tf_rcx;
+       hw_tf.tf_rdx = tf->tf_rdx;
+       hw_tf.tf_rbp = tf->tf_rbp;
+       hw_tf.tf_rsi = tf->tf_rsi;
+       hw_tf.tf_rdi = tf->tf_rdi;
+       hw_tf.tf_r8 = tf->tf_r8;
+       hw_tf.tf_r9 = tf->tf_r9;
+       hw_tf.tf_r10 = tf->tf_r10;
+       hw_tf.tf_r11 = tf->tf_r11;
+       hw_tf.tf_r12 = tf->tf_r12;
+       hw_tf.tf_r13 = tf->tf_r13;
+       hw_tf.tf_r14 = tf->tf_r14;
+       hw_tf.tf_r15 = tf->tf_r15;
+       hw_tf.tf_trapno = tf->tf_intrinfo2 & INTR_INFO_VECTOR_MASK;
+       hw_tf.tf_err = 0;
+       hw_tf.tf_rip = tf->tf_rip;
+       hw_tf.tf_cs = GD_UT;    /* faking a user TF, even though it's a VM */
+       hw_tf.tf_rflags = tf->tf_rflags;
+       hw_tf.tf_rsp = tf->tf_rsp;
+       hw_tf.tf_ss = GD_UD;
+
+       irq_dispatch(&hw_tf);
+       /* Consider returning whether or not there was a handler registered */
+       return TRUE;
+}
+
+static void vmexit_dispatch(struct vm_trapframe *tf)
+{
+       bool handled = FALSE;
+
+       /* Do not block in any of these functions.
+        *
+        * If we block, we'll probably need to finalize the context.  If we do, then
+        * there's a chance the guest pcore can start somewhere else, and then we
+        * can't get the GPC loaded again.  Plus, they could be running a GPC with
+        * an unresolved vmexit.  It's just a mess.
+        *
+        * If we want to enable IRQs, we can do so on a case-by-case basis.  Don't
+        * do it for external IRQs - the irq_dispatch code will handle it. */
+       switch (tf->tf_exit_reason) {
+       case EXIT_REASON_VMCALL:
+               if (current->vmm.flags & VMM_VMCALL_PRINTF) {
+                       printk("%c", tf->tf_rdi);
+                       tf->tf_rip += 3;
+                       handled = TRUE;
+               }
+               break;
+       case EXIT_REASON_CPUID:
+               handled = handle_vmexit_cpuid(tf);
+               break;
+       case EXIT_REASON_EPT_VIOLATION:
+               handled = handle_vmexit_ept_fault(tf);
+               break;
+       case EXIT_REASON_EXCEPTION_NMI:
+               handled = handle_vmexit_nmi(tf);
+               break;
+       case EXIT_REASON_MSR_READ:
+       case EXIT_REASON_MSR_WRITE:
+               handled = handle_vmexit_msr(tf);
+               break;
+       case EXIT_REASON_EXTERNAL_INTERRUPT:
+               handled = handle_vmexit_extirq(tf);
+               break;
+       default:
+               printd("Unhandled vmexit: reason 0x%x, exit qualification 0x%x\n",
+                      tf->tf_exit_reason, tf->tf_exit_qual);
+       }
+       if (!handled) {
+               tf->tf_flags |= VMCTX_FL_HAS_FAULT;
+               if (reflect_current_context()) {
+                       /* VM contexts shouldn't be in vcore context, so this should be
+                        * pretty rare (unlike SCPs or VC ctx page faults). */
+                       printk("[kernel] Unable to reflect VM Exit\n");
+                       print_vmtrapframe(tf);
+                       proc_destroy(current);
+               }
+       }
+}
+
+void handle_vmexit(struct vm_trapframe *tf)
+{
+       struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
+
+       tf->tf_rip = vmcs_read(GUEST_RIP);
+       tf->tf_rflags = vmcs_read(GUEST_RFLAGS);
+       tf->tf_rsp = vmcs_read(GUEST_RSP);
+       tf->tf_cr2 = rcr2();
+       tf->tf_cr3 = vmcs_read(GUEST_CR3);
+       tf->tf_guest_pcoreid = pcpui->guest_pcoreid;
+       tf->tf_flags |= VMCTX_FL_PARTIAL;
+       tf->tf_exit_reason = vmcs_read(VM_EXIT_REASON);
+       tf->tf_exit_qual = vmcs_read(EXIT_QUALIFICATION);
+       tf->tf_intrinfo1 = vmcs_read(GUEST_INTERRUPTIBILITY_INFO);
+       tf->tf_intrinfo2 = vmcs_read(VM_EXIT_INTR_INFO);
+       tf->tf_guest_va = vmcs_read(GUEST_LINEAR_ADDRESS);
+       tf->tf_guest_pa = vmcs_read(GUEST_PHYSICAL_ADDRESS);
+
+       set_current_ctx_vm(pcpui, tf);
+       tf = &pcpui->cur_ctx->tf.vm_tf;
+       vmexit_dispatch(tf);
+       /* We're either restarting a partial VM ctx (vmcs was launched, loaded on
+        * the core, etc) or a SW vc ctx for the reflected trap.  Or the proc is
+        * dying and we'll handle a __death KMSG shortly. */
+       proc_restartcore();
+       proc_restartcore();
+}
+
+void x86_finalize_vmtf(struct vm_trapframe *tf)
+{
+       struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
+
+       x86_vmtf_clear_partial(tf);
+       unload_guest_pcore(pcpui->cur_proc, pcpui->guest_pcoreid);
+       unload_guest_pcore(pcpui->cur_proc, pcpui->guest_pcoreid);
+}
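For reference, the guest side of the VMM_VMCALL_PRINTF protocol handled in
vmexit_dispatch() above looks roughly like the sketch below (not part of this
commit).  The character travels in %rdi, and vmcall is a 3-byte instruction,
which is why the handler advances tf_rip by 3.

#include <stdint.h>

/* Guest-side sketch of the vmcall-putchar protocol consumed above. */
static inline void guest_putchar(char c)
{
	asm volatile ("vmcall" : : "D" ((uint64_t)c) : "memory");
}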
diff --git a/kern/arch/x86/trap64.h b/kern/arch/x86/trap64.h
index 555d8e6..db9c185 100644
@@ -176,12 +176,7 @@ static inline void x86_finalize_swtf(struct sw_trapframe *tf)
        x86_swtf_clear_partial(tf);
 }
 
-static inline void x86_finalize_vmtf(struct vm_trapframe *tf)
-{
-       x86_vmtf_clear_partial(tf);
-       /* TODO: (VMCTX) */
-       panic("Not implemented");
-}
+void x86_finalize_vmtf(struct vm_trapframe *tf);
 
 /* Makes sure that the user context is fully saved into ctx and not split across
  * the struct and HW, meaning it is not a "partial context". */
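The comment above describes the arch-level "finalize" step, whose body the diff
context elides.  A minimal sketch of what that dispatch looks like; the function
name, the ROS_HW_CTX/ROS_SW_CTX tags, and x86_finalize_hwtf are assumptions,
while x86_finalize_swtf and x86_finalize_vmtf are the functions this hunk touches:

/* Sketch only: finalize a context so none of it is left live in HW state.
 * arch_finalize_ctx_sketch and x86_finalize_hwtf are assumed names. */
static inline void arch_finalize_ctx_sketch(struct user_context *ctx)
{
	switch (ctx->type) {
	case ROS_HW_CTX:
		x86_finalize_hwtf(&ctx->tf.hw_tf);
		break;
	case ROS_SW_CTX:
		x86_finalize_swtf(&ctx->tf.sw_tf);
		break;
	case ROS_VM_CTX:
		x86_finalize_vmtf(&ctx->tf.vm_tf);
		break;
	}
}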
diff --git a/kern/arch/x86/trapentry64.S b/kern/arch/x86/trapentry64.S
index 5e4f19c..4723314 100644
@@ -502,3 +502,42 @@ normal_syscall:
        # return via pop_tf, never this path
 sysenter_spin:
        jmp sysenter_spin
+
+.globl vmexit_handler;
+.type vmexit_handler, @function;
+vmexit_handler:
+       # On vmexit, rflags has all flags = 0, so interrupts are off and the
+       # direction flag is clear (as if we had run cli and cld).
+       # GS base and RSP are set by the hardware (from HOST_GS_BASE and HOST_RSP).
+       # Set default values.  Most of these will be set in C later.
+       pushq $0                        # guest_pa
+       pushq $0                        # guest_va
+       pushq $0                        # intrinfo2 and 1
+       pushq $0                        # exit_qual + exit_reason
+       pushq $0                        # pad + trap_inject
+       pushq $0                        # flags + guest_pcoreid
+       pushq $0                        # cr3
+       pushq $0                        # cr2
+       pushq $0                        # rsp
+       pushq $0                        # rflags
+       pushq $0                        # rip
+       # Save register state
+       pushq %r15
+       pushq %r14
+       pushq %r13
+       pushq %r12
+       pushq %r11
+       pushq %r10
+       pushq %r9
+       pushq %r8
+       pushq %rdi
+       pushq %rsi
+       pushq %rbp
+       pushq %rdx
+       pushq %rcx
+       pushq %rbx
+       pushq %rax
+       movq $0, %rbp                   # so we can backtrace to this point
+       movq %rsp, %rdi
+       call handle_vmexit
+vmexit_spin:
+       jmp vmexit_spin
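The push order above is what makes %rsp a valid struct vm_trapframe * for the
handle_vmexit call: the GPRs pushed last land at the lowest offsets.  A sketch
of the implied layout, low offsets first; the field names come from the
offsetof/field uses elsewhere in this commit, but treat the widths, padding,
and the ordering within each paired 32-bit push as assumptions:

/* Sketch of the vm_trapframe layout implied by the pushes above. */
struct vm_trapframe_sketch {
	uint64_t tf_rax, tf_rbx, tf_rcx, tf_rdx, tf_rbp, tf_rsi, tf_rdi;
	uint64_t tf_r8, tf_r9, tf_r10, tf_r11, tf_r12, tf_r13, tf_r14, tf_r15;
	uint64_t tf_rip, tf_rflags, tf_rsp, tf_cr2, tf_cr3;
	uint32_t tf_guest_pcoreid, tf_flags;	/* "flags + guest_pcoreid" push */
	uint32_t tf_trap_inject, tf_padding;	/* "pad + trap_inject" push */
	uint32_t tf_exit_reason, tf_exit_qual;	/* "exit_qual + exit_reason" push */
	uint32_t tf_intrinfo1, tf_intrinfo2;	/* "intrinfo2 and 1" push */
	uint64_t tf_guest_va, tf_guest_pa;
};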
diff --git a/user/pthread/pthread.c b/user/pthread/pthread.c
index 51b1363..31f6e51 100644
@@ -344,9 +344,6 @@ static void pth_thread_refl_fault(struct uthread *uth,
                                         __arch_refl_get_err(ctx),
                                         __arch_refl_get_aux(ctx));
                break;
-       case ROS_VM_CTX:
-               /* TODO: (VMCTX) the pthread 2LS might not bother with this */
-               break;
        default:
                assert(0);
        }
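The pthread 2LS simply drops its ROS_VM_CTX stub here, since it never runs VM
contexts.  A 2LS that does run them (e.g. a VMM library) would want a handler
for reflected VM contexts along these lines; everything below other than the
vm_trapframe fields and VMCTX_FL_HAS_FAULT is a hypothetical name, not part of
this commit:

/* Hypothetical sketch: what a VM-aware 2LS might do with a VM context the
 * kernel reflected back (tf_flags has VMCTX_FL_HAS_FAULT set). */
static void vmm_refl_vm_ctx(struct uthread *uth, struct user_context *ctx)
{
	struct vm_trapframe *vm_tf = &ctx->tf.vm_tf;

	if (!vmm_handle_exit(vm_tf)) {		/* hypothetical VMM helper */
		printf("Unhandled guest exit 0x%x at RIP %p, killing guest\n",
		       vm_tf->tf_exit_reason, (void*)vm_tf->tf_rip);
		exit(-1);
	}
	vm_tf->tf_flags &= ~VMCTX_FL_HAS_FAULT;
	/* requeue the guest thread; when it runs again, the kernel pops the VM TF */
	uthread_runnable(uth);			/* assumed 2LS/parlib hook */
}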