x86: vmm: Track state for handling vmexits as KERNEL
[akaros.git] / kern / arch / x86 / trap.c
index 949d3a1..9f168fa 100644 (file)
@@ -85,9 +85,7 @@ const char *x86_trapname(int trapno)
 }
 
 /* Set stacktop for the current core to be the stack the kernel will start on
- * when trapping/interrupting from userspace.  Don't use this til after
- * smp_percpu_init().  We can probably get the TSS by reading the task register
- * and then the GDT.  Still, it's a pain. */
+ * when trapping/interrupting from userspace. */
 void set_stack_top(uintptr_t stacktop)
 {
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
@@ -102,11 +100,7 @@ uintptr_t get_stack_top(void)
 {
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
        uintptr_t stacktop;
-       /* so we can check this in interrupt handlers (before smp_boot()) */
-       /* TODO: These are dangerous - it assumes we're on a one-page stack.  If we
-        * change it to KSTKSIZE, then we assume stacks are KSTKSIZE-aligned */
-       if (!pcpui->tss)
-               return ROUNDUP(read_sp(), PGSIZE);
+
        stacktop = x86_get_stacktop_tss(pcpui->tss);
        if (stacktop != ROUNDUP(read_sp(), PGSIZE))
                panic("Bad stacktop: %p esp one is %p\n", stacktop,
@@ -156,15 +150,17 @@ void idt_init(void)
        idt[T_BRKPT].gd_dpl = 3;
        /* Send NMIs to their own stack (IST1 in every core's TSS) */
        idt[T_NMI].gd_ist = 1;
+       /* Send double faults to their own stack (IST2 in every core's TSS) */
+       idt[T_DBLFLT].gd_ist = 2;
+
+       /* The sooner we set this, the sooner we can use set/get_stack_top. */
+       per_cpu_info[0].tss = &ts;
+       per_cpu_info[0].gdt = gdt;
 
        /* Set up our kernel stack when changing rings */
        /* Note: we want 16 byte aligned kernel stack frames (AMD 2:8.9.3) */
-       x86_set_stacktop_tss(&ts, (uintptr_t)bootstacktop);
-       x86_sysenter_init((uintptr_t)bootstacktop);
-
-#ifdef CONFIG_KTHREAD_POISON
-       *kstack_bottom_addr((uintptr_t)bootstacktop) = 0xdeadbeef;
-#endif /* CONFIG_KTHREAD_POISON */
+       x86_sysenter_init();
+       set_stack_top((uintptr_t)bootstacktop);
 
        /* Initialize the TSS field of the gdt.  The size of the TSS desc differs
         * between 64 and 32 bit, hence the pointer acrobatics */
@@ -538,6 +534,13 @@ void handle_nmi(struct hw_trapframe *hw_tf)
        assert(0);
 }
 
+void handle_double_fault(struct hw_trapframe *hw_tf)
+{      /* Double-fault handler: report the faulting context, then panic(). */
+       print_trapframe(hw_tf);         /* dump the trapframe we were handed */
+       backtrace_hwtf(hw_tf);          /* backtrace from that same trapframe */
+       panic("Double fault!  Check the kernel stack pointer; you likely ran off the end of the stack.");
+}
+
 /* Certain traps want IRQs enabled, such as the syscall.  Others can't handle
  * it, like the page fault handler.  Turn them on on a case-by-case basis. */
 static void trap_dispatch(struct hw_trapframe *hw_tf)
@@ -550,9 +553,10 @@ static void trap_dispatch(struct hw_trapframe *hw_tf)
        // Handle processor exceptions.
        switch(hw_tf->tf_trapno) {
                case T_BRKPT:
-                       enable_irq();
-                       monitor(hw_tf);
-                       disable_irq();
+                       if (!in_kernel(hw_tf))
+                               backtrace_user_ctx(current, current_ctx);
+                       else
+                               monitor(hw_tf);
                        handled = TRUE;
                        break;
                case T_ILLOP:
@@ -576,13 +580,8 @@ static void trap_dispatch(struct hw_trapframe *hw_tf)
                            *(uint8_t*)(ip + 1) == 0x01,
                            *(uint8_t*)(ip + 2) == 0xf9) {
                                x86_fake_rdtscp(hw_tf);
-                               pcpui->__lock_checking_enabled++;       /* for print debugging */
                                handled = TRUE;
-                               break;
                        }
-                       enable_irq();
-                       monitor(hw_tf);
-                       disable_irq();
                        pcpui->__lock_checking_enabled++;               /* for print debugging */
                        break;
                }
@@ -880,26 +879,50 @@ static bool handle_vmexit_cpuid(struct vm_trapframe *tf)
 {
        uint32_t eax, ebx, ecx, edx;
 
-       /* 0x4000000 is taken from Linux; it is not documented but it signals the
-        * use of KVM. */
-       if (tf->tf_rax == 0x40000000) {
-               /* Pretend to be KVM: Return the KVM signature by placing the following
-                * constants in RAX, RBX, RCX and RDX. RAX is set to 0, while RBX to
-                * RDX forms the string "KVMKVMKVMKVM\0\0\0". This can be placed in
-                * 0x100 offsets from 0x40000000 to 0x40010000. */
-               eax = 0;
-               ebx = 0x4b4d564b;
-               ecx = 0x564b4d56;
-               edx = 0x0000004d;
-       } else {
-               cpuid(tf->tf_rax, tf->tf_rcx, &eax, &ebx, &ecx, &edx);
-               if (tf->tf_rax == 1) {
+       if (tf->tf_rax == 0x0B)
+               return FALSE;   // Handle in userspace.
+
+       cpuid(tf->tf_rax, tf->tf_rcx, &eax, &ebx, &ecx, &edx);
+       switch (tf->tf_rax) {
+               case 0x01:
                        /* Set the hypervisor bit to let the guest know it is virtualized */
                        ecx |= 1 << 31;
+                       /* Unset the monitor capability bit so that the guest does not try
+                        * to use monitor/mwait. */
+                       ecx &= ~(1 << 3);
                        /* Unset the vmx capability bit so that the guest does not try
                         * to turn it on. */
                        ecx &= ~(1 << 5);
-               }
+                       /* Unset the perf capability bit so that the guest does not try
+                        * to turn it on. */
+                       ecx &= ~(1 << 15);
+
+                       /* Set the guest pcore id into the apic ID field in CPUID. */
+                       ebx &= 0x0000ffff;
+                       ebx |= (current->vmm.nr_guest_pcores & 0xff) << 16;
+                       ebx |= (tf->tf_guest_pcoreid & 0xff) << 24;
+                       break;
+               case 0x0A:
+                       eax = 0;
+                       ebx = 0;
+                       ecx = 0;
+                       edx = 0;
+                       break;
+               /* Signal the use of KVM. */
+               case 0x40000000:
+                       eax = 0;
+                       ebx = 0x4b4d564b;
+                       ecx = 0x564b4d56;
+                       edx = 0x0000004d;
+                       break;
+               /* Hypervisor Features. */
+               case 0x40000003:
+                       /* Unset the monitor capability bit so that the guest does not try
+                        * to use monitor/mwait. */
+                       edx &= ~(1 << 0);
+                       break;
+               default:
+                       break;
        }
        tf->tf_rax = eax;
        tf->tf_rbx = ebx;
@@ -918,11 +941,14 @@ static bool handle_vmexit_ept_fault(struct vm_trapframe *tf)
        prot |= tf->tf_exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
        prot |= tf->tf_exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
        ret = handle_page_fault(current, tf->tf_guest_pa, prot);
-       if (ret) {
-               /* TODO: maybe put ret in the TF somewhere */
-               return FALSE;
-       }
-       return TRUE;
+       if (ret == 0)
+               return TRUE;
+
+       // Mirror the behavior in uthreads: tell userspace to try again.
+       if (ret == -EAGAIN)
+               tf->tf_flags |= VMCTX_FL_EPT_VMR_BACKED;
+
+       return FALSE;
 }
 
 /* Regarding NMI blocking,
@@ -948,7 +974,7 @@ bool handle_vmexit_msr(struct vm_trapframe *tf)
 {
        bool ret;
 
-       ret = vmm_emulate_msr(&tf->tf_rcx, &tf->tf_rdx, &tf->tf_rax,
+       ret = vmm_emulate_msr(tf,
                              (tf->tf_exit_reason == EXIT_REASON_MSR_READ
                                                   ? VMM_MSR_EMU_READ : VMM_MSR_EMU_WRITE));
        if (ret)
@@ -1042,7 +1068,8 @@ static void vmexit_dispatch(struct vm_trapframe *tf)
         * do it for external IRQs - the irq_dispatch code will handle it. */
        switch (tf->tf_exit_reason) {
        case EXIT_REASON_VMCALL:
-               if (current->vmm.flags & VMM_VMCALL_PRINTF) {
+               if (current->vmm.flags & VMM_VMCALL_PRINTF &&
+                   tf->tf_rax == VMCALL_PRINTC) {
                        printk("%c", tf->tf_rdi);
                        tf->tf_rip += 3;
                        handled = TRUE;
@@ -1103,6 +1130,7 @@ void handle_vmexit(struct vm_trapframe *tf)
        tf->tf_guest_pa = vmcs_read(GUEST_PHYSICAL_ADDRESS);
 
        set_current_ctx_vm(pcpui, tf);
+       __set_cpu_state(pcpui, CPU_STATE_KERNEL);
        tf = &pcpui->cur_ctx->tf.vm_tf;
        vmexit_dispatch(tf);
        /* We're either restarting a partial VM ctx (vmcs was launched, loaded on
@@ -1118,16 +1146,16 @@ void handle_vmexit(struct vm_trapframe *tf)
  * loaded with the current GS (the kernel's). */
 static void x86_finalize_hwtf(struct hw_trapframe *tf)
 {
-       tf->tf_gsbase = read_msr(MSR_KERNEL_GS_BASE);
-       write_msr(MSR_KERNEL_GS_BASE, read_gsbase());
+       tf->tf_gsbase = read_kern_gsbase();     /* save the kern-gs slot in the TF */
+       write_kern_gsbase(read_gsbase());       /* then put the current GS base there */
        tf->tf_fsbase = read_fsbase();
        x86_hwtf_clear_partial(tf);
 }
 
 static void x86_finalize_swtf(struct sw_trapframe *tf)
 {
-       tf->tf_gsbase = read_msr(MSR_KERNEL_GS_BASE);
-       write_msr(MSR_KERNEL_GS_BASE, read_gsbase());
+       tf->tf_gsbase = read_kern_gsbase();     /* save the kern-gs slot in the TF */
+       write_kern_gsbase(read_gsbase());       /* then put the current GS base there */
        tf->tf_fsbase = read_fsbase();
        x86_swtf_clear_partial(tf);
 }
@@ -1137,7 +1165,7 @@ static void x86_finalize_vmtf(struct vm_trapframe *tf)
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 
        x86_vmtf_clear_partial(tf);
-       unload_guest_pcore(pcpui->cur_proc, pcpui->guest_pcoreid);
+       unload_guest_pcore(pcpui->owning_proc, pcpui->guest_pcoreid);
 }
 
 /* Makes sure that the user context is fully saved into ctx and not split across