akaros/kern/arch/x86/trap.c
   1#include <arch/mmu.h>
   2#include <arch/x86.h>
   3#include <arch/arch.h>
   4#include <arch/console.h>
   5#include <arch/apic.h>
   6#include <arch/perfmon.h>
   7#include <ros/common.h>
   8#include <smp.h>
   9#include <assert.h>
  10#include <pmap.h>
  11#include <trap.h>
  12#include <monitor.h>
  13#include <process.h>
  14#include <mm.h>
  15#include <stdio.h>
  16#include <slab.h>
  17#include <syscall.h>
  18#include <kdebug.h>
  19#include <kmalloc.h>
  20#include <ex_table.h>
  21#include <arch/mptables.h>
  22#include <ros/procinfo.h>
  23
  24enum {
  25        NMI_NORMAL_OPN = 0,
  26        NMI_IN_PROGRESS,
  27        NMI_HANDLE_ANOTHER,
  28};
  29
  30taskstate_t ts;
  31
  32/* Interrupt descriptor table.  64 bit needs 16 byte alignment (i think). */
  33gatedesc_t __attribute__((aligned (16))) idt[256] = { { 0 } };
  34pseudodesc_t idt_pd;
  35
   36/* Interrupt handler table; each element is a linked list of handlers for a
  37 * given IRQ.  Modification requires holding the lock (TODO: RCU) */
  38struct irq_handler *irq_handlers[NUM_IRQS];
  39spinlock_t irq_handler_wlock = SPINLOCK_INITIALIZER_IRQSAVE;
  40
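/* If the kernel faulted at an instruction listed in the exception fixup table
 * (ex_table), redirect the trapframe to the fixup address and report the
 * fault as absorbed.  Faults from userspace are never fixed up here. */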
  41static bool try_handle_exception_fixup(struct hw_trapframe *hw_tf)
  42{
  43        if (in_kernel(hw_tf)) {
  44                uintptr_t fixup_ip = get_fixup_ip(hw_tf->tf_rip);
  45
  46                if (fixup_ip != 0) {
  47                        hw_tf->tf_rip = fixup_ip;
  48                        return true;
  49                }
  50        }
  51
  52        return false;
  53}
  54
  55const char *x86_trapname(int trapno)
  56{
  57        static const char *const excnames[] = {
  58                "Divide error",
  59                "Debug",
  60                "Non-Maskable Interrupt",
  61                "Breakpoint",
  62                "Overflow",
  63                "BOUND Range Exceeded",
  64                "Invalid Opcode",
  65                "Device Not Available",
  66                "Double Fault",
  67                "Coprocessor Segment Overrun",
  68                "Invalid TSS",
  69                "Segment Not Present",
  70                "Stack Fault",
  71                "General Protection",
  72                "Page Fault",
  73                "(unknown trap)",
  74                "x87 FPU Floating-Point Error",
  75                "Alignment Check",
  76                "Machine-Check",
  77                "SIMD Floating-Point Exception"
  78        };
  79
  80        if (trapno < sizeof(excnames)/sizeof(excnames[0]))
  81                return excnames[trapno];
  82        if (trapno == T_SYSCALL)
  83                return "System call";
  84        return "(unknown trap)";
  85}
  86
  87/* Set stacktop for the current core to be the stack the kernel will start on
  88 * when trapping/interrupting from userspace. */
  89void set_stack_top(uintptr_t stacktop)
  90{
  91        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
  92
  93        /* No need to reload the task register, this takes effect immediately */
  94        x86_set_stacktop_tss(pcpui->tss, stacktop);
  95        /* Also need to make sure sysenters come in correctly */
  96        x86_set_sysenter_stacktop(stacktop);
  97}
  98
   99/* Note the check implies we are only on a one-page stack (or the first page) */
 100uintptr_t get_stack_top(void)
 101{
 102        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 103        uintptr_t stacktop;
 104
 105        stacktop = x86_get_stacktop_tss(pcpui->tss);
 106        if (stacktop != ROUNDUP(read_sp(), PGSIZE))
 107                panic("Bad stacktop: %p esp one is %p\n", stacktop,
 108                      ROUNDUP(read_sp(), PGSIZE));
 109        return stacktop;
 110}
 111
 112/* Sends a non-maskable interrupt; the handler will print a trapframe. */
 113void send_nmi(uint32_t os_coreid)
 114{
 115        /* NMI / IPI for x86 are limited to 8 bits */
 116        uint8_t hw_core = (uint8_t)get_hw_coreid(os_coreid);
 117
 118        __send_nmi(hw_core);
 119}
 120
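/* Boot-core trap setup: build the IDT from trap_tbl, point the boot TSS/GDT
 * slot at 'ts', set up sysenter, load TR and the IDT, remap/mask the PICs,
 * count cores via MP/ACPI, bring the LAPIC/IOAPICs online, and register the
 * per-core LAPIC and kernel-message IRQ handlers. */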
 121void idt_init(void)
 122{
 123        /* This table is made in trapentry$BITS.S by each macro in that file.
  124         * It is laid out such that the ith entry is the ith traphandler's
 125         * (uintptr_t) trap addr, then (uint32_t) trap number. */
 126        struct trapinfo { uintptr_t trapaddr; uint32_t trapnumber; }
 127               __attribute__((packed));
 128        extern struct trapinfo trap_tbl[];
 129        extern struct trapinfo trap_tbl_end[];
 130        int i, trap_tbl_size = trap_tbl_end - trap_tbl;
 131        extern void ISR_default(void);
 132        extern void ISR_syscall(void);
 133
 134        /* set all to default, to catch everything */
 135        for (i = 0; i < 256; i++)
 136                SETGATE(idt[i], 0, GD_KT, &ISR_default, 0);
 137
 138        /* set all entries that have real trap handlers
 139         * we need to stop short of the last one, since the last is the default
 140         * handler with a fake interrupt number (500) that is out of bounds of
 141         * the idt[] */
 142        for (i = 0; i < trap_tbl_size - 1; i++)
 143                SETGATE(idt[trap_tbl[i].trapnumber], 0, GD_KT,
 144                        trap_tbl[i].trapaddr, 0);
 145        /* Sanity check */
 146        assert((uintptr_t)ISR_syscall ==
 147               ((uintptr_t)idt[T_SYSCALL].gd_off_63_32 << 32 |
 148                (uintptr_t)idt[T_SYSCALL].gd_off_31_16 << 16 |
 149                (uintptr_t)idt[T_SYSCALL].gd_off_15_0));
 150        /* turn on trap-based syscall handling and other user-accessible ints
 151         * DPL 3 means this can be triggered by the int instruction */
 152        idt[T_SYSCALL].gd_dpl = 3;
 153        idt[T_BRKPT].gd_dpl = 3;
 154        /* Send NMIs to their own stack (IST1 in every core's TSS) */
 155        idt[T_NMI].gd_ist = 1;
 156        /* Send double faults to their own stack (IST2 in every core's TSS) */
 157        idt[T_DBLFLT].gd_ist = 2;
 158
 159        /* The sooner we set this, the sooner we can use set/get_stack_top. */
 160        per_cpu_info[0].tss = &ts;
 161        per_cpu_info[0].gdt = gdt;
 162
 163        /* Set up our kernel stack when changing rings */
 164        /* Note: we want 16 byte aligned kernel stack frames (AMD 2:8.9.3) */
 165        x86_sysenter_init();
 166        /* We will set this properly once we have a kstack from the slab. */
 167        set_stack_top(0xdeadbeef);
 168
 169        /* Initialize the TSS field of the gdt.  The size of the TSS desc
 170         * differs between 64 and 32 bit, hence the pointer acrobatics */
 171        syssegdesc_t *ts_slot = (syssegdesc_t*)&gdt[GD_TSS >> 3];
 172        *ts_slot = (syssegdesc_t)SEG_SYS_SMALL(STS_T32A, (uintptr_t)&ts,
 173                                               sizeof(taskstate_t), 0);
 174
 175        /* Init the IDT PD.  Need to do this before ltr for some reason.  (Doing
  176         * this between ltr and lidt causes the machine to reboot...) */
 177        idt_pd.pd_lim = sizeof(idt) - 1;
 178        idt_pd.pd_base = (uintptr_t)idt;
 179
 180        ltr(GD_TSS);
 181
 182        asm volatile("lidt %0" : : "m"(idt_pd));
 183
 184        pic_remap();
 185        pic_mask_all();
 186
 187        int ncleft = MAX_NUM_CORES;
 188        int num_cores_mpacpi;
 189
 190        ncleft = mpsinit(ncleft);
 191        ncleft = mpacpi(ncleft);
 192        num_cores_mpacpi = MAX_NUM_CORES - ncleft;
 193        printk("MP and ACPI found %d cores\n", num_cores_mpacpi);
 194        if (num_cores != num_cores_mpacpi)
 195                warn("Topology (%d) and MP/ACPI (%d) differ on num_cores!",
 196                     num_cores, num_cores_mpacpi);
 197
 198        apiconline();
 199        ioapiconline();
 200
 201        /* the lapic IRQs need to be unmasked on a per-core basis */
 202        register_irq(IdtLAPIC_TIMER, timer_interrupt, NULL,
 203                     MKBUS(BusLAPIC, 0, 0, 0));
 204        register_irq(IdtLAPIC_ERROR, handle_lapic_error, NULL,
 205                     MKBUS(BusLAPIC, 0, 0, 0));
 206        register_irq(IdtLAPIC_PCINT, perfmon_interrupt, NULL,
 207                     MKBUS(BusLAPIC, 0, 0, 0));
 208        register_irq(I_KERNEL_MSG, handle_kmsg_ipi, NULL,
 209                     MKBUS(BusIPI, 0, 0, 0));
 210}
 211
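/* Dumps the x87 control/status words and MXCSR, then spells out which
 * unmasked floating-point exceptions are currently signaled. */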
 212static void print_fperr(struct hw_trapframe *hw_tf)
 213{
 214        uint16_t fpcw, fpsw;
 215        uint32_t mxcsr;
 216
 217        asm volatile ("fnstcw %0" : "=m"(fpcw));
 218        asm volatile ("fnstsw %0" : "=m"(fpsw));
 219        asm volatile ("stmxcsr %0" : "=m"(mxcsr));
 220        print_lock();
 221        print_trapframe(hw_tf);
 222        printk("Core %d: FP ERR, CW: 0x%04x, SW: 0x%04x, MXCSR 0x%08x\n",
 223               core_id(), fpcw, fpsw, mxcsr);
 224        printk("Core %d: The following faults are unmasked:\n", core_id());
 225        if (fpsw & ~fpcw & FP_EXCP_IE) {
 226                printk("\tInvalid Operation: ");
 227                if (fpsw & FP_SW_SF) {
 228                        if (fpsw & FP_SW_C1)
 229                                printk("Stack overflow\n");
 230                        else
 231                                printk("Stack underflow\n");
 232                } else {
 233                        printk("invalid arithmetic operand\n");
 234                }
 235        }
 236        if (fpsw & ~fpcw & FP_EXCP_DE)
 237                printk("\tDenormalized operand\n");
 238        if (fpsw & ~fpcw & FP_EXCP_ZE)
 239                printk("\tDivide by zero\n");
 240        if (fpsw & ~fpcw & FP_EXCP_OE)
 241                printk("\tNumeric Overflow\n");
 242        if (fpsw & ~fpcw & FP_EXCP_UE)
 243                printk("\tNumeric Underflow\n");
 244        if (fpsw & ~fpcw & FP_EXCP_PE)
 245                printk("\tInexact result (precision)\n");
 246        print_unlock();
 247}
 248
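/* User page faults can take the full VMR path with IRQs enabled.  On failure
 * with -EAGAIN, PF_VMR_BACKED is set in the error code so the reflected fault
 * tells userspace the VMR is file-backed and the access can be retried. */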
 249static bool __handler_user_page_fault(struct hw_trapframe *hw_tf,
 250                                      uintptr_t fault_va, int prot)
 251{
 252        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 253        int err;
 254
 255        assert(pcpui->owning_proc == pcpui->cur_proc);
 256        enable_irq();
 257        err = handle_page_fault(pcpui->owning_proc, fault_va, prot);
 258        disable_irq();
 259        if (err) {
 260                if (err == -EAGAIN)
 261                        hw_tf->tf_err |= PF_VMR_BACKED;
 262                return FALSE;
 263        }
 264        return TRUE;
 265}
 266
 267static bool __handler_kernel_page_fault(struct hw_trapframe *hw_tf,
 268                                        uintptr_t fault_va, int prot)
 269{
 270        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 271        int err;
 272
 273        /* The only thing an NMI handler that faults can do is a fixup */
 274        if (pcpui->nmi_status != NMI_NORMAL_OPN) {
 275                assert(in_kernel(hw_tf));
 276                return try_handle_exception_fixup(hw_tf);
 277        }
 278        /* In general, if there's no cur_proc, a KPF is a bug. */
 279        if (!pcpui->cur_proc) {
 280                /* This only runs from test_uaccess(), where it is expected to
 281                 * fail. */
 282                if (try_handle_exception_fixup(hw_tf))
 283                        return TRUE;
 284                panic_hwtf(hw_tf, "Proc-less Page Fault in the Kernel at %p!",
 285                           fault_va);
 286        }
 287        /* TODO - handle kernel page faults.  This is dangerous, since we might
 288         * be holding locks in the kernel and could deadlock when we HPF.  For
 289         * now, I'm just disabling the lock checker, since it'll flip out when
 290         * it sees there is a kernel trap.  Will need to think about this a bit,
 291         * esp when we properly handle bad addrs and whatnot. */
 292        pcpui->__lock_checking_enabled--;
 293        /* It is a bug for the kernel to access user memory while holding locks
 294         * that are used by handle_page_fault.  At a minimum, this includes
 295         * p->vmr_lock and memory allocation locks.
 296         *
 297         * In an effort to reduce the number of locks (both now and in the
  298         * future), the kernel will not attempt to handle faults on file-backed
 299         * VMRs.  We probably can turn that on in the future, but I'd rather
 300         * keep things safe for now.  (We'll probably need to change this when
  301         * we stop MAP_POPULATE | MAP_LOCKED'ing entire binaries).
 302         *
 303         * Note that we do not enable IRQs here, unlike in the user case.
 304         * Again, this is to limit the locks we could be grabbing. */
 305        err = handle_page_fault_nofile(pcpui->cur_proc, fault_va, prot);
 306        pcpui->__lock_checking_enabled++;
 307        if (err) {
 308                if (try_handle_exception_fixup(hw_tf))
 309                        return TRUE;
 310                /* Turn this on to help debug bad function pointers */
 311                printd("rsp %p\n\t 0(rsp): %p\n\t 8(rsp): %p\n\t 16(rsp): %p\n"
 312                       "\t24(rsp): %p\n", hw_tf->tf_rsp,
 313                       *(uintptr_t*)(hw_tf->tf_rsp +  0),
 314                       *(uintptr_t*)(hw_tf->tf_rsp +  8),
 315                       *(uintptr_t*)(hw_tf->tf_rsp + 16),
 316                       *(uintptr_t*)(hw_tf->tf_rsp + 24));
 317                panic_hwtf(hw_tf, "Proc-ful Page Fault in the Kernel at %p!",
 318                           fault_va);
 319                /* if we want to do something like kill a process or other code,
 320                 * be aware we are in a sort of irq-like context, meaning the
 321                 * main kernel code we 'interrupted' could be holding locks -
 322                 * even irqsave locks. */
 323        }
 324        return TRUE;
 325}
 326
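/* Common page-fault entry: pull the faulting address out of CR2, derive the
 * access type from the error code, and dispatch to the kernel or user
 * handler.  *aux carries the fault address for trap reflection. */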
 327static bool __handle_page_fault(struct hw_trapframe *hw_tf, unsigned long *aux)
 328{
 329        uintptr_t fault_va = rcr2();
 330        int prot = hw_tf->tf_err & PF_ERROR_WRITE ? PROT_WRITE : PROT_READ;
 331
 332        *aux = fault_va;
 333        if (in_kernel(hw_tf))
 334                return __handler_kernel_page_fault(hw_tf, fault_va, prot);
 335        else
 336                return __handler_user_page_fault(hw_tf, fault_va, prot);
 337}
 338
 339/* Actual body of work done when an NMI arrives */
 340static void do_nmi_work(struct hw_trapframe *hw_tf)
 341{
 342        assert(!irq_is_enabled());
 343        /* It's mostly harmless to snapshot the TF, and we can send a spurious
 344         * PCINT interrupt.  perfmon.c just uses the interrupt to tell it to
 345         * check its counters for overflow.  Note that the PCINT interrupt is
 346         * just a regular IRQ.  The backtrace was recorded during the NMI and
 347         * emitted during IRQ.
 348         *
 349         * That being said, it's OK if the monitor triggers debugging NMIs while
 350         * perf is running.  If perf triggers an NMI when the monitor wants to
 351         * print, the monitor will debug *that* NMI, and not the one that gets
 352         * sent moments later.  That's fine. */
 353        emit_monitor_backtrace(ROS_HW_CTX, hw_tf);
 354        perfmon_snapshot_hwtf(hw_tf);
 355        send_self_ipi(IdtLAPIC_PCINT);
 356}
 357
 358/* NMI HW_TF hacking involves four symbols:
 359 *
 360 * [__nmi_pop_ok_start, __nmi_pop_ok_end) mark the beginning and end of the
 361 * code for an nmi popping routine that will actually pop at the end.
 362 *
 363 * [__nmi_pop_fail_start, __nmi_pop_fail_end) mark the beginning and end of the
 364 * shadow code for an nmi popping routine that will fail at the end.
 365 *
 366 * If we see a TF in the OK section, we'll move it to the FAIL section.  If it's
 367 * already in the FAIL section, we'll report that as a success. */
 368extern char __nmi_pop_ok_start[], __nmi_pop_ok_end[];
 369extern char __nmi_pop_fail_start[], __nmi_pop_fail_end[];
 370
 371static bool nmi_hw_tf_needs_hacked(struct hw_trapframe *hw_tf)
 372{
 373        return ((uintptr_t)__nmi_pop_ok_start <= hw_tf->tf_rip) &&
 374               (hw_tf->tf_rip < (uintptr_t)__nmi_pop_ok_end);
 375}
 376
 377static bool nmi_hw_tf_was_hacked(struct hw_trapframe *hw_tf)
 378{
 379        return ((uintptr_t)__nmi_pop_fail_start <= hw_tf->tf_rip) &&
 380               (hw_tf->tf_rip < (uintptr_t)__nmi_pop_fail_end);
 381}
 382
 383/* Helper.  Hacks the TF if it was in the OK section so that it is at the same
 384 * spot in the FAIL section.  Returns TRUE if the TF is hacked, meaning the NMI
 385 * handler can just return. */
 386static bool nmi_check_and_hack_tf(struct hw_trapframe *hw_tf)
 387{
 388        uintptr_t offset;
 389
 390        if (!nmi_hw_tf_needs_hacked(hw_tf))
 391                return FALSE;
 392        if (nmi_hw_tf_was_hacked(hw_tf))
 393                return TRUE;
 394        offset = hw_tf->tf_rip - (uintptr_t)__nmi_pop_ok_start;
 395        hw_tf->tf_rip = (uintptr_t)__nmi_pop_fail_start + offset;
 396        return TRUE;
 397}
 398
 399/* Bottom half of the NMI handler.  This can be interrupted under some
 400 * circumstances by NMIs.  It exits by popping the hw_tf in assembly. */
 401void noinline __attribute__((noreturn))
 402__nmi_bottom_half(struct hw_trapframe *hw_tf)
 403{
 404        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 405
 406        while (1) {
 407                /* Signal that we're doing work.  A concurrent NMI will set this
 408                 * to NMI_HANDLE_ANOTHER if we should continue, which we'll
 409                 * catch later. */
 410                pcpui->nmi_status = NMI_IN_PROGRESS;
 411                do_nmi_work(hw_tf);
 412                /* We need to check nmi_status to see if it is
 413                 * NMI_HANDLE_ANOTHER (if so, run again), write NMI_NORMAL_OPN,
 414                 * leave this stack, and return to the original context.  We
 415                 * need to do that in such a manner that an NMI can come in at
 416                 * any time.  There are two concerns.
 417                 *
 418                 * First, we need to not "miss the signal" telling us to re-run
 419                 * the NMI handler.  To do that, we'll do the actual checking in
 420                 * asm.  Being in the asm code block is a signal to the real NMI
 421                 * handler that we need to abort and do_nmi_work() again.
 422                 *
 423                 * Second, we need to atomically leave the stack and return.  By
 424                 * being in asm, the NMI handler knows to just hack our PC to
 425                 * make us return, instead of starting up a fresh
 426                 * __nmi_bottom_half().
 427                 *
 428                 * The NMI handler works together with the following function
 429                 * such that if that race occurs while we're in the function,
 430                 * it'll fail and return.  Then we'll just do_nmi_work() and try
 431                 * again. */
 432                extern void nmi_try_to_pop(struct hw_trapframe *tf, int *status,
 433                                           int old_val, int new_val);
 434
 435                nmi_try_to_pop(hw_tf, &pcpui->nmi_status, NMI_IN_PROGRESS,
 436                               NMI_NORMAL_OPN);
 437                /* Either we returned on our own, since we lost a race with
 438                 * nmi_status and didn't write (status = ANOTHER), or we won the
 439                 * race, but an NMI handler set the status to ANOTHER and
 440                 * restarted us. */
 441                assert(pcpui->nmi_status != NMI_NORMAL_OPN);
 442        }
 443}
 444
  445/* Separate handler from traps, since there are too many rules for NMI ctx.
 446 *
 447 * The general rule is that any writes from NMI context must be very careful.
 448 * When talking about reads and writes to per-core data:
 449 * - If NMIs write things written by normal kernel contexts, including IRQs and
 450 *   traps with IRQs disabled, then you must use atomics on both sides.
 451 * - If NMIs write things read by normal contexts, then readers must be careful,
 452 *   since the data can change at will.
 453 * - If NMIs read things written by normal contexts, don't worry: you're running
 454 *   uninterrupted (given x86 NMI caveats).
 455 * - We cannot block.  The current kthread thinks its stacktop is different than
 456 *   the one we're on.  Just get in and get out.
 457 * - If we interrupted a user TF, then we don't need to worry any more than for
 458 *   normal traps/IRQs.
 459 * - However, we cannot call proc_restartcore.  That could trigger all sorts of
 460 *   things, like kthreads blocking.
 461 * - Parallel accesses (from other cores) are the same as always.  You just
 462 *   can't lock easily.
 463 *
 464 * Normally, once you're in NMI, other NMIs are blocked until we return.
 465 * However, if our NMI handler faults (PF, GPF, breakpoint) due to something
 466 * like tracing, the iret from that fault will cancel our NMI protections.  Thus
 467 * we need another layer of code to make sure we don't run the NMI handler
 468 * concurrently on the same core.  See https://lwn.net/Articles/484932/ for more
 469 * info.
 470 *
 471 * We'll get around the problem by running on yet another NMI stack.  All NMIs
 472 * come in on the nmi entry stack (tss->ist1).  While we're on that stack, we
 473 * will not be interrupted.  We jump to another stack to do_nmi_work.  That code
 474 * can be interrupted, but we are careful to only have one 'thread' running on
 475 * that stack at a time.  We do this by carefully hopping off the stack in
 476 * assembly, similar to popping user TFs. */
 477void handle_nmi(struct hw_trapframe *hw_tf)
 478{
 479        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 480        struct hw_trapframe *hw_tf_copy;
 481        uintptr_t worker_stacktop;
 482
 483        /* At this point, we're an NMI and other NMIs are blocked.  Only once we
 484         * hop to the bottom half could that be no longer true.  NMI with NMIs
 485         * fully blocked will run without interruption.  For that reason, we
 486         * don't have to be careful about any memory accesses or compiler
 487         * tricks. */
 488        if (pcpui->nmi_status == NMI_HANDLE_ANOTHER)
 489                return;
 490        if (pcpui->nmi_status == NMI_IN_PROGRESS) {
 491                /* Force the handler to run again.  We don't need to worry about
 492                 * concurrent access here.  We're running, they are not.  We
 493                 * cannot 'PAUSE' since NMIs are fully blocked.
 494                 *
 495                 * The asm routine, for its part, does a compare-and-swap, so if
 496                 * we happened to interrupt it before it wrote NMI_NORMAL_OPN,
 497                 * it'll notice, abort, and not write the status. */
 498                pcpui->nmi_status = NMI_HANDLE_ANOTHER;
 499                return;
 500        }
 501        assert(pcpui->nmi_status == NMI_NORMAL_OPN);
 502        pcpui->nmi_status = NMI_HANDLE_ANOTHER;
 503        /* We could be interrupting an NMI that is trying to pop back to a
 504         * normal context.  We can tell by looking at its PC.  If it is within
 505         * the popping routine, then we interrupted it at this bad time.  We'll
 506         * hack the TF such that it will return instead of succeeding. */
 507        if (nmi_check_and_hack_tf(hw_tf))
 508                return;
 509        /* OK, so we didn't interrupt an NMI that was trying to return.  So we
 510         * need to run the bottom half.  We're going to jump stacks, but we also
 511         * need to copy the hw_tf.  The existing one will be clobbered by any
 512         * interrupting NMIs.
 513         *
 514         * We also need to save some space on the top of that stack for a
 515         * pointer to pcpui and a scratch register, which nmi_try_to_pop() will
 516         * use.  The target stack will look like this:
 517         *
 518         *           +--------------------------+ Page boundary (e.g. 0x6000)
 519         *           |   scratch space (rsp)    |
 520         *           |       pcpui pointer      |
 521         *           |      tf_ss + padding     | HW_TF end
 522         *           |          tf_rsp          |
 523         *           |            .             |
 524         *           |            .             |
 525         * RSP ->    |         tf_gsbase        | HW_TF start, hw_tf_copy
 526         *           +--------------------------+
 527         *           |            .             |
 528         *           |            .             |
 529         *           |            .             |
 530         *           +--------------------------+ Page boundary (e.g. 0x5000)
 531         *
 532         * __nmi_bottom_half() just picks up using the stack below tf_gsbase.
 533         * It'll push as needed, growing down.  Basically we're just using the
 534         * space 'above' the stack as storage. */
 535        worker_stacktop = pcpui->nmi_worker_stacktop - 2 * sizeof(uintptr_t);
 536        *(uintptr_t*)worker_stacktop = (uintptr_t)pcpui;
 537        worker_stacktop = worker_stacktop - sizeof(struct hw_trapframe);
 538        hw_tf_copy = (struct hw_trapframe*)worker_stacktop;
 539        *hw_tf_copy = *hw_tf;
 540        /* Once we head to the bottom half, consider ourselves interruptible
 541         * (though it's not until the first time we do_nmi_work()).  We'll never
 542         * come back to this stack.  Doing this in asm so we can easily pass an
 543         * argument.  We don't need to call (vs jmp), but it helps keep the
 544         * stack aligned. */
 545        asm volatile("mov $0x0, %%rbp;"
 546                     "mov %0, %%rsp;"
 547                     "call __nmi_bottom_half;"
 548                     : : "r"(worker_stacktop), "D"(hw_tf_copy));
 549        assert(0);
 550}
 551
 552void handle_double_fault(struct hw_trapframe *hw_tf)
 553{
 554        panic_hwtf(hw_tf, "Double fault!  Check the kernel stack pointer; you likely ran off the end of the stack.");
 555}
 556
 557/* Certain traps want IRQs enabled, such as the syscall.  Others can't handle
 558 * it, like the page fault handler.  Turn them on on a case-by-case basis. */
 559static void trap_dispatch(struct hw_trapframe *hw_tf)
 560{
 561        struct per_cpu_info *pcpui;
 562        bool handled = FALSE;
 563        unsigned long aux = 0;
 565
 566        // Handle processor exceptions.
 567        switch(hw_tf->tf_trapno) {
 568        case T_BRKPT:
 569                if (!in_kernel(hw_tf))
 570                        backtrace_user_ctx(current, current_ctx);
 571                else
 572                        monitor(hw_tf);
 573                handled = TRUE;
 574                break;
 575        case T_ILLOP:
 576        {
 577                /* TODO: this can PF if there is a concurrent unmap/PM removal.
 578                 * */
 579                uintptr_t ip = get_hwtf_pc(hw_tf);
 580
 581                pcpui = &per_cpu_info[core_id()];
 582                pcpui->__lock_checking_enabled--; /* for print debugging */
 583                /* We will muck with the actual TF.  If we're dealing with
 584                 * userspace, we need to make sure we edit the actual TF that
 585                 * will get restarted (pcpui), and not the TF on the kstack
 586                 * (which aren't the same).  See set_current_ctx() for more
 587                 * info. */
 588                if (!in_kernel(hw_tf))
 589                        hw_tf = &pcpui->cur_ctx->tf.hw_tf;
 590                printd("bad opcode, eip: %p, next 3 bytes: %x %x %x\n", ip,
 591                       *(uint8_t*)(ip + 0),
 592                       *(uint8_t*)(ip + 1),
 593                       *(uint8_t*)(ip + 2));
 594                /* rdtscp: 0f 01 f9 */
  595        if (*(uint8_t*)(ip + 0) == 0x0f &&
  596            *(uint8_t*)(ip + 1) == 0x01 &&
  597            *(uint8_t*)(ip + 2) == 0xf9) {
 598                        x86_fake_rdtscp(hw_tf);
 599                        handled = TRUE;
 600                }
 601                pcpui->__lock_checking_enabled++; /* for print debugging */
 602                break;
 603        }
 604        case T_PGFLT:
 605                handled = __handle_page_fault(hw_tf, &aux);
 606                break;
 607        case T_GPFLT:
 608        case T_FPERR:
 609                handled = try_handle_exception_fixup(hw_tf);
 610                break;
 611        case T_SYSCALL:
 612                enable_irq();
 613                // check for userspace, for now
 614                assert(hw_tf->tf_cs != GD_KT);
 615                /* Set up and run the async calls */
 616                /* TODO: this is using the wrong reg1 for traps for 32 bit */
 617                prep_syscalls(current,
 618                              (struct syscall*)x86_get_systrap_arg0(hw_tf),
 619                              (unsigned int)x86_get_systrap_arg1(hw_tf));
 620                disable_irq();
 621                handled = TRUE;
 622                break;
 623        }
 624
 625        if (!handled) {
 626                if (in_kernel(hw_tf))
 627                        panic_hwtf(hw_tf,
 628                                   "Damn Damn!  Unhandled trap in the kernel!");
 629                reflect_unhandled_trap(hw_tf->tf_trapno, hw_tf->tf_err, aux);
 630        }
 631}
 632
 633/* Helper.  For now, this copies out the TF to pcpui.  Eventually, we should
 634 * consider doing this in trapentry.S
 635 *
 636 * TODO: consider having this return the tf used, so we can set tf in trap and
 637 * irq handlers to edit the TF that will get restarted.  Right now, the kernel
 638 * uses and restarts tf, but userspace restarts the old pcpui tf.  It is
 639 * tempting to do this, but note that tf stays on the stack of the kthread,
 640 * while pcpui->cur_ctx is for the core we trapped in on.  Meaning if we ever
 641 * block, suddenly cur_ctx is pointing to some old clobbered state that was
 642 * already returned to and can't be trusted.  Meanwhile tf can always be trusted
 643 * (like with an in_kernel() check).  The only types of traps from the user that
 644 * can be expected to have editable trapframes are ones that don't block. */
 645static void set_current_ctx_hw(struct per_cpu_info *pcpui,
 646                               struct hw_trapframe *hw_tf)
 647{
 648        assert(!irq_is_enabled());
 649        pcpui->actual_ctx.type = ROS_HW_CTX;
 650        pcpui->actual_ctx.tf.hw_tf = *hw_tf;
 651        pcpui->cur_ctx = &pcpui->actual_ctx;
 652}
 653
 654static void set_current_ctx_sw(struct per_cpu_info *pcpui,
 655                               struct sw_trapframe *sw_tf)
 656{
 657        assert(!irq_is_enabled());
 658        pcpui->actual_ctx.type = ROS_SW_CTX;
 659        pcpui->actual_ctx.tf.sw_tf = *sw_tf;
 660        pcpui->cur_ctx = &pcpui->actual_ctx;
 661}
 662
 663static void set_current_ctx_vm(struct per_cpu_info *pcpui,
 664                               struct vm_trapframe *vm_tf)
 665{
 666        assert(!irq_is_enabled());
 667        pcpui->actual_ctx.type = ROS_VM_CTX;
 668        pcpui->actual_ctx.tf.vm_tf = *vm_tf;
 669        pcpui->cur_ctx = &pcpui->actual_ctx;
 670}
 671
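/* C entry point for traps from the low-level asm (trapentry$BITS.S).  Saves
 * the context if we trapped from userspace, dispatches, and either returns
 * to the interrupted kernel code or restarts the current process. */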
 672void trap(struct hw_trapframe *hw_tf)
 673{
 674        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 675
 676        /* Copy out the TF for now */
 677        if (!in_kernel(hw_tf)) {
 678                set_current_ctx_hw(pcpui, hw_tf);
 679                /* ignoring state for nested kernel traps.  should be rare. */
 680                __set_cpu_state(pcpui, CPU_STATE_KERNEL);
 681        } else {
 682                inc_ktrap_depth(pcpui);
 683        }
 684        printd("Incoming TRAP %d on core %d, TF at %p\n", hw_tf->tf_trapno,
 685               core_id(), hw_tf);
 686        if ((hw_tf->tf_cs & ~3) != GD_UT && (hw_tf->tf_cs & ~3) != GD_KT)
 687                panic_hwtf(hw_tf, "Trapframe with invalid CS!");
 688        trap_dispatch(hw_tf);
 689        /* Return to the current process, which should be runnable.  If we're
 690         * the kernel, we should just return naturally.  Note that current and
 691         * tf need to still be okay (might not be after blocking) */
 692        if (in_kernel(hw_tf)) {
 693                dec_ktrap_depth(pcpui);
 694                return;
 695        }
 696        proc_restartcore();
 697        assert(0);
 698}
 699
 700static bool vector_is_irq(int apic_vec)
 701{
 702        /* arguably, we could limit them to MaxIdtIOAPIC */
 703        return (IdtPIC <= apic_vec) && (apic_vec <= IdtMAX);
 704}
 705
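/* Runs the registered handler chain for this vector, with IRQs enabled while
 * the handlers run, then EOIs.  Called from handle_irq() and from the VMX
 * external-interrupt exit path. */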
 706static void irq_dispatch(struct hw_trapframe *hw_tf)
 707{
 708        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 709        struct irq_handler *irq_h;
 710
 711        if (!in_irq_ctx(pcpui))
 712                __set_cpu_state(pcpui, CPU_STATE_IRQ);
 713        inc_irq_depth(pcpui);
 714        //if (core_id())
 715        if (hw_tf->tf_trapno != IdtLAPIC_TIMER) /* timer irq */
 716        if (hw_tf->tf_trapno != I_KERNEL_MSG)
 717        if (hw_tf->tf_trapno != 65)     /* qemu serial tends to get this one */
 718                printd("Incoming IRQ, ISR: %d on core %d\n", hw_tf->tf_trapno,
 719                       core_id());
 720        /* TODO: RCU read lock */
 721        irq_h = irq_handlers[hw_tf->tf_trapno];
 722        if (!irq_h) {
 723                warn_once("Received IRQ %d, had no handler registered!",
 724                          hw_tf->tf_trapno);
 725                /* If we don't have an IRQ handler, we don't know how to EOI.
 726                 * Odds are, it's a LAPIC IRQ, such as I_TESTING */
 727                if (!lapic_check_spurious(hw_tf->tf_trapno))
 728                        lapic_send_eoi(hw_tf->tf_trapno);
 729                goto out_no_eoi;
 730        }
 731        if (irq_h->check_spurious(hw_tf->tf_trapno))
 732                goto out_no_eoi;
 733        /* Can now be interrupted/nested by higher priority IRQs, but not by our
 734         * current IRQ vector, til we EOI. */
 735        enable_irq();
 736        while (irq_h) {
 737                irq_h->isr(hw_tf, irq_h->data);
 738                irq_h = irq_h->next;
 739        }
 740        // if we're a general purpose IPI function call, down the cpu_list
 741        extern handler_wrapper_t handler_wrappers[NUM_HANDLER_WRAPPERS];
 742        if ((I_SMP_CALL0 <= hw_tf->tf_trapno) &&
 743            (hw_tf->tf_trapno <= I_SMP_CALL_LAST))
 744                down_checklist(handler_wrappers[hw_tf->tf_trapno & 0x0f]
 745                               .cpu_list);
 746        disable_irq();
 747        /* Keep in sync with ipi_is_pending */
 748        irq_handlers[hw_tf->tf_trapno]->eoi(hw_tf->tf_trapno);
 749        /* Fall-through */
 750out_no_eoi:
 751        dec_irq_depth(pcpui);
 752        if (!in_irq_ctx(pcpui))
 753                __set_cpu_state(pcpui, CPU_STATE_KERNEL);
 754}
 755
 756/* Note IRQs are disabled unless explicitly turned on.
 757 *
 758 * In general, we should only get trapno's >= PIC1_OFFSET (32).  Anything else
 759 * should be a trap.  Even if we don't use the PIC, that should be the standard.
 760 * It is possible to get a spurious LAPIC IRQ with vector 15 (or similar), but
 761 * the spurious check should catch that.
 762 *
 763 * Note that from hardware's perspective (PIC, etc), IRQs start from 0, but they
 764 * are all mapped up at PIC1_OFFSET for the cpu / irq_handler. */
 765void handle_irq(struct hw_trapframe *hw_tf)
 766{
 767        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 768
 769        /* Copy out the TF for now */
 770        if (!in_kernel(hw_tf))
 771                set_current_ctx_hw(pcpui, hw_tf);
 772        irq_dispatch(hw_tf);
 773        /* Return to the current process, which should be runnable.  If we're
 774         * the kernel, we should just return naturally.  Note that current and
 775         * tf need to still be okay (might not be after blocking) */
 776        if (in_kernel(hw_tf))
 777                return;
 778        proc_restartcore();
 779        assert(0);
 780}
 781
 782/* The irq field may be ignored based on the type of Bus. */
 783int register_irq(int irq, isr_t handler, void *irq_arg, uint32_t tbdf)
 784{
 785        struct irq_handler *irq_h;
 786        int vector;
 787
 788        irq_h = kzmalloc(sizeof(struct irq_handler), 0);
 789        assert(irq_h);
 790        irq_h->dev_irq = irq;
 791        irq_h->tbdf = tbdf;
 792        vector = bus_irq_setup(irq_h);
 793        if (vector == -1) {
 794                kfree(irq_h);
 795                return -1;
 796        }
 797        printk("IRQ %d, vector %d (0x%x), type %s\n", irq, vector, vector,
 798               irq_h->type);
 799        assert(irq_h->check_spurious && irq_h->eoi);
 800        irq_h->isr = handler;
 801        irq_h->data = irq_arg;
 802        irq_h->apic_vector = vector;
 803        /* RCU write lock */
 804        spin_lock_irqsave(&irq_handler_wlock);
 805        irq_h->next = irq_handlers[vector];
 806        wmb();  /* make sure irq_h is done before publishing to readers */
 807        irq_handlers[vector] = irq_h;
 808        spin_unlock_irqsave(&irq_handler_wlock);
 809        /* Most IRQs other than the BusIPI should need their irq unmasked.
 810         * Might need to pass the irq_h, in case unmask needs more info.
 811         * The lapic IRQs need to be unmasked on a per-core basis */
 812        if (irq_h->unmask && strcmp(irq_h->type, "lapic"))
 813                irq_h->unmask(irq_h, vector);
 814        return 0;
 815}
 816
 817/* These routing functions only allow the routing of an irq to a single core.
 818 * If we want to route to multiple cores, we'll probably need to set up logical
 819 * groups or something and take some additional parameters. */
 820static int route_irq_h(struct irq_handler *irq_h, int os_coreid)
 821{
 822        int hw_coreid;
 823        if (!irq_h->route_irq) {
 824                printk("[kernel] apic_vec %d, type %s cannot be routed\n",
 825                       irq_h->apic_vector, irq_h->type);
 826                return -1;
 827        }
 828        if (os_coreid >= MAX_NUM_CORES) {
 829                printk("[kernel] os_coreid %d out of range!\n", os_coreid);
 830                return -1;
 831        }
 832        hw_coreid = get_hw_coreid(os_coreid);
 833        if (hw_coreid == -1) {
 834                printk("[kernel] os_coreid %d not a valid hw core!\n",
 835                       os_coreid);
 836                return -1;
 837        }
 838        irq_h->route_irq(irq_h, irq_h->apic_vector, hw_coreid);
 839        return 0;
 840}
 841
 842/* Routes all irqs for a given apic_vector to os_coreid.  Returns 0 if all of
 843 * them succeeded.  -1 if there were none or if any of them failed.  We don't
 844 * share IRQs often (if ever anymore), so this shouldn't be an issue. */
 845int route_irqs(int apic_vec, int os_coreid)
 846{
 847        struct irq_handler *irq_h;
 848        int ret = -1;
 849
 850        if (!vector_is_irq(apic_vec)) {
 851                printk("[kernel] vector %d is not an IRQ vector!\n", apic_vec);
 852                return -1;
 853        }
 854        irq_h = irq_handlers[apic_vec];
 855        while (irq_h) {
 856                assert(irq_h->apic_vector == apic_vec);
 857                ret = route_irq_h(irq_h, os_coreid);
 858                irq_h = irq_h->next;
 859        }
 860        return ret;
 861}
 862
 863/* It's a moderate pain in the ass to put these in bit-specific files (header
 864 * hell with the set_current_ helpers) */
 865void sysenter_callwrapper(struct syscall *sysc, unsigned long count,
 866                          struct sw_trapframe *sw_tf)
 867{
 868        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 869        set_current_ctx_sw(pcpui, sw_tf);
 870        __set_cpu_state(pcpui, CPU_STATE_KERNEL);
 871        /* Once we've set_current_ctx, we can enable interrupts.  This used to
 872         * be mandatory (we had immediate KMSGs that would muck with cur_ctx).
 873         * Now it should only help for sanity/debugging. */
 874        enable_irq();
 875        /* Set up and run the async calls.  This may block, and we could migrate
 876         * to another core.  If you use pcpui again, you need to reread it. */
 877        prep_syscalls(current, sysc, count);
 878        disable_irq();
 879        proc_restartcore();
 880}
 881
 882/* Declared in x86/arch.h */
 883void send_ipi(uint32_t os_coreid, uint8_t vector)
 884{
 885        int hw_coreid = get_hw_coreid(os_coreid);
 886
 887        if (hw_coreid == -1) {
 888                panic("Unmapped OS coreid (OS %d)!\n", os_coreid);
 889                return;
 890        }
 891        assert(vector != T_NMI);
 892        __send_ipi(hw_coreid, vector);
 893}
 894
 895/****************** VM exit handling ******************/
 896
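/* Emulates CPUID for the guest: returns the host's answer, minus features we
 * do not virtualize (monitor/mwait, VMX, perf), plus the hypervisor
 * signatures and the guest pcore id in the APIC ID field.  Advances RIP past
 * the 2-byte CPUID instruction. */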
 897static bool handle_vmexit_cpuid(struct vm_trapframe *tf)
 898{
 899        uint32_t eax, ebx, ecx, edx;
 900        const uint32_t *sigptr;
 901        const char kvm_sig[] = "KVMKVMKVM\0\0\0";
 902        const char akaros_sig[] = "AKAROSINSIDE";
 903
 904        if (vmm_user_handles_cpuid(tf->tf_rax, tf->tf_rcx))
 905                return false;
 906
 907        cpuid(tf->tf_rax, tf->tf_rcx, &eax, &ebx, &ecx, &edx);
 908        switch (tf->tf_rax) {
 909        /* TODO: If we can move this to userspace, vmrunkernel can make GPCS on
 910         * the fly. */
 911        case 0x01:
 912                /* Set the hypervisor bit to let the guest know it is
 913                 * virtualized */
 914                ecx |= 1 << 31;
 915                /* Unset the monitor capability bit so that the guest does not
 916                 * try to use monitor/mwait. */
 917                ecx &= ~(1 << 3);
 918                /* Unset the vmx capability bit so that the guest does not try
 919                 * to turn it on. */
 920                ecx &= ~(1 << 5);
 921                /* Unset the perf capability bit so that the guest does not try
 922                 * to turn it on. */
 923                ecx &= ~(1 << 15);
 924
 925                /* Set the guest pcore id into the apic ID field in CPUID. */
 926                ebx &= 0x0000ffff;
 927                ebx |= (current->vmm.nr_guest_pcores & 0xff) << 16;
 928                ebx |= (tf->tf_guest_pcoreid & 0xff) << 24;
 929                break;
 930        case 0x07:
 931                /* Do not advertise TSC_ADJUST */
 932                ebx &= ~(1 << 1);
 933                break;
 934        case 0x0A:
 935                eax = 0;
 936                ebx = 0;
 937                ecx = 0;
 938                edx = 0;
 939                break;
 940        /* Signal the use of KVM. */
 941        case 0x40000000:
 942                sigptr = (const uint32_t *)kvm_sig;
 943                eax = 0;
 944                ebx = sigptr[0];
 945                ecx = sigptr[1];
 946                edx = sigptr[2];
 947                break;
 948        /* Hypervisor Features. */
 949        case 0x40000003:
 950                /* Unset the monitor capability bit so that the guest does not
 951                 * try to use monitor/mwait. */
 952                edx &= ~(1 << 0);
 953                break;
 954        /* Signal the use of AKAROS. */
 955        case 0x40000100:
 956                sigptr = (const uint32_t *)akaros_sig;
 957                eax = 0;
 958                ebx = sigptr[0];
 959                ecx = sigptr[1];
 960                edx = sigptr[2];
 961                break;
 962        /* Hypervisor Features. */
 963        case 0x40000103:
 964                /* Unset the monitor capability bit so that the guest does not
 965                 * try to use monitor/mwait. */
 966                edx &= ~(1 << 0);
 967                break;
 968        default:
 969                break;
 970        }
 971        tf->tf_rax = eax;
 972        tf->tf_rbx = ebx;
 973        tf->tf_rcx = ecx;
 974        tf->tf_rdx = edx;
 975        tf->tf_rip += 2;
 976        return TRUE;
 977}
 978
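/* EPT violations are handled like regular page faults: translate the exit
 * qualification into PROT_* bits and fault in the guest-physical address. */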
 979static bool handle_vmexit_ept_fault(struct vm_trapframe *tf)
 980{
 981        int prot = 0;
 982        int ret;
 983
 984        prot |= tf->tf_exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
 985        prot |= tf->tf_exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
 986        prot |= tf->tf_exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
 987        ret = handle_page_fault(current, tf->tf_guest_pa, prot);
 988        if (ret == 0)
 989                return TRUE;
 990
 991        //Mirror behavior in uthreads, tell userspace to try again.
 992        if (ret == -EAGAIN)
 993                tf->tf_flags |= VMCTX_FL_EPT_VMR_BACKED;
 994
 995        return FALSE;
 996}
 997
 998/* Regarding NMI blocking,
 999 *      "An NMI causes subsequent NMIs to be blocked, but only after the VM exit
1000 *      completes." (SDM)
1001 *
1002 * Like handle_nmi(), this function and anything it calls directly cannot fault,
1003 * or else we lose our NMI protections. */
1004static bool handle_vmexit_nmi(struct vm_trapframe *tf)
1005{
1006        /* Sanity checks, make sure we really got an NMI.  Feel free to remove.
1007         */
1008        assert((tf->tf_intrinfo2 & INTR_INFO_INTR_TYPE_MASK)
1009               == INTR_TYPE_NMI_INTR);
1010        assert((tf->tf_intrinfo2 & INTR_INFO_VECTOR_MASK) == T_NMI);
1011        assert(!irq_is_enabled());
1012
1013        emit_monitor_backtrace(ROS_VM_CTX, tf);
1014        perfmon_snapshot_vmtf(tf);
1015        send_self_ipi(IdtLAPIC_PCINT);
1016        return TRUE;
1017}
1018
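/* RDMSR/WRMSR exits are passed to the MSR emulation layer; on success we skip
 * the 2-byte instruction. */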
1019bool handle_vmexit_msr(struct vm_trapframe *tf)
1020{
1021        bool ret;
1022
1023        ret = vmm_emulate_msr(tf, (tf->tf_exit_reason == EXIT_REASON_MSR_READ
1024                                   ? VMM_MSR_EMU_READ : VMM_MSR_EMU_WRITE));
1025        if (ret)
1026                tf->tf_rip += 2;
1027        return ret;
1028}
1029
1030bool handle_vmexit_extirq(struct vm_trapframe *tf)
1031{
1032        struct hw_trapframe hw_tf;
1033        uint32_t trap_nr;
1034
1035        /* For now, we just handle external IRQs.  I think guest traps should go
1036         * to the guest, based on our vmctls */
1037        assert((tf->tf_intrinfo2 & INTR_INFO_INTR_TYPE_MASK)
1038               == INTR_TYPE_EXT_INTR);
1039        /* The POKE_HANDLER doesn't run for an ExtINT that triggers a vmexit */
1040        trap_nr = tf->tf_intrinfo2 & INTR_INFO_VECTOR_MASK;
1041        if (trap_nr == I_POKE_CORE) {
1042                lapic_send_eoi(trap_nr);
1043                return TRUE;
1044        }
1045        /* TODO: Our IRQ handlers all expect TFs.  Let's fake one.  A bunch of
1046         * handlers (e.g. backtrace/perf) will probably be unhappy about a user
1047         * TF that is really a VM, so this all needs work. */
1048        hw_tf.tf_gsbase = 0;
1049        hw_tf.tf_fsbase = 0;
1050        hw_tf.tf_rax = tf->tf_rax;
1051        hw_tf.tf_rbx = tf->tf_rbx;
1052        hw_tf.tf_rcx = tf->tf_rcx;
1053        hw_tf.tf_rdx = tf->tf_rdx;
1054        hw_tf.tf_rbp = tf->tf_rbp;
1055        hw_tf.tf_rsi = tf->tf_rsi;
1056        hw_tf.tf_rdi = tf->tf_rdi;
1057        hw_tf.tf_r8 = tf->tf_r8;
1058        hw_tf.tf_r9 = tf->tf_r9;
1059        hw_tf.tf_r10 = tf->tf_r10;
1060        hw_tf.tf_r11 = tf->tf_r11;
1061        hw_tf.tf_r12 = tf->tf_r12;
1062        hw_tf.tf_r13 = tf->tf_r13;
1063        hw_tf.tf_r14 = tf->tf_r14;
1064        hw_tf.tf_r15 = tf->tf_r15;
1065        hw_tf.tf_trapno = trap_nr;
1066        hw_tf.tf_err = 0;
1067        hw_tf.tf_rip = tf->tf_rip;
1068        hw_tf.tf_cs = GD_UT;    /* faking a user TF, even though it's a VM */
1069        hw_tf.tf_rflags = tf->tf_rflags;
1070        hw_tf.tf_rsp = tf->tf_rsp;
1071        hw_tf.tf_ss = GD_UD;
1072
1073        irq_dispatch(&hw_tf);
1074        /* Consider returning whether or not there was a handler registered */
1075        return TRUE;
1076}
1077
1078static bool handle_vmexit_xsetbv(struct vm_trapframe *tf)
1079{
1080        // The VM's requested-feature bitmap is represented by edx:eax
1081        uint64_t vm_rfbm = (tf->tf_rdx << 32) | tf->tf_rax;
1082
1083        // If the VM tries to set xcr0 to a superset
1084        // of Akaros's default value, kill the VM.
1085
1086        // Bit in vm_rfbm and x86_default_xcr0: Ok. Requested and allowed.
1087        // Bit in vm_rfbm but not x86_default_xcr0: Bad! Requested, not allowed.
1088        // Bit not in vm_rfbm but in x86_default_xcr0: Ok. Not requested.
1089
1090        // vm_rfbm & (~x86_default_xcr0) is nonzero if any bits
1091        // are set in vm_rfbm but not x86_default_xcr0
1092
1093        if (vm_rfbm & (~__proc_global_info.x86_default_xcr0))
1094                return FALSE;
1095
1096
1097        // If attempting to use vm_rfbm for xsetbv
1098        // causes a fault, we reflect to the VMM.
1099        if (safe_lxcr0(vm_rfbm))
1100                return FALSE;
1101
1102
1103        // If no fault, advance the instruction pointer
1104        // and return TRUE to make the VM resume.
1105        tf->tf_rip += 3; // XSETBV is a 3-byte instruction
1106        return TRUE;
1107}
1108
1109static void vmexit_dispatch(struct vm_trapframe *tf)
1110{
1111        bool handled = FALSE;
1112
1113        /* Do not block in any of these functions.
1114         *
1115         * If we block, we'll probably need to finalize the context.  If we do,
1116         * then there's a chance the guest pcore can start somewhere else, and
1117         * then we can't get the GPC loaded again.  Plus, they could be running
 1118         * a GPC with an unresolved vmexit.  It's just a mess.
1119         *
1120         * If we want to enable IRQs, we can do so on a case-by-case basis.
1121         * Don't do it for external IRQs - the irq_dispatch code will handle it.
1122         * */
1123        switch (tf->tf_exit_reason) {
1124        case EXIT_REASON_VMCALL:
1125                if (current->vmm.flags & VMM_CTL_FL_KERN_PRINTC &&
1126                    tf->tf_rax == AKAROS_VMCALL_PRINTC) {
1127                        printk("%c", tf->tf_rdi);
1128                        tf->tf_rip += 3;
1129                        handled = TRUE;
1130                }
1131                break;
1132        case EXIT_REASON_CPUID:
1133                handled = handle_vmexit_cpuid(tf);
1134                break;
1135        case EXIT_REASON_EPT_VIOLATION:
1136                handled = handle_vmexit_ept_fault(tf);
1137                break;
1138        case EXIT_REASON_EXCEPTION_NMI:
1139                handled = handle_vmexit_nmi(tf);
1140                break;
1141        case EXIT_REASON_MSR_READ:
1142        case EXIT_REASON_MSR_WRITE:
1143                handled = handle_vmexit_msr(tf);
1144                break;
1145        case EXIT_REASON_EXTERNAL_INTERRUPT:
1146                handled = handle_vmexit_extirq(tf);
1147                break;
1148        case EXIT_REASON_XSETBV:
1149                handled = handle_vmexit_xsetbv(tf);
1150                break;
1151        default:
1152                printd("Unhandled vmexit: reason 0x%x, exit qual 0x%x\n",
1153                       tf->tf_exit_reason, tf->tf_exit_qual);
1154        }
1155        if (!handled) {
1156                tf->tf_flags |= VMCTX_FL_HAS_FAULT;
1157                if (reflect_current_context()) {
1158                        /* VM contexts shouldn't be in vcore context, so this
1159                         * should be pretty rare (unlike SCPs or VC ctx page
1160                         * faults). */
1161                        printk("[kernel] Unable to reflect VM Exit\n");
1162                        print_vmtrapframe(tf);
1163                        proc_destroy(current);
1164                }
1165        }
1166}
1167
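/* Top-level VM exit handler: snapshots the exit state from the VMCS into the
 * (partial) VM trapframe, installs it as the current context, dispatches on
 * the exit reason, and restarts the core. */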
1168void handle_vmexit(struct vm_trapframe *tf)
1169{
1170        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
1171
1172        tf->tf_rip = vmcs_read(GUEST_RIP);
1173        tf->tf_rflags = vmcs_read(GUEST_RFLAGS);
1174        tf->tf_rsp = vmcs_read(GUEST_RSP);
1175        tf->tf_cr2 = rcr2();
1176        tf->tf_cr3 = vmcs_read(GUEST_CR3);
1177        tf->tf_guest_pcoreid = pcpui->guest_pcoreid;
1178        tf->tf_flags |= VMCTX_FL_PARTIAL;
1179        tf->tf_guest_intr_status = vmcs_read(GUEST_INTR_STATUS);
1180        tf->tf_exit_reason = vmcs_read(VM_EXIT_REASON);
1181        tf->tf_exit_qual = vmcs_read(EXIT_QUALIFICATION);
1182        tf->tf_intrinfo1 = vmcs_read(GUEST_INTERRUPTIBILITY_INFO);
1183        tf->tf_intrinfo2 = vmcs_read(VM_EXIT_INTR_INFO);
1184        tf->tf_guest_va = vmcs_read(GUEST_LINEAR_ADDRESS);
1185        tf->tf_guest_pa = vmcs_read(GUEST_PHYSICAL_ADDRESS);
1186
1187        set_current_ctx_vm(pcpui, tf);
1188        __set_cpu_state(pcpui, CPU_STATE_KERNEL);
1189        tf = &pcpui->cur_ctx->tf.vm_tf;
1190        vmexit_dispatch(tf);
1191        /* We're either restarting a partial VM ctx (vmcs was launched, loaded
1192         * on the core, etc) or a SW vc ctx for the reflected trap.  Or the proc
1193         * is dying and we'll handle a __death KMSG shortly. */
1194        proc_restartcore();
1195}
1196
1197/* Partial contexts for HW and SW TFs have the user's gs in MSR_KERNEL_GS_BASE.
1198 * The kernel's gs is loaded into gs.  We need to put the kernel's gs into
1199 * KERNEL_GS_BASE so the core is ready to run another full context, save the
1200 * user's {GS,FS}_BASE into their TF so it can run on another core, and keep GS
1201 * loaded with the current GS (the kernel's). */
1202static void x86_finalize_hwtf(struct hw_trapframe *tf)
1203{
1204        tf->tf_gsbase = read_kern_gsbase();
1205        write_kern_gsbase(read_gsbase());
1206        tf->tf_fsbase = read_fsbase();
1207        x86_hwtf_clear_partial(tf);
1208}
1209
1210static void x86_finalize_swtf(struct sw_trapframe *tf)
1211{
1212        tf->tf_gsbase = read_kern_gsbase();
1213        write_kern_gsbase(read_gsbase());
1214        tf->tf_fsbase = read_fsbase();
1215        x86_swtf_clear_partial(tf);
1216}
1217
1218static void x86_finalize_vmtf(struct vm_trapframe *tf)
1219{
1220        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
1221
1222        x86_vmtf_clear_partial(tf);
1223        unload_guest_pcore(pcpui->owning_proc, pcpui->guest_pcoreid);
1224}
1225
1226/* Makes sure that the user context is fully saved into ctx and not split across
1227 * the struct and HW, meaning it is not a "partial context".
1228 *
1229 * Be careful to zero out any part of the ctx struct not in use, to avoid
1230 * leaking information from other processes. */
1231void arch_finalize_ctx(struct user_context *ctx)
1232{
1233        if (!arch_ctx_is_partial(ctx))
1234                return;
1235        switch (ctx->type) {
1236        case ROS_HW_CTX:
1237                x86_finalize_hwtf(&ctx->tf.hw_tf);
1238                memset((uint8_t*)&ctx->tf + sizeof(struct hw_trapframe), 0,
1239                           sizeof(ctx->tf) - sizeof(struct hw_trapframe));
1240                break;
1241        case ROS_SW_CTX:
1242                x86_finalize_swtf(&ctx->tf.sw_tf);
1243                memset((uint8_t*)&ctx->tf + sizeof(struct sw_trapframe), 0,
1244                           sizeof(ctx->tf) - sizeof(struct sw_trapframe));
1245                break;
1246        case ROS_VM_CTX:
1247                x86_finalize_vmtf(&ctx->tf.vm_tf);
1248                memset((uint8_t*)&ctx->tf + sizeof(struct vm_trapframe), 0,
1249                           sizeof(ctx->tf) - sizeof(struct vm_trapframe));
1250                break;
1251        }
1252}
1253