akaros/kern/arch/x86/trap.c
   1#include <arch/mmu.h>
   2#include <arch/x86.h>
   3#include <arch/arch.h>
   4#include <arch/console.h>
   5#include <arch/apic.h>
   6#include <arch/perfmon.h>
   7#include <ros/common.h>
   8#include <smp.h>
   9#include <assert.h>
  10#include <pmap.h>
  11#include <trap.h>
  12#include <monitor.h>
  13#include <process.h>
  14#include <mm.h>
  15#include <stdio.h>
  16#include <slab.h>
  17#include <syscall.h>
  18#include <kdebug.h>
  19#include <kmalloc.h>
  20#include <ex_table.h>
  21#include <arch/mptables.h>
  22#include <ros/procinfo.h>
  23
  24enum {
  25        NMI_NORMAL_OPN = 0,
  26        NMI_IN_PROGRESS,
  27        NMI_HANDLE_ANOTHER,
  28};
  29
  30taskstate_t ts;
  31
  32/* Interrupt descriptor table.  64-bit needs 16-byte alignment (I think). */
  33gatedesc_t __attribute__((aligned (16))) idt[256] = { { 0 } };
  34pseudodesc_t idt_pd;
  35
  36/* interrupt handler table, each element is a linked list of handlers for a
  37 * given IRQ.  Modification requires holding the lock. */
  38struct irq_handler *irq_handlers[NUM_IRQS];
  39spinlock_t irq_handler_wlock = SPINLOCK_INITIALIZER_IRQSAVE;
  40
  41static struct arena *irq_vectors;
  42
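/* Kernel-mode faults on instructions listed in the exception table (see
 * ex_table.h, e.g. from user-memory accessors) are recoverable: rewrite RIP
 * to the registered fixup address and resume there.  Returns false for user
 * faults or for kernel faults with no fixup entry. */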
  43static bool try_handle_exception_fixup(struct hw_trapframe *hw_tf)
  44{
  45        if (in_kernel(hw_tf)) {
  46                uintptr_t fixup_ip = get_fixup_ip(hw_tf->tf_rip);
  47
  48                if (fixup_ip != 0) {
  49                        hw_tf->tf_rip = fixup_ip;
  50                        return true;
  51                }
  52        }
  53
  54        return false;
  55}
  56
  57const char *x86_trapname(int trapno)
  58{
  59        static const char *const excnames[] = {
  60                "Divide error",
  61                "Debug",
  62                "Non-Maskable Interrupt",
  63                "Breakpoint",
  64                "Overflow",
  65                "BOUND Range Exceeded",
  66                "Invalid Opcode",
  67                "Device Not Available",
  68                "Double Fault",
  69                "Coprocessor Segment Overrun",
  70                "Invalid TSS",
  71                "Segment Not Present",
  72                "Stack Fault",
  73                "General Protection",
  74                "Page Fault",
  75                "(unknown trap)",
  76                "x87 FPU Floating-Point Error",
  77                "Alignment Check",
  78                "Machine-Check",
  79                "SIMD Floating-Point Exception"
  80        };
  81
  82        if (trapno < sizeof(excnames)/sizeof(excnames[0]))
  83                return excnames[trapno];
  84        if (trapno == T_SYSCALL)
  85                return "System call";
  86        return "(unknown trap)";
  87}
  88
  89/* Set stacktop for the current core to be the stack the kernel will start on
  90 * when trapping/interrupting from userspace. */
  91void set_stack_top(uintptr_t stacktop)
  92{
  93        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
  94
  95        /* No need to reload the task register, this takes effect immediately */
  96        x86_set_stacktop_tss(pcpui->tss, stacktop);
  97        /* Also need to make sure sysenters come in correctly */
  98        x86_set_sysenter_stacktop(stacktop);
  99}
 100
 101/* Note the check implies we are only on a one-page stack (or the first page) */
 102uintptr_t get_stack_top(void)
 103{
 104        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 105        uintptr_t stacktop;
 106
 107        stacktop = x86_get_stacktop_tss(pcpui->tss);
 108        if (stacktop != ROUNDUP(read_sp(), PGSIZE))
 109                panic("Bad stacktop: %p esp one is %p\n", stacktop,
 110                      ROUNDUP(read_sp(), PGSIZE));
 111        return stacktop;
 112}
 113
 114/* Sends a non-maskable interrupt; the handler will print a trapframe. */
 115void send_nmi(uint32_t os_coreid)
 116{
 117        /* NMI / IPI for x86 are limited to 8 bits */
 118        uint8_t hw_core = (uint8_t)get_hw_coreid(os_coreid);
 119
 120        __send_nmi(hw_core);
 121}
 122
 123void idt_init(void)
 124{
 125        /* This table is made in trapentry$BITS.S by each macro in that file.
 126         * It is laid out such that the ith entry is the ith traphandler's
 127         * (uintptr_t) trap addr, then (uint32_t) trap number. */
 128        struct trapinfo { uintptr_t trapaddr; uint32_t trapnumber; }
 129               __attribute__((packed));
 130        extern struct trapinfo trap_tbl[];
 131        extern struct trapinfo trap_tbl_end[];
 132        int i, trap_tbl_size = trap_tbl_end - trap_tbl;
 133        extern void ISR_default(void);
 134        extern void ISR_syscall(void);
 135
 136        /* set all to default, to catch everything */
 137        for (i = 0; i < 256; i++)
 138                SETGATE(idt[i], 0, GD_KT, &ISR_default, 0);
 139
 140        /* Set all entries that have real trap handlers.  We need to stop
 141         * short of the last one, since the last is the default handler with
 142         * a fake interrupt number (500) that is out of bounds of the
 143         * idt[]. */
 144        for (i = 0; i < trap_tbl_size - 1; i++)
 145                SETGATE(idt[trap_tbl[i].trapnumber], 0, GD_KT,
 146                        trap_tbl[i].trapaddr, 0);
 147        /* Sanity check */
 148        assert((uintptr_t)ISR_syscall ==
 149               ((uintptr_t)idt[T_SYSCALL].gd_off_63_32 << 32 |
 150                (uintptr_t)idt[T_SYSCALL].gd_off_31_16 << 16 |
 151                (uintptr_t)idt[T_SYSCALL].gd_off_15_0));
 152        /* turn on trap-based syscall handling and other user-accessible ints
 153         * DPL 3 means this can be triggered by the int instruction */
 154        idt[T_SYSCALL].gd_dpl = 3;
 155        idt[T_BRKPT].gd_dpl = 3;
 156        /* Send NMIs to their own stack (IST1 in every core's TSS) */
 157        idt[T_NMI].gd_ist = 1;
 158        /* Send double faults to their own stack (IST2 in every core's TSS) */
 159        idt[T_DBLFLT].gd_ist = 2;
 160
 161        /* The sooner we set this, the sooner we can use set/get_stack_top. */
 162        per_cpu_info[0].tss = &ts;
 163        per_cpu_info[0].gdt = gdt;
 164
 165        /* Set up our kernel stack when changing rings */
 166        /* Note: we want 16 byte aligned kernel stack frames (AMD 2:8.9.3) */
 167        x86_sysenter_init();
 168        /* We will set this properly once we have a kstack from the slab. */
 169        set_stack_top(0xdeadbeef);
 170
 171        /* Initialize the TSS field of the gdt.  The size of the TSS desc
 172         * differs between 64 and 32 bit, hence the pointer acrobatics */
 173        syssegdesc_t *ts_slot = (syssegdesc_t*)&gdt[GD_TSS >> 3];
 174        *ts_slot = (syssegdesc_t)SEG_SYS_SMALL(STS_T32A, (uintptr_t)&ts,
 175                                               sizeof(taskstate_t), 0);
 176
 177        /* Init the IDT PD.  Need to do this before ltr for some reason.  (Doing
 178         * this between ltr and lidt causes the machine to reboot...) */
 179        idt_pd.pd_lim = sizeof(idt) - 1;
 180        idt_pd.pd_base = (uintptr_t)idt;
 181
 182        ltr(GD_TSS);
 183
 184        asm volatile("lidt %0" : : "m"(idt_pd));
 185
 186        irq_vectors = arena_create("irq_vectors", (void*)IdtIOAPIC,
 187                                   MaxIdtIOAPIC - IdtIOAPIC, 1,
 188                                   NULL, NULL, NULL, 0, MEM_ATOMIC);
 189        assert(irq_vectors);
 190
 191        pic_remap();
 192        pic_mask_all();
 193
 194        int ncleft = MAX_NUM_CORES;
 195        int num_cores_mpacpi;
 196
 197        ncleft = mpsinit(ncleft);
 198        ncleft = mpacpi(ncleft);
 199        num_cores_mpacpi = MAX_NUM_CORES - ncleft;
 200        printk("MP and ACPI found %d cores\n", num_cores_mpacpi);
 201        if (num_cores != num_cores_mpacpi)
 202                warn("Topology (%d) and MP/ACPI (%d) differ on num_cores!",
 203                     num_cores, num_cores_mpacpi);
 204
 205        apiconline();
 206        ioapiconline();
 207
 208        /* the lapic IRQs need to be unmasked on a per-core basis */
 209        register_irq(IdtLAPIC_TIMER, timer_interrupt, NULL,
 210                     MKBUS(BusLAPIC, 0, 0, 0));
 211        register_irq(IdtLAPIC_ERROR, handle_lapic_error, NULL,
 212                     MKBUS(BusLAPIC, 0, 0, 0));
 213        register_irq(IdtLAPIC_PCINT, perfmon_interrupt, NULL,
 214                     MKBUS(BusLAPIC, 0, 0, 0));
 215        register_irq(I_KERNEL_MSG, handle_kmsg_ipi, NULL,
 216                     MKBUS(BusIPI, 0, 0, 0));
 217}
 218
 219static void print_fperr(struct hw_trapframe *hw_tf)
 220{
 221        uint16_t fpcw, fpsw;
 222        uint32_t mxcsr;
 223
 224        asm volatile ("fnstcw %0" : "=m"(fpcw));
 225        asm volatile ("fnstsw %0" : "=m"(fpsw));
 226        asm volatile ("stmxcsr %0" : "=m"(mxcsr));
 227        print_lock();
 228        print_trapframe(hw_tf);
 229        printk("Core %d: FP ERR, CW: 0x%04x, SW: 0x%04x, MXCSR 0x%08x\n",
 230               core_id(), fpcw, fpsw, mxcsr);
 231        printk("Core %d: The following faults are unmasked:\n", core_id());
 232        if (fpsw & ~fpcw & FP_EXCP_IE) {
 233                printk("\tInvalid Operation: ");
 234                if (fpsw & FP_SW_SF) {
 235                        if (fpsw & FP_SW_C1)
 236                                printk("Stack overflow\n");
 237                        else
 238                                printk("Stack underflow\n");
 239                } else {
 240                        printk("invalid arithmetic operand\n");
 241                }
 242        }
 243        if (fpsw & ~fpcw & FP_EXCP_DE)
 244                printk("\tDenormalized operand\n");
 245        if (fpsw & ~fpcw & FP_EXCP_ZE)
 246                printk("\tDivide by zero\n");
 247        if (fpsw & ~fpcw & FP_EXCP_OE)
 248                printk("\tNumeric Overflow\n");
 249        if (fpsw & ~fpcw & FP_EXCP_UE)
 250                printk("\tNumeric Underflow\n");
 251        if (fpsw & ~fpcw & FP_EXCP_PE)
 252                printk("\tInexact result (precision)\n");
 253        print_unlock();
 254}
 255
 256static bool __handler_user_page_fault(struct hw_trapframe *hw_tf,
 257                                      uintptr_t fault_va, int prot)
 258{
 259        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 260        int err;
 261
 262        assert(pcpui->owning_proc == pcpui->cur_proc);
 263        enable_irq();
 264        err = handle_page_fault(pcpui->owning_proc, fault_va, prot);
 265        disable_irq();
 266        if (err) {
 267                if (err == -EAGAIN)
 268                        hw_tf->tf_err |= PF_VMR_BACKED;
 269                return FALSE;
 270        }
 271        return TRUE;
 272}
 273
 274static bool __handler_kernel_page_fault(struct hw_trapframe *hw_tf,
 275                                        uintptr_t fault_va, int prot)
 276{
 277        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 278        int err;
 279
 280        /* The only thing an NMI handler that faults can do is a fixup */
 281        if (pcpui->nmi_status != NMI_NORMAL_OPN) {
 282                assert(in_kernel(hw_tf));
 283                return try_handle_exception_fixup(hw_tf);
 284        }
 285        /* In general, if there's no cur_proc, a KPF is a bug. */
 286        if (!pcpui->cur_proc) {
 287                /* This only runs from test_uaccess(), where it is expected to
 288                 * fail. */
 289                if (try_handle_exception_fixup(hw_tf))
 290                        return TRUE;
 291                panic_hwtf(hw_tf, "Proc-less Page Fault in the Kernel at %p!",
 292                           fault_va);
 293        }
 294        /* TODO - handle kernel page faults.  This is dangerous, since we might
 295         * be holding locks in the kernel and could deadlock when we HPF.  For
 296         * now, I'm just disabling the lock checker, since it'll flip out when
 297         * it sees there is a kernel trap.  Will need to think about this a bit,
 298         * esp when we properly handle bad addrs and whatnot. */
 299        pcpui->__lock_checking_enabled--;
 300        /* It is a bug for the kernel to access user memory while holding locks
 301         * that are used by handle_page_fault.  At a minimum, this includes
 302         * p->vmr_lock and memory allocation locks.
 303         *
 304         * In an effort to reduce the number of locks (both now and in the
 305         * future), the kernel will not attempt to handle faults on file-backed
 306         * VMRs.  We probably can turn that on in the future, but I'd rather
 307         * keep things safe for now.  (We'll probably need to change this when
 308         * we stop mapping entire binaries MAP_POPULATE | MAP_LOCKED).
 309         *
 310         * Note that we do not enable IRQs here, unlike in the user case.
 311         * Again, this is to limit the locks we could be grabbing. */
 312        err = handle_page_fault_nofile(pcpui->cur_proc, fault_va, prot);
 313        pcpui->__lock_checking_enabled++;
 314        if (err) {
 315                if (try_handle_exception_fixup(hw_tf))
 316                        return TRUE;
 317                /* Turn this on to help debug bad function pointers */
 318                printd("rsp %p\n\t 0(rsp): %p\n\t 8(rsp): %p\n\t 16(rsp): %p\n"
 319                       "\t24(rsp): %p\n", hw_tf->tf_rsp,
 320                       *(uintptr_t*)(hw_tf->tf_rsp +  0),
 321                       *(uintptr_t*)(hw_tf->tf_rsp +  8),
 322                       *(uintptr_t*)(hw_tf->tf_rsp + 16),
 323                       *(uintptr_t*)(hw_tf->tf_rsp + 24));
 324                panic_hwtf(hw_tf, "Proc-ful Page Fault in the Kernel at %p!",
 325                           fault_va);
 326                /* if we want to do something like kill a process or other code,
 327                 * be aware we are in a sort of irq-like context, meaning the
 328                 * main kernel code we 'interrupted' could be holding locks -
 329                 * even irqsave locks. */
 330        }
 331        return TRUE;
 332}
 333
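/* Page fault entry point: the faulting address comes from CR2 and is saved in
 * *aux so an unhandled fault can be reflected to userspace along with the
 * address; then dispatch to the kernel- or user-mode helper. */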
 334static bool __handle_page_fault(struct hw_trapframe *hw_tf, unsigned long *aux)
 335{
 336        uintptr_t fault_va = rcr2();
 337        int prot = hw_tf->tf_err & PF_ERROR_WRITE ? PROT_WRITE : PROT_READ;
 338
 339        *aux = fault_va;
 340        if (in_kernel(hw_tf))
 341                return __handler_kernel_page_fault(hw_tf, fault_va, prot);
 342        else
 343                return __handler_user_page_fault(hw_tf, fault_va, prot);
 344}
 345
 346/* Actual body of work done when an NMI arrives */
 347static void do_nmi_work(struct hw_trapframe *hw_tf)
 348{
 349        assert(!irq_is_enabled());
 350
 351        extern void __watchdog_nmi_handler(struct hw_trapframe *hw_tf);
 352
 353        __watchdog_nmi_handler(hw_tf);
 354
 355        /* It's mostly harmless to snapshot the TF, and we can send a spurious
 356         * PCINT interrupt.  perfmon.c just uses the interrupt to tell it to
 357         * check its counters for overflow.  Note that the PCINT interrupt is
 358         * just a regular IRQ.  The backtrace was recorded during the NMI and
 359         * emitted during IRQ.
 360         *
 361         * That being said, it's OK if the monitor triggers debugging NMIs while
 362         * perf is running.  If perf triggers an NMI when the monitor wants to
 363         * print, the monitor will debug *that* NMI, and not the one that gets
 364         * sent moments later.  That's fine. */
 365        emit_monitor_backtrace(ROS_HW_CTX, hw_tf);
 366        perfmon_snapshot_hwtf(hw_tf);
 367        send_self_ipi(IdtLAPIC_PCINT);
 368}
 369
 370/* NMI HW_TF hacking involves four symbols:
 371 *
 372 * [__nmi_pop_ok_start, __nmi_pop_ok_end) mark the beginning and end of the
 373 * code for an nmi popping routine that will actually pop at the end.
 374 *
 375 * [__nmi_pop_fail_start, __nmi_pop_fail_end) mark the beginning and end of the
 376 * shadow code for an nmi popping routine that will fail at the end.
 377 *
 378 * If we see a TF in the OK section, we'll move it to the FAIL section.  If it's
 379 * already in the FAIL section, we'll report that as a success. */
 380extern char __nmi_pop_ok_start[], __nmi_pop_ok_end[];
 381extern char __nmi_pop_fail_start[], __nmi_pop_fail_end[];
 382
 383static bool nmi_hw_tf_needs_hacked(struct hw_trapframe *hw_tf)
 384{
 385        return ((uintptr_t)__nmi_pop_ok_start <= hw_tf->tf_rip) &&
 386               (hw_tf->tf_rip < (uintptr_t)__nmi_pop_ok_end);
 387}
 388
 389static bool nmi_hw_tf_was_hacked(struct hw_trapframe *hw_tf)
 390{
 391        return ((uintptr_t)__nmi_pop_fail_start <= hw_tf->tf_rip) &&
 392               (hw_tf->tf_rip < (uintptr_t)__nmi_pop_fail_end);
 393}
 394
 395/* Helper.  Hacks the TF if it was in the OK section so that it is at the same
 396 * spot in the FAIL section.  Returns TRUE if the TF is hacked, meaning the NMI
 397 * handler can just return. */
 398static bool nmi_check_and_hack_tf(struct hw_trapframe *hw_tf)
 399{
 400        uintptr_t offset;
 401
 402        if (!nmi_hw_tf_needs_hacked(hw_tf))
 403                return FALSE;
 404        if (nmi_hw_tf_was_hacked(hw_tf))
 405                return TRUE;
 406        offset = hw_tf->tf_rip - (uintptr_t)__nmi_pop_ok_start;
 407        hw_tf->tf_rip = (uintptr_t)__nmi_pop_fail_start + offset;
 408        return TRUE;
 409}
 410
 411/* Bottom half of the NMI handler.  This can be interrupted under some
 412 * circumstances by NMIs.  It exits by popping the hw_tf in assembly. */
 413void noinline __attribute__((noreturn))
 414__nmi_bottom_half(struct hw_trapframe *hw_tf)
 415{
 416        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 417
 418        while (1) {
 419                /* Signal that we're doing work.  A concurrent NMI will set this
 420                 * to NMI_HANDLE_ANOTHER if we should continue, which we'll
 421                 * catch later. */
 422                pcpui->nmi_status = NMI_IN_PROGRESS;
 423                do_nmi_work(hw_tf);
 424                /* We need to check nmi_status to see if it is
 425                 * NMI_HANDLE_ANOTHER (if so, run again), write NMI_NORMAL_OPN,
 426                 * leave this stack, and return to the original context.  We
 427                 * need to do that in such a manner that an NMI can come in at
 428                 * any time.  There are two concerns.
 429                 *
 430                 * First, we need to not "miss the signal" telling us to re-run
 431                 * the NMI handler.  To do that, we'll do the actual checking in
 432                 * asm.  Being in the asm code block is a signal to the real NMI
 433                 * handler that we need to abort and do_nmi_work() again.
 434                 *
 435                 * Second, we need to atomically leave the stack and return.  By
 436                 * being in asm, the NMI handler knows to just hack our PC to
 437                 * make us return, instead of starting up a fresh
 438                 * __nmi_bottom_half().
 439                 *
 440                 * The NMI handler works together with the following function
 441                 * such that if that race occurs while we're in the function,
 442                 * it'll fail and return.  Then we'll just do_nmi_work() and try
 443                 * again. */
 444                extern void nmi_try_to_pop(struct hw_trapframe *tf, int *status,
 445                                           int old_val, int new_val);
 446
 447                nmi_try_to_pop(hw_tf, &pcpui->nmi_status, NMI_IN_PROGRESS,
 448                               NMI_NORMAL_OPN);
 449                /* Either we returned on our own, since we lost a race with
 450                 * nmi_status and didn't write (status = ANOTHER), or we won the
 451                 * race, but an NMI handler set the status to ANOTHER and
 452                 * restarted us. */
 453                assert(pcpui->nmi_status != NMI_NORMAL_OPN);
 454        }
 455}
 456
 457/* Separate handler from traps, since there are too many rules for NMI ctx.
 458 *
 459 * The general rule is that any writes from NMI context must be very careful.
 460 * When talking about reads and writes to per-core data:
 461 * - If NMIs write things written by normal kernel contexts, including IRQs and
 462 *   traps with IRQs disabled, then you must use atomics on both sides.
 463 * - If NMIs write things read by normal contexts, then readers must be careful,
 464 *   since the data can change at will.
 465 * - If NMIs read things written by normal contexts, don't worry: you're running
 466 *   uninterrupted (given x86 NMI caveats).
 467 * - We cannot block.  The current kthread thinks its stacktop is different than
 468 *   the one we're on.  Just get in and get out.
 469 * - If we interrupted a user TF, then we don't need to worry any more than for
 470 *   normal traps/IRQs.
 471 * - However, we cannot call proc_restartcore.  That could trigger all sorts of
 472 *   things, like kthreads blocking.
 473 * - Parallel accesses (from other cores) are the same as always.  You just
 474 *   can't lock easily.
 475 *
 476 * Normally, once you're in NMI, other NMIs are blocked until we return.
 477 * However, if our NMI handler faults (PF, GPF, breakpoint) due to something
 478 * like tracing, the iret from that fault will cancel our NMI protections.  Thus
 479 * we need another layer of code to make sure we don't run the NMI handler
 480 * concurrently on the same core.  See https://lwn.net/Articles/484932/ for more
 481 * info.
 482 *
 483 * We'll get around the problem by running on yet another NMI stack.  All NMIs
 484 * come in on the nmi entry stack (tss->ist1).  While we're on that stack, we
 485 * will not be interrupted.  We jump to another stack to do_nmi_work.  That code
 486 * can be interrupted, but we are careful to only have one 'thread' running on
 487 * that stack at a time.  We do this by carefully hopping off the stack in
 488 * assembly, similar to popping user TFs. */
 489void handle_nmi(struct hw_trapframe *hw_tf)
 490{
 491        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 492        struct hw_trapframe *hw_tf_copy;
 493        uintptr_t worker_stacktop;
 494
 495        /* At this point, we're an NMI and other NMIs are blocked.  Only once we
 496         * hop to the bottom half could that be no longer true.  NMI with NMIs
 497         * fully blocked will run without interruption.  For that reason, we
 498         * don't have to be careful about any memory accesses or compiler
 499         * tricks. */
 500        if (pcpui->nmi_status == NMI_HANDLE_ANOTHER)
 501                return;
 502        if (pcpui->nmi_status == NMI_IN_PROGRESS) {
 503                /* Force the handler to run again.  We don't need to worry about
 504                 * concurrent access here.  We're running, they are not.  We
 505                 * cannot 'PAUSE' since NMIs are fully blocked.
 506                 *
 507                 * The asm routine, for its part, does a compare-and-swap, so if
 508                 * we happened to interrupt it before it wrote NMI_NORMAL_OPN,
 509                 * it'll notice, abort, and not write the status. */
 510                pcpui->nmi_status = NMI_HANDLE_ANOTHER;
 511                return;
 512        }
 513        assert(pcpui->nmi_status == NMI_NORMAL_OPN);
 514        pcpui->nmi_status = NMI_HANDLE_ANOTHER;
 515        /* We could be interrupting an NMI that is trying to pop back to a
 516         * normal context.  We can tell by looking at its PC.  If it is within
 517         * the popping routine, then we interrupted it at this bad time.  We'll
 518         * hack the TF such that it will return instead of succeeding. */
 519        if (nmi_check_and_hack_tf(hw_tf))
 520                return;
 521        /* OK, so we didn't interrupt an NMI that was trying to return.  So we
 522         * need to run the bottom half.  We're going to jump stacks, but we also
 523         * need to copy the hw_tf.  The existing one will be clobbered by any
 524         * interrupting NMIs.
 525         *
 526         * We also need to save some space on the top of that stack for a
 527         * pointer to pcpui and a scratch register, which nmi_try_to_pop() will
 528         * use.  The target stack will look like this:
 529         *
 530         *           +--------------------------+ Page boundary (e.g. 0x6000)
 531         *           |   scratch space (rsp)    |
 532         *           |       pcpui pointer      |
 533         *           |      tf_ss + padding     | HW_TF end
 534         *           |          tf_rsp          |
 535         *           |            .             |
 536         *           |            .             |
 537         * RSP ->    |         tf_gsbase        | HW_TF start, hw_tf_copy
 538         *           +--------------------------+
 539         *           |            .             |
 540         *           |            .             |
 541         *           |            .             |
 542         *           +--------------------------+ Page boundary (e.g. 0x5000)
 543         *
 544         * __nmi_bottom_half() just picks up using the stack below tf_gsbase.
 545         * It'll push as needed, growing down.  Basically we're just using the
 546         * space 'above' the stack as storage. */
 547        worker_stacktop = pcpui->nmi_worker_stacktop - 2 * sizeof(uintptr_t);
 548        *(uintptr_t*)worker_stacktop = (uintptr_t)pcpui;
 549        worker_stacktop = worker_stacktop - sizeof(struct hw_trapframe);
 550        hw_tf_copy = (struct hw_trapframe*)worker_stacktop;
 551        *hw_tf_copy = *hw_tf;
 552        /* Once we head to the bottom half, consider ourselves interruptible
 553         * (though it's not until the first time we do_nmi_work()).  We'll never
 554         * come back to this stack.  Doing this in asm so we can easily pass an
 555         * argument.  We don't need to call (vs jmp), but it helps keep the
 556         * stack aligned. */
 557        asm volatile("mov $0x0, %%rbp;"
 558                     "mov %0, %%rsp;"
 559                     "call __nmi_bottom_half;"
 560                     : : "r"(worker_stacktop), "D"(hw_tf_copy));
 561        assert(0);
 562}
 563
 564void handle_double_fault(struct hw_trapframe *hw_tf)
 565{
 566        panic_hwtf(hw_tf, "Double fault!  Check the kernel stack pointer; you likely ran off the end of the stack.");
 567}
 568
 569/* Certain traps want IRQs enabled, such as the syscall.  Others can't handle
 570 * it, like the page fault handler.  Turn them on on a case-by-case basis. */
 571static void trap_dispatch(struct hw_trapframe *hw_tf)
 572{
 573        struct per_cpu_info *pcpui;
 574        bool handled = FALSE;
 575        unsigned long aux = 0;
 576        uintptr_t fixup_ip;
 577
 578        // Handle processor exceptions.
 579        switch(hw_tf->tf_trapno) {
 580        case T_BRKPT:
 581                if (!in_kernel(hw_tf))
 582                        backtrace_user_ctx(current, current_ctx);
 583                else
 584                        monitor(hw_tf);
 585                handled = TRUE;
 586                break;
 587        case T_ILLOP:
 588        {
 589                /* TODO: this can PF if there is a concurrent unmap/PM
 590                 * removal. */
 591                uintptr_t ip = get_hwtf_pc(hw_tf);
 592
 593                pcpui = &per_cpu_info[core_id()];
 594                pcpui->__lock_checking_enabled--; /* for print debugging */
 595                /* We will muck with the actual TF.  If we're dealing with
 596                 * userspace, we need to make sure we edit the actual TF that
 597                 * will get restarted (pcpui), and not the TF on the kstack
 598                 * (which aren't the same).  See set_current_ctx() for more
 599                 * info. */
 600                if (!in_kernel(hw_tf))
 601                        hw_tf = &pcpui->cur_ctx->tf.hw_tf;
 602                printd("bad opcode, eip: %p, next 3 bytes: %x %x %x\n", ip,
 603                       *(uint8_t*)(ip + 0),
 604                       *(uint8_t*)(ip + 1),
 605                       *(uint8_t*)(ip + 2));
 606                /* rdtscp: 0f 01 f9 */
 607                if (*(uint8_t*)(ip + 0) == 0x0f &&
 608                    *(uint8_t*)(ip + 1) == 0x01 &&
 609                    *(uint8_t*)(ip + 2) == 0xf9) {
 610                        x86_fake_rdtscp(hw_tf);
 611                        handled = TRUE;
 612                }
 613                pcpui->__lock_checking_enabled++; /* for print debugging */
 614                break;
 615        }
 616        case T_PGFLT:
 617                handled = __handle_page_fault(hw_tf, &aux);
 618                break;
 619        case T_GPFLT:
 620        case T_FPERR:
 621                handled = try_handle_exception_fixup(hw_tf);
 622                break;
 623        case T_SYSCALL:
 624                enable_irq();
 625                // check for userspace, for now
 626                assert(hw_tf->tf_cs != GD_KT);
 627                /* Set up and run the async calls */
 628                /* TODO: this is using the wrong reg1 for traps for 32 bit */
 629                prep_syscalls(current,
 630                              (struct syscall*)x86_get_systrap_arg0(hw_tf),
 631                              (unsigned int)x86_get_systrap_arg1(hw_tf));
 632                disable_irq();
 633                handled = TRUE;
 634                break;
 635        }
 636
 637        if (!handled) {
 638                if (in_kernel(hw_tf))
 639                        panic_hwtf(hw_tf,
 640                                   "Damn Damn!  Unhandled trap in the kernel!");
 641                reflect_unhandled_trap(hw_tf->tf_trapno, hw_tf->tf_err, aux);
 642        }
 643}
 644
 645/* Helper.  For now, this copies out the TF to pcpui.  Eventually, we should
 646 * consider doing this in trapentry.S
 647 *
 648 * TODO: consider having this return the tf used, so we can set tf in trap and
 649 * irq handlers to edit the TF that will get restarted.  Right now, the kernel
 650 * uses and restarts tf, but userspace restarts the old pcpui tf.  It is
 651 * tempting to do this, but note that tf stays on the stack of the kthread,
 652 * while pcpui->cur_ctx is for the core we trapped in on.  Meaning if we ever
 653 * block, suddenly cur_ctx is pointing to some old clobbered state that was
 654 * already returned to and can't be trusted.  Meanwhile tf can always be trusted
 655 * (like with an in_kernel() check).  The only types of traps from the user that
 656 * can be expected to have editable trapframes are ones that don't block. */
 657static void set_current_ctx_hw(struct per_cpu_info *pcpui,
 658                               struct hw_trapframe *hw_tf)
 659{
 660        assert(!irq_is_enabled());
 661        pcpui->actual_ctx.type = ROS_HW_CTX;
 662        pcpui->actual_ctx.tf.hw_tf = *hw_tf;
 663        pcpui->cur_ctx = &pcpui->actual_ctx;
 664}
 665
 666static void set_current_ctx_sw(struct per_cpu_info *pcpui,
 667                               struct sw_trapframe *sw_tf)
 668{
 669        assert(!irq_is_enabled());
 670        pcpui->actual_ctx.type = ROS_SW_CTX;
 671        pcpui->actual_ctx.tf.sw_tf = *sw_tf;
 672        pcpui->cur_ctx = &pcpui->actual_ctx;
 673}
 674
 675static void set_current_ctx_vm(struct per_cpu_info *pcpui,
 676                               struct vm_trapframe *vm_tf)
 677{
 678        assert(!irq_is_enabled());
 679        pcpui->actual_ctx.type = ROS_VM_CTX;
 680        pcpui->actual_ctx.tf.vm_tf = *vm_tf;
 681        pcpui->cur_ctx = &pcpui->actual_ctx;
 682}
 683
 684void trap(struct hw_trapframe *hw_tf)
 685{
 686        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 687
 688        /* Copy out the TF for now */
 689        if (!in_kernel(hw_tf)) {
 690                set_current_ctx_hw(pcpui, hw_tf);
 691                /* ignoring state for nested kernel traps.  should be rare. */
 692                __set_cpu_state(pcpui, CPU_STATE_KERNEL);
 693        } else {
 694                inc_ktrap_depth(pcpui);
 695        }
 696        printd("Incoming TRAP %d on core %d, TF at %p\n", hw_tf->tf_trapno,
 697               core_id(), hw_tf);
 698        if ((hw_tf->tf_cs & ~3) != GD_UT && (hw_tf->tf_cs & ~3) != GD_KT)
 699                panic_hwtf(hw_tf, "Trapframe with invalid CS!");
 700        trap_dispatch(hw_tf);
 701        /* Return to the current process, which should be runnable.  If we're
 702         * the kernel, we should just return naturally.  Note that current and
 703         * tf need to still be okay (might not be after blocking) */
 704        if (in_kernel(hw_tf)) {
 705                dec_ktrap_depth(pcpui);
 706                return;
 707        }
 708        proc_restartcore();
 709        assert(0);
 710}
 711
 712static bool vector_is_irq(int apic_vec)
 713{
 714        /* arguably, we could limit them to MaxIdtIOAPIC */
 715        return (IdtPIC <= apic_vec) && (apic_vec <= IdtMAX);
 716}
 717
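/* Common IRQ path, used both by handle_irq() and by the external-interrupt
 * vmexit handler.  Walks the RCU-protected handler chain for this vector with
 * IRQs re-enabled, then EOIs via the handler's eoi op once all handlers ran. */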
 718static void irq_dispatch(struct hw_trapframe *hw_tf)
 719{
 720        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 721        struct irq_handler *irq_h;
 722
 723        if (!in_irq_ctx(pcpui))
 724                __set_cpu_state(pcpui, CPU_STATE_IRQ);
 725        inc_irq_depth(pcpui);
 726        if (hw_tf->tf_trapno != IdtLAPIC_TIMER &&   /* timer irq */
 727            hw_tf->tf_trapno != I_KERNEL_MSG &&
 728            hw_tf->tf_trapno != 65)     /* qemu serial tends to get this one */
 729                printd("Incoming IRQ, ISR: %d on core %d\n", hw_tf->tf_trapno,
 730                       core_id());
 732        rcu_read_lock();
 733        irq_h = rcu_dereference(irq_handlers[hw_tf->tf_trapno]);
 734        if (!irq_h) {
 735                warn_once("Received IRQ %d, had no handler registered!",
 736                          hw_tf->tf_trapno);
 737                /* If we don't have an IRQ handler, we don't know how to EOI.
 738                 * Odds are, it's a LAPIC IRQ, such as I_TESTING */
 739                if (!lapic_check_spurious(hw_tf->tf_trapno))
 740                        lapic_send_eoi(hw_tf->tf_trapno);
 741                goto out_no_eoi;
 742        }
 743        if (irq_h->check_spurious(hw_tf->tf_trapno))
 744                goto out_no_eoi;
 745        /* Can now be interrupted/nested by higher priority IRQs, but not by our
 746         * current IRQ vector, til we EOI. */
 747        enable_irq();
 748        while (irq_h) {
 749                irq_h->isr(hw_tf, irq_h->data);
 750                irq_h = rcu_dereference(irq_h->next);
 751        }
 752        // if we're a general purpose IPI function call, down the cpu_list
 753        extern handler_wrapper_t handler_wrappers[NUM_HANDLER_WRAPPERS];
 754        if ((I_SMP_CALL0 <= hw_tf->tf_trapno) &&
 755            (hw_tf->tf_trapno <= I_SMP_CALL_LAST))
 756                down_checklist(handler_wrappers[hw_tf->tf_trapno & 0x0f]
 757                               .cpu_list);
 758        disable_irq();
 759        /* Keep in sync with ipi_is_pending */
 760        irq_h = rcu_dereference(irq_handlers[hw_tf->tf_trapno]);
 761        irq_h->eoi(hw_tf->tf_trapno);
 762        /* Fall-through */
 763out_no_eoi:
 764        rcu_read_unlock();
 765        dec_irq_depth(pcpui);
 766        if (!in_irq_ctx(pcpui))
 767                __set_cpu_state(pcpui, CPU_STATE_KERNEL);
 768}
 769
 770/* Note IRQs are disabled unless explicitly turned on.
 771 *
 772 * In general, we should only get trapno's >= PIC1_OFFSET (32).  Anything else
 773 * should be a trap.  Even if we don't use the PIC, that should be the standard.
 774 * It is possible to get a spurious LAPIC IRQ with vector 15 (or similar), but
 775 * the spurious check should catch that.
 776 *
 777 * Note that from hardware's perspective (PIC, etc), IRQs start from 0, but they
 778 * are all mapped up at PIC1_OFFSET for the cpu / irq_handler. */
 779void handle_irq(struct hw_trapframe *hw_tf)
 780{
 781        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 782
 783        /* Copy out the TF for now */
 784        if (!in_kernel(hw_tf))
 785                set_current_ctx_hw(pcpui, hw_tf);
 786        irq_dispatch(hw_tf);
 787        /* Return to the current process, which should be runnable.  If we're
 788         * the kernel, we should just return naturally.  Note that current and
 789         * tf need to still be okay (might not be after blocking) */
 790        if (in_kernel(hw_tf))
 791                return;
 792        proc_restartcore();
 793        assert(0);
 794}
 795
 796/* The irq field may be ignored based on the type of Bus. */
 797struct irq_handler *register_irq(int irq, isr_t handler, void *irq_arg,
 798                                 uint32_t tbdf)
 799{
 800        struct irq_handler *irq_h;
 801        int vector;
 802
 803        irq_h = kzmalloc(sizeof(struct irq_handler), 0);
 804        assert(irq_h);
 805        irq_h->dev_irq = irq;
 806        irq_h->tbdf = tbdf;
 807        vector = bus_irq_setup(irq_h);
 808        if (vector == -1) {
 809                kfree(irq_h);
 810                return NULL;
 811        }
 812        printk("IRQ %d, vector %d (0x%x), type %s\n", irq, vector, vector,
 813               irq_h->type);
 814        assert(irq_h->check_spurious && irq_h->eoi);
 815        irq_h->isr = handler;
 816        irq_h->data = irq_arg;
 817        irq_h->apic_vector = vector;
 818        spin_lock_irqsave(&irq_handler_wlock);
 819        irq_h->next = irq_handlers[vector];
 820        rcu_assign_pointer(irq_handlers[vector], irq_h);
 821        spin_unlock_irqsave(&irq_handler_wlock);
 822        /* Most IRQs other than the BusIPI should need their irq unmasked.
 823         * Might need to pass the irq_h, in case unmask needs more info.
 824         * The lapic IRQs need to be unmasked on a per-core basis */
 825        if (irq_h->unmask && strcmp(irq_h->type, "lapic"))
 826                irq_h->unmask(irq_h, vector);
 827        return irq_h;
 828}
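
/* Example usage (a sketch, mirroring the LAPIC registrations in idt_init()):
 *
 *      register_irq(IdtLAPIC_TIMER, timer_interrupt, NULL,
 *                   MKBUS(BusLAPIC, 0, 0, 0));
 *
 * Device IRQs pass their own bus/dev/func via MKBUS() so bus_irq_setup() can
 * pick and program a suitable vector. */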
 829
 830int deregister_irq(int vector, uint32_t tbdf)
 831{
 832        struct irq_handler *irq_h, **pp;
 833
 834        pp = &irq_handlers[vector];
 835        spin_lock_irqsave(&irq_handler_wlock);
 836        while ((irq_h = *pp)) {
 837                if (irq_h->tbdf == tbdf) {
 838                        rcu_assign_pointer(*pp, irq_h->next);
 839                        break;
 840                }
 841                pp = &irq_h->next;
 842        }
 843        spin_unlock_irqsave(&irq_handler_wlock);
 844        if (!irq_h) {
 845                warn("No IRQ V: %d TBDF: %x to unregister!", vector, tbdf);
 846                return -1;
 847        }
 848        /* Ideally, the driver should have told the device to not fire the IRQ
 849         * anymore.  If they do, we may get a warn_once.  This could be on
 850         * another core, etc. */
 851        if (irq_h->mask)
 852                irq_h->mask(irq_h, irq_h->apic_vector);
 853        synchronize_rcu();
 854        if (irq_h->cleanup)
 855                irq_h->cleanup(irq_h);
 856        kfree(irq_h);
 857        return 0;
 858}
 859
 860/* 0 is an error.  It's not a valid IRQ vector for Akaros, even if
 861 * divide-by-zero has trap/irq vector 0 (T_DIVIDE). */
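/* Vectors handed out here come from the irq_vectors arena created in
 * idt_init(), which covers [IdtIOAPIC, MaxIdtIOAPIC). */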
 862int get_irq_vector(void)
 863{
 864        return (int)(long)arena_alloc(irq_vectors, 1, MEM_ATOMIC);
 865}
 866
 867void put_irq_vector(int vec)
 868{
 869        arena_free(irq_vectors, (void*)(long)vec, 1);
 870}
 871
 872/* These routing functions only allow the routing of an irq to a single core.
 873 * If we want to route to multiple cores, we'll probably need to set up logical
 874 * groups or something and take some additional parameters. */
 875static int route_irq_h(struct irq_handler *irq_h, int os_coreid)
 876{
 877        int hw_coreid;
 878        if (!irq_h->route_irq) {
 879                printk("[kernel] apic_vec %d, type %s cannot be routed\n",
 880                       irq_h->apic_vector, irq_h->type);
 881                return -1;
 882        }
 883        if (os_coreid >= MAX_NUM_CORES) {
 884                printk("[kernel] os_coreid %d out of range!\n", os_coreid);
 885                return -1;
 886        }
 887        hw_coreid = get_hw_coreid(os_coreid);
 888        if (hw_coreid == -1) {
 889                printk("[kernel] os_coreid %d not a valid hw core!\n",
 890                       os_coreid);
 891                return -1;
 892        }
 893        irq_h->route_irq(irq_h, irq_h->apic_vector, hw_coreid);
 894        return 0;
 895}
 896
 897/* Routes all irqs for a given apic_vector to os_coreid.  Returns 0 if all of
 898 * them succeeded.  -1 if there were none or if any of them failed.  We don't
 899 * share IRQs often (if ever anymore), so this shouldn't be an issue. */
 900int route_irqs(int apic_vec, int os_coreid)
 901{
 902        struct irq_handler *irq_h;
 903        int ret = -1;
 904
 905        if (!vector_is_irq(apic_vec)) {
 906                printk("[kernel] vector %d is not an IRQ vector!\n", apic_vec);
 907                return -1;
 908        }
 909        irq_h = irq_handlers[apic_vec];
 910        while (irq_h) {
 911                assert(irq_h->apic_vector == apic_vec);
 912                ret = route_irq_h(irq_h, os_coreid);
 913                irq_h = irq_h->next;
 914        }
 915        return ret;
 916}
 917
 918/* It's a moderate pain in the ass to put these in bit-specific files (header
 919 * hell with the set_current_ helpers) */
 920void sysenter_callwrapper(struct syscall *sysc, unsigned long count,
 921                          struct sw_trapframe *sw_tf)
 922{
 923        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 924        set_current_ctx_sw(pcpui, sw_tf);
 925        __set_cpu_state(pcpui, CPU_STATE_KERNEL);
 926        /* Once we've set_current_ctx, we can enable interrupts.  This used to
 927         * be mandatory (we had immediate KMSGs that would muck with cur_ctx).
 928         * Now it should only help for sanity/debugging. */
 929        enable_irq();
 930        /* Set up and run the async calls.  This may block, and we could migrate
 931         * to another core.  If you use pcpui again, you need to reread it. */
 932        prep_syscalls(current, sysc, count);
 933        disable_irq();
 934        proc_restartcore();
 935}
 936
 937/* Declared in x86/arch.h */
 938void send_ipi(uint32_t os_coreid, uint8_t vector)
 939{
 940        int hw_coreid = get_hw_coreid(os_coreid);
 941
 942        if (hw_coreid == -1) {
 943                panic("Unmapped OS coreid (OS %d)!\n", os_coreid);
 944                return;
 945        }
 946        assert(vector != T_NMI);
 947        __send_ipi(hw_coreid, vector);
 948}
 949
 950/****************** VM exit handling ******************/
 951
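/* Emulate CPUID for the guest: unless userspace claims the leaf, start from
 * the host's CPUID values and filter them (hide monitor/VMX/perf, patch the
 * guest pcore id into the APIC ID field, and advertise the KVM and AKAROS
 * hypervisor leaves). */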
 952static bool handle_vmexit_cpuid(struct vm_trapframe *tf)
 953{
 954        uint32_t eax, ebx, ecx, edx;
 955        const uint32_t *sigptr;
 956        const char kvm_sig[] = "KVMKVMKVM\0\0\0";
 957        const char akaros_sig[] = "AKAROSINSIDE";
 958
 959        if (vmm_user_handles_cpuid(tf->tf_rax, tf->tf_rcx))
 960                return false;
 961
 962        cpuid(tf->tf_rax, tf->tf_rcx, &eax, &ebx, &ecx, &edx);
 963        switch (tf->tf_rax) {
 964        /* TODO: If we can move this to userspace, vmrunkernel can make GPCS on
 965         * the fly. */
 966        case 0x01:
 967                /* Set the hypervisor bit to let the guest know it is
 968                 * virtualized */
 969                ecx |= 1 << 31;
 970                /* Unset the monitor capability bit so that the guest does not
 971                 * try to use monitor/mwait. */
 972                ecx &= ~(1 << 3);
 973                /* Unset the vmx capability bit so that the guest does not try
 974                 * to turn it on. */
 975                ecx &= ~(1 << 5);
 976                /* Unset the perf capability bit so that the guest does not try
 977                 * to turn it on. */
 978                ecx &= ~(1 << 15);
 979
 980                /* Set the guest pcore id into the apic ID field in CPUID. */
 981                ebx &= 0x0000ffff;
 982                ebx |= (current->vmm.nr_guest_pcores & 0xff) << 16;
 983                ebx |= (tf->tf_guest_pcoreid & 0xff) << 24;
 984                break;
 985        case 0x07:
 986                /* Do not advertise TSC_ADJUST */
 987                ebx &= ~(1 << 1);
 988                break;
 989        case 0x0A:
 990                eax = 0;
 991                ebx = 0;
 992                ecx = 0;
 993                edx = 0;
 994                break;
 995        /* Signal the use of KVM. */
 996        case 0x40000000:
 997                sigptr = (const uint32_t *)kvm_sig;
 998                eax = 0;
 999                ebx = sigptr[0];
1000                ecx = sigptr[1];
1001                edx = sigptr[2];
1002                break;
1003        /* Hypervisor Features. */
1004        case 0x40000003:
1005                /* Unset the monitor capability bit so that the guest does not
1006                 * try to use monitor/mwait. */
1007                edx &= ~(1 << 0);
1008                break;
1009        /* Signal the use of AKAROS. */
1010        case 0x40000100:
1011                sigptr = (const uint32_t *)akaros_sig;
1012                eax = 0;
1013                ebx = sigptr[0];
1014                ecx = sigptr[1];
1015                edx = sigptr[2];
1016                break;
1017        /* Hypervisor Features. */
1018        case 0x40000103:
1019                /* Unset the monitor capability bit so that the guest does not
1020                 * try to use monitor/mwait. */
1021                edx &= ~(1 << 0);
1022                break;
1023        default:
1024                break;
1025        }
1026        tf->tf_rax = eax;
1027        tf->tf_rbx = ebx;
1028        tf->tf_rcx = ecx;
1029        tf->tf_rdx = edx;
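        /* CPUID is a two-byte instruction (0f a2); skip the guest past it. */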
1030        tf->tf_rip += 2;
1031        return TRUE;
1032}
1033
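/* EPT violations are the VM analog of page faults: translate the exit
 * qualification into a prot and have the normal page fault path back the
 * faulting guest-physical address. */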
1034static bool handle_vmexit_ept_fault(struct vm_trapframe *tf)
1035{
1036        int prot = 0;
1037        int ret;
1038
1039        prot |= tf->tf_exit_qual & VMX_EPT_FAULT_READ ? PROT_READ : 0;
1040        prot |= tf->tf_exit_qual & VMX_EPT_FAULT_WRITE ? PROT_WRITE : 0;
1041        prot |= tf->tf_exit_qual & VMX_EPT_FAULT_INS ? PROT_EXEC : 0;
1042        ret = handle_page_fault(current, tf->tf_guest_pa, prot);
1043        if (ret == 0)
1044                return TRUE;
1045
1046        // Mirror behavior in uthreads: tell userspace to try again.
1047        if (ret == -EAGAIN)
1048                tf->tf_flags |= VMCTX_FL_EPT_VMR_BACKED;
1049
1050        return FALSE;
1051}
1052
1053/* Regarding NMI blocking,
1054 *      "An NMI causes subsequent NMIs to be blocked, but only after the VM exit
1055 *      completes." (SDM)
1056 *
1057 * Like handle_nmi(), this function and anything it calls directly cannot fault,
1058 * or else we lose our NMI protections. */
1059static bool handle_vmexit_nmi(struct vm_trapframe *tf)
1060{
1061        /* Sanity checks, make sure we really got an NMI.  Feel free to remove.
1062         */
1063        assert((tf->tf_intrinfo2 & INTR_INFO_INTR_TYPE_MASK)
1064               == INTR_TYPE_NMI_INTR);
1065        assert((tf->tf_intrinfo2 & INTR_INFO_VECTOR_MASK) == T_NMI);
1066        assert(!irq_is_enabled());
1067
1068        emit_monitor_backtrace(ROS_VM_CTX, tf);
1069        perfmon_snapshot_vmtf(tf);
1070        send_self_ipi(IdtLAPIC_PCINT);
1071        return TRUE;
1072}
1073
1074bool handle_vmexit_msr(struct vm_trapframe *tf)
1075{
1076        bool ret;
1077
1078        ret = vmm_emulate_msr(tf, (tf->tf_exit_reason == EXIT_REASON_MSR_READ
1079                                   ? VMM_MSR_EMU_READ : VMM_MSR_EMU_WRITE));
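        /* RDMSR and WRMSR are two-byte instructions (0f 32 / 0f 30), so skip
         * past them on successful emulation. */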
1080        if (ret)
1081                tf->tf_rip += 2;
1082        return ret;
1083}
1084
1085bool handle_vmexit_extirq(struct vm_trapframe *tf)
1086{
1087        struct hw_trapframe hw_tf;
1088        uint32_t trap_nr;
1089
1090        /* For now, we just handle external IRQs.  I think guest traps should go
1091         * to the guest, based on our vmctls */
1092        assert((tf->tf_intrinfo2 & INTR_INFO_INTR_TYPE_MASK)
1093               == INTR_TYPE_EXT_INTR);
1094        /* The POKE_HANDLER doesn't run for an ExtINT that triggers a vmexit */
1095        trap_nr = tf->tf_intrinfo2 & INTR_INFO_VECTOR_MASK;
1096        if (trap_nr == I_POKE_CORE) {
1097                lapic_send_eoi(trap_nr);
1098                return TRUE;
1099        }
1100        /* TODO: Our IRQ handlers all expect TFs.  Let's fake one.  A bunch of
1101         * handlers (e.g. backtrace/perf) will probably be unhappy about a user
1102         * TF that is really a VM, so this all needs work. */
1103        hw_tf.tf_gsbase = 0;
1104        hw_tf.tf_fsbase = 0;
1105        hw_tf.tf_rax = tf->tf_rax;
1106        hw_tf.tf_rbx = tf->tf_rbx;
1107        hw_tf.tf_rcx = tf->tf_rcx;
1108        hw_tf.tf_rdx = tf->tf_rdx;
1109        hw_tf.tf_rbp = tf->tf_rbp;
1110        hw_tf.tf_rsi = tf->tf_rsi;
1111        hw_tf.tf_rdi = tf->tf_rdi;
1112        hw_tf.tf_r8 = tf->tf_r8;
1113        hw_tf.tf_r9 = tf->tf_r9;
1114        hw_tf.tf_r10 = tf->tf_r10;
1115        hw_tf.tf_r11 = tf->tf_r11;
1116        hw_tf.tf_r12 = tf->tf_r12;
1117        hw_tf.tf_r13 = tf->tf_r13;
1118        hw_tf.tf_r14 = tf->tf_r14;
1119        hw_tf.tf_r15 = tf->tf_r15;
1120        hw_tf.tf_trapno = trap_nr;
1121        hw_tf.tf_err = 0;
1122        hw_tf.tf_rip = tf->tf_rip;
1123        hw_tf.tf_cs = GD_UT;    /* faking a user TF, even though it's a VM */
1124        hw_tf.tf_rflags = tf->tf_rflags;
1125        hw_tf.tf_rsp = tf->tf_rsp;
1126        hw_tf.tf_ss = GD_UD;
1127
1128        irq_dispatch(&hw_tf);
1129        /* Consider returning whether or not there was a handler registered */
1130        return TRUE;
1131}
1132
1133static bool handle_vmexit_xsetbv(struct vm_trapframe *tf)
1134{
1135        // The VM's requested-feature bitmap is represented by edx:eax
1136        uint64_t vm_rfbm = (tf->tf_rdx << 32) | tf->tf_rax;
1137
1138        // If the VM tries to set xcr0 to a superset
1139        // of Akaros's default value, kill the VM.
1140
1141        // Bit in vm_rfbm and x86_default_xcr0: Ok. Requested and allowed.
1142        // Bit in vm_rfbm but not x86_default_xcr0: Bad! Requested, not allowed.
1143        // Bit not in vm_rfbm but in x86_default_xcr0: Ok. Not requested.
1144
1145        // vm_rfbm & (~x86_default_xcr0) is nonzero if any bits
1146        // are set in vm_rfbm but not x86_default_xcr0
1147
1148        if (vm_rfbm & (~__proc_global_info.x86_default_xcr0))
1149                return FALSE;
1150
1151
1152        // If attempting to use vm_rfbm for xsetbv
1153        // causes a fault, we reflect to the VMM.
1154        if (safe_lxcr0(vm_rfbm))
1155                return FALSE;
1156
1157
1158        // If no fault, advance the instruction pointer
1159        // and return TRUE to make the VM resume.
1160        tf->tf_rip += 3; // XSETBV is a 3-byte instruction
1161        return TRUE;
1162}
1163
1164static void vmexit_dispatch(struct vm_trapframe *tf)
1165{
1166        bool handled = FALSE;
1167
1168        /* Do not block in any of these functions.
1169         *
1170         * If we block, we'll probably need to finalize the context.  If we do,
1171         * then there's a chance the guest pcore can start somewhere else, and
1172         * then we can't get the GPC loaded again.  Plus, they could be running
1173         * a GPC with an unresolved vmexit.  It's just mess.
1174         *
1175         * If we want to enable IRQs, we can do so on a case-by-case basis.
1176         * Don't do it for external IRQs - the irq_dispatch code will handle it.
1177         * */
1178        switch (tf->tf_exit_reason) {
1179        case EXIT_REASON_VMCALL:
1180                if (current->vmm.flags & VMM_CTL_FL_KERN_PRINTC &&
1181                    tf->tf_rax == AKAROS_VMCALL_PRINTC) {
1182                        printk("%c", tf->tf_rdi);
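                        /* VMCALL is a three-byte instruction (0f 01 c1). */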
1183                        tf->tf_rip += 3;
1184                        handled = TRUE;
1185                }
1186                break;
1187        case EXIT_REASON_CPUID:
1188                handled = handle_vmexit_cpuid(tf);
1189                break;
1190        case EXIT_REASON_EPT_VIOLATION:
1191                handled = handle_vmexit_ept_fault(tf);
1192                break;
1193        case EXIT_REASON_EXCEPTION_NMI:
1194                handled = handle_vmexit_nmi(tf);
1195                break;
1196        case EXIT_REASON_MSR_READ:
1197        case EXIT_REASON_MSR_WRITE:
1198                handled = handle_vmexit_msr(tf);
1199                break;
1200        case EXIT_REASON_EXTERNAL_INTERRUPT:
1201                handled = handle_vmexit_extirq(tf);
1202                break;
1203        case EXIT_REASON_XSETBV:
1204                handled = handle_vmexit_xsetbv(tf);
1205                break;
1206        default:
1207                printd("Unhandled vmexit: reason 0x%x, exit qual 0x%x\n",
1208                       tf->tf_exit_reason, tf->tf_exit_qual);
1209        }
1210        if (!handled) {
1211                tf->tf_flags |= VMCTX_FL_HAS_FAULT;
1212                if (reflect_current_context()) {
1213                        /* VM contexts shouldn't be in vcore context, so this
1214                         * should be pretty rare (unlike SCPs or VC ctx page
1215                         * faults). */
1216                        printk("[kernel] Unable to reflect VM Exit\n");
1217                        print_vmtrapframe(tf);
1218                        proc_destroy(current);
1219                }
1220        }
1221}
1222
1223void handle_vmexit(struct vm_trapframe *tf)
1224{
1225        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
1226
1227        tf->tf_rip = vmcs_read(GUEST_RIP);
1228        tf->tf_rflags = vmcs_read(GUEST_RFLAGS);
1229        tf->tf_rsp = vmcs_read(GUEST_RSP);
1230        tf->tf_cr2 = rcr2();
1231        tf->tf_cr3 = vmcs_read(GUEST_CR3);
1232        tf->tf_guest_pcoreid = pcpui->guest_pcoreid;
1233        tf->tf_flags |= VMCTX_FL_PARTIAL;
1234        tf->tf_guest_intr_status = vmcs_read(GUEST_INTR_STATUS);
1235        tf->tf_exit_reason = vmcs_read(VM_EXIT_REASON);
1236        tf->tf_exit_qual = vmcs_read(EXIT_QUALIFICATION);
1237        tf->tf_intrinfo1 = vmcs_read(GUEST_INTERRUPTIBILITY_INFO);
1238        tf->tf_intrinfo2 = vmcs_read(VM_EXIT_INTR_INFO);
1239        tf->tf_guest_va = vmcs_read(GUEST_LINEAR_ADDRESS);
1240        tf->tf_guest_pa = vmcs_read(GUEST_PHYSICAL_ADDRESS);
1241
1242        set_current_ctx_vm(pcpui, tf);
1243        __set_cpu_state(pcpui, CPU_STATE_KERNEL);
1244        tf = &pcpui->cur_ctx->tf.vm_tf;
1245        vmexit_dispatch(tf);
1246        /* We're either restarting a partial VM ctx (vmcs was launched, loaded
1247         * on the core, etc) or a SW vc ctx for the reflected trap.  Or the proc
1248         * is dying and we'll handle a __death KMSG shortly. */
1249        proc_restartcore();
1250}
1251
1252/* Partial contexts for HW and SW TFs have the user's gs in MSR_KERNEL_GS_BASE.
1253 * The kernel's gs is loaded into gs.  We need to put the kernel's gs into
1254 * KERNEL_GS_BASE so the core is ready to run another full context, save the
1255 * user's {GS,FS}_BASE into their TF so it can run on another core, and keep GS
1256 * loaded with the current GS (the kernel's). */
1257static void x86_finalize_hwtf(struct hw_trapframe *tf)
1258{
1259        tf->tf_gsbase = read_kern_gsbase();
1260        write_kern_gsbase(read_gsbase());
1261        tf->tf_fsbase = read_fsbase();
1262        x86_hwtf_clear_partial(tf);
1263}
1264
1265static void x86_finalize_swtf(struct sw_trapframe *tf)
1266{
1267        tf->tf_gsbase = read_kern_gsbase();
1268        write_kern_gsbase(read_gsbase());
1269        tf->tf_fsbase = read_fsbase();
1270        x86_swtf_clear_partial(tf);
1271}
1272
1273static void x86_finalize_vmtf(struct vm_trapframe *tf)
1274{
1275        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
1276
1277        x86_vmtf_clear_partial(tf);
1278        unload_guest_pcore(pcpui->owning_proc, pcpui->guest_pcoreid);
1279}
1280
1281/* Makes sure that the user context is fully saved into ctx and not split across
1282 * the struct and HW, meaning it is not a "partial context".
1283 *
1284 * Be careful to zero out any part of the ctx struct not in use, to avoid
1285 * leaking information from other processes. */
1286void arch_finalize_ctx(struct user_context *ctx)
1287{
1288        if (!arch_ctx_is_partial(ctx))
1289                return;
1290        switch (ctx->type) {
1291        case ROS_HW_CTX:
1292                x86_finalize_hwtf(&ctx->tf.hw_tf);
1293                memset((uint8_t*)&ctx->tf + sizeof(struct hw_trapframe), 0,
1294                           sizeof(ctx->tf) - sizeof(struct hw_trapframe));
1295                break;
1296        case ROS_SW_CTX:
1297                x86_finalize_swtf(&ctx->tf.sw_tf);
1298                memset((uint8_t*)&ctx->tf + sizeof(struct sw_trapframe), 0,
1299                           sizeof(ctx->tf) - sizeof(struct sw_trapframe));
1300                break;
1301        case ROS_VM_CTX:
1302                x86_finalize_vmtf(&ctx->tf.vm_tf);
1303                memset((uint8_t*)&ctx->tf + sizeof(struct vm_trapframe), 0,
1304                           sizeof(ctx->tf) - sizeof(struct vm_trapframe));
1305                break;
1306        }
1307}
1308