akaros/kern/arch/x86/smp_boot.c
/*
 * Copyright (c) 2009 The Regents of the University of California
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 */

#include <arch/x86.h>
#include <arch/arch.h>
#include <smp.h>
#include <arch/console.h>
#include <arch/apic.h>
#include <arch/perfmon.h>
#include <time.h>

#include <bitmask.h>
#include <atomic.h>
#include <error.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pmap.h>
#include <env.h>
#include <trap.h>
#include <kmalloc.h>
#include <cpu_feat.h>
#include <arch/fsgsbase.h>
#include <ros/procinfo.h>

#include "vmm/vmm.h"

extern handler_wrapper_t handler_wrappers[NUM_HANDLER_WRAPPERS];
int x86_num_cores_booted = 1;
uintptr_t smp_stack_top;
barrier_t generic_barrier;

#define DECLARE_HANDLER_CHECKLISTS(vector)                          \
        INIT_CHECKLIST(f##vector##_cpu_list, MAX_NUM_CORES);

#define INIT_HANDLER_WRAPPER(v)                                     \
{                                                                   \
        handler_wrappers[(v)].vector = 0xe##v;                      \
        handler_wrappers[(v)].cpu_list = &f##v##_cpu_list;          \
        handler_wrappers[(v)].cpu_list->mask.size = num_cores;      \
}

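/* Example expansion: INIT_HANDLER_WRAPPER(0) wires wrapper 0 to IPI vector 0xe0
 * and points its cpu_list at the f0_cpu_list checklist declared below. */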
DECLARE_HANDLER_CHECKLISTS(0);
DECLARE_HANDLER_CHECKLISTS(1);
DECLARE_HANDLER_CHECKLISTS(2);
DECLARE_HANDLER_CHECKLISTS(3);
DECLARE_HANDLER_CHECKLISTS(4);

static void init_smp_call_function(void)
{
        INIT_HANDLER_WRAPPER(0);
        INIT_HANDLER_WRAPPER(1);
        INIT_HANDLER_WRAPPER(2);
        INIT_HANDLER_WRAPPER(3);
        INIT_HANDLER_WRAPPER(4);
}

/******************************************************************************/

bool core_id_ready = FALSE;

static void setup_rdtscp(int coreid)
{
        uint32_t edx;
        int rdtscp_ecx;

        /* TODO: have some sort of 'cpu info structure' with flags */
        cpuid(0x80000001, 0x0, 0, 0, 0, &edx);
        if (edx & (1 << 27)) {
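                /* RDTSCP returns IA32_TSC_AUX in ECX, so seeding TSC_AUX with
                 * the coreid gives userspace a one-instruction way to find its
                 * physical core id. */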
                write_msr(MSR_TSC_AUX, coreid);
                /* Busted versions of qemu bug out here (32 bit) */
                asm volatile ("rdtscp" : "=c"(rdtscp_ecx) : : "eax", "edx");
                if (!coreid && (read_msr(MSR_TSC_AUX) != rdtscp_ecx))
                        printk("\nBroken rdtscp detected, don't trust it for pcoreid!\n\n");
        }
}

/* TODO: consider merging __arch_pcpu with parts of this (sync with RISCV) */
void smp_final_core_init(void)
{
        /* Set the coreid in pcpui for fast access to it through TLS. */
        int coreid = get_os_coreid(hw_core_id());
        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
        pcpui->coreid = coreid;
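        /* Point GS at pcpui so core_id() and other per-cpu accesses work
         * through %gs.  We use the MSRs here since CR4.FSGSBASE (and thus
         * wrgsbase) isn't enabled until __arch_pcpu_init(). */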
        write_msr(MSR_GS_BASE, (uintptr_t)pcpui); /* our cr4 isn't set yet */
        write_msr(MSR_KERN_GS_BASE, (uint64_t)pcpui);
        /* don't need this for the kernel anymore, but userspace can still use
         * it */
        setup_rdtscp(coreid);
        /* After this point, all cores have set up their segmentation and
         * whatnot to be able to do a proper core_id(). */
        waiton_barrier(&generic_barrier);
        if (coreid == 0)
                core_id_ready = TRUE;
        /* being paranoid with this, it's all a bit ugly */
        waiton_barrier(&generic_barrier);
        setup_default_mtrrs(&generic_barrier);
        smp_percpu_init();
        waiton_barrier(&generic_barrier);
}

// this needs to be set in smp_entry too...
#define trampoline_pg 0x00001000UL
extern char smp_entry[];
extern char smp_entry_end[];
extern char smp_boot_lock[];
extern char smp_semaphore[];

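/* smp_boot_lock and smp_semaphore are symbols inside the smp_entry blob.  That
 * blob gets copied onto the trampoline page, so to reach them at runtime we
 * rebase the symbols from smp_entry onto trampoline_pg (still mapped low
 * during boot). */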
static inline uint16_t *get_smp_semaphore()
{
        return (uint16_t *)(smp_semaphore - smp_entry + trampoline_pg);
}

static void __spin_bootlock_raw(void)
{
        uint16_t *bootlock = (uint16_t*)(smp_boot_lock - smp_entry +
                                         trampoline_pg);

        /* Same lock code as in smp_entry */
        asm volatile ("movw $1, %%ax;   "
                      "1:               "
                      "xchgw %%ax, %0;  "
                      "test %%ax, %%ax; "
                      "jne 1b;" : : "m"(*bootlock) : "eax", "cc", "memory");
}

void smp_boot(void)
{
        struct per_cpu_info *pcpui0 = &per_cpu_info[0];
        page_t *smp_stack;

        // NEED TO GRAB A LOWMEM FREE PAGE FOR AP BOOTUP CODE
        // page1 (2nd page) is reserved, hardcoded in pmap.c
        memset(KADDR(trampoline_pg), 0, PGSIZE);
        memcpy(KADDR(trampoline_pg), (void *)smp_entry,
               smp_entry_end - smp_entry);

        /* Make sure the trampoline page is mapped.  64 bit already has the
         * tramp pg mapped (1 GB of lowmem), so this is a nop. */

        // Allocate a stack for the cores starting up.  One for all, must share
        if (kpage_alloc(&smp_stack))
                panic("No memory for SMP boot stack!");
        smp_stack_top = (uintptr_t)(page2kva(smp_stack) + PGSIZE);

        /* During SMP boot, core_id_early() returns 0, so all of the cores,
         * which grab locks concurrently, share the same pcpui and thus the same
         * lock_depth.  We need to disable checking until core_id works
         * properly. */
        pcpui0->__lock_checking_enabled = 0;
        // Start the IPI process (INIT, wait, SIPI, wait, SIPI, wait)
        send_init_ipi();
        // SDM 3A is a little wonky wrt the proper delays.  These are my best
        // guess.
        udelay(10000);
        // first SIPI
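        // The SIPI vector is the physical page number of the APs' real-mode
        // entry point: 0x01 -> 0x1000, which is trampoline_pg, where smp_entry
        // was copied.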
        send_startup_ipi(0x01);
        /* BOCHS does not like this second SIPI.
        // second SIPI
        udelay(200);
        send_startup_ipi(0x01);
        */
        udelay(500000);

        // Each core will also increment smp_semaphore, and decrement it when
        // it is done, all in smp_entry.  Its purpose is to keep Core0 from
        // competing for the smp_boot_lock.  So long as one AP increments the
        // sem before the final LAPIC timer goes off, all available cores will
        // be initialized.
        while (*get_smp_semaphore())
                cpu_relax();

        // From here on, no other cores are coming up.  Grab the lock to
        // ensure it.  Another core could be in its prelock phase and be
        // trying to grab the lock forever....
        // The lock exists on the trampoline, so it can be grabbed right away
        // in real mode.  If core0 wins the race, it blocks the other CPUs
        // from coming up; letting them proceed with booting could crash the
        // machine, specifically when they turn on paging and have that temp
        // mapping pulled out from under them.  Now, if a core loses, it will
        // spin on the trampoline (which we must be careful not to deallocate)
        __spin_bootlock_raw();
        printk("Number of Cores Detected: %d\n", x86_num_cores_booted);
#ifdef CONFIG_DISABLE_SMT
        assert(!(num_cores % 2));
        printk("Using only %d Idlecores (SMT Disabled)\n", num_cores >> 1);
#endif /* CONFIG_DISABLE_SMT */

        /* cleans up the trampoline page, and any other low boot mem mappings */
        x86_cleanup_bootmem();
        /* trampoline_pg had a refcount of 2 earlier, so we need to dec once
         * more to free it but only if all cores are in (or we reset / reinit
         * those that failed) */
        if (x86_num_cores_booted == num_cores) {
                /* TODO: if we ever alloc the trampoline_pg or something, we can
                 * free it here. */
        } else {
                warn("ACPI/MP found %d cores, smp_boot initialized %d, using %d\n",
                     num_cores, x86_num_cores_booted, x86_num_cores_booted);
                num_cores = x86_num_cores_booted;
        }
        // Dealloc the temp shared stack
        page_decref(smp_stack);

        // Set up the generic remote function call facility
        init_smp_call_function();

        /* Final core initialization */
        init_barrier(&generic_barrier, num_cores);
        /* This will break the cores out of their hlt in smp_entry.S */
        send_broadcast_ipi(I_POKE_CORE);
        smp_final_core_init();  /* need to init ourselves as well */
}

/* This is called from smp_entry by each core to finish the core bootstrapping.
 * There is a spinlock around this entire function in smp_entry, for a few
 * reasons, the most important being that all cores use the same stack when
 * entering here.
 *
 * Do not use per_cpu_info in here.  Do whatever you need in smp_percpu_init().
 */
uintptr_t smp_main(void)
{
        /* We need to fake being core 0 for our memory allocations to work
         * nicely.  This is safe since the entire machine is single threaded
         * while we are in this function. */
        write_msr(MSR_GS_BASE, (uintptr_t)&per_cpu_info[0]);

        // Get a per-core kernel stack
        uintptr_t my_stack_top = get_kstack();

        /* This blob is the GDT, the GDT PD, and the TSS. */
        unsigned int blob_size = sizeof(segdesc_t) * SEG_COUNT +
                                 sizeof(pseudodesc_t) + sizeof(taskstate_t);
        /* TODO: don't use kmalloc - might have issues in the future */
        void *gdt_etc = kmalloc(blob_size, 0);  /* we'll never free this btw */
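        /* Blob layout: the TSS sits at offset 0, then the GDT pseudo-descriptor,
         * then the GDT itself.  __arch_pcpu_init() walks this same layout when
         * it recovers the TSS and GDT via the stack bottom. */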
        taskstate_t *my_ts = gdt_etc;
        pseudodesc_t *my_gdt_pd = (void*)my_ts + sizeof(taskstate_t);
        segdesc_t *my_gdt = (void*)my_gdt_pd + sizeof(pseudodesc_t);

        /* This is a bit ghetto: we need to communicate our GDT and TSS's
         * location to smp_percpu_init(), but we can't trust our coreid (since
         * they haven't been remapped yet (so we can't write it directly to
         * per_cpu_info)).  So we use the bottom of the stack page... */
        *kstack_bottom_addr(my_stack_top) = (uintptr_t)gdt_etc;

        // Build and load the gdt / gdt_pd
        memcpy(my_gdt, gdt, sizeof(segdesc_t)*SEG_COUNT);
        *my_gdt_pd = (pseudodesc_t) {
                sizeof(segdesc_t)*SEG_COUNT - 1, (uintptr_t) my_gdt };
        asm volatile("lgdt %0" : : "m"(*my_gdt_pd));

        /* Set up our kernel stack when changing rings */
        x86_set_stacktop_tss(my_ts, my_stack_top);
        // Initialize the TSS field of my_gdt.
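        // GD_TSS >> 3 turns the segment selector into a GDT index;
        // SEG_SYS_SMALL builds the system-segment (TSS) descriptor in place.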
        syssegdesc_t *ts_slot = (syssegdesc_t*)&my_gdt[GD_TSS >> 3];
        *ts_slot = (syssegdesc_t)SEG_SYS_SMALL(STS_T32A, (uintptr_t)my_ts,
                                               sizeof(taskstate_t), 0);
        // Load the TSS
        ltr(GD_TSS);

        // Loads the same IDT used by the other cores
        asm volatile("lidt %0" : : "m"(idt_pd));

        apiconline();

        /* Stop pretending to be core 0.  We'll get our own coreid shortly and
         * set gs properly (smp_final_core_init()) */
        write_msr(MSR_GS_BASE, 0);

        return my_stack_top; // will be loaded in smp_entry.S
}

static void pcpu_init_nmi(struct per_cpu_info *pcpui)
{
        uintptr_t nmi_entry_stacktop = get_kstack();

        /* NMI handlers can't use swapgs for kernel TFs, so we need to bootstrap
         * a bit.  We'll use a little bit of space above the actual NMI stacktop
         * for storage for the pcpui pointer.  But we need to be careful: the HW
         * will align RSP to 16 bytes on entry. */
        nmi_entry_stacktop -= 16;
        *(uintptr_t*)nmi_entry_stacktop = (uintptr_t)pcpui;
        pcpui->tss->ts_ist1 = nmi_entry_stacktop;
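        /* The NMI IDT gate is set up to use IST1, so the CPU switches to this
         * stack on an NMI no matter what RSP was at the time. */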
        /* Our actual NMI work is done on yet another stack, to avoid the "iret
         * cancelling NMI protections" problem.  All problems can be solved with
         * another layer of indirection! */
        pcpui->nmi_worker_stacktop = get_kstack();
}

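/* Double faults get their own known-good stack via IST2, since by the time a
 * #DF fires, the normal kernel stack may no longer be usable. */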
static void pcpu_init_doublefault(struct per_cpu_info *pcpui)
{
        pcpui->tss->ts_ist2 = get_kstack();
}

/* Perform any initialization needed by per_cpu_info.  Make sure every core
 * calls this at some point in the smp_boot process.  If you don't smp_boot, you
 * must still call this for core 0.  This must NOT be called from smp_main,
 * since it relies on the kernel stack pointer to find the gdt.  Be careful not
 * to call it on too deep of a stack frame. */
void __arch_pcpu_init(uint32_t coreid)
{
        uintptr_t *my_stack_bot;
        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
        uint32_t eax, edx;

        /* Flushes any potentially old mappings from smp_boot() (note the page
         * table removal) */
        tlbflush();

        if (cpu_has_feat(CPU_FEAT_X86_FSGSBASE))
                lcr4(rcr4() | CR4_FSGSBASE);

        /*
         * Enable SSE instructions.
         * CR4.OSFXSR enables SSE and ensures that MXCSR/XMM gets saved with
         *            FXSAVE
         * CR4.OSXSAVE enables XSAVE instructions. Only set if XSAVE supported.
         * CR4.OSXMME indicates OS support for software exception handlers for
         * SIMD floating-point exceptions (turn it on to get #XM exceptions
         * in the event of a SIMD error instead of #UD exceptions).
         */
        lcr4(rcr4() | CR4_OSFXSR | CR4_OSXMME);

        if (cpu_has_feat(CPU_FEAT_X86_XSAVE)) {
                // You MUST set CR4.OSXSAVE before loading xcr0
                lcr4(rcr4() | CR4_OSXSAVE);
                // Set xcr0 to the Akaros-wide default
                lxcr0(__proc_global_info.x86_default_xcr0);
        }

        // Initialize fpu and extended state by restoring our default XSAVE
        // area.
        init_fp_state();

        /* core 0 set up earlier in idt_init() */
        if (coreid) {
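                /* smp_main() stashed the address of this core's GDT/TSS blob
                 * at the bottom of our kernel stack: the TSS is at offset 0,
                 * and the GDT follows the TSS and the GDT pseudo-descriptor. */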
                my_stack_bot = kstack_bottom_addr(ROUNDUP(read_sp() - 1,
                                                          PGSIZE));
                pcpui->tss = (taskstate_t*)(*my_stack_bot);
                pcpui->gdt = (segdesc_t*)(*my_stack_bot +
                                          sizeof(taskstate_t) +
                                          sizeof(pseudodesc_t));
        }
        assert(read_gsbase() == (uintptr_t)pcpui);
        assert(read_msr(MSR_KERN_GS_BASE) == (uint64_t)pcpui);
        /* Don't try setting up til after setting GS */
        x86_sysenter_init();
        x86_set_sysenter_stacktop(x86_get_stacktop_tss(pcpui->tss));
        pcpu_init_nmi(pcpui);
        pcpu_init_doublefault(pcpui);
        /* need to init perfctr before potentially using it in timer handler */
        perfmon_pcpu_init();
        vmm_pcpu_init();
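        /* Clear CR4.TSD so userspace can execute rdtsc/rdtscp without
         * faulting. */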
        lcr4(rcr4() & ~CR4_TSD);

        /* This should allow turbo mode.  I haven't found a doc that says how
         * deep we need to sleep.  At a minimum on some machines, it's C2.
         * Given that "C2 or deeper" pops up in a few other areas as a deeper
         * sleep (e.g.  mwaits on memory accesses from outside the processor
         * won't wake >= C2), this might be deep enough for turbo mode to kick
         * in. */
        set_fastest_pstate();
        set_cstate(X86_MWAIT_C2);
}