akaros/kern/src/process.c
   1/* Copyright (c) 2009, 2010 The Regents of the University of California
   2 * Barret Rhoden <brho@cs.berkeley.edu>
   3 * See LICENSE for details. */
   4
   5#include <event.h>
   6#include <arch/arch.h>
   7#include <bitmask.h>
   8#include <process.h>
   9#include <atomic.h>
  10#include <smp.h>
  11#include <pmap.h>
  12#include <trap.h>
  13#include <umem.h>
  14#include <schedule.h>
  15#include <manager.h>
  16#include <stdio.h>
  17#include <assert.h>
  18#include <time.h>
  19#include <hashtable.h>
  20#include <slab.h>
  21#include <sys/queue.h>
  22#include <monitor.h>
  23#include <elf.h>
  24#include <arsc_server.h>
  25#include <kmalloc.h>
  26#include <ros/procinfo.h>
  27#include <init.h>
  28#include <rcu.h>
  29#include <arch/intel-iommu.h>
  30
  31struct kmem_cache *proc_cache;
  32
  33/* Other helpers, implemented later. */
  34static bool is_mapped_vcore(struct proc *p, uint32_t pcoreid);
  35static uint32_t get_vcoreid(struct proc *p, uint32_t pcoreid);
  36static uint32_t try_get_pcoreid(struct proc *p, uint32_t vcoreid);
  37static uint32_t get_pcoreid(struct proc *p, uint32_t vcoreid);
  38static void __proc_free(struct kref *kref);
  39static bool scp_is_vcctx_ready(struct preempt_data *vcpd);
  40static void save_vc_fp_state(struct preempt_data *vcpd);
  41static void restore_vc_fp_state(struct preempt_data *vcpd);
  42
  43/* PID management. */
  44#define PID_MAX 32767 // goes from 0 to 32767, with 0 reserved
  45static DECL_BITMASK(pid_bmask, PID_MAX + 1);
  46spinlock_t pid_bmask_lock = SPINLOCK_INITIALIZER;
  47struct hashtable *pid_hash;
  48spinlock_t pid_hash_lock; // initialized in proc_init
  49
   50/* Finds the next free (zero) entry in the pid_bitmask.  Set means busy.
  51 * PID 0 is reserved (in proc_init).  A return value of 0 is a failure (and
  52 * you'll also see a warning, for now).  Consider doing this with atomics. */
  53static pid_t get_free_pid(void)
  54{
  55        static pid_t next_free_pid = 1;
  56        pid_t my_pid = 0;
  57
  58        spin_lock(&pid_bmask_lock);
   59        // atomically (can lock for now, then change to atomic_and_return)
  60        FOR_CIRC_BUFFER(next_free_pid, PID_MAX + 1, i) {
  61                // always points to the next to test
  62                next_free_pid = (next_free_pid + 1) % (PID_MAX + 1);
  63                if (!GET_BITMASK_BIT(pid_bmask, i)) {
  64                        SET_BITMASK_BIT(pid_bmask, i);
  65                        my_pid = i;
  66                        break;
  67                }
  68        }
  69        spin_unlock(&pid_bmask_lock);
  70        if (!my_pid)
  71                warn("Unable to find a PID!  You need to deal with this!\n");
  72        return my_pid;
  73}
  74
  75/* Return a pid to the pid bitmask */
  76static void put_free_pid(pid_t pid)
  77{
  78        spin_lock(&pid_bmask_lock);
  79        CLR_BITMASK_BIT(pid_bmask, pid);
  80        spin_unlock(&pid_bmask_lock);
  81}
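
/* Example of the usual pairing: a PID is pulled in proc_alloc() and returned in
 * __proc_free() once the proc is out of the pid_hash.  A minimal sketch (error
 * handling elided): */
#if 0
        pid_t pid = get_free_pid();

        if (!pid)
                return -ENOFREEPID;
        /* ... the PID belongs to the proc until teardown ... */
        put_free_pid(pid);
#endif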
  82
   83/* 'resume' is the time in ticks of the most recent onlining.  'total' is the
  84 * amount of time in ticks consumed up to and including the current offlining.
  85 *
  86 * We could move these to the map and unmap of vcores, though not every place
  87 * uses that (SCPs, in particular).  However, maps/unmaps happen remotely;
  88 * something to consider.  If we do it remotely, we can batch them up and do one
  89 * rdtsc() for all of them.  For now, I want to do them on the core, around when
  90 * we do the context change.  It'll also parallelize the accounting a bit. */
  91void vcore_account_online(struct proc *p, uint32_t vcoreid)
  92{
  93        struct vcore *vc = &p->procinfo->vcoremap[vcoreid];
  94
  95        vc->resume_ticks = read_tsc();
  96}
  97
  98void vcore_account_offline(struct proc *p, uint32_t vcoreid)
  99{
 100        struct vcore *vc = &p->procinfo->vcoremap[vcoreid];
 101        vc->total_ticks += read_tsc() - vc->resume_ticks;
 102}
 103
 104uint64_t vcore_account_gettotal(struct proc *p, uint32_t vcoreid)
 105{
 106        struct vcore *vc = &p->procinfo->vcoremap[vcoreid];
 107
 108        return vc->total_ticks;
 109}
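
/* Example: procinfo->tsc_freq (set in proc_init_procinfo() below) can turn the
 * raw tick total into wall-clock time.  A minimal sketch that assumes tsc_freq
 * is in ticks per second and ignores multiplication overflow; the helper name
 * is hypothetical: */
#if 0
static uint64_t vcore_online_usec(struct proc *p, uint32_t vcoreid)
{
        return vcore_account_gettotal(p, vcoreid) * 1000000 /
               p->procinfo->tsc_freq;
}
#endif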
 110
 111/* While this could be done with just an assignment, this gives us the
 112 * opportunity to check for bad transitions.  Might compile these out later, so
 113 * we shouldn't rely on them for sanity checking from userspace.  */
 114int __proc_set_state(struct proc *p, uint32_t state)
 115{
 116        uint32_t curstate = p->state;
 117        /* Valid transitions:
 118         * C   -> RBS
 119         * C   -> D
 120         * RBS -> RGS
 121         * RGS -> RBS
 122         * RGS -> W
 123         * RGM -> W
 124         * W   -> RBS
 125         * W   -> RGS
 126         * W   -> RBM
 127         * W   -> D
 128         * RGS -> RBM
 129         * RBM -> RGM
 130         * RGM -> RBM
 131         * RGM -> RBS
 132         * RGS -> D
 133         * RGM -> D
 134         * D   -> DA
 135         *
 136         * These ought to be implemented later (allowed, not thought through
 137         * yet).
 138         * RBS -> D
 139         * RBM -> D
 140         */
 141        #if 1 // some sort of correctness flag
 142        switch (curstate) {
 143        case PROC_CREATED:
 144                if (!(state & (PROC_RUNNABLE_S | PROC_DYING)))
 145                        goto invalid_state_transition;
 146                break;
 147        case PROC_RUNNABLE_S:
 148                if (!(state & (PROC_RUNNING_S | PROC_DYING)))
 149                        goto invalid_state_transition;
 150                break;
 151        case PROC_RUNNING_S:
 152                if (!(state & (PROC_RUNNABLE_S | PROC_RUNNABLE_M | PROC_WAITING
 153                               | PROC_DYING)))
 154                        goto invalid_state_transition;
 155                break;
 156        case PROC_WAITING:
 157                if (!(state & (PROC_RUNNABLE_S | PROC_RUNNING_S |
 158                               PROC_RUNNABLE_M | PROC_DYING)))
 159                        goto invalid_state_transition;
 160                break;
 161        case PROC_DYING:
 162                if (state != PROC_DYING_ABORT)
 163                        goto invalid_state_transition;
 164                break;
 165        case PROC_DYING_ABORT:
 166                goto invalid_state_transition;
 167        case PROC_RUNNABLE_M:
 168                if (!(state & (PROC_RUNNING_M | PROC_DYING)))
 169                        goto invalid_state_transition;
 170                break;
 171        case PROC_RUNNING_M:
 172                if (!(state & (PROC_RUNNABLE_S | PROC_RUNNABLE_M | PROC_WAITING
 173                               | PROC_DYING)))
 174                        goto invalid_state_transition;
 175                break;
 176invalid_state_transition:
 177                panic("Invalid State Transition! %s to %02x",
  178                      procstate2str(curstate), state);
 179        }
 180        #endif
 181        p->state = state;
 182        return 0;
 183}
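
/* Example: transitions are only safe with the proc_lock held, since we read
 * p->state and then write it.  A minimal sketch of the usual caller pattern
 * (see proc_run_s() and friends below): */
#if 0
        spin_lock(&p->proc_lock);
        if (p->state == PROC_RUNNABLE_S)
                __proc_set_state(p, PROC_RUNNING_S);
        spin_unlock(&p->proc_lock);
#endif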
 184
 185/* Returns a pointer to the proc with the given pid, or 0 if there is none.
 186 * This uses get_not_zero, since it is possible the refcnt is 0, which means the
 187 * process is dying and we should not have the ref (and thus return 0).  We need
  188 * to lock to protect against the race where we find p, someone else
  189 * removes and frees it, and then we call get_not_zero() on freed memory.
 190 * Don't push the locking into the hashtable without dealing with this. */
 191struct proc *pid2proc(pid_t pid)
 192{
 193        spin_lock(&pid_hash_lock);
 194        struct proc *p = hashtable_search(pid_hash, (void*)(long)pid);
 195
 196        if (p)
 197                if (!kref_get_not_zero(&p->p_kref, 1))
 198                        p = 0;
 199        spin_unlock(&pid_hash_lock);
 200        return p;
 201}
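
/* Example: the ref handed back by pid2proc() must be dropped by the caller.  A
 * minimal sketch (see proc_signal_parent() below for a real user): */
#if 0
        struct proc *p = pid2proc(pid);

        if (p) {
                /* p can't be freed out from under us while we hold the ref */
                proc_decref(p);
        }
#endif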
 202
 203/* Used by devproc for successive reads of the proc table.
 204 * Returns a pointer to the nth proc, or 0 if there is none.
 205 * This uses get_not_zero, since it is possible the refcnt is 0, which means the
 206 * process is dying and we should not have the ref (and thus return 0).  We need
  207 * to lock to protect against the race where we find p, someone else
  208 * removes and frees it, and then we call get_not_zero() on freed memory.
 209 * Don't push the locking into the hashtable without dealing with this. */
 210struct proc *pid_nth(unsigned int n)
 211{
 212        struct proc *p;
 213        spin_lock(&pid_hash_lock);
 214        if (!hashtable_count(pid_hash)) {
 215                spin_unlock(&pid_hash_lock);
 216                return NULL;
 217        }
 218        struct hashtable_itr *iter = hashtable_iterator(pid_hash);
 219        p = hashtable_iterator_value(iter);
 220
 221        while (p) {
 222                /* if this process is not valid, it doesn't count,
 223                 * so continue
 224                 */
 225
 226                if (kref_get_not_zero(&p->p_kref, 1)) {
 227                        /* this one counts */
  228                        if (!n) {
 229                                printd("pid_nth: at end, p %p\n", p);
 230                                break;
 231                        }
 232                        kref_put(&p->p_kref);
 233                        n--;
 234                }
 235                if (!hashtable_iterator_advance(iter)) {
 236                        p = NULL;
 237                        break;
 238                }
 239                p = hashtable_iterator_value(iter);
 240        }
 241
 242        spin_unlock(&pid_hash_lock);
 243        kfree(iter);
 244        return p;
 245}
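
/* Example: devproc-style walk of the process table.  Every non-NULL return
 * carries a reference the caller must put back.  A minimal sketch: */
#if 0
        for (unsigned int i = 0; ; i++) {
                struct proc *p = pid_nth(i);

                if (!p)
                        break;
                printk("pid %d: %s\n", p->pid, p->progname);
                proc_decref(p);
        }
#endif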
 246
  247/* Performs any initialization related to processes, such as creating the proc
 248 * cache, prep the scheduler, etc.  When this returns, we should be ready to use
 249 * any process related function. */
 250void proc_init(void)
 251{
 252        /* Catch issues with the vcoremap and TAILQ_ENTRY sizes */
 253        static_assert(sizeof(TAILQ_ENTRY(vcore)) == sizeof(void*) * 2);
 254        proc_cache = kmem_cache_create("proc", sizeof(struct proc),
 255                                       MAX(ARCH_CL_SIZE,
 256                                       __alignof__(struct proc)), 0, NULL, 0,
 257                                       0, NULL);
 258        /* Init PID mask and hash.  pid 0 is reserved. */
 259        SET_BITMASK_BIT(pid_bmask, 0);
 260        spinlock_init(&pid_hash_lock);
 261        spin_lock(&pid_hash_lock);
 262        pid_hash = create_hashtable(100, __generic_hash, __generic_eq);
 263        spin_unlock(&pid_hash_lock);
 264        schedule_init();
 265
 266        atomic_init(&num_envs, 0);
 267}
 268
 269void proc_set_username(struct proc *p, char *name)
 270{
 271        set_username(&p->user, name);
 272}
 273
 274/*
 275 * Copies username from the parent process. This is the only case where a
  276 * reader blocks writers, just to be extra safe during process initialization.
 277 *
 278 * Note that since this is intended to be called during initialization, the
 279 * child's name lock is NOT used for writing. Nothing else should be able to
 280 * read or write yet, so this can be a simple memcpy once the parent is locked.
 281 */
 282void proc_inherit_parent_username(struct proc *child, struct proc *parent)
 283{
 284        spin_lock(&parent->user.name_lock);
 285
 286        // copy entire parent buffer for constant runtime
 287        memcpy(child->user.name, parent->user.name, sizeof(child->user.name));
 288
 289        spin_unlock(&parent->user.name_lock);
 290}
 291
 292void proc_set_progname(struct proc *p, char *name)
 293{
 294        if (name == NULL)
 295                name = DEFAULT_PROGNAME;
 296
 297        /* might have an issue if a dentry name isn't null terminated, and we'd
 298         * get extra junk up to progname_sz. Or crash. */
 299        strlcpy(p->progname, name, PROC_PROGNAME_SZ);
 300}
 301
 302void proc_replace_binary_path(struct proc *p, char *path)
 303{
 304        if (p->binary_path)
 305                free_path(p, p->binary_path);
 306        p->binary_path = path;
 307}
 308
 309/* Be sure you init'd the vcore lists before calling this. */
 310void proc_init_procinfo(struct proc* p)
 311{
 312        p->procinfo->pid = p->pid;
 313        p->procinfo->ppid = p->ppid;
 314        p->procinfo->max_vcores = max_vcores(p);
 315        p->procinfo->tsc_freq = __proc_global_info.tsc_freq;
 316        p->procinfo->timing_overhead = __proc_global_info.tsc_overhead;
 317        p->procinfo->program_end = 0;
 318        /* 0'ing the arguments.  Some higher function will need to set them */
 319        memset(p->procinfo->res_grant, 0, sizeof(p->procinfo->res_grant));
 320        /* 0'ing the vcore/pcore map.  Will link the vcores later. */
 321        memset(&p->procinfo->vcoremap, 0, sizeof(p->procinfo->vcoremap));
 322        memset(&p->procinfo->pcoremap, 0, sizeof(p->procinfo->pcoremap));
 323        p->procinfo->num_vcores = 0;
 324        p->procinfo->is_mcp = FALSE;
 325        p->procinfo->coremap_seqctr = SEQCTR_INITIALIZER;
 326        /* It's a bug in the kernel if we let them ask for more than max */
 327        for (int i = 0; i < p->procinfo->max_vcores; i++) {
 328                TAILQ_INSERT_TAIL(&p->inactive_vcs, &p->procinfo->vcoremap[i],
 329                                  list);
 330        }
 331}
 332
 333void proc_init_procdata(struct proc *p)
 334{
 335        memset(p->procdata, 0, sizeof(struct procdata));
 336        /* processes can't go into vc context on vc 0 til they unset this.  This
 337         * is for processes that block before initing uthread code (like rtld).
 338         */
 339        atomic_set(&p->procdata->vcore_preempt_data[0].flags, VC_SCP_NOVCCTX);
 340}
 341
 342static void proc_open_stdfds(struct proc *p)
 343{
 344        int fd;
 345        struct proc *old_current = current;
 346
 347        /* Due to the way the syscall helpers assume the target process is
 348         * current, we need to set current temporarily.  We don't use switch_to,
 349         * since that actually loads the process's address space, which might be
 350         * empty or incomplete.  These syscalls shouldn't access user memory,
 351         * especially considering how we're probably in the boot pgdir. */
 352        current = p;
 353        fd = sysopenat(AT_FDCWD, "#cons/stdin", O_READ, 0);
 354        assert(fd == 0);
 355        fd = sysopenat(AT_FDCWD, "#cons/stdout", O_WRITE, 0);
 356        assert(fd == 1);
 357        fd = sysopenat(AT_FDCWD, "#cons/stderr", O_WRITE, 0);
 358        assert(fd == 2);
 359        current = old_current;
 360}
 361
 362/* Allocates and initializes a process, with the given parent.  Currently
 363 * writes the *p into **pp, and returns 0 on success, < 0 for an error.
 364 * Errors include:
 365 *  - ENOFREEPID if it can't get a PID
 366 *  - ENOMEM on memory exhaustion */
 367error_t proc_alloc(struct proc **pp, struct proc *parent, int flags)
 368{
 369        error_t r;
 370        struct proc *p;
 371
 372        if (!(p = kmem_cache_alloc(proc_cache, 0)))
 373                return -ENOMEM;
 374        /* zero everything by default, other specific items are set below */
 375        memset(p, 0, sizeof(*p));
 376
 377        /* only one ref, which we pass back.  the old 'existence' ref is managed
 378         * by the ksched */
 379        kref_init(&p->p_kref, __proc_free, 1);
 380        /* Initialize the address space */
 381        if ((r = env_setup_vm(p)) < 0) {
 382                kmem_cache_free(proc_cache, p);
 383                return r;
 384        }
 385        if (!(p->pid = get_free_pid())) {
 386                kmem_cache_free(proc_cache, p);
 387                return -ENOFREEPID;
 388        }
 389        if (parent && parent->binary_path)
 390                kstrdup(&p->binary_path, parent->binary_path);
 391        /* Set the basic status variables. */
 392        spinlock_init(&p->proc_lock);
 393        spinlock_init(&p->user.name_lock);
 394        /* so we can see processes killed by the kernel */
 395        p->exitcode = 1337;
 396        if (parent) {
 397                p->ppid = parent->pid;
 398                proc_inherit_parent_username(p, parent);
 399                proc_incref(p, 1);      /* storing a ref in the parent */
 400                /* using the CV's lock to protect anything related to child
 401                 * waiting */
 402                cv_lock(&parent->child_wait);
 403                TAILQ_INSERT_TAIL(&parent->children, p, sibling_link);
 404                cv_unlock(&parent->child_wait);
 405        } else {
 406                p->ppid = 0;
 407                strlcpy(p->user.name, eve.name, sizeof(p->user.name));
 408                printk("Parentless process assigned username '%s'\n",
 409                       p->user.name);
 410        }
 411        TAILQ_INIT(&p->children);
 412        cv_init(&p->child_wait);
 413        /* shouldn't go through state machine for init */
 414        p->state = PROC_CREATED;
 415        p->env_flags = 0;
 416        spinlock_init(&p->vmr_lock);
 417        spinlock_init(&p->pte_lock);
 418        TAILQ_INIT(&p->vm_regions); /* could init this in the slab */
 419        p->vmr_history = 0;
 420        /* Initialize the vcore lists, we'll build the inactive list so that it
 421         * includes all vcores when we initialize procinfo.  Do this before
 422         * initing procinfo. */
 423        TAILQ_INIT(&p->online_vcs);
 424        TAILQ_INIT(&p->bulk_preempted_vcs);
 425        TAILQ_INIT(&p->inactive_vcs);
 426        /* Init procinfo/procdata.  Procinfo's argp/argb are 0'd */
 427        proc_init_procinfo(p);
 428        proc_init_procdata(p);
 429
 430        /* Initialize the generic sysevent ring buffer */
 431        SHARED_RING_INIT(&p->procdata->syseventring);
 432        /* Initialize the frontend of the sysevent ring buffer */
 433        FRONT_RING_INIT(&p->syseventfrontring,
 434                        &p->procdata->syseventring,
 435                        SYSEVENTRINGSIZE);
 436
 437        /* Init FS structures TODO: cleanup (might pull this out) */
 438        p->umask = parent ? parent->umask : S_IWGRP | S_IWOTH;
 439        memset(&p->open_files, 0, sizeof(p->open_files)); /* slightly ghetto */
 440        spinlock_init(&p->open_files.lock);
 441        p->open_files.max_files = NR_OPEN_FILES_DEFAULT;
 442        p->open_files.max_fdset = NR_FILE_DESC_DEFAULT;
 443        p->open_files.fd = p->open_files.fd_array;
 444        p->open_files.open_fds = (struct fd_set*)&p->open_files.open_fds_init;
 445        if (parent) {
 446                if (flags & PROC_DUP_FGRP)
 447                        clone_fdt(&parent->open_files, &p->open_files);
 448        } else {
 449                /* no parent, we're created from the kernel */
 450                proc_open_stdfds(p);
 451        }
 452        /* Init the ucq hash lock */
 453        p->ucq_hashlock = (struct hashlock*)&p->ucq_hl_noref;
 454        hashlock_init_irqsave(p->ucq_hashlock, HASHLOCK_DEFAULT_SZ);
 455
 456        atomic_inc(&num_envs);
 457        plan9setup(p, parent, flags);
 458        devalarm_init(p);
 459        TAILQ_INIT(&p->abortable_sleepers);
 460        spinlock_init_irqsave(&p->abort_list_lock);
 461        memset(&p->vmm, 0, sizeof(struct vmm));
 462        spinlock_init(&p->vmm.lock);
 463        qlock_init(&p->vmm.qlock);
 464        qlock_init(&p->dev_qlock);
 465        TAILQ_INIT(&p->pci_devs);
 466        INIT_LIST_HEAD(&p->iommus);
 467        printd("[%08x] new process %08x\n", current ? current->pid : 0, p->pid);
 468        *pp = p;
 469        return 0;
 470}
 471
 472/* We have a bunch of different ways to make processes.  Call this once the
 473 * process is ready to be used by the rest of the system.  For now, this just
 474 * means when it is ready to be named via the pidhash.  In the future, we might
 475 * push setting the state to CREATED into here. */
 476void __proc_ready(struct proc *p)
 477{
 478        /* Tell the ksched about us.  TODO: do we need to worry about the ksched
 479         * doing stuff to us before we're added to the pid_hash? */
 480        __sched_proc_register(p);
 481        spin_lock(&pid_hash_lock);
 482        hashtable_insert(pid_hash, (void*)(long)p->pid, p);
 483        spin_unlock(&pid_hash_lock);
 484}
 485
 486/* Creates a process from the specified file, argvs, and envps. */
 487struct proc *proc_create(struct file_or_chan *prog, char **argv, char **envp)
 488{
 489        struct proc *p;
 490        error_t r;
 491        int ret;
 492
 493        if ((r = proc_alloc(&p, current, 0 /* flags */)) < 0)
 494                panic("proc_create: %d", r);
 495        int argc = 0, envc = 0;
  496        if (argv) while (argv[argc]) argc++;
  497        if (envp) while (envp[envc]) envc++;
 498        proc_set_progname(p, argc ? argv[0] : NULL);
 499        ret = load_elf(p, prog, argc, argv, envc, envp);
 500        assert(ret == 0);
 501        __proc_ready(p);
 502        return p;
 503}
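
/* Example: kernel-launched processes pair proc_create() with a wakeup and a
 * decref, since the single ref from proc_alloc() is passed back to us.  A
 * minimal sketch, assuming 'prog' is an already-opened program chan (this
 * mirrors what the manager/monitor code does): */
#if 0
        char *p_argv[] = {"/bin/hello", NULL};
        char *p_envp[] = {NULL};
        struct proc *p = proc_create(prog, p_argv, p_envp);

        proc_wakeup(p);         /* hand it to the ksched as runnable */
        proc_decref(p);         /* drop the ref proc_create() gave us */
#endif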
 504
 505static int __cb_assert_no_pg(struct proc *p, pte_t pte, void *va, void *arg)
 506{
 507        assert(pte_is_unmapped(pte));
 508        return 0;
 509}
 510
 511/* This is called by kref_put(), once the last reference to the process is
 512 * gone.  Don't call this otherwise (it will panic).  It will clean up the
 513 * address space and deallocate any other used memory. */
 514static void __proc_free(struct kref *kref)
 515{
 516        struct proc *p = container_of(kref, struct proc, p_kref);
 517        void *hash_ret;
 518        physaddr_t pa;
 519
 520        printd("[PID %d] freeing proc: %d\n", current ? current->pid : 0,
 521               p->pid);
 522        // All parts of the kernel should have decref'd before __proc_free is
 523        // called
 524        assert(kref_refcnt(&p->p_kref) == 0);
 525        assert(TAILQ_EMPTY(&p->alarmset.list));
 526
 527        if (p->strace) {
 528                kref_put(&p->strace->procs);
 529                kref_put(&p->strace->users);
 530        }
 531        teardown_dma_arena(p);
 532        __vmm_struct_cleanup(p);
 533        p->progname[0] = 0;
 534        free_path(p, p->binary_path);
 535        cclose(p->dot);
 536        cclose(p->slash);
 537        p->dot = p->slash = 0; /* catch bugs */
 538        /* now we'll finally decref files for the file-backed vmrs */
 539        unmap_and_destroy_vmrs(p);
 540        /* Remove us from the pid_hash and give our PID back (in that order). */
 541        spin_lock(&pid_hash_lock);
 542        hash_ret = hashtable_remove(pid_hash, (void*)(long)p->pid);
 543        spin_unlock(&pid_hash_lock);
 544        /* might not be in the hash/ready, if we failed during proc creation */
 545        if (hash_ret)
 546                put_free_pid(p->pid);
 547        else
 548                printd("[kernel] pid %d not in the PID hash in %s\n", p->pid,
 549                       __FUNCTION__);
 550        /* All memory below UMAPTOP should have been freed via the VMRs.  The
 551         * stuff above is the global info/page and procinfo/procdata.  We free
 552         * procinfo and procdata, but not the global memory - that's system
 553         * wide.  We could clear the PTEs of the upper stuff (UMAPTOP to UVPT),
 554         * but we shouldn't need to. */
 555        env_user_mem_walk(p, 0, UMAPTOP, __cb_assert_no_pg, 0);
 556        kpages_free(p->procinfo, PROCINFO_NUM_PAGES * PGSIZE);
 557        kpages_free(p->procdata, PROCDATA_NUM_PAGES * PGSIZE);
 558
 559        env_pagetable_free(p);
 560        arch_pgdir_clear(&p->env_pgdir);
 561        p->env_cr3 = 0;
 562
 563        atomic_dec(&num_envs);
 564
 565        /* Dealloc the struct proc */
 566        kmem_cache_free(proc_cache, p);
 567}
 568
 569/* Whether or not actor can control target.  TODO: do something reasonable here.
 570 * Just checking for the parent is a bit limiting.  Could walk the parent-child
 571 * tree, check user ids, or some combination.  Make sure actors can always
 572 * control themselves. */
 573bool proc_controls(struct proc *actor, struct proc *target)
 574{
 575        return TRUE;
 576        #if 0 /* Example: */
 577        return ((actor == target) || (target->ppid == actor->pid));
 578        #endif
 579}
 580
 581/* Helper to incref by val.  Using the helper to help debug/interpose on proc
 582 * ref counting.  Note that pid2proc doesn't use this interface. */
 583void proc_incref(struct proc *p, unsigned int val)
 584{
 585        kref_get(&p->p_kref, val);
 586}
 587
 588/* Helper to decref for debugging.  Don't directly kref_put() for now. */
 589void proc_decref(struct proc *p)
 590{
 591        kref_put(&p->p_kref);
 592}
 593
 594/* Helper, makes p the 'current' process, dropping the old current/cr3.  This no
 595 * longer assumes the passed in reference already counted 'current'.  It will
 596 * incref internally when needed. */
 597static void __set_proc_current(struct proc *p)
 598{
 599        /* We use the pcpui to access 'current' to cut down on the core_id()
  600         * calls, though who knows how expensive/painful they are. */
 601        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 602        struct proc *old_proc;
 603
 604        /* If the process wasn't here, then we need to load its address space */
 605        if (p != pcpui->cur_proc) {
 606                proc_incref(p, 1);
 607                lcr3(p->env_cr3);
 608                /* This is "leaving the process context" of the previous proc.
 609                 * The previous lcr3 unloaded the previous proc's context.  This
 610                 * should rarely happen, since we usually proactively leave
 611                 * process context, but this is the fallback. */
 612                old_proc = pcpui->cur_proc;
 613                pcpui->cur_proc = p;
 614                if (old_proc)
 615                        proc_decref(old_proc);
 616        }
 617}
 618
 619/* Flag says if vcore context is not ready, which is set in init_procdata.  The
 620 * process must turn off this flag on vcore0 at some point.  It's off by default
 621 * on all other vcores. */
 622static bool scp_is_vcctx_ready(struct preempt_data *vcpd)
 623{
 624        return !(atomic_read(&vcpd->flags) & VC_SCP_NOVCCTX);
 625}
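
/* Example: vcore 0 starts with VC_SCP_NOVCCTX set (see proc_init_procdata()
 * above); userspace clears it once its vcore-context entry path works.  A rough
 * sketch of the user-side idea - vcpd_of() is a parlib-style accessor and is an
 * assumption here: */
#if 0
        struct preempt_data *vcpd = vcpd_of(0);

        atomic_and(&vcpd->flags, ~VC_SCP_NOVCCTX);
#endif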
 626
 627/* Dispatches a _S process to run on the current core.  This should never be
 628 * called to "restart" a core.
 629 *
 630 * This will always return, regardless of whether or not the calling core is
 631 * being given to a process. (it used to pop the tf directly, before we had
 632 * cur_ctx).
 633 *
 634 * Since it always returns, it will never "eat" your reference (old
 635 * documentation talks about this a bit). */
 636void proc_run_s(struct proc *p)
 637{
 638        uint32_t coreid = core_id();
 639        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
 640        struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[0];
 641
 642        spin_lock(&p->proc_lock);
 643        switch (p->state) {
 644        case (PROC_DYING):
 645        case (PROC_DYING_ABORT):
 646                spin_unlock(&p->proc_lock);
 647                printk("[kernel] _S %d not starting: async death\n",
 648                       p->pid);
 649                return;
 650        case (PROC_RUNNABLE_S):
 651                __proc_set_state(p, PROC_RUNNING_S);
 652                /* SCPs don't have full vcores, but they act like they have
 653                 * vcore 0.  We map the vcore, since we will want to know where
 654                 * this process is running, even if it is only in RUNNING_S.  We
 655                 * can use the vcoremap, which makes death easy.  num_vcores is
 656                 * still 0, and we do account the time online and offline. */
 657                __seq_start_write(&p->procinfo->coremap_seqctr);
 658                p->procinfo->num_vcores = 0;
 659                __map_vcore(p, 0, coreid);
 660                vcore_account_online(p, 0);
 661                __seq_end_write(&p->procinfo->coremap_seqctr);
 662                /* incref, since we're saving a reference in owning proc later*/
 663                proc_incref(p, 1);
 664                /* lock was protecting the state and VC mapping, not pcpui stuff
 665                 */
 666                spin_unlock(&p->proc_lock);
 667                /* redundant with proc_startcore, might be able to remove that
 668                 * one */
 669                __set_proc_current(p);
 670                /* set us up as owning_proc.  ksched bug if there is already
 671                 * one, for now.  can simply clear_owning if we want to. */
 672                assert(!pcpui->owning_proc);
 673                pcpui->owning_proc = p;
 674                pcpui->owning_vcoreid = 0;
 675                restore_vc_fp_state(vcpd);
 676                /* similar to the old __startcore, start them in vcore context
 677                 * if they have notifs and aren't already in vcore context.
 678                 * o/w, start them wherever they were before (could be either vc
 679                 * ctx or not) */
 680                if (!vcpd->notif_disabled && vcpd->notif_pending
 681                                          && scp_is_vcctx_ready(vcpd)) {
 682                        vcpd->notif_disabled = TRUE;
 683                        /* save the _S's ctx in the uthread slot, build and pop
 684                         * a new one in actual/cur_ctx. */
 685                        vcpd->uthread_ctx = p->scp_ctx;
 686                        pcpui->cur_ctx = &pcpui->actual_ctx;
 687                        memset(pcpui->cur_ctx, 0, sizeof(struct user_context));
 688                        proc_init_ctx(pcpui->cur_ctx, 0, vcpd->vcore_entry,
 689                                      vcpd->vcore_stack, vcpd->vcore_tls_desc);
 690                } else {
 691                        /* If they have no transition stack, then they can't
 692                         * receive events.  The most they are getting is a
 693                         * wakeup from the kernel.  They won't even turn off
 694                         * notif_pending, so we'll do that for them. */
 695                        if (!scp_is_vcctx_ready(vcpd))
 696                                vcpd->notif_pending = FALSE;
 697                        /* this is one of the few times cur_ctx != &actual_ctx*/
 698                        pcpui->cur_ctx = &p->scp_ctx;
 699                }
 700                /* When the calling core idles, it'll call restartcore and run
 701                 * the _S process's context. */
 702                return;
 703        default:
 704                spin_unlock(&p->proc_lock);
 705                panic("Invalid process state %p in %s()!!", p->state,
 706                      __FUNCTION__);
 707        }
 708}
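
/* Example: rough shape of the path that gets an SCP running on this core.  A
 * sketch only - the real caller is the ksched, and the comments above describe
 * the handoff: */
#if 0
        proc_run_s(p);          /* maps vcore 0, sets owning_proc/cur_ctx, returns */
        proc_restartcore();     /* does not return; pops cur_ctx via smp_idle() */
#endif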
 709
 710/* Helper: sends preempt messages to all vcores on the bulk preempt list, and
 711 * moves them to the inactive list. */
 712static void __send_bulkp_events(struct proc *p)
 713{
 714        struct vcore *vc_i, *vc_temp;
 715        struct event_msg preempt_msg = {0};
 716
 717        /* Whenever we send msgs with the proc locked, we need at least 1 online
 718         */
 719        assert(!TAILQ_EMPTY(&p->online_vcs));
 720        /* Send preempt messages for any left on the BP list.  No need to set
 721         * any flags, it all was done on the real preempt.  Now we're just
 722         * telling the process about any that didn't get restarted and are still
 723         * preempted. */
 724        TAILQ_FOREACH_SAFE(vc_i, &p->bulk_preempted_vcs, list, vc_temp) {
 725                /* Note that if there are no active vcores, send_k_e will post
 726                 * to our own vcore, the last of which will be put on the
 727                 * inactive list and be the first to be started.  We could have
 728                 * issues with deadlocking, since send_k_e() could grab the
 729                 * proclock (if there are no active vcores) */
 730                preempt_msg.ev_type = EV_VCORE_PREEMPT;
 731                preempt_msg.ev_arg2 = vcore2vcoreid(p, vc_i); /* arg2 32 bits */
 732                send_kernel_event(p, &preempt_msg, 0);
 733                /* TODO: we may want a TAILQ_CONCAT_HEAD, or something that does
 734                 * that.  We need a loop for the messages, but not necessarily
 735                 * for the list changes.  */
 736                TAILQ_REMOVE(&p->bulk_preempted_vcs, vc_i, list);
 737                TAILQ_INSERT_HEAD(&p->inactive_vcs, vc_i, list);
 738        }
 739}
 740
 741/* Run an _M.  Can be called safely on one that is already running.  Hold the
 742 * lock before calling.  Other than state checks, this just starts up the _M's
 743 * vcores, much like the second part of give_cores_running.  More specifically,
 744 * give_cores_runnable puts cores on the online list, which this then sends
 745 * messages to.  give_cores_running immediately puts them on the list and sends
 746 * the message.  the two-step style may go out of fashion soon.
 747 *
 748 * This expects that the "instructions" for which core(s) to run this on will be
 749 * in the vcoremap, which needs to be set externally (give_cores()). */
 750void __proc_run_m(struct proc *p)
 751{
 752        struct vcore *vc_i;
 753        switch (p->state) {
 754        case (PROC_WAITING):
 755        case (PROC_DYING):
 756        case (PROC_DYING_ABORT):
 757                warn("ksched tried to run proc %d in state %s\n", p->pid,
 758                     procstate2str(p->state));
 759                return;
 760        case (PROC_RUNNABLE_M):
 761                /* vcoremap[i] holds the coreid of the physical core allocated
 762                 * to this process.  It is set outside proc_run. */
 763                if (p->procinfo->num_vcores) {
 764                        __send_bulkp_events(p);
 765                        __proc_set_state(p, PROC_RUNNING_M);
 766                        /* Up the refcnt, to avoid the n refcnt upping on the
 767                         * destination cores.  Keep in sync with __startcore */
 768                        proc_incref(p, p->procinfo->num_vcores * 2);
 769                        /* Send kernel messages to all online vcores (which were
 770                         * added to the list and mapped in __proc_give_cores()),
 771                         * making them turn online */
 772                        TAILQ_FOREACH(vc_i, &p->online_vcs, list) {
 773                                send_kernel_message(vc_i->pcoreid, __startcore,
 774                                        (long)p,
 775                                        (long)vcore2vcoreid(p, vc_i),
 776                                        (long)vc_i->nr_preempts_sent,
 777                                        KMSG_ROUTINE);
 778                        }
 779                } else {
 780                        warn("Tried to proc_run() an _M with no vcores!");
 781                }
  782                /* There is a subtle race avoidance here (when we unlock after
 783                 * sending the message).  __proc_startcore can handle a death
 784                 * message, but we can't have the startcore come after the death
 785                 * message.  Otherwise, it would look like a new process.  So we
 786                 * hold the lock til after we send our message, which prevents a
 787                 * possible death message.
 788                 * - Note there is no guarantee this core's interrupts were on,
 789                 *   so it may not get the message for a while... */
 790                return;
 791        case (PROC_RUNNING_M):
 792                return;
 793        default:
 794                /* unlock just so the monitor can call something that might
 795                 * lock*/
 796                spin_unlock(&p->proc_lock);
 797                panic("Invalid process state %p in %s()!!", p->state,
 798                      __FUNCTION__);
 799        }
 800}
 801
 802/* You must disable IRQs and PRKM before calling this.
 803 *
 804 * Actually runs the given context (trapframe) of process p on the core this
 805 * code executes on.  This is called directly by __startcore, which needs to
 806 * bypass the routine_kmsg check.  Interrupts should be off when you call this.
 807 *
 808 * A note on refcnting: this function will not return, and your proc reference
 809 * will be ignored (not decreffed).  It may be incref'd, if cur_proc was not
 810 * set.  Pass in an already-accounted-for ref, such as owning_proc. */
 811void __proc_startcore(struct proc *p, struct user_context *ctx)
 812{
 813        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 814
 815        assert(!irq_is_enabled());
 816        /* Should never have ktask still set.  If we do, future syscalls could
 817         * try to block later and lose track of our address space. */
 818        assert(!is_ktask(pcpui->cur_kthread));
 819        __set_proc_current(p);
 820        __set_cpu_state(pcpui, CPU_STATE_USER);
 821        proc_pop_ctx(ctx);
 822}
 823
 824/* Restarts/runs the current_ctx, which must be for the current process, on the
 825 * core this code executes on.
 826 *
 827 * For now, we just smp_idle.  We used to do something similar, but customized
 828 * for expecting to return to the process.  But it was a source of bugs.  If we
 829 * want to optimize for the case where we know we had a process current, then we
 830 * can do so here.
 831 *
 832 * Note that PRKM currently calls smp_idle() if it ever has a message, so the
 833 * value of optimizing may depend on the semantics of PRKM. */
 834void proc_restartcore(void)
 835{
 836        smp_idle();
 837}
 838
 839/* Helper for proc_destroy.  Disowns any children. */
 840static void proc_disown_children(struct proc *parent)
 841{
 842        struct proc *child_i, *temp;
 843        struct proc_list todo = TAILQ_HEAD_INITIALIZER(todo);
 844        int ret;
 845
 846        cv_lock(&parent->child_wait);
 847        TAILQ_FOREACH_SAFE(child_i, &parent->children, sibling_link, temp) {
 848                ret = __proc_disown_child(parent, child_i);
 849                /* should never fail, lock should cover the race.  invariant:
 850                 * any child on the list should have us as a parent */
 851                assert(!ret);
 852                TAILQ_INSERT_TAIL(&todo, child_i, sibling_link);
 853        }
 854        cv_unlock(&parent->child_wait);
 855
 856        TAILQ_FOREACH_SAFE(child_i, &todo, sibling_link, temp)
 857                proc_decref(child_i);
 858}
 859
 860/* Destroys the process.  It will destroy the process and return any cores
 861 * to the ksched via the __sched_proc_destroy() CB.
 862 *
 863 * Here's the way process death works:
 864 * 0. grab the lock (protects state transition and core map)
 865 * 1. set state to dying.  that keeps the kernel from doing anything for the
 866 * process (like proc_running it).
 867 * 2. figure out where the process is running (cross-core/async or RUNNING_M)
 868 * 3. IPI to clean up those cores (decref, etc).
 869 * 4. Unlock
 870 * 5. Clean up your core, if applicable
 871 * (Last core/kernel thread to decref cleans up and deallocates resources.)
 872 *
 873 * Note that some cores can be processing async calls, but will eventually
 874 * decref.  Should think about this more, like some sort of callback/revocation.
 875 *
 876 * This function will now always return (it used to not return if the calling
 877 * core was dying).  However, when it returns, a kernel message will eventually
 878 * come in, making you abandon_core, as if you weren't running.  It may be that
 879 * the only reference to p is the one you passed in, and when you decref, it'll
 880 * get __proc_free()d. */
 881void proc_destroy(struct proc *p)
 882{
 883        uint32_t nr_cores_revoked = 0;
 884        struct kthread *sleeper;
 885        struct proc *child_i, *temp;
 886
 887        spin_lock(&p->proc_lock);
 888        /* storage for pc_arr is alloced at decl, which is after grabbing the
 889         * lock*/
 890        uint32_t pc_arr[p->procinfo->num_vcores];
 891        switch (p->state) {
 892        case PROC_DYING: /* someone else killed this already. */
 893        case (PROC_DYING_ABORT):
 894                spin_unlock(&p->proc_lock);
 895                return;
 896        case PROC_CREATED:
 897        case PROC_RUNNABLE_S:
 898        case PROC_WAITING:
 899                break;
 900        case PROC_RUNNABLE_M:
 901        case PROC_RUNNING_M:
 902                /* Need to reclaim any cores this proc might have, even if it's
 903                 * not running yet.  Those running will receive a __death */
 904                nr_cores_revoked = __proc_take_allcores(p, pc_arr, FALSE);
 905                break;
 906        case PROC_RUNNING_S:
 907                #if 0
 908                // here's how to do it manually
 909                if (current == p) {
 910                        lcr3(boot_cr3);
 911                        current = NULL;
 912                        proc_decref(p);         /* this decref is for the cr3 */
 913                }
 914                #endif
 915                send_kernel_message(get_pcoreid(p, 0), __death, (long)p, 0, 0,
 916                                    KMSG_ROUTINE);
 917                __seq_start_write(&p->procinfo->coremap_seqctr);
 918                __unmap_vcore(p, 0);
 919                __seq_end_write(&p->procinfo->coremap_seqctr);
 920                /* If we ever have RUNNING_S run on non-mgmt cores, we'll need
 921                 * to tell the ksched about this now-idle core (after unlocking)
 922                 */
 923                break;
 924        default:
 925                warn("Weird state(%s) in %s()", procstate2str(p->state),
 926                     __FUNCTION__);
 927                spin_unlock(&p->proc_lock);
 928                return;
 929        }
 930        /* At this point, a death IPI should be on its way, either from the
 931         * RUNNING_S one, or from proc_take_cores with a __death.  in general,
 932         * interrupts should be on when you call proc_destroy locally, but
 933         * currently aren't for all things (like traphandlers). */
 934        __proc_set_state(p, PROC_DYING);
 935        spin_unlock(&p->proc_lock);
 936        proc_disown_children(p);
 937        /* Wake any of our kthreads waiting on children, so they can abort */
 938        cv_broadcast(&p->child_wait);
 939        /* we need to close files here, and not in free, since we could have a
 940         * refcnt indirectly related to one of our files.  specifically, if we
 941         * have a parent sleeping on our pipe, that parent won't wake up to
 942         * decref until the pipe closes.  And if the parent doesnt decref, we
  943         * decref until the pipe closes.  And if the parent doesn't decref, we
  944         * don't free.  Even if we send a SIGCHLD to the parent, that would
  945         * require that the parent never ignore that signal (or we risk
 946         *
 947         * Also note that any mmap'd files will still be mmapped.  You can close
 948         * the file after mmapping, with no effect. */
 949        close_fdt(&p->open_files, FALSE);
 950        /* Abort any abortable syscalls.  This won't catch every sleeper, but
 951         * future abortable sleepers are already prevented via the DYING_ABORT
 952         * state.  (signalled DYING_ABORT, no new sleepers will block, and now
 953         * we wake all old sleepers). */
 954        __proc_set_state(p, PROC_DYING_ABORT);
 955        abort_all_sysc(p);
 956        iommu_unassign_all_devices(p);
 957        /* Tell the ksched about our death, and which cores we freed up */
 958        __sched_proc_destroy(p, pc_arr, nr_cores_revoked);
 959        /* Tell our parent about our state change (to DYING) */
 960        proc_signal_parent(p);
 961}
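
/* Example: a typical kill path pairs proc_destroy() with the ref from the
 * lookup.  A minimal sketch of what a kill-style syscall does (error values are
 * illustrative): */
#if 0
        struct proc *target = pid2proc(pid);

        if (!target)
                return -ESRCH;
        if (!proc_controls(current, target)) {
                proc_decref(target);
                return -EPERM;
        }
        proc_destroy(target);
        proc_decref(target);
        return 0;
#endif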
 962
 963/* Can use this to signal anything that might cause a parent to wait on the
 964 * child, such as termination, or signals.  Change the state or whatever before
 965 * calling. */
 966void proc_signal_parent(struct proc *child)
 967{
 968        struct kthread *sleeper;
 969        struct proc *parent = pid2proc(child->ppid);
 970        if (!parent)
 971                return;
 972        send_posix_signal(parent, SIGCHLD);
 973        /* there could be multiple kthreads sleeping for various reasons.  even
 974         * an SCP could have multiple async syscalls. */
 975        cv_broadcast(&parent->child_wait);
 976        /* if the parent was waiting, there's a __launch kthread KMSG out there
 977         */
 978        proc_decref(parent);
 979}
 980
 981/* Called when a parent is done with its child, and no longer wants to track the
 982 * child, nor to allow the child to track it.  Call with a lock (cv) held.
 983 * Returns 0 if we disowned, -1 on failure.
 984 *
 985 * If we disowned, (ret == 0), the caller must decref the child. */
 986int __proc_disown_child(struct proc *parent, struct proc *child)
 987{
 988        /* Bail out if the child has already been reaped */
 989        if (!child->ppid)
 990                return -1;
 991        assert(child->ppid == parent->pid);
 992        /* lock protects from concurrent inserts / removals from the list */
 993        TAILQ_REMOVE(&parent->children, child, sibling_link);
 994        /* After this, the child won't be able to get more refs to us, but it
 995         * may still have some references in running code. */
 996        child->ppid = 0;
 997        return 0;
 998}
 999
1000/* Turns *p into an MCP.  Needs to be called from a local syscall of a RUNNING_S
1001 * process.  Returns 0 if it succeeded, an error code otherwise. */
1002int proc_change_to_m(struct proc *p)
1003{
1004        int retval = 0;
1005        spin_lock(&p->proc_lock);
1006        /* in case userspace erroneously tries to change more than once */
1007        if (__proc_is_mcp(p))
1008                goto error_out;
1009        switch (p->state) {
1010        case (PROC_RUNNING_S):
 1011                /* issue if the request is async (we'd need to preempt it);
 1012                 * either of these checks should trip it. TODO: (ACR) async core req */
1013                if ((current != p) || (get_pcoreid(p, 0) != core_id()))
1014                        panic("We don't handle async RUNNING_S core requests");
1015                struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[0];
1016
1017                assert(current_ctx);
1018                /* Copy uthread0's context to VC 0's uthread slot */
1019                copy_current_ctx_to(&vcpd->uthread_ctx);
1020                clear_owning_proc(core_id());   /* so we don't restart */
1021                save_vc_fp_state(vcpd);
1022                /* Userspace needs to not fuck with notif_disabled before
1023                 * transitioning to _M. */
1024                if (vcpd->notif_disabled) {
1025                        printk("[kernel] user bug: notifs disabled for vcore 0\n");
1026                        vcpd->notif_disabled = FALSE;
1027                }
1028                /* in the async case, we'll need to remotely stop and bundle
1029                 * vcore0's TF.  this is already done for the sync case (local
1030                 * syscall). */
1031                /* this process no longer runs on its old location (which is
1032                 * this core, for now, since we don't handle async calls) */
1033                __seq_start_write(&p->procinfo->coremap_seqctr);
1034                // TODO: (ACR) will need to unmap remotely (receive-side)
1035                __unmap_vcore(p, 0);
1036                vcore_account_offline(p, 0);
1037                __seq_end_write(&p->procinfo->coremap_seqctr);
 1038                /* change to runnable_m (its TF is already saved) */
1039                __proc_set_state(p, PROC_RUNNABLE_M);
1040                p->procinfo->is_mcp = TRUE;
1041                spin_unlock(&p->proc_lock);
1042                /* Tell the ksched that we're a real MCP now! */
1043                __sched_proc_change_to_m(p);
1044                return 0;
1045        case (PROC_RUNNABLE_S):
1046                /* Issues: being on the runnable_list, proc_set_state not liking
1047                 * it, and not clearly thinking through how this would happen.
1048                 * Perhaps an async call that gets serviced after you're
1049                 * descheduled? */
1050                warn("Not supporting RUNNABLE_S -> RUNNABLE_M yet.\n");
1051                goto error_out;
1052        case (PROC_DYING):
1053        case (PROC_DYING_ABORT):
1054                warn("Dying, core request coming from %d\n", core_id());
1055                goto error_out;
1056        default:
1057                goto error_out;
1058        }
1059error_out:
1060        spin_unlock(&p->proc_lock);
1061        return -EINVAL;
1062}
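
/* Example: this is reached from a local syscall issued by the _S process
 * itself.  A sketch of the kernel-side wrapper; the real handler lives in
 * syscall.c, and the exact name/signature here is an assumption: */
#if 0
static int sys_change_to_m(struct proc *p)
{
        int retval = proc_change_to_m(p);

        if (retval)
                set_errno(-retval);
        return retval;
}
#endif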
1063
1064/* Old code to turn a RUNNING_M to a RUNNING_S, with the calling context
1065 * becoming the new 'thread0'.  Don't use this.  Caller needs to send in a
1066 * pc_arr big enough for all vcores.  Will return the number of cores given up
1067 * by the proc. */
1068uint32_t __proc_change_to_s(struct proc *p, uint32_t *pc_arr)
1069{
1070        struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[0];
1071        uint32_t num_revoked;
1072
1073        /* Not handling vcore accounting.  Do so if we ever use this */
1074        printk("[kernel] trying to transition _M -> _S (deprecated)!\n");
1075        assert(p->state == PROC_RUNNING_M); // TODO: (ACR) async core req
1076        /* save the context, to be restarted in _S mode */
1077        assert(current_ctx);
1078        copy_current_ctx_to(&p->scp_ctx);
1079        clear_owning_proc(core_id());   /* so we don't restart */
1080        save_vc_fp_state(vcpd);
1081        /* sending death, since it's not our job to save contexts or anything in
1082         * this case. */
1083        num_revoked = __proc_take_allcores(p, pc_arr, FALSE);
1084        __proc_set_state(p, PROC_RUNNABLE_S);
1085        return num_revoked;
1086}
1087
1088/* Helper function.  Is the given pcore a mapped vcore?  No locking involved, be
1089 * careful. */
1090static bool is_mapped_vcore(struct proc *p, uint32_t pcoreid)
1091{
1092        return p->procinfo->pcoremap[pcoreid].valid;
1093}
1094
1095/* Helper function.  Find the vcoreid for a given physical core id for proc p.
1096 * No locking involved, be careful.  Panics on failure. */
1097static uint32_t get_vcoreid(struct proc *p, uint32_t pcoreid)
1098{
1099        assert(is_mapped_vcore(p, pcoreid));
1100        return p->procinfo->pcoremap[pcoreid].vcoreid;
1101}
1102
1103/* Helper function.  Try to find the pcoreid for a given virtual core id for
1104 * proc p.  No locking involved, be careful.  Use this when you can tolerate a
1105 * stale or otherwise 'wrong' answer. */
1106static uint32_t try_get_pcoreid(struct proc *p, uint32_t vcoreid)
1107{
1108        return p->procinfo->vcoremap[vcoreid].pcoreid;
1109}
1110
1111/* Helper function.  Find the pcoreid for a given virtual core id for proc p.
1112 * No locking involved, be careful.  Panics on failure. */
1113static uint32_t get_pcoreid(struct proc *p, uint32_t vcoreid)
1114{
1115        assert(vcore_is_mapped(p, vcoreid));
1116        return try_get_pcoreid(p, vcoreid);
1117}
1118
1119/* Saves the FP state of the calling core into VCPD.  Pairs with
1120 * restore_vc_fp_state().  On x86, the best case overhead of the flags:
1121 *              FNINIT: 36 ns
1122 *              FXSAVE: 46 ns
1123 *              FXRSTR: 42 ns
1124 *              Flagged FXSAVE: 50 ns
1125 *              Flagged FXRSTR: 66 ns
1126 *              Excess flagged FXRSTR: 42 ns
1127 * If we don't do it, we'll need to initialize every VCPD at process creation
1128 * time with a good FPU state (x86 control words are initialized as 0s, like the
1129 * rest of VCPD). */
1130static void save_vc_fp_state(struct preempt_data *vcpd)
1131{
1132        save_fp_state(&vcpd->preempt_anc);
1133        vcpd->rflags |= VC_FPU_SAVED;
1134}
1135
1136/* Conditionally restores the FP state from VCPD.  If the state was not valid,
1137 * we don't bother restoring and just initialize the FPU. */
1138static void restore_vc_fp_state(struct preempt_data *vcpd)
1139{
1140        if (vcpd->rflags & VC_FPU_SAVED) {
1141                restore_fp_state(&vcpd->preempt_anc);
1142                vcpd->rflags &= ~VC_FPU_SAVED;
1143        } else {
1144                init_fp_state();
1145        }
1146}
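
/* Example: the flag lets a save on one core pair with a restore on another,
 * without re-initializing good state.  A minimal sketch of the pairing (see
 * __proc_save_fpu_s() below and proc_run_s() above): */
#if 0
        /* on the core losing the vcore (preempt / FP save path): */
        save_vc_fp_state(vcpd);
        /* later, on the core that starts the vcore (startcore path): */
        restore_vc_fp_state(vcpd);
#endif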
1147
1148/* Helper for SCPs, saves the core's FPU state into the VCPD vc0 slot */
1149void __proc_save_fpu_s(struct proc *p)
1150{
1151        struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[0];
1152
1153        save_vc_fp_state(vcpd);
1154}
1155
1156/* Helper: saves the SCP's GP tf state and unmaps vcore 0.  This does *not* save
1157 * the FPU state.
1158 *
1159 * In the future, we'll probably use vc0's space for scp_ctx and the silly
1160 * state.  If we ever do that, we'll need to stop using scp_ctx (soon to be in
1161 * VCPD) as a location for pcpui->cur_ctx to point (dangerous) */
1162void __proc_save_context_s(struct proc *p)
1163{
1164        copy_current_ctx_to(&p->scp_ctx);
1165        __seq_start_write(&p->procinfo->coremap_seqctr);
1166        __unmap_vcore(p, 0);
1167        __seq_end_write(&p->procinfo->coremap_seqctr);
1168        vcore_account_offline(p, 0);
1169}
1170
1171/* Yields the calling core.  Must be called locally (not async) for now.
1172 * - If RUNNING_S, you just give up your time slice and will eventually return,
1173 *   possibly after WAITING on an event.
1174 * - If RUNNING_M, you give up the current vcore (which never returns), and
1175 *   adjust the amount of cores wanted/granted.
1176 * - If you have only one vcore, you switch to WAITING.  There's no 'classic
1177 *   yield' for MCPs (at least not now).  When you run again, you'll have one
1178 *   guaranteed core, starting from the entry point.
1179 *
1180 * If the call is being nice, it means different things for SCPs and MCPs.  For
1181 * MCPs, it means that it is in response to a preemption (which needs to be
1182 * checked).  If there is no preemption pending, just return.  For SCPs, it
1183 * means the proc wants to give up the core, but still has work to do.  If not,
1184 * the proc is trying to wait on an event.  It's not being nice to others, it
1185 * just has no work to do.
1186 *
1187 * This usually does not return (smp_idle()), so it will eat your reference.
1188 * Also note that it needs a non-current/edible reference, since it will abandon
1189 * and continue to use the *p (current == 0, no cr3, etc).
1190 *
1191 * We disable interrupts for most of it too, since we need to protect
1192 * current_ctx and not race with __notify (which doesn't play well with
1193 * concurrent yielders). */
1194void proc_yield(struct proc *p, bool being_nice)
1195{
1196        uint32_t vcoreid, pcoreid = core_id();
1197        struct per_cpu_info *pcpui = &per_cpu_info[pcoreid];
1198        struct vcore *vc;
1199        struct preempt_data *vcpd;
1200
1201        /* Need to lock to prevent concurrent vcore changes (online, inactive,
1202         * the mapping, etc).  This plus checking the nr_preempts is enough to
1203         * tell if our vcoreid and cur_ctx ought to be here still or if we
1204         * should abort */
1205        spin_lock(&p->proc_lock); /* horrible scalability.  =( */
1206        switch (p->state) {
1207        case (PROC_RUNNING_S):
1208                if (!being_nice) {
1209                        /* waiting for an event to unblock us */
1210                        vcpd = &p->procdata->vcore_preempt_data[0];
1211                        /* syncing with event's SCP code.  we set waiting, then
1212                         * check pending.  they set pending, then check waiting.
1213                         * it's not possible for us to miss the notif *and* for
1214                         * them to miss WAITING.  one (or both) of us will see
1215                         * and make sure the proc wakes up.  */
1216                        __proc_set_state(p, PROC_WAITING);
1217                        /* don't let the state write pass the notif read */
1218                        wrmb();
1219                        if (vcpd->notif_pending) {
1220                                __proc_set_state(p, PROC_RUNNING_S);
1221                                /* they can't handle events, just need to
1222                                 * prevent a yield.  (note the notif_pendings
1223                                 * are collapsed). */
1224                                if (!scp_is_vcctx_ready(vcpd))
1225                                        vcpd->notif_pending = FALSE;
1226                                goto out_failed;
1227                        }
1228                        /* if we're here, we want to sleep.  a concurrent event
1229                         * that hasn't already written notif_pending will have
1230                         * seen WAITING, and will be spinning while we do this.
1231                         * */
1232                        __proc_save_context_s(p);
1233                        spin_unlock(&p->proc_lock);
1234                } else {
1235                        /* yielding to allow other processes to run.  we're
1236                         * briefly WAITING, til we are woken up */
1237                        __proc_set_state(p, PROC_WAITING);
1238                        __proc_save_context_s(p);
1239                        spin_unlock(&p->proc_lock);
1240                        /* immediately wake up the proc (makes it runnable) */
1241                        proc_wakeup(p);
1242                }
1243                goto out_yield_core;
1244        case (PROC_RUNNING_M):
1245                break;                  /* will handle this stuff below */
1246        case (PROC_DYING):              /* incoming __death */
1247        case (PROC_DYING_ABORT):
1248        case (PROC_RUNNABLE_M): /* incoming (bulk) preempt/myield TODO:(BULK) */
1249                goto out_failed;
1250        default:
1251                panic("Weird state(%s) in %s()", procstate2str(p->state),
1252                      __FUNCTION__);
1253        }
1254        /* This is which vcore this pcore thinks it is, regardless of any
1255         * unmappings that may have happened remotely (with __PRs waiting to
1256         * run) */
1257        vcoreid = pcpui->owning_vcoreid;
1258        vc = vcoreid2vcore(p, vcoreid);
1259        vcpd = &p->procdata->vcore_preempt_data[vcoreid];
1260        /* This is how we detect whether or not a __PR happened. */
1261        if (vc->nr_preempts_sent != vc->nr_preempts_done)
1262                goto out_failed;
1263        /* Sanity checks.  If we were preempted or are dying, we should have
1264         * noticed by now. */
1265        assert(is_mapped_vcore(p, pcoreid));
1266        assert(vcoreid == get_vcoreid(p, pcoreid));
1267        /* no reason to be nice, return */
1268        if (being_nice && !vc->preempt_pending)
1269                goto out_failed;
1270        /* At this point, AFAIK there should be no preempt/death messages on the
1271         * way, and we're on the online list.  So we'll go ahead and do the
1272         * yielding business. */
1273        /* If there's a preempt pending, we don't need to preempt later since we
1274         * are yielding (nice or otherwise).  If not, this is just a regular
1275         * yield. */
1276        if (vc->preempt_pending) {
1277                vc->preempt_pending = 0;
1278        } else {
1279                /* Optional: on a normal yield, check to see if we are putting
1280                 * them below amt_wanted (help with user races) and bail. */
1281                if (p->procdata->res_req[RES_CORES].amt_wanted >=
1282                                       p->procinfo->num_vcores)
1283                        goto out_failed;
1284        }
1285        /* Don't let them yield if they are missing a notification.  Userspace
1286         * must not leave vcore context without dealing with notif_pending.
1287         * pop_user_ctx() handles leaving via uthread context.  This handles
1288         * leaving via a yield.
1289         *
1290         * This early check is an optimization.  The real check is below when it
1291         * works with the online_vcs list (syncing with event.c and INDIR/IPI
1292         * posting). */
1293        if (vcpd->notif_pending)
1294                goto out_failed;
1295        /* Now we'll actually try to yield */
1296        printd("[K] Process %d (%p) is yielding on vcore %d\n", p->pid, p,
1297               get_vcoreid(p, pcoreid));
1298        /* Remove from the online list, add to the yielded list, and unmap
1299         * the vcore, which gives up the core. */
1300        TAILQ_REMOVE(&p->online_vcs, vc, list);
1301        /* Now that we're off the online list, check to see if an alert made
1302         * it through (event.c sets this) */
1303        wrmb(); /* prev write must hit before reading notif_pending */
1304        /* Note we need interrupts disabled, since a __notify can come in
1305         * and set pending to FALSE */
1306        if (vcpd->notif_pending) {
1307                /* We lost, put it back on the list and abort the yield.  If we
1308                 * ever build an myield, we'll need a way to deal with this for
1309                 * all vcores */
1310                TAILQ_INSERT_TAIL(&p->online_vcs, vc, list); /* could go HEAD */
1311                goto out_failed;
1312        }
1313        /* Not really a kmsg, but it acts like one w.r.t. proc mgmt */
1314        pcpui_trace_kmsg(pcpui, (uintptr_t)proc_yield);
1315        /* We won the race with event sending, we can safely yield */
1316        TAILQ_INSERT_HEAD(&p->inactive_vcs, vc, list);
1317        /* Note this protects stuff userspace should look at, which doesn't
1318         * include the TAILQs. */
1319        __seq_start_write(&p->procinfo->coremap_seqctr);
1320        /* Next time the vcore starts, it starts fresh */
1321        vcpd->notif_disabled = FALSE;
1322        __unmap_vcore(p, vcoreid);
1323        p->procinfo->num_vcores--;
1324        p->procinfo->res_grant[RES_CORES] = p->procinfo->num_vcores;
1325        __seq_end_write(&p->procinfo->coremap_seqctr);
1326        vcore_account_offline(p, vcoreid);
1327        /* No more vcores?  Then we wait on an event */
1328        if (p->procinfo->num_vcores == 0) {
1329                /* consider a ksched op to tell it about us WAITING */
1330                __proc_set_state(p, PROC_WAITING);
1331        }
1332        spin_unlock(&p->proc_lock);
1333        /* We discard the current context, but we still need to restore the core
1334         */
1335        arch_finalize_ctx(pcpui->cur_ctx);
1336        /* Hand the now-idle core to the ksched */
1337        __sched_put_idle_core(p, pcoreid);
1338        goto out_yield_core;
1339out_failed:
1340        /* for some reason we just want to return, either to take a KMSG that
1341         * cleans us up, or because we shouldn't yield (ex: notif_pending). */
1342        spin_unlock(&p->proc_lock);
1343        return;
1344out_yield_core:                         /* successfully yielded the core */
1345        proc_decref(p);                 /* need to eat the ref passed in */
1346        /* Clean up the core and idle. */
1347        clear_owning_proc(pcoreid);     /* so we don't restart */
1348        abandon_core();
1349        smp_idle();
1350}
1351
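/* The RUNNING_S yield path above pairs with the event-delivery side, which
 * does the mirror-image ordering: set notif_pending, barrier, then check for
 * WAITING and wake the proc.  A minimal sketch of that sender side (the real
 * code lives in event.c; this helper name is purely illustrative): */
static void example_scp_event_post(struct proc *p, struct preempt_data *vcpd)
{
        /* Mirror of proc_yield(): they set WAITING then read notif_pending;
         * we set notif_pending then read the state.  At least one side will
         * see the other's write, so the proc can't sleep on a missed event. */
        vcpd->notif_pending = TRUE;
        wrmb(); /* pending write must not pass the state read */
        if (p->state == PROC_WAITING)
                proc_wakeup(p); /* tolerates spurious/repeat calls */
}
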
1352/* Sends a notification (aka active notification, aka IPI) to p's vcore.  We
1353 * only send a notification if notifs are enabled.  There are a bunch of weird
1354 * cases with this, and how pending / enabled are signals between the user and
1355 * kernel - check the documentation.  Note that pending is more about messages.
1356 * The process needs to be in vcore_context, and the reason is usually a
1357 * message.  We set pending here in case we were called to prod them into vcore
1358 * context (like via a sys_self_notify).  Also note that this works for _S
1359 * procs, if you send to vcore 0 (and the proc is running). */
1360void proc_notify(struct proc *p, uint32_t vcoreid)
1361{
1362        struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[vcoreid];
1363
1364        assert(proc_vcoreid_is_safe(p, vcoreid));
1365        /* If you're thinking about checking notif_pending and then returning if
1366         * it is already set, note that some callers (e.g. the event system) set
1367         * notif_pending when they deliver a message, regardless of whether
1368         * there is an IPI or not.  Those callers assume that we don't care
1369         * about notif_pending, only notif_disabled.  So don't change this
1370         * without changing them (probably can't without a lot of thought - that
1371         * notif_pending is about missing messages.  It might be possible to say
1372         * "no IPI, but don't let me miss messages that were delivered.") */
1373        vcpd->notif_pending = TRUE;
1374        wrmb(); /* must write notif_pending before reading notif_disabled */
1375        if (!vcpd->notif_disabled) {
1376                /* GIANT WARNING: we aren't using the proc-lock to protect the
1377                 * vcoremap.  We want to be able to use this from interrupt
1378                 * context, and don't want the proc_lock to be an irqsave.
1379                 * Spurious __notify() kmsgs are okay (it checks to see if the
1380                 * right receiver is current). */
1381                if (vcore_is_mapped(p, vcoreid)) {
1382                        printd("[kernel] sending notif to vcore %d\n", vcoreid);
1383                        /* This use of try_get_pcoreid is racy, might be
1384                         * unmapped */
1385                        send_kernel_message(try_get_pcoreid(p, vcoreid),
1386                                            __notify, (long)p, 0, 0,
1387                                            KMSG_ROUTINE);
1388                }
1389        }
1390}
1391
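/* The other half of the notif_pending/notif_disabled handshake lives in
 * userspace: when vcore context re-enables notifs to pop a uthread, it must
 * re-check notif_pending or it could miss the IPI we skipped above.  A
 * paraphrased sketch of that side (conceptually what parlib and pop_user_ctx()
 * do; this is not the actual userspace code): */
static void example_user_reenable_notifs(struct preempt_data *vcpd)
{
        /* Mirror of proc_notify(): the kernel writes notif_pending then reads
         * notif_disabled; userspace clears notif_disabled then re-reads
         * notif_pending.  One side is guaranteed to see the other. */
        vcpd->notif_disabled = FALSE;
        wrmb(); /* enable write must not pass the pending read */
        if (vcpd->notif_pending) {
                /* Lost the race: stay in (or return to) vcore context and
                 * handle the message instead of popping the uthread ctx. */
                vcpd->notif_disabled = TRUE;
        }
}
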
1392/* Makes sure p is runnable.  Callers may spam this, so it needs to handle
1393 * repeated calls for the same event.  Callers include event delivery, SCP
1394 * yield, and new SCPs.  Will trigger __sched_mcp/scp_wakeup() CBs.  Will only
1395 * trigger the CB once, regardless of how many times we are called, *until* the
1396 * proc becomes WAITING again, presumably because of something the ksched did. */
1397void proc_wakeup(struct proc *p)
1398{
1399        spin_lock(&p->proc_lock);
1400        if (__proc_is_mcp(p)) {
1401                /* we only wake up WAITING mcps */
1402                if (p->state != PROC_WAITING) {
1403                        spin_unlock(&p->proc_lock);
1404                        return;
1405                }
1406                __proc_set_state(p, PROC_RUNNABLE_M);
1407                spin_unlock(&p->proc_lock);
1408                __sched_mcp_wakeup(p);
1409                return;
1410        } else {
1411                /* SCPs can wake up for a variety of reasons.  the only time we
1412                 * need to do something is if it was waiting or just created.
1413                 * other cases are either benign (just go out), or potential
1414                 * bugs (_Ms) */
1415                switch (p->state) {
1416                case (PROC_CREATED):
1417                case (PROC_WAITING):
1418                        __proc_set_state(p, PROC_RUNNABLE_S);
1419                        break;
1420                case (PROC_RUNNABLE_S):
1421                case (PROC_RUNNING_S):
1422                case (PROC_DYING):
1423                case (PROC_DYING_ABORT):
1424                        spin_unlock(&p->proc_lock);
1425                        return;
1426                case (PROC_RUNNABLE_M):
1427                case (PROC_RUNNING_M):
1428                        warn("Weird state(%s) in %s()", procstate2str(p->state),
1429                             __FUNCTION__);
1430                        spin_unlock(&p->proc_lock);
1431                        return;
1432                }
1433                /* thanks, past brho! */
1434                printd("[kernel] FYI, waking up an _S proc\n");
1435                spin_unlock(&p->proc_lock);
1436                __sched_scp_wakeup(p);
1437        }
1438}
1439
1440/* Is the process in multi_mode / is an MCP or not?  */
1441bool __proc_is_mcp(struct proc *p)
1442{
1443        /* in lieu of using the amount of cores requested, or having a bunch of
1444         * states (like PROC_WAITING_M and _S), I'll just track it with a bool.
1445         */
1446        return p->procinfo->is_mcp;
1447}
1448
1449bool proc_is_vcctx_ready(struct proc *p)
1450{
1451        struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[0];
1452
1453        return scp_is_vcctx_ready(vcpd);
1454}
1455
1456/************************  Preemption Functions  ******************************
1457 * Don't rely on these much - I'll be sure to change them up a bit.
1458 *
1459 * Careful about what takes a vcoreid and what takes a pcoreid.  Also, there may
1460 * be weird glitches with setting the state to RUNNABLE_M.  It is somewhat in
1461 * flux.  The num_vcores is changed after take_cores, but some of the messages
1462 * (or local traps) may not yet be ready to handle seeing their future state.
1463 * But they should be, so fix those when they pop up.
1464 *
1465 * Another thing to do would be to make the _core functions take a pcorelist,
1466 * and not just one pcoreid. */
1467
1468/* Sets a preempt_pending warning for p's vcore, to go off 'when'.  If you care
1469 * about locking, do it before calling.  Takes a vcoreid! */
1470void __proc_preempt_warn(struct proc *p, uint32_t vcoreid, uint64_t when)
1471{
1472        struct event_msg local_msg = {0};
1473
1474        /* danger with doing this unlocked: preempt_pending is set, but never
1475         * 0'd, since it is unmapped and not dealt with (TODO)*/
1476        p->procinfo->vcoremap[vcoreid].preempt_pending = when;
1477
1478        /* Send the event (which internally checks to see how they want it) */
1479        local_msg.ev_type = EV_PREEMPT_PENDING;
1480        local_msg.ev_arg1 = vcoreid;
1481        /* Whenever we send msgs with the proc locked, we need at least 1
1482         * online.  Caller needs to make sure the core was online/mapped. */
1483        assert(!TAILQ_EMPTY(&p->online_vcs));
1484        send_kernel_event(p, &local_msg, vcoreid);
1485
1486        /* TODO: consider putting in some lookup place for the alarm to find it.
1487         * til then, it'll have to scan the vcoremap (O(n) instead of O(m)) */
1488}
1489
1490/* Warns all active vcores of an impending preemption.  Hold the lock if you
1491 * care about the mapping (and you should). */
1492void __proc_preempt_warnall(struct proc *p, uint64_t when)
1493{
1494        struct vcore *vc_i;
1495        TAILQ_FOREACH(vc_i, &p->online_vcs, list)
1496                __proc_preempt_warn(p, vcore2vcoreid(p, vc_i), when);
1497        /* TODO: consider putting in some lookup place for the alarm to find it.
1498         * til then, it'll have to scan the vcoremap (O(n) instead of O(m)) */
1499}
1500
1501// TODO: function to set an alarm, if none is outstanding
1502
1503/* Raw function to preempt a single core.  If you care about locking, do it
1504 * before calling. */
1505void __proc_preempt_core(struct proc *p, uint32_t pcoreid)
1506{
1507        uint32_t vcoreid = get_vcoreid(p, pcoreid);
1508        struct event_msg preempt_msg = {0};
1509        /* works with nr_preempts_done to signal completion of a preemption */
1510        p->procinfo->vcoremap[vcoreid].nr_preempts_sent++;
1511        // expects a pcorelist.  assumes pcore is mapped and running_m
1512        __proc_take_corelist(p, &pcoreid, 1, TRUE);
1513        /* Only send the message if we have an online core.  o/w, it would fuck
1514         * us up (deadlock), and they don't need a message.  the core we just
1515         * took will be the first one to be restarted.  It will look like a
1516         * notif.  in the future, we could send the event if we want, but the
1517         * caller needs to do that (after unlocking). */
1518        if (!TAILQ_EMPTY(&p->online_vcs)) {
1519                preempt_msg.ev_type = EV_VCORE_PREEMPT;
1520                preempt_msg.ev_arg2 = vcoreid;
1521                send_kernel_event(p, &preempt_msg, 0);
1522        }
1523}
1524
1525/* Raw function to preempt every vcore.  If you care about locking, do it before
1526 * calling. */
1527uint32_t __proc_preempt_all(struct proc *p, uint32_t *pc_arr)
1528{
1529        struct vcore *vc_i;
1530
1531        /* TODO:(BULK) PREEMPT - don't bother with this, set a proc wide flag,
1532         * or just make us RUNNABLE_M.  Note this is also used by __map_vcore.
1533         */
1534        TAILQ_FOREACH(vc_i, &p->online_vcs, list)
1535                vc_i->nr_preempts_sent++;
1536        return __proc_take_allcores(p, pc_arr, TRUE);
1537}
1538
1539/* Warns and preempts a vcore from p.  No delaying / alarming, or anything.  The
1540 * warning will be for 'usec' microseconds from now.  Returns TRUE if the core
1541 * belonged to the proc (and thus was preempted), FALSE if it no longer does. */
1542bool proc_preempt_core(struct proc *p, uint32_t pcoreid, uint64_t usec)
1543{
1544        uint64_t warn_time = read_tsc() + usec2tsc(usec);
1545        bool retval = FALSE;
1546        if (p->state != PROC_RUNNING_M) {
1547                /* more of an FYI for brho.  should be harmless to return. */
1548                warn("Tried to preempt from a non RUNNING_M proc!");
1549                return FALSE;
1550        }
1551        spin_lock(&p->proc_lock);
1552        if (is_mapped_vcore(p, pcoreid)) {
1553                __proc_preempt_warn(p, get_vcoreid(p, pcoreid), warn_time);
1554                __proc_preempt_core(p, pcoreid);
1555                /* we might have taken the last core */
1556                if (!p->procinfo->num_vcores)
1557                        __proc_set_state(p, PROC_RUNNABLE_M);
1558                retval = TRUE;
1559        }
1560        spin_unlock(&p->proc_lock);
1561        return retval;
1562}
1563
1564/* Warns and preempts all from p.  No delaying / alarming, or anything.  The
1565 * warning will be for 'usec' microseconds from now. */
1566void proc_preempt_all(struct proc *p, uint64_t usec)
1567{
1568        uint64_t warn_time = read_tsc() + usec2tsc(usec);
1569        uint32_t num_revoked = 0;
1570
1571        spin_lock(&p->proc_lock);
1572        /* storage for pc_arr is alloced at decl, which is after grabbing the
1573         * lock */
1574        uint32_t pc_arr[p->procinfo->num_vcores];
1575
1576        /* DYING could be okay */
1577        if (p->state != PROC_RUNNING_M) {
1578                warn("Tried to preempt from a non RUNNING_M proc!");
1579                spin_unlock(&p->proc_lock);
1580                return;
1581        }
1582        __proc_preempt_warnall(p, warn_time);
1583        num_revoked = __proc_preempt_all(p, pc_arr);
1584        assert(!p->procinfo->num_vcores);
1585        __proc_set_state(p, PROC_RUNNABLE_M);
1586        spin_unlock(&p->proc_lock);
1587        /* TODO: when we revise this func, look at __put_idle */
1588        /* Return the cores to the ksched */
1589        if (num_revoked)
1590                __sched_put_idle_cores(p, pc_arr, num_revoked);
1591}
1592
1593/* Give the specific pcore to proc p.  Lots of assumptions, so don't really use
1594 * this.  The proc needs to be _M and prepared for it.  the pcore needs to be
1595 * free, etc. */
1596void proc_give(struct proc *p, uint32_t pcoreid)
1597{
1598        warn("Your idlecoremap is now screwed up");     /* TODO (IDLE) */
1599        spin_lock(&p->proc_lock);
1600        // expects a pcorelist, we give it a list of one
1601        __proc_give_cores(p, &pcoreid, 1);
1602        spin_unlock(&p->proc_lock);
1603}
1604
1605/* Global version of the helper, for sys_get_vcoreid (might phase that syscall
1606 * out). */
1607uint32_t proc_get_vcoreid(struct proc *p)
1608{
1609        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
1610
1611        if (pcpui->owning_proc == p) {
1612                return pcpui->owning_vcoreid;
1613        } else {
1614                warn("Asked for vcoreid for %p, but %p pwns the core", p,
1615                     pcpui->owning_proc);
1616                return (uint32_t)-1;
1617        }
1618}
1619
1620/* TODO: make all of these static inlines when we gut the env crap */
1621bool vcore_is_mapped(struct proc *p, uint32_t vcoreid)
1622{
1623        return p->procinfo->vcoremap[vcoreid].valid;
1624}
1625
1626/* Can do this, or just create a new field and save it in the vcoremap */
1627uint32_t vcore2vcoreid(struct proc *p, struct vcore *vc)
1628{
1629        return (vc - p->procinfo->vcoremap);
1630}
1631
1632struct vcore *vcoreid2vcore(struct proc *p, uint32_t vcoreid)
1633{
1634        return &p->procinfo->vcoremap[vcoreid];
1635}
1636
1637/********** Core granting (bulk and single) ***********/
1638
1639/* Helper: gives pcore to the process, mapping it to the next available vcore
1640 * from list vc_list.  Returns TRUE if we succeeded (non-empty).  If you pass in
1641 * **vc, we'll tell you which vcore it was. */
1642static bool __proc_give_a_pcore(struct proc *p, uint32_t pcore,
1643                                struct vcore_tailq *vc_list, struct vcore **vc)
1644{
1645        struct vcore *new_vc;
1646
1647        new_vc = TAILQ_FIRST(vc_list);
1648        if (!new_vc)
1649                return FALSE;
1650        printd("setting vcore %d to pcore %d\n", vcore2vcoreid(p, new_vc),
1651               pcore);
1652        TAILQ_REMOVE(vc_list, new_vc, list);
1653        TAILQ_INSERT_TAIL(&p->online_vcs, new_vc, list);
1654        __map_vcore(p, vcore2vcoreid(p, new_vc), pcore);
1655        if (vc)
1656                *vc = new_vc;
1657        return TRUE;
1658}
1659
1660static void __proc_give_cores_runnable(struct proc *p, uint32_t *pc_arr,
1661                                       uint32_t num)
1662{
1663        assert(p->state == PROC_RUNNABLE_M);
1664        assert(num);    /* catch bugs */
1665        /* add new items to the vcoremap */
1666        /* unnecessary if offline */
1667        __seq_start_write(&p->procinfo->coremap_seqctr);
1668        p->procinfo->num_vcores += num;
1669        for (int i = 0; i < num; i++) {
1670                /* Try from the bulk list first */
1671                if (__proc_give_a_pcore(p, pc_arr[i], &p->bulk_preempted_vcs,
1672                                        0))
1673                        continue;
1674                /* o/w, try from the inactive list.  at one point, i thought
1675                 * there might be a legit way in which the inactive list could
1676                 * be empty, but either way i want to catch it with an assert. */
1677                assert(__proc_give_a_pcore(p, pc_arr[i], &p->inactive_vcs, 0));
1678        }
1679        __seq_end_write(&p->procinfo->coremap_seqctr);
1680}
1681
1682static void __proc_give_cores_running(struct proc *p, uint32_t *pc_arr,
1683                                      uint32_t num)
1684{
1685        struct vcore *vc_i;
1686        /* Up the refcnt, since num cores are going to start using this
1687         * process and have it loaded in their owning_proc and 'current'. */
1688        proc_incref(p, num * 2);        /* keep in sync with __startcore */
1689        __seq_start_write(&p->procinfo->coremap_seqctr);
1690        p->procinfo->num_vcores += num;
1691        assert(TAILQ_EMPTY(&p->bulk_preempted_vcs));
1692        for (int i = 0; i < num; i++) {
1693                assert(__proc_give_a_pcore(p, pc_arr[i], &p->inactive_vcs,
1694                                           &vc_i));
1695                send_kernel_message(pc_arr[i], __startcore, (long)p,
1696                                    (long)vcore2vcoreid(p, vc_i),
1697                                    (long)vc_i->nr_preempts_sent, KMSG_ROUTINE);
1698        }
1699        __seq_end_write(&p->procinfo->coremap_seqctr);
1700}
1701
1702/* Gives process p the additional num cores listed in pcorelist.  If the proc is
1703 * not RUNNABLE_M or RUNNING_M, this will fail and allocate none of the cores
1704 * (and return -1).  If you're RUNNING_M, this will startup your new cores at
1705 * the entry point with their virtual IDs (or restore a preemption).  If you're
1706 * RUNNABLE_M, you should call __proc_run_m after this so that the process can
1707 * start to use its cores.  In either case, this returns 0.
1708 *
1709 * If you're *_S, make sure your core0's TF is set (which is done when coming in
1710 * via arch/trap.c and we are RUNNING_S), change your state, then call this.
1711 * Then call __proc_run_m().
1712 *
1713 * The reason I didn't bring the _S cases from core_request over here is so we
1714 * can keep this family of calls dealing with only *_Ms, to avoid caring if
1715 * this is called from another core, and to avoid the _S -> _M transition.
1716 *
1717 * WARNING: You must hold the proc_lock before calling this! */
1718int __proc_give_cores(struct proc *p, uint32_t *pc_arr, uint32_t num)
1719{
1720        /* should never happen: */
1721        assert(num + p->procinfo->num_vcores <= MAX_NUM_CORES);
1722        switch (p->state) {
1723        case (PROC_RUNNABLE_S):
1724        case (PROC_RUNNING_S):
1725                warn("Don't give cores to a process in a *_S state!\n");
1726                return -1;
1727        case (PROC_DYING):
1728        case (PROC_DYING_ABORT):
1729        case (PROC_WAITING):
1730                /* can't accept, just fail */
1731                return -1;
1732        case (PROC_RUNNABLE_M):
1733                __proc_give_cores_runnable(p, pc_arr, num);
1734                break;
1735        case (PROC_RUNNING_M):
1736                __proc_give_cores_running(p, pc_arr, num);
1737                break;
1738        default:
1739                panic("Weird state(%s) in %s()", procstate2str(p->state),
1740                      __FUNCTION__);
1741        }
1742        /* TODO: considering moving to the ksched (hard, due to yield) */
1743        /* TODO: consider moving this to the ksched (hard, due to yield) */
1744        return 0;
1745}
1746
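/* A minimal sketch of the scheduler-side calling convention described above:
 * hold the proc_lock, hand over the cores, and __proc_run_m() a RUNNABLE_M
 * proc so it starts using them.  __proc_run_m() is defined earlier in this
 * file; the helper name here is illustrative, not the actual ksched code. */
static void example_ksched_give_cores(struct proc *p, uint32_t *pc_arr,
                                      uint32_t num)
{
        spin_lock(&p->proc_lock);
        if (__proc_give_cores(p, pc_arr, num)) {
                /* *_S, WAITING, or DYING: no cores were allocated */
                spin_unlock(&p->proc_lock);
                return;
        }
        /* RUNNING_M procs had their new cores started via __startcore kmsgs
         * in __proc_give_cores_running(); RUNNABLE_M procs still need a kick.
         */
        if (p->state == PROC_RUNNABLE_M)
                __proc_run_m(p);
        spin_unlock(&p->proc_lock);
}
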
1747/********** Core revocation (bulk and single) ***********/
1748
1749/* Revokes a single vcore from a process (unmaps or sends a KMSG to unmap). */
1750static void __proc_revoke_core(struct proc *p, uint32_t vcoreid, bool preempt)
1751{
1752        uint32_t pcoreid = get_pcoreid(p, vcoreid);
1753        struct preempt_data *vcpd;
1754        if (preempt) {
1755                /* Lock the vcore's state (necessary for preemption recovery) */
1756                vcpd = &p->procdata->vcore_preempt_data[vcoreid];
1757                atomic_or(&vcpd->flags, VC_K_LOCK);
1758                send_kernel_message(pcoreid, __preempt, (long)p, 0, 0,
1759                                    KMSG_ROUTINE);
1760        } else {
1761                send_kernel_message(pcoreid, __death, (long)p, 0, 0,
1762                                    KMSG_ROUTINE);
1763        }
1764}
1765
1766/* Revokes all cores from the process (unmaps or sends KMSGs). */
1767static void __proc_revoke_allcores(struct proc *p, bool preempt)
1768{
1769        struct vcore *vc_i;
1770
1771        /* TODO: if we ever get broadcast messaging, use it here (still need to
1772         * lock the vcores' states for preemption) */
1773        TAILQ_FOREACH(vc_i, &p->online_vcs, list)
1774                __proc_revoke_core(p, vcore2vcoreid(p, vc_i), preempt);
1775}
1776
1777/* Might be faster to scan the vcoremap than to walk the list... */
1778static void __proc_unmap_allcores(struct proc *p)
1779{
1780        struct vcore *vc_i;
1781        TAILQ_FOREACH(vc_i, &p->online_vcs, list)
1782                __unmap_vcore(p, vcore2vcoreid(p, vc_i));
1783}
1784
1785/* Takes (revoke via kmsg or unmap) from process p the num cores listed in
1786 * pc_arr.  Will preempt if 'preempt' is set.  o/w, no state will be saved, etc.
1787 * Don't use this for taking all of a process's cores.
1788 *
1789 * Make sure you hold the lock when you call this, and make sure that the pcore
1790 * actually belongs to the proc, non-trivial due to other __preempt messages. */
1791void __proc_take_corelist(struct proc *p, uint32_t *pc_arr, uint32_t num,
1792                          bool preempt)
1793{
1794        struct vcore *vc;
1795        uint32_t vcoreid;
1796        assert(p->state & (PROC_RUNNING_M | PROC_RUNNABLE_M));
1797        __seq_start_write(&p->procinfo->coremap_seqctr);
1798        for (int i = 0; i < num; i++) {
1799                vcoreid = get_vcoreid(p, pc_arr[i]);
1800                /* Sanity check */
1801                assert(pc_arr[i] == get_pcoreid(p, vcoreid));
1802                /* Revoke / unmap core */
1803                if (p->state == PROC_RUNNING_M)
1804                        __proc_revoke_core(p, vcoreid, preempt);
1805                __unmap_vcore(p, vcoreid);
1806                /* Change lists for the vcore.  Note, the vcore is already
1807                 * unmapped and/or the messages are already in flight.  The only
1808                 * code that looks at the lists without holding the lock is
1809                 * event code. */
1810                vc = vcoreid2vcore(p, vcoreid);
1811                TAILQ_REMOVE(&p->online_vcs, vc, list);
1812                /* even for single preempts, we use the inactive list.  bulk
1813                 * preempt is only used for when we take everything. */
1814                TAILQ_INSERT_HEAD(&p->inactive_vcs, vc, list);
1815        }
1816        p->procinfo->num_vcores -= num;
1817        __seq_end_write(&p->procinfo->coremap_seqctr);
1818        p->procinfo->res_grant[RES_CORES] -= num;
1819}
1820
1821/* Takes all cores from a process (revoke via kmsg or unmap), putting them on
1822 * the appropriate vcore list, filling pc_arr with the pcores revoked, and
1823 * returning the number of entries in pc_arr.
1824 *
1825 * Make sure pc_arr is big enough to handle num_vcores().
1826 * Make sure you hold the lock when you call this. */
1827uint32_t __proc_take_allcores(struct proc *p, uint32_t *pc_arr, bool preempt)
1828{
1829        struct vcore *vc_i, *vc_temp;
1830        uint32_t num = 0;
1831        assert(p->state & (PROC_RUNNING_M | PROC_RUNNABLE_M));
1832        __seq_start_write(&p->procinfo->coremap_seqctr);
1833        /* Write out which pcores we're going to take */
1834        TAILQ_FOREACH(vc_i, &p->online_vcs, list)
1835                pc_arr[num++] = vc_i->pcoreid;
1836        /* Revoke if they are running, and unmap.  Both of these need the online
1837         * list to not be changed yet. */
1838        if (p->state == PROC_RUNNING_M)
1839                __proc_revoke_allcores(p, preempt);
1840        __proc_unmap_allcores(p);
1841        /* Move the vcores from online to the head of the appropriate list */
1842        TAILQ_FOREACH_SAFE(vc_i, &p->online_vcs, list, vc_temp) {
1843                /* TODO: we may want a TAILQ_CONCAT_HEAD, or something that does
1844                 * that */
1845                TAILQ_REMOVE(&p->online_vcs, vc_i, list);
1846                /* Put the cores on the appropriate list */
1847                if (preempt)
1848                        TAILQ_INSERT_HEAD(&p->bulk_preempted_vcs, vc_i, list);
1849                else
1850                        TAILQ_INSERT_HEAD(&p->inactive_vcs, vc_i, list);
1851        }
1852        assert(TAILQ_EMPTY(&p->online_vcs));
1853        assert(num == p->procinfo->num_vcores);
1854        p->procinfo->num_vcores = 0;
1855        __seq_end_write(&p->procinfo->coremap_seqctr);
1856        p->procinfo->res_grant[RES_CORES] = 0;
1857        return num;
1858}
1859
1860/* Helper to do the vcore->pcore and inverse mapping.  Hold the lock when
1861 * calling. */
1862void __map_vcore(struct proc *p, uint32_t vcoreid, uint32_t pcoreid)
1863{
1864        p->procinfo->vcoremap[vcoreid].pcoreid = pcoreid;
1865        p->procinfo->vcoremap[vcoreid].valid = TRUE;
1866        p->procinfo->pcoremap[pcoreid].vcoreid = vcoreid;
1867        p->procinfo->pcoremap[pcoreid].valid = TRUE;
1868}
1869
1870/* Helper to unmap the vcore->pcore and inverse mapping.  Hold the lock when
1871 * calling. */
1872void __unmap_vcore(struct proc *p, uint32_t vcoreid)
1873{
1874        p->procinfo->pcoremap[p->procinfo->vcoremap[vcoreid].pcoreid].valid =
1875                FALSE;
1876        p->procinfo->vcoremap[vcoreid].valid = FALSE;
1877}
1878
1879/* Stop running whatever context is on this core and load a known-good cr3.
1880 * Note this leaves no trace of what was running.  This "leaves" the process's
1881 * context.
1882 *
1883 * This does not clear the owning proc.  Use the other helper for that.
1884 *
1885 * Returns whether or not there was a process present. */
1886bool abandon_core(void)
1887{
1888        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
1889
1890        /* Syscalls that don't return will ultimately call abandon_core(), so we
1891         * need to make sure we don't think we are still working on a syscall.
1892         * */
1893        pcpui->cur_kthread->sysc = 0;
1894        pcpui->cur_kthread->errbuf = 0; /* just in case */
1895        if (pcpui->cur_proc) {
1896                __abandon_core();
1897                return true;
1898        }
1899        return false;
1900}
1901
1902/* Helper to clear the core's owning proc and manage refcnting.  Pass in
1903 * core_id() to save a couple core_id() calls. */
1904void clear_owning_proc(uint32_t coreid)
1905{
1906        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
1907        struct proc *p = pcpui->owning_proc;
1908
1909        __clear_owning_proc(coreid);
1910        pcpui->owning_proc = 0;
1911        pcpui->owning_vcoreid = 0xdeadbeef;
1912        pcpui->cur_ctx = 0;     /* catch bugs for now (may go away) */
1913        if (p)
1914                proc_decref(p);
1915}
1916
1917/* Switches to the address space/context of new_p, doing nothing if we are
1918 * already in new_p.  You can pass NULL for a noop.
1919 *
1920 * This won't add extra refcnts or anything, and needs to be
1921 * paired with switch_back() at the end of whatever function you are in.
1922 * Specifically, the uncounted refs are one for the old_proc, which is passed
1923 * back to the caller, and new_p is getting placed in cur_proc. */
1924uintptr_t switch_to(struct proc *new_p)
1925{
1926        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
1927        struct kthread *kth = pcpui->cur_kthread;
1928        struct proc *old_proc;
1929        uintptr_t ret;
1930
1931        if (!new_p)
1932                return -1;
1933        old_proc = pcpui->cur_proc;             /* uncounted ref */
1934        /* If we aren't the proc already, then switch to it */
1935        if (old_proc != new_p) {
1936                pcpui->cur_proc = new_p;        /* uncounted ref */
1937                lcr3(new_p->env_cr3);
1938        }
1939        ret = (uintptr_t)old_proc;
1940        if (is_ktask(kth)) {
1941                if (!(kth->flags & KTH_SAVE_ADDR_SPACE)) {
1942                        kth->flags |= KTH_SAVE_ADDR_SPACE;
1943                        /* proc pointers are aligned; we can use the lower bit
1944                         * as a signal to turn off SAVE_ADDR_SPACE. */
1945                        ret |= 0x1;
1946                }
1947        }
1948        return ret;
1949}
1950
1951/* This switches back from new_p to the original process.  Pair it with
1952 * switch_to(), and pass in its return value for old_ret. */
1953void switch_back(struct proc *new_p, uintptr_t old_ret)
1954{
1955        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
1956        struct kthread *kth = pcpui->cur_kthread;
1957        struct proc *old_proc;
1958
1959        if (!new_p)
1960                return;
1961        if (is_ktask(kth)) {
1962                if (old_ret & 0x1) {
1963                        kth->flags &= ~KTH_SAVE_ADDR_SPACE;
1964                        old_ret &= ~0x1;
1965                }
1966        }
1967        old_proc = (struct proc*)old_ret;
1968        if (old_proc != new_p) {
1969                pcpui->cur_proc = old_proc;
1970                if (old_proc)
1971                        lcr3(old_proc->env_cr3);
1972                else
1973                        lcr3(boot_cr3);
1974        }
1975}
1976
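/* Minimal usage sketch of the switch_to()/switch_back() pairing described
 * above (the function name is illustrative): the token returned by switch_to()
 * is handed back, unmodified, to switch_back(). */
static void example_switch_to_pairing(struct proc *target)
{
        uintptr_t token = switch_to(target);

        /* ... briefly operate in target's address space, e.g. touch its user
         * memory via the umem helpers ... */
        switch_back(target, token);
}
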
1977/* Will send a TLB shootdown message to every vcore in the main address space
1978 * (aka, all vcores for now).  The message will take the start and end virtual
1979 * addresses as well, in case we want to be more clever about how much we
1980 * shootdown and batching our messages.  Should do the sanity checks on
1981 * rounding up and down in this function too.
1982 *
1983 * Would be nice to have a broadcast kmsg at this point.  Note this may send a
1984 * message to the calling core (interrupting it, possibly while holding the
1985 * proc_lock).  We don't need to process routine messages since it's an
1986 * immediate message. */
1987void proc_tlbshootdown(struct proc *p, uintptr_t start, uintptr_t end)
1988{
1989        /* TODO: need a better way to find cores running our address space.  we
1990         * can have kthreads running syscalls, async calls, processes being
1991         * created. */
1992        struct vcore *vc_i;
1993
1994        /* TODO: we might be able to avoid locking here in the future (we must
1995         * hit all online, and we can check __mapped).  it'll be complicated. */
1996        spin_lock(&p->proc_lock);
1997        switch (p->state) {
1998        case (PROC_RUNNING_S):
1999                tlbflush();
2000                break;
2001        case (PROC_RUNNING_M):
2002                /* TODO: (TLB) sanity checks and rounding on the ranges.
2003                 *
2004                 * We need to make sure that once a core that was online has
2005                 * been removed from the online list, then it must receive a TLB
2006                 * flush (abandon_core()) before running the process again.
2007                 * Either that, or make other decisions about who to
2008                 * TLB-shootdown. */
2009                TAILQ_FOREACH(vc_i, &p->online_vcs, list) {
2010                        send_kernel_message(vc_i->pcoreid, __tlbshootdown,
2011                                            start, end, 0, KMSG_IMMEDIATE);
2012                }
2013                break;
2014        default:
2015                /* TODO: til we fix shootdowns, there are some odd cases where
2016                 * we have the address space loaded, but the state is in
2017                 * transition. */
2018                if (p == current)
2019                        tlbflush();
2020        }
2021        spin_unlock(&p->proc_lock);
2022        proc_iotlb_flush(p);
2023}
2024
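/* Sketch of the expected calling pattern (illustrative only): whoever edits
 * p's page tables for a range follows up with a shootdown over that same
 * range, so no core keeps stale translations. */
static void example_unmap_then_shootdown(struct proc *p, uintptr_t va,
                                         size_t len)
{
        /* ... remove or change the PTEs for [va, va + len) under the
         * appropriate mm locks ... */
        proc_tlbshootdown(p, va, va + len);
}
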
2025/* Helper, used by __startcore and __set_curctx, which sets up cur_ctx to run a
2026 * given process's vcore.  Caller needs to set up things like owning_proc and
2027 * whatnot.  Note that we might not have p loaded as current. */
2028static void __set_curctx_to_vcoreid(struct proc *p, uint32_t vcoreid,
2029                                    uint32_t old_nr_preempts_sent)
2030{
2031        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
2032        struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[vcoreid];
2033        struct vcore *vc = vcoreid2vcore(p, vcoreid);
2034
2035        /* Spin until our vcore's old preemption is done.  When __SC was sent,
2036         * we were told what the nr_preempts_sent was at that time.  Once that
2037         * many are done, it is time for us to run.  This forces a
2038         * 'happens-before' ordering on a __PR of our VC before this __SC of the
2039         * VC.  Note the nr_done should not exceed old_nr_sent, since further
2040         * __PR are behind this __SC in the KMSG queue. */
2041        while (old_nr_preempts_sent != vc->nr_preempts_done)
2042                cpu_relax();
2043        /* read nr_done before any other rd or wr.  CPU mb in the atomic. */
2044        cmb();
2045        /* Mark this vcore as no longer preempted.  No danger of clobbering
2046         * other writes, since this would get turned on in __preempt (which
2047         * can't be concurrent with this function on this core), and the atomic
2048         * is just toggling the one bit (a concurrent VC_K_LOCK will work) */
2049        atomic_and(&vcpd->flags, ~VC_PREEMPTED);
2050        /* Once the VC is no longer preempted, we allow it to receive msgs.  We
2051         * could let userspace do it, but handling it here makes it easier for
2052         * them to handle_indirs (when they turn this flag off).  Note the
2053         * atomics provide the needed barriers (cmb and mb on flags). */
2054        atomic_or(&vcpd->flags, VC_CAN_RCV_MSG);
2055        printd("[kernel] startcore on physical core %d for process %d's vcore %d\n",
2056               core_id(), p->pid, vcoreid);
2057        /* If notifs are disabled, the vcore was in vcore context and we need to
2058         * restart the vcore_ctx.  o/w, we give them a fresh vcore (which is
2059         * also what happens the first time a vcore comes online).  No matter
2060         * what, they'll restart in vcore context.  It's just a matter of
2061         * whether or not it is the old, interrupted vcore context. */
2062        if (vcpd->notif_disabled) {
2063                /* copy-in the tf we'll pop, then set all security-related
2064                 * fields */
2065                pcpui->actual_ctx = vcpd->vcore_ctx;
2066                proc_secure_ctx(&pcpui->actual_ctx);
2067        } else { /* not restarting from a preemption, use a fresh vcore */
2068                assert(vcpd->vcore_stack);
2069                proc_init_ctx(&pcpui->actual_ctx, vcoreid, vcpd->vcore_entry,
2070                              vcpd->vcore_stack, vcpd->vcore_tls_desc);
2071                /* Disable/mask active notifications for fresh vcores */
2072                vcpd->notif_disabled = TRUE;
2073        }
2074        /* Regardless of whether or not we have a 'fresh' VC, we need to restore
2075         * the FPU state for the VC according to VCPD (which means either a
2076         * saved FPU state or a brand new init).  Starting a fresh VC is just
2077         * referring to the GP context we run.  The vcore itself needs to have
2078         * the FPU state loaded from when it previously ran and was saved (or a
2079         * fresh FPU if it wasn't saved).  For fresh FPUs, the main purpose is
2080         * for limiting info leakage.  I think VCs that don't need FPU state for
2081         * some reason (like having a current_uthread) can handle any sort of
2082         * FPU state, since it gets sorted when they pop their next uthread.
2083         *
2084         * Note this can cause a GP fault on x86 if the state is corrupt.  In
2085         * lieu of reading in the huge FP state and mucking with mxcsr_mask, we
2086         * should handle this like a KPF on user code. */
2087        restore_vc_fp_state(vcpd);
2088        /* cur_ctx was built above (in actual_ctx), now use it */
2089        pcpui->cur_ctx = &pcpui->actual_ctx;
2090        /* this cur_ctx will get run when the kernel returns / idles */
2091        vcore_account_online(p, vcoreid);
2092}
2093
2094/* Changes the calling vcore to be vcoreid.  enable_my_notif tells us the state
2095 * the calling vcore wants to be left in.  It will look like caller_vcoreid
2096 * was preempted.  Note we don't care about notif_pending.
2097 *
2098 * Will return:
2099 *      0 if we successfully changed to the target vcore.
2100 *      -EBUSY if the target vcore is already mapped (a good kind of failure)
2101 *      -EAGAIN if we failed for some other reason and need to try again.  For
2102 *      example, the caller could be preempted, and we never even attempted to
2103 *      change.
2104 *      -EINVAL some userspace bug */
2105int proc_change_to_vcore(struct proc *p, uint32_t new_vcoreid,
2106                         bool enable_my_notif)
2107{
2108        uint32_t caller_vcoreid, pcoreid = core_id();
2109        struct per_cpu_info *pcpui = &per_cpu_info[pcoreid];
2110        struct preempt_data *caller_vcpd;
2111        struct vcore *caller_vc, *new_vc;
2112        struct event_msg preempt_msg = {0};
2113        int retval = -EAGAIN;   /* by default, try again */
2114
2115        /* Need to not reach outside the vcoremap, which might be smaller in the
2116         * future, but should always be as big as max_vcores */
2117        assert(proc_vcoreid_is_safe(p, new_vcoreid));
2118        /* Need to lock to prevent concurrent vcore changes, like in yield. */
2119        spin_lock(&p->proc_lock);
2120        /* new_vcoreid is already running, abort */
2121        if (vcore_is_mapped(p, new_vcoreid)) {
2122                retval = -EBUSY;
2123                goto out_locked;
2124        }
2125        /* Need to make sure our vcore is allowed to switch.  We might have a
2126         * __preempt, __death, etc, coming in.  Similar to yield. */
2127        switch (p->state) {
2128        case (PROC_RUNNING_M):
2129                break;          /* the only case we can proceed */
2130        case (PROC_RUNNING_S):  /* user bug, just return */
2131        case (PROC_DYING):      /* incoming __death */
2132        case (PROC_DYING_ABORT):
2133        case (PROC_RUNNABLE_M): /* incoming (bulk) preempt/myield TODO:(BULK) */
2134                goto out_locked;
2135        default:
2136                panic("Weird state(%s) in %s()", procstate2str(p->state),
2137                      __FUNCTION__);
2138        }
2139        /* This is which vcore this pcore thinks it is, regardless of any
2140         * unmappings that may have happened remotely (with __PRs waiting to
2141         * run) */
2142        caller_vcoreid = pcpui->owning_vcoreid;
2143        caller_vc = vcoreid2vcore(p, caller_vcoreid);
2144        caller_vcpd = &p->procdata->vcore_preempt_data[caller_vcoreid];
2145        /* This is how we detect whether or not a __PR happened.  If it did,
2146         * just abort and handle the kmsg.  No new __PRs are coming since we
2147         * hold the lock.  This also detects a __PR followed by a __SC for the
2148         * same VC. */
2149        if (caller_vc->nr_preempts_sent != caller_vc->nr_preempts_done)
2150                goto out_locked;
2151        /* Sanity checks.  If we were preempted or are dying, we should have
2152         * noticed by now. */
2153        assert(is_mapped_vcore(p, pcoreid));
2154        assert(caller_vcoreid == get_vcoreid(p, pcoreid));
2155        /* Should only call from vcore context */
2156        if (!caller_vcpd->notif_disabled) {
2157                retval = -EINVAL;
2158                printk("[kernel] You tried to change vcores from uth ctx\n");
2159                goto out_locked;
2160        }
2161        /* Ok, we're clear to do the switch.  Let's figure out who the new one is
2162         */
2163        new_vc = vcoreid2vcore(p, new_vcoreid);
2164        printd("[kernel] changing vcore %d to vcore %d\n", caller_vcoreid,
2165               new_vcoreid);
2166        /* enable_my_notif signals how we'll be restarted */
2167        if (enable_my_notif) {
2168                /* if they set this flag, then the vcore can just restart from
2169                 * scratch, and we don't care about either the uthread_ctx or
2170                 * the vcore_ctx. */
2171                caller_vcpd->notif_disabled = FALSE;
2172                /* Don't need to save the FPU.  There should be no uthread or
2173                 * other reason to return to the FPU state.  But we do need to
2174                 * finalize the context, even though we are throwing it away.
2175                 * We need to return the pcore to a state where it can run any
2176                 * context and not be bound to the old context. */
2177                arch_finalize_ctx(pcpui->cur_ctx);
2178        } else {
2179                /* need to set up the calling vcore's ctx so that it'll get
2180                 * restarted by __startcore, to make the caller look like it was
2181                 * preempted. */
2182                copy_current_ctx_to(&caller_vcpd->vcore_ctx);
2183                save_vc_fp_state(caller_vcpd);
2184        }
2185        /* Mark our core as preempted (for userspace recovery).  Userspace
2186         * checks this in handle_indirs, and it needs to check the mbox
2187         * regardless of enable_my_notif.  This does mean cores that change-to
2188         * with no intent to return will be tracked as PREEMPTED until they
2189         * start back up (maybe forever). */
2190        atomic_or(&caller_vcpd->flags, VC_PREEMPTED);
2191        /* Either way, unmap and offline our current vcore */
2192        /* Move the caller from online to inactive */
2193        TAILQ_REMOVE(&p->online_vcs, caller_vc, list);
2194        /* We don't bother with the notif_pending race.  note that notif_pending
2195         * could still be set.  this was a preempted vcore, and userspace will
2196         * need to deal with missed messages (preempt_recover() will handle
2197         * that) */
2198        TAILQ_INSERT_HEAD(&p->inactive_vcs, caller_vc, list);
2199        /* Move the new one from inactive to online */
2200        TAILQ_REMOVE(&p->inactive_vcs, new_vc, list);
2201        TAILQ_INSERT_TAIL(&p->online_vcs, new_vc, list);
2202        /* Change the vcore map */
2203        __seq_start_write(&p->procinfo->coremap_seqctr);
2204        __unmap_vcore(p, caller_vcoreid);
2205        __map_vcore(p, new_vcoreid, pcoreid);
2206        __seq_end_write(&p->procinfo->coremap_seqctr);
2207        vcore_account_offline(p, caller_vcoreid);
2208        /* Send either a PREEMPT msg or a CHECK_MSGS msg.  If they said to
2209         * enable_my_notif, then all userspace needs is to check messages, not a
2210         * full preemption recovery. */
2211        preempt_msg.ev_type = (enable_my_notif ? EV_CHECK_MSGS :
2212                               EV_VCORE_PREEMPT);
2213        preempt_msg.ev_arg2 = caller_vcoreid;   /* arg2 is 32 bits */
2214        /* Whenever we send msgs with the proc locked, we need at least 1
2215         * online.  In this case, it's the one we just changed to. */
2216        assert(!TAILQ_EMPTY(&p->online_vcs));
2217        send_kernel_event(p, &preempt_msg, new_vcoreid);
2218        /* So this core knows which vcore is here. (cur_proc and owning_proc are
2219         * already correct): */
2220        pcpui->owning_vcoreid = new_vcoreid;
2221        /* Until we set_curctx, we don't really have a valid current tf.  The
2222         * stuff in that old one is from our previous vcore, not the current
2223         * owning_vcoreid.  This matters for other KMSGS that will run before
2224         * __set_curctx (like __notify). */
2225        pcpui->cur_ctx = 0;
2226        /* Need to send a kmsg to finish.  We can't set_curctx til the __PR is
2227         * done, but we can't spin right here while holding the lock (can't spin
2228         * while waiting on a message, roughly) */
2229        send_kernel_message(pcoreid, __set_curctx, (long)p, (long)new_vcoreid,
2230                            (long)new_vc->nr_preempts_sent, KMSG_ROUTINE);
2231        retval = 0;
2232        /* Fall through to exit */
2233out_locked:
2234        spin_unlock(&p->proc_lock);
2235        return retval;
2236}
2237
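/* For reference, the documented return values above map onto caller behavior
 * roughly as follows (an illustrative dispatch, not the actual sys_change_vcore
 * path): */
static long example_change_vcore_dispatch(struct proc *p, uint32_t new_vcoreid,
                                          bool enable_my_notif)
{
        int ret = proc_change_to_vcore(p, new_vcoreid, enable_my_notif);

        /* 0:       the __set_curctx kmsg finishes the switch when we return.
         * -EBUSY:  the target vcore was already mapped; benign, pick another.
         * -EAGAIN: something changed under us (e.g. a pending __preempt);
         *          return so the kmsg can run, and let userspace retry.
         * -EINVAL: not called from vcore context - a userspace bug. */
        return ret;
}
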
2238/* Kernel message handler to start a process's context on this core, when the
2239 * core next considers running a process.  Tightly coupled with __proc_run_m().
2240 * Interrupts are disabled. */
2241void __startcore(uint32_t srcid, long a0, long a1, long a2)
2242{
2243        uint32_t vcoreid = (uint32_t)a1;
2244        uint32_t coreid = core_id();
2245        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
2246        struct proc *p_to_run = (struct proc *)a0;
2247        uint32_t old_nr_preempts_sent = (uint32_t)a2;
2248
2249        assert(p_to_run);
2250        /* Can not be any TF from a process here already */
2251        assert(!pcpui->owning_proc);
2252        /* the sender of the kmsg increfed already for this saved ref to
2253         * p_to_run */
2254        pcpui->owning_proc = p_to_run;
2255        pcpui->owning_vcoreid = vcoreid;
2256        /* sender increfed again, assuming we'd install to cur_proc.  only do
2257         * this if no one else is there.  this is an optimization, since we
2258         * expect to send these __startcores to idle cores, and this saves a
2259         * scramble to incref when all of the cores restartcore/startcore later.
2260         * Keep in sync with __proc_give_cores() and __proc_run_m(). */
2261        if (!pcpui->cur_proc) {
2262                pcpui->cur_proc = p_to_run; /* install the ref to cur_proc */
2263                lcr3(p_to_run->env_cr3);
2264        } else {
2265                proc_decref(p_to_run);
2266        }
2267        /* Note we are not necessarily in the cr3 of p_to_run */
2268        /* Now that we sorted refcnts and know p / which vcore it should be, set
2269         * up pcpui->cur_ctx so that it will run that particular vcore */
2270        __set_curctx_to_vcoreid(p_to_run, vcoreid, old_nr_preempts_sent);
2271}
2272
2273/* Kernel message handler to load a proc's vcore context on this core.  Similar
2274 * to __startcore, except it is used when p already controls the core (e.g.
2275 * change_to).  Since the core is already controlled, pcpui such as owning proc,
2276 * vcoreid, and cur_proc are all already set. */
2277void __set_curctx(uint32_t srcid, long a0, long a1, long a2)
2278{
2279        struct proc *p = (struct proc*)a0;
2280        uint32_t vcoreid = (uint32_t)a1;
2281        uint32_t old_nr_preempts_sent = (uint32_t)a2;
2282        __set_curctx_to_vcoreid(p, vcoreid, old_nr_preempts_sent);
2283}
2284
2285/* Bail out if it's the wrong process, or if they no longer want a notif.  Try
2286 * not to grab locks or write access to anything that isn't per-core in here. */
2287void __notify(uint32_t srcid, long a0, long a1, long a2)
2288{
2289        uint32_t vcoreid, coreid = core_id();
2290        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
2291        struct preempt_data *vcpd;
2292        struct proc *p = (struct proc*)a0;
2293
2294        /* Not the right proc */
2295        if (p != pcpui->owning_proc)
2296                return;
2297        /* the core might be owned, but not have a valid cur_ctx (if we're in
2298         * the process of changing) */
2299        if (!pcpui->cur_ctx)
2300                return;
2301        /* Common cur_ctx sanity checks.  Note cur_ctx could be an _S's scp_ctx
2302         */
2303        vcoreid = pcpui->owning_vcoreid;
2304        vcpd = &p->procdata->vcore_preempt_data[vcoreid];
2305        /* for SCPs that haven't (and might never) call vc_event_init, like
2306         * rtld.  it is harmless for MCPs to check this */
2307        if (!scp_is_vcctx_ready(vcpd))
2308                return;
2309        printd("received active notification for proc %d's vcore %d on pcore %d\n",
2310               p->procinfo->pid, vcoreid, coreid);
2311        /* sort signals.  notifs are now masked, like an interrupt gate */
2312        if (vcpd->notif_disabled)
2313                return;
2314        vcpd->notif_disabled = TRUE;
2315        /* save the old ctx in the uthread slot, build and pop a new one.  Note
2316         * that silly state isn't our business for a notification. */
2317        copy_current_ctx_to(&vcpd->uthread_ctx);
2318        memset(pcpui->cur_ctx, 0, sizeof(struct user_context));
2319        proc_init_ctx(pcpui->cur_ctx, vcoreid, vcpd->vcore_entry,
2320                      vcpd->vcore_stack, vcpd->vcore_tls_desc);
2321        /* this cur_ctx will get run when the kernel returns / idles */
2322}
2323
2324void __preempt(uint32_t srcid, long a0, long a1, long a2)
2325{
2326        uint32_t vcoreid, coreid = core_id();
2327        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
2328        struct preempt_data *vcpd;
2329        struct proc *p = (struct proc*)a0;
2330
2331        assert(p);
2332        if (p != pcpui->owning_proc) {
2333                panic("__preempt arrived for proc (%p) that was not owning (%p)!",
2334                      p, pcpui->owning_proc);
2335        }
2336        /* Common cur_ctx sanity checks */
2337        assert(pcpui->cur_ctx);
2338        assert(pcpui->cur_ctx == &pcpui->actual_ctx);
2339        vcoreid = pcpui->owning_vcoreid;
2340        vcpd = &p->procdata->vcore_preempt_data[vcoreid];
2341        printd("[kernel] received __preempt for proc %d's vcore %d on pcore %d\n",
2342               p->procinfo->pid, vcoreid, coreid);
2343        /* if notifs are disabled, the vcore is in vcore context (as far as
2344         * we're concerned), and we save it in the vcore slot. o/w, we save the
2345         * process's cur_ctx in the uthread slot, and when the vcore comes back
2346         * up it'll appear as if the uthread just took a notification. */
2347        if (vcpd->notif_disabled)
2348                copy_current_ctx_to(&vcpd->vcore_ctx);
2349        else
2350                copy_current_ctx_to(&vcpd->uthread_ctx);
2351        /* Userspace in a preemption handler on another core might be copying FP
2352         * state from memory (VCPD) at the moment, and if so we don't want to
2353         * clobber it.  In this rare case, our current core's FPU state should
2354         * be the same as whatever is in VCPD, so this shouldn't be necessary,
2355         * but the arch-specific save function might do something other than
2356         * write out bit-for-bit the exact same data.  Checking STEALING
2357         * suffices, since we hold the K_LOCK (preventing userspace from
2358         * starting a fresh STEALING phase concurrently). */
2359        if (!(atomic_read(&vcpd->flags) & VC_UTHREAD_STEALING))
2360                save_vc_fp_state(vcpd);
2361        /* Mark the vcore as preempted and unlock (was locked by the sender). */
2362        atomic_or(&vcpd->flags, VC_PREEMPTED);
2363        atomic_and(&vcpd->flags, ~VC_K_LOCK);
2364        /* either __preempt or proc_yield() ends the preempt phase. */
2365        p->procinfo->vcoremap[vcoreid].preempt_pending = 0;
2366        vcore_account_offline(p, vcoreid);
2367        /* make sure everything else hits before we finish the preempt */
2368        wmb();
2369        /* up the nr_done, which signals the next __startcore for this vc */
2370        p->procinfo->vcoremap[vcoreid].nr_preempts_done++;
2371        /* We won't restart the process later.  current gets cleared later when
2372         * we notice there is no owning_proc and we have nothing to do
2373         * (smp_idle, restartcore, etc) */
2374        clear_owning_proc(coreid);
2375}
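
/* A sketch of the other half of the preempt handshake: a later __startcore
 * spins until nr_preempts_done (bumped just above) catches up with the
 * nr_preempts_sent value it was handed, so it never restarts a vcore whose
 * __preempt is still in flight.  Illustration only; the real wait lives in
 * __set_curctx_to_vcoreid(), and example_wait_for_preempt is a hypothetical
 * name. */
static void example_wait_for_preempt(struct proc *p, uint32_t vcoreid,
                                     uint32_t nr_preempts_sent)
{
        struct vcore *vc = &p->procinfo->vcoremap[vcoreid];

        while (vc->nr_preempts_done != nr_preempts_sent)
                cpu_relax();
        /* Pairs with the wmb() in __preempt: once the counter is visible, so
         * are the context saves and flag updates that preceded it. */
        rmb();
}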
2376
2377/* Kernel message handler to clean up the core when a process is dying.
2378 * Note this leaves no trace of what was running.
2379 * A __death must arrive on a core the process still owns; arriving on a core
2380 * with no owner (or a different owner) is treated as a bug, per the panic. */
2381void __death(uint32_t srcid, long a0, long a1, long a2)
2382{
2383        uint32_t vcoreid, coreid = core_id();
2384        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
2385        struct proc *p = (struct proc*)a0;
2386
2387        assert(p);
2388        if (p != pcpui->owning_proc) {
2389                /* Older versions of Akaros thought it was OK to have a __death
2390                 * hit a core that no longer had a process.  I think it's a bug
2391                 * now. */
2392                panic("__death arrived for proc (%p) that was not owning (%p)!",
2393                      p, pcpui->owning_proc);
2394        }
2395        vcoreid = pcpui->owning_vcoreid;
2396        printd("[kernel] death on physical core %d for process %d's vcore %d\n",
2397               coreid, p->pid, vcoreid);
2398        vcore_account_offline(p, vcoreid);      /* in case anyone is counting */
2399        /* We won't restart the process later.  current gets cleared later when
2400         * we notice there is no owning_proc and we have nothing to do
2401         * (smp_idle, restartcore, etc). */
2402        arch_finalize_ctx(pcpui->cur_ctx);
2403        clear_owning_proc(coreid);
2404}
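
/* A hypothetical sender-side sketch for the handler above: revoking a core
 * from a dying process means unmapping the vcore and shooting __death at the
 * physical core.  The real revocation paths (used on the proc_destroy() path)
 * live earlier in this file and also handle the proc_lock and vcore lists;
 * example_send_death is not an existing helper. */
static void example_send_death(struct proc *p, struct vcore *vc)
{
        send_kernel_message(vc->pcoreid, __death, (long)p, 0, 0,
                            KMSG_IMMEDIATE);
}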
2405
2406/* Kernel message handler, usually sent IMMEDIATE, to shoot down virtual
2407 * addresses from a0 to a1. */
2408void __tlbshootdown(uint32_t srcid, long a0, long a1, long a2)
2409{
2410        /* TODO: (TLB) something more intelligent with the range */
2411        tlbflush();
2412}
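
/* One possible shape for the TODO above: invalidate the [a0, a1) range page by
 * page, falling back to a full flush when the range is large.  A sketch only:
 * tlb_flush_page() is a hypothetical per-page helper (e.g. invlpg on x86), not
 * an existing call; PGSIZE, PGSHIFT, and ROUNDDOWN are the usual page macros. */
static void example_tlbshootdown_range(uintptr_t start, uintptr_t end)
{
        /* Past a few dozen pages, one full flush tends to beat a long run of
         * single-page invalidations. */
        const size_t max_single_flushes = 64;

        if (((end - start) >> PGSHIFT) > max_single_flushes) {
                tlbflush();
                return;
        }
        for (uintptr_t va = ROUNDDOWN(start, PGSIZE); va < end; va += PGSIZE)
                tlb_flush_page(va);     /* hypothetical per-page invalidate */
}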
2413
2414void print_allpids(void)
2415{
2416        void print_proc_state(void *item, void *opaque)
2417        {
2418                struct proc *p = (struct proc*)item;
2419                assert(p);
2420                /* This actually adds an extra space, since no progname is
2421                 * ever PROC_PROGNAME_SZ bytes long: the \0 counts toward it. */
2422                printk("%8d %-*s %-10s %6d\n", p->pid, PROC_PROGNAME_SZ,
2423                       p->progname, procstate2str(p->state), p->ppid);
2424        }
2425        char dashes[PROC_PROGNAME_SZ];
2426        memset(dashes, '-', PROC_PROGNAME_SZ);
2427        dashes[PROC_PROGNAME_SZ - 1] = '\0';
2428        /* -5, for 'Name ' */
2429        printk("     PID Name %-*s State      Parent    \n",
2430               PROC_PROGNAME_SZ - 5, "");
2431        printk("------------------------------%s\n", dashes);
2432        spin_lock(&pid_hash_lock);
2433        hash_for_each(pid_hash, print_proc_state, NULL);
2434        spin_unlock(&pid_hash_lock);
2435}
2436
2437void proc_get_set(struct process_set *pset)
2438{
2439        void enum_proc(void *item, void *opaque)
2440        {
2441                struct proc *p = (struct proc*) item;
2442                struct process_set *pset = (struct process_set *) opaque;
2443
2444                if (pset->num_processes < pset->size) {
2445                        if (!kref_get_not_zero(&p->p_kref, 1))
2446                                return;
2447
2448                        pset->procs[pset->num_processes] = p;
2449                        pset->num_processes++;
2450                }
2451        }
2452
2453        static const size_t num_extra_alloc = 16;
2454
2455        pset->procs = NULL;
2456        do {
2457                if (pset->procs)
2458                        proc_free_set(pset);
2459                pset->size = atomic_read(&num_envs) + num_extra_alloc;
2460                pset->num_processes = 0;
2461                pset->procs = (struct proc **)
2462                        kzmalloc(pset->size * sizeof(struct proc *), MEM_WAIT);
2463
2464                spin_lock(&pid_hash_lock);
2465                hash_for_each(pid_hash, enum_proc, pset);
2466                spin_unlock(&pid_hash_lock);
2467
2468        } while (pset->num_processes == pset->size);
2469}
2470
2471void proc_free_set(struct process_set *pset)
2472{
2473        for (size_t i = 0; i < pset->num_processes; i++)
2474                proc_decref(pset->procs[i]);
2475        kfree(pset->procs);
2476}
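
/* A sketch of the intended calling pattern for the pair above: snapshot every
 * process (each with a reference held by proc_get_set()), walk the set without
 * holding the pid_hash lock, then drop the references.  The loop body is only
 * illustrative, and example_walk_all_procs is a hypothetical name. */
static void example_walk_all_procs(void)
{
        struct process_set pset;

        proc_get_set(&pset);
        for (size_t i = 0; i < pset.num_processes; i++)
                printk("PID %d: %s\n", pset.procs[i]->pid,
                       pset.procs[i]->progname);
        proc_free_set(&pset);
}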
2477
2478void print_proc_info(pid_t pid, int verbosity)
2479{
2480        int j = 0;
2481        uint64_t total_time = 0;
2482        struct proc *child, *p = pid2proc(pid);
2483        struct vcore *vc_i;
2484        struct preempt_data *vcpd;
2485
2486        if (!p) {
2487                printk("Bad PID.\n");
2488                return;
2489        }
2490        vcpd = &p->procdata->vcore_preempt_data[0];
2491        print_lock();
2492        spinlock_debug(&p->proc_lock);
2493        //spin_lock(&p->proc_lock); // No locking!!
2494        printk("struct proc: %p\n", p);
2495        printk("Program name: %s\n", p->progname);
2496        printk("PID: %d\n", p->pid);
2497        printk("PPID: %d\n", p->ppid);
2498        printk("State: %s (%p)\n", procstate2str(p->state), p->state);
2499        printk("\tIs %san MCP\n", p->procinfo->is_mcp ? "" : "not ");
2500        if (!scp_is_vcctx_ready(vcpd))
2501                printk("\tIs NOT vcctx ready\n");
2502        if (verbosity > 0 && !p->procinfo->is_mcp) {
2503                printk("Last saved SCP context:");
2504                backtrace_user_ctx(p, &p->scp_ctx);
2505        }
2506        printk("Refcnt: %d\n", atomic_read(&p->p_kref.refcount) - 1);
2507        printk("Flags: 0x%08x\n", p->env_flags);
2508        printk("CR3(phys): %p\n", p->env_cr3);
2509        printk("Num Vcores: %d\n", p->procinfo->num_vcores);
2510        printk("Vcore Lists (may be in flux w/o locking):\n----------------\n");
2511        printk("Online:\n");
2512        TAILQ_FOREACH(vc_i, &p->online_vcs, list)
2513                printk("\tVcore %d -> Pcore %d\n", vcore2vcoreid(p, vc_i),
2514                       vc_i->pcoreid);
2515        printk("Bulk Preempted:\n");
2516        TAILQ_FOREACH(vc_i, &p->bulk_preempted_vcs, list)
2517                printk("\tVcore %d\n", vcore2vcoreid(p, vc_i));
2518        printk("Inactive / Yielded:\n");
2519        TAILQ_FOREACH(vc_i, &p->inactive_vcs, list)
2520                printk("\tVcore %d\n", vcore2vcoreid(p, vc_i));
2521        if (verbosity > 0) {
2522                printk("Nsec Online, up to the last offlining:\n");
2523                printk("------------------------");
2524                for (int i = 0; i < p->procinfo->max_vcores; i++) {
2525                        uint64_t vc_time =
2526                                tsc2nsec(vcore_account_gettotal(p, i));
2527
2528                        if (i % 4 == 0)
2529                                printk("\n");
2530                        printk("  VC %3d: %14llu", i, vc_time);
2531                        total_time += vc_time;
2532                }
2533                printk("\n");
2534                printk("Total CPU-NSEC: %llu\n", total_time);
2535        }
2536        printk("Resources:\n------------------------\n");
2537        for (int i = 0; i < MAX_NUM_RESOURCES; i++)
2538                printk("\tRes type: %02d, amt wanted: %08d amt granted: %08d\n",
2539                       i, p->procdata->res_req[i].amt_wanted,
2540                       p->procinfo->res_grant[i]);
2541        printk("Open Files:\n");
2542        struct fd_table *files = &p->open_files;
2543
2544        if (spin_locked(&files->lock)) {
2545                spinlock_debug(&files->lock);
2546                printk("FILE LOCK HELD, ABORTING\n");
2547                print_unlock();
2548                proc_decref(p);
2549                return;
2550        }
2551        spin_lock(&files->lock);
2552        for (int i = 0; i < files->max_files; i++) {
2553                if (GET_BITMASK_BIT(files->open_fds->fds_bits, i)) {
2554                        printk("\tFD: %02d, ", i);
2555                        assert(files->fd[i].fd_chan);
2556                        print_chaninfo(files->fd[i].fd_chan);
2557                }
2558        }
2559        spin_unlock(&files->lock);
2560        printk("Children: (PID (struct proc *))\n");
2561        TAILQ_FOREACH(child, &p->children, sibling_link)
2562                printk("\t%d (%p)\n", child->pid, child);
2563        print_unlock();
2564        /* no locking / unlocking or refcnting */
2565        // spin_unlock(&p->proc_lock);
2566        proc_decref(p);
2567}
2568
2569/* Debugging function, checks what (process, vcore) is supposed to run on this
2570 * pcore.  Meant to be called from smp_idle() before halting. */
2571void check_my_owner(void)
2572{
2573        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
2574        void shazbot(void *item, void *opaque)
2575        {
2576                struct proc *p = (struct proc*)item;
2577                struct vcore *vc_i;
2578                assert(p);
2579                spin_lock(&p->proc_lock);
2580                TAILQ_FOREACH(vc_i, &p->online_vcs, list) {
2581                        /* Not necessarily a bug: a __startcore could be on
2582                         * the way while we're already listed as "online". */
2583                        if (vc_i->pcoreid == core_id()) {
2584                                /* Immediate message was sent, we should get it
2585                                 * when we enable interrupts, which should cause
2586                                 * us to skip cpu_halt() */
2587                                if (!STAILQ_EMPTY(&pcpui->immed_amsgs))
2588                                        continue;
2589                                printk("Pcore %d has no owner, but proc %p thinks vc %d is on it!\n",
2590                                       core_id(), p, vcore2vcoreid(p, vc_i));
2591                                spin_unlock(&p->proc_lock);
2592                                spin_unlock(&pid_hash_lock);
2593                                monitor(0);
2594                        }
2595                }
2596                spin_unlock(&p->proc_lock);
2597        }
2598        assert(!irq_is_enabled());
2599        if (!booting && !pcpui->owning_proc) {
2600                spin_lock(&pid_hash_lock);
2601                hash_for_each(pid_hash, shazbot, NULL);
2602                spin_unlock(&pid_hash_lock);
2603        }
2604}
2605