akaros/user/pthread/pthread.c
   1#include <ros/trapframe.h>
   2#include "pthread.h"
   3#include <parlib/vcore.h>
   4#include <parlib/mcs.h>
   5#include <stdlib.h>
   6#include <string.h>
   7#include <parlib/assert.h>
   8#include <errno.h>
   9#include <parlib/parlib.h>
  10#include <ros/event.h>
  11#include <parlib/arch/atomic.h>
  12#include <parlib/arch/arch.h>
  13#include <sys/queue.h>
  14#include <sys/mman.h>
  15#include <parlib/event.h>
  16#include <parlib/ucq.h>
  17#include <parlib/signal.h>
  18#include <parlib/arch/trap.h>
  19#include <parlib/ros_debug.h>
  20#include <parlib/stdio.h>
  21#include <sys/fork_cb.h>
  22
  23#include <parlib/alarm.h>
  24#include <futex.h>
  25#include <parlib/serialize.h>
  26
  27/* TODO: eventually, we probably want to split this into the pthreads interface
  28 * and a default 2LS.  That way, apps can use the pthreads interface and use any
   29 * 2LS.  Here are a few blockers:
  30 * - pthread_cleanup(): probably support at the uthread level
  31 * - attrs and creation: probably use a default stack size and handle detached
  32 * - getattrs_np: return -1, mostly due to the stackaddr.  Callers probably want
  33 *   a real 2LS operation.
  34 * Then we can split pthreads into parlib/default_sched.c (replaces thread0) and
  35 * pthread.c.  After that, we can have a signal handling thread (even for
  36 * 'thread0'), which allows us to close() or do other vcore-ctx-unsafe ops. */
  37
  38struct pthread_queue ready_queue = TAILQ_HEAD_INITIALIZER(ready_queue);
  39struct pthread_queue active_queue = TAILQ_HEAD_INITIALIZER(active_queue);
  40struct mcs_pdr_lock queue_lock;
  41int threads_ready = 0;
  42int threads_active = 0;
  43atomic_t threads_total;
  44bool need_tls = TRUE;
  45static uint64_t fork_generation;
  46#define INIT_FORK_GENERATION 1
  47
  48/* Array of per-vcore structs to manage waiting on syscalls and handling
   49 * overflow.  Init'd in pth_sched_init(). */
  50struct sysc_mgmt *sysc_mgmt = 0;
  51
  52/* Helper / local functions */
  53static int get_next_pid(void);
  54static inline void pthread_exit_no_cleanup(void *ret);
  55
  56/* Pthread 2LS operations */
  57static void pth_sched_init(void);
  58static void pth_sched_entry(void);
  59static void pth_thread_runnable(struct uthread *uthread);
  60static void pth_thread_paused(struct uthread *uthread);
  61static void pth_thread_blockon_sysc(struct uthread *uthread, void *sysc);
  62static void pth_thread_has_blocked(struct uthread *uthread, int flags);
  63static void pth_thread_refl_fault(struct uthread *uth,
  64                                  struct user_context *ctx);
  65static void pth_thread_exited(struct uthread *uth);
  66static struct uthread *pth_thread_create(void *(*func)(void *), void *arg);
  67static void pth_got_posix_signal(int sig_nr, struct siginfo *info);
  68static void pth_thread_bulk_runnable(uth_sync_t *wakees);
  69
  70/* Event Handlers */
  71static void pth_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
  72                               void *data);
  73
  74struct schedule_ops pthread_sched_ops = {
  75        .sched_init = pth_sched_init,
  76        .sched_entry = pth_sched_entry,
  77        .thread_runnable = pth_thread_runnable,
  78        .thread_paused = pth_thread_paused,
  79        .thread_blockon_sysc = pth_thread_blockon_sysc,
  80        .thread_has_blocked = pth_thread_has_blocked,
  81        .thread_refl_fault = pth_thread_refl_fault,
  82        .thread_exited = pth_thread_exited,
  83        .thread_create = pth_thread_create,
  84        .got_posix_signal = pth_got_posix_signal,
  85        .thread_bulk_runnable = pth_thread_bulk_runnable,
  86};
  87
  88struct schedule_ops *sched_ops = &pthread_sched_ops;
  89
  90/* Static helpers */
  91static void __pthread_free_stack(struct pthread_tcb *pt);
  92static int __pthread_allocate_stack(struct pthread_tcb *pt);
  93static void __pth_yield_cb(struct uthread *uthread, void *junk);
  94
  95/* Called from vcore entry.  Options usually include restarting whoever was
  96 * running there before or running a new thread.  Events are handled out of
  97 * event.c (table of function pointers, stuff like that). */
  98static void __attribute__((noreturn)) pth_sched_entry(void)
  99{
 100        uint32_t vcoreid = vcore_id();
 101        if (current_uthread) {
 102                /* Prep the pthread to run any pending posix signal handlers
 103                 * registered via pthread_kill once it is restored. */
 104                uthread_prep_pending_signals(current_uthread);
 105                /* Run the thread itself */
 106                run_current_uthread();
 107                assert(0);
 108        }
  109        /* No one currently running; let's get someone from the ready queue */
 110        struct pthread_tcb *new_thread = NULL;
 111
 112        /* Try to get a thread.  If we get one, we'll break out and run it.  If
 113         * not, we'll try to yield.  vcore_yield() might return, if we lost a
 114         * race and had a new event come in, one that may make us able to get a
 115         * new_thread */
 116        do {
 117                handle_events(vcoreid);
 118                __check_preempt_pending(vcoreid);
 119                mcs_pdr_lock(&queue_lock);
 120                TAILQ_FOREACH(new_thread, &ready_queue, tq_next) {
 121                        if (new_thread->fork_generation < fork_generation)
 122                                continue;
 123                        break;
 124                }
 125                if (new_thread) {
 126                        TAILQ_REMOVE(&ready_queue, new_thread, tq_next);
 127                        assert(new_thread->state == PTH_RUNNABLE);
 128                        new_thread->state = PTH_RUNNING;
 129                        TAILQ_INSERT_TAIL(&active_queue, new_thread, tq_next);
 130                        threads_active++;
 131                        threads_ready--;
 132                        mcs_pdr_unlock(&queue_lock);
 133                        /* If you see what looks like the same uthread running
 134                         * in multiple places, your list might be jacked up.
 135                         * Turn this on. */
 136                        printd("[P] got uthread %08p on vc %d state %08p flags %08p\n",
 137                               new_thread, vcoreid,
 138                               ((struct uthread*)new_thread)->state,
 139                               ((struct uthread*)new_thread)->flags);
 140                        break;
 141                }
 142                mcs_pdr_unlock(&queue_lock);
 143                /* no new thread, try to yield */
 144                printd("[P] No threads, vcore %d is yielding\n", vcore_id());
 145                /* TODO: you can imagine having something smarter here, like
 146                 * spin for a bit before yielding. */
 147                vcore_yield(FALSE);
 148        } while (1);
 149        /* Prep the pthread to run any pending posix signal handlers registered
 150         * via pthread_kill once it is restored. */
 151        uthread_prep_pending_signals((struct uthread*)new_thread);
 152        /* Run the thread itself */
 153        run_uthread((struct uthread*)new_thread);
 154        assert(0);
 155}
 156
 157/* Could move this, along with start_routine and arg, into the 2LSs */
 158static void __pthread_run(void)
 159{
 160        struct pthread_tcb *me = pthread_self();
 161        pthread_exit_no_cleanup(me->start_routine(me->arg));
 162}
 163
 164/* GIANT WARNING: if you make any changes to this, also change the broadcast
 165 * wakeups (cond var, barrier, etc) */
 166static void pth_thread_runnable(struct uthread *uthread)
 167{
 168        struct pthread_tcb *pthread = (struct pthread_tcb*)uthread;
 169
 170        /* At this point, the 2LS can see why the thread blocked and was woken
 171         * up in the first place (coupling these things together).  On the yield
 172         * path, the 2LS was involved and was able to set the state.  Now when
 173         * we get the thread back, we can take a look. */
 174        printd("pthread %08p runnable, state was %d\n", pthread,
 175               pthread->state);
 176        switch (pthread->state) {
 177        case (PTH_CREATED):
 178        case (PTH_BLK_YIELDING):
 179        case (PTH_BLK_SYSC):
 180        case (PTH_BLK_PAUSED):
 181        case (PTH_BLK_MUTEX):
 182        case (PTH_BLK_MISC):
 183                /* can do whatever for each of these cases */
 184                break;
 185        default:
 186                panic("Odd state %d for pthread %08p\n", pthread->state,
 187                      pthread);
 188        }
 189        pthread->state = PTH_RUNNABLE;
 190        /* Insert the newly created thread into the ready queue of threads.  It
 191         * will be removed from this queue later when vcore_entry() comes up */
 192        mcs_pdr_lock(&queue_lock);
 193        /* Again, GIANT WARNING: if you change this, change batch wakeup code */
 194        TAILQ_INSERT_TAIL(&ready_queue, pthread, tq_next);
 195        threads_ready++;
 196        mcs_pdr_unlock(&queue_lock);
 197        /* Smarter schedulers should look at the num_vcores() and how much work
 198         * is going on to make a decision about how many vcores to request. */
 199        vcore_request_more(threads_ready);
 200}
 201
 202/* For some reason not under its control, the uthread stopped running (compared
 203 * to yield, which was caused by uthread/2LS code).
 204 *
 205 * The main case for this is if the vcore was preempted or if the vcore it was
 206 * running on needed to stop.  You are given a uthread that looks like it took a
 207 * notif, and had its context/silly state copied out to the uthread struct.
 208 * (copyout_uthread).  Note that this will be called in the context (TLS) of the
 209 * vcore that is losing the uthread.  If that vcore is running, it'll be in a
 210 * preempt-event handling loop (not in your 2LS code).  If this is a big
 211 * problem, I'll change it. */
 212static void pth_thread_paused(struct uthread *uthread)
 213{
 214        struct pthread_tcb *pthread = (struct pthread_tcb*)uthread;
 215
 216        __pthread_generic_yield(pthread);
 217        /* communicate to pth_thread_runnable */
 218        pthread->state = PTH_BLK_PAUSED;
 219        /* At this point, you could do something clever, like put it at the
 220         * front of the runqueue, see if it was holding a lock, do some
 221         * accounting, or whatever. */
 222        pth_thread_runnable(uthread);
 223}
 224
 225/* Restarts a uthread hanging off a syscall.  For the simple pthread case, we
 226 * just make it runnable and let the main scheduler code handle it. */
 227static void restart_thread(struct syscall *sysc)
 228{
 229        struct uthread *ut_restartee = (struct uthread*)sysc->u_data;
 230        /* uthread stuff here: */
 231        assert(ut_restartee);
 232        assert(((struct pthread_tcb*)ut_restartee)->state == PTH_BLK_SYSC);
 233        assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
 234        ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
 235        pth_thread_runnable(ut_restartee);
 236}
 237
 238/* This handler is usually run in vcore context, though I can imagine it being
 239 * called by a uthread in some other threading library. */
 240static void pth_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
 241                               void *data)
 242{
 243        struct syscall *sysc;
 244        assert(in_vcore_context());
 245        /* if we just got a bit (not a msg), it should be because the process is
 246         * still an SCP and hasn't started using the MCP ev_q yet (using the
 247         * simple ev_q and glibc's blockon) or because the bit is still set from
 248         * an old ev_q (blocking syscalls from before we could enter vcore ctx).
 249         * Either way, just return.  Note that if you screwed up the pth ev_q
  250 * and made it NO_MSG, you'll never notice (we used to assert(ev_msg)). */
 252        if (!ev_msg)
 253                return;
 254        /* It's a bug if we don't have a msg (we're handling a syscall
 255         * bit-event) */
 256        assert(ev_msg);
 257        /* Get the sysc from the message and just restart it */
 258        sysc = ev_msg->ev_arg3;
 259        assert(sysc);
 260        restart_thread(sysc);
 261}
 262
 263/* This will be called from vcore context, after the current thread has yielded
  264 * and is trying to block on sysc.  Need to put it somewhere where we can wake
  265 * it up when the sysc is done.  For now, we'll have the kernel send us an event
 266 * when the syscall is done. */
 267static void pth_thread_blockon_sysc(struct uthread *uthread, void *syscall)
 268{
 269        struct syscall *sysc = (struct syscall*)syscall;
 270        int old_flags;
 271        uint32_t vcoreid = vcore_id();
 272        struct pthread_tcb *pthread = (struct pthread_tcb*)uthread;
 273
 274        __pthread_generic_yield(pthread);
 275        pthread->state = PTH_BLK_SYSC;
 276        /* Set things up so we can wake this thread up later */
 277        sysc->u_data = uthread;
 278        /* Register our vcore's syscall ev_q to hear about this syscall. */
 279        if (!register_evq(sysc, sysc_mgmt[vcoreid].ev_q)) {
 280                /* Lost the race with the call being done.  The kernel won't
 281                 * send the event.  Just restart him. */
 282                restart_thread(sysc);
 283        }
 284        /* GIANT WARNING: do not touch the thread after this point. */
 285}
 286
 287static void pth_thread_has_blocked(struct uthread *uthread, int flags)
 288{
 289        struct pthread_tcb *pthread = (struct pthread_tcb*)uthread;
 290
 291        __pthread_generic_yield(pthread);
 292        /* Whatever we do here, we are mostly communicating to our future selves
 293         * in pth_thread_runnable(), which gets called by whoever triggered this
 294         * callback */
 295        switch (flags) {
 296        case UTH_EXT_BLK_YIELD:
 297                pthread->state = PTH_BLK_YIELDING;
 298                break;
 299        case UTH_EXT_BLK_MUTEX:
 300                pthread->state = PTH_BLK_MUTEX;
 301                break;
 302        default:
 303                pthread->state = PTH_BLK_MISC;
  304        }
 305}
 306
 307static void __signal_and_restart(struct uthread *uthread,
 308                                 int signo, int code, void *addr)
 309{
 310        uthread_prep_signal_from_fault(uthread, signo, code, addr);
 311        pth_thread_runnable(uthread);
 312}
 313
 314static void handle_div_by_zero(struct uthread *uthread, unsigned int err,
 315                               unsigned long aux)
 316{
 317        __signal_and_restart(uthread, SIGFPE, FPE_INTDIV, (void*)aux);
 318}
 319
  320// Checks that usys in Go passes its arguments correctly.  It only
  321// automatically checks the first 7 arguments; the printf covers the rest.
 322int go_usys_tester(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e,
 323                   uint64_t f, uint64_t g, uint64_t h, uint64_t i, uint64_t j,
 324                   uint64_t k, uint64_t l)
 325{
 326        printf("a = %lu, b = %lu, c = %lu, d = %lu, e = %lu, f = %lu, g = %lu, h = %lu, i = %lu, j = %lu, k = %lu, l = %lu\n",
 327                a, b, c, d, e, f, g, h, i, j, k, l);
 328        uint64_t ret_val = 0;
 329
 330        ret_val |= a;
 331        ret_val |= (b << 8);
 332        ret_val |= (c << 16);
 333        ret_val |= (d << 24);
 334        ret_val |= (e << 32);
 335        ret_val |= (f << 40);
 336        ret_val |= (g << 48);
 337        return ret_val;
 338}
 339
 340struct alarm_waiter *abort_syscall_at_abs_unix(uint64_t deadline)
 341{
  342        // Note that the waiter is malloc'd rather than living on the stack.
 343        struct alarm_waiter *waiter = malloc(sizeof(struct alarm_waiter));
 344
 345        init_awaiter(waiter, alarm_abort_sysc);
 346        waiter->data = current_uthread;
 347        set_awaiter_abs_unix(waiter, deadline);
 348        set_alarm(waiter);
 349        return waiter;
 350}
 351
 352bool unset_alarm_with_free(struct alarm_waiter *waiter)
 353{
 354        // we need to free the waiter we created in abort_syscall_at_abs_unix
 355        bool ret = unset_alarm(waiter);
 356
 357        free(waiter);
 358        return ret;
 359}
 360
  361// Like ros_syscall_sync(), but zeroes the error fields if there was no error.
 362void go_syscall(struct syscall *sysc)
 363{
 364        ros_syscall_sync(sysc);
 365        if (!syscall_retval_is_error(sysc->num, sysc->retval)) {
 366                sysc->err = 0;
 367                sysc->errstr[0] = 0;
 368        }
 369}
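
     /* Illustrative pairing of the helpers above (hypothetical caller code,
      * e.g. from the Go runtime; 'deadline' and 'sysc' are placeholders).  The
      * alarm aborts the syscall if it is still blocked at the deadline:
      *
      *     struct alarm_waiter *w = abort_syscall_at_abs_unix(deadline);
      *
      *     go_syscall(&sysc);            // may be aborted at the deadline
      *     unset_alarm_with_free(w);     // cancel the alarm and free the waiter
      */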
 370
 371static void set_up_go_table(void **table)
 372{
 373        table[0] = abort_syscall_at_abs_unix;
 374        table[1] = unset_alarm_with_free;
 375        table[2] = go_syscall;
 376        table[3] = go_usys_tester;
 377        table[4] = futex;
 378        table[5] = serialize_argv_envp;
 379        table[6] = free;
 380        assert(table[7] == (void*) 0xDEADBEEF);
 381}
 382
 383static void handle_gp_fault(struct uthread *uthread, unsigned int err,
 384                            unsigned long aux)
 385{
  386        // TODO: this code is x86-64 only
 387        uint64_t rax = uthread->u_ctx.tf.hw_tf.tf_rax;
 388
  389        // Go faults with a known value in the high 16 bits to ask us to set
  390        // up a function pointer table; the table's address is the low 48 bits.
 391        if (rax >> 48 == 0xDEAD) {
 392                set_up_go_table((void **)(0xFFFFFFFFFFFFUL & rax));
  393                // Jump over the faulting call instruction, which is 2 bytes.
 394                uthread->u_ctx.tf.hw_tf.tf_rip += 2;
 395                pth_thread_runnable(uthread);
 396                return;
 397        }
 398        __signal_and_restart(uthread, SIGSEGV, SEGV_ACCERR, (void*)aux);
 399}
 400
 401static void handle_page_fault(struct uthread *uthread, unsigned int err,
 402                              unsigned long aux)
 403{
 404        struct pthread_tcb *pthread = (struct pthread_tcb*)uthread;
 405        if (!(err & PF_VMR_BACKED)) {
 406                __signal_and_restart(uthread, SIGSEGV, SEGV_MAPERR, (void*)aux);
 407        } else {
 408                syscall_async(&uthread->local_sysc, SYS_populate_va, aux, 1);
 409                __block_uthread_on_async_sysc(uthread);
 410        }
 411}
 412
 413static void pth_thread_refl_hw_fault(struct uthread *uthread,
 414                                     unsigned int trap_nr,
 415                                     unsigned int err, unsigned long aux)
 416{
 417        struct pthread_tcb *pthread = (struct pthread_tcb*)uthread;
 418
 419        __pthread_generic_yield(pthread);
 420        pthread->state = PTH_BLK_SYSC;
 421
 422        switch (trap_nr) {
 423        case HW_TRAP_DIV_ZERO:
 424                handle_div_by_zero(uthread, err, aux);
 425                break;
 426        case HW_TRAP_GP_FAULT:
 427                handle_gp_fault(uthread, err, aux);
 428                break;
 429        case HW_TRAP_PAGE_FAULT:
 430                handle_page_fault(uthread, err, aux);
 431                break;
 432        default:
 433                printf("Pthread has unhandled fault: %d, err: %d, aux: %p\n",
 434                       trap_nr, err, aux);
 435                /* Note that uthread.c already copied out our ctx into the uth
 436                 * struct */
 437                print_user_context(&uthread->u_ctx);
 438                printf("Turn on printx to spew unhandled, malignant trap info\n");
 439                exit(-1);
 440        }
 441}
 442
 443static void pth_thread_refl_fault(struct uthread *uth,
 444                                  struct user_context *ctx)
 445{
 446        switch (ctx->type) {
 447        case ROS_HW_CTX:
 448                pth_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
 449                                         __arch_refl_get_err(ctx),
 450                                         __arch_refl_get_aux(ctx));
 451                break;
 452        default:
 453                assert(0);
 454        }
 455}
 456
 457static void pth_thread_exited(struct uthread *uth)
 458{
 459        struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
 460
 461        __pthread_generic_yield(pthread);
 462        /* Catch some bugs */
 463        pthread->state = PTH_EXITING;
 464        /* Destroy the pthread */
 465        uthread_cleanup(uth);
 466        /* Cleanup, mirroring pthread_create() */
 467        __pthread_free_stack(pthread);
 468        /* If we were the last pthread, we exit for the whole process.  Keep in
 469         * mind that thread0 is counted in this, so this will only happen if
 470         * that thread calls pthread_exit(). */
 471        if ((atomic_fetch_and_add(&threads_total, -1) == 1))
 472                exit(0);
 473}
 474
 475/* Careful, if someone used the pthread_need_tls() hack to turn off TLS, it will
 476 * also be turned off for these threads. */
 477static struct uthread *pth_thread_create(void *(*func)(void *), void *arg)
 478{
 479        struct pthread_tcb *pth;
 480        int ret;
 481
 482        ret = pthread_create(&pth, NULL, func, arg);
 483        return ret == 0 ? (struct uthread*)pth : NULL;
 484}
 485
 486/* Careful, that fake_uctx takes up a lot of stack space.  We could call
 487 * pthread_kill too.  Note the VMM 2LS has similar code. */
 488static void pth_got_posix_signal(int sig_nr, struct siginfo *info)
 489{
 490        struct user_context fake_uctx;
 491
 492        /* If we happen to have a current uthread, we can use that - perhaps
 493         * that's what the user wants.  If not, we'll build a fake one
 494         * representing our current call stack. */
 495        if (current_uthread) {
 496                trigger_posix_signal(sig_nr, info, get_cur_uth_ctx());
 497        } else {
 498                init_user_ctx(&fake_uctx, (uintptr_t)pth_got_posix_signal,
 499                              get_stack_pointer());
 500                trigger_posix_signal(sig_nr, info, &fake_uctx);
 501        }
 502}
 503
 504static void pth_thread_bulk_runnable(uth_sync_t *wakees)
 505{
 506        struct uthread *uth_i;
 507        struct pthread_tcb *pth_i;
 508
 509        /* Amortize the lock grabbing over all restartees */
 510        mcs_pdr_lock(&queue_lock);
 511        while ((uth_i = __uth_sync_get_next(wakees))) {
 512                pth_i = (struct pthread_tcb*)uth_i;
 513                pth_i->state = PTH_RUNNABLE;
 514                TAILQ_INSERT_TAIL(&ready_queue, pth_i, tq_next);
 515                threads_ready++;
 516        }
 517        mcs_pdr_unlock(&queue_lock);
 518        vcore_request_more(threads_ready);
 519}
 520
 521/* Akaros pthread extensions / hacks */
 522
 523/* Careful using this - glibc and gcc are likely to use TLS without you knowing
 524 * it. */
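     /* Minimal usage sketch (hypothetical app code; worker_fn is a
      * placeholder).  The flag is only consulted when a pthread is created
      * (see __pthread_create() setting want_tls), so call this before spawning
      * any threads:
      *
      *     pthread_need_tls(FALSE);
      *     pthread_create(&worker, NULL, worker_fn, NULL);  // created w/o TLS
      */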
 525void pthread_need_tls(bool need)
 526{
 527        need_tls = need;
 528}
 529
 530/* Pthread interface stuff and helpers */
 531
 532int pthread_attr_init(pthread_attr_t *a)
 533{
 534        a->stackaddr = 0;
 535        a->stacksize = PTHREAD_STACK_SIZE;
 536        a->detachstate = PTHREAD_CREATE_JOINABLE;
 537        /* priority and policy should be set by anyone changing inherit. */
 538        a->sched_priority = 0;
 539        a->sched_policy = 0;
 540        a->sched_inherit = PTHREAD_INHERIT_SCHED;
 541        return 0;
 542}
 543
 544int pthread_attr_destroy(pthread_attr_t *a)
 545{
 546        return 0;
 547}
 548
 549static void __pthread_free_stack(struct pthread_tcb *pt)
 550{
 551        int ret = munmap(pt->stacktop - pt->stacksize, pt->stacksize);
 552        assert(!ret);
 553}
 554
 555static int __pthread_allocate_stack(struct pthread_tcb *pt)
 556{
 557        int force_a_page_fault;
 558        assert(pt->stacksize);
 559        void* stackbot = mmap(0, pt->stacksize,
 560                              PROT_READ | PROT_WRITE | PROT_EXEC,
 561                              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 562        if (stackbot == MAP_FAILED)
 563                return -1; // errno set by mmap
 564        pt->stacktop = stackbot + pt->stacksize;
 565        /* Want the top of the stack populated, but not the rest of the stack;
 566         * that'll grow on demand (up to pt->stacksize) */
 567        force_a_page_fault = ACCESS_ONCE(*(int*)(pt->stacktop - sizeof(int)));
 568        return 0;
 569}
 570
 571// Warning, this will reuse numbers eventually
 572static int get_next_pid(void)
 573{
 574        static uint32_t next_pid = 0;
 575        return next_pid++;
 576}
 577
 578int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize)
 579{
 580        attr->stacksize = stacksize;
 581        return 0;
 582}
 583
 584int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize)
 585{
 586        *stacksize = attr->stacksize;
 587        return 0;
 588}
 589
 590int pthread_attr_setguardsize(pthread_attr_t *attr, size_t guardsize)
 591{
 592        attr->guardsize = guardsize;
 593        return 0;
 594}
 595
 596int pthread_attr_getguardsize(pthread_attr_t *attr, size_t *guardsize)
 597{
 598        *guardsize = attr->guardsize;
 599        return 0;
 600}
 601
 602int pthread_attr_getstack(const pthread_attr_t *__restrict __attr,
 603                          void **__stackaddr, size_t *__stacksize)
 604{
 605        *__stackaddr = __attr->stackaddr;
 606        *__stacksize = __attr->stacksize;
 607        return 0;
 608}
 609
 610int pthread_getattr_np(pthread_t __th, pthread_attr_t *__attr)
 611{
 612        struct uthread *uth = (struct uthread*)__th;
 613
 614        __attr->stackaddr = __th->stacktop - __th->stacksize;
 615        __attr->stacksize = __th->stacksize;
 616        if (atomic_read(&uth->join_ctl.state) == UTH_JOIN_DETACHED)
 617                __attr->detachstate = PTHREAD_CREATE_DETACHED;
 618        else
 619                __attr->detachstate = PTHREAD_CREATE_JOINABLE;
 620        return 0;
 621}
 622
 623/* All multi-threading is suspended during a fork.  Thread0 will continue to
 624 * run, which could come up if SYS_fork blocks or we get interrupted.  Parents
 625 * will continue threading after the fork, like normal.  Old threads in the
 626 * child will never run again.  New threads in the child will run. */
 627static void pth_pre_fork(void)
 628{
 629        struct pthread_tcb *pth_0 = (struct pthread_tcb*)current_uthread;
 630
 631        if (!uthread_is_thread0(current_uthread))
 632                panic("Tried to fork from a non-thread0 thread!");
 633        if (in_multi_mode())
 634                panic("Tried to fork from an MCP!");
 635        pth_0->fork_generation = fork_generation + 1;
 636        /* in case we get interrupted after incrementing the global gen */
 637        cmb();
 638        /* We're single-core and thread0 here, so we can modify fork_generation
 639         */
 640        fork_generation++;
 641        /* At this point, whether we come back as the child or the parent, no
 642         * old thread (from the previous generation) will run. */
 643}
 644
 645static void pth_post_fork(pid_t ret)
 646{
 647        struct pthread_tcb *pth_0 = (struct pthread_tcb*)current_uthread;
 648
 649        if (ret) {
 650                fork_generation--;
 651                pth_0->fork_generation = fork_generation;
 652        }
 653}
 654
 655/* Do whatever init you want.  At some point call uthread_2ls_init() and pass it
 656 * a uthread representing thread0 (int main()) */
 657void pth_sched_init(void)
 658{
 659        uintptr_t mmap_block;
 660        struct pthread_tcb *t;
 661        int ret;
 662
 663        mcs_pdr_init(&queue_lock);
 664        fork_generation = INIT_FORK_GENERATION;
 665        /* Create a pthread_tcb for the main thread */
 666        ret = posix_memalign((void**)&t, __alignof__(struct pthread_tcb),
 667                             sizeof(struct pthread_tcb));
 668        assert(!ret);
 669        /* aggressively 0 for bugs */
 670        memset(t, 0, sizeof(struct pthread_tcb));
 671        t->id = get_next_pid();
 672        t->fork_generation = fork_generation;
 673        t->stacksize = USTACK_NUM_PAGES * PGSIZE;
 674        t->stacktop = (void*)USTACKTOP;
 675        t->state = PTH_RUNNING;
 676        /* implies that sigmasks are longs, which they are. */
 677        assert(t->id == 0);
 678        SLIST_INIT(&t->cr_stack);
 679        /* Put the new pthread (thread0) on the active queue */
 680        mcs_pdr_lock(&queue_lock);
 681        threads_active++;
 682        TAILQ_INSERT_TAIL(&active_queue, t, tq_next);
 683        mcs_pdr_unlock(&queue_lock);
 684        /* Tell the kernel where and how we want to receive events.  This is
 685         * just an example of what to do to have a notification turned on.
 686         * We're turning on USER_IPIs, posting events to vcore 0's vcpd, and
 687         * telling the kernel to send to vcore 0.  Note sys_self_notify will
 688         * ignore the vcoreid and private preference.  Also note that
 689         * enable_kevent() is just an example, and you probably want to use
 690         * parts of event.c to do what you want. */
 691        enable_kevent(EV_USER_IPI, 0, EVENT_IPI | EVENT_VCORE_PRIVATE);
 692        /* Set up the per-vcore structs to track outstanding syscalls */
 693        sysc_mgmt = malloc(sizeof(struct sysc_mgmt) * max_vcores());
 694        assert(sysc_mgmt);
 695#if 1   /* Independent ev_mboxes per vcore */
 696        /* Get a block of pages for our per-vcore (but non-VCPD) ev_qs */
 697        mmap_block = (uintptr_t)mmap(0, PGSIZE * 2 * max_vcores(),
 698                                     PROT_WRITE | PROT_READ,
 699                                     MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE,
 700                                     -1, 0);
  701        assert(mmap_block != (uintptr_t)MAP_FAILED);
 702        /* Could be smarter and do this on demand (in case we don't actually
 703         * want max_vcores()). */
 704        for (int i = 0; i < max_vcores(); i++) {
 705                /* Each vcore needs to point to a non-VCPD ev_q */
 706                sysc_mgmt[i].ev_q = get_eventq_raw();
 707                sysc_mgmt[i].ev_q->ev_flags = EVENT_IPI | EVENT_INDIR |
 708                                              EVENT_SPAM_INDIR | EVENT_WAKEUP;
 709                sysc_mgmt[i].ev_q->ev_vcore = i;
 710                sysc_mgmt[i].ev_q->ev_mbox->type = EV_MBOX_UCQ;
 711                ucq_init_raw(&sysc_mgmt[i].ev_q->ev_mbox->ucq,
  712                             mmap_block + (2 * i    ) * PGSIZE,
  713                             mmap_block + (2 * i + 1) * PGSIZE);
 714        }
 715        /* Technically, we should munmap and free what we've alloc'd, but the
 716         * kernel will clean it up for us when we exit. */
  717#endif
 718#if 0   /* One global ev_mbox, separate ev_q per vcore */
 719        struct event_mbox *sysc_mbox = malloc(sizeof(struct event_mbox));
 720        uintptr_t two_pages = (uintptr_t)mmap(0, PGSIZE * 2, PROT_WRITE |
 721                                              PROT_READ, MAP_POPULATE |
 722                                              MAP_ANONYMOUS | MAP_PRIVATE, -1,
 723                                              0);
 724        printd("Global ucq: %08p\n", &sysc_mbox->ev_msgs);
 725        assert(sysc_mbox);
 726        assert(two_pages);
 727        memset(sysc_mbox, 0, sizeof(struct event_mbox));
 728        sysc_mbox->type = EV_MBOX_UCQ;
 729        ucq_init_raw(&sysc_mbox->ucq, two_pages, two_pages + PGSIZE);
 730        for (int i = 0; i < max_vcores(); i++) {
 731                sysc_mgmt[i].ev_q = get_eventq_slim();
 732                sysc_mgmt[i].ev_q->ev_flags = EVENT_IPI | EVENT_INDIR |
 733                                              EVENT_SPAM_INDIR | EVENT_WAKEUP;
 734                sysc_mgmt[i].ev_q->ev_vcore = i;
 735                sysc_mgmt[i].ev_q->ev_mbox = sysc_mbox;
 736        }
 737#endif
 738        uthread_2ls_init((struct uthread*)t, pth_handle_syscall, NULL);
 739        atomic_init(&threads_total, 1);                 /* one for thread0 */
 740        pre_fork_2ls = pth_pre_fork;
 741        post_fork_2ls = pth_post_fork;
 742}
 743
 744/* Make sure our scheduler runs inside an MCP rather than an SCP. */
 745void pthread_mcp_init()
 746{
 747        /* Prevent this from happening more than once. */
 748        parlib_init_once_racy(return);
 749
 750        uthread_mcp_init();
 751        /* From here forward we are an MCP running on vcore 0. Could consider
 752         * doing other pthread specific initialization based on knowing we are
 753         * an mcp after this point. */
 754}
 755
 756int __pthread_create(pthread_t *thread, const pthread_attr_t *attr,
 757                     void *(*start_routine)(void *), void *arg)
 758{
 759        struct uth_thread_attr uth_attr = {0};
 760        struct pthread_tcb *parent;
 761        struct pthread_tcb *pthread;
 762        int ret;
 763
 764        /* For now, unconditionally become an mcp when creating a pthread (if
 765         * not one already). This may change in the future once we support 2LSs
 766         * in an SCP. */
 767        pthread_mcp_init();
 768
 769        parent = (struct pthread_tcb*)current_uthread;
 770        ret = posix_memalign((void**)&pthread, __alignof__(struct pthread_tcb),
 771                             sizeof(struct pthread_tcb));
 772        assert(!ret);
 773        /* aggressively 0 for bugs*/
 774        memset(pthread, 0, sizeof(struct pthread_tcb));
 775        pthread->stacksize = PTHREAD_STACK_SIZE;        /* default */
 776        pthread->state = PTH_CREATED;
 777        pthread->id = get_next_pid();
 778        pthread->fork_generation = fork_generation;
 779        SLIST_INIT(&pthread->cr_stack);
 780        /* Respect the attributes */
 781        if (attr) {
 782                if (attr->stacksize)    /* don't set a 0 stacksize */
 783                        pthread->stacksize = attr->stacksize;
 784                if (attr->detachstate == PTHREAD_CREATE_DETACHED)
 785                        uth_attr.detached = TRUE;
 786        }
 787        /* allocate a stack */
 788        if (__pthread_allocate_stack(pthread))
 789                printf("We're fucked\n");
 790        /* Set the u_tf to start up in __pthread_run, which will call the real
 791         * start_routine and pass it the arg.  Note those aren't set until later
 792         * in pthread_create(). */
 793        init_user_ctx(&pthread->uthread.u_ctx, (uintptr_t)&__pthread_run,
 794                      (uintptr_t)(pthread->stacktop));
 795        pthread->start_routine = start_routine;
 796        pthread->arg = arg;
 797        /* Initialize the uthread */
 798        if (need_tls)
 799                uth_attr.want_tls = TRUE;
 800        uthread_init((struct uthread*)pthread, &uth_attr);
 801        *thread = pthread;
 802        atomic_inc(&threads_total);
 803        return 0;
 804}
 805
 806int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
 807                   void *(*start_routine)(void *), void *arg)
 808{
 809        if (!__pthread_create(thread, attr, start_routine, arg))
 810                pth_thread_runnable((struct uthread*)*thread);
 811        return 0;
 812}
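
     /* Example of the attribute plumbing (hypothetical caller code; worker_fn
      * is a placeholder): create a detached thread with a larger stack.  Only
      * stacksize and detachstate are honored by __pthread_create(); the sched_*
      * attrs further below are accepted but ignored by this 2LS.
      *
      *     pthread_attr_t attr;
      *     pthread_t th;
      *
      *     pthread_attr_init(&attr);
      *     pthread_attr_setstacksize(&attr, 4 * PTHREAD_STACK_SIZE);
      *     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
      *     pthread_create(&th, &attr, worker_fn, NULL);
      *     pthread_attr_destroy(&attr);
      */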
 813
 814/* Helper that all pthread-controlled yield paths call.  Just does some
 815 * accounting.  This is another example of how the much-loathed (and loved)
 816 * active queue is keeping us honest.  Need to export for sem and friends. */
 817void __pthread_generic_yield(struct pthread_tcb *pthread)
 818{
 819        mcs_pdr_lock(&queue_lock);
 820        threads_active--;
 821        TAILQ_REMOVE(&active_queue, pthread, tq_next);
 822        mcs_pdr_unlock(&queue_lock);
 823}
 824
 825int pthread_join(struct pthread_tcb *join_target, void **retval)
 826{
 827        uthread_join((struct uthread*)join_target, retval);
 828        return 0;
 829}
 830
 831static inline void pthread_exit_no_cleanup(void *ret)
 832{
 833        struct pthread_tcb *pthread = pthread_self();
 834
 835        while (SLIST_FIRST(&pthread->cr_stack))
 836                pthread_cleanup_pop(FALSE);
 837        destroy_dtls();
 838        uth_2ls_thread_exit(ret);
 839}
 840
 841void pthread_exit(void *ret)
 842{
 843        struct pthread_tcb *pthread = pthread_self();
 844        while (SLIST_FIRST(&pthread->cr_stack))
 845                pthread_cleanup_pop(TRUE);
 846        pthread_exit_no_cleanup(ret);
 847}
 848
 849/* Cooperative yielding of the processor, to allow other threads to run */
 850int pthread_yield(void)
 851{
 852        uthread_sched_yield();
 853        return 0;
 854}
 855
 856int pthread_cancel(pthread_t __th)
 857{
 858        fprintf(stderr, "Unsupported %s!", __FUNCTION__);
 859        abort();
 860        return -1;
 861}
 862
 863void pthread_cleanup_push(void (*routine)(void *), void *arg)
 864{
 865        struct pthread_tcb *p = pthread_self();
 866        struct pthread_cleanup_routine *r = malloc(sizeof(*r));
 867
 868        r->routine = routine;
 869        r->arg = arg;
 870        SLIST_INSERT_HEAD(&p->cr_stack, r, cr_next);
 871}
 872
 873void pthread_cleanup_pop(int execute)
 874{
 875        struct pthread_tcb *p = pthread_self();
 876        struct pthread_cleanup_routine *r = SLIST_FIRST(&p->cr_stack);
 877
 878        if (r) {
 879                SLIST_REMOVE_HEAD(&p->cr_stack, cr_next);
 880                if (execute)
 881                        r->routine(r->arg);
 882                free(r);
 883        }
 884}
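
     /* Usage sketch (hypothetical caller; do_work() is a placeholder).  Unlike
      * glibc, push/pop are plain functions here rather than block-scoped
      * macros, but they should still be paired.  pthread_exit() runs any
      * handlers still on the stack.
      *
      *     void *buf = malloc(4096);
      *
      *     pthread_cleanup_push(free, buf);
      *     do_work(buf);                   // may call pthread_exit()
      *     pthread_cleanup_pop(TRUE);      // pops and runs free(buf)
      */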
 885
 886int pthread_mutexattr_init(pthread_mutexattr_t *attr)
 887{
 888        attr->type = PTHREAD_MUTEX_DEFAULT;
 889        return 0;
 890}
 891
 892int pthread_mutexattr_destroy(pthread_mutexattr_t *attr)
 893{
 894        return 0;
 895}
 896
 897int pthread_attr_setdetachstate(pthread_attr_t *__attr, int __detachstate)
 898{
 899        __attr->detachstate = __detachstate;
 900        return 0;
 901}
 902
 903int pthread_mutexattr_gettype(const pthread_mutexattr_t *attr, int *type)
 904{
 905        *type = attr ? attr->type : PTHREAD_MUTEX_DEFAULT;
 906        return 0;
 907}
 908
 909static bool __pthread_mutex_type_ok(int type)
 910{
 911        switch (type) {
 912        case PTHREAD_MUTEX_NORMAL:
 913        case PTHREAD_MUTEX_RECURSIVE:
 914                return TRUE;
 915        }
 916        return FALSE;
 917}
 918
 919int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type)
 920{
 921        if (!__pthread_mutex_type_ok(type))
 922                return EINVAL;
 923        attr->type = type;
 924        return 0;
 925}
 926
 927int pthread_mutex_init(pthread_mutex_t *m, const pthread_mutexattr_t *attr)
 928{
 929        if (attr) {
 930                if (!__pthread_mutex_type_ok(attr->type))
 931                        return EINVAL;
 932                m->type = attr->type;
 933        } else {
 934                m->type = PTHREAD_MUTEX_NORMAL;
 935        }
 936        switch (m->type) {
 937        case PTHREAD_MUTEX_NORMAL:
 938                uth_mutex_init(&m->mtx);
 939                break;
 940        case PTHREAD_MUTEX_RECURSIVE:
 941                uth_recurse_mutex_init(&m->r_mtx);
 942                break;
 943        }
 944        return 0;
 945}
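
     /* Usage sketch (hypothetical caller): only NORMAL and RECURSIVE types are
      * supported, so a recursive mutex is requested through the attr:
      *
      *     pthread_mutexattr_t mattr;
      *     pthread_mutex_t m;
      *
      *     pthread_mutexattr_init(&mattr);
      *     pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
      *     pthread_mutex_init(&m, &mattr);
      *     pthread_mutexattr_destroy(&mattr);
      */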
 946
 947int pthread_mutex_lock(pthread_mutex_t *m)
 948{
 949        switch (m->type) {
 950        case PTHREAD_MUTEX_NORMAL:
 951                uth_mutex_lock(&m->mtx);
 952                break;
 953        case PTHREAD_MUTEX_RECURSIVE:
 954                uth_recurse_mutex_lock(&m->r_mtx);
 955                break;
 956        default:
 957                panic("Bad pth mutex type %d!", m->type);
 958        }
 959        return 0;
 960}
 961
 962int pthread_mutex_trylock(pthread_mutex_t *m)
 963{
 964        bool got_it;
 965
 966        switch (m->type) {
 967        case PTHREAD_MUTEX_NORMAL:
 968                got_it = uth_mutex_trylock(&m->mtx);
 969                break;
 970        case PTHREAD_MUTEX_RECURSIVE:
 971                got_it = uth_recurse_mutex_trylock(&m->r_mtx);
 972                break;
 973        default:
 974                panic("Bad pth mutex type %d!", m->type);
 975        }
 976        return got_it ? 0 : EBUSY;
 977}
 978
 979int pthread_mutex_unlock(pthread_mutex_t *m)
 980{
 981        switch (m->type) {
 982        case PTHREAD_MUTEX_NORMAL:
 983                uth_mutex_unlock(&m->mtx);
 984                break;
 985        case PTHREAD_MUTEX_RECURSIVE:
 986                uth_recurse_mutex_unlock(&m->r_mtx);
 987                break;
 988        default:
 989                panic("Bad pth mutex type %d!", m->type);
 990        }
 991        return 0;
 992}
 993
 994int pthread_mutex_destroy(pthread_mutex_t *m)
 995{
 996        switch (m->type) {
 997        case PTHREAD_MUTEX_NORMAL:
 998                uth_mutex_destroy(&m->mtx);
 999                break;
1000        case PTHREAD_MUTEX_RECURSIVE:
1001                uth_recurse_mutex_destroy(&m->r_mtx);
1002                break;
1003        default:
1004                panic("Bad pth mutex type %d!", m->type);
1005        }
1006        return 0;
1007}
1008
1009int pthread_mutex_timedlock(pthread_mutex_t *m, const struct timespec *abstime)
1010{
1011        bool got_it;
1012
1013        switch (m->type) {
1014        case PTHREAD_MUTEX_NORMAL:
1015                got_it = uth_mutex_timed_lock(&m->mtx, abstime);
1016                break;
1017        case PTHREAD_MUTEX_RECURSIVE:
1018                got_it = uth_recurse_mutex_timed_lock(&m->r_mtx, abstime);
1019                break;
1020        default:
1021                panic("Bad pth mutex type %d!", m->type);
1022        }
1023        return got_it ? 0 : ETIMEDOUT;
1024}
1025
1026int pthread_cond_init(pthread_cond_t *c, const pthread_condattr_t *a)
1027{
1028        if (a) {
1029                if (a->pshared != PTHREAD_PROCESS_PRIVATE)
1030                        fprintf(stderr,
1031                                "pthreads only supports private condvars");
1032                /* We also ignore clock_id */
1033        }
1034        uth_cond_var_init(c);
1035        return 0;
1036}
1037
1038int pthread_cond_destroy(pthread_cond_t *c)
1039{
1040        uth_cond_var_destroy(c);
1041        return 0;
1042}
1043
1044int pthread_cond_broadcast(pthread_cond_t *c)
1045{
1046        uth_cond_var_broadcast(c);
1047        return 0;
1048}
1049
1050/* spec says this needs to work regardless of whether or not it holds the mutex
1051 * already. */
1052int pthread_cond_signal(pthread_cond_t *c)
1053{
1054        uth_cond_var_signal(c);
1055        return 0;
1056}
1057
1058int pthread_cond_wait(pthread_cond_t *c, pthread_mutex_t *m)
1059{
1060        switch (m->type) {
1061        case PTHREAD_MUTEX_NORMAL:
1062                uth_cond_var_wait(c, &m->mtx);
1063                break;
1064        case PTHREAD_MUTEX_RECURSIVE:
1065                uth_cond_var_wait_recurse(c, &m->r_mtx);
1066                break;
1067        default:
1068                panic("Bad pth mutex type %d!", m->type);
1069        }
1070        return 0;
1071}
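
     /* The usual condvar pattern works unchanged on top of the uth_cond_var
      * wrappers (hypothetical caller code; 'ready', 'c', and 'm' are
      * placeholders):
      *
      *     pthread_mutex_lock(&m);
      *     while (!ready)
      *             pthread_cond_wait(&c, &m);
      *     pthread_mutex_unlock(&m);
      *
      * and on the signalling side:
      *
      *     pthread_mutex_lock(&m);
      *     ready = TRUE;
      *     pthread_cond_signal(&c);        // or pthread_cond_broadcast()
      *     pthread_mutex_unlock(&m);
      */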
1072
1073int pthread_cond_timedwait(pthread_cond_t *c, pthread_mutex_t *m,
1074                           const struct timespec *abstime)
1075{
1076        bool got_it;
1077
1078        switch (m->type) {
1079        case PTHREAD_MUTEX_NORMAL:
1080                got_it = uth_cond_var_timed_wait(c, &m->mtx, abstime);
1081                break;
1082        case PTHREAD_MUTEX_RECURSIVE:
1083                got_it = uth_cond_var_timed_wait_recurse(c, &m->r_mtx, abstime);
1084                break;
1085        default:
1086                panic("Bad pth mutex type %d!", m->type);
1087        }
1088        return got_it ? 0 : ETIMEDOUT;
1089}
1090
1091int pthread_condattr_init(pthread_condattr_t *a)
1092{
1093        a->pshared = PTHREAD_PROCESS_PRIVATE;
1094        a->clock = 0;
1095        return 0;
1096}
1097
1098int pthread_condattr_destroy(pthread_condattr_t *a)
1099{
1100        return 0;
1101}
1102
1103int pthread_condattr_getpshared(pthread_condattr_t *a, int *s)
1104{
1105        *s = a->pshared;
1106        return 0;
1107}
1108
1109int pthread_condattr_setpshared(pthread_condattr_t *a, int s)
1110{
1111        a->pshared = s;
1112        if (s == PTHREAD_PROCESS_SHARED) {
1113                printf("Warning: we don't do shared pthread condvars btw diff MCPs\n");
1114                return -1;
1115        }
1116        return 0;
1117}
1118
1119int pthread_condattr_getclock(const pthread_condattr_t *attr,
1120                              clockid_t *clock_id)
1121{
1122        *clock_id = attr->clock;
1123        return 0;
1124}
1125
1126int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock_id)
1127{
1128        printf("Warning: we don't do pthread condvar clock stuff\n");
1129        attr->clock = clock_id;
1130        return 0;
1131}
1132
1133int pthread_rwlock_init(pthread_rwlock_t *rwl, const pthread_rwlockattr_t *a)
1134{
1135        uth_rwlock_init(rwl);
1136        return 0;
1137}
1138
1139int pthread_rwlock_destroy(pthread_rwlock_t *rwl)
1140{
1141        uth_rwlock_destroy(rwl);
1142        return 0;
1143}
1144
1145int pthread_rwlock_rdlock(pthread_rwlock_t *rwl)
1146{
1147        uth_rwlock_rdlock(rwl);
1148        return 0;
1149}
1150
1151int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwl)
1152{
1153        return uth_rwlock_try_rdlock(rwl) ? 0 : EBUSY;
1154}
1155
1156int pthread_rwlock_wrlock(pthread_rwlock_t *rwl)
1157{
1158        uth_rwlock_wrlock(rwl);
1159        return 0;
1160}
1161
1162int pthread_rwlock_trywrlock(pthread_rwlock_t *rwl)
1163{
1164        return uth_rwlock_try_wrlock(rwl) ? 0 : EBUSY;
1165}
1166
1167int pthread_rwlock_unlock(pthread_rwlock_t *rwl)
1168{
1169        uth_rwlock_unlock(rwl);
1170        return 0;
1171}
1172
1173pthread_t pthread_self(void)
1174{
1175        return (struct pthread_tcb*)uthread_self();
1176}
1177
1178int pthread_equal(pthread_t t1, pthread_t t2)
1179{
1180        return t1 == t2;
1181}
1182
1183int pthread_once(pthread_once_t *once_control, void (*init_routine)(void))
1184{
 1185        /* pthread_once's init routine doesn't take an argument, unlike
 1186         * parlib's.  This means the func will be run with an argument passed
 1187         * to it, but that argument will be ignored. */
1188        parlib_run_once(once_control, (void (*)(void *))init_routine, NULL);
1189        /* The return for pthread_once isn't an error from the function, it's
1190         * just an overall error.  Note pthread's init_routine() has no return
1191         * value. */
1192        return 0;
1193}
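
     /* Usage sketch (hypothetical caller), assuming the usual PTHREAD_ONCE_INIT
      * initializer from pthread.h:
      *
      *     static pthread_once_t init_once = PTHREAD_ONCE_INIT;
      *     static void do_init(void) { ... }
      *
      *     pthread_once(&init_once, do_init);
      *
      * do_init() runs exactly once, no matter how many threads race into
      * pthread_once(). */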
1194
1195int pthread_barrier_init(pthread_barrier_t *b,
1196                         const pthread_barrierattr_t *a, int count)
1197{
1198        b->total_threads = count;
1199        b->sense = 0;
1200        atomic_set(&b->count, count);
1201        spin_pdr_init(&b->lock);
1202        __uth_sync_init(&b->waiters);
1203        b->nr_waiters = 0;
1204        return 0;
1205}
1206
1207struct barrier_junk {
1208        pthread_barrier_t               *b;
1209        int                             ls;
1210};
1211
1212/* Helper for spinning sync, returns TRUE if it is okay to keep spinning.
1213 *
1214 * Alternatives include:
1215 *      old_count <= num_vcores() (barrier code, pass in old_count as *state,
1216 *                                 but this only works if every awake pthread
1217 *                                 will belong to the barrier).
1218 *      just spin for a bit       (use *state to track spins)
1219 *      FALSE                     (always is safe)
1220 *      etc...
1221 * 'threads_ready' isn't too great since sometimes it'll be non-zero when it is
1222 * about to become 0.  We really want "I have no threads waiting to run that
 1223 * aren't going to run on their own unless this core yields instead of spins". */
1224/* TODO: consider making this a 2LS op */
1225static inline bool safe_to_spin(unsigned int *state)
1226{
1227        return (*state)++ % PTHREAD_BARRIER_SPINS;
1228}
1229
1230/* Callback/bottom half of barrier. */
1231static void __pth_barrier_cb(struct uthread *uthread, void *junk)
1232{
1233        pthread_barrier_t *b = ((struct barrier_junk*)junk)->b;
1234        int ls = ((struct barrier_junk*)junk)->ls;
1235
1236        uthread_has_blocked(uthread, UTH_EXT_BLK_MUTEX);
1237        /* TODO: if we used a trylock, we could bail as soon as we see sense */
1238        spin_pdr_lock(&b->lock);
1239        /* If sense is ls (our free value), we lost the race and shouldn't sleep
1240         */
1241        if (b->sense == ls) {
1242                spin_pdr_unlock(&b->lock);
1243                uthread_runnable(uthread);
1244                return;
1245        }
1246        /* otherwise, we sleep */
1247        __uth_sync_enqueue(uthread, &b->waiters);
1248        b->nr_waiters++;
1249        spin_pdr_unlock(&b->lock);
1250}
1251
1252/* We assume that the same threads participating in the barrier this time will
1253 * also participate next time.  Imagine a thread stopped right after its fetch
1254 * and add - we know it is coming through eventually.  We finish and change the
1255 * sense, which should allow the delayed thread to eventually break through.
1256 * But if another n threads come in first, we'll set the sense back to the old
 1257 * value, thereby catching the delayed thread until the next barrier.
1258 *
1259 * A note on preemption: if any thread gets preempted and it is never dealt
1260 * with, eventually we deadlock, with all threads waiting on the last one to
1261 * enter (and any stragglers from one run will be the last in the next run).
1262 * One way or another, we need to handle preemptions.  The current 2LS requests
1263 * an IPI for a preempt, so we'll be fine.  Any other strategies will need to
1264 * consider how barriers work.  Any time we sleep, we'll be okay (since that
 1265 * frees up our core to handle preemptions/run other threads). */
1266int pthread_barrier_wait(pthread_barrier_t *b)
1267{
1268        unsigned int spin_state = 0;
 1269        /* ls is our "free" value; we're free once b->sense flips to it */
1270        int ls = !b->sense;
1271        uth_sync_t restartees;
1272        struct uthread *uth_i;
1273        struct barrier_junk local_junk;
 1274
1275        long old_count = atomic_fetch_and_add(&b->count, -1);
1276
1277        if (old_count == 1) {
1278                /* TODO: we might want to grab the lock right away, so a few
 1279                 * threads can short-circuit faster? */
1280                atomic_set(&b->count, b->total_threads);
1281                /* we still need to maintain ordering btw count and sense, in
1282                 * case another thread doesn't sleep (if we wrote sense first,
1283                 * they could break out, race around, and muck with count before
1284                 * it is time) */
1285                /* wmb(); handled by the spin lock */
1286                spin_pdr_lock(&b->lock);
1287                /* Sense is only protected in addition to decisions to sleep */
1288                b->sense = ls;  /* set to free everyone */
1289                /* All access to nr_waiters is protected by the lock */
1290                if (!b->nr_waiters) {
1291                        spin_pdr_unlock(&b->lock);
1292                        return PTHREAD_BARRIER_SERIAL_THREAD;
1293                }
1294                __uth_sync_init(&restartees);
1295                __uth_sync_swap(&restartees, &b->waiters);
1296                b->nr_waiters = 0;
1297                spin_pdr_unlock(&b->lock);
1298                __uth_sync_wake_all(&restartees);
1299                return PTHREAD_BARRIER_SERIAL_THREAD;
1300        } else {
1301                /* Spin if there are no other threads to run.  No sense sleeping
1302                 */
1303                do {
1304                        if (b->sense == ls)
1305                                return 0;
1306                        cpu_relax();
1307                } while (safe_to_spin(&spin_state));
1308
1309                /* Try to sleep, when we wake/return, we're free to go */
1310                local_junk.b = b;
1311                local_junk.ls = ls;
1312                uthread_yield(TRUE, __pth_barrier_cb, &local_junk);
1313                // assert(b->sense == ls);
1314                return 0;
1315        }
1316}
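
     /* Usage sketch (hypothetical worker code): nr_workers threads rendezvous,
      * and exactly one of them sees PTHREAD_BARRIER_SERIAL_THREAD per round:
      *
      *     pthread_barrier_init(&barrier, NULL, nr_workers);
      *
      *     // in each of the nr_workers threads, per iteration:
      *     if (pthread_barrier_wait(&barrier) == PTHREAD_BARRIER_SERIAL_THREAD)
      *             do_once_per_round();
      */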
1317
1318int pthread_barrier_destroy(pthread_barrier_t *b)
1319{
1320        assert(!b->nr_waiters);
1321        __uth_sync_destroy(&b->waiters);
1322        /* Free any locks (if we end up using an MCS) */
1323        return 0;
1324}
1325
1326int pthread_detach(pthread_t thread)
1327{
1328        uthread_detach((struct uthread*)thread);
1329        return 0;
1330}
1331
1332int pthread_kill(pthread_t thread, int signo)
1333{
1334        return uthread_signal(&thread->uthread, signo);
1335}
1336
1337int pthread_sigmask(int how, const sigset_t *set, sigset_t *oset)
1338{
1339        int ret = sigprocmask(how, set, oset);
1340
1341        /* Ensures any pending signals we just unmasked get processed. */
1342        if (set && ret == 0)
1343                pthread_yield();
1344        return ret;
1345}
1346
1347int pthread_sigqueue(pthread_t *thread, int sig, const union sigval value)
1348{
1349        printf("pthread_sigqueue is not yet implemented!");
1350        return -1;
1351}
1352
1353int pthread_key_create(pthread_key_t *key, void (*destructor)(void*))
1354{
1355        *key = dtls_key_create(destructor);
1356        assert(key);
1357        return 0;
1358}
1359
1360int pthread_key_delete(pthread_key_t key)
1361{
1362        dtls_key_delete(key);
1363        return 0;
1364}
1365
1366void *pthread_getspecific(pthread_key_t key)
1367{
1368        return get_dtls(key);
1369}
1370
1371int pthread_setspecific(pthread_key_t key, const void *value)
1372{
1373        set_dtls(key, (void*)value);
1374        return 0;
1375}
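
     /* These are thin wrappers around parlib's dynamic TLS (dtls).  Usage
      * sketch (hypothetical caller; buf_key is a placeholder).  The destructor
      * runs when the thread tears down its dynamic TLS (destroy_dtls() in
      * pthread_exit_no_cleanup()):
      *
      *     static pthread_key_t buf_key;
      *
      *     pthread_key_create(&buf_key, free);
      *     pthread_setspecific(buf_key, malloc(4096));
      *     void *buf = pthread_getspecific(buf_key);
      */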
1376
1377
1378/* Scheduling Stuff.  Actually, these don't tell the 2LS anything - they just
1379 * pretend to muck with attrs and params, as expected by pthreads apps. */
1380
1381int pthread_attr_setschedparam(pthread_attr_t *attr,
1382                               const struct sched_param *param)
1383{
1384        /* The set of acceptable priorities are based on the scheduling policy.
1385         * We'll just accept any old number, since we might not know the policy
1386         * yet.  I didn't see anything in the man pages saying attr had to have
1387         * a policy set before setting priority. */
1388        attr->sched_priority = param->sched_priority;
1389        return 0;
1390}
1391
1392int pthread_attr_getschedparam(pthread_attr_t *attr,
1393                               struct sched_param *param)
1394{
1395        param->sched_priority = attr->sched_priority;
1396        return 0;
1397}
1398
1399int pthread_attr_setschedpolicy(pthread_attr_t *attr, int policy)
1400{
1401        attr->sched_policy = policy;
1402        return 0;
1403}
1404
1405int pthread_attr_getschedpolicy(pthread_attr_t *attr, int *policy)
1406{
1407        *policy = attr->sched_policy;
1408        return 0;
1409}
1410
1411/* We only support SCOPE_PROCESS, so we don't even use the attr. */
1412int pthread_attr_setscope(pthread_attr_t *attr, int scope)
1413{
1414        if (scope != PTHREAD_SCOPE_PROCESS)
1415                return -ENOTSUP;
1416        return 0;
1417}
1418
1419int pthread_attr_getscope(pthread_attr_t *attr, int *scope)
1420{
1421        *scope = PTHREAD_SCOPE_PROCESS;
1422        return 0;
1423}
1424
1425/* Inheritance refers to policy, priority, scope */
1426int pthread_attr_setinheritsched(pthread_attr_t *attr,
1427                                 int inheritsched)
1428{
1429        switch (inheritsched) {
1430                case PTHREAD_INHERIT_SCHED:
1431                case PTHREAD_EXPLICIT_SCHED:
1432                        break;
1433                default:
1434                        return -EINVAL;
1435        }
1436        attr->sched_inherit = inheritsched;
1437        return 0;
1438}
1439
1440int pthread_attr_getinheritsched(const pthread_attr_t *attr,
1441                                 int *inheritsched)
1442{
1443        *inheritsched = attr->sched_inherit;
1444        return 0;
1445}
1446
1447int pthread_setschedparam(pthread_t thread, int policy,
1448                           const struct sched_param *param)
1449{
1450        return 0;
1451}
1452
1453int pthread_getschedparam(pthread_t thread, int *policy,
1454                           struct sched_param *param)
1455{
1456        /* Faking {FIFO, 0}.  It's up to the 2LS to do whatever it wants. */
1457        *policy = SCHED_FIFO;
1458        param->sched_priority = 0;
1459        return 0;
1460}
1461