net: Make select() not spurious
[akaros.git] user/vmm/sched.c
index 5324464..32c420e 100644
--- a/user/vmm/sched.c
+++ b/user/vmm/sched.c
@@ -6,6 +6,7 @@
 
 #include <vmm/sched.h>
 #include <vmm/vmm.h>
+#include <vmm/vthread.h>
 #include <sys/mman.h>
 #include <stdlib.h>
 #include <assert.h>
@@ -15,6 +16,7 @@
 #include <parlib/arch/trap.h>
 #include <parlib/ros_debug.h>
 #include <parlib/vcore_tick.h>
+#include <parlib/slab.h>
 
 int vmm_sched_period_usec = 1000;
 
@@ -27,11 +29,13 @@ static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
 /* Runnable queues, broken up by thread type. */
 static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
 static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
+static struct vmm_thread **greedy_rnbl_guests;
 /* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
 static atomic_t nr_unblk_tasks;
 static atomic_t nr_unblk_guests;
 /* Global evq for all syscalls.  Could make this per vcore or whatever. */
 static struct event_queue *sysc_evq;
+static struct kmem_cache *task_thread_cache;
 
 static void vmm_sched_init(void);
 static void vmm_sched_entry(void);
@@ -64,11 +68,24 @@ static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
 static void acct_thread_blocked(struct vmm_thread *vth);
 static void acct_thread_unblocked(struct vmm_thread *vth);
 static void enqueue_vmm_thread(struct vmm_thread *vth);
+static int task_thread_ctor(void *obj, void *priv, int flags);
+static void task_thread_dtor(void *obj, void *priv);
 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                            int type);
 static void *__alloc_stack(size_t stacksize);
 static void __free_stack(void *stacktop, size_t stacksize);
 
+static bool sched_is_greedy(void)
+{
+       return parlib_never_yield;
+}
+
+static unsigned int sched_nr_greedy_cores(void)
+{
+       if (!current_vm)
+               return 1;
+       return current_vm->nr_gpcs + 1;
+}
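Greedy mode dedicates one vcore per guest pcore, plus vcore 0 for task threads, which is where the nr_gpcs + 1 count above comes from. A minimal sketch of the implied mapping (the helper below is illustrative and not part of this patch; the scheduler itself open-codes vcore_id() - 1):

/* Illustrative helper, not in this patch: vcore 0 runs task threads and
 * vcore i (1 <= i <= nr_gpcs) is dedicated to guest pcore i - 1.
 * sched_pick_thread_greedy() relies on exactly this mapping when it
 * indexes greedy_rnbl_guests[vcore_id() - 1]. */
static inline unsigned int greedy_vcore_to_gpcid(unsigned int vcoreid)
{
        assert(vcoreid >= 1 && vcoreid <= current_vm->nr_gpcs);
        return vcoreid - 1;
}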
 
 static void restart_thread(struct syscall *sysc)
 {
@@ -103,7 +120,8 @@ static struct event_queue *setup_sysc_evq(int vcoreid)
 
        mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
                                     PROT_WRITE | PROT_READ,
-                                    MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
+                                    MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE,
+                                    -1, 0);
        evq = get_eventq_raw();
        assert(mmap_block && evq);
        evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
@@ -127,6 +145,11 @@ static void vmm_sched_init(void)
        /* for lack of a better vcore, might as well send to 0 */
        sysc_evq = setup_sysc_evq(0);
        uthread_2ls_init((struct uthread*)thread0, vmm_handle_syscall, NULL);
+       task_thread_cache = kmem_cache_create("task threads",
+                                             sizeof(struct vmm_thread),
+                                             __alignof__(struct vmm_thread), 0,
+                                             task_thread_ctor, task_thread_dtor,
+                                             NULL);
 }
 
 /* The scheduling policy is encapsulated in the next few functions (from here
@@ -154,26 +177,12 @@ static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
 
 static struct vmm_thread *pick_a_thread_degraded(void)
 {
-       struct vmm_thread *vth = 0;
-       static int next_class = VMM_THREAD_GUEST;
+       struct vmm_thread *vth;
 
-       /* We don't have a lot of cores (maybe 0), so we'll alternate which type of
-        * thread we look at first.  Basically, we're RR within a class of threads,
-        * and we'll toggle between those two classes. */
        spin_pdr_lock(&queue_lock);
-       if (next_class == VMM_THREAD_GUEST) {
-               if (!vth)
-                       vth = __pop_first(&rnbl_guests);
-               if (!vth)
-                       vth = __pop_first(&rnbl_tasks);
-               next_class = VMM_THREAD_TASK;
-       } else {
-               if (!vth)
-                       vth = __pop_first(&rnbl_tasks);
-               if (!vth)
-                       vth = __pop_first(&rnbl_guests);
-               next_class = VMM_THREAD_GUEST;
-       };
+       vth = __pop_first(&rnbl_tasks);
+       if (!vth)
+               vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
 }
@@ -210,9 +219,13 @@ static void yield_current_uth(void)
  * to send events, how to avoid interfering with gpcs, etc. */
 static bool try_to_get_vcores(void)
 {
-       int nr_vcores_wanted = desired_nr_vcores();
-       bool have_enough = nr_vcores_wanted <= num_vcores();
+       int nr_vcores_wanted;
+       bool have_enough;
 
+       if (sched_is_greedy())
+               return num_vcores() == sched_nr_greedy_cores();
+       nr_vcores_wanted = desired_nr_vcores();
+       have_enough = nr_vcores_wanted <= num_vcores();
        if (have_enough) {
                vcore_tick_disable();
                return TRUE;
@@ -222,7 +235,44 @@ static bool try_to_get_vcores(void)
        return FALSE;
 }
 
-static void __attribute__((noreturn)) vmm_sched_entry(void)
+static void stats_run_vth(struct vmm_thread *vth)
+{
+       vth->nr_runs++;
+       if (vth->prev_vcoreid != vcore_id()) {
+               vth->prev_vcoreid = vcore_id();
+               vth->nr_resched++;
+       }
+}
+
+/* TODO: This assumes we get all of our vcores. */
+static struct vmm_thread *sched_pick_thread_greedy(void)
+{
+       struct vmm_thread *vth;
+
+       if (current_uthread) {
+               stats_run_vth((struct vmm_thread*)current_uthread);
+               run_current_uthread();
+       }
+       if (vcore_id() == 0) {
+               spin_pdr_lock(&queue_lock);
+               vth = __pop_first(&rnbl_tasks);
+               spin_pdr_unlock(&queue_lock);
+               return vth;
+       }
+       /* This races with enqueue_vmm_thread, which can run on another core.
+        * Here are the rules:
+        * - set when runnable (race free, only one state for the thread at a time)
+        * - cleared when we run it (race free, we're the only runners)
+        * - if we take an interrupt, we'll just run_current_uthread and not check
+        * - if we vmexit, we'll run the buddy directly */
+       assert(vcore_id() <= current_vm->nr_gpcs);
+       vth = greedy_rnbl_guests[vcore_id() - 1];
+       if (vth)
+               greedy_rnbl_guests[vcore_id() - 1] = NULL;
+       return vth;
+}
+
+static struct vmm_thread *sched_pick_thread_nice(void)
 {
        struct vmm_thread *vth;
        bool have_enough;
@@ -232,14 +282,38 @@ static void __attribute__((noreturn)) vmm_sched_entry(void)
                /* slightly less than ideal: we grab the queue lock twice */
                yield_current_uth();
        }
-       if (current_uthread)
+       if (current_uthread) {
+               stats_run_vth((struct vmm_thread*)current_uthread);
                run_current_uthread();
+       }
        if (have_enough)
                vth = pick_a_thread_plenty();
        else
                vth = pick_a_thread_degraded();
-       if (!vth)
-               vcore_yield_or_restart();
+       return vth;
+}
+
+static void __attribute__((noreturn)) vmm_sched_entry(void)
+{
+       struct vmm_thread *vth;
+
+       if (sched_is_greedy()) {
+               vth = sched_pick_thread_greedy();
+               if (!vth) {
+                       /* sys_halt_core will return, but we need to restart the vcore.  We
+                        * might have woken due to an event, and we'll need to handle_events
+                        * and other things dealt with by uthreads. */
+                       if (vcore_id() == 0)
+                               sys_halt_core(0);
+                       /* In greedy mode, yield will abort and we'll just restart */
+                       vcore_yield_or_restart();
+               }
+       } else {
+               vth = sched_pick_thread_nice();
+               if (!vth)
+                       vcore_yield_or_restart();
+       }
+       stats_run_vth(vth);
        run_uthread((struct uthread*)vth);
 }
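When greedy vcore 0 finds nothing to run, it halts and relies on the producer side (enqueue_vmm_thread(), later in this patch) calling vcore_wake(0, false) after posting a task. A condensed sketch of that handshake, assuming sys_halt_core() returns right away if a notification is already pending, which is what keeps a wakeup from being lost between the queue check and the halt:

/* Illustrative condensation of the vcore 0 idle path above, not verbatim. */
static void greedy_vcore0_idle_sketch(void)
{
        struct vmm_thread *vth;

        spin_pdr_lock(&queue_lock);
        vth = __pop_first(&rnbl_tasks);
        spin_pdr_unlock(&queue_lock);
        if (vth) {
                run_uthread((struct uthread*)vth);
                assert(0);      /* run_uthread() does not return */
        }
        /* Producer: TAILQ_INSERT_TAIL under the lock, then vcore_wake(0). */
        sys_halt_core(0);
        /* In greedy mode, yield aborts; the vcore restarts, re-enters
         * vmm_sched_entry(), and re-checks the queue. */
        vcore_yield_or_restart();
}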
 
@@ -326,6 +400,7 @@ static void __swap_to_gth(struct uthread *uth, void *dummy)
        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
+       stats_run_vth((struct vmm_thread*)cth->buddy);
        run_uthread((struct uthread*)cth->buddy);
        assert(0);
 }
@@ -363,6 +438,7 @@ static void vmm_thread_refl_vm_fault(struct uthread *uth)
        struct guest_thread *gth = (struct guest_thread*)uth;
        struct ctlr_thread *cth = gth->buddy;
 
+       gth->nr_vmexits++;
        /* The ctlr starts from the top every time we get a new fault. */
        cth->uthread.flags |= UTHREAD_SAVED;
        init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
@@ -370,6 +446,7 @@ static void vmm_thread_refl_vm_fault(struct uthread *uth)
        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
+       stats_run_vth((struct vmm_thread*)cth);
        run_uthread((struct uthread*)cth);
        assert(0);
 }
@@ -393,18 +470,45 @@ static void vmm_thread_refl_fault(struct uthread *uth,
        }
 }
 
+static void task_thread_dtor(void *obj, void *priv)
+{
+       struct task_thread *tth = (struct task_thread*)obj;
+
+       uthread_cleanup((struct uthread*)tth);
+       __free_stack(tth->stacktop, tth->stacksize);
+}
+
+static void task_thread_exit(struct task_thread *tth)
+{
+       struct uthread *uth = (struct uthread*)tth;
+
+       if (uth->flags & UTHREAD_IS_THREAD0)
+               return;
+       kmem_cache_free(task_thread_cache, tth);
+}
+
+static void ctlr_thread_exit(struct ctlr_thread *cth)
+{
+       __vthread_exited((struct vthread*)cth->buddy);
+}
+
 static void vmm_thread_exited(struct uthread *uth)
 {
        struct vmm_thread *vth = (struct vmm_thread*)uth;
-       struct task_thread *tth = (struct task_thread*)uth;
 
-       /* Catch bugs.  Right now, only tasks threads can exit. */
-       assert(vth->type == VMM_THREAD_TASK);
+       assert(vth->type != VMM_THREAD_GUEST);
 
-       acct_thread_blocked((struct vmm_thread*)tth);
-       uthread_cleanup(uth);
-       __free_stack(tth->stacktop, tth->stacksize);
-       free(tth);
+       acct_thread_blocked(vth);
+       switch (vth->type) {
+       case VMM_THREAD_TASK:
+               task_thread_exit((struct task_thread*)uth);
+               break;
+       case VMM_THREAD_CTLR:
+               ctlr_thread_exit((struct ctlr_thread*)uth);
+               break;
+       case VMM_THREAD_GUEST:
+               panic("Guest threads shouldn't be able to exit");
+       }
 }
 
 static void destroy_guest_thread(struct guest_thread *gth)
@@ -418,8 +522,9 @@ static void destroy_guest_thread(struct guest_thread *gth)
        free(gth);
 }
 
-static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
-                                                unsigned int gpcoreid)
+struct guest_thread *create_guest_thread(struct virtual_machine *vm,
+                                         unsigned int gpcoreid,
+                                         struct vmm_gpcore_init *gpci)
 {
        struct guest_thread *gth;
        struct ctlr_thread *cth;
@@ -438,6 +543,7 @@ static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
        gth->buddy = cth;
        cth->buddy = gth;
        gth->gpc_id = gpcoreid;
+       gth->gpci = *gpci;
        cth->stacksize = VMM_THR_STACKSIZE;
        cth->stacktop = __alloc_stack(cth->stacksize);
        if (!cth->stacktop) {
@@ -447,31 +553,73 @@ static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
        }
        gth->uthread.u_ctx.type = ROS_VM_CTX;
        gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
-       /* No need to init the ctlr.  It gets re-init'd each time it starts. */
        uthread_init((struct uthread*)gth, &gth_attr);
        uthread_init((struct uthread*)cth, &cth_attr);
-       /* TODO: give it a correct FP state.  Our current one is probably fine */
-       restore_fp_state(&gth->uthread.as);
-       gth->uthread.flags |= UTHREAD_FPSAVED;
        gth->halt_mtx = uth_mutex_alloc();
        gth->halt_cv = uth_cond_var_alloc();
        return gth;
 }
 
-int vmm_init(struct virtual_machine *vm, int flags)
+static void ev_handle_diag(struct event_msg *ev_msg, unsigned int ev_type,
+                           void *data)
+{
+       struct virtual_machine *vm = current_vm;
+       struct guest_thread *gth;
+       struct ctlr_thread *cth;
+       bool reset = FALSE;
+
+       if (ev_msg && (ev_msg->ev_arg1 == 1))
+               reset = TRUE;
+
+       fprintf(stderr, "\nSCHED stats:\n---------------\n");
+       for (int i = 0; i < vm->nr_gpcs; i++) {
+               gth = gpcid_to_gth(vm, i);
+               cth = gth->buddy;
+               fprintf(stderr, "\tGPC %2d: %lu resched, %lu gth runs, %lu ctl runs, %lu user-handled vmexits\n",
+                       i,
+                       ((struct vmm_thread*)gth)->nr_resched,
+                       ((struct vmm_thread*)gth)->nr_runs,
+                       ((struct vmm_thread*)cth)->nr_runs,
+                       gth->nr_vmexits);
+               if (reset) {
+                       ((struct vmm_thread*)gth)->nr_resched = 0;
+                       ((struct vmm_thread*)gth)->nr_runs = 0;
+                       ((struct vmm_thread*)cth)->nr_runs = 0;
+                       gth->nr_vmexits = 0;
+               }
+       }
+       fprintf(stderr, "\n\tNr unblocked gpc %lu, Nr unblocked tasks %lu\n",
+               atomic_read(&nr_unblk_guests), atomic_read(&nr_unblk_tasks));
+}
+
+int vmm_init(struct virtual_machine *vm, struct vmm_gpcore_init *gpcis,
+             int flags)
 {
        struct guest_thread **gths;
 
        if (current_vm)
                return -1;
        current_vm = vm;
-       if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
+       /* We should tell the kernel to create all of the GPCs we'll need in
+        * advance.
+        *
+        * We could create the others on the fly, but the kernel's answer for
+        * CPUID[0x1] will not have the total number of cores.  If we move that
+        * handler to userspace, we can create the SMP-booted GPCs on the fly.
+        *
+        * We'd also have to deal with gths[] growing dynamically, which would
+        * require synchronization. */
+       if (syscall(SYS_vmm_add_gpcs, vm->nr_gpcs, gpcis) != vm->nr_gpcs)
                return -1;
+       if (flags) {
+               if (syscall(SYS_vmm_ctl, VMM_CTL_SET_FLAGS, flags))
+                       return -1;
+       }
        gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
        if (!gths)
                return -1;
        for (int i = 0; i < vm->nr_gpcs; i++) {
-               gths[i] = create_guest_thread(vm, i);
+               gths[i] = create_guest_thread(vm, i, &gpcis[i]);
                if (!gths[i]) {
                        for (int j = 0; j < i; j++)
                                destroy_guest_thread(gths[j]);
@@ -479,8 +627,18 @@ int vmm_init(struct virtual_machine *vm, int flags)
                        return -1;
                }
        }
-       vm->gths = gths;
+       wmb(); /* All gths posted before advertising. */
+       vm->__gths = gths;
        uthread_mcp_init();
+       register_ev_handler(EV_FREE_APPLE_PIE, ev_handle_diag, NULL);
+       if (sched_is_greedy()) {
+               greedy_rnbl_guests = calloc(vm->nr_gpcs, sizeof(struct vmm_thread *));
+               assert(greedy_rnbl_guests);
+               vcore_request_total(sched_nr_greedy_cores());
+               syscall(SYS_vmm_ctl, VMM_CTL_SET_EXITS,
+                       syscall(SYS_vmm_ctl, VMM_CTL_GET_EXITS) &
+                               ~(VMM_CTL_EXIT_HALT | VMM_CTL_EXIT_MWAIT));
+       }
        return 0;
 }
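For callers, the new signature means the per-gpcore init data is passed explicitly rather than read out of the VM struct. A hedged caller-side sketch (everything below is illustrative; field setup is elided and flags are left at 0):

/* Hypothetical caller: one vmm_gpcore_init per guest pcore, handed to
 * vmm_init() along with any VMM_CTL flags.  vmm_init() copies each entry
 * into its gth->gpci, so a stack array is fine here. */
static int start_vm_sketch(void)
{
        static struct virtual_machine vm = { .nr_gpcs = 4 };
        struct vmm_gpcore_init gpcis[4];

        memset(gpcis, 0, sizeof(gpcis));
        /* ... fill in each gpcis[i] as the kernel expects ... */
        return vmm_init(&vm, gpcis, 0);
}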
 
@@ -497,6 +655,21 @@ static void __task_thread_run(void)
        uth_2ls_thread_exit(tth->func(tth->arg));
 }
 
+static int task_thread_ctor(void *obj, void *priv, int flags)
+{
+       struct vmm_thread *vth = (struct vmm_thread*)obj;
+       struct task_thread *tth = (struct task_thread*)obj;
+
+       memset(vth, 0, sizeof(struct vmm_thread));
+       vth->type = VMM_THREAD_TASK;
+       vth->vm = current_vm;
+       tth->stacksize = VMM_THR_STACKSIZE;
+       tth->stacktop = __alloc_stack(tth->stacksize);
+       if (!tth->stacktop)
+               return -1;
+       return 0;
+}
+
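Together with task_thread_dtor() and task_thread_exit() earlier in this patch, the ctor above makes task threads slab-cached objects: the stack is set up once when the slab constructs the object, survives kmem_cache_alloc()/kmem_cache_free() cycles, and is only torn down by the dtor when the cache reclaims it. A minimal lifecycle sketch, assuming the kmem_cache semantics implied by how this file uses it:

/* Illustrative lifecycle only; this function is hypothetical. */
static void task_thread_cache_sketch(void)
{
        struct task_thread *tth;

        /* The ctor has already run by the time an object leaves the cache,
         * so type, vm, and the stack are ready to use. */
        tth = kmem_cache_alloc(task_thread_cache, 0);
        assert(tth);
        /* ... __vmm_run_task() fills in func/arg and starts the uthread ... */
        kmem_cache_free(task_thread_cache, tth);
        /* The dtor (uthread_cleanup() + __free_stack()) runs only when the
         * slab layer reclaims the object, not on every free. */
}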
 /* Helper, creates and starts a task thread. */
 static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
                                           void *(*func)(void *), void *arg,
@@ -504,15 +677,7 @@ static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
 {
        struct task_thread *tth;
 
-       tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
-       if (!tth)
-               return 0;
-       tth->stacksize = VMM_THR_STACKSIZE;
-       tth->stacktop = __alloc_stack(tth->stacksize);
-       if (!tth->stacktop) {
-               free(tth);
-               return 0;
-       }
+       tth = kmem_cache_alloc(task_thread_cache, 0);
        tth->func = func;
        tth->arg = arg;
        init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
@@ -570,19 +735,41 @@ static void acct_thread_unblocked(struct vmm_thread *vth)
        }
 }
 
+static void greedy_mark_guest_runnable(struct vmm_thread *vth)
+{
+       int gpcid;
+
+       if (vth->type == VMM_THREAD_GUEST)
+               gpcid = ((struct guest_thread*)vth)->gpc_id;
+       else
+               gpcid = ((struct ctlr_thread*)vth)->buddy->gpc_id;
+       /* racing with the reader */
+       greedy_rnbl_guests[gpcid] = vth;
+}
+
 static void enqueue_vmm_thread(struct vmm_thread *vth)
 {
-       spin_pdr_lock(&queue_lock);
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
-               TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
+               if (sched_is_greedy()) {
+                       greedy_mark_guest_runnable(vth);
+               } else {
+                       spin_pdr_lock(&queue_lock);
+                       TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
+                       spin_pdr_unlock(&queue_lock);
+               }
                break;
        case VMM_THREAD_TASK:
+               spin_pdr_lock(&queue_lock);
                TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
+               spin_pdr_unlock(&queue_lock);
+               if (sched_is_greedy())
+                       vcore_wake(0, false);
                break;
+       default:
+               panic("Bad vmm_thread type %d\n", vth->type);
        }
-       spin_pdr_unlock(&queue_lock);
        try_to_get_vcores();
 }
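The greedy guest handoff above is a single-slot exchange: whichever core makes a guest (or its ctlr buddy) runnable publishes it into that gpcore's slot, and only the vcore dedicated to that gpcore ever reads and clears it, which is why no lock is taken. A condensed, illustrative restatement of both ends (the real code is greedy_mark_guest_runnable() above and sched_pick_thread_greedy() earlier in this patch):

/* Illustrative only -- producer side, on whichever core woke the thread. */
static void greedy_publish_sketch(struct vmm_thread *vth, int gpcid)
{
        greedy_rnbl_guests[gpcid] = vth;
}

/* Illustrative only -- consumer side, run solely by the dedicated vcore. */
static struct vmm_thread *greedy_consume_sketch(void)
{
        struct vmm_thread *vth = greedy_rnbl_guests[vcore_id() - 1];

        if (vth)
                greedy_rnbl_guests[vcore_id() - 1] = NULL;
        return vth;
}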
 
@@ -611,7 +798,7 @@ static void *__alloc_stack(size_t stacksize)
        int force_a_page_fault;
        void *stacktop;
        void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
-                             MAP_ANONYMOUS, -1, 0);
+                             MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 
        if (stackbot == MAP_FAILED)
                return 0;