parlib: vmm: Allow VM uthreads to have no FP state
diff --git a/user/vmm/sched.c b/user/vmm/sched.c
index 744cb39..26d5d17 100644
--- a/user/vmm/sched.c
+++ b/user/vmm/sched.c
@@ -15,6 +15,7 @@
 #include <parlib/arch/trap.h>
 #include <parlib/ros_debug.h>
 #include <parlib/vcore_tick.h>
+#include <parlib/slab.h>
 
 int vmm_sched_period_usec = 1000;
 
@@ -27,24 +28,27 @@ static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
 /* Runnable queues, broken up by thread type. */
 static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
 static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
+static struct vmm_thread **greedy_rnbl_guests;
 /* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
 static atomic_t nr_unblk_tasks;
 static atomic_t nr_unblk_guests;
 /* Global evq for all syscalls.  Could make this per vcore or whatever. */
 static struct event_queue *sysc_evq;
+static struct kmem_cache *task_thread_cache;
 
+static void vmm_sched_init(void);
 static void vmm_sched_entry(void);
 static void vmm_thread_runnable(struct uthread *uth);
 static void vmm_thread_paused(struct uthread *uth);
 static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
-static void vmm_thread_has_blocked(struct uthread *uth, uth_sync_t *sync_obj,
-                                   int flags);
+static void vmm_thread_has_blocked(struct uthread *uth, int flags);
 static void vmm_thread_refl_fault(struct uthread *uth,
                                   struct user_context *ctx);
 static void vmm_thread_exited(struct uthread *uth);
 static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg);
 
 struct schedule_ops vmm_sched_ops = {
+       .sched_init = vmm_sched_init,
        .sched_entry = vmm_sched_entry,
        .thread_runnable = vmm_thread_runnable,
        .thread_paused = vmm_thread_paused,
@@ -55,17 +59,32 @@ struct schedule_ops vmm_sched_ops = {
        .thread_create = vmm_thread_create,
 };
 
+struct schedule_ops *sched_ops = &vmm_sched_ops;
+
 /* Helpers */
 static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                                void *data);
 static void acct_thread_blocked(struct vmm_thread *vth);
 static void acct_thread_unblocked(struct vmm_thread *vth);
 static void enqueue_vmm_thread(struct vmm_thread *vth);
+static int task_thread_ctor(void *obj, void *priv, int flags);
+static void task_thread_dtor(void *obj, void *priv);
 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                            int type);
 static void *__alloc_stack(size_t stacksize);
 static void __free_stack(void *stacktop, size_t stacksize);
 
+static bool sched_is_greedy(void)
+{
+       return parlib_never_yield;
+}
+
+static unsigned int sched_nr_greedy_cores(void)
+{
+       if (!current_vm)
+               return 1;
+       return current_vm->nr_gpcs + 1;
+}
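
The greedy policy is keyed off parlib's never-yield flag rather than a new VMM knob, so a VMM opts in before bringing the VM up.  A minimal sketch of that opt-in, assuming parlib exports parlib_never_yield as a settable bool and that the header names match the tree (not shown in this diff):

#include <parlib/parlib.h>
#include <vmm/vmm.h>
#include <vmm/sched.h>

static struct virtual_machine vm = { .nr_gpcs = 4 };

int main(void)
{
        /* Must be set before vmm_init(): sched_is_greedy() reads it when
         * deciding whether to pin nr_gpcs + 1 vcores up front. */
        parlib_never_yield = TRUE;
        if (vmm_init(&vm, 0))
                return 1;
        /* ... load the guest, start its cores, run tasks ... */
        return 0;
}
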
 
 static void restart_thread(struct syscall *sysc)
 {
@@ -100,7 +119,8 @@ static struct event_queue *setup_sysc_evq(int vcoreid)
 
        mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
                                     PROT_WRITE | PROT_READ,
-                                    MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
+                                    MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE,
+                                    -1, 0);
        evq = get_eventq_raw();
        assert(mmap_block && evq);
        evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
@@ -110,13 +130,10 @@ static struct event_queue *setup_sysc_evq(int vcoreid)
        return evq;
 }
 
-static void __attribute__((constructor)) vmm_lib_init(void)
+static void vmm_sched_init(void)
 {
        struct task_thread *thread0;
 
-       parlib_init_once_racy(return);
-       uthread_lib_init();
-
        /* Note that thread0 doesn't belong to a VM.  We can set this during
         * vmm_init() if we need to. */
        thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
@@ -126,8 +143,12 @@ static void __attribute__((constructor)) vmm_lib_init(void)
        thread0->stacktop = (void*)USTACKTOP;
        /* for lack of a better vcore, might as well send to 0 */
        sysc_evq = setup_sysc_evq(0);
-       uthread_2ls_init((struct uthread*)thread0, &vmm_sched_ops,
-                     vmm_handle_syscall, NULL);
+       uthread_2ls_init((struct uthread*)thread0, vmm_handle_syscall, NULL);
+       task_thread_cache = kmem_cache_create("task threads",
+                                             sizeof(struct vmm_thread),
+                                             __alignof__(struct vmm_thread), 0,
+                                             task_thread_ctor, task_thread_dtor,
+                                             NULL);
 }
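
Task threads now come from a slab cache with a ctor/dtor pair (added further down in this diff), so a thread's mmap'd stack survives alloc/free cycles instead of being mapped and unmapped per thread.  A minimal sketch of the intended pattern; the helper below is hypothetical, not part of the commit:

static struct task_thread *task_thread_get(void *(*func)(void *), void *arg)
{
        struct task_thread *tth;

        /* The ctor already set the vmm_thread type/vm and allocated the
         * stack; callers only fill in their own fields. */
        tth = kmem_cache_alloc(task_thread_cache, 0);
        if (!tth)
                return NULL;
        tth->func = func;
        tth->arg = arg;
        return tth;
}

kmem_cache_free(task_thread_cache, tth) hands the object back with its stack still mapped; under the usual slab ctor/dtor contract, task_thread_dtor() only runs when the cache tears a slab down.
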
 
 /* The scheduling policy is encapsulated in the next few functions (from here
@@ -155,26 +176,12 @@ static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
 
 static struct vmm_thread *pick_a_thread_degraded(void)
 {
-       struct vmm_thread *vth = 0;
-       static int next_class = VMM_THREAD_GUEST;
+       struct vmm_thread *vth;
 
-       /* We don't have a lot of cores (maybe 0), so we'll alternate which type of
-        * thread we look at first.  Basically, we're RR within a class of threads,
-        * and we'll toggle between those two classes. */
        spin_pdr_lock(&queue_lock);
-       if (next_class == VMM_THREAD_GUEST) {
-               if (!vth)
-                       vth = __pop_first(&rnbl_guests);
-               if (!vth)
-                       vth = __pop_first(&rnbl_tasks);
-               next_class = VMM_THREAD_TASK;
-       } else {
-               if (!vth)
-                       vth = __pop_first(&rnbl_tasks);
-               if (!vth)
-                       vth = __pop_first(&rnbl_guests);
-               next_class = VMM_THREAD_GUEST;
-       };
+       vth = __pop_first(&rnbl_tasks);
+       if (!vth)
+               vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
 }
@@ -211,9 +218,13 @@ static void yield_current_uth(void)
  * to send events, how to avoid interfering with gpcs, etc. */
 static bool try_to_get_vcores(void)
 {
-       int nr_vcores_wanted = desired_nr_vcores();
-       bool have_enough = nr_vcores_wanted <= num_vcores();
+       int nr_vcores_wanted;
+       bool have_enough;
 
+       if (sched_is_greedy())
+               return num_vcores() == sched_nr_greedy_cores();
+       nr_vcores_wanted = desired_nr_vcores();
+       have_enough = nr_vcores_wanted <= num_vcores();
        if (have_enough) {
                vcore_tick_disable();
                return TRUE;
@@ -223,7 +234,44 @@ static bool try_to_get_vcores(void)
        return FALSE;
 }
 
-static void __attribute__((noreturn)) vmm_sched_entry(void)
+static void stats_run_vth(struct vmm_thread *vth)
+{
+       vth->nr_runs++;
+       if (vth->prev_vcoreid != vcore_id()) {
+               vth->prev_vcoreid = vcore_id();
+               vth->nr_resched++;
+       }
+}
+
+/* TODO: This assumes we get all of our vcores. */
+static struct vmm_thread *sched_pick_thread_greedy(void)
+{
+       struct vmm_thread *vth;
+
+       if (current_uthread) {
+               stats_run_vth((struct vmm_thread*)current_uthread);
+               run_current_uthread();
+       }
+       if (vcore_id() == 0) {
+               spin_pdr_lock(&queue_lock);
+               vth = __pop_first(&rnbl_tasks);
+               spin_pdr_unlock(&queue_lock);
+               return vth;
+       }
+       /* This races with enqueue_vmm_thread, which can run on another core.
+        * Here are the rules:
+        * - set when runnable (race free, only one state for the thread at a time)
+        * - cleared when we run it (race free, we're the only runners)
+        * - if we take an interrupt, we'll just run_current_uthread and not check
+        * - if we vmexit, we'll run the buddy directly */
+       assert(vcore_id() <= current_vm->nr_gpcs);
+       vth = greedy_rnbl_guests[vcore_id() - 1];
+       if (vth)
+               greedy_rnbl_guests[vcore_id() - 1] = NULL;
+       return vth;
+}
+
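
The per-gpc slot array is a lock-free handoff: the waker (greedy_mark_guest_runnable(), added further down in this diff) is the only writer of a non-NULL value, and the owning vcore is the only one that clears it.  A sketch that makes the consume step explicit; the helper is hypothetical and only ever called from the slot's owning vcore (vcore_id() >= 1):

static struct vmm_thread *greedy_pop_my_guest(void)
{
        struct vmm_thread **slot = &greedy_rnbl_guests[vcore_id() - 1];
        struct vmm_thread *vth = *slot;

        /* We're the only clearer, and a runnable thread is only published
         * once, so a plain read/clear is enough.  A write that lands after
         * our read is simply picked up on the next pass through
         * vmm_sched_entry(). */
        if (vth)
                *slot = NULL;
        return vth;
}
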
+static struct vmm_thread *sched_pick_thread_nice(void)
 {
        struct vmm_thread *vth;
        bool have_enough;
@@ -233,14 +281,28 @@ static void __attribute__((noreturn)) vmm_sched_entry(void)
                /* slightly less than ideal: we grab the queue lock twice */
                yield_current_uth();
        }
-       if (current_uthread)
+       if (current_uthread) {
+               stats_run_vth((struct vmm_thread*)current_uthread);
                run_current_uthread();
+       }
        if (have_enough)
                vth = pick_a_thread_plenty();
        else
                vth = pick_a_thread_degraded();
+       return vth;
+}
+
+static void __attribute__((noreturn)) vmm_sched_entry(void)
+{
+       struct vmm_thread *vth;
+
+       if (sched_is_greedy())
+               vth = sched_pick_thread_greedy();
+       else
+               vth = sched_pick_thread_nice();
        if (!vth)
                vcore_yield_or_restart();
+       stats_run_vth(vth);
        run_uthread((struct uthread*)vth);
 }
 
@@ -274,16 +336,13 @@ static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
        /* GIANT WARNING: do not touch the thread after this point. */
 }
 
-static void vmm_thread_has_blocked(struct uthread *uth, uth_sync_t *sync_obj,
-                                   int flags)
+static void vmm_thread_has_blocked(struct uthread *uth, int flags)
 {
        /* The thread blocked on something like a mutex.  It's not runnable, so we
         * don't need to put it on a list, but we do need to account for it not
         * running.  We'll find out (via thread_runnable) when it starts up again.
         */
        acct_thread_blocked((struct vmm_thread*)uth);
-       if (sync_obj)
-               __uth_default_sync_enqueue(uth, sync_obj);
 }
 
 static void refl_error(struct uthread *uth, unsigned int trap_nr,
@@ -330,6 +389,7 @@ static void __swap_to_gth(struct uthread *uth, void *dummy)
        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
+       stats_run_vth((struct vmm_thread*)cth->buddy);
        run_uthread((struct uthread*)cth->buddy);
        assert(0);
 }
@@ -367,6 +427,7 @@ static void vmm_thread_refl_vm_fault(struct uthread *uth)
        struct guest_thread *gth = (struct guest_thread*)uth;
        struct ctlr_thread *cth = gth->buddy;
 
+       gth->nr_vmexits++;
        /* The ctlr starts from the top every time we get a new fault. */
        cth->uthread.flags |= UTHREAD_SAVED;
        init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
@@ -374,6 +435,7 @@ static void vmm_thread_refl_vm_fault(struct uthread *uth)
        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
+       stats_run_vth((struct vmm_thread*)cth);
        run_uthread((struct uthread*)cth);
        assert(0);
 }
@@ -397,6 +459,13 @@ static void vmm_thread_refl_fault(struct uthread *uth,
        }
 }
 
+static void task_thread_dtor(void *obj, void *priv)
+{
+       struct task_thread *tth = (struct task_thread*)obj;
+
+       __free_stack(tth->stacktop, tth->stacksize);
+}
+
 static void vmm_thread_exited(struct uthread *uth)
 {
        struct vmm_thread *vth = (struct vmm_thread*)uth;
@@ -407,8 +476,9 @@ static void vmm_thread_exited(struct uthread *uth)
 
        acct_thread_blocked((struct vmm_thread*)tth);
        uthread_cleanup(uth);
-       __free_stack(tth->stacktop, tth->stacksize);
-       free(tth);
+       if (uth->flags & UTHREAD_IS_THREAD0)
+               return;
+       kmem_cache_free(task_thread_cache, tth);
 }
 
 static void destroy_guest_thread(struct guest_thread *gth)
@@ -451,17 +521,45 @@ static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
        }
        gth->uthread.u_ctx.type = ROS_VM_CTX;
        gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
-       /* No need to init the ctlr.  It gets re-init'd each time it starts. */
        uthread_init((struct uthread*)gth, &gth_attr);
        uthread_init((struct uthread*)cth, &cth_attr);
-       /* TODO: give it a correct FP state.  Our current one is probably fine */
-       restore_fp_state(&gth->uthread.as);
-       gth->uthread.flags |= UTHREAD_FPSAVED;
        gth->halt_mtx = uth_mutex_alloc();
        gth->halt_cv = uth_cond_var_alloc();
        return gth;
 }
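
With the two removed lines above, a freshly created guest thread carries no FP state of its own (the point of this commit's title); its uthread.as is only populated once the thread runs and gets saved.  If a VMM ever needs to seed a specific FP/XSAVE image, a hedged sketch using the same uthread fields the removed lines touched could look like this (hypothetical helper, not part of the commit):

static void gth_seed_fp_state(struct guest_thread *gth,
                              const struct ancillary_state *as)
{
        /* UTHREAD_FPSAVED tells the 2LS that uthread.as holds valid state
         * to restore before the thread next runs. */
        gth->uthread.as = *as;
        gth->uthread.flags |= UTHREAD_FPSAVED;
}
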
 
+static void ev_handle_diag(struct event_msg *ev_msg, unsigned int ev_type,
+                           void *data)
+{
+       struct virtual_machine *vm = current_vm;
+       struct guest_thread *gth;
+       struct ctlr_thread *cth;
+       bool reset = FALSE;
+
+       if (ev_msg && (ev_msg->ev_arg1 == 1))
+               reset = TRUE;
+
+       fprintf(stderr, "\nSCHED stats:\n---------------\n");
+       for (int i = 0; i < vm->nr_gpcs; i++) {
+               gth = vm->gths[i];
+               cth = gth->buddy;
+               fprintf(stderr, "\tGPC %2d: %lu resched, %lu gth runs, %lu ctl runs, %lu user-handled vmexits\n",
+                               i,
+                       ((struct vmm_thread*)gth)->nr_resched,
+                       ((struct vmm_thread*)gth)->nr_runs,
+                       ((struct vmm_thread*)cth)->nr_runs,
+                       gth->nr_vmexits);
+               if (reset) {
+                   ((struct vmm_thread*)gth)->nr_resched = 0;
+                   ((struct vmm_thread*)gth)->nr_runs = 0;
+                   ((struct vmm_thread*)cth)->nr_runs = 0;
+                   gth->nr_vmexits = 0;
+               }
+       }
+       fprintf(stderr, "\n\tNr unblocked gpc %lu, Nr unblocked tasks %lu\n",
+               atomic_read(&nr_unblk_guests), atomic_read(&nr_unblk_tasks));
+}
+
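
vmm_init() below registers this handler on EV_FREE_APPLE_PIE.  A sketch of triggering the dump from another process, assuming parlib's sys_notify() wrapper is available to the sender and that it knows the VMM's pid (both are assumptions, not shown in this diff):

#include <parlib/parlib.h>
#include <parlib/event.h>

static void poke_sched_stats(int vmm_pid, bool reset)
{
        struct event_msg msg = {0};

        msg.ev_type = EV_FREE_APPLE_PIE;
        msg.ev_arg1 = reset ? 1 : 0;    /* ev_arg1 == 1 zeroes the counters */
        sys_notify(vmm_pid, EV_FREE_APPLE_PIE, &msg);
}
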
 int vmm_init(struct virtual_machine *vm, int flags)
 {
        struct guest_thread **gths;
@@ -485,6 +583,14 @@ int vmm_init(struct virtual_machine *vm, int flags)
        }
        vm->gths = gths;
        uthread_mcp_init();
+       register_ev_handler(EV_FREE_APPLE_PIE, ev_handle_diag, NULL);
+       if (sched_is_greedy()) {
+               greedy_rnbl_guests = calloc(vm->nr_gpcs, sizeof(struct vmm_thread *));
+               assert(greedy_rnbl_guests);
+               vcore_request_total(sched_nr_greedy_cores());
+               syscall(SYS_vmm_ctl, VMM_CTL_SET_EXITS,
+                       syscall(SYS_vmm_ctl, VMM_CTL_GET_EXITS) & ~VMM_CTL_EXIT_HALT);
+       }
        return 0;
 }
 
@@ -501,6 +607,21 @@ static void __task_thread_run(void)
        uth_2ls_thread_exit(tth->func(tth->arg));
 }
 
+static int task_thread_ctor(void *obj, void *priv, int flags)
+{
+       struct vmm_thread *vth = (struct vmm_thread*)obj;
+       struct task_thread *tth = (struct task_thread*)obj;
+
+       memset(vth, 0, sizeof(struct vmm_thread));
+       vth->type = VMM_THREAD_TASK;
+       vth->vm = current_vm;
+       tth->stacksize = VMM_THR_STACKSIZE;
+       tth->stacktop = __alloc_stack(tth->stacksize);
+       if (!tth->stacktop)
+               return -1;
+       return 0;
+}
+
 /* Helper, creates and starts a task thread. */
 static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
                                           void *(*func)(void *), void *arg,
@@ -508,15 +629,7 @@ static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
 {
        struct task_thread *tth;
 
-       tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
-       if (!tth)
-               return 0;
-       tth->stacksize = VMM_THR_STACKSIZE;
-       tth->stacktop = __alloc_stack(tth->stacksize);
-       if (!tth->stacktop) {
-               free(tth);
-               return 0;
-       }
+       tth = kmem_cache_alloc(task_thread_cache, 0);
        tth->func = func;
        tth->arg = arg;
        init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
@@ -574,19 +687,39 @@ static void acct_thread_unblocked(struct vmm_thread *vth)
        }
 }
 
+static void greedy_mark_guest_runnable(struct vmm_thread *vth)
+{
+       int gpcid;
+
+       if (vth->type == VMM_THREAD_GUEST)
+               gpcid = ((struct guest_thread*)vth)->gpc_id;
+       else
+               gpcid = ((struct ctlr_thread*)vth)->buddy->gpc_id;
+       /* racing with the reader */
+       greedy_rnbl_guests[gpcid] = vth;
+}
+
 static void enqueue_vmm_thread(struct vmm_thread *vth)
 {
-       spin_pdr_lock(&queue_lock);
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
-               TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
+               if (sched_is_greedy()) {
+                       greedy_mark_guest_runnable(vth);
+               } else {
+                       spin_pdr_lock(&queue_lock);
+                       TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
+                       spin_pdr_unlock(&queue_lock);
+               }
                break;
        case VMM_THREAD_TASK:
+               spin_pdr_lock(&queue_lock);
                TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
+               spin_pdr_unlock(&queue_lock);
                break;
+       default:
+               panic("Bad vmm_thread type %p\n", vth->type);
        }
-       spin_pdr_unlock(&queue_lock);
        try_to_get_vcores();
 }
 
@@ -615,7 +748,7 @@ static void *__alloc_stack(size_t stacksize)
        int force_a_page_fault;
        void *stacktop;
        void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
-                             MAP_ANONYMOUS, -1, 0);
+                             MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 
        if (stackbot == MAP_FAILED)
                return 0;