parlib: Implement join/detach() for all uthreads
[akaros.git] / user / vmm / sched.c
/* Copyright (c) 2016 Google Inc.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * 2LS for virtual machines */

#include <vmm/sched.h>
#include <vmm/vmm.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <assert.h>
#include <parlib/spinlock.h>
#include <parlib/event.h>
#include <parlib/ucq.h>
#include <parlib/arch/trap.h>
#include <parlib/ros_debug.h>
#include <parlib/vcore_tick.h>

int vmm_sched_period_usec = 1000;

/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
 * we'll need something analogous to current_uthread, so the 2LS knows which VM
 * it is working on. */
static struct virtual_machine *current_vm;

static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
/* Runnable queues, broken up by thread type. */
static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
/* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
static atomic_t nr_unblk_tasks;
static atomic_t nr_unblk_guests;
/* Global evq for all syscalls.  Could make this per vcore or whatever. */
static struct event_queue *sysc_evq;

static void vmm_sched_entry(void);
static void vmm_thread_runnable(struct uthread *uth);
static void vmm_thread_paused(struct uthread *uth);
static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
static void vmm_thread_has_blocked(struct uthread *uth, uth_sync_t sync_obj,
                                   int flags);
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx);
static void vmm_thread_exited(struct uthread *uth);

struct schedule_ops vmm_sched_ops = {
        .sched_entry = vmm_sched_entry,
        .thread_runnable = vmm_thread_runnable,
        .thread_paused = vmm_thread_paused,
        .thread_blockon_sysc = vmm_thread_blockon_sysc,
        .thread_has_blocked = vmm_thread_has_blocked,
        .thread_refl_fault = vmm_thread_refl_fault,
        .thread_exited = vmm_thread_exited,
};

/* Helpers */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data);
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
static void enqueue_vmm_thread(struct vmm_thread *vth);
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type);
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);

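/* Syscall-completion path: the uthread blocked on sysc, and either the event
 * handler or the loser of the register_evq() race hands it back here.  Clear
 * the syscall and make the thread runnable again. */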
static void restart_thread(struct syscall *sysc)
{
        struct uthread *ut_restartee = (struct uthread*)sysc->u_data;

        /* uthread stuff here: */
        assert(ut_restartee);
        assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
        ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
        vmm_thread_runnable(ut_restartee);
}

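/* Handler for syscall-completion events (hooked up in vmm_lib_init() via
 * uthread_2ls_init()).  The completed syscall rides in ev_arg3. */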
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data)
{
        struct syscall *sysc;

        /* I think we can make this assert now.  If not, check pthread.c.  (The
         * concern was old ev_qs firing and running this handler). */
        assert(ev_msg);
        sysc = ev_msg->ev_arg3;
        assert(sysc);
        restart_thread(sysc);
}

/* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
 * attempt to route the notifs/IPIs to vcoreid */
static struct event_queue *setup_sysc_evq(int vcoreid)
{
        struct event_queue *evq;
        uintptr_t mmap_block;

        mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
                                     PROT_WRITE | PROT_READ,
                                     MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
        evq = get_eventq_raw();
        assert(mmap_block && evq);
        evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
        evq->ev_vcore = vcoreid;
        evq->ev_mbox->type = EV_MBOX_UCQ;
        ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
        return evq;
}

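/* Library constructor: brings up uthreads, creates thread0 (a task thread for
 * the caller that doesn't belong to any VM), allocates the global syscall
 * event queue, and registers our 2LS ops. */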
static void __attribute__((constructor)) vmm_lib_init(void)
{
        struct task_thread *thread0;

        parlib_init_once_racy(return);
        uthread_lib_init();

        /* Note that thread0 doesn't belong to a VM.  We can set this during
         * vmm_init() if we need to. */
        thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
        assert(thread0);
        acct_thread_unblocked((struct vmm_thread*)thread0);
        thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
        thread0->stacktop = (void*)USTACKTOP;
        /* for lack of a better vcore, might as well send to 0 */
        sysc_evq = setup_sysc_evq(0);
        uthread_2ls_init((struct uthread*)thread0, &vmm_sched_ops,
                         vmm_handle_syscall, NULL);
}

/* The scheduling policy is encapsulated in the next few functions (from here
 * down to sched_entry()). */

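/* How many vcores we'd like to be running on: one per unblocked thread. */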
static int desired_nr_vcores(void)
{
        /* Sanity checks on our accounting. */
        assert(atomic_read(&nr_unblk_guests) >= 0);
        assert(atomic_read(&nr_unblk_tasks) >= 0);
        /* Lockless peek.  This is always an estimate.  Some of our tasks busy-wait,
         * so it's not enough to just give us one vcore for all tasks, yet. */
        return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
}

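/* Removes and returns the first thread on the queue, or 0 if it's empty.
 * Caller holds queue_lock. */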
static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
{
        struct vmm_thread *vth;

        vth = TAILQ_FIRST(tq);
        if (vth)
                TAILQ_REMOVE(tq, vth, tq_next);
        return vth;
}

static struct vmm_thread *pick_a_thread_degraded(void)
{
        struct vmm_thread *vth = 0;
        static int next_class = VMM_THREAD_GUEST;

        /* We don't have a lot of cores (maybe 0), so we'll alternate which type of
         * thread we look at first.  Basically, we're RR within a class of threads,
         * and we'll toggle between those two classes. */
        spin_pdr_lock(&queue_lock);
        if (next_class == VMM_THREAD_GUEST) {
                if (!vth)
                        vth = __pop_first(&rnbl_guests);
                if (!vth)
                        vth = __pop_first(&rnbl_tasks);
                next_class = VMM_THREAD_TASK;
        } else {
                if (!vth)
                        vth = __pop_first(&rnbl_tasks);
                if (!vth)
                        vth = __pop_first(&rnbl_guests);
                next_class = VMM_THREAD_GUEST;
        }
        spin_pdr_unlock(&queue_lock);
        return vth;
}

/* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
static struct vmm_thread *pick_a_thread_plenty(void)
{
        struct vmm_thread *vth = 0;

        spin_pdr_lock(&queue_lock);
        if (!vth)
                vth = __pop_first(&rnbl_tasks);
        if (!vth)
                vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
}

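/* Stops whatever uthread this vcore is running, if any, and puts it back on
 * its runnable queue.  Called from vcore context when we're running degraded
 * and the tick fires. */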
static void yield_current_uth(void)
{
        struct vmm_thread *vth;

        if (!current_uthread)
                return;
        vth = (struct vmm_thread*)stop_current_uthread();
        enqueue_vmm_thread(vth);
}

/* Helper, tries to get the right number of vcores.  Returns TRUE if we think we
 * have enough, FALSE otherwise.
 *
 * TODO: this doesn't handle a lot of issues, like preemption, how to
 * run/yield our vcores, dynamic changes in the number of runnables, where
 * to send events, how to avoid interfering with gpcs, etc. */
static bool try_to_get_vcores(void)
{
        int nr_vcores_wanted = desired_nr_vcores();
        bool have_enough = nr_vcores_wanted <= num_vcores();

        if (have_enough) {
                vcore_tick_disable();
                return TRUE;
        }
        vcore_tick_enable(vmm_sched_period_usec);
        vcore_request_total(nr_vcores_wanted);
        return FALSE;
}

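/* 2LS entry point, run in vcore context whenever a vcore starts up: adjust our
 * vcore count, preempt the current uthread if we're degraded and the tick
 * expired, then resume whatever we were running or pick a new thread.  Yields
 * the vcore if there's nothing to do. */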
static void __attribute__((noreturn)) vmm_sched_entry(void)
{
        struct vmm_thread *vth;
        bool have_enough;

        have_enough = try_to_get_vcores();
        if (!have_enough && vcore_tick_poll()) {
                /* slightly less than ideal: we grab the queue lock twice */
                yield_current_uth();
        }
        if (current_uthread)
                run_current_uthread();
        if (have_enough)
                vth = pick_a_thread_plenty();
        else
                vth = pick_a_thread_degraded();
        if (!vth)
                vcore_yield_or_restart();
        run_uthread((struct uthread*)vth);
}

static void vmm_thread_runnable(struct uthread *uth)
{
        /* A thread that was blocked is now runnable.  This counts as becoming
         * unblocked (running + runnable) */
        acct_thread_unblocked((struct vmm_thread*)uth);
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_paused(struct uthread *uth)
{
        /* The thread stopped for some reason, usually a preemption.  We'd like to
         * just run it whenever we get a chance.  Note that it didn't become
         * 'blocked' - it's still runnable. */
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
{
        struct syscall *sysc = (struct syscall*)syscall;

        acct_thread_blocked((struct vmm_thread*)uth);
        sysc->u_data = uth;
        if (!register_evq(sysc, sysc_evq)) {
                /* Lost the race with the call being done.  The kernel won't send the
                 * event.  Just restart him. */
                restart_thread(sysc);
        }
        /* GIANT WARNING: do not touch the thread after this point. */
}

static void vmm_thread_has_blocked(struct uthread *uth, uth_sync_t sync_obj,
                                   int flags)
{
        /* The thread blocked on something like a mutex.  It's not runnable, so we
         * don't need to put it on a list, but we do need to account for it not
         * running.  We'll find out (via thread_runnable) when it starts up again.
         */
        acct_thread_blocked((struct vmm_thread*)uth);
        if (sync_obj)
                __uth_default_sync_enqueue(uth, sync_obj);
}

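/* Last resort for reflected faults we can't handle: dump the thread's context
 * and bail out of the process. */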
static void refl_error(struct uthread *uth, unsigned int trap_nr,
                       unsigned int err, unsigned long aux)
{
        printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
               trap_nr, err, aux);
        /* Note that uthread.c already copied out our ctx into the uth
         * struct */
        print_user_context(&uth->u_ctx);
        printf("Turn on printx to spew unhandled, malignant trap info\n");
        exit(-1);
}

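/* Page faults on VMR-backed regions get serviced by asking the kernel to
 * populate the VA asynchronously, with the uthread blocked on that syscall.
 * Returns FALSE if the fault isn't one we can handle here. */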
static bool handle_page_fault(struct uthread *uth, unsigned int err,
                              unsigned long aux)
{
        if (!(err & PF_VMR_BACKED))
                return FALSE;
        syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
        __block_uthread_on_async_sysc(uth);
        return TRUE;
}

static void vmm_thread_refl_hw_fault(struct uthread *uth,
                                     unsigned int trap_nr,
                                     unsigned int err, unsigned long aux)
{
        switch (trap_nr) {
        case HW_TRAP_PAGE_FAULT:
                if (!handle_page_fault(uth, err, aux))
                        refl_error(uth, trap_nr, err, aux);
                break;
        default:
                refl_error(uth, trap_nr, err, aux);
        }
}

/* Yield callback for __ctlr_entry */
static void __swap_to_gth(struct uthread *uth, void *dummy)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)uth;

        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
        run_uthread((struct uthread*)cth->buddy);
        assert(0);
}

/* All ctlr threads start here, each time their guest has a fault.  They can
 * block and unblock along the way.  Once a ctlr does its final uthread_yield,
 * the next time it will start again from the top. */
static void __ctlr_entry(void)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
        struct virtual_machine *vm = gth_to_vm(cth->buddy);

        if (!handle_vmexit(cth->buddy)) {
                struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);

                fprintf(stderr, "vmm: handle_vmexit returned false\n");
                fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
                fprintf(stderr, "RSP was %p, ", (void *)vm_tf->tf_rsp);
                fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
                /* TODO: properly walk the kernel page tables to map the tf_rip
                 * to a physical address. For now, however, this hack is good
                 * enough.
                 */
                hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
                showstatus(stderr, cth->buddy);
                exit(0);
        }
        /* We want to atomically yield and start/reenqueue our buddy.  We do so in
         * vcore context on the other side of the yield. */
        uthread_yield(FALSE, __swap_to_gth, 0);
}

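/* A guest thread's VM exit was reflected to us.  Point its buddy ctlr at
 * __ctlr_entry on a fresh stack and run it in place of the guest; the ctlr
 * will restart the guest (or exit) once it has handled the exit. */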
static void vmm_thread_refl_vm_fault(struct uthread *uth)
{
        struct guest_thread *gth = (struct guest_thread*)uth;
        struct ctlr_thread *cth = gth->buddy;

        /* The ctlr starts from the top every time we get a new fault. */
        cth->uthread.flags |= UTHREAD_SAVED;
        init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
                      (uintptr_t)(cth->stacktop));
        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
        run_uthread((struct uthread*)cth);
        assert(0);
}

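/* 2LS op for reflected faults: hardware faults come from ctlr/task threads
 * (guests only ever VM exit), VM contexts come from guests. */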
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx)
{
        switch (ctx->type) {
        case ROS_HW_CTX:
                /* Guests should only ever VM exit */
                assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
                vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
                                         __arch_refl_get_err(ctx),
                                         __arch_refl_get_aux(ctx));
                break;
        case ROS_VM_CTX:
                vmm_thread_refl_vm_fault(uth);
                break;
        default:
                assert(0);
        }
}

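/* 2LS op for a thread that exited for good (currently just task threads):
 * account for it, clean up the uthread, and free its stack and struct. */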
static void vmm_thread_exited(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;
        struct task_thread *tth = (struct task_thread*)uth;

        /* Catch bugs.  Right now, only task threads can exit. */
        assert(vth->type == VMM_THREAD_TASK);

        acct_thread_blocked((struct vmm_thread*)tth);
        uthread_cleanup(uth);
        __free_stack(tth->stacktop, tth->stacksize);
        free(tth);
}

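/* Undoes create_guest_thread(): tears down the guest and its buddy ctlr,
 * including the ctlr's stack. */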
static void destroy_guest_thread(struct guest_thread *gth)
{
        struct ctlr_thread *cth = gth->buddy;

        __free_stack(cth->stacktop, cth->stacksize);
        uthread_cleanup((struct uthread*)cth);
        free(cth);
        uthread_cleanup((struct uthread*)gth);
        free(gth);
}

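/* Builds the guest thread / ctlr thread pair for guest pcore gpcoreid.  The
 * guest gets a VM context; the ctlr gets a stack and handles the guest's VM
 * exits.  Returns the guest thread, or 0 on error. */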
static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
                                                unsigned int gpcoreid)
{
        struct guest_thread *gth;
        struct ctlr_thread *cth;
        /* Guests won't use TLS; they always operate in Ring V.  The controller
         * might - not because of anything we do, but because of glibc calls. */
        struct uth_thread_attr gth_attr = {.want_tls = FALSE};
        struct uth_thread_attr cth_attr = {.want_tls = TRUE};

        gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
        cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
        if (!gth || !cth) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->buddy = cth;
        cth->buddy = gth;
        gth->gpc_id = gpcoreid;
        cth->stacksize = VMM_THR_STACKSIZE;
        cth->stacktop = __alloc_stack(cth->stacksize);
        if (!cth->stacktop) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->uthread.u_ctx.type = ROS_VM_CTX;
        gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
        /* No need to init the ctlr.  It gets re-init'd each time it starts. */
        uthread_init((struct uthread*)gth, &gth_attr);
        uthread_init((struct uthread*)cth, &cth_attr);
        /* TODO: give it a correct FP state.  Our current one is probably fine */
        restore_fp_state(&gth->uthread.as);
        gth->uthread.flags |= UTHREAD_FPSAVED;
        gth->halt_mtx = uth_mutex_alloc();
        gth->halt_cv = uth_cond_var_alloc();
        return gth;
}

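/* Sets up a VM: asks the kernel for nr_gpcs guest pcores, builds a guest/ctlr
 * thread pair for each one, and switches us into MCP mode.  Only one VM per
 * process for now.  Returns 0 on success, -1 on error. */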
int vmm_init(struct virtual_machine *vm, int flags)
{
        struct guest_thread **gths;

        if (current_vm)
                return -1;
        current_vm = vm;
        if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
                return -1;
        gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
        if (!gths)
                return -1;
        for (int i = 0; i < vm->nr_gpcs; i++) {
                gths[i] = create_guest_thread(vm, i);
                if (!gths[i]) {
                        for (int j = 0; j < i; j++)
                                destroy_guest_thread(gths[j]);
                        free(gths);
                        return -1;
                }
        }
        vm->gths = gths;
        uthread_mcp_init();
        return 0;
}

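/* Kicks off a guest thread: it counts as unblocked and goes on the runnable
 * queue, so the 2LS will run it once it has a vcore. */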
void start_guest_thread(struct guest_thread *gth)
{
        acct_thread_unblocked((struct vmm_thread*)gth);
        enqueue_vmm_thread((struct vmm_thread*)gth);
}

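/* Entry point for task threads: run the function, then exit through the 2LS. */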
static void __task_thread_run(void)
{
        struct task_thread *tth = (struct task_thread*)current_uthread;

        tth->func(tth->arg);
        uth_2ls_thread_exit(0);
}

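/* Spawns a detached task thread that runs func(arg) on its own stack.  Returns
 * the thread, or 0 on error. */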
struct task_thread *vmm_run_task(struct virtual_machine *vm,
                                 void (*func)(void *), void *arg)
{
        struct task_thread *tth;
        struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = TRUE};

        tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
        if (!tth)
                return 0;
        tth->stacksize = VMM_THR_STACKSIZE;
        tth->stacktop = __alloc_stack(tth->stacksize);
        if (!tth->stacktop) {
                free(tth);
                return 0;
        }
        tth->func = func;
        tth->arg = arg;
        init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
                      (uintptr_t)(tth->stacktop));
        uthread_init((struct uthread*)tth, &tth_attr);
        acct_thread_unblocked((struct vmm_thread*)tth);
        enqueue_vmm_thread((struct vmm_thread*)tth);
        return tth;
}
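
/* Usage sketch (hypothetical caller, not part of this file): a VMM would fill
 * in a struct virtual_machine, call vmm_init(), then start each guest core and
 * spin off any helper tasks.  console_io() below is a made-up task function,
 * and the gpcis setup is elided.
 *
 *      struct virtual_machine vm = {.nr_gpcs = 1, .gpcis = gpcis};
 *
 *      if (vmm_init(&vm, 0))
 *              return -1;
 *      for (int i = 0; i < vm.nr_gpcs; i++)
 *              start_guest_thread(vm.gths[i]);
 *      vmm_run_task(&vm, console_io, &vm);
 */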

/* Helpers for tracking nr_unblk_* threads. */
static void acct_thread_blocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_dec(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_dec(&nr_unblk_tasks);
                break;
        }
}

static void acct_thread_unblocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_inc(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_inc(&nr_unblk_tasks);
                break;
        }
}

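/* Puts a thread on the runnable queue for its type and pokes the vcore
 * request, in case we now want more cores. */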
static void enqueue_vmm_thread(struct vmm_thread *vth)
{
        spin_pdr_lock(&queue_lock);
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
                break;
        case VMM_THREAD_TASK:
                TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
                break;
        }
        spin_pdr_unlock(&queue_lock);
        try_to_get_vcores();
}

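/* Common allocation for all three thread types: a zeroed, aligned vmm_thread
 * tagged with its type and VM.  Returns 0 on error. */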
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
{
        struct vmm_thread *vth;
        int ret;

        ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
                             sizeof(struct vmm_thread));
        if (ret)
                return 0;
        memset(vth, 0, sizeof(struct vmm_thread));
        vth->type = type;
        vth->vm = vm;
        return vth;
}

static void __free_stack(void *stacktop, size_t stacksize)
{
        munmap(stacktop - stacksize, stacksize);
}

static void *__alloc_stack(size_t stacksize)
{
        int force_a_page_fault;
        void *stacktop;
        void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
                              MAP_ANONYMOUS, -1, 0);

        if (stackbot == MAP_FAILED)
                return 0;
        stacktop = stackbot + stacksize;
        /* Want the top of the stack populated, but not the rest of the stack;
         * that'll grow on demand (up to stacksize, then will clobber memory). */
        force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
        return stacktop;
}