Kick the VMM 2LS when enqueueing a thread
[akaros.git] / user / vmm / sched.c
1 /* Copyright (c) 2016 Google Inc.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * See LICENSE for details.
4  *
5  * 2LS for virtual machines */
6
7 #include <vmm/sched.h>
8 #include <vmm/vmm.h>
9 #include <sys/mman.h>
10 #include <stdlib.h>
11 #include <assert.h>
12 #include <parlib/spinlock.h>
13 #include <parlib/event.h>
14 #include <parlib/ucq.h>
15 #include <parlib/arch/trap.h>
16 #include <parlib/ros_debug.h>
17 #include <benchutil/vcore_tick.h>
18
19 int vmm_sched_period_usec = 1000;
20
21 /* For now, we only have one VM managed by the 2LS.  If we ever expand that,
22  * we'll need something analogous to current_uthread, so the 2LS knows which VM
23  * it is working on. */
24 static struct virtual_machine *current_vm;
25
26 static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
27 /* Runnable queues, broken up by thread type. */
28 static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
29 static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
30 /* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
31 static atomic_t nr_unblk_tasks;
32 static atomic_t nr_unblk_guests;
33 /* Global evq for all syscalls.  Could make this per vcore or whatever. */
34 static struct event_queue *sysc_evq;
35
36 static void vmm_sched_entry(void);
37 static void vmm_thread_runnable(struct uthread *uth);
38 static void vmm_thread_paused(struct uthread *uth);
39 static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
40 static void vmm_thread_has_blocked(struct uthread *uth, int flags);
41 static void vmm_thread_refl_fault(struct uthread *uth,
42                                   struct user_context *ctx);
43
44 struct schedule_ops vmm_sched_ops = {
45         .sched_entry = vmm_sched_entry,
46         .thread_runnable = vmm_thread_runnable,
47         .thread_paused = vmm_thread_paused,
48         .thread_blockon_sysc = vmm_thread_blockon_sysc,
49         .thread_has_blocked = vmm_thread_has_blocked,
50         .thread_refl_fault = vmm_thread_refl_fault,
51 };
52
/* Forward declarations of file-local helpers. */

/* EV_SYSCALL event handler */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data);
/* nr_unblk_* accounting */
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
/* runnable queue insertion */
static void enqueue_vmm_thread(struct vmm_thread *vth);
/* thread and stack allocation */
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type);
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);
63
64
65 static void restart_thread(struct syscall *sysc)
66 {
67         struct uthread *ut_restartee = (struct uthread*)sysc->u_data;
68
69         /* uthread stuff here: */
70         assert(ut_restartee);
71         assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
72         ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
73         vmm_thread_runnable(ut_restartee);
74 }
75
76 static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
77                                void *data)
78 {
79         struct syscall *sysc;
80
81         /* I think we can make this assert now.  If not, check pthread.c. (concern
82          * was having old ev_qs firing and running this handler). */
83         assert(ev_msg);
84         sysc = ev_msg->ev_arg3;
85         assert(sysc);
86         restart_thread(sysc);
87 }
88
89 /* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
90  * attempt to route the notifs/IPIs to vcoreid */
91 static struct event_queue *setup_sysc_evq(int vcoreid)
92 {
93         struct event_queue *evq;
94         uintptr_t mmap_block;
95
96         mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
97                                      PROT_WRITE | PROT_READ,
98                                      MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
99         evq = get_eventq_raw();
100         assert(mmap_block && evq);
101         evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
102         evq->ev_vcore = vcoreid;
103         evq->ev_mbox->type = EV_MBOX_UCQ;
104         ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
105         return evq;
106 }
107
108 static void __attribute__((constructor)) vmm_lib_init(void)
109 {
110         struct task_thread *thread0;
111
112         init_once_racy(return);
113         uthread_lib_init();
114
115         /* Note that thread0 doesn't belong to a VM.  We can set this during
116          * vmm_init() if we need to. */
117         thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
118         assert(thread0);
119         acct_thread_unblocked((struct vmm_thread*)thread0);
120         thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
121         thread0->stacktop = (void*)USTACKTOP;
122         /* for lack of a better vcore, might as well send to 0 */
123         sysc_evq = setup_sysc_evq(0);
124         register_ev_handler(EV_SYSCALL, vmm_handle_syscall, 0);
125         uthread_2ls_init((struct uthread*)thread0, &vmm_sched_ops);
126 }
127
128 /* The scheduling policy is encapsulated in the next few functions (from here
129  * down to sched_entry()). */
130
131 static int desired_nr_vcores(void)
132 {
133         /* Sanity checks on our accounting. */
134         assert(atomic_read(&nr_unblk_guests) >= 0);
135         assert(atomic_read(&nr_unblk_tasks) >= 0);
136         /* Lockless peak.  This is always an estimate.  Some of our tasks busy-wait,
137          * so it's not enough to just give us one vcore for all tasks, yet. */
138         return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
139 }
140
141 static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
142 {
143         struct vmm_thread *vth;
144
145         vth = TAILQ_FIRST(tq);
146         if (vth)
147                 TAILQ_REMOVE(tq, vth, tq_next);
148         return vth;
149 }
150
151 static struct vmm_thread *pick_a_thread_degraded(void)
152 {
153         struct vmm_thread *vth = 0;
154         static int next_class = VMM_THREAD_GUEST;
155
156         /* We don't have a lot of cores (maybe 0), so we'll alternate which type of
157          * thread we look at first.  Basically, we're RR within a class of threads,
158          * and we'll toggle between those two classes. */
159         spin_pdr_lock(&queue_lock);
160         if (next_class == VMM_THREAD_GUEST) {
161                 if (!vth)
162                         vth = __pop_first(&rnbl_guests);
163                 if (!vth)
164                         vth = __pop_first(&rnbl_tasks);
165                 next_class = VMM_THREAD_TASK;
166         } else {
167                 if (!vth)
168                         vth = __pop_first(&rnbl_tasks);
169                 if (!vth)
170                         vth = __pop_first(&rnbl_guests);
171                 next_class = VMM_THREAD_GUEST;
172         };
173         spin_pdr_unlock(&queue_lock);
174         return vth;
175 }
176
177 /* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
178 static struct vmm_thread *pick_a_thread_plenty(void)
179 {
180         struct vmm_thread *vth = 0;
181
182         spin_pdr_lock(&queue_lock);
183         if (!vth)
184                 vth = __pop_first(&rnbl_tasks);
185         if (!vth)
186                 vth = __pop_first(&rnbl_guests);
187         spin_pdr_unlock(&queue_lock);
188         return vth;
189 }
190
191 static void yield_current_uth(void)
192 {
193         struct vmm_thread *vth;
194
195         if (!current_uthread)
196                 return;
197         vth = (struct vmm_thread*)stop_current_uthread();
198         enqueue_vmm_thread(vth);
199 }
200
201 /* Helper, tries to get the right number of vcores.  Returns TRUE if we think we
202  * have enough, FALSE otherwise.
203  *
204  * TODO: this doesn't handle a lot of issues, like preemption, how to
205  * run/yield our vcores, dynamic changes in the number of runnables, where
206  * to send events, how to avoid interfering with gpcs, etc. */
207 static bool try_to_get_vcores(void)
208 {
209         int nr_vcores_wanted = desired_nr_vcores();
210         bool have_enough = nr_vcores_wanted <= num_vcores();
211
212         if (have_enough) {
213                 vcore_tick_disable();
214                 return TRUE;
215         }
216         vcore_tick_enable(vmm_sched_period_usec);
217         vcore_request_total(nr_vcores_wanted);
218         return FALSE;
219 }
220
221 static void __attribute__((noreturn)) vmm_sched_entry(void)
222 {
223         struct vmm_thread *vth;
224         bool have_enough;
225
226         have_enough = try_to_get_vcores();
227         if (!have_enough && vcore_tick_poll()) {
228                 /* slightly less than ideal: we grab the queue lock twice */
229                 yield_current_uth();
230         }
231         if (current_uthread)
232                 run_current_uthread();
233         if (have_enough)
234                 vth = pick_a_thread_plenty();
235         else
236                 vth = pick_a_thread_degraded();
237         if (!vth)
238                 vcore_yield_or_restart();
239         run_uthread((struct uthread*)vth);
240 }
241
/* 2LS op: a previously blocked thread can run again.  That makes it
 * 'unblocked' (running + runnable), so account for it and queue it. */
static void vmm_thread_runnable(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        acct_thread_unblocked(vth);
        enqueue_vmm_thread(vth);
}
249
/* 2LS op: the thread stopped for some reason, usually a preemption.  It never
 * became 'blocked' - it is still runnable - so there is no accounting to do;
 * just queue it so it runs whenever we get a chance. */
static void vmm_thread_paused(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        enqueue_vmm_thread(vth);
}
257
258 static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
259 {
260         struct syscall *sysc = (struct syscall*)syscall;
261
262         acct_thread_blocked((struct vmm_thread*)uth);
263         sysc->u_data = uth;
264         if (!register_evq(sysc, sysc_evq)) {
265                 /* Lost the race with the call being done.  The kernel won't send the
266                  * event.  Just restart him. */
267                 restart_thread(sysc);
268         }
269         /* GIANT WARNING: do not touch the thread after this point. */
270 }
271
/* 2LS op: the thread blocked on something like a mutex.  Since it is not
 * runnable it goes on no list, but it still has to be accounted as not
 * running.  thread_runnable tells us when it starts up again.  flags is
 * unused. */
static void vmm_thread_has_blocked(struct uthread *uth, int flags)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        acct_thread_blocked(vth);
}
280
281 static void refl_error(struct uthread *uth, unsigned int trap_nr,
282                        unsigned int err, unsigned long aux)
283 {
284         printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
285                trap_nr, err, aux);
286         /* Note that uthread.c already copied out our ctx into the uth
287          * struct */
288         print_user_context(&uth->u_ctx);
289         printf("Turn on printx to spew unhandled, malignant trap info\n");
290         exit(-1);
291 }
292
293 static bool handle_page_fault(struct uthread *uth, unsigned int err,
294                               unsigned long aux)
295 {
296         if (!(err & PF_VMR_BACKED))
297                 return FALSE;
298         syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
299         __block_uthread_on_async_sysc(uth);
300         return TRUE;
301 }
302
303 static void vmm_thread_refl_hw_fault(struct uthread *uth,
304                                      unsigned int trap_nr,
305                                      unsigned int err, unsigned long aux)
306 {
307         switch (trap_nr) {
308         case HW_TRAP_PAGE_FAULT:
309                 if (!handle_page_fault(uth, err, aux))
310                         refl_error(uth, trap_nr, err, aux);
311                 break;
312         default:
313                 refl_error(uth, trap_nr, err, aux);
314         }
315 }
316
317 /* Yield callback for __ctlr_entry */
318 static void __swap_to_gth(struct uthread *uth, void *dummy)
319 {
320         struct ctlr_thread *cth = (struct ctlr_thread*)uth;
321
322         /* We don't re-account for block/unblock.  The ctlr and the guest are
323          * accounted together ("pass the token" back and forth). */
324         enqueue_vmm_thread((struct vmm_thread*)cth->buddy);
325 }
326
327 /* All ctrl threads start here, each time their guest has a fault.  They can
328  * block and unblock along the way.  Once a ctlr does its final uthread_yield,
329  * the next time it will start again from the top. */
330 static void __ctlr_entry(void)
331 {
332         struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
333         struct virtual_machine *vm = gth_to_vm(cth->buddy);
334
335         if (!handle_vmexit(cth->buddy)) {
336                 struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);
337
338                 fprintf(stderr, "vmm: handle_vmexit returned false\n");
339                 fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
340                 fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
341                 /* TODO: properly walk the kernel page tables to map the tf_rip
342                  * to a physical address. For now, however, this hack is good
343                  * enough.
344                  */
345                 hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
346                 showstatus(stderr, cth->buddy);
347                 exit(0);
348         }
349         /* We want to atomically yield and start/reenqueue our buddy.  We do so in
350          * vcore context on the other side of the yield. */
351         uthread_yield(FALSE, __swap_to_gth, 0);
352 }
353
354 static void vmm_thread_refl_vm_fault(struct uthread *uth)
355 {
356         struct guest_thread *gth = (struct guest_thread*)uth;
357         struct ctlr_thread *cth = gth->buddy;
358
359         /* The ctlr starts frm the top every time we get a new fault. */
360         cth->uthread.flags |= UTHREAD_SAVED;
361         init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
362                       (uintptr_t)(cth->stacktop));
363         /* We don't re-account for block/unblock.  The ctlr and the guest are
364          * accounted together ("pass the token" back and forth). */
365         enqueue_vmm_thread((struct vmm_thread*)cth);
366 }
367
368 static void vmm_thread_refl_fault(struct uthread *uth,
369                                   struct user_context *ctx)
370 {
371         switch (ctx->type) {
372         case ROS_HW_CTX:
373                 /* Guests should only ever VM exit */
374                 assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
375                 vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
376                                          __arch_refl_get_err(ctx),
377                                          __arch_refl_get_aux(ctx));
378                 break;
379         case ROS_VM_CTX:
380                 vmm_thread_refl_vm_fault(uth);
381                 break;
382         default:
383                 assert(0);
384         }
385 }
386
387 static void destroy_guest_thread(struct guest_thread *gth)
388 {
389         struct ctlr_thread *cth = gth->buddy;
390
391         __free_stack(cth->stacktop, cth->stacksize);
392         uthread_cleanup((struct uthread*)cth);
393         free(cth);
394         uthread_cleanup((struct uthread*)gth);
395         free(gth);
396 }
397
398 static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
399                                                 unsigned int gpcoreid)
400 {
401         struct guest_thread *gth;
402         struct ctlr_thread *cth;
403         /* Guests won't use TLS; they always operate in Ring V.  The controller
404          * might - not because of anything we do, but because of glibc calls. */
405         struct uth_thread_attr gth_attr = {.want_tls = FALSE};
406         struct uth_thread_attr cth_attr = {.want_tls = TRUE};
407
408         gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
409         cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
410         if (!gth || !cth) {
411                 free(gth);
412                 free(cth);
413                 return 0;
414         }
415         gth->buddy = cth;
416         cth->buddy = gth;
417         gth->gpc_id = gpcoreid;
418         cth->stacksize = VMM_THR_STACKSIZE;
419         cth->stacktop = __alloc_stack(cth->stacksize);
420         if (!cth->stacktop) {
421                 free(gth);
422                 free(cth);
423                 return 0;
424         }
425         gth->uthread.u_ctx.type = ROS_VM_CTX;
426         gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
427         /* No need to init the ctlr.  It gets re-init'd each time it starts. */
428         uthread_init((struct uthread*)gth, &gth_attr);
429         uthread_init((struct uthread*)cth, &cth_attr);
430         /* TODO: give it a correct FP state.  Our current one is probably fine */
431         restore_fp_state(&gth->uthread.as);
432         gth->uthread.flags |= UTHREAD_FPSAVED;
433         gth->halt_mtx = uth_mutex_alloc();
434         gth->halt_cv = uth_cond_var_alloc();
435         return gth;
436 }
437
438 int vmm_init(struct virtual_machine *vm, int flags)
439 {
440         struct guest_thread **gths;
441
442         if (current_vm)
443                 return -1;
444         current_vm = vm;
445         if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
446                 return -1;
447         gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
448         if (!gths)
449                 return -1;
450         for (int i = 0; i < vm->nr_gpcs; i++) {
451                 gths[i] = create_guest_thread(vm, i);
452                 if (!gths[i]) {
453                         for (int j = 0; j < i; j++)
454                                 destroy_guest_thread(gths[j]);
455                         free(gths);
456                         return -1;
457                 }
458         }
459         vm->gths = gths;
460         uthread_mcp_init();
461         return 0;
462 }
463
/* Starts a guest thread: accounts for it as unblocked and puts it on its
 * runnable queue. */
void start_guest_thread(struct guest_thread *gth)
{
        struct vmm_thread *vth = (struct vmm_thread*)gth;

        acct_thread_unblocked(vth);
        enqueue_vmm_thread(vth);
}
469
470 static void __tth_exit_cb(struct uthread *uthread, void *junk)
471 {
472         struct task_thread *tth = (struct task_thread*)uthread;
473
474         acct_thread_blocked((struct vmm_thread*)tth);
475         uthread_cleanup(uthread);
476         __free_stack(tth->stacktop, tth->stacksize);
477         free(tth);
478 }
479
480 static void __task_thread_run(void)
481 {
482         struct task_thread *tth = (struct task_thread*)current_uthread;
483
484         tth->func(tth->arg);
485         uthread_yield(FALSE, __tth_exit_cb, 0);
486 }
487
488 struct task_thread *vmm_run_task(struct virtual_machine *vm,
489                                  void (*func)(void *), void *arg)
490 {
491         struct task_thread *tth;
492         struct uth_thread_attr tth_attr = {.want_tls = TRUE};
493
494         tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
495         if (!tth)
496                 return 0;
497         tth->stacksize = VMM_THR_STACKSIZE;
498         tth->stacktop = __alloc_stack(tth->stacksize);
499         if (!tth->stacktop) {
500                 free(tth);
501                 return 0;
502         }
503         tth->func = func;
504         tth->arg = arg;
505         init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
506                       (uintptr_t)(tth->stacktop));
507         uthread_init((struct uthread*)tth, &tth_attr);
508         acct_thread_unblocked((struct vmm_thread*)tth);
509         enqueue_vmm_thread((struct vmm_thread*)tth);
510         return tth;
511 }
512
513 /* Helpers for tracking nr_unblk_* threads. */
514 static void acct_thread_blocked(struct vmm_thread *vth)
515 {
516         switch (vth->type) {
517         case VMM_THREAD_GUEST:
518         case VMM_THREAD_CTLR:
519                 atomic_dec(&nr_unblk_guests);
520                 break;
521         case VMM_THREAD_TASK:
522                 atomic_dec(&nr_unblk_tasks);
523                 break;
524         }
525 }
526
527 static void acct_thread_unblocked(struct vmm_thread *vth)
528 {
529         switch (vth->type) {
530         case VMM_THREAD_GUEST:
531         case VMM_THREAD_CTLR:
532                 atomic_inc(&nr_unblk_guests);
533                 break;
534         case VMM_THREAD_TASK:
535                 atomic_inc(&nr_unblk_tasks);
536                 break;
537         }
538 }
539
540 static void enqueue_vmm_thread(struct vmm_thread *vth)
541 {
542         spin_pdr_lock(&queue_lock);
543         switch (vth->type) {
544         case VMM_THREAD_GUEST:
545         case VMM_THREAD_CTLR:
546                 TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
547                 break;
548         case VMM_THREAD_TASK:
549                 TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
550                 break;
551         }
552         spin_pdr_unlock(&queue_lock);
553         try_to_get_vcores();
554 }
555
556 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
557 {
558         struct vmm_thread *vth;
559         int ret;
560
561         ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
562                              sizeof(struct vmm_thread));
563         if (ret)
564                 return 0;
565         memset(vth, 0, sizeof(struct vmm_thread));
566         vth->type = type;
567         vth->vm = vm;
568         return vth;
569 }
570
/* Unmaps a stack obtained from __alloc_stack().  stacktop is the address one
 * past the highest byte of the stack, so the mapping starts stacksize bytes
 * below it. */
static void __free_stack(void *stacktop, size_t stacksize)
{
        char *stackbot = (char*)stacktop - stacksize;

        munmap(stackbot, stacksize);
}
575
576 static void *__alloc_stack(size_t stacksize)
577 {
578         int force_a_page_fault;
579         void *stacktop;
580         void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
581                               MAP_ANONYMOUS, -1, 0);
582
583         if (stackbot == MAP_FAILED)
584                 return 0;
585         stacktop = stackbot + stacksize;
586         /* Want the top of the stack populated, but not the rest of the stack;
587          * that'll grow on demand (up to stacksize, then will clobber memory). */
588         force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
589         return stacktop;
590 }