/* Copyright (c) 2016 Google Inc.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * 2LS for virtual machines */

#include <vmm/sched.h>
#include <vmm/vmm.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <assert.h>
#include <parlib/spinlock.h>
#include <parlib/event.h>
#include <parlib/ucq.h>
#include <parlib/arch/trap.h>
#include <parlib/ros_debug.h>
#include <benchutil/vcore_tick.h>

int vmm_sched_period_usec = 1000;

/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
 * we'll need something analogous to current_uthread, so the 2LS knows which VM
 * it is working on. */
static struct virtual_machine *current_vm;

static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
/* Runnable queues, broken up by thread type. */
static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
/* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
static atomic_t nr_unblk_tasks;
static atomic_t nr_unblk_guests;
/* Global evq for all syscalls.  Could make this per vcore or whatever. */
static struct event_queue *sysc_evq;

static void vmm_sched_entry(void);
static void vmm_thread_runnable(struct uthread *uth);
static void vmm_thread_paused(struct uthread *uth);
static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
static void vmm_thread_has_blocked(struct uthread *uth, int flags);
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx);

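/* The 2LS ops table; vmm_lib_init() below installs it with
 * uthread_2ls_init(). */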
struct schedule_ops vmm_sched_ops = {
        .sched_entry = vmm_sched_entry,
        .thread_runnable = vmm_thread_runnable,
        .thread_paused = vmm_thread_paused,
        .thread_blockon_sysc = vmm_thread_blockon_sysc,
        .thread_has_blocked = vmm_thread_has_blocked,
        .thread_refl_fault = vmm_thread_refl_fault,
};

/* Helpers */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data);
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
static void enqueue_vmm_thread(struct vmm_thread *vth);
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type);
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);


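/* Restarts a uthread whose syscall completed.  The sysc's u_data was set to
 * the blocked uthread in vmm_thread_blockon_sysc(). */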
static void restart_thread(struct syscall *sysc)
{
        struct uthread *ut_restartee = (struct uthread*)sysc->u_data;

        /* uthread stuff here: */
        assert(ut_restartee);
        assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
        ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
        vmm_thread_runnable(ut_restartee);
}

static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data)
{
        struct syscall *sysc;

        /* We should be able to assert this now.  If not, check pthread.c (the
         * concern was old ev_qs firing and running this handler). */
        assert(ev_msg);
        sysc = ev_msg->ev_arg3;
        assert(sysc);
        restart_thread(sysc);
}

/* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
 * attempt to route the notifs/IPIs to vcoreid */
static struct event_queue *setup_sysc_evq(int vcoreid)
{
        struct event_queue *evq;
        uintptr_t mmap_block;

        mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
                                     PROT_WRITE | PROT_READ,
                                     MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
        evq = get_eventq_raw();
        assert(mmap_block && evq);
        evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
        evq->ev_vcore = vcoreid;
        evq->ev_mbox->type = EV_MBOX_UCQ;
        ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
        return evq;
}

static void __attribute__((constructor)) vmm_lib_init(void)
{
        struct task_thread *thread0;

        init_once_racy(return);
        uthread_lib_init();

        /* Note that thread0 doesn't belong to a VM.  We can set this during
         * vmm_init() if we need to. */
        thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
        assert(thread0);
        acct_thread_unblocked((struct vmm_thread*)thread0);
        thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
        thread0->stacktop = (void*)USTACKTOP;
        /* for lack of a better vcore, might as well send to 0 */
        sysc_evq = setup_sysc_evq(0);
        register_ev_handler(EV_SYSCALL, vmm_handle_syscall, 0);
        uthread_2ls_init((struct uthread*)thread0, &vmm_sched_ops);
}

/* The scheduling policy is encapsulated in the next few functions (from here
 * down to sched_entry()). */
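/* In short: with at least one vcore per unblocked thread, tasks run ahead of
 * guests (pick_a_thread_plenty).  Otherwise we round-robin between the two
 * classes (pick_a_thread_degraded) and use the vcore tick to periodically
 * yield whichever uthread is running. */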

static int desired_nr_vcores(void)
{
        /* Sanity checks on our accounting. */
        assert(atomic_read(&nr_unblk_guests) >= 0);
        assert(atomic_read(&nr_unblk_tasks) >= 0);
        /* Lockless peek.  This is always an estimate.  Some of our tasks busy-wait,
         * so it's not enough to just give us one vcore for all tasks, yet. */
        return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
}

static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
{
        struct vmm_thread *vth;

        vth = TAILQ_FIRST(tq);
        if (vth)
                TAILQ_REMOVE(tq, vth, tq_next);
        return vth;
}

static struct vmm_thread *pick_a_thread_degraded(void)
{
        struct vmm_thread *vth = 0;
        static int next_class = VMM_THREAD_GUEST;

        /* We don't have a lot of cores (maybe 0), so we'll alternate which type of
         * thread we look at first.  Basically, we're RR within a class of threads,
         * and we'll toggle between those two classes. */
        spin_pdr_lock(&queue_lock);
        if (next_class == VMM_THREAD_GUEST) {
                if (!vth)
                        vth = __pop_first(&rnbl_guests);
                if (!vth)
                        vth = __pop_first(&rnbl_tasks);
                next_class = VMM_THREAD_TASK;
        } else {
                if (!vth)
                        vth = __pop_first(&rnbl_tasks);
                if (!vth)
                        vth = __pop_first(&rnbl_guests);
                next_class = VMM_THREAD_GUEST;
        }
        spin_pdr_unlock(&queue_lock);
        return vth;
}

/* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
static struct vmm_thread *pick_a_thread_plenty(void)
{
        struct vmm_thread *vth = 0;

        spin_pdr_lock(&queue_lock);
        if (!vth)
                vth = __pop_first(&rnbl_tasks);
        if (!vth)
                vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
}

static void yield_current_uth(void)
{
        struct vmm_thread *vth;

        if (!current_uthread)
                return;
        vth = (struct vmm_thread*)stop_current_uthread();
        enqueue_vmm_thread(vth);
}

/* Helper, tries to get the right number of vcores.  Returns TRUE if we think we
 * have enough, FALSE otherwise.
 *
 * TODO: this doesn't handle a lot of issues, like preemption, how to
 * run/yield our vcores, dynamic changes in the number of runnables, where
 * to send events, how to avoid interfering with gpcs, etc. */
static bool try_to_get_vcores(void)
{
        int nr_vcores_wanted = desired_nr_vcores();
        bool have_enough = nr_vcores_wanted <= num_vcores();

        if (have_enough) {
                vcore_tick_disable();
                return TRUE;
        }
        vcore_tick_enable(vmm_sched_period_usec);
        vcore_request_total(nr_vcores_wanted);
        return FALSE;
}

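/* Vcore entry point.  Each pass we try to get enough vcores, time-slice the
 * current uthread if we're degraded and the tick expired, then run the
 * interrupted uthread (if any) or a freshly dequeued one.  With nothing to
 * run, we yield the vcore. */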
static void __attribute__((noreturn)) vmm_sched_entry(void)
{
        struct vmm_thread *vth;
        bool have_enough;

        have_enough = try_to_get_vcores();
        if (!have_enough && vcore_tick_poll()) {
                /* slightly less than ideal: we grab the queue lock twice */
                yield_current_uth();
        }
        if (current_uthread)
                run_current_uthread();
        if (have_enough)
                vth = pick_a_thread_plenty();
        else
                vth = pick_a_thread_degraded();
        if (!vth)
                vcore_yield_or_restart();
        run_uthread((struct uthread*)vth);
}

static void vmm_thread_runnable(struct uthread *uth)
{
        /* A thread that was blocked is now runnable.  This counts as becoming
         * unblocked (running + runnable) */
        acct_thread_unblocked((struct vmm_thread*)uth);
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_paused(struct uthread *uth)
{
        /* The thread stopped for some reason, usually a preemption.  We'd like to
         * just run it whenever we get a chance.  Note that it didn't become
         * 'blocked' - it's still runnable. */
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
{
        struct syscall *sysc = (struct syscall*)syscall;

        acct_thread_blocked((struct vmm_thread*)uth);
        sysc->u_data = uth;
        if (!register_evq(sysc, sysc_evq)) {
                /* Lost the race with the call being done.  The kernel won't send the
                 * event.  Just restart him. */
                restart_thread(sysc);
        }
        /* GIANT WARNING: do not touch the thread after this point. */
}

static void vmm_thread_has_blocked(struct uthread *uth, int flags)
{
        /* The thread blocked on something like a mutex.  It's not runnable, so we
         * don't need to put it on a list, but we do need to account for it not
         * running.  We'll find out (via thread_runnable) when it starts up again.
         */
        acct_thread_blocked((struct vmm_thread*)uth);
}

static void refl_error(struct uthread *uth, unsigned int trap_nr,
                       unsigned int err, unsigned long aux)
{
        printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
               trap_nr, err, aux);
        /* Note that uthread.c already copied out our ctx into the uth
         * struct */
        print_user_context(&uth->u_ctx);
        printf("Turn on printx to spew unhandled, malignant trap info\n");
        exit(-1);
}

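/* VMR-backed page faults are handled by asking the kernel to populate the VA
 * asynchronously; the faulting uthread is blocked on that syscall until it
 * completes.  Anything else gets reported as an error by our caller. */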
static bool handle_page_fault(struct uthread *uth, unsigned int err,
                              unsigned long aux)
{
        if (!(err & PF_VMR_BACKED))
                return FALSE;
        syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
        __block_uthread_on_async_sysc(uth);
        return TRUE;
}

static void vmm_thread_refl_hw_fault(struct uthread *uth,
                                     unsigned int trap_nr,
                                     unsigned int err, unsigned long aux)
{
        switch (trap_nr) {
        case HW_TRAP_PAGE_FAULT:
                if (!handle_page_fault(uth, err, aux))
                        refl_error(uth, trap_nr, err, aux);
                break;
        default:
                refl_error(uth, trap_nr, err, aux);
        }
}

/* Yield callback for __ctlr_entry */
static void __swap_to_gth(struct uthread *uth, void *dummy)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)uth;

        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
        run_uthread((struct uthread*)cth->buddy);
        assert(0);
}

/* All ctlr threads start here, each time their guest has a fault.  They can
 * block and unblock along the way.  Once a ctlr does its final uthread_yield,
 * the next time it will start again from the top. */
static void __ctlr_entry(void)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
        struct virtual_machine *vm = gth_to_vm(cth->buddy);

        if (!handle_vmexit(cth->buddy)) {
                struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);

                fprintf(stderr, "vmm: handle_vmexit returned false\n");
                fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
                fprintf(stderr, "RSP was %p, ", (void *)vm_tf->tf_rsp);
                fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
                /* TODO: properly walk the kernel page tables to map the tf_rip
                 * to a physical address. For now, however, this hack is good
                 * enough.
                 */
                hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
                showstatus(stderr, cth->buddy);
                exit(0);
        }
        /* We want to atomically yield and start/reenqueue our buddy.  We do so in
         * vcore context on the other side of the yield. */
        uthread_yield(FALSE, __swap_to_gth, 0);
}

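/* A guest's VM exit is reflected here.  We don't handle it in vcore context;
 * we hand it to the guest's buddy controller thread, which restarts from
 * __ctlr_entry() on its own stack. */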
static void vmm_thread_refl_vm_fault(struct uthread *uth)
{
        struct guest_thread *gth = (struct guest_thread*)uth;
        struct ctlr_thread *cth = gth->buddy;

        /* The ctlr starts from the top every time we get a new fault. */
        cth->uthread.flags |= UTHREAD_SAVED;
        init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
                      (uintptr_t)(cth->stacktop));
        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
        run_uthread((struct uthread*)cth);
        assert(0);
}

static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx)
{
        switch (ctx->type) {
        case ROS_HW_CTX:
                /* Guests should only ever VM exit */
                assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
                vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
                                         __arch_refl_get_err(ctx),
                                         __arch_refl_get_aux(ctx));
                break;
        case ROS_VM_CTX:
                vmm_thread_refl_vm_fault(uth);
                break;
        default:
                assert(0);
        }
}

static void destroy_guest_thread(struct guest_thread *gth)
{
        struct ctlr_thread *cth = gth->buddy;

        __free_stack(cth->stacktop, cth->stacksize);
        uthread_cleanup((struct uthread*)cth);
        free(cth);
        uthread_cleanup((struct uthread*)gth);
        free(gth);
}

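/* Creates a guest thread and its buddy controller as a pair.  The guest runs
 * in a ROS_VM_CTX tied to gpcoreid; the ctlr gets its own stack and handles
 * the guest's VM exits. */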
static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
                                                unsigned int gpcoreid)
{
        struct guest_thread *gth;
        struct ctlr_thread *cth;
        /* Guests won't use TLS; they always operate in Ring V.  The controller
         * might - not because of anything we do, but because of glibc calls. */
        struct uth_thread_attr gth_attr = {.want_tls = FALSE};
        struct uth_thread_attr cth_attr = {.want_tls = TRUE};

        gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
        cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
        if (!gth || !cth) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->buddy = cth;
        cth->buddy = gth;
        gth->gpc_id = gpcoreid;
        cth->stacksize = VMM_THR_STACKSIZE;
        cth->stacktop = __alloc_stack(cth->stacksize);
        if (!cth->stacktop) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->uthread.u_ctx.type = ROS_VM_CTX;
        gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
        /* No need to init the ctlr.  It gets re-init'd each time it starts. */
        uthread_init((struct uthread*)gth, &gth_attr);
        uthread_init((struct uthread*)cth, &cth_attr);
        /* TODO: give it a correct FP state.  Our current one is probably fine */
        restore_fp_state(&gth->uthread.as);
        gth->uthread.flags |= UTHREAD_FPSAVED;
        gth->halt_mtx = uth_mutex_alloc();
        gth->halt_cv = uth_cond_var_alloc();
        return gth;
}

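/* Rough usage sketch (hypothetical caller; assumes the guest's memory, kernel
 * image, and vm->nr_gpcs / vm->gpcis were already set up):
 *
 *      static struct virtual_machine vm;
 *
 *      if (vmm_init(&vm, vmm_flags))
 *              errx(1, "vmm_init failed");
 *      for (int i = 0; i < vm.nr_gpcs; i++)
 *              start_guest_thread(vm.gths[i]);
 *
 * vmm_init() creates one guest/ctlr pair per guest pcore; start_guest_thread()
 * (below) makes a guest runnable and kicks the vcore request machinery. */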
int vmm_init(struct virtual_machine *vm, int flags)
{
        struct guest_thread **gths;

        if (current_vm)
                return -1;
        current_vm = vm;
        if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
                return -1;
        gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
        if (!gths)
                return -1;
        for (int i = 0; i < vm->nr_gpcs; i++) {
                gths[i] = create_guest_thread(vm, i);
                if (!gths[i]) {
                        for (int j = 0; j < i; j++)
                                destroy_guest_thread(gths[j]);
                        free(gths);
                        return -1;
                }
        }
        vm->gths = gths;
        uthread_mcp_init();
        return 0;
}

void start_guest_thread(struct guest_thread *gth)
{
        acct_thread_unblocked((struct vmm_thread*)gth);
        enqueue_vmm_thread((struct vmm_thread*)gth);
}

static void __tth_exit_cb(struct uthread *uthread, void *junk)
{
        struct task_thread *tth = (struct task_thread*)uthread;

        acct_thread_blocked((struct vmm_thread*)tth);
        uthread_cleanup(uthread);
        __free_stack(tth->stacktop, tth->stacksize);
        free(tth);
}

static void __task_thread_run(void)
{
        struct task_thread *tth = (struct task_thread*)current_uthread;

        tth->func(tth->arg);
        uthread_yield(FALSE, __tth_exit_cb, 0);
}

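/* Runs func(arg) on its own task thread.  A minimal sketch (hypothetical
 * names; nothing below is from this file):
 *
 *      static void my_poll_loop(void *arg)
 *      {
 *              struct virtual_machine *vm = arg;
 *
 *              while (1)
 *                      poll_my_device(vm);
 *      }
 *
 *      vmm_run_task(vm, my_poll_loop, vm);
 *
 * Task threads count toward nr_unblk_tasks, so a busy-waiting task like this
 * holds on to a vcore (see the comment in desired_nr_vcores()). */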
struct task_thread *vmm_run_task(struct virtual_machine *vm,
                                 void (*func)(void *), void *arg)
{
        struct task_thread *tth;
        struct uth_thread_attr tth_attr = {.want_tls = TRUE};

        tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
        if (!tth)
                return 0;
        tth->stacksize = VMM_THR_STACKSIZE;
        tth->stacktop = __alloc_stack(tth->stacksize);
        if (!tth->stacktop) {
                free(tth);
                return 0;
        }
        tth->func = func;
        tth->arg = arg;
        init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
                      (uintptr_t)(tth->stacktop));
        uthread_init((struct uthread*)tth, &tth_attr);
        acct_thread_unblocked((struct vmm_thread*)tth);
        enqueue_vmm_thread((struct vmm_thread*)tth);
        return tth;
}

/* Helpers for tracking nr_unblk_* threads. */
static void acct_thread_blocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_dec(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_dec(&nr_unblk_tasks);
                break;
        }
}

static void acct_thread_unblocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_inc(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_inc(&nr_unblk_tasks);
                break;
        }
}

static void enqueue_vmm_thread(struct vmm_thread *vth)
{
        spin_pdr_lock(&queue_lock);
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
                break;
        case VMM_THREAD_TASK:
                TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
                break;
        }
        spin_pdr_unlock(&queue_lock);
        try_to_get_vcores();
}

static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
{
        struct vmm_thread *vth;
        int ret;

        ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
                             sizeof(struct vmm_thread));
        if (ret)
                return 0;
        memset(vth, 0, sizeof(struct vmm_thread));
        vth->type = type;
        vth->vm = vm;
        return vth;
}

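/* Stacks are handed out by their top (high address); mmap/munmap work on the
 * bottom, hence the 'stacktop - stacksize' below. */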
static void __free_stack(void *stacktop, size_t stacksize)
{
        munmap(stacktop - stacksize, stacksize);
}

static void *__alloc_stack(size_t stacksize)
{
        int force_a_page_fault;
        void *stacktop;
        void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
                              MAP_ANONYMOUS, -1, 0);

        if (stackbot == MAP_FAILED)
                return 0;
        stacktop = stackbot + stacksize;
        /* Want the top of the stack populated, but not the rest of the stack;
         * that'll grow on demand (up to stacksize, then will clobber memory). */
        force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
        return stacktop;
}