user/vmm: print the RSP as well as RIP
[akaros.git] / user / vmm / sched.c
1 /* Copyright (c) 2016 Google Inc.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * See LICENSE for details.
4  *
5  * 2LS for virtual machines */
6
7 #include <vmm/sched.h>
8 #include <vmm/vmm.h>
9 #include <sys/mman.h>
10 #include <stdlib.h>
11 #include <assert.h>
12 #include <parlib/spinlock.h>
13 #include <parlib/event.h>
14 #include <parlib/ucq.h>
15 #include <parlib/arch/trap.h>
16 #include <parlib/ros_debug.h>
17 #include <benchutil/vcore_tick.h>
18
/* Period, in microseconds, handed to vcore_tick_enable() whenever the scheduler
 * finds itself with fewer vcores than it wants (see try_to_get_vcores()). */
int vmm_sched_period_usec = 1000;

/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
 * we'll need something analogous to current_uthread, so the 2LS knows which VM
 * it is working on.  Set by vmm_init(). */
static struct virtual_machine *current_vm;

/* Protects both runnable queues below. */
static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
/* Runnable queues, broken up by thread type.  Guest and controller threads
 * share rnbl_guests; task threads go on rnbl_tasks (see enqueue_vmm_thread()).
 */
static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
/* Counts of *unblocked* threads.  Unblocked = Running + Runnable.  Their sum is
 * the number of vcores the scheduler asks for. */
static atomic_t nr_unblk_tasks;
static atomic_t nr_unblk_guests;
/* Global evq for all syscalls.  Could make this per vcore or whatever.
 * Created in vmm_lib_init(). */
static struct event_queue *sysc_evq;
35
/* 2LS callbacks.  They are defined further down and installed through
 * vmm_sched_ops. */
static void vmm_sched_entry(void);
static void vmm_thread_runnable(struct uthread *uth);
static void vmm_thread_paused(struct uthread *uth);
static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
static void vmm_thread_has_blocked(struct uthread *uth, int flags);
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx);
43
/* Scheduler operations table for the VMM 2LS.  vmm_lib_init() passes it to
 * uthread_2ls_init(). */
struct schedule_ops vmm_sched_ops = {
        .sched_entry = vmm_sched_entry,
        .thread_runnable = vmm_thread_runnable,
        .thread_paused = vmm_thread_paused,
        .thread_blockon_sysc = vmm_thread_blockon_sysc,
        .thread_has_blocked = vmm_thread_has_blocked,
        .thread_refl_fault = vmm_thread_refl_fault,
};
52
/* Helpers: forward declarations of file-local functions that are used before
 * their definitions. */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data);
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
static void enqueue_vmm_thread(struct vmm_thread *vth);
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type);
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);
63
64
65 static void restart_thread(struct syscall *sysc)
66 {
67         struct uthread *ut_restartee = (struct uthread*)sysc->u_data;
68
69         /* uthread stuff here: */
70         assert(ut_restartee);
71         assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
72         ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
73         vmm_thread_runnable(ut_restartee);
74 }
75
76 static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
77                                void *data)
78 {
79         struct syscall *sysc;
80
81         /* I think we can make this assert now.  If not, check pthread.c. (concern
82          * was having old ev_qs firing and running this handler). */
83         assert(ev_msg);
84         sysc = ev_msg->ev_arg3;
85         assert(sysc);
86         restart_thread(sysc);
87 }
88
89 /* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
90  * attempt to route the notifs/IPIs to vcoreid */
91 static struct event_queue *setup_sysc_evq(int vcoreid)
92 {
93         struct event_queue *evq;
94         uintptr_t mmap_block;
95
96         mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
97                                      PROT_WRITE | PROT_READ,
98                                      MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
99         evq = get_eventq_raw();
100         assert(mmap_block && evq);
101         evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
102         evq->ev_vcore = vcoreid;
103         evq->ev_mbox->type = EV_MBOX_UCQ;
104         ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
105         return evq;
106 }
107
108 static void __attribute__((constructor)) vmm_lib_init(void)
109 {
110         struct task_thread *thread0;
111
112         init_once_racy(return);
113         uthread_lib_init();
114
115         /* Note that thread0 doesn't belong to a VM.  We can set this during
116          * vmm_init() if we need to. */
117         thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
118         assert(thread0);
119         acct_thread_unblocked((struct vmm_thread*)thread0);
120         thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
121         thread0->stacktop = (void*)USTACKTOP;
122         /* for lack of a better vcore, might as well send to 0 */
123         sysc_evq = setup_sysc_evq(0);
124         register_ev_handler(EV_SYSCALL, vmm_handle_syscall, 0);
125         uthread_2ls_init((struct uthread*)thread0, &vmm_sched_ops);
126 }
127
128 /* The scheduling policy is encapsulated in the next few functions (from here
129  * down to sched_entry()). */
130
131 static int desired_nr_vcores(void)
132 {
133         /* Sanity checks on our accounting. */
134         assert(atomic_read(&nr_unblk_guests) >= 0);
135         assert(atomic_read(&nr_unblk_tasks) >= 0);
136         /* Lockless peak.  This is always an estimate.  Some of our tasks busy-wait,
137          * so it's not enough to just give us one vcore for all tasks, yet. */
138         return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
139 }
140
141 static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
142 {
143         struct vmm_thread *vth;
144
145         vth = TAILQ_FIRST(tq);
146         if (vth)
147                 TAILQ_REMOVE(tq, vth, tq_next);
148         return vth;
149 }
150
151 static struct vmm_thread *pick_a_thread_degraded(void)
152 {
153         struct vmm_thread *vth = 0;
154         static int next_class = VMM_THREAD_GUEST;
155
156         /* We don't have a lot of cores (maybe 0), so we'll alternate which type of
157          * thread we look at first.  Basically, we're RR within a class of threads,
158          * and we'll toggle between those two classes. */
159         spin_pdr_lock(&queue_lock);
160         if (next_class == VMM_THREAD_GUEST) {
161                 if (!vth)
162                         vth = __pop_first(&rnbl_guests);
163                 if (!vth)
164                         vth = __pop_first(&rnbl_tasks);
165                 next_class = VMM_THREAD_TASK;
166         } else {
167                 if (!vth)
168                         vth = __pop_first(&rnbl_tasks);
169                 if (!vth)
170                         vth = __pop_first(&rnbl_guests);
171                 next_class = VMM_THREAD_GUEST;
172         };
173         spin_pdr_unlock(&queue_lock);
174         return vth;
175 }
176
177 /* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
178 static struct vmm_thread *pick_a_thread_plenty(void)
179 {
180         struct vmm_thread *vth = 0;
181
182         spin_pdr_lock(&queue_lock);
183         if (!vth)
184                 vth = __pop_first(&rnbl_tasks);
185         if (!vth)
186                 vth = __pop_first(&rnbl_guests);
187         spin_pdr_unlock(&queue_lock);
188         return vth;
189 }
190
191 static void yield_current_uth(void)
192 {
193         struct vmm_thread *vth;
194
195         if (!current_uthread)
196                 return;
197         vth = (struct vmm_thread*)stop_current_uthread();
198         enqueue_vmm_thread(vth);
199 }
200
201 /* Helper, tries to get the right number of vcores.  Returns TRUE if we think we
202  * have enough, FALSE otherwise.
203  *
204  * TODO: this doesn't handle a lot of issues, like preemption, how to
205  * run/yield our vcores, dynamic changes in the number of runnables, where
206  * to send events, how to avoid interfering with gpcs, etc. */
207 static bool try_to_get_vcores(void)
208 {
209         int nr_vcores_wanted = desired_nr_vcores();
210         bool have_enough = nr_vcores_wanted <= num_vcores();
211
212         if (have_enough) {
213                 vcore_tick_disable();
214                 return TRUE;
215         }
216         vcore_tick_enable(vmm_sched_period_usec);
217         vcore_request_total(nr_vcores_wanted);
218         return FALSE;
219 }
220
221 static void __attribute__((noreturn)) vmm_sched_entry(void)
222 {
223         struct vmm_thread *vth;
224         bool have_enough;
225
226         have_enough = try_to_get_vcores();
227         if (!have_enough && vcore_tick_poll()) {
228                 /* slightly less than ideal: we grab the queue lock twice */
229                 yield_current_uth();
230         }
231         if (current_uthread)
232                 run_current_uthread();
233         if (have_enough)
234                 vth = pick_a_thread_plenty();
235         else
236                 vth = pick_a_thread_degraded();
237         if (!vth)
238                 vcore_yield_or_restart();
239         run_uthread((struct uthread*)vth);
240 }
241
/* 2LS callback: a thread that was blocked is now runnable.  That counts as
 * becoming unblocked (running + runnable), so account for it and queue it. */
static void vmm_thread_runnable(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        acct_thread_unblocked(vth);
        enqueue_vmm_thread(vth);
}
249
/* 2LS callback: the thread stopped for some reason, usually a preemption.  It
 * never became 'blocked' - it is still runnable - so the counters are left
 * alone and it is simply queued to run whenever we get a chance. */
static void vmm_thread_paused(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        enqueue_vmm_thread(vth);
}
257
258 static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
259 {
260         struct syscall *sysc = (struct syscall*)syscall;
261
262         acct_thread_blocked((struct vmm_thread*)uth);
263         sysc->u_data = uth;
264         if (!register_evq(sysc, sysc_evq)) {
265                 /* Lost the race with the call being done.  The kernel won't send the
266                  * event.  Just restart him. */
267                 restart_thread(sysc);
268         }
269         /* GIANT WARNING: do not touch the thread after this point. */
270 }
271
/* 2LS callback: the thread blocked on something like a mutex.  It is not
 * runnable, so it goes on no list, but it must be counted as not running.
 * thread_runnable will tell us when it starts up again.  @flags is unused. */
static void vmm_thread_has_blocked(struct uthread *uth, int flags)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        acct_thread_blocked(vth);
}
280
281 static void refl_error(struct uthread *uth, unsigned int trap_nr,
282                        unsigned int err, unsigned long aux)
283 {
284         printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
285                trap_nr, err, aux);
286         /* Note that uthread.c already copied out our ctx into the uth
287          * struct */
288         print_user_context(&uth->u_ctx);
289         printf("Turn on printx to spew unhandled, malignant trap info\n");
290         exit(-1);
291 }
292
293 static bool handle_page_fault(struct uthread *uth, unsigned int err,
294                               unsigned long aux)
295 {
296         if (!(err & PF_VMR_BACKED))
297                 return FALSE;
298         syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
299         __block_uthread_on_async_sysc(uth);
300         return TRUE;
301 }
302
303 static void vmm_thread_refl_hw_fault(struct uthread *uth,
304                                      unsigned int trap_nr,
305                                      unsigned int err, unsigned long aux)
306 {
307         switch (trap_nr) {
308         case HW_TRAP_PAGE_FAULT:
309                 if (!handle_page_fault(uth, err, aux))
310                         refl_error(uth, trap_nr, err, aux);
311                 break;
312         default:
313                 refl_error(uth, trap_nr, err, aux);
314         }
315 }
316
317 /* Yield callback for __ctlr_entry */
318 static void __swap_to_gth(struct uthread *uth, void *dummy)
319 {
320         struct ctlr_thread *cth = (struct ctlr_thread*)uth;
321
322         /* We don't re-account for block/unblock.  The ctlr and the guest are
323          * accounted together ("pass the token" back and forth). */
324         enqueue_vmm_thread((struct vmm_thread*)cth->buddy);
325 }
326
327 /* All ctrl threads start here, each time their guest has a fault.  They can
328  * block and unblock along the way.  Once a ctlr does its final uthread_yield,
329  * the next time it will start again from the top. */
330 static void __ctlr_entry(void)
331 {
332         struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
333         struct virtual_machine *vm = gth_to_vm(cth->buddy);
334
335         if (!handle_vmexit(cth->buddy)) {
336                 struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);
337
338                 fprintf(stderr, "vmm: handle_vmexit returned false\n");
339                 fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
340                 fprintf(stderr, "RSP was %p, ", (void *)vm_tf->tf_rsp);
341                 fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
342                 /* TODO: properly walk the kernel page tables to map the tf_rip
343                  * to a physical address. For now, however, this hack is good
344                  * enough.
345                  */
346                 hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
347                 showstatus(stderr, cth->buddy);
348                 exit(0);
349         }
350         /* We want to atomically yield and start/reenqueue our buddy.  We do so in
351          * vcore context on the other side of the yield. */
352         uthread_yield(FALSE, __swap_to_gth, 0);
353 }
354
355 static void vmm_thread_refl_vm_fault(struct uthread *uth)
356 {
357         struct guest_thread *gth = (struct guest_thread*)uth;
358         struct ctlr_thread *cth = gth->buddy;
359
360         /* The ctlr starts frm the top every time we get a new fault. */
361         cth->uthread.flags |= UTHREAD_SAVED;
362         init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
363                       (uintptr_t)(cth->stacktop));
364         /* We don't re-account for block/unblock.  The ctlr and the guest are
365          * accounted together ("pass the token" back and forth). */
366         enqueue_vmm_thread((struct vmm_thread*)cth);
367 }
368
369 static void vmm_thread_refl_fault(struct uthread *uth,
370                                   struct user_context *ctx)
371 {
372         switch (ctx->type) {
373         case ROS_HW_CTX:
374                 /* Guests should only ever VM exit */
375                 assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
376                 vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
377                                          __arch_refl_get_err(ctx),
378                                          __arch_refl_get_aux(ctx));
379                 break;
380         case ROS_VM_CTX:
381                 vmm_thread_refl_vm_fault(uth);
382                 break;
383         default:
384                 assert(0);
385         }
386 }
387
388 static void destroy_guest_thread(struct guest_thread *gth)
389 {
390         struct ctlr_thread *cth = gth->buddy;
391
392         __free_stack(cth->stacktop, cth->stacksize);
393         uthread_cleanup((struct uthread*)cth);
394         free(cth);
395         uthread_cleanup((struct uthread*)gth);
396         free(gth);
397 }
398
399 static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
400                                                 unsigned int gpcoreid)
401 {
402         struct guest_thread *gth;
403         struct ctlr_thread *cth;
404         /* Guests won't use TLS; they always operate in Ring V.  The controller
405          * might - not because of anything we do, but because of glibc calls. */
406         struct uth_thread_attr gth_attr = {.want_tls = FALSE};
407         struct uth_thread_attr cth_attr = {.want_tls = TRUE};
408
409         gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
410         cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
411         if (!gth || !cth) {
412                 free(gth);
413                 free(cth);
414                 return 0;
415         }
416         gth->buddy = cth;
417         cth->buddy = gth;
418         gth->gpc_id = gpcoreid;
419         cth->stacksize = VMM_THR_STACKSIZE;
420         cth->stacktop = __alloc_stack(cth->stacksize);
421         if (!cth->stacktop) {
422                 free(gth);
423                 free(cth);
424                 return 0;
425         }
426         gth->uthread.u_ctx.type = ROS_VM_CTX;
427         gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
428         /* No need to init the ctlr.  It gets re-init'd each time it starts. */
429         uthread_init((struct uthread*)gth, &gth_attr);
430         uthread_init((struct uthread*)cth, &cth_attr);
431         /* TODO: give it a correct FP state.  Our current one is probably fine */
432         restore_fp_state(&gth->uthread.as);
433         gth->uthread.flags |= UTHREAD_FPSAVED;
434         gth->halt_mtx = uth_mutex_alloc();
435         gth->halt_cv = uth_cond_var_alloc();
436         return gth;
437 }
438
439 int vmm_init(struct virtual_machine *vm, int flags)
440 {
441         struct guest_thread **gths;
442
443         if (current_vm)
444                 return -1;
445         current_vm = vm;
446         if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
447                 return -1;
448         gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
449         if (!gths)
450                 return -1;
451         for (int i = 0; i < vm->nr_gpcs; i++) {
452                 gths[i] = create_guest_thread(vm, i);
453                 if (!gths[i]) {
454                         for (int j = 0; j < i; j++)
455                                 destroy_guest_thread(gths[j]);
456                         free(gths);
457                         return -1;
458                 }
459         }
460         vm->gths = gths;
461         uthread_mcp_init();
462         return 0;
463 }
464
/* Makes the guest thread @gth eligible to run: counts it as unblocked and puts
 * it on the runnable queue. */
void start_guest_thread(struct guest_thread *gth)
{
        struct vmm_thread *vth = (struct vmm_thread*)gth;

        acct_thread_unblocked(vth);
        enqueue_vmm_thread(vth);
}
470
471 static void __tth_exit_cb(struct uthread *uthread, void *junk)
472 {
473         struct task_thread *tth = (struct task_thread*)uthread;
474
475         acct_thread_blocked((struct vmm_thread*)tth);
476         uthread_cleanup(uthread);
477         __free_stack(tth->stacktop, tth->stacksize);
478         free(tth);
479 }
480
481 static void __task_thread_run(void)
482 {
483         struct task_thread *tth = (struct task_thread*)current_uthread;
484
485         tth->func(tth->arg);
486         uthread_yield(FALSE, __tth_exit_cb, 0);
487 }
488
489 struct task_thread *vmm_run_task(struct virtual_machine *vm,
490                                  void (*func)(void *), void *arg)
491 {
492         struct task_thread *tth;
493         struct uth_thread_attr tth_attr = {.want_tls = TRUE};
494
495         tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
496         if (!tth)
497                 return 0;
498         tth->stacksize = VMM_THR_STACKSIZE;
499         tth->stacktop = __alloc_stack(tth->stacksize);
500         if (!tth->stacktop) {
501                 free(tth);
502                 return 0;
503         }
504         tth->func = func;
505         tth->arg = arg;
506         init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
507                       (uintptr_t)(tth->stacktop));
508         uthread_init((struct uthread*)tth, &tth_attr);
509         acct_thread_unblocked((struct vmm_thread*)tth);
510         enqueue_vmm_thread((struct vmm_thread*)tth);
511         return tth;
512 }
513
514 /* Helpers for tracking nr_unblk_* threads. */
515 static void acct_thread_blocked(struct vmm_thread *vth)
516 {
517         switch (vth->type) {
518         case VMM_THREAD_GUEST:
519         case VMM_THREAD_CTLR:
520                 atomic_dec(&nr_unblk_guests);
521                 break;
522         case VMM_THREAD_TASK:
523                 atomic_dec(&nr_unblk_tasks);
524                 break;
525         }
526 }
527
528 static void acct_thread_unblocked(struct vmm_thread *vth)
529 {
530         switch (vth->type) {
531         case VMM_THREAD_GUEST:
532         case VMM_THREAD_CTLR:
533                 atomic_inc(&nr_unblk_guests);
534                 break;
535         case VMM_THREAD_TASK:
536                 atomic_inc(&nr_unblk_tasks);
537                 break;
538         }
539 }
540
541 static void enqueue_vmm_thread(struct vmm_thread *vth)
542 {
543         spin_pdr_lock(&queue_lock);
544         switch (vth->type) {
545         case VMM_THREAD_GUEST:
546         case VMM_THREAD_CTLR:
547                 TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
548                 break;
549         case VMM_THREAD_TASK:
550                 TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
551                 break;
552         }
553         spin_pdr_unlock(&queue_lock);
554         try_to_get_vcores();
555 }
556
557 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
558 {
559         struct vmm_thread *vth;
560         int ret;
561
562         ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
563                              sizeof(struct vmm_thread));
564         if (ret)
565                 return 0;
566         memset(vth, 0, sizeof(struct vmm_thread));
567         vth->type = type;
568         vth->vm = vm;
569         return vth;
570 }
571
/* Unmaps a stack obtained from __alloc_stack().  @stacktop is the address one
 * past the highest byte of the stack, so the mapping starts @stacksize bytes
 * below it. */
static void __free_stack(void *stacktop, size_t stacksize)
{
        char *stackbot = (char*)stacktop - stacksize;

        munmap(stackbot, stacksize);
}
576
577 static void *__alloc_stack(size_t stacksize)
578 {
579         int force_a_page_fault;
580         void *stacktop;
581         void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
582                               MAP_ANONYMOUS, -1, 0);
583
584         if (stackbot == MAP_FAILED)
585                 return 0;
586         stacktop = stackbot + stacksize;
587         /* Want the top of the stack populated, but not the rest of the stack;
588          * that'll grow on demand (up to stacksize, then will clobber memory). */
589         force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
590         return stacktop;
591 }