VMM: Sync halting GPCs and interrupt injection
[akaros.git] / user / vmm / sched.c
1 /* Copyright (c) 2016 Google Inc.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * See LICENSE for details.
4  *
5  * 2LS for virtual machines */
6
7 #include <vmm/sched.h>
8 #include <vmm/vmm.h>
9 #include <sys/mman.h>
10 #include <stdlib.h>
11 #include <assert.h>
12 #include <parlib/spinlock.h>
13 #include <parlib/event.h>
14 #include <parlib/ucq.h>
15 #include <parlib/arch/trap.h>
16 #include <parlib/ros_debug.h>
17 #include <benchutil/vcore_tick.h>
18
/* Tick period, in microseconds, handed to vcore_tick_enable() by
 * vmm_sched_entry() when we have fewer vcores than we want. */
int vmm_sched_period_usec = 1000;

/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
 * we'll need something analogous to current_uthread, so the 2LS knows which VM
 * it is working on.  Set by vmm_init(). */
static struct virtual_machine *current_vm;

/* Protects both runnable queues below. */
static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
/* Runnable queues, broken up by thread type.  Guest and ctlr threads share
 * rnbl_guests; task threads go on rnbl_tasks (see enqueue_vmm_thread()). */
static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
/* Counts of *unblocked* threads.  Unblocked = Running + Runnable.  Only
 * changed through acct_thread_blocked() / acct_thread_unblocked(). */
static atomic_t nr_unblk_tasks;
static atomic_t nr_unblk_guests;
/* Global evq for all syscalls.  Could make this per vcore or whatever. */
static struct event_queue *sysc_evq;
35
/* 2LS callbacks.  They are defined further down and published to the uthread
 * layer through vmm_sched_ops. */
static void vmm_sched_entry(void);
static void vmm_thread_runnable(struct uthread *uth);
static void vmm_thread_paused(struct uthread *uth);
static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
static void vmm_thread_has_blocked(struct uthread *uth, int flags);
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx);

/* Operations table for this scheduler.  vmm_lib_init() passes it to
 * uthread_2ls_init(). */
struct schedule_ops vmm_sched_ops = {
        .sched_entry = vmm_sched_entry,
        .thread_runnable = vmm_thread_runnable,
        .thread_paused = vmm_thread_paused,
        .thread_blockon_sysc = vmm_thread_blockon_sysc,
        .thread_has_blocked = vmm_thread_has_blocked,
        .thread_refl_fault = vmm_thread_refl_fault,
};
52
/* Helpers: forward declarations for functions defined later in this file. */
/* EV_SYSCALL event handler (registered in vmm_lib_init()). */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data);
/* Accounting for the nr_unblk_* counters. */
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
/* Puts a thread on the runnable queue matching its type. */
static void enqueue_vmm_thread(struct vmm_thread *vth);
/* Allocation helpers for thread structs and their stacks. */
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type);
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);
63
64
65 static void restart_thread(struct syscall *sysc)
66 {
67         struct uthread *ut_restartee = (struct uthread*)sysc->u_data;
68
69         /* uthread stuff here: */
70         assert(ut_restartee);
71         assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
72         ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
73         vmm_thread_runnable(ut_restartee);
74 }
75
76 static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
77                                void *data)
78 {
79         struct syscall *sysc;
80
81         /* I think we can make this assert now.  If not, check pthread.c. (concern
82          * was having old ev_qs firing and running this handler). */
83         assert(ev_msg);
84         sysc = ev_msg->ev_arg3;
85         assert(sysc);
86         restart_thread(sysc);
87 }
88
89 /* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
90  * attempt to route the notifs/IPIs to vcoreid */
91 static struct event_queue *setup_sysc_evq(int vcoreid)
92 {
93         struct event_queue *evq;
94         uintptr_t mmap_block;
95
96         mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
97                                      PROT_WRITE | PROT_READ,
98                                      MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
99         evq = get_eventq_raw();
100         assert(mmap_block && evq);
101         evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
102         evq->ev_vcore = vcoreid;
103         evq->ev_mbox->type = EV_MBOX_UCQ;
104         ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
105         return evq;
106 }
107
/* Library constructor: sets up the uthread library, wraps the initial thread
 * in a task_thread ("thread0"), creates the global syscall event queue, and
 * installs this file's scheduler ops as the 2LS. */
static void __attribute__((constructor)) vmm_lib_init(void)
{
        struct task_thread *thread0;

        init_once_racy(return);
        uthread_lib_init();

        /* Note that thread0 doesn't belong to a VM.  We can set this during
         * vmm_init() if we need to. */
        thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
        assert(thread0);
        /* thread0 is a task thread, so this bumps nr_unblk_tasks */
        acct_thread_unblocked((struct vmm_thread*)thread0);
        /* thread0 records the USTACK* constants for its stack rather than
         * getting one from __alloc_stack() */
        thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
        thread0->stacktop = (void*)USTACKTOP;
        /* for lack of a better vcore, might as well send to 0 */
        sysc_evq = setup_sysc_evq(0);
        register_ev_handler(EV_SYSCALL, vmm_handle_syscall, 0);
        uthread_2ls_init((struct uthread*)thread0, &vmm_sched_ops);
}
127
128 /* The scheduling policy is encapsulated in the next few functions (from here
129  * down to sched_entry()). */
130
131 static int desired_nr_vcores(void)
132 {
133         /* Sanity checks on our accounting. */
134         assert(atomic_read(&nr_unblk_guests) >= 0);
135         assert(atomic_read(&nr_unblk_tasks) >= 0);
136         /* Lockless peak.  This is always an estimate.  Some of our tasks busy-wait,
137          * so it's not enough to just give us one vcore for all tasks, yet. */
138         return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
139 }
140
141 static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
142 {
143         struct vmm_thread *vth;
144
145         vth = TAILQ_FIRST(tq);
146         if (vth)
147                 TAILQ_REMOVE(tq, vth, tq_next);
148         return vth;
149 }
150
151 static struct vmm_thread *pick_a_thread_degraded(void)
152 {
153         struct vmm_thread *vth = 0;
154         static int next_class = VMM_THREAD_GUEST;
155
156         /* We don't have a lot of cores (maybe 0), so we'll alternate which type of
157          * thread we look at first.  Basically, we're RR within a class of threads,
158          * and we'll toggle between those two classes. */
159         spin_pdr_lock(&queue_lock);
160         if (next_class == VMM_THREAD_GUEST) {
161                 if (!vth)
162                         vth = __pop_first(&rnbl_guests);
163                 if (!vth)
164                         vth = __pop_first(&rnbl_tasks);
165                 next_class = VMM_THREAD_TASK;
166         } else {
167                 if (!vth)
168                         vth = __pop_first(&rnbl_tasks);
169                 if (!vth)
170                         vth = __pop_first(&rnbl_guests);
171                 next_class = VMM_THREAD_GUEST;
172         };
173         spin_pdr_unlock(&queue_lock);
174         return vth;
175 }
176
177 /* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
178 static struct vmm_thread *pick_a_thread_plenty(void)
179 {
180         struct vmm_thread *vth = 0;
181
182         spin_pdr_lock(&queue_lock);
183         if (!vth)
184                 vth = __pop_first(&rnbl_tasks);
185         if (!vth)
186                 vth = __pop_first(&rnbl_guests);
187         spin_pdr_unlock(&queue_lock);
188         return vth;
189 }
190
191 static void yield_current_uth(void)
192 {
193         struct vmm_thread *vth;
194
195         if (!current_uthread)
196                 return;
197         vth = (struct vmm_thread*)stop_current_uthread();
198         enqueue_vmm_thread(vth);
199 }
200
/* 2LS sched_entry callback: decides what this vcore runs next.  Declared
 * noreturn: every path is expected to end in a call that does not come back
 * (run_current_uthread(), run_uthread() or vcore_yield_or_restart()). */
static void __attribute__((noreturn)) vmm_sched_entry(void)
{
        struct vmm_thread *vth;
        int nr_vcores_wanted = desired_nr_vcores();
        /* True when we already hold at least as many vcores as we want */
        bool have_enough = nr_vcores_wanted <= num_vcores();

        /* TODO: this doesn't handle a lot of issues, like preemption, how to
         * run/yield our vcores, dynamic changes in the number of runnables, where
         * to send events, how to avoid interfering with gpcs, etc. */
        if (have_enough) {
                vcore_tick_disable();
        } else {
                /* Short on vcores: turn on the tick, ask for more vcores, and
                 * requeue the current uthread whenever the poll fires. */
                vcore_tick_enable(vmm_sched_period_usec);
                vcore_request_total(nr_vcores_wanted);
                if (vcore_tick_poll()) {
                        /* slightly less than ideal: we grab the queue lock twice */
                        yield_current_uth();
                }
        }
        /* Still have a current uthread (it was not yielded above): keep it */
        if (current_uthread)
                run_current_uthread();
        if (have_enough)
                vth = pick_a_thread_plenty();
        else
                vth = pick_a_thread_degraded();
        /* Nothing runnable.  NOTE(review): presumably vcore_yield_or_restart()
         * never returns here, otherwise run_uthread() gets a null thread -
         * confirm. */
        if (!vth)
                vcore_yield_or_restart();
        run_uthread((struct uthread*)vth);
}
230
/* 2LS callback: a thread that was blocked is now runnable.  This counts as
 * becoming unblocked (running + runnable), so we account for it and then put
 * it on its runnable queue. */
static void vmm_thread_runnable(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        acct_thread_unblocked(vth);
        enqueue_vmm_thread(vth);
}
238
/* 2LS callback: the thread stopped for some reason, usually a preemption.
 * It never became 'blocked' - it is still runnable - so there is no accounting
 * to do; we just requeue it to run whenever we get a chance. */
static void vmm_thread_paused(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        enqueue_vmm_thread(vth);
}
246
247 static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
248 {
249         struct syscall *sysc = (struct syscall*)syscall;
250
251         acct_thread_blocked((struct vmm_thread*)uth);
252         sysc->u_data = uth;
253         if (!register_evq(sysc, sysc_evq)) {
254                 /* Lost the race with the call being done.  The kernel won't send the
255                  * event.  Just restart him. */
256                 restart_thread(sysc);
257         }
258         /* GIANT WARNING: do not touch the thread after this point. */
259 }
260
/* 2LS callback: the thread blocked on something like a mutex.  It is not
 * runnable, so it goes on no list, but the unblocked count must drop.  We hear
 * about it again (via thread_runnable) when it starts back up.  @flags is
 * unused. */
static void vmm_thread_has_blocked(struct uthread *uth, int flags)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        acct_thread_blocked(vth);
}
269
270 static void refl_error(struct uthread *uth, unsigned int trap_nr,
271                        unsigned int err, unsigned long aux)
272 {
273         printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
274                trap_nr, err, aux);
275         /* Note that uthread.c already copied out our ctx into the uth
276          * struct */
277         print_user_context(&uth->u_ctx);
278         printf("Turn on printx to spew unhandled, malignant trap info\n");
279         exit(-1);
280 }
281
282 static bool handle_page_fault(struct uthread *uth, unsigned int err,
283                               unsigned long aux)
284 {
285         if (!(err & PF_VMR_BACKED))
286                 return FALSE;
287         syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
288         __block_uthread_on_async_sysc(uth);
289         return TRUE;
290 }
291
292 static void vmm_thread_refl_hw_fault(struct uthread *uth,
293                                      unsigned int trap_nr,
294                                      unsigned int err, unsigned long aux)
295 {
296         switch (trap_nr) {
297         case HW_TRAP_PAGE_FAULT:
298                 if (!handle_page_fault(uth, err, aux))
299                         refl_error(uth, trap_nr, err, aux);
300                 break;
301         default:
302                 refl_error(uth, trap_nr, err, aux);
303         }
304 }
305
306 /* Yield callback for __ctlr_entry */
307 static void __swap_to_gth(struct uthread *uth, void *dummy)
308 {
309         struct ctlr_thread *cth = (struct ctlr_thread*)uth;
310
311         /* We don't re-account for block/unblock.  The ctlr and the guest are
312          * accounted together ("pass the token" back and forth). */
313         enqueue_vmm_thread((struct vmm_thread*)cth->buddy);
314 }
315
316 /* All ctrl threads start here, each time their guest has a fault.  They can
317  * block and unblock along the way.  Once a ctlr does its final uthread_yield,
318  * the next time it will start again from the top. */
319 static void __ctlr_entry(void)
320 {
321         struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
322         struct virtual_machine *vm = gth_to_vm(cth->buddy);
323
324         if (!handle_vmexit(cth->buddy)) {
325                 struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);
326
327                 fprintf(stderr, "vmm: handle_vmexit returned false\n");
328                 fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
329                 fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
330                 /* TODO: properly walk the kernel page tables to map the tf_rip
331                  * to a physical address. For now, however, this hack is good
332                  * enough.
333                  */
334                 hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
335                 showstatus(stderr, cth->buddy);
336                 exit(0);
337         }
338         /* We want to atomically yield and start/reenqueue our buddy.  We do so in
339          * vcore context on the other side of the yield. */
340         uthread_yield(FALSE, __swap_to_gth, 0);
341 }
342
343 static void vmm_thread_refl_vm_fault(struct uthread *uth)
344 {
345         struct guest_thread *gth = (struct guest_thread*)uth;
346         struct ctlr_thread *cth = gth->buddy;
347
348         /* The ctlr starts frm the top every time we get a new fault. */
349         cth->uthread.flags |= UTHREAD_SAVED;
350         init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
351                       (uintptr_t)(cth->stacktop));
352         /* We don't re-account for block/unblock.  The ctlr and the guest are
353          * accounted together ("pass the token" back and forth). */
354         enqueue_vmm_thread((struct vmm_thread*)cth);
355 }
356
357 static void vmm_thread_refl_fault(struct uthread *uth,
358                                   struct user_context *ctx)
359 {
360         switch (ctx->type) {
361         case ROS_HW_CTX:
362                 /* Guests should only ever VM exit */
363                 assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
364                 vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
365                                          __arch_refl_get_err(ctx),
366                                          __arch_refl_get_aux(ctx));
367                 break;
368         case ROS_VM_CTX:
369                 vmm_thread_refl_vm_fault(uth);
370                 break;
371         default:
372                 assert(0);
373         }
374 }
375
376 static void destroy_guest_thread(struct guest_thread *gth)
377 {
378         struct ctlr_thread *cth = gth->buddy;
379
380         __free_stack(cth->stacktop, cth->stacksize);
381         uthread_cleanup((struct uthread*)cth);
382         free(cth);
383         uthread_cleanup((struct uthread*)gth);
384         free(gth);
385 }
386
387 static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
388                                                 unsigned int gpcoreid)
389 {
390         struct guest_thread *gth;
391         struct ctlr_thread *cth;
392         /* Guests won't use TLS; they always operate in Ring V.  The controller
393          * might - not because of anything we do, but because of glibc calls. */
394         struct uth_thread_attr gth_attr = {.want_tls = FALSE};
395         struct uth_thread_attr cth_attr = {.want_tls = TRUE};
396
397         gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
398         cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
399         if (!gth || !cth) {
400                 free(gth);
401                 free(cth);
402                 return 0;
403         }
404         gth->buddy = cth;
405         cth->buddy = gth;
406         gth->gpc_id = gpcoreid;
407         cth->stacksize = VMM_THR_STACKSIZE;
408         cth->stacktop = __alloc_stack(cth->stacksize);
409         if (!cth->stacktop) {
410                 free(gth);
411                 free(cth);
412                 return 0;
413         }
414         gth->uthread.u_ctx.type = ROS_VM_CTX;
415         gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
416         /* No need to init the ctlr.  It gets re-init'd each time it starts. */
417         uthread_init((struct uthread*)gth, &gth_attr);
418         uthread_init((struct uthread*)cth, &cth_attr);
419         /* TODO: give it a correct FP state.  Our current one is probably fine */
420         restore_fp_state(&gth->uthread.as);
421         gth->uthread.flags |= UTHREAD_FPSAVED;
422         gth->halt_mtx = uth_mutex_alloc();
423         gth->halt_cv = uth_cond_var_alloc();
424         return gth;
425 }
426
427 int vmm_init(struct virtual_machine *vm, int flags)
428 {
429         struct guest_thread **gths;
430
431         if (current_vm)
432                 return -1;
433         current_vm = vm;
434         if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
435                 return -1;
436         gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
437         if (!gths)
438                 return -1;
439         for (int i = 0; i < vm->nr_gpcs; i++) {
440                 gths[i] = create_guest_thread(vm, i);
441                 if (!gths[i]) {
442                         for (int j = 0; j < i; j++)
443                                 destroy_guest_thread(gths[j]);
444                         free(gths);
445                         return -1;
446                 }
447         }
448         vm->gths = gths;
449         uthread_mcp_init();
450         return 0;
451 }
452
/* Starts @gth: counts it as unblocked and puts it on the guest runnable
 * queue, where the scheduler will pick it up. */
void start_guest_thread(struct guest_thread *gth)
{
        struct vmm_thread *vth = (struct vmm_thread*)gth;

        acct_thread_unblocked(vth);
        enqueue_vmm_thread(vth);
}
458
459 static void __tth_exit_cb(struct uthread *uthread, void *junk)
460 {
461         struct task_thread *tth = (struct task_thread*)uthread;
462
463         acct_thread_blocked((struct vmm_thread*)tth);
464         uthread_cleanup(uthread);
465         __free_stack(tth->stacktop, tth->stacksize);
466         free(tth);
467 }
468
469 static void __task_thread_run(void)
470 {
471         struct task_thread *tth = (struct task_thread*)current_uthread;
472
473         tth->func(tth->arg);
474         uthread_yield(FALSE, __tth_exit_cb, 0);
475 }
476
477 struct task_thread *vmm_run_task(struct virtual_machine *vm,
478                                  void (*func)(void *), void *arg)
479 {
480         struct task_thread *tth;
481         struct uth_thread_attr tth_attr = {.want_tls = TRUE};
482
483         tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
484         if (!tth)
485                 return 0;
486         tth->stacksize = VMM_THR_STACKSIZE;
487         tth->stacktop = __alloc_stack(tth->stacksize);
488         if (!tth->stacktop) {
489                 free(tth);
490                 return 0;
491         }
492         tth->func = func;
493         tth->arg = arg;
494         init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
495                       (uintptr_t)(tth->stacktop));
496         uthread_init((struct uthread*)tth, &tth_attr);
497         acct_thread_unblocked((struct vmm_thread*)tth);
498         enqueue_vmm_thread((struct vmm_thread*)tth);
499         return tth;
500 }
501
502 /* Helpers for tracking nr_unblk_* threads. */
503 static void acct_thread_blocked(struct vmm_thread *vth)
504 {
505         switch (vth->type) {
506         case VMM_THREAD_GUEST:
507         case VMM_THREAD_CTLR:
508                 atomic_dec(&nr_unblk_guests);
509                 break;
510         case VMM_THREAD_TASK:
511                 atomic_dec(&nr_unblk_tasks);
512                 break;
513         }
514 }
515
516 static void acct_thread_unblocked(struct vmm_thread *vth)
517 {
518         switch (vth->type) {
519         case VMM_THREAD_GUEST:
520         case VMM_THREAD_CTLR:
521                 atomic_inc(&nr_unblk_guests);
522                 break;
523         case VMM_THREAD_TASK:
524                 atomic_inc(&nr_unblk_tasks);
525                 break;
526         }
527 }
528
529 static void enqueue_vmm_thread(struct vmm_thread *vth)
530 {
531         spin_pdr_lock(&queue_lock);
532         switch (vth->type) {
533         case VMM_THREAD_GUEST:
534         case VMM_THREAD_CTLR:
535                 TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
536                 break;
537         case VMM_THREAD_TASK:
538                 TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
539                 break;
540         }
541         spin_pdr_unlock(&queue_lock);
542 }
543
544 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
545 {
546         struct vmm_thread *vth;
547         int ret;
548
549         ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
550                              sizeof(struct vmm_thread));
551         if (ret)
552                 return 0;
553         memset(vth, 0, sizeof(struct vmm_thread));
554         vth->type = type;
555         vth->vm = vm;
556         return vth;
557 }
558
/* Unmaps a stack obtained from __alloc_stack().  @stacktop is the address just
 * past the top of the stack, so the mapping starts @stacksize bytes below
 * it. */
static void __free_stack(void *stacktop, size_t stacksize)
{
        void *stackbot = stacktop - stacksize;

        munmap(stackbot, stacksize);
}
563
564 static void *__alloc_stack(size_t stacksize)
565 {
566         int force_a_page_fault;
567         void *stacktop;
568         void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
569                               MAP_ANONYMOUS, -1, 0);
570
571         if (stackbot == MAP_FAILED)
572                 return 0;
573         stacktop = stackbot + stacksize;
574         /* Want the top of the stack populated, but not the rest of the stack;
575          * that'll grow on demand (up to stacksize, then will clobber memory). */
576         force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
577         return stacktop;
578 }