1 /* Copyright (c) 2016 Google Inc.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * See LICENSE for details.
4  *
5  * 2LS for virtual machines */
6
7 #include <vmm/sched.h>
8 #include <vmm/vmm.h>
9 #include <vmm/vthread.h>
10 #include <sys/mman.h>
11 #include <stdlib.h>
12 #include <assert.h>
13 #include <parlib/spinlock.h>
14 #include <parlib/event.h>
15 #include <parlib/ucq.h>
16 #include <parlib/arch/trap.h>
17 #include <parlib/ros_debug.h>
18 #include <parlib/vcore_tick.h>
19 #include <parlib/slab.h>
20
21 int vmm_sched_period_usec = 1000;
22
23 /* For now, we only have one VM managed by the 2LS.  If we ever expand that,
24  * we'll need something analogous to current_uthread, so the 2LS knows which VM
25  * it is working on. */
26 static struct virtual_machine *current_vm;
27
28 static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
29 /* Runnable queues, broken up by thread type. */
30 static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
31 static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
32 static struct vmm_thread **greedy_rnbl_guests;
33 /* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
34 static atomic_t nr_unblk_tasks;
35 static atomic_t nr_unblk_guests;
36 /* Global evq for all syscalls.  Could make this per vcore or whatever. */
37 static struct event_queue *sysc_evq;
38 static struct kmem_cache *task_thread_cache;
39
40 static void vmm_sched_init(void);
41 static void vmm_sched_entry(void);
42 static void vmm_thread_runnable(struct uthread *uth);
43 static void vmm_thread_paused(struct uthread *uth);
44 static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
45 static void vmm_thread_has_blocked(struct uthread *uth, int flags);
46 static void vmm_thread_refl_fault(struct uthread *uth,
47                                   struct user_context *ctx);
48 static void vmm_thread_exited(struct uthread *uth);
49 static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg);
50 static void vmm_got_posix_signal(int sig_nr, struct siginfo *info);
51
52 struct schedule_ops vmm_sched_ops = {
53         .sched_init = vmm_sched_init,
54         .sched_entry = vmm_sched_entry,
55         .thread_runnable = vmm_thread_runnable,
56         .thread_paused = vmm_thread_paused,
57         .thread_blockon_sysc = vmm_thread_blockon_sysc,
58         .thread_has_blocked = vmm_thread_has_blocked,
59         .thread_refl_fault = vmm_thread_refl_fault,
60         .thread_exited = vmm_thread_exited,
61         .thread_create = vmm_thread_create,
62         .got_posix_signal = vmm_got_posix_signal,
63 };
64
65 struct schedule_ops *sched_ops = &vmm_sched_ops;
66
67 /* Helpers */
68 static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
69                                void *data);
70 static void acct_thread_blocked(struct vmm_thread *vth);
71 static void acct_thread_unblocked(struct vmm_thread *vth);
72 static void enqueue_vmm_thread(struct vmm_thread *vth);
73 static int task_thread_ctor(void *obj, void *priv, int flags);
74 static void task_thread_dtor(void *obj, void *priv);
75 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
76                                            int type);
77 static void *__alloc_stack(size_t stacksize);
78 static void __free_stack(void *stacktop, size_t stacksize);
79
80 static bool sched_is_greedy(void)
81 {
82         return parlib_never_yield;
83 }
84
85 static unsigned int sched_nr_greedy_cores(void)
86 {
87         if (!current_vm)
88                 return 1;
89         return current_vm->nr_gpcs + 1;
90 }
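
/* In greedy mode (parlib_never_yield), the 2LS holds a fixed set of vcores and
 * never gives them back: vcore 0 runs task threads, and vcore i (i >= 1) is
 * dedicated to guest pcore i - 1 (see sched_pick_thread_greedy() and
 * greedy_mark_guest_runnable() below).  So a VM with nr_gpcs == 4 wants
 * 4 + 1 == 5 vcores total. */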
91
92 static void restart_thread(struct syscall *sysc)
93 {
94         struct uthread *ut_restartee = (struct uthread*)sysc->u_data;
95
96         /* uthread stuff here: */
97         assert(ut_restartee);
98         assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
99         ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
100         vmm_thread_runnable(ut_restartee);
101 }
102
103 static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
104                                void *data)
105 {
106         struct syscall *sysc;
107
108         /* We can safely assert this now; the old concern (see pthread.c)
109          * was stale ev_qs firing and running this handler. */
110         assert(ev_msg);
111         sysc = ev_msg->ev_arg3;
112         assert(sysc);
113         restart_thread(sysc);
114 }
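
/* For reference, the blocking-syscall round trip: vmm_thread_blockon_sysc()
 * stashes the uthread in sysc->u_data and registers sysc_evq on the syscall;
 * when the kernel completes the call, it fires an event on sysc_evq, and the
 * handler above runs in vcore context and calls restart_thread() to make the
 * uthread runnable again. */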
115
116 /* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
117  * attempt to route the notifs/IPIs to vcoreid */
118 static struct event_queue *setup_sysc_evq(int vcoreid)
119 {
120         struct event_queue *evq;
121         uintptr_t mmap_block;
122
123         mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
124                                      PROT_WRITE | PROT_READ,
125                                      MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE,
126                                      -1, 0);
127         evq = get_eventq_raw();
128         assert(mmap_block != (uintptr_t)MAP_FAILED && evq);
129         evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR |
130                         EVENT_WAKEUP;
131         evq->ev_vcore = vcoreid;
132         evq->ev_mbox->type = EV_MBOX_UCQ;
133         ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
134         return evq;
135 }
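
/* The two pages mmapped above back the evq's UCQ mailbox (ucq_init_raw()
 * takes a pair of pages).  The event queue built this way is what
 * vmm_thread_blockon_sysc() registers on every blocking syscall. */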
136
137 static void vmm_sched_init(void)
138 {
139         struct task_thread *thread0;
140
141         /* Note that thread0 doesn't belong to a VM.  We can set this during
142          * vmm_init() if we need to. */
143         thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
144         assert(thread0);
145         acct_thread_unblocked((struct vmm_thread*)thread0);
146         thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
147         thread0->stacktop = (void*)USTACKTOP;
148         /* for lack of a better vcore, might as well send to 0 */
149         sysc_evq = setup_sysc_evq(0);
150         uthread_2ls_init((struct uthread*)thread0, vmm_handle_syscall, NULL);
151         task_thread_cache = kmem_cache_create("task threads",
152                                               sizeof(struct vmm_thread),
153                                               __alignof__(struct vmm_thread), 0,
154                                               task_thread_ctor,
155                                               task_thread_dtor, NULL);
156 }
157
158 /* The scheduling policy is encapsulated in the next few functions (from here
159  * down to sched_entry()). */
160
161 static int desired_nr_vcores(void)
162 {
163         /* Sanity checks on our accounting. */
164         assert(atomic_read(&nr_unblk_guests) >= 0);
165         assert(atomic_read(&nr_unblk_tasks) >= 0);
166         /* Lockless peek.  This is always an estimate.  Some of our tasks
167          * busy-wait, so it's not enough to just give us one vcore for all
168          * tasks, yet. */
169         return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
170 }
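
/* Example of the accounting above: with two unblocked guest cores (a guest
 * and its ctlr buddy share one count; they "pass the token" back and forth)
 * and one unblocked task thread, we'd ask for three vcores. */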
171
172 static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
173 {
174         struct vmm_thread *vth;
175
176         vth = TAILQ_FIRST(tq);
177         if (vth)
178                 TAILQ_REMOVE(tq, vth, tq_next);
179         return vth;
180 }
181
182 static struct vmm_thread *pick_a_thread_degraded(void)
183 {
184         struct vmm_thread *vth;
185
186         spin_pdr_lock(&queue_lock);
187         vth = __pop_first(&rnbl_tasks);
188         if (!vth)
189                 vth = __pop_first(&rnbl_guests);
190         spin_pdr_unlock(&queue_lock);
191         return vth;
192 }
193
194 /* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
195 static struct vmm_thread *pick_a_thread_plenty(void)
196 {
197         struct vmm_thread *vth = 0;
198
199         spin_pdr_lock(&queue_lock);
200         if (!vth)
201                 vth = __pop_first(&rnbl_tasks);
202         if (!vth)
203                 vth = __pop_first(&rnbl_guests);
204         spin_pdr_unlock(&queue_lock);
205         return vth;
206 }
207
208 static void yield_current_uth(void)
209 {
210         struct vmm_thread *vth;
211
212         if (!current_uthread)
213                 return;
214         vth = (struct vmm_thread*)stop_current_uthread();
215         enqueue_vmm_thread(vth);
216 }
217
218 /* Helper, tries to get the right number of vcores.  Returns TRUE if we think we
219  * have enough, FALSE otherwise.
220  *
221  * TODO: this doesn't handle a lot of issues, like preemption, how to
222  * run/yield our vcores, dynamic changes in the number of runnables, where
223  * to send events, how to avoid interfering with gpcs, etc. */
224 static bool try_to_get_vcores(void)
225 {
226         int nr_vcores_wanted;
227         bool have_enough;
228
229         if (sched_is_greedy())
230                 return num_vcores() == sched_nr_greedy_cores();
231         nr_vcores_wanted = desired_nr_vcores();
232         have_enough = nr_vcores_wanted <= num_vcores();
233         if (have_enough) {
234                 vcore_tick_disable();
235                 return TRUE;
236         }
237         vcore_tick_enable(vmm_sched_period_usec);
238         vcore_request_total(nr_vcores_wanted);
239         return FALSE;
240 }
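
/* When we're short on vcores, the vcore tick armed above (period
 * vmm_sched_period_usec) makes sched_pick_thread_nice() (below) requeue
 * whatever is currently running, so the runnables round-robin on the vcores
 * we do have.  An app that wanted longer slices could bump the knob before
 * starting the VM, e.g. (illustrative only, not a documented interface):
 *
 *      vmm_sched_period_usec = 10000;
 */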
241
242 static void stats_run_vth(struct vmm_thread *vth)
243 {
244         vth->nr_runs++;
245         if (vth->prev_vcoreid != vcore_id()) {
246                 vth->prev_vcoreid = vcore_id();
247                 vth->nr_resched++;
248         }
249 }
250
251 /* TODO: This assumes we get all of our vcores. */
252 static struct vmm_thread *sched_pick_thread_greedy(void)
253 {
254         struct vmm_thread *vth;
255
256         if (current_uthread) {
257                 stats_run_vth((struct vmm_thread*)current_uthread);
258                 run_current_uthread();
259         }
260         if (vcore_id() == 0) {
261                 spin_pdr_lock(&queue_lock);
262                 vth = __pop_first(&rnbl_tasks);
263                 spin_pdr_unlock(&queue_lock);
264                 return vth;
265         }
266         /* This races with enqueue_vmm_thread, which can run on another core.
267          * Here are the rules:
268          * - set when runnable (race free, only one state for the thread at a
269          *   time)
270          * - cleared when we run it (race free, we're the only runners)
271          * - if we take an interrupt, we'll just run_current_uthread and not
272          *   check
273          * - if we vmexit, we'll run the buddy directly */
274         assert(vcore_id() <= current_vm->nr_gpcs);
275         vth = greedy_rnbl_guests[vcore_id() - 1];
276         if (vth)
277                 greedy_rnbl_guests[vcore_id() - 1] = NULL;
278         return vth;
279 }
280
281 static struct vmm_thread *sched_pick_thread_nice(void)
282 {
283         struct vmm_thread *vth;
284         bool have_enough;
285
286         have_enough = try_to_get_vcores();
287         if (!have_enough && vcore_tick_poll()) {
288                 /* slightly less than ideal: we grab the queue lock twice */
289                 yield_current_uth();
290         }
291         if (current_uthread) {
292                 stats_run_vth((struct vmm_thread*)current_uthread);
293                 run_current_uthread();
294         }
295         if (have_enough)
296                 vth = pick_a_thread_plenty();
297         else
298                 vth = pick_a_thread_degraded();
299         return vth;
300 }
301
302 static void __attribute__((noreturn)) vmm_sched_entry(void)
303 {
304         struct vmm_thread *vth;
305
306         if (sched_is_greedy()) {
307                 vth = sched_pick_thread_greedy();
308                 if (!vth) {
309                         /* sys_halt_core will return, but we need to restart the
310                          * vcore.  We might have woken due to an event, and we'll
311                          * need to handle_events and other things dealt with by
312                          * uthreads. */
313                         if (vcore_id() == 0)
314                                 sys_halt_core(0);
315                         /* In greedy mode, yield will abort and we'll just
316                          * restart */
317                         vcore_yield_or_restart();
318                 }
319         } else {
320                 vth = sched_pick_thread_nice();
321                 if (!vth)
322                         vcore_yield_or_restart();
323         }
324         stats_run_vth(vth);
325         run_uthread((struct uthread*)vth);
326 }
327
328 static void vmm_thread_runnable(struct uthread *uth)
329 {
330         /* A thread that was blocked is now runnable.  This counts as becoming
331          * unblocked (running + runnable) */
332         acct_thread_unblocked((struct vmm_thread*)uth);
333         enqueue_vmm_thread((struct vmm_thread*)uth);
334 }
335
336 static void vmm_thread_paused(struct uthread *uth)
337 {
338         /* The thread stopped for some reason, usually a preemption.  We'd like
339          * to just run it whenever we get a chance.  Note that it didn't become
340          * 'blocked' - it's still runnable. */
341         enqueue_vmm_thread((struct vmm_thread*)uth);
342 }
343
344 static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
345 {
346         struct syscall *sysc = (struct syscall*)syscall;
347
348         acct_thread_blocked((struct vmm_thread*)uth);
349         sysc->u_data = uth;
350         if (!register_evq(sysc, sysc_evq)) {
351                 /* Lost the race with the call being done.  The kernel won't
352                  * send the event.  Just restart it. */
353                 restart_thread(sysc);
354         }
355         /* GIANT WARNING: do not touch the thread after this point. */
356 }
357
358 static void vmm_thread_has_blocked(struct uthread *uth, int flags)
359 {
360         /* The thread blocked on something like a mutex.  It's not runnable, so
361          * we don't need to put it on a list, but we do need to account for it
362          * not running.  We'll find out (via thread_runnable) when it starts up
363          * again.  */
364         acct_thread_blocked((struct vmm_thread*)uth);
365 }
366
367 static void refl_error(struct uthread *uth, unsigned int trap_nr,
368                        unsigned int err, unsigned long aux)
369 {
370         printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
371                trap_nr, err, aux);
372         /* Note that uthread.c already copied out our ctx into the uth
373          * struct */
374         print_user_context(&uth->u_ctx);
375         printf("Turn on printx to spew unhandled, malignant trap info\n");
376         exit(-1);
377 }
378
379 static bool handle_page_fault(struct uthread *uth, unsigned int err,
380                               unsigned long aux)
381 {
382         if (!(err & PF_VMR_BACKED))
383                 return FALSE;
384         syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
385         __block_uthread_on_async_sysc(uth);
386         return TRUE;
387 }
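
/* The async SYS_populate_va above asks the kernel to fault in the page(s)
 * backing 'aux'; the faulting uthread stays blocked on its local_sysc until
 * that completes.  Faults not backed by a VMR (e.g. a wild pointer) fall
 * through to refl_error(). */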
388
389 static void vmm_thread_refl_hw_fault(struct uthread *uth,
390                                      unsigned int trap_nr,
391                                      unsigned int err, unsigned long aux)
392 {
393         switch (trap_nr) {
394         case HW_TRAP_PAGE_FAULT:
395                 if (!handle_page_fault(uth, err, aux))
396                         refl_error(uth, trap_nr, err, aux);
397                 break;
398         default:
399                 refl_error(uth, trap_nr, err, aux);
400         }
401 }
402
403 /* Yield callback for __ctlr_entry */
404 static void __swap_to_gth(struct uthread *uth, void *dummy)
405 {
406         struct ctlr_thread *cth = (struct ctlr_thread*)uth;
407
408         /* We just immediately run our buddy.  The ctlr and the guest are
409          * accounted together ("pass the token" back and forth). */
410         current_uthread = NULL;
411         stats_run_vth((struct vmm_thread*)cth->buddy);
412         run_uthread((struct uthread*)cth->buddy);
413         assert(0);
414 }
415
416 /* All ctrl threads start here, each time their guest has a fault.  They can
417  * block and unblock along the way.  Once a ctlr does its final uthread_yield,
418  * the next time it will start again from the top. */
419 static void __ctlr_entry(void)
420 {
421         struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
422         struct virtual_machine *vm = gth_to_vm(cth->buddy);
423
424         if (!handle_vmexit(cth->buddy)) {
425                 struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);
426                 static struct spin_pdr_lock spew = SPINPDR_INITIALIZER;
427
428                 spin_pdr_lock(&spew);
429                 fprintf(stderr, "vmm: handle_vmexit failed!\n");
430                 fprintf(stderr, "RSP was %p, ", (void *)vm_tf->tf_rsp);
431                 fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
432                 /* TODO: properly walk the kernel page tables to map the tf_rip
433                  * to a physical address. For now, however, this hack is good
434                  * enough.
435                  */
436                 hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
437                 showstatus(stderr, cth->buddy);
438                 spin_pdr_unlock(&spew);
439                 exit(0);
440         }
441         /* We want to atomically yield and start/reenqueue our buddy.  We do so
442          * in vcore context on the other side of the yield. */
443         uthread_yield(FALSE, __swap_to_gth, 0);
444 }
445
446 static void vmm_thread_refl_vm_fault(struct uthread *uth)
447 {
448         struct guest_thread *gth = (struct guest_thread*)uth;
449         struct ctlr_thread *cth = gth->buddy;
450
451         gth->nr_vmexits++;
452         /* The ctlr starts from the top every time we get a new fault. */
453         cth->uthread.flags |= UTHREAD_SAVED;
454         init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
455                       (uintptr_t)(cth->stacktop));
456         /* We just immediately run our buddy.  The ctlr and the guest are
457          * accounted together ("pass the token" back and forth). */
458         current_uthread = NULL;
459         stats_run_vth((struct vmm_thread*)cth);
460         run_uthread((struct uthread*)cth);
461         assert(0);
462 }
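
/* To recap the buddy scheme: every guest_thread gets a ctlr_thread at
 * creation (see create_guest_thread() below).  On a vmexit, the code above
 * builds a fresh context for the ctlr at __ctlr_entry() on the ctlr's own
 * stack and runs it immediately; once the exit is handled, __swap_to_gth()
 * resumes the guest.  Only one of the pair is runnable at a time, which is
 * why they share a single count in the accounting helpers. */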
463
464 static void vmm_thread_refl_fault(struct uthread *uth,
465                                   struct user_context *ctx)
466 {
467         switch (ctx->type) {
468         case ROS_HW_CTX:
469                 /* Guests should only ever VM exit */
470                 assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
471                 vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
472                                          __arch_refl_get_err(ctx),
473                                          __arch_refl_get_aux(ctx));
474                 break;
475         case ROS_VM_CTX:
476                 vmm_thread_refl_vm_fault(uth);
477                 break;
478         default:
479                 assert(0);
480         }
481 }
482
483 static void task_thread_dtor(void *obj, void *priv)
484 {
485         struct task_thread *tth = (struct task_thread*)obj;
486
487         uthread_cleanup((struct uthread*)tth);
488         __free_stack(tth->stacktop, tth->stacksize);
489 }
490
491 static void task_thread_exit(struct task_thread *tth)
492 {
493         struct uthread *uth = (struct uthread*)tth;
494
495         if (uth->flags & UTHREAD_IS_THREAD0)
496                 return;
497         kmem_cache_free(task_thread_cache, tth);
498 }
499
500 static void ctlr_thread_exit(struct ctlr_thread *cth)
501 {
502         __vthread_exited((struct vthread*)cth->buddy);
503 }
504
505 static void vmm_thread_exited(struct uthread *uth)
506 {
507         struct vmm_thread *vth = (struct vmm_thread*)uth;
508
509         assert(vth->type != VMM_THREAD_GUEST);
510
511         acct_thread_blocked(vth);
512         switch (vth->type) {
513         case VMM_THREAD_TASK:
514                 task_thread_exit((struct task_thread*)uth);
515                 break;
516         case VMM_THREAD_CTLR:
517                 ctlr_thread_exit((struct ctlr_thread*)uth);
518                 break;
519         case VMM_THREAD_GUEST:
520                 panic("Guest threads shouldn't be able to exit");
521         }
522 }
523
524 static void destroy_guest_thread(struct guest_thread *gth)
525 {
526         struct ctlr_thread *cth = gth->buddy;
527
528         __free_stack(cth->stacktop, cth->stacksize);
529         uthread_cleanup((struct uthread*)cth);
530         free(cth);
531         uthread_cleanup((struct uthread*)gth);
532         free(gth);
533 }
534
535 struct guest_thread *create_guest_thread(struct virtual_machine *vm,
536                                          unsigned int gpcoreid,
537                                          struct vmm_gpcore_init *gpci)
538 {
539         struct guest_thread *gth;
540         struct ctlr_thread *cth;
541         /* Guests won't use TLS; they always operate in Ring V.  The controller
542          * might - not because of anything we do, but because of glibc calls. */
543         struct uth_thread_attr gth_attr = {.want_tls = FALSE};
544         struct uth_thread_attr cth_attr = {.want_tls = TRUE};
545
546         gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
547         cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
548         if (!gth || !cth) {
549                 free(gth);
550                 free(cth);
551                 return 0;
552         }
553         gth->buddy = cth;
554         cth->buddy = gth;
555         gth->gpc_id = gpcoreid;
556         gth->gpci = *gpci;
557         cth->stacksize = VMM_THR_STACKSIZE;
558         cth->stacktop = __alloc_stack(cth->stacksize);
559         if (!cth->stacktop) {
560                 free(gth);
561                 free(cth);
562                 return 0;
563         }
564         gth->uthread.u_ctx.type = ROS_VM_CTX;
565         gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
566         uthread_init((struct uthread*)gth, &gth_attr);
567         uthread_init((struct uthread*)cth, &cth_attr);
568         gth->halt_mtx = uth_mutex_alloc();
569         gth->halt_cv = uth_cond_var_alloc();
570         return gth;
571 }
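
/* Note that create_guest_thread() only allocates and initializes; the guest
 * core doesn't run until someone calls start_guest_thread() on it, which
 * accounts it as unblocked and enqueues it. */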
572
573 static void ev_handle_diag(struct event_msg *ev_msg, unsigned int ev_type,
574                            void *data)
575 {
576         struct virtual_machine *vm = current_vm;
577         struct guest_thread *gth;
578         struct ctlr_thread *cth;
579         bool reset = FALSE;
580
581         if (ev_msg && (ev_msg->ev_arg1 == 1))
582                 reset = TRUE;
583
584         fprintf(stderr, "\nSCHED stats:\n---------------\n");
585         for (int i = 0; i < vm->nr_gpcs; i++) {
586                 gth = gpcid_to_gth(vm, i);
587                 cth = gth->buddy;
588                 fprintf(stderr, "\tGPC %2d: %lu resched, %lu gth runs, %lu ctl runs, %lu user-handled vmexits\n",
589                         i,
590                         ((struct vmm_thread*)gth)->nr_resched,
591                         ((struct vmm_thread*)gth)->nr_runs,
592                         ((struct vmm_thread*)cth)->nr_runs,
593                         gth->nr_vmexits);
594                 if (reset) {
595                         ((struct vmm_thread*)gth)->nr_resched = 0;
596                         ((struct vmm_thread*)gth)->nr_runs = 0;
597                         ((struct vmm_thread*)cth)->nr_runs = 0;
598                         gth->nr_vmexits = 0;
599                 }
600         }
601         fprintf(stderr, "\n\tNr unblocked gpc %lu, Nr unblocked tasks %lu\n",
602                 atomic_read(&nr_unblk_guests), atomic_read(&nr_unblk_tasks));
603 }
604
605 int vmm_init(struct virtual_machine *vm, struct vmm_gpcore_init *gpcis,
606              int flags)
607 {
608         struct guest_thread **gths;
609
610         if (current_vm)
611                 return -1;
612         current_vm = vm;
613         /* We should tell the kernel to create all of the GPCs we'll need in
614          * advance.
615          *
616          * We could create the others on the fly, but the kernel's answer for
617          * CPUID[0x1] will not have the total number of cores.  If we move that
618          * handler to userspace, we can create the SMP-booted GPCs on the fly.
619          *
620          * We'd also have to deal with gths[] growing dynamically, which would
621          * require synchronization. */
622         if (syscall(SYS_vmm_add_gpcs, vm->nr_gpcs, gpcis) != vm->nr_gpcs)
623                 return -1;
624         if (flags) {
625                 if (syscall(SYS_vmm_ctl, VMM_CTL_SET_FLAGS, flags))
626                         return -1;
627         }
628         gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
629         if (!gths)
630                 return -1;
631         for (int i = 0; i < vm->nr_gpcs; i++) {
632                 gths[i] = create_guest_thread(vm, i, &gpcis[i]);
633                 if (!gths[i]) {
634                         for (int j = 0; j < i; j++)
635                                 destroy_guest_thread(gths[j]);
636                         free(gths);
637                         return -1;
638                 }
639         }
640         wmb(); /* All gths posted before advertising. */
641         vm->__gths = gths;
642         uthread_mcp_init();
643         register_ev_handler(EV_FREE_APPLE_PIE, ev_handle_diag, NULL);
644         if (sched_is_greedy()) {
645                 greedy_rnbl_guests = calloc(vm->nr_gpcs,
646                                             sizeof(struct vmm_thread *));
647                 assert(greedy_rnbl_guests);
648                 vcore_request_total(sched_nr_greedy_cores());
649                 syscall(SYS_vmm_ctl, VMM_CTL_SET_EXITS,
650                         syscall(SYS_vmm_ctl, VMM_CTL_GET_EXITS) &
651                                 ~(VMM_CTL_EXIT_HALT | VMM_CTL_EXIT_MWAIT));
652         }
653         return 0;
654 }
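
/* Sketch of a caller, assuming a single guest core; the helper names below
 * are made up for illustration and this is not copied from any real VMM app:
 *
 *      struct virtual_machine vm = {.nr_gpcs = 1};
 *      struct vmm_gpcore_init gpci;
 *
 *      setup_gpci_and_guest_memory(&vm, &gpci);        // hypothetical
 *      if (vmm_init(&vm, &gpci, 0))
 *              exit(1);
 *      load_guest_context(gpcid_to_gth(&vm, 0));       // hypothetical
 *      start_guest_thread(gpcid_to_gth(&vm, 0));
 */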
655
656 void start_guest_thread(struct guest_thread *gth)
657 {
658         acct_thread_unblocked((struct vmm_thread*)gth);
659         enqueue_vmm_thread((struct vmm_thread*)gth);
660 }
661
662 static void __task_thread_run(void)
663 {
664         struct task_thread *tth = (struct task_thread*)current_uthread;
665
666         uth_2ls_thread_exit(tth->func(tth->arg));
667 }
668
669 static int task_thread_ctor(void *obj, void *priv, int flags)
670 {
671         struct vmm_thread *vth = (struct vmm_thread*)obj;
672         struct task_thread *tth = (struct task_thread*)obj;
673
674         memset(vth, 0, sizeof(struct vmm_thread));
675         vth->type = VMM_THREAD_TASK;
676         vth->vm = current_vm;
677         tth->stacksize = VMM_THR_STACKSIZE;
678         tth->stacktop = __alloc_stack(tth->stacksize);
679         if (!tth->stacktop)
680                 return -1;
681         return 0;
682 }
683
684 /* Helper, creates and starts a task thread. */
685 static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
686                                           void *(*func)(void *), void *arg,
687                                           struct uth_thread_attr *tth_attr)
688 {
689         struct task_thread *tth;
690
691         tth = kmem_cache_alloc(task_thread_cache, 0);
692         tth->func = func;
693         tth->arg = arg;
694         init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
695                       (uintptr_t)(tth->stacktop));
696         uthread_init((struct uthread*)tth, tth_attr);
697         acct_thread_unblocked((struct vmm_thread*)tth);
698         enqueue_vmm_thread((struct vmm_thread*)tth);
699         return tth;
700 }
701
702 struct task_thread *vmm_run_task(struct virtual_machine *vm,
703                                  void *(*func)(void *), void *arg)
704 {
705         struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = TRUE};
706
707         return __vmm_run_task(vm, func, arg, &tth_attr);
708 }
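
/* Example use of vmm_run_task(), say for a device backend that services the
 * guest (the worker below is made up):
 *
 *      static void *net_poller(void *arg)
 *      {
 *              struct virtual_machine *vm = arg;
 *
 *              for (;;)
 *                      poll_the_nic(vm);       // hypothetical
 *              return NULL;
 *      }
 *
 *      ...
 *      vmm_run_task(vm, net_poller, vm);
 *
 * Task threads are detached (see tth_attr above), so there is nothing to
 * join; a task exits by returning from func, whose return value feeds
 * uth_2ls_thread_exit(). */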
709
710 static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg)
711 {
712         struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = FALSE};
713         struct task_thread *tth;
714
715         /* It's OK to not have a VM for a generic thread */
716         tth = __vmm_run_task(NULL, func, arg, &tth_attr);
717         /* But just in case, let's poison it */
718         ((struct vmm_thread*)tth)->vm = (void*)0xdeadbeef;
719         return (struct uthread*)tth;
720 }
721
722 /* Careful, that fake_uctx takes up a lot of stack space.  We could do something
723  * to route signals to task threads too.  The VMM-2LS has less need for this at
724  * all, so we could just run the signal handler as-is, without worrying about
725  * user contexts.  Note the pthread 2LS has similar code. */
726 static void vmm_got_posix_signal(int sig_nr, struct siginfo *info)
727 {
728         struct user_context fake_uctx;
729
730         /* If we happen to have a current uthread, we can use that - perhaps
731          * that's what the user wants.  If not, we'll build a fake one
732          * representing our current call stack. */
733         if (current_uthread) {
734                 trigger_posix_signal(sig_nr, info, get_cur_uth_ctx());
735         } else {
736                 init_user_ctx(&fake_uctx, (uintptr_t)vmm_got_posix_signal,
737                               get_stack_pointer());
738                 trigger_posix_signal(sig_nr, info, &fake_uctx);
739         }
740 }
741
742 /* Helpers for tracking nr_unblk_* threads. */
743 static void acct_thread_blocked(struct vmm_thread *vth)
744 {
745         switch (vth->type) {
746         case VMM_THREAD_GUEST:
747         case VMM_THREAD_CTLR:
748                 atomic_dec(&nr_unblk_guests);
749                 break;
750         case VMM_THREAD_TASK:
751                 atomic_dec(&nr_unblk_tasks);
752                 break;
753         }
754 }
755
756 static void acct_thread_unblocked(struct vmm_thread *vth)
757 {
758         switch (vth->type) {
759         case VMM_THREAD_GUEST:
760         case VMM_THREAD_CTLR:
761                 atomic_inc(&nr_unblk_guests);
762                 break;
763         case VMM_THREAD_TASK:
764                 atomic_inc(&nr_unblk_tasks);
765                 break;
766         }
767 }
768
769 static void greedy_mark_guest_runnable(struct vmm_thread *vth)
770 {
771         int gpcid;
772
773         if (vth->type == VMM_THREAD_GUEST)
774                 gpcid = ((struct guest_thread*)vth)->gpc_id;
775         else
776                 gpcid = ((struct ctlr_thread*)vth)->buddy->gpc_id;
777         /* racing with the reader */
778         greedy_rnbl_guests[gpcid] = vth;
779 }
780
781 static void enqueue_vmm_thread(struct vmm_thread *vth)
782 {
783         switch (vth->type) {
784         case VMM_THREAD_GUEST:
785         case VMM_THREAD_CTLR:
786                 if (sched_is_greedy()) {
787                         greedy_mark_guest_runnable(vth);
788                 } else {
789                         spin_pdr_lock(&queue_lock);
790                         TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
791                         spin_pdr_unlock(&queue_lock);
792                 }
793                 break;
794         case VMM_THREAD_TASK:
795                 spin_pdr_lock(&queue_lock);
796                 TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
797                 spin_pdr_unlock(&queue_lock);
798                 if (sched_is_greedy())
799                         vcore_wake(0, false);
800                 break;
801         default:
802                 panic("Bad vmm_thread type %d\n", vth->type);
803         }
804         try_to_get_vcores();
805 }
806
807 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
808 {
809         struct vmm_thread *vth;
810         int ret;
811
812         ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
813                              sizeof(struct vmm_thread));
814         if (ret)
815                 return 0;
816         memset(vth, 0, sizeof(struct vmm_thread));
817         vth->type = type;
818         vth->vm = vm;
819         return vth;
820 }
821
822 static void __free_stack(void *stacktop, size_t stacksize)
823 {
824         munmap(stacktop - stacksize, stacksize);
825 }
826
827 static void *__alloc_stack(size_t stacksize)
828 {
829         int force_a_page_fault;
830         void *stacktop;
831         void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
832                               MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
833
834         if (stackbot == MAP_FAILED)
835                 return 0;
836         stacktop = stackbot + stacksize;
837         /* Want the top of the stack populated, but not the rest of the stack;
838          * that'll grow on demand (up to stacksize, then will clobber memory). */
839         force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
840         return stacktop;
841 }