user/vmm/sched.c (akaros.git)
/* Copyright (c) 2016 Google Inc.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * 2LS for virtual machines */

#include <vmm/sched.h>
#include <vmm/vmm.h>
#include <vmm/vthread.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <assert.h>
#include <parlib/spinlock.h>
#include <parlib/event.h>
#include <parlib/ucq.h>
#include <parlib/arch/trap.h>
#include <parlib/ros_debug.h>
#include <parlib/vcore_tick.h>
#include <parlib/slab.h>

int vmm_sched_period_usec = 1000;

/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
 * we'll need something analogous to current_uthread, so the 2LS knows which VM
 * it is working on. */
static struct virtual_machine *current_vm;

static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
/* Runnable queues, broken up by thread type. */
static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
static struct vmm_thread **greedy_rnbl_guests;
/* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
static atomic_t nr_unblk_tasks;
static atomic_t nr_unblk_guests;
/* Global evq for all syscalls.  Could make this per vcore or whatever. */
static struct event_queue *sysc_evq;
static struct kmem_cache *task_thread_cache;

static void vmm_sched_init(void);
static void vmm_sched_entry(void);
static void vmm_thread_runnable(struct uthread *uth);
static void vmm_thread_paused(struct uthread *uth);
static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
static void vmm_thread_has_blocked(struct uthread *uth, int flags);
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx);
static void vmm_thread_exited(struct uthread *uth);
static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg);

struct schedule_ops vmm_sched_ops = {
        .sched_init = vmm_sched_init,
        .sched_entry = vmm_sched_entry,
        .thread_runnable = vmm_thread_runnable,
        .thread_paused = vmm_thread_paused,
        .thread_blockon_sysc = vmm_thread_blockon_sysc,
        .thread_has_blocked = vmm_thread_has_blocked,
        .thread_refl_fault = vmm_thread_refl_fault,
        .thread_exited = vmm_thread_exited,
        .thread_create = vmm_thread_create,
};

struct schedule_ops *sched_ops = &vmm_sched_ops;

/* Helpers */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data);
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
static void enqueue_vmm_thread(struct vmm_thread *vth);
static int task_thread_ctor(void *obj, void *priv, int flags);
static void task_thread_dtor(void *obj, void *priv);
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type);
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);

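/* "Greedy" scheduling (selected by parlib_never_yield): the 2LS asks for one
 * vcore per guest pcore plus one for tasks and holds on to them.  Vcore 0 runs
 * task threads; vcore i runs guest pcore i - 1 (and its controller) via the
 * greedy_rnbl_guests[] slots instead of the shared runnable queues. */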
static bool sched_is_greedy(void)
{
        return parlib_never_yield;
}

static unsigned int sched_nr_greedy_cores(void)
{
        if (!current_vm)
                return 1;
        return current_vm->nr_gpcs + 1;
}

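/* Restarts a uthread whose syscall has completed: clear the sysc so it won't
 * reblock, then mark the thread runnable.  Called from the syscall event
 * handler and from blockon when we lose the completion race. */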
static void restart_thread(struct syscall *sysc)
{
        struct uthread *ut_restartee = (struct uthread*)sysc->u_data;

        /* uthread stuff here: */
        assert(ut_restartee);
        assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
        ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
        vmm_thread_runnable(ut_restartee);
}

static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data)
{
        struct syscall *sysc;

        /* I think we can make this assert now.  If not, check pthread.c.
         * (concern was having old ev_qs firing and running this handler). */
        assert(ev_msg);
        sysc = ev_msg->ev_arg3;
        assert(sysc);
        restart_thread(sysc);
}

/* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
 * attempt to route the notifs/IPIs to vcoreid */
static struct event_queue *setup_sysc_evq(int vcoreid)
{
        struct event_queue *evq;
        uintptr_t mmap_block;

        mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
                                     PROT_WRITE | PROT_READ,
                                     MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE,
                                     -1, 0);
        evq = get_eventq_raw();
        assert(mmap_block && evq);
        evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR |
                        EVENT_WAKEUP;
        evq->ev_vcore = vcoreid;
        evq->ev_mbox->type = EV_MBOX_UCQ;
        ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
        return evq;
}

static void vmm_sched_init(void)
{
        struct task_thread *thread0;

        /* Note that thread0 doesn't belong to a VM.  We can set this during
         * vmm_init() if we need to. */
        thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
        assert(thread0);
        acct_thread_unblocked((struct vmm_thread*)thread0);
        thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
        thread0->stacktop = (void*)USTACKTOP;
        /* for lack of a better vcore, might as well send to 0 */
        sysc_evq = setup_sysc_evq(0);
        uthread_2ls_init((struct uthread*)thread0, vmm_handle_syscall, NULL);
        task_thread_cache = kmem_cache_create("task threads",
                                              sizeof(struct vmm_thread),
                                              __alignof__(struct vmm_thread), 0,
                                              task_thread_ctor,
                                              task_thread_dtor, NULL);
}

/* The scheduling policy is encapsulated in the next few functions (from here
 * down to sched_entry()). */

static int desired_nr_vcores(void)
{
        /* Sanity checks on our accounting. */
        assert(atomic_read(&nr_unblk_guests) >= 0);
        assert(atomic_read(&nr_unblk_tasks) >= 0);
        /* Lockless peek.  This is always an estimate.  Some of our tasks
         * busy-wait, so it's not enough to just give us one vcore for all
         * tasks, yet. */
        return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
}

static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
{
        struct vmm_thread *vth;

        vth = TAILQ_FIRST(tq);
        if (vth)
                TAILQ_REMOVE(tq, vth, tq_next);
        return vth;
}

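/* We don't have as many vcores as we'd like, so run whatever is available,
 * with tasks taking priority over guests. */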
static struct vmm_thread *pick_a_thread_degraded(void)
{
        struct vmm_thread *vth;

        spin_pdr_lock(&queue_lock);
        vth = __pop_first(&rnbl_tasks);
        if (!vth)
                vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
}

/* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
static struct vmm_thread *pick_a_thread_plenty(void)
{
        struct vmm_thread *vth = 0;

        spin_pdr_lock(&queue_lock);
        if (!vth)
                vth = __pop_first(&rnbl_tasks);
        if (!vth)
                vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
}

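/* Stops the current uthread (if any) and puts it back on a runnable queue;
 * used when the vcore tick fires and we don't have enough vcores. */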
static void yield_current_uth(void)
{
        struct vmm_thread *vth;

        if (!current_uthread)
                return;
        vth = (struct vmm_thread*)stop_current_uthread();
        enqueue_vmm_thread(vth);
}

/* Helper, tries to get the right number of vcores.  Returns TRUE if we think we
 * have enough, FALSE otherwise.
 *
 * TODO: this doesn't handle a lot of issues, like preemption, how to
 * run/yield our vcores, dynamic changes in the number of runnables, where
 * to send events, how to avoid interfering with gpcs, etc. */
static bool try_to_get_vcores(void)
{
        int nr_vcores_wanted;
        bool have_enough;

        if (sched_is_greedy())
                return num_vcores() == sched_nr_greedy_cores();
        nr_vcores_wanted = desired_nr_vcores();
        have_enough = nr_vcores_wanted <= num_vcores();
        if (have_enough) {
                vcore_tick_disable();
                return TRUE;
        }
        vcore_tick_enable(vmm_sched_period_usec);
        vcore_request_total(nr_vcores_wanted);
        return FALSE;
}

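/* Per-thread bookkeeping: counts runs and, when the thread lands on a
 * different vcore than last time, rescheds.  Reported by ev_handle_diag(). */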
static void stats_run_vth(struct vmm_thread *vth)
{
        vth->nr_runs++;
        if (vth->prev_vcoreid != vcore_id()) {
                vth->prev_vcoreid = vcore_id();
                vth->nr_resched++;
        }
}

/* TODO: This assumes we get all of our vcores. */
static struct vmm_thread *sched_pick_thread_greedy(void)
{
        struct vmm_thread *vth;

        if (current_uthread) {
                stats_run_vth((struct vmm_thread*)current_uthread);
                run_current_uthread();
        }
        if (vcore_id() == 0) {
                spin_pdr_lock(&queue_lock);
                vth = __pop_first(&rnbl_tasks);
                spin_pdr_unlock(&queue_lock);
                return vth;
        }
        /* This races with enqueue_vmm_thread, which can run on another core.
         * Here are the rules:
         * - set when runnable (race free, only one state for the thread at a
         *   time)
         * - cleared when we run it (race free, we're the only runners)
         * - if we take an interrupt, we'll just run_current_uthread and not
         *   check
         * - if we vmexit, we'll run the buddy directly */
        assert(vcore_id() <= current_vm->nr_gpcs);
        vth = greedy_rnbl_guests[vcore_id() - 1];
        if (vth)
                greedy_rnbl_guests[vcore_id() - 1] = NULL;
        return vth;
}

static struct vmm_thread *sched_pick_thread_nice(void)
{
        struct vmm_thread *vth;
        bool have_enough;

        have_enough = try_to_get_vcores();
        if (!have_enough && vcore_tick_poll()) {
                /* slightly less than ideal: we grab the queue lock twice */
                yield_current_uth();
        }
        if (current_uthread) {
                stats_run_vth((struct vmm_thread*)current_uthread);
                run_current_uthread();
        }
        if (have_enough)
                vth = pick_a_thread_plenty();
        else
                vth = pick_a_thread_degraded();
        return vth;
}

static void __attribute__((noreturn)) vmm_sched_entry(void)
{
        struct vmm_thread *vth;

        if (sched_is_greedy()) {
                vth = sched_pick_thread_greedy();
                if (!vth) {
                        /* sys_halt_core will return, but we need to restart the
                         * vcore.  We might have woken due to an event, and we'll
                         * need to handle_events and other things dealt with by
                         * uthreads. */
                        if (vcore_id() == 0)
                                sys_halt_core(0);
                        /* In greedy mode, yield will abort and we'll just
                         * restart */
                        vcore_yield_or_restart();
                }
        } else {
                vth = sched_pick_thread_nice();
                if (!vth)
                        vcore_yield_or_restart();
        }
        stats_run_vth(vth);
        run_uthread((struct uthread*)vth);
}

static void vmm_thread_runnable(struct uthread *uth)
{
        /* A thread that was blocked is now runnable.  This counts as becoming
         * unblocked (running + runnable) */
        acct_thread_unblocked((struct vmm_thread*)uth);
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_paused(struct uthread *uth)
{
        /* The thread stopped for some reason, usually a preemption.  We'd like
         * to just run it whenever we get a chance.  Note that it didn't become
         * 'blocked' - it's still runnable. */
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
{
        struct syscall *sysc = (struct syscall*)syscall;

        acct_thread_blocked((struct vmm_thread*)uth);
        sysc->u_data = uth;
        if (!register_evq(sysc, sysc_evq)) {
                /* Lost the race with the call being done.  The kernel won't
                 * send the event.  Just restart him. */
                restart_thread(sysc);
        }
        /* GIANT WARNING: do not touch the thread after this point. */
}

static void vmm_thread_has_blocked(struct uthread *uth, int flags)
{
        /* The thread blocked on something like a mutex.  It's not runnable, so
         * we don't need to put it on a list, but we do need to account for it
         * not running.  We'll find out (via thread_runnable) when it starts up
         * again.  */
        acct_thread_blocked((struct vmm_thread*)uth);
}

static void refl_error(struct uthread *uth, unsigned int trap_nr,
                       unsigned int err, unsigned long aux)
{
        printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
               trap_nr, err, aux);
        /* Note that uthread.c already copied out our ctx into the uth
         * struct */
        print_user_context(&uth->u_ctx);
        printf("Turn on printx to spew unhandled, malignant trap info\n");
        exit(-1);
}

static bool handle_page_fault(struct uthread *uth, unsigned int err,
                              unsigned long aux)
{
        if (!(err & PF_VMR_BACKED))
                return FALSE;
        syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
        __block_uthread_on_async_sysc(uth);
        return TRUE;
}

static void vmm_thread_refl_hw_fault(struct uthread *uth,
                                     unsigned int trap_nr,
                                     unsigned int err, unsigned long aux)
{
        switch (trap_nr) {
        case HW_TRAP_PAGE_FAULT:
                if (!handle_page_fault(uth, err, aux))
                        refl_error(uth, trap_nr, err, aux);
                break;
        default:
                refl_error(uth, trap_nr, err, aux);
        }
}

/* Yield callback for __ctlr_entry */
static void __swap_to_gth(struct uthread *uth, void *dummy)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)uth;

        /* We just immediately run our buddy.  The ctlr and the guest are
         * accounted together ("pass the token" back and forth). */
        current_uthread = NULL;
        stats_run_vth((struct vmm_thread*)cth->buddy);
        run_uthread((struct uthread*)cth->buddy);
        assert(0);
}

/* All ctlr threads start here, each time their guest has a fault.  They can
 * block and unblock along the way.  Once a ctlr does its final uthread_yield,
 * the next time it will start again from the top. */
static void __ctlr_entry(void)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
        struct virtual_machine *vm = gth_to_vm(cth->buddy);

        if (!handle_vmexit(cth->buddy)) {
                struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);
                static struct spin_pdr_lock spew = SPINPDR_INITIALIZER;

                spin_pdr_lock(&spew);
                fprintf(stderr, "vmm: handle_vmexit failed!\n");
                fprintf(stderr, "RSP was %p, ", (void *)vm_tf->tf_rsp);
                fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
                /* TODO: properly walk the kernel page tables to map the tf_rip
                 * to a physical address. For now, however, this hack is good
                 * enough.
                 */
                hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
                showstatus(stderr, cth->buddy);
                spin_pdr_unlock(&spew);
                exit(0);
        }
        /* We want to atomically yield and start/reenqueue our buddy.  We do so
         * in vcore context on the other side of the yield. */
        uthread_yield(FALSE, __swap_to_gth, 0);
}

static void vmm_thread_refl_vm_fault(struct uthread *uth)
{
        struct guest_thread *gth = (struct guest_thread*)uth;
        struct ctlr_thread *cth = gth->buddy;

        gth->nr_vmexits++;
        /* The ctlr starts from the top every time we get a new fault. */
        cth->uthread.flags |= UTHREAD_SAVED;
        init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
                      (uintptr_t)(cth->stacktop));
        /* We just immediately run our buddy.  The ctlr and the guest are
         * accounted together ("pass the token" back and forth). */
        current_uthread = NULL;
        stats_run_vth((struct vmm_thread*)cth);
        run_uthread((struct uthread*)cth);
        assert(0);
}

static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx)
{
        switch (ctx->type) {
        case ROS_HW_CTX:
                /* Guests should only ever VM exit */
                assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
                vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
                                         __arch_refl_get_err(ctx),
                                         __arch_refl_get_aux(ctx));
                break;
        case ROS_VM_CTX:
                vmm_thread_refl_vm_fault(uth);
                break;
        default:
                assert(0);
        }
}

static void task_thread_dtor(void *obj, void *priv)
{
        struct task_thread *tth = (struct task_thread*)obj;

        uthread_cleanup((struct uthread*)tth);
        __free_stack(tth->stacktop, tth->stacksize);
}

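/* Returns an exited task thread to the slab cache.  thread0 wasn't allocated
 * from the cache, so it is never freed. */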
static void task_thread_exit(struct task_thread *tth)
{
        struct uthread *uth = (struct uthread*)tth;

        if (uth->flags & UTHREAD_IS_THREAD0)
                return;
        kmem_cache_free(task_thread_cache, tth);
}

static void ctlr_thread_exit(struct ctlr_thread *cth)
{
        __vthread_exited((struct vthread*)cth->buddy);
}

static void vmm_thread_exited(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        assert(vth->type != VMM_THREAD_GUEST);

        acct_thread_blocked(vth);
        switch (vth->type) {
        case VMM_THREAD_TASK:
                task_thread_exit((struct task_thread*)uth);
                break;
        case VMM_THREAD_CTLR:
                ctlr_thread_exit((struct ctlr_thread*)uth);
                break;
        case VMM_THREAD_GUEST:
                panic("Guest threads shouldn't be able to exit");
        }
}

static void destroy_guest_thread(struct guest_thread *gth)
{
        struct ctlr_thread *cth = gth->buddy;

        __free_stack(cth->stacktop, cth->stacksize);
        uthread_cleanup((struct uthread*)cth);
        free(cth);
        uthread_cleanup((struct uthread*)gth);
        free(gth);
}

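/* Allocates and initializes a guest thread for guest pcore gpcoreid, along
 * with its controller ("buddy") thread, which handles the guest's vmexits.
 * Returns NULL on failure. */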
struct guest_thread *create_guest_thread(struct virtual_machine *vm,
                                         unsigned int gpcoreid,
                                         struct vmm_gpcore_init *gpci)
{
        struct guest_thread *gth;
        struct ctlr_thread *cth;
        /* Guests won't use TLS; they always operate in Ring V.  The controller
         * might - not because of anything we do, but because of glibc calls. */
        struct uth_thread_attr gth_attr = {.want_tls = FALSE};
        struct uth_thread_attr cth_attr = {.want_tls = TRUE};

        gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
        cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
        if (!gth || !cth) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->buddy = cth;
        cth->buddy = gth;
        gth->gpc_id = gpcoreid;
        gth->gpci = *gpci;
        cth->stacksize = VMM_THR_STACKSIZE;
        cth->stacktop = __alloc_stack(cth->stacksize);
        if (!cth->stacktop) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->uthread.u_ctx.type = ROS_VM_CTX;
        gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
        uthread_init((struct uthread*)gth, &gth_attr);
        uthread_init((struct uthread*)cth, &cth_attr);
        gth->halt_mtx = uth_mutex_alloc();
        gth->halt_cv = uth_cond_var_alloc();
        return gth;
}

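/* Diagnostic event handler: dumps per-guest-pcore scheduling stats to stderr.
 * If the event's ev_arg1 is 1, the counters are reset after printing. */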
static void ev_handle_diag(struct event_msg *ev_msg, unsigned int ev_type,
                           void *data)
{
        struct virtual_machine *vm = current_vm;
        struct guest_thread *gth;
        struct ctlr_thread *cth;
        bool reset = FALSE;

        if (ev_msg && (ev_msg->ev_arg1 == 1))
                reset = TRUE;

        fprintf(stderr, "\nSCHED stats:\n---------------\n");
        for (int i = 0; i < vm->nr_gpcs; i++) {
                gth = gpcid_to_gth(vm, i);
                cth = gth->buddy;
                fprintf(stderr, "\tGPC %2d: %lu resched, %lu gth runs, %lu ctl runs, %lu user-handled vmexits\n",
                        i,
                        ((struct vmm_thread*)gth)->nr_resched,
                        ((struct vmm_thread*)gth)->nr_runs,
                        ((struct vmm_thread*)cth)->nr_runs,
                        gth->nr_vmexits);
                if (reset) {
                        ((struct vmm_thread*)gth)->nr_resched = 0;
                        ((struct vmm_thread*)gth)->nr_runs = 0;
                        ((struct vmm_thread*)cth)->nr_runs = 0;
                        gth->nr_vmexits = 0;
                }
        }
        fprintf(stderr, "\n\tNr unblocked gpc %lu, Nr unblocked tasks %lu\n",
                atomic_read(&nr_unblk_guests), atomic_read(&nr_unblk_tasks));
}

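/* One-time setup for the (single) VM: registers the guest pcores with the
 * kernel, creates a guest/ctlr thread pair per gpc, and switches the process
 * into MCP mode.  Returns 0 on success, -1 on error. */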
int vmm_init(struct virtual_machine *vm, struct vmm_gpcore_init *gpcis,
             int flags)
{
        struct guest_thread **gths;

        if (current_vm)
                return -1;
        current_vm = vm;
        /* We should tell the kernel to create all of the GPCs we'll need in
         * advance.
         *
         * We could create the others on the fly, but the kernel's answer for
         * CPUID[0x1] will not have the total number of cores.  If we move that
         * handler to userspace, we can create the SMP-booted GPCs on the fly.
         *
         * We'd also have to deal with gths[] growing dynamically, which would
         * require synchronization. */
        if (syscall(SYS_vmm_add_gpcs, vm->nr_gpcs, gpcis) != vm->nr_gpcs)
                return -1;
        if (flags) {
                if (syscall(SYS_vmm_ctl, VMM_CTL_SET_FLAGS, flags))
                        return -1;
        }
        gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
        if (!gths)
                return -1;
        for (int i = 0; i < vm->nr_gpcs; i++) {
                gths[i] = create_guest_thread(vm, i, &gpcis[i]);
                if (!gths[i]) {
                        for (int j = 0; j < i; j++)
                                destroy_guest_thread(gths[j]);
                        free(gths);
                        return -1;
                }
        }
        wmb(); /* All gths posted before advertising. */
        vm->__gths = gths;
        uthread_mcp_init();
        register_ev_handler(EV_FREE_APPLE_PIE, ev_handle_diag, NULL);
        if (sched_is_greedy()) {
                greedy_rnbl_guests = calloc(vm->nr_gpcs,
                                            sizeof(struct vmm_thread *));
                assert(greedy_rnbl_guests);
                vcore_request_total(sched_nr_greedy_cores());
                syscall(SYS_vmm_ctl, VMM_CTL_SET_EXITS,
                        syscall(SYS_vmm_ctl, VMM_CTL_GET_EXITS) &
                                ~(VMM_CTL_EXIT_HALT | VMM_CTL_EXIT_MWAIT));
        }
        return 0;
}

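/* Kicks off a guest thread created by create_guest_thread(): accounts for it
 * as unblocked and puts it on the runnable queue (or its greedy slot). */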
void start_guest_thread(struct guest_thread *gth)
{
        acct_thread_unblocked((struct vmm_thread*)gth);
        enqueue_vmm_thread((struct vmm_thread*)gth);
}

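/* Entry point for task threads: runs the thread's function and passes its
 * return value to the 2LS exit path. */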
static void __task_thread_run(void)
{
        struct task_thread *tth = (struct task_thread*)current_uthread;

        uth_2ls_thread_exit(tth->func(tth->arg));
}

static int task_thread_ctor(void *obj, void *priv, int flags)
{
        struct vmm_thread *vth = (struct vmm_thread*)obj;
        struct task_thread *tth = (struct task_thread*)obj;

        memset(vth, 0, sizeof(struct vmm_thread));
        vth->type = VMM_THREAD_TASK;
        vth->vm = current_vm;
        tth->stacksize = VMM_THR_STACKSIZE;
        tth->stacktop = __alloc_stack(tth->stacksize);
        if (!tth->stacktop)
                return -1;
        return 0;
}

/* Helper, creates and starts a task thread. */
static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
                                          void *(*func)(void *), void *arg,
                                          struct uth_thread_attr *tth_attr)
{
        struct task_thread *tth;

        tth = kmem_cache_alloc(task_thread_cache, 0);
        tth->func = func;
        tth->arg = arg;
        init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
                      (uintptr_t)(tth->stacktop));
        uthread_init((struct uthread*)tth, tth_attr);
        acct_thread_unblocked((struct vmm_thread*)tth);
        enqueue_vmm_thread((struct vmm_thread*)tth);
        return tth;
}

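/* Example usage (sketch): spawn a detached helper task to do backend work for
 * the VM.  'console_poller' is a hypothetical caller-supplied function, not
 * part of this file.
 *
 *	static void *console_poller(void *arg)
 *	{
 *		struct virtual_machine *vm = arg;
 *
 *		// poll for console input, inject interrupts, etc.
 *		return NULL;
 *	}
 *
 *	vmm_run_task(vm, console_poller, vm);
 *
 * Task threads created here are detached; their return value is discarded. */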
struct task_thread *vmm_run_task(struct virtual_machine *vm,
                                 void *(*func)(void *), void *arg)
{
        struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = TRUE};

        return __vmm_run_task(vm, func, arg, &tth_attr);
}

static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg)
{
        struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = FALSE};
        struct task_thread *tth;

        /* It's OK to not have a VM for a generic thread */
        tth = __vmm_run_task(NULL, func, arg, &tth_attr);
        /* But just in case, let's poison it */
        ((struct vmm_thread*)tth)->vm = (void*)0xdeadbeef;
        return (struct uthread*)tth;
}

/* Helpers for tracking nr_unblk_* threads. */
static void acct_thread_blocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_dec(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_dec(&nr_unblk_tasks);
                break;
        }
}

static void acct_thread_unblocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_inc(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_inc(&nr_unblk_tasks);
                break;
        }
}

static void greedy_mark_guest_runnable(struct vmm_thread *vth)
{
        int gpcid;

        if (vth->type == VMM_THREAD_GUEST)
                gpcid = ((struct guest_thread*)vth)->gpc_id;
        else
                gpcid = ((struct ctlr_thread*)vth)->buddy->gpc_id;
        /* racing with the reader */
        greedy_rnbl_guests[gpcid] = vth;
}

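/* Makes a thread available to the scheduler: guests/ctlrs go to their greedy
 * slot or the guest queue, tasks go to the task queue.  Also nudges the vcore
 * count (or wakes vcore 0 in greedy mode). */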
static void enqueue_vmm_thread(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                if (sched_is_greedy()) {
                        greedy_mark_guest_runnable(vth);
                } else {
                        spin_pdr_lock(&queue_lock);
                        TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
                        spin_pdr_unlock(&queue_lock);
                }
                break;
        case VMM_THREAD_TASK:
                spin_pdr_lock(&queue_lock);
                TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
                spin_pdr_unlock(&queue_lock);
                if (sched_is_greedy())
                        vcore_wake(0, false);
                break;
        default:
                panic("Bad vmm_thread type %p\n", vth->type);
        }
        try_to_get_vcores();
}

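/* Allocates a zeroed, aligned vmm_thread of the given type (guest, ctlr, or
 * task) for the given VM.  Returns NULL if the allocation fails. */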
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
{
        struct vmm_thread *vth;
        int ret;

        ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
                             sizeof(struct vmm_thread));
        if (ret)
                return 0;
        memset(vth, 0, sizeof(struct vmm_thread));
        vth->type = type;
        vth->vm = vm;
        return vth;
}

static void __free_stack(void *stacktop, size_t stacksize)
{
        munmap(stacktop - stacksize, stacksize);
}

static void *__alloc_stack(size_t stacksize)
{
        int force_a_page_fault;
        void *stacktop;
        void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
                              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

        if (stackbot == MAP_FAILED)
                return 0;
        stacktop = stackbot + stacksize;
        /* Want the top of the stack populated, but not the rest of the stack;
         * that'll grow on demand (up to stacksize, then will clobber memory). */
        force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
        return stacktop;
}