vmm: Collect and report scheduler stats
[akaros.git] / user / vmm / sched.c
/* Copyright (c) 2016 Google Inc.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * 2LS for virtual machines */

#include <vmm/sched.h>
#include <vmm/vmm.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <assert.h>
#include <parlib/spinlock.h>
#include <parlib/event.h>
#include <parlib/ucq.h>
#include <parlib/arch/trap.h>
#include <parlib/ros_debug.h>
#include <parlib/vcore_tick.h>
#include <parlib/slab.h>

int vmm_sched_period_usec = 1000;

/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
 * we'll need something analogous to current_uthread, so the 2LS knows which VM
 * it is working on. */
static struct virtual_machine *current_vm;

static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
/* Runnable queues, broken up by thread type. */
static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
/* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
static atomic_t nr_unblk_tasks;
static atomic_t nr_unblk_guests;
/* Global evq for all syscalls.  Could make this per vcore or whatever. */
static struct event_queue *sysc_evq;
static struct kmem_cache *task_thread_cache;

static void vmm_sched_init(void);
static void vmm_sched_entry(void);
static void vmm_thread_runnable(struct uthread *uth);
static void vmm_thread_paused(struct uthread *uth);
static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
static void vmm_thread_has_blocked(struct uthread *uth, int flags);
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx);
static void vmm_thread_exited(struct uthread *uth);
static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg);

struct schedule_ops vmm_sched_ops = {
        .sched_init = vmm_sched_init,
        .sched_entry = vmm_sched_entry,
        .thread_runnable = vmm_thread_runnable,
        .thread_paused = vmm_thread_paused,
        .thread_blockon_sysc = vmm_thread_blockon_sysc,
        .thread_has_blocked = vmm_thread_has_blocked,
        .thread_refl_fault = vmm_thread_refl_fault,
        .thread_exited = vmm_thread_exited,
        .thread_create = vmm_thread_create,
};

struct schedule_ops *sched_ops = &vmm_sched_ops;

/* Helpers */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data);
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
static void enqueue_vmm_thread(struct vmm_thread *vth);
static int task_thread_ctor(void *obj, void *priv, int flags);
static void task_thread_dtor(void *obj, void *priv);
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type);
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);

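/* Restarts a uthread that blocked on the syscall stashed in sysc->u_data, once
 * that syscall has completed. */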
static void restart_thread(struct syscall *sysc)
{
        struct uthread *ut_restartee = (struct uthread*)sysc->u_data;

        /* uthread stuff here: */
        assert(ut_restartee);
        assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
        ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
        vmm_thread_runnable(ut_restartee);
}

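/* Event handler for syscall-completion events: the finished syscall arrives in
 * ev_arg3, and the uthread blocked on it gets restarted. */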
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data)
{
        struct syscall *sysc;

        /* I think we can make this assert now.  If not, check pthread.c.  (The
         * concern was old ev_qs firing and running this handler.) */
        assert(ev_msg);
        sysc = ev_msg->ev_arg3;
        assert(sysc);
        restart_thread(sysc);
}

/* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
 * attempt to route the notifs/IPIs to vcoreid */
static struct event_queue *setup_sysc_evq(int vcoreid)
{
        struct event_queue *evq;
        uintptr_t mmap_block;

        mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
                                     PROT_WRITE | PROT_READ,
                                     MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE,
                                     -1, 0);
        evq = get_eventq_raw();
        assert(mmap_block && evq);
        evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
        evq->ev_vcore = vcoreid;
        evq->ev_mbox->type = EV_MBOX_UCQ;
        ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
        return evq;
}

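/* 2LS init: wraps the caller (thread0) in a task_thread, sets up the global
 * syscall event queue, and creates the slab cache used for later task
 * threads. */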
static void vmm_sched_init(void)
{
        struct task_thread *thread0;

        /* Note that thread0 doesn't belong to a VM.  We can set this during
         * vmm_init() if we need to. */
        thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
        assert(thread0);
        acct_thread_unblocked((struct vmm_thread*)thread0);
        thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
        thread0->stacktop = (void*)USTACKTOP;
        /* for lack of a better vcore, might as well send to 0 */
        sysc_evq = setup_sysc_evq(0);
        uthread_2ls_init((struct uthread*)thread0, vmm_handle_syscall, NULL);
        task_thread_cache = kmem_cache_create("task threads",
                                              sizeof(struct vmm_thread),
                                              __alignof__(struct vmm_thread), 0,
                                              task_thread_ctor, task_thread_dtor,
                                              NULL);
}

/* The scheduling policy is encapsulated in the next few functions (from here
 * down to sched_entry()). */

static int desired_nr_vcores(void)
{
        /* Sanity checks on our accounting. */
        assert(atomic_read(&nr_unblk_guests) >= 0);
        assert(atomic_read(&nr_unblk_tasks) >= 0);
        /* Lockless peek.  This is always an estimate.  Some of our tasks
         * busy-wait, so it's not enough to just give us one vcore for all
         * tasks, yet. */
        return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
}

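/* Helper: dequeues the first thread from a runqueue, or NULL if it is empty.
 * Caller holds queue_lock. */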
static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
{
        struct vmm_thread *vth;

        vth = TAILQ_FIRST(tq);
        if (vth)
                TAILQ_REMOVE(tq, vth, tq_next);
        return vth;
}

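/* We're short on vcores: pick a single runnable thread, preferring tasks over
 * guests. */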
static struct vmm_thread *pick_a_thread_degraded(void)
{
        struct vmm_thread *vth;

        spin_pdr_lock(&queue_lock);
        vth = __pop_first(&rnbl_tasks);
        if (!vth)
                vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
}

/* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
static struct vmm_thread *pick_a_thread_plenty(void)
{
        struct vmm_thread *vth = 0;

        spin_pdr_lock(&queue_lock);
        if (!vth)
                vth = __pop_first(&rnbl_tasks);
        if (!vth)
                vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
}

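/* Stops the vcore's current uthread, if any, and puts it back on a runqueue. */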
static void yield_current_uth(void)
{
        struct vmm_thread *vth;

        if (!current_uthread)
                return;
        vth = (struct vmm_thread*)stop_current_uthread();
        enqueue_vmm_thread(vth);
}

/* Helper, tries to get the right number of vcores.  Returns TRUE if we think we
 * have enough, FALSE otherwise.
 *
 * TODO: this doesn't handle a lot of issues, like preemption, how to
 * run/yield our vcores, dynamic changes in the number of runnables, where
 * to send events, how to avoid interfering with gpcs, etc. */
static bool try_to_get_vcores(void)
{
        int nr_vcores_wanted = desired_nr_vcores();
        bool have_enough = nr_vcores_wanted <= num_vcores();

        if (have_enough) {
                vcore_tick_disable();
                return TRUE;
        }
        vcore_tick_enable(vmm_sched_period_usec);
        vcore_request_total(nr_vcores_wanted);
        return FALSE;
}

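/* Stats, reported by the diagnostic handler below: count every run of a
 * thread, and count a 'resched' whenever it runs on a different vcore than it
 * did last time. */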
static void stats_run_vth(struct vmm_thread *vth)
{
        vth->nr_runs++;
        if (vth->prev_vcoreid != vcore_id()) {
                vth->prev_vcoreid = vcore_id();
                vth->nr_resched++;
        }
}

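/* Vcore-context entry point.  Requests vcores if we're short, preempts the
 * current uthread on a tick when degraded, then resumes the current uthread or
 * picks a new one.  Yields the vcore if there is nothing to run. */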
static void __attribute__((noreturn)) vmm_sched_entry(void)
{
        struct vmm_thread *vth;
        bool have_enough;

        have_enough = try_to_get_vcores();
        if (!have_enough && vcore_tick_poll()) {
                /* slightly less than ideal: we grab the queue lock twice */
                yield_current_uth();
        }
        if (current_uthread) {
                stats_run_vth((struct vmm_thread*)current_uthread);
                run_current_uthread();
        }
        if (have_enough)
                vth = pick_a_thread_plenty();
        else
                vth = pick_a_thread_degraded();
        if (!vth)
                vcore_yield_or_restart();
        stats_run_vth(vth);
        run_uthread((struct uthread*)vth);
}

static void vmm_thread_runnable(struct uthread *uth)
{
        /* A thread that was blocked is now runnable.  This counts as becoming
         * unblocked (running + runnable) */
        acct_thread_unblocked((struct vmm_thread*)uth);
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_paused(struct uthread *uth)
{
        /* The thread stopped for some reason, usually a preemption.  We'd like to
         * just run it whenever we get a chance.  Note that it didn't become
         * 'blocked' - it's still runnable. */
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
{
        struct syscall *sysc = (struct syscall*)syscall;

        acct_thread_blocked((struct vmm_thread*)uth);
        sysc->u_data = uth;
        if (!register_evq(sysc, sysc_evq)) {
                /* Lost the race with the call being done.  The kernel won't send the
                 * event.  Just restart him. */
                restart_thread(sysc);
        }
        /* GIANT WARNING: do not touch the thread after this point. */
}

static void vmm_thread_has_blocked(struct uthread *uth, int flags)
{
        /* The thread blocked on something like a mutex.  It's not runnable, so we
         * don't need to put it on a list, but we do need to account for it not
         * running.  We'll find out (via thread_runnable) when it starts up again.
         */
        acct_thread_blocked((struct vmm_thread*)uth);
}

static void refl_error(struct uthread *uth, unsigned int trap_nr,
                       unsigned int err, unsigned long aux)
{
        printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
               trap_nr, err, aux);
        /* Note that uthread.c already copied out our ctx into the uth
         * struct */
        print_user_context(&uth->u_ctx);
        printf("Turn on printx to spew unhandled, malignant trap info\n");
        exit(-1);
}

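/* Tries to handle a reflected page fault asynchronously.  Only faults on
 * VMR-backed addresses can be serviced; if so, we fire a SYS_populate_va and
 * block the uthread on it, returning TRUE. */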
static bool handle_page_fault(struct uthread *uth, unsigned int err,
                              unsigned long aux)
{
        if (!(err & PF_VMR_BACKED))
                return FALSE;
        syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
        __block_uthread_on_async_sysc(uth);
        return TRUE;
}

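/* Reflected hardware trap (from a task or ctlr thread, never a guest).  Page
 * faults we try to service; anything else is fatal. */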
static void vmm_thread_refl_hw_fault(struct uthread *uth,
                                     unsigned int trap_nr,
                                     unsigned int err, unsigned long aux)
{
        switch (trap_nr) {
        case HW_TRAP_PAGE_FAULT:
                if (!handle_page_fault(uth, err, aux))
                        refl_error(uth, trap_nr, err, aux);
                break;
        default:
                refl_error(uth, trap_nr, err, aux);
        }
}

/* Yield callback for __ctlr_entry */
static void __swap_to_gth(struct uthread *uth, void *dummy)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)uth;

        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
        stats_run_vth((struct vmm_thread*)cth->buddy);
        run_uthread((struct uthread*)cth->buddy);
        assert(0);
}

/* All ctlr threads start here, each time their guest has a fault.  They can
 * block and unblock along the way.  Once a ctlr does its final uthread_yield,
 * the next time it will start again from the top. */
static void __ctlr_entry(void)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
        struct virtual_machine *vm = gth_to_vm(cth->buddy);

        if (!handle_vmexit(cth->buddy)) {
                struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);

                fprintf(stderr, "vmm: handle_vmexit returned false\n");
                fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
                fprintf(stderr, "RSP was %p, ", (void *)vm_tf->tf_rsp);
                fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
                /* TODO: properly walk the kernel page tables to map the tf_rip
                 * to a physical address. For now, however, this hack is good
                 * enough.
                 */
                hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
                showstatus(stderr, cth->buddy);
                exit(0);
        }
        /* We want to atomically yield and start/reenqueue our buddy.  We do so in
         * vcore context on the other side of the yield. */
        uthread_yield(FALSE, __swap_to_gth, 0);
}

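/* A guest thread had a VM exit.  Count it, then hand the vcore to the guest's
 * buddy ctlr thread, which restarts from __ctlr_entry on its own stack. */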
static void vmm_thread_refl_vm_fault(struct uthread *uth)
{
        struct guest_thread *gth = (struct guest_thread*)uth;
        struct ctlr_thread *cth = gth->buddy;

        gth->nr_vmexits++;
        /* The ctlr starts from the top every time we get a new fault. */
        cth->uthread.flags |= UTHREAD_SAVED;
        init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
                      (uintptr_t)(cth->stacktop));
        /* We just immediately run our buddy.  The ctlr and the guest are accounted
         * together ("pass the token" back and forth). */
        current_uthread = NULL;
        stats_run_vth((struct vmm_thread*)cth);
        run_uthread((struct uthread*)cth);
        assert(0);
}

static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx)
{
        switch (ctx->type) {
        case ROS_HW_CTX:
                /* Guests should only ever VM exit */
                assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
                vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
                                         __arch_refl_get_err(ctx),
                                         __arch_refl_get_aux(ctx));
                break;
        case ROS_VM_CTX:
                vmm_thread_refl_vm_fault(uth);
                break;
        default:
                assert(0);
        }
}

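/* Slab destructor: frees the stack the constructor allocated (run when the
 * cache reclaims the backing object, not on every kmem_cache_free). */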
static void task_thread_dtor(void *obj, void *priv)
{
        struct task_thread *tth = (struct task_thread*)obj;

        __free_stack(tth->stacktop, tth->stacksize);
}

static void vmm_thread_exited(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;
        struct task_thread *tth = (struct task_thread*)uth;

        /* Catch bugs.  Right now, only task threads can exit. */
        assert(vth->type == VMM_THREAD_TASK);

        acct_thread_blocked((struct vmm_thread*)tth);
        uthread_cleanup(uth);
        if (uth->flags & UTHREAD_IS_THREAD0)
                return;
        kmem_cache_free(task_thread_cache, tth);
}

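/* Tears down a guest thread and its buddy ctlr, including the ctlr's stack. */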
static void destroy_guest_thread(struct guest_thread *gth)
{
        struct ctlr_thread *cth = gth->buddy;

        __free_stack(cth->stacktop, cth->stacksize);
        uthread_cleanup((struct uthread*)cth);
        free(cth);
        uthread_cleanup((struct uthread*)gth);
        free(gth);
}

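/* Creates a guest thread for guest pcore gpcoreid, paired with a ctlr thread
 * ("buddy") that runs whenever the guest VM exits. */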
static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
                                                unsigned int gpcoreid)
{
        struct guest_thread *gth;
        struct ctlr_thread *cth;
        /* Guests won't use TLS; they always operate in Ring V.  The controller
         * might - not because of anything we do, but because of glibc calls. */
        struct uth_thread_attr gth_attr = {.want_tls = FALSE};
        struct uth_thread_attr cth_attr = {.want_tls = TRUE};

        gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
        cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
        if (!gth || !cth) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->buddy = cth;
        cth->buddy = gth;
        gth->gpc_id = gpcoreid;
        cth->stacksize = VMM_THR_STACKSIZE;
        cth->stacktop = __alloc_stack(cth->stacksize);
        if (!cth->stacktop) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->uthread.u_ctx.type = ROS_VM_CTX;
        gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
        /* No need to init the ctlr.  It gets re-init'd each time it starts. */
        uthread_init((struct uthread*)gth, &gth_attr);
        uthread_init((struct uthread*)cth, &cth_attr);
        /* TODO: give it a correct FP state.  Our current one is probably fine */
        restore_fp_state(&gth->uthread.as);
        gth->uthread.flags |= UTHREAD_FPSAVED;
        gth->halt_mtx = uth_mutex_alloc();
        gth->halt_cv = uth_cond_var_alloc();
        return gth;
}

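/* Diagnostic event handler: dumps the per-gpcore scheduler stats (resched,
 * runs, user-handled vmexits) and the unblocked counts to stderr.  If
 * ev_arg1 == 1, the counters are reset after printing. */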
static void ev_handle_diag(struct event_msg *ev_msg, unsigned int ev_type,
                           void *data)
{
        struct virtual_machine *vm = current_vm;
        struct guest_thread *gth;
        struct ctlr_thread *cth;
        bool reset = FALSE;

        if (ev_msg && (ev_msg->ev_arg1 == 1))
                reset = TRUE;

        fprintf(stderr, "\nSCHED stats:\n---------------\n");
        for (int i = 0; i < vm->nr_gpcs; i++) {
                gth = vm->gths[i];
                cth = gth->buddy;
                fprintf(stderr, "\tGPC %2d: %lu resched, %lu gth runs, %lu ctl runs, %lu user-handled vmexits\n",
                        i,
                        ((struct vmm_thread*)gth)->nr_resched,
                        ((struct vmm_thread*)gth)->nr_runs,
                        ((struct vmm_thread*)cth)->nr_runs,
                        gth->nr_vmexits);
                if (reset) {
                        ((struct vmm_thread*)gth)->nr_resched = 0;
                        ((struct vmm_thread*)gth)->nr_runs = 0;
                        ((struct vmm_thread*)cth)->nr_runs = 0;
                        gth->nr_vmexits = 0;
                }
        }
        fprintf(stderr, "\n\tNr unblocked gpc %lu, Nr unblocked tasks %lu\n",
                atomic_read(&nr_unblk_guests), atomic_read(&nr_unblk_tasks));
}

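/* Sets up the process for running the VM: asks the kernel for the guest pcores
 * via SYS_vmm_setup, creates a guest/ctlr thread pair per gpc, and registers
 * the diagnostic stats handler.  Returns 0 on success, -1 on error. */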
int vmm_init(struct virtual_machine *vm, int flags)
{
        struct guest_thread **gths;

        if (current_vm)
                return -1;
        current_vm = vm;
        if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
                return -1;
        gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
        if (!gths)
                return -1;
        for (int i = 0; i < vm->nr_gpcs; i++) {
                gths[i] = create_guest_thread(vm, i);
                if (!gths[i]) {
                        for (int j = 0; j < i; j++)
                                destroy_guest_thread(gths[j]);
                        free(gths);
                        return -1;
                }
        }
        vm->gths = gths;
        uthread_mcp_init();
        register_ev_handler(EV_FREE_APPLE_PIE, ev_handle_diag, NULL);
        return 0;
}

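/* Makes a guest thread runnable for the first time; from here on the 2LS
 * schedules it like any other unblocked guest. */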
void start_guest_thread(struct guest_thread *gth)
{
        acct_thread_unblocked((struct vmm_thread*)gth);
        enqueue_vmm_thread((struct vmm_thread*)gth);
}

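/* Entry point for task threads: run the user's function, then exit the uthread
 * with its return value. */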
static void __task_thread_run(void)
{
        struct task_thread *tth = (struct task_thread*)current_uthread;

        uth_2ls_thread_exit(tth->func(tth->arg));
}

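/* Slab constructor: zeroes the thread and allocates its stack.  This runs when
 * the cache creates the backing object, so the stack is reused across
 * alloc/free cycles. */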
static int task_thread_ctor(void *obj, void *priv, int flags)
{
        struct vmm_thread *vth = (struct vmm_thread*)obj;
        struct task_thread *tth = (struct task_thread*)obj;

        memset(vth, 0, sizeof(struct vmm_thread));
        vth->type = VMM_THREAD_TASK;
        vth->vm = current_vm;
        tth->stacksize = VMM_THR_STACKSIZE;
        tth->stacktop = __alloc_stack(tth->stacksize);
        if (!tth->stacktop)
                return -1;
        return 0;
}

/* Helper, creates and starts a task thread. */
static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
                                          void *(*func)(void *), void *arg,
                                          struct uth_thread_attr *tth_attr)
{
        struct task_thread *tth;

        tth = kmem_cache_alloc(task_thread_cache, 0);
        tth->func = func;
        tth->arg = arg;
        init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
                      (uintptr_t)(tth->stacktop));
        uthread_init((struct uthread*)tth, tth_attr);
        acct_thread_unblocked((struct vmm_thread*)tth);
        enqueue_vmm_thread((struct vmm_thread*)tth);
        return tth;
}

struct task_thread *vmm_run_task(struct virtual_machine *vm,
                                 void *(*func)(void *), void *arg)
{
        struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = TRUE};

        return __vmm_run_task(vm, func, arg, &tth_attr);
}

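/* 2LS thread_create op: creates a joinable (non-detached) task thread that
 * isn't tied to any VM. */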
static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg)
{
        struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = FALSE};
        struct task_thread *tth;

        /* It's OK to not have a VM for a generic thread */
        tth = __vmm_run_task(NULL, func, arg, &tth_attr);
        /* But just in case, let's poison it */
        ((struct vmm_thread*)tth)->vm = (void*)0xdeadbeef;
        return (struct uthread*)tth;
}

/* Helpers for tracking nr_unblk_* threads. */
static void acct_thread_blocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_dec(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_dec(&nr_unblk_tasks);
                break;
        }
}

static void acct_thread_unblocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_inc(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_inc(&nr_unblk_tasks);
                break;
        }
}

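/* Puts a runnable thread on the queue for its type, then pokes vcore
 * management, since a new runnable may mean we want another vcore. */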
static void enqueue_vmm_thread(struct vmm_thread *vth)
{
        spin_pdr_lock(&queue_lock);
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
                break;
        case VMM_THREAD_TASK:
                TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
                break;
        default:
                panic("Bad vmm_thread type %d\n", vth->type);
        }
        spin_pdr_unlock(&queue_lock);
        try_to_get_vcores();
}

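/* Allocates a zeroed, properly aligned vmm_thread of the given type, tied to
 * vm (which may be NULL).  Returns NULL on allocation failure. */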
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
{
        struct vmm_thread *vth;
        int ret;

        ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
                             sizeof(struct vmm_thread));
        if (ret)
                return 0;
        memset(vth, 0, sizeof(struct vmm_thread));
        vth->type = type;
        vth->vm = vm;
        return vth;
}

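/* Stack helpers.  Stacks are referred to by their top; mmap/munmap operate on
 * the bottom (stacktop - stacksize). */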
static void __free_stack(void *stacktop, size_t stacksize)
{
        munmap(stacktop - stacksize, stacksize);
}

static void *__alloc_stack(size_t stacksize)
{
        int force_a_page_fault;
        void *stacktop;
        void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
                              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

        if (stackbot == MAP_FAILED)
                return 0;
        stacktop = stackbot + stacksize;
        /* Want the top of the stack populated, but not the rest of the stack;
         * that'll grow on demand (up to stacksize, then will clobber memory). */
        force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
        return stacktop;
}