vmm: Use a helper for all gth lookups
[akaros.git] / user / vmm / sched.c
1 /* Copyright (c) 2016 Google Inc.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * See LICENSE for details.
4  *
5  * 2LS for virtual machines */
6
7 #include <vmm/sched.h>
8 #include <vmm/vmm.h>
9 #include <sys/mman.h>
10 #include <stdlib.h>
11 #include <assert.h>
12 #include <parlib/spinlock.h>
13 #include <parlib/event.h>
14 #include <parlib/ucq.h>
15 #include <parlib/arch/trap.h>
16 #include <parlib/ros_debug.h>
17 #include <parlib/vcore_tick.h>
18 #include <parlib/slab.h>
19
20 int vmm_sched_period_usec = 1000;
21
22 /* For now, we only have one VM managed by the 2LS.  If we ever expand that,
23  * we'll need something analogous to current_uthread, so the 2LS knows which VM
24  * it is working on. */
25 static struct virtual_machine *current_vm;
26
27 static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
28 /* Runnable queues, broken up by thread type. */
29 static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
30 static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
31 static struct vmm_thread **greedy_rnbl_guests;
32 /* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
33 static atomic_t nr_unblk_tasks;
34 static atomic_t nr_unblk_guests;
35 /* Global evq for all syscalls.  Could make this per vcore or whatever. */
36 static struct event_queue *sysc_evq;
37 static struct kmem_cache *task_thread_cache;
38
39 static void vmm_sched_init(void);
40 static void vmm_sched_entry(void);
41 static void vmm_thread_runnable(struct uthread *uth);
42 static void vmm_thread_paused(struct uthread *uth);
43 static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
44 static void vmm_thread_has_blocked(struct uthread *uth, int flags);
45 static void vmm_thread_refl_fault(struct uthread *uth,
46                                   struct user_context *ctx);
47 static void vmm_thread_exited(struct uthread *uth);
48 static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg);
49
50 struct schedule_ops vmm_sched_ops = {
51         .sched_init = vmm_sched_init,
52         .sched_entry = vmm_sched_entry,
53         .thread_runnable = vmm_thread_runnable,
54         .thread_paused = vmm_thread_paused,
55         .thread_blockon_sysc = vmm_thread_blockon_sysc,
56         .thread_has_blocked = vmm_thread_has_blocked,
57         .thread_refl_fault = vmm_thread_refl_fault,
58         .thread_exited = vmm_thread_exited,
59         .thread_create = vmm_thread_create,
60 };
61
62 struct schedule_ops *sched_ops = &vmm_sched_ops;
63
64 /* Helpers */
65 static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
66                                void *data);
67 static void acct_thread_blocked(struct vmm_thread *vth);
68 static void acct_thread_unblocked(struct vmm_thread *vth);
69 static void enqueue_vmm_thread(struct vmm_thread *vth);
70 static int task_thread_ctor(void *obj, void *priv, int flags);
71 static void task_thread_dtor(void *obj, void *priv);
72 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
73                                            int type);
74 static void *__alloc_stack(size_t stacksize);
75 static void __free_stack(void *stacktop, size_t stacksize);
76
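/* "Greedy" scheduling: parlib_never_yield is set, so we request a fixed set
 * of vcores up front (one per guest pcore, plus vcore 0 for thread0 and task
 * threads) and don't expect to give them back.  Guests are effectively
 * pinned: GPC i is run by vcore i + 1 via the greedy_rnbl_guests[] slots. */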
77 static bool sched_is_greedy(void)
78 {
79         return parlib_never_yield;
80 }
81
82 static unsigned int sched_nr_greedy_cores(void)
83 {
84         if (!current_vm)
85                 return 1;
86         return current_vm->nr_gpcs + 1;
87 }
88
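/* Restarts a uthread whose syscall completed.  Called from the syscall event
 * handler, or directly if we lose the register_evq() race below. */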
89 static void restart_thread(struct syscall *sysc)
90 {
91         struct uthread *ut_restartee = (struct uthread*)sysc->u_data;
92
93         /* uthread stuff here: */
94         assert(ut_restartee);
95         assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
96         ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
97         vmm_thread_runnable(ut_restartee);
98 }
99
100 static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
101                                void *data)
102 {
103         struct syscall *sysc;
104
105         /* We should be able to assert this now.  If not, check pthread.c (the
106          * concern was old ev_qs firing and running this handler). */
107         assert(ev_msg);
108         sysc = ev_msg->ev_arg3;
109         assert(sysc);
110         restart_thread(sysc);
111 }
112
113 /* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
114  * attempt to route the notifs/IPIs to vcoreid */
115 static struct event_queue *setup_sysc_evq(int vcoreid)
116 {
117         struct event_queue *evq;
118         uintptr_t mmap_block;
119
120         mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
121                                      PROT_WRITE | PROT_READ,
122                                      MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE,
123                                      -1, 0);
124         evq = get_eventq_raw();
125         assert(mmap_block && evq);
126         evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
127         evq->ev_vcore = vcoreid;
128         evq->ev_mbox->type = EV_MBOX_UCQ;
129         ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
130         return evq;
131 }
132
133 static void vmm_sched_init(void)
134 {
135         struct task_thread *thread0;
136
137         /* Note that thread0 doesn't belong to a VM.  We can set this during
138          * vmm_init() if we need to. */
139         thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
140         assert(thread0);
141         acct_thread_unblocked((struct vmm_thread*)thread0);
142         thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
143         thread0->stacktop = (void*)USTACKTOP;
144         /* for lack of a better vcore, might as well send to 0 */
145         sysc_evq = setup_sysc_evq(0);
146         uthread_2ls_init((struct uthread*)thread0, vmm_handle_syscall, NULL);
147         task_thread_cache = kmem_cache_create("task threads",
148                                               sizeof(struct vmm_thread),
149                                               __alignof__(struct vmm_thread), 0,
150                                               task_thread_ctor, task_thread_dtor,
151                                               NULL);
152 }
153
154 /* The scheduling policy is encapsulated in the next few functions (from here
155  * down to sched_entry()). */
156
157 static int desired_nr_vcores(void)
158 {
159         /* Sanity checks on our accounting. */
160         assert(atomic_read(&nr_unblk_guests) >= 0);
161         assert(atomic_read(&nr_unblk_tasks) >= 0);
162         /* Lockless peek.  This is always an estimate.  Some of our tasks busy-wait,
163          * so it's not enough to just give us one vcore for all tasks, yet. */
164         return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
165 }
166
167 static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
168 {
169         struct vmm_thread *vth;
170
171         vth = TAILQ_FIRST(tq);
172         if (vth)
173                 TAILQ_REMOVE(tq, vth, tq_next);
174         return vth;
175 }
176
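/* We don't have enough vcores: take whatever is runnable, tasks first. */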
177 static struct vmm_thread *pick_a_thread_degraded(void)
178 {
179         struct vmm_thread *vth;
180
181         spin_pdr_lock(&queue_lock);
182         vth = __pop_first(&rnbl_tasks);
183         if (!vth)
184                 vth = __pop_first(&rnbl_guests);
185         spin_pdr_unlock(&queue_lock);
186         return vth;
187 }
188
189 /* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
190 static struct vmm_thread *pick_a_thread_plenty(void)
191 {
192         struct vmm_thread *vth = 0;
193
194         spin_pdr_lock(&queue_lock);
195         if (!vth)
196                 vth = __pop_first(&rnbl_tasks);
197         if (!vth)
198                 vth = __pop_first(&rnbl_guests);
199         spin_pdr_unlock(&queue_lock);
200         return vth;
201 }
202
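/* Stops the current uthread, if any, and puts it back on its runnable queue.
 * Used when we're short on vcores and the vcore tick fires, so guests and
 * tasks time-share the vcores we do have. */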
203 static void yield_current_uth(void)
204 {
205         struct vmm_thread *vth;
206
207         if (!current_uthread)
208                 return;
209         vth = (struct vmm_thread*)stop_current_uthread();
210         enqueue_vmm_thread(vth);
211 }
212
213 /* Helper, tries to get the right number of vcores.  Returns TRUE if we think we
214  * have enough, FALSE otherwise.
215  *
216  * TODO: this doesn't handle a lot of issues, like preemption, how to
217  * run/yield our vcores, dynamic changes in the number of runnables, where
218  * to send events, how to avoid interfering with gpcs, etc. */
219 static bool try_to_get_vcores(void)
220 {
221         int nr_vcores_wanted;
222         bool have_enough;
223
224         if (sched_is_greedy())
225                 return num_vcores() == sched_nr_greedy_cores();
226         nr_vcores_wanted = desired_nr_vcores();
227         have_enough = nr_vcores_wanted <= num_vcores();
228         if (have_enough) {
229                 vcore_tick_disable();
230                 return TRUE;
231         }
232         vcore_tick_enable(vmm_sched_period_usec);
233         vcore_request_total(nr_vcores_wanted);
234         return FALSE;
235 }
236
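/* Bookkeeping done right before running a thread: count the run, and count a
 * resched if the thread last ran on a different vcore. */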
237 static void stats_run_vth(struct vmm_thread *vth)
238 {
239         vth->nr_runs++;
240         if (vth->prev_vcoreid != vcore_id()) {
241                 vth->prev_vcoreid = vcore_id();
242                 vth->nr_resched++;
243         }
244 }
245
246 /* TODO: This assumes we get all of our vcores. */
247 static struct vmm_thread *sched_pick_thread_greedy(void)
248 {
249         struct vmm_thread *vth;
250
251         if (current_uthread) {
252                 stats_run_vth((struct vmm_thread*)current_uthread);
253                 run_current_uthread();
254         }
255         if (vcore_id() == 0) {
256                 spin_pdr_lock(&queue_lock);
257                 vth = __pop_first(&rnbl_tasks);
258                 spin_pdr_unlock(&queue_lock);
259                 return vth;
260         }
261         /* This races with enqueue_vmm_thread, which can run on another core.
262          * Here are the rules:
263          * - set when runnable (race free, only one state for the thread at a time)
264          * - cleared when we run it (race free, we're the only runners)
265          * - if we take an interrupt, we'll just run_current_uthread and not check
266          * - if we vmexit, we'll run the buddy directly */
267         assert(vcore_id() <= current_vm->nr_gpcs);
268         vth = greedy_rnbl_guests[vcore_id() - 1];
269         if (vth)
270                 greedy_rnbl_guests[vcore_id() - 1] = NULL;
271         return vth;
272 }
273
274 static struct vmm_thread *sched_pick_thread_nice(void)
275 {
276         struct vmm_thread *vth;
277         bool have_enough;
278
279         have_enough = try_to_get_vcores();
280         if (!have_enough && vcore_tick_poll()) {
281                 /* slightly less than ideal: we grab the queue lock twice */
282                 yield_current_uth();
283         }
284         if (current_uthread) {
285                 stats_run_vth((struct vmm_thread*)current_uthread);
286                 run_current_uthread();
287         }
288         if (have_enough)
289                 vth = pick_a_thread_plenty();
290         else
291                 vth = pick_a_thread_degraded();
292         return vth;
293 }
294
295 static void __attribute__((noreturn)) vmm_sched_entry(void)
296 {
297         struct vmm_thread *vth;
298
299         if (sched_is_greedy())
300                 vth = sched_pick_thread_greedy();
301         else
302                 vth = sched_pick_thread_nice();
303         if (!vth)
304                 vcore_yield_or_restart();
305         stats_run_vth(vth);
306         run_uthread((struct uthread*)vth);
307 }
308
309 static void vmm_thread_runnable(struct uthread *uth)
310 {
311         /* A thread that was blocked is now runnable.  This counts as becoming
312          * unblocked (running + runnable) */
313         acct_thread_unblocked((struct vmm_thread*)uth);
314         enqueue_vmm_thread((struct vmm_thread*)uth);
315 }
316
317 static void vmm_thread_paused(struct uthread *uth)
318 {
319         /* The thread stopped for some reason, usually a preemption.  We'd like to
320          * just run it whenever we get a chance.  Note that it didn't become
321          * 'blocked' - it's still runnable. */
322         enqueue_vmm_thread((struct vmm_thread*)uth);
323 }
324
325 static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
326 {
327         struct syscall *sysc = (struct syscall*)syscall;
328
329         acct_thread_blocked((struct vmm_thread*)uth);
330         sysc->u_data = uth;
331         if (!register_evq(sysc, sysc_evq)) {
332                 /* Lost the race with the call being done.  The kernel won't send the
333                  * event.  Just restart him. */
334                 restart_thread(sysc);
335         }
336         /* GIANT WARNING: do not touch the thread after this point. */
337 }
338
339 static void vmm_thread_has_blocked(struct uthread *uth, int flags)
340 {
341         /* The thread blocked on something like a mutex.  It's not runnable, so we
342          * don't need to put it on a list, but we do need to account for it not
343          * running.  We'll find out (via thread_runnable) when it starts up again.
344          */
345         acct_thread_blocked((struct vmm_thread*)uth);
346 }
347
348 static void refl_error(struct uthread *uth, unsigned int trap_nr,
349                        unsigned int err, unsigned long aux)
350 {
351         printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
352                trap_nr, err, aux);
353         /* Note that uthread.c already copied out our ctx into the uth
354          * struct */
355         print_user_context(&uth->u_ctx);
356         printf("Turn on printx to spew unhandled, malignant trap info\n");
357         exit(-1);
358 }
359
360 static bool handle_page_fault(struct uthread *uth, unsigned int err,
361                               unsigned long aux)
362 {
363         if (!(err & PF_VMR_BACKED))
364                 return FALSE;
365         syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
366         __block_uthread_on_async_sysc(uth);
367         return TRUE;
368 }
369
370 static void vmm_thread_refl_hw_fault(struct uthread *uth,
371                                      unsigned int trap_nr,
372                                      unsigned int err, unsigned long aux)
373 {
374         switch (trap_nr) {
375         case HW_TRAP_PAGE_FAULT:
376                 if (!handle_page_fault(uth, err, aux))
377                         refl_error(uth, trap_nr, err, aux);
378                 break;
379         default:
380                 refl_error(uth, trap_nr, err, aux);
381         }
382 }
383
384 /* Yield callback for __ctlr_entry */
385 static void __swap_to_gth(struct uthread *uth, void *dummy)
386 {
387         struct ctlr_thread *cth = (struct ctlr_thread*)uth;
388
389         /* We just immediately run our buddy.  The ctlr and the guest are accounted
390          * together ("pass the token" back and forth). */
391         current_uthread = NULL;
392         stats_run_vth((struct vmm_thread*)cth->buddy);
393         run_uthread((struct uthread*)cth->buddy);
394         assert(0);
395 }
396
397 /* All ctlr threads start here, each time their guest has a fault.  They can
398  * block and unblock along the way.  Once a ctlr does its final uthread_yield,
399  * the next time it will start again from the top. */
400 static void __ctlr_entry(void)
401 {
402         struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
403         struct virtual_machine *vm = gth_to_vm(cth->buddy);
404
405         if (!handle_vmexit(cth->buddy)) {
406                 struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);
407
408                 fprintf(stderr, "vmm: handle_vmexit returned false\n");
409                 fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
410                 fprintf(stderr, "RSP was %p, ", (void *)vm_tf->tf_rsp);
411                 fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
412                 /* TODO: properly walk the kernel page tables to map the tf_rip
413                  * to a physical address. For now, however, this hack is good
414                  * enough.
415                  */
416                 hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
417                 showstatus(stderr, cth->buddy);
418                 exit(0);
419         }
420         /* We want to atomically yield and start/reenqueue our buddy.  We do so in
421          * vcore context on the other side of the yield. */
422         uthread_yield(FALSE, __swap_to_gth, 0);
423 }
424
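/* A guest took a vmexit that the kernel reflected to us.  We switch to the
 * guest's buddy controller thread, which restarts from __ctlr_entry on its
 * own stack and handles the exit. */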
425 static void vmm_thread_refl_vm_fault(struct uthread *uth)
426 {
427         struct guest_thread *gth = (struct guest_thread*)uth;
428         struct ctlr_thread *cth = gth->buddy;
429
430         gth->nr_vmexits++;
431         /* The ctlr starts from the top every time we get a new fault. */
432         cth->uthread.flags |= UTHREAD_SAVED;
433         init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
434                       (uintptr_t)(cth->stacktop));
435         /* We just immediately run our buddy.  The ctlr and the guest are accounted
436          * together ("pass the token" back and forth). */
437         current_uthread = NULL;
438         stats_run_vth((struct vmm_thread*)cth);
439         run_uthread((struct uthread*)cth);
440         assert(0);
441 }
442
443 static void vmm_thread_refl_fault(struct uthread *uth,
444                                   struct user_context *ctx)
445 {
446         switch (ctx->type) {
447         case ROS_HW_CTX:
448                 /* Guests should only ever VM exit */
449                 assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
450                 vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
451                                          __arch_refl_get_err(ctx),
452                                          __arch_refl_get_aux(ctx));
453                 break;
454         case ROS_VM_CTX:
455                 vmm_thread_refl_vm_fault(uth);
456                 break;
457         default:
458                 assert(0);
459         }
460 }
461
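/* Slab destructor: the task thread's stack is allocated once in
 * task_thread_ctor() and lives as long as the cached object, so it is only
 * freed here, not in vmm_thread_exited(). */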
462 static void task_thread_dtor(void *obj, void *priv)
463 {
464         struct task_thread *tth = (struct task_thread*)obj;
465
466         __free_stack(tth->stacktop, tth->stacksize);
467 }
468
469 static void vmm_thread_exited(struct uthread *uth)
470 {
471         struct vmm_thread *vth = (struct vmm_thread*)uth;
472         struct task_thread *tth = (struct task_thread*)uth;
473
474         /* Catch bugs.  Right now, only task threads can exit. */
475         assert(vth->type == VMM_THREAD_TASK);
476
477         acct_thread_blocked((struct vmm_thread*)tth);
478         uthread_cleanup(uth);
479         if (uth->flags & UTHREAD_IS_THREAD0)
480                 return;
481         kmem_cache_free(task_thread_cache, tth);
482 }
483
484 static void destroy_guest_thread(struct guest_thread *gth)
485 {
486         struct ctlr_thread *cth = gth->buddy;
487
488         __free_stack(cth->stacktop, cth->stacksize);
489         uthread_cleanup((struct uthread*)cth);
490         free(cth);
491         uthread_cleanup((struct uthread*)gth);
492         free(gth);
493 }
494
495 static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
496                                                 unsigned int gpcoreid,
497                                                 struct vmm_gpcore_init *gpci)
498 {
499         struct guest_thread *gth;
500         struct ctlr_thread *cth;
501         /* Guests won't use TLS; they always operate in Ring V.  The controller
502          * might - not because of anything we do, but because of glibc calls. */
503         struct uth_thread_attr gth_attr = {.want_tls = FALSE};
504         struct uth_thread_attr cth_attr = {.want_tls = TRUE};
505
506         gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
507         cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
508         if (!gth || !cth) {
509                 free(gth);
510                 free(cth);
511                 return 0;
512         }
513         gth->buddy = cth;
514         cth->buddy = gth;
515         gth->gpc_id = gpcoreid;
516         gth->gpci = *gpci;
517         cth->stacksize = VMM_THR_STACKSIZE;
518         cth->stacktop = __alloc_stack(cth->stacksize);
519         if (!cth->stacktop) {
520                 free(gth);
521                 free(cth);
522                 return 0;
523         }
524         gth->uthread.u_ctx.type = ROS_VM_CTX;
525         gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
526         uthread_init((struct uthread*)gth, &gth_attr);
527         uthread_init((struct uthread*)cth, &cth_attr);
528         gth->halt_mtx = uth_mutex_alloc();
529         gth->halt_cv = uth_cond_var_alloc();
530         return gth;
531 }
532
533 static void ev_handle_diag(struct event_msg *ev_msg, unsigned int ev_type,
534                            void *data)
535 {
536         struct virtual_machine *vm = current_vm;
537         struct guest_thread *gth;
538         struct ctlr_thread *cth;
539         bool reset = FALSE;
540
541         if (ev_msg && (ev_msg->ev_arg1 == 1))
542                 reset = TRUE;
543
544         fprintf(stderr, "\nSCHED stats:\n---------------\n");
545         for (int i = 0; i < vm->nr_gpcs; i++) {
546                 gth = gpcid_to_gth(vm, i);
547                 cth = gth->buddy;
548                 fprintf(stderr, "\tGPC %2d: %lu resched, %lu gth runs, %lu ctl runs, %lu user-handled vmexits\n",
549                                 i,
550                         ((struct vmm_thread*)gth)->nr_resched,
551                         ((struct vmm_thread*)gth)->nr_runs,
552                         ((struct vmm_thread*)cth)->nr_runs,
553                         gth->nr_vmexits);
554                 if (reset) {
555                     ((struct vmm_thread*)gth)->nr_resched = 0;
556                     ((struct vmm_thread*)gth)->nr_runs = 0;
557                     ((struct vmm_thread*)cth)->nr_runs = 0;
558                     gth->nr_vmexits = 0;
559                 }
560         }
561         fprintf(stderr, "\n\tNr unblocked gpc %lu, Nr unblocked tasks %lu\n",
562                 atomic_read(&nr_unblk_guests), atomic_read(&nr_unblk_tasks));
563 }
564
565 int vmm_init(struct virtual_machine *vm, struct vmm_gpcore_init *gpcis,
566              int flags)
567 {
568         struct guest_thread **gths;
569
570         if (current_vm)
571                 return -1;
572         current_vm = vm;
573         /* We should tell the kernel to create all of the GPCs we'll need in
574          * advance.
575          *
576          * We could create the others on the fly, but the kernel's answer for
577          * CPUID[0x1] will not have the total number of cores.  If we move that
578          * handler to userspace, we can create the SMP-booted GPCs on the fly.
579          *
580          * We'd also have to deal with gths[] growing dynamically, which would
581          * require synchronization. */
582         if (syscall(SYS_vmm_add_gpcs, vm->nr_gpcs, gpcis) != vm->nr_gpcs)
583                 return -1;
584         if (flags) {
585                 if (syscall(SYS_vmm_ctl, VMM_CTL_SET_FLAGS, flags))
586                         return -1;
587         }
588         gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
589         if (!gths)
590                 return -1;
591         for (int i = 0; i < vm->nr_gpcs; i++) {
592                 gths[i] = create_guest_thread(vm, i, &gpcis[i]);
593                 if (!gths[i]) {
594                         for (int j = 0; j < i; j++)
595                                 destroy_guest_thread(gths[j]);
596                         free(gths);
597                         return -1;
598                 }
599         }
600         vm->__gths = gths;
601         uthread_mcp_init();
602         register_ev_handler(EV_FREE_APPLE_PIE, ev_handle_diag, NULL);
603         if (sched_is_greedy()) {
604                 greedy_rnbl_guests = calloc(vm->nr_gpcs, sizeof(struct vmm_thread *));
605                 assert(greedy_rnbl_guests);
606                 vcore_request_total(sched_nr_greedy_cores());
607                 syscall(SYS_vmm_ctl, VMM_CTL_SET_EXITS,
608                         syscall(SYS_vmm_ctl, VMM_CTL_GET_EXITS) & ~VMM_CTL_EXIT_HALT);
609         }
610         return 0;
611 }
612
613 void start_guest_thread(struct guest_thread *gth)
614 {
615         acct_thread_unblocked((struct vmm_thread*)gth);
616         enqueue_vmm_thread((struct vmm_thread*)gth);
617 }
618
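/* Entry point for all task threads: run the function we were given, then exit
 * through the 2LS. */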
619 static void __task_thread_run(void)
620 {
621         struct task_thread *tth = (struct task_thread*)current_uthread;
622
623         uth_2ls_thread_exit(tth->func(tth->arg));
624 }
625
626 static int task_thread_ctor(void *obj, void *priv, int flags)
627 {
628         struct vmm_thread *vth = (struct vmm_thread*)obj;
629         struct task_thread *tth = (struct task_thread*)obj;
630
631         memset(vth, 0, sizeof(struct vmm_thread));
632         vth->type = VMM_THREAD_TASK;
633         vth->vm = current_vm;
634         tth->stacksize = VMM_THR_STACKSIZE;
635         tth->stacktop = __alloc_stack(tth->stacksize);
636         if (!tth->stacktop)
637                 return -1;
638         return 0;
639 }
640
641 /* Helper, creates and starts a task thread. */
642 static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
643                                           void *(*func)(void *), void *arg,
644                                           struct uth_thread_attr *tth_attr)
645 {
646         struct task_thread *tth;
647
648         tth = kmem_cache_alloc(task_thread_cache, 0);
649         tth->func = func;
650         tth->arg = arg;
651         init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
652                       (uintptr_t)(tth->stacktop));
653         uthread_init((struct uthread*)tth, tth_attr);
654         acct_thread_unblocked((struct vmm_thread*)tth);
655         enqueue_vmm_thread((struct vmm_thread*)tth);
656         return tth;
657 }
658
659 struct task_thread *vmm_run_task(struct virtual_machine *vm,
660                                  void *(*func)(void *), void *arg)
661 {
662         struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = TRUE};
663
664         return __vmm_run_task(vm, func, arg, &tth_attr);
665 }
666
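/* Our 2LS's generic thread_create op.  Unlike vmm_run_task(), these threads
 * are created joinable (not detached) and need not belong to a VM. */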
667 static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg)
668 {
669         struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = FALSE};
670         struct task_thread *tth;
671
672         /* It's OK to not have a VM for a generic thread */
673         tth = __vmm_run_task(NULL, func, arg, &tth_attr);
674         /* But just in case, let's poison it */
675         ((struct vmm_thread*)tth)->vm = (void*)0xdeadbeef;
676         return (struct uthread*)tth;
677 }
678
679 /* Helpers for tracking nr_unblk_* threads. */
680 static void acct_thread_blocked(struct vmm_thread *vth)
681 {
682         switch (vth->type) {
683         case VMM_THREAD_GUEST:
684         case VMM_THREAD_CTLR:
685                 atomic_dec(&nr_unblk_guests);
686                 break;
687         case VMM_THREAD_TASK:
688                 atomic_dec(&nr_unblk_tasks);
689                 break;
690         }
691 }
692
693 static void acct_thread_unblocked(struct vmm_thread *vth)
694 {
695         switch (vth->type) {
696         case VMM_THREAD_GUEST:
697         case VMM_THREAD_CTLR:
698                 atomic_inc(&nr_unblk_guests);
699                 break;
700         case VMM_THREAD_TASK:
701                 atomic_inc(&nr_unblk_tasks);
702                 break;
703         }
704 }
705
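/* Greedy mode: publish the runnable guest (or its ctlr buddy) in its gpcore's
 * slot.  sched_pick_thread_greedy() running on vcore gpcid + 1 consumes it;
 * see the race rules there. */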
706 static void greedy_mark_guest_runnable(struct vmm_thread *vth)
707 {
708         int gpcid;
709
710         if (vth->type == VMM_THREAD_GUEST)
711                 gpcid = ((struct guest_thread*)vth)->gpc_id;
712         else
713                 gpcid = ((struct ctlr_thread*)vth)->buddy->gpc_id;
714         /* racing with the reader */
715         greedy_rnbl_guests[gpcid] = vth;
716 }
717
718 static void enqueue_vmm_thread(struct vmm_thread *vth)
719 {
720         switch (vth->type) {
721         case VMM_THREAD_GUEST:
722         case VMM_THREAD_CTLR:
723                 if (sched_is_greedy()) {
724                         greedy_mark_guest_runnable(vth);
725                 } else {
726                         spin_pdr_lock(&queue_lock);
727                         TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
728                         spin_pdr_unlock(&queue_lock);
729                 }
730                 break;
731         case VMM_THREAD_TASK:
732                 spin_pdr_lock(&queue_lock);
733                 TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
734                 spin_pdr_unlock(&queue_lock);
735                 break;
736         default:
737                 panic("Bad vmm_thread type %p\n", vth->type);
738         }
739         try_to_get_vcores();
740 }
741
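/* Common allocator for guest and ctlr threads (and thread0).  Regular task
 * threads come from task_thread_cache instead. */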
742 static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
743 {
744         struct vmm_thread *vth;
745         int ret;
746
747         ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
748                              sizeof(struct vmm_thread));
749         if (ret)
750                 return 0;
751         memset(vth, 0, sizeof(struct vmm_thread));
752         vth->type = type;
753         vth->vm = vm;
754         return vth;
755 }
756
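/* Stack helpers: stacktop is the highest address of the stack; the mapping
 * starts stacksize bytes below it. */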
757 static void __free_stack(void *stacktop, size_t stacksize)
758 {
759         munmap(stacktop - stacksize, stacksize);
760 }
761
762 static void *__alloc_stack(size_t stacksize)
763 {
764         int force_a_page_fault;
765         void *stacktop;
766         void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
767                               MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
768
769         if (stackbot == MAP_FAILED)
770                 return 0;
771         stacktop = stackbot + stacksize;
772         /* Want the top of the stack populated, but not the rest of the stack;
773          * that'll grow on demand (up to stacksize, then will clobber memory). */
774         force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
775         return stacktop;
776 }