Add a 2LS for VMMs
author    Barret Rhoden <brho@cs.berkeley.edu>
          Wed, 27 Apr 2016 22:14:24 +0000 (18:14 -0400)
committer Barret Rhoden <brho@cs.berkeley.edu>
          Mon, 2 May 2016 21:11:15 +0000 (17:11 -0400)
We now have a basic 2LS for VMMs.  It doesn't handle preemption or other
serious issues, but it does the basics.  Specifically, it supports
vmrunkernel, which is no longer a pthread-based app.

You can control whether vmrunkernel runs as an SCP or an MCP, as well as
whether or not it yields, via the parlib_ control variables.  We can add
command-line arguments to vmrunkernel for that later.
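
For reference, the knobs and the new setup flow look roughly like this
(condensed from the vmrunkernel diff below; trapframe setup and error
handling elided):

	parlib_wants_to_be_mcp = TRUE;	/* MCP vs. SCP */
	parlib_never_yield = FALSE;	/* whether our vcores yield */
	vm->nr_gpcs = 1;
	vm->gpcis = &gpci;
	ret = vmm_init(vm, vmmflags);
	vmm_run_task(vm, timer_thread, 0);
	start_guest_thread(vm->gths[0]);
	uthread_sleep_forever();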

Ancillary changes:
- virtio uses the VMM 2LS, instead of pthreads
- renamed the apic() function (too generic)
- the vmexit switch code was moved out of vmrunkernel and refactored.  It
  should be easier to follow; see the sketch after this list.  In the
  process, I removed the empty apic() case and the unused interrupt-window
  case.
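
To illustrate the refactored vmexit path: supporting a new exit reason is
now one helper plus one switch arm in handle_vmexit().  A hypothetical
sketch (handle_cpuid and its wiring are invented here for illustration,
not part of this commit):

	static bool handle_cpuid(struct guest_thread *gth)
	{
		struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

		/* Emulate into tf_rax/rbx/rcx/rdx, then skip the
		 * 2-byte cpuid instruction (0f a2). */
		vm_tf->tf_rip += 2;
		return TRUE;
	}

	/* ... and in handle_vmexit()'s switch: */
	case EXIT_REASON_CPUID:
		return handle_cpuid(gth);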

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
tests/vmm/vmrunkernel.c
user/vmm/apic.c
user/vmm/include/vmm/sched.h
user/vmm/include/vmm/virtio_mmio.h
user/vmm/include/vmm/vmm.h
user/vmm/sched.c [new file with mode: 0644]
user/vmm/virtio-mmio.c
user/vmm/vmexit.c [new file with mode: 0644]

index 1b189b4..a53f22b 100644
 #include <vmm/virtio_config.h>
 #include <vmm/sched.h>
 
-
 struct virtual_machine local_vm, *vm = &local_vm;
 struct vmm_gpcore_init gpci;
 
-struct vmctl vmctl;
-
-/* Whoever holds the ball runs.  run_vm never actually grabs it - it is grabbed
- * on its behalf. */
-uth_mutex_t the_ball;
-pthread_t vm_thread;
-
-void (*old_thread_refl)(struct uthread *uth, struct user_context *ctx);
-
-static void copy_vmtf_to_vmctl(struct vm_trapframe *vm_tf, struct vmctl *vmctl)
-{
-       vmctl->cr3 = vm_tf->tf_cr3;
-       vmctl->gva = vm_tf->tf_guest_va;
-       vmctl->gpa = vm_tf->tf_guest_pa;
-       vmctl->exit_qual = vm_tf->tf_exit_qual;
-       if (vm_tf->tf_exit_reason == EXIT_REASON_EPT_VIOLATION)
-               vmctl->shutdown = SHUTDOWN_EPT_VIOLATION;
-       else
-               vmctl->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
-       vmctl->ret_code = vm_tf->tf_exit_reason;
-       vmctl->interrupt = vm_tf->tf_trap_inject;
-       vmctl->intrinfo1 = vm_tf->tf_intrinfo1;
-       vmctl->intrinfo2 = vm_tf->tf_intrinfo2;
-       /* Most of the HW TF.  Should be good enough for now */
-       vmctl->regs.tf_rax = vm_tf->tf_rax;
-       vmctl->regs.tf_rbx = vm_tf->tf_rbx;
-       vmctl->regs.tf_rcx = vm_tf->tf_rcx;
-       vmctl->regs.tf_rdx = vm_tf->tf_rdx;
-       vmctl->regs.tf_rbp = vm_tf->tf_rbp;
-       vmctl->regs.tf_rsi = vm_tf->tf_rsi;
-       vmctl->regs.tf_rdi = vm_tf->tf_rdi;
-       vmctl->regs.tf_r8  = vm_tf->tf_r8;
-       vmctl->regs.tf_r9  = vm_tf->tf_r9;
-       vmctl->regs.tf_r10 = vm_tf->tf_r10;
-       vmctl->regs.tf_r11 = vm_tf->tf_r11;
-       vmctl->regs.tf_r12 = vm_tf->tf_r12;
-       vmctl->regs.tf_r13 = vm_tf->tf_r13;
-       vmctl->regs.tf_r14 = vm_tf->tf_r14;
-       vmctl->regs.tf_r15 = vm_tf->tf_r15;
-       vmctl->regs.tf_rip = vm_tf->tf_rip;
-       vmctl->regs.tf_rflags = vm_tf->tf_rflags;
-       vmctl->regs.tf_rsp = vm_tf->tf_rsp;
-}
-
-static void copy_vmctl_to_vmtf(struct vmctl *vmctl, struct vm_trapframe *vm_tf)
-{
-       vm_tf->tf_rax = vmctl->regs.tf_rax;
-       vm_tf->tf_rbx = vmctl->regs.tf_rbx;
-       vm_tf->tf_rcx = vmctl->regs.tf_rcx;
-       vm_tf->tf_rdx = vmctl->regs.tf_rdx;
-       vm_tf->tf_rbp = vmctl->regs.tf_rbp;
-       vm_tf->tf_rsi = vmctl->regs.tf_rsi;
-       vm_tf->tf_rdi = vmctl->regs.tf_rdi;
-       vm_tf->tf_r8  = vmctl->regs.tf_r8;
-       vm_tf->tf_r9  = vmctl->regs.tf_r9;
-       vm_tf->tf_r10 = vmctl->regs.tf_r10;
-       vm_tf->tf_r11 = vmctl->regs.tf_r11;
-       vm_tf->tf_r12 = vmctl->regs.tf_r12;
-       vm_tf->tf_r13 = vmctl->regs.tf_r13;
-       vm_tf->tf_r14 = vmctl->regs.tf_r14;
-       vm_tf->tf_r15 = vmctl->regs.tf_r15;
-       vm_tf->tf_rip = vmctl->regs.tf_rip;
-       vm_tf->tf_rflags = vmctl->regs.tf_rflags;
-       vm_tf->tf_rsp = vmctl->regs.tf_rsp;
-       vm_tf->tf_cr3 = vmctl->cr3;
-       vm_tf->tf_trap_inject = vmctl->interrupt;
-       /* Don't care about the rest of the fields.  The kernel only writes them */
-}
-
-/* callback, runs in vcore context.  this sets up our initial context.  once we
- * become runnable again, we'll run the first bits of the vm ctx.  after that,
- * our context will be stopped and started and will just run whatever the guest
- * VM wants.  we'll never come back to this code or to run_vm(). */
-static void __build_vm_ctx_cb(struct uthread *uth, void *arg)
-{
-       struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
-       struct vmctl *vmctl = (struct vmctl*)arg;
-       struct vm_trapframe *vm_tf;
-
-       __pthread_generic_yield(pthread);
-       pthread->state = PTH_BLK_YIELDING;
-
-       memset(&uth->u_ctx, 0, sizeof(struct user_context));
-       uth->u_ctx.type = ROS_VM_CTX;
-       vm_tf = &uth->u_ctx.tf.vm_tf;
-
-       vm_tf->tf_guest_pcoreid = 0;    /* assuming only 1 guest core */
-
-       copy_vmctl_to_vmtf(vmctl, vm_tf);
-
-       /* other HW/GP regs are 0, which should be fine.  the FP state is still
-        * whatever we were running before, though this is pretty much unnecessary.
-        * we mostly don't want crazy crap in the uth->as, and a non-current_uthread
-        * VM ctx is supposed to have something in their FP state (like HW ctxs). */
-       save_fp_state(&uth->as);
-       uth->flags |= UTHREAD_FPSAVED | UTHREAD_SAVED;
-
-       uthread_runnable(uth);
-}
-
-static void *run_vm(void *arg)
-{
-       struct vmctl *vmctl = (struct vmctl*)arg;
-
-       assert(vmctl->command == REG_RSP_RIP_CR3);
-       /* We need to hack our context, so that next time we run, we're a VM ctx */
-       uthread_yield(FALSE, __build_vm_ctx_cb, arg);
-}
-
-static void vmm_thread_refl_fault(struct uthread *uth,
-                                  struct user_context *ctx)
-{
-       struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
-
-       /* Hack to call the original pth 2LS op */
-       if (!ctx->type == ROS_VM_CTX) {
-               old_thread_refl(uth, ctx);
-               return;
-       }
-       __pthread_generic_yield(pthread);
-       /* normally we'd handle the vmexit here.  to work within the existing
-        * framework, we just wake the controller thread.  It'll look at our ctx
-        * then make us runnable again */
-       pthread->state = PTH_BLK_MUTEX;
-       uth_mutex_unlock(the_ball);             /* wake the run_vmthread */
-}
-
-
-
-/* this will start the vm thread, and return when the thread has blocked,
- * with the right info in vmctl. */
-static void run_vmthread(struct vmctl *vmctl)
-{
-       struct vm_trapframe *vm_tf;
-
-       if (!vm_thread) {
-               /* first time through, we make the vm thread.  the_ball was already
-                * grabbed right after it was alloc'd. */
-               if (pthread_create(&vm_thread, NULL, run_vm, vmctl)) {
-                       perror("pth_create");
-                       exit(-1);
-               }
-               /* hack in our own handlers for some 2LS ops */
-               old_thread_refl = sched_ops->thread_refl_fault;
-               sched_ops->thread_refl_fault = vmm_thread_refl_fault;
-       } else {
-               copy_vmctl_to_vmtf(vmctl, &vm_thread->uthread.u_ctx.tf.vm_tf);
-               uth_mutex_lock(the_ball);       /* grab it for the vm_thread */
-               uthread_runnable((struct uthread*)vm_thread);
-       }
-       uth_mutex_lock(the_ball);
-       /* We woke due to a vm exit.  Need to unlock for the next time we're run */
-       uth_mutex_unlock(the_ball);
-       /* the vm stopped.  we can do whatever we want before rerunning it.  since
-        * we're controlling the uth, we need to handle its vmexits.  we'll fill in
-        * the vmctl, since that's the current framework. */
-       copy_vmtf_to_vmctl(&vm_thread->uthread.u_ctx.tf.vm_tf, vmctl);
-}
-
 /* By 1999, you could just scan the hardware
  * and work it out. But 2005, that was no longer possible. How sad.
  * so we have to fake acpi to make it all work.
@@ -303,7 +143,6 @@ struct acpi_madt_interrupt_override isor[] = {
 void *low1m;
 volatile int shared = 0;
 volatile int quit = 0;
-int mcp = 1;
 
 /* total hack. If the vm runs away we want to get control again. */
 unsigned int maxresume = (unsigned int) -1;
@@ -339,7 +178,7 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr);
 
 pthread_t timerthread_struct;
 
-void *timer_thread(void *arg)
+void timer_thread(void *arg)
 {
        uint8_t vector;
        uint32_t initial_count;
@@ -355,7 +194,7 @@ void *timer_thread(void *arg)
        fprintf(stderr, "SENDING TIMER\n");
 }
 
-void *consout(void *arg)
+void consout(void *arg)
 {
        char *line, *consline, *outline;
        static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
@@ -408,13 +247,12 @@ void *consout(void *arg)
                if (debug) fprintf(stderr, "CCC: DONE call add_used\n");
        }
        fprintf(stderr, "All done\n");
-       return NULL;
 }
 
 // FIXME.
 volatile int consdata = 0;
 
-void *consin(void *arg)
+void consin(void *arg)
 {
        struct virtio_threadarg *a = arg;
        char *line, *outline;
@@ -454,8 +292,8 @@ void *consin(void *arg)
                        if (debug) fprintf(stderr, "CONSIN: GOT A LINE:%s:\n", consline);
                        if (debug) fprintf(stderr, "CONSIN: OUTLEN:%d:\n", outlen);
                        if (strlen(consline) < 3 && consline[0] == 'q' ) {
-                               quit = 1;
-                               break;
+                               fflush(stdout);
+                               exit(0);
                        }
 
                        memmove(iov[i].v, consline, strlen(consline)+ 1);
@@ -475,7 +313,6 @@ void *consin(void *arg)
                ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
        }
        fprintf(stderr, "All done\n");
-       return NULL;
 }
 
 static struct vqdev vqdev= {
@@ -484,8 +321,8 @@ dev: VIRTIO_ID_CONSOLE,
 device_features: 0, /* Can't do it: linux console device does not support it. VIRTIO_F_VERSION_1*/
 numvqs: 2,
 vqs: {
-               {name: "consin", maxqnum: 64, f: consin, arg: (void *)0},
-               {name: "consout", maxqnum: 64, f: consout, arg: (void *)0},
+               {name: "consin",  maxqnum: 64, func: consin,  arg: (void *)0},
+               {name: "consout", maxqnum: 64, func: consout, arg: (void *)0},
        }
 };
 
@@ -574,11 +411,9 @@ int main(int argc, char **argv)
        struct acpi_table_xsdt *x;
        // lowmem is a bump allocated pointer to 2M at the "physbase" of memory
        void *lowmem = (void *) 0x1000000;
-       //struct vmctl vmctl;
        int amt;
        int vmmflags = 0; // Disabled probably forever. VMM_VMCALL_PRINTF;
        uint64_t entry = 0x1200000, kerneladdress = 0x1200000;
-       int nr_gpcs = 1;
        int ret;
        void * xp;
        int kfd = -1;
@@ -590,9 +425,6 @@ int main(int argc, char **argv)
        struct vm_trapframe *vm_tf;
        uint64_t tsc_freq_khz;
 
-       the_ball = uth_mutex_alloc();
-       uth_mutex_lock(the_ball);
-
        fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT,
                        PML1_PTE_REACH);
 
@@ -786,7 +618,6 @@ int main(int argc, char **argv)
        memset(a, 0, 4096);
        a += 4096;
        gpci.vapic_addr = a;
-       //vmctl.vapic = (uint64_t) a_page;
        memset(a, 0, 4096);
        ((uint32_t *)a)[0x30/4] = 0x01060014;
        p64 = a;
@@ -830,36 +661,16 @@ int main(int argc, char **argv)
        bp->e820_map[e820i].size = 0x10000000;
        bp->e820_map[e820i++].type = E820_RESERVED;
 
-       if (ros_syscall(SYS_vmm_setup, nr_gpcs, &gpci, vmmflags, 0, 0, 0) !=
-           nr_gpcs) {
-               perror("Guest pcore setup failed");
-               exit(1);
-       }
 
-       fprintf(stderr, "Run with %d cores and vmmflags 0x%x\n", nr_gpcs, vmmflags);
-       mcp = 1;
-       if (mcp) {
-               my_retvals = malloc(sizeof(void*) * nr_threads);
-               if (!my_retvals)
-                       perror("Init threads/malloc");
-
-               pthread_can_vcore_request(FALSE);       /* 2LS won't manage vcores */
-               pthread_need_tls(FALSE);
-               pthread_mcp_init();                                     /* gives us one vcore */
-               vcore_request_total(nr_threads);
-               for (int i = 0; i < nr_threads; i++) {
-                       xp = __procinfo.vcoremap;
-                       fprintf(stderr, "%p\n", __procinfo.vcoremap);
-                       fprintf(stderr, "Vcore %d mapped to pcore %d\n", i,
-                               __procinfo.vcoremap[i].pcoreid);
-               }
-       }
+       /* Set parlib vars to control the 2LS. */
+       parlib_wants_to_be_mcp = TRUE;  /* default */
+       parlib_never_yield = FALSE;             /* default */
+       vm->nr_gpcs = 1;
+       vm->gpcis = &gpci;
+       ret = vmm_init(vm, vmmflags);
+       assert(!ret);
+
 
-       ret = syscall(33, 1);
-       if (ret < 0) {
-               perror("vm setup");
-               exit(1);
-       }
        ret = posix_memalign((void **)&p512, 4096, 3*4096);
        fprintf(stderr, "memalign is %p\n", p512);
        if (ret) {
@@ -890,262 +701,17 @@ int main(int argc, char **argv)
 
 
        vm->virtio_mmio_base = 0x100000000;
+       register_virtio_mmio(&vqdev, vm->virtio_mmio_base);
 
+       vmm_run_task(vm, timer_thread, 0);
 
-       vmctl.interrupt = 0;
-       vmctl.command = REG_RSP_RIP_CR3;
-       vmctl.cr3 = (uint64_t) p512;
-       vmctl.regs.tf_rip = entry;
-       vmctl.regs.tf_rsp = 0;
-       vmctl.regs.tf_rsi = (uint64_t) bp;
-       if (mcp) {
-               /* set up virtio bits, which depend on threads being enabled. */
-               register_virtio_mmio(&vqdev, vm->virtio_mmio_base);
-       }
-       fprintf(stderr, "threads started\n");
-       fprintf(stderr, "Writing command :%s:\n", cmd);
-
-       if (debug)
-               vapic_status_dump(stderr, (void *)gpci.vapic_addr);
-
-       run_vmthread(&vmctl);
-
-       if (debug)
-               vapic_status_dump(stderr, (void *)gpci.vapic_addr);
-
-       if (mcp) {
-               /* Start up timer thread */
-               if (pthread_create(&timerthread_struct, NULL, timer_thread, NULL)) {
-                       fprintf(stderr, "pth_create failed for timer thread.");
-                       perror("pth_create");
-               }
-       }
-
-       vm_tf = &(vm_thread->uthread.u_ctx.tf.vm_tf);
-
-       while (1) {
-
-               int c;
-               uint8_t byte;
-               //vmctl.command = REG_RIP;
-               if (maxresume-- == 0) {
-                       debug = 1;
-                       resumeprompt = 1;
-               }
-               if (debug) {
-                       fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
-                               vm_tf->tf_exit_reason);
-                       showstatus(stderr, (struct guest_thread*)&vm_thread);
-               }
-               if (resumeprompt) {
-                       fprintf(stderr, "RESUME?\n");
-                       c = getchar();
-                       if (c == 'q')
-                               break;
-               }
-               if (vm_tf->tf_exit_reason == EXIT_REASON_EPT_VIOLATION) {
-                       uint64_t gpa, *regp, val;
-                       uint8_t regx;
-                       int store, size;
-                       int advance;
-                       if (decode((struct guest_thread *) vm_thread, &gpa, &regx, &regp,
-                                  &store, &size, &advance)) {
-                               fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
-                                       vm_tf->tf_exit_reason);
-                               showstatus(stderr, (struct guest_thread*)&vm_thread);
-                               quit = 1;
-                               break;
-                       }
-                       if (debug) fprintf(stderr, "%p %p %p %p %p %p\n", gpa, regx, regp, store, size, advance);
-                       if (PG_ADDR(gpa) == vm->virtio_mmio_base) {
-                               if (debug) fprintf(stderr, "DO SOME VIRTIO\n");
-                               // Lucky for us the various virtio ops are well-defined.
-                               virtio_mmio((struct guest_thread *)vm_thread, gpa, regx, regp,
-                                           store);
-                               if (debug) fprintf(stderr, "store is %d:\n", store);
-                               if (debug) fprintf(stderr, "REGP IS %16x:\n", *regp);
-                       } else if (PG_ADDR(gpa) == 0xfee00000) {
-                               // until we fix our include mess, just put the proto here.
-                               //int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
-                               //apic(&vmctl, gpa, regx, regp, store);
-                       } else if (PG_ADDR(gpa) == 0xfec00000) {
-                               // until we fix our include mess, just put the proto here.
-                               do_ioapic((struct guest_thread *)vm_thread, gpa, regx, regp,
-                                         store);
-                       } else if (PG_ADDR(gpa) == 0) {
-                               uint64_t val = 0;
-                               memmove(&val, &vm->low4k[gpa], size);
-                               hexdump(stdout, &vm->low4k[gpa], size);
-                               fprintf(stderr, "Low 4k, code %p read @ %p, size %d, val %p\n",
-                                       vm_tf->tf_rip, gpa, size, val);
-                               memmove(regp, &vm->low4k[gpa], size);
-                               hexdump(stdout, regp, size);
-                       } else {
-                               fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
-                               fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
-                                       vm_tf->tf_exit_reason);
-                               fprintf(stderr, "Returning 0xffffffff\n");
-                               showstatus(stderr, (struct guest_thread*)&vm_thread);
-                               // Just fill the whole register for now.
-                               *regp = (uint64_t) -1;
-                       }
-                       vm_tf->tf_rip += advance;
-                       if (debug)
-                               fprintf(stderr, "Advance rip by %d bytes to %p\n",
-                                       advance, vm_tf->tf_rip);
-                       //vmctl.shutdown = 0;
-                       //vmctl.gpa = 0;
-                       //vmctl.command = REG_ALL;
-               } else {
-                       switch (vm_tf->tf_exit_reason) {
-                       case  EXIT_REASON_VMCALL:
-                               byte = vm_tf->tf_rdi;
-                               printf("%c", byte);
-                               if (byte == '\n') printf("%c", '%');
-                               vm_tf->tf_rip += 3;
-                               break;
-                       case EXIT_REASON_EXTERNAL_INTERRUPT:
-                               //debug = 1;
-                               if (debug)
-                                       fprintf(stderr, "XINT 0x%x 0x%x\n",
-                                               vm_tf->tf_intrinfo1, vm_tf->tf_intrinfo2);
-                               if (debug) pir_dump();
-                               //vmctl.command = RESUME;
-                               break;
-                       case EXIT_REASON_IO_INSTRUCTION:
-                               fprintf(stderr, "IO @ %p\n", vm_tf->tf_rip);
-                               io((struct guest_thread *)vm_thread);
-                               //vmctl.shutdown = 0;
-                               //vmctl.gpa = 0;
-                               //vmctl.command = REG_ALL;
-                               break;
-                       case EXIT_REASON_INTERRUPT_WINDOW:
-                               if (consdata) {
-                                       if (debug) fprintf(stderr, "inject an interrupt\n");
-                                       virtio_mmio_set_vring_irq();
-                                       vm_tf->tf_trap_inject = 0x80000000 | vm->virtio_irq;
-                                       //vmctl.command = RESUME;
-                                       consdata = 0;
-                               }
-                               break;
-                       case EXIT_REASON_MSR_WRITE:
-                       case EXIT_REASON_MSR_READ:
-                               fprintf(stderr, "Do an msr\n");
-                               if (msrio((struct guest_thread *)vm_thread, &gpci,
-                                         vm_tf->tf_exit_reason)) {
-                                       // uh-oh, msrio failed
-                                       // well, hand back a GP fault which is what Intel does
-                                       fprintf(stderr, "MSR FAILED: RIP %p, shutdown 0x%x\n",
-                                               vm_tf->tf_rip, vm_tf->tf_exit_reason);
-                                       showstatus(stderr, (struct guest_thread*)&vm_thread);
-
-                                       // Use event injection through vmctl to send
-                                       // a general protection fault
-                                       // vmctl.interrupt gets written to the VM-Entry
-                                       // Interruption-Information Field by vmx
-                                       vm_tf->tf_trap_inject = VM_TRAP_VALID
-                                                             | VM_TRAP_ERROR_CODE
-                                                             | VM_TRAP_HARDWARE
-                                                             | 13; // GPF
-                               } else {
-                                       vm_tf->tf_rip += 2;
-                               }
-                               break;
-                       case EXIT_REASON_MWAIT_INSTRUCTION:
-                         fflush(stdout);
-                               if (debug)fprintf(stderr, "\n================== Guest MWAIT. =======================\n");
-                               if (debug)fprintf(stderr, "Wait for cons data\n");
-                               while (!consdata)
-                                       ;
-                               //debug = 1;
-                               if (debug)
-                                       vapic_status_dump(stderr, gpci.vapic_addr);
-                               if (debug)fprintf(stderr, "Resume with consdata ...\n");
-                               vm_tf->tf_rip += 3;
-                               //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                               //showstatus(stderr, (struct guest_thread*)&vm_thread);
-                               break;
-                       case EXIT_REASON_HLT:
-                               fflush(stdout);
-                               if (debug)fprintf(stderr, "\n================== Guest halted. =======================\n");
-                               if (debug)fprintf(stderr, "Wait for cons data\n");
-                               while (!consdata)
-                                       ;
-                               //debug = 1;
-                               if (debug)fprintf(stderr, "Resume with consdata ...\n");
-                               vm_tf->tf_rip += 1;
-                               //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                               //showstatus(stderr, (struct guest_thread*)&vm_thread);
-                               break;
-                       case EXIT_REASON_APIC_ACCESS:
-                               if (1 || debug)fprintf(stderr, "APIC READ EXIT\n");
-
-                               uint64_t gpa, *regp, val;
-                               uint8_t regx;
-                               int store, size;
-                               int advance;
-                               if (decode((struct guest_thread *)vm_thread, &gpa, &regx,
-                                          &regp, &store, &size, &advance)) {
-                                       fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
-                                               vm_tf->tf_exit_reason);
-                                       showstatus(stderr, (struct guest_thread*)&vm_thread);
-                                       quit = 1;
-                                       break;
-                               }
-
-                               int apic(struct guest_thread *vm_thread, uint64_t gpa,
-                                        int destreg, uint64_t *regp, int store);
-                               apic((struct guest_thread *)vm_thread, gpa, regx, regp, store);
-                               vm_tf->tf_rip += advance;
-                               if (debug)
-                                       fprintf(stderr, "Advance rip by %d bytes to %p\n",
-                                               advance, vm_tf->tf_rip);
-                               //vmctl.shutdown = 0;
-                               //vmctl.gpa = 0;
-                               //vmctl.command = REG_ALL;
-                               break;
-                       case EXIT_REASON_APIC_WRITE:
-                               if (1 || debug)fprintf(stderr, "APIC WRITE EXIT\n");
-                               break;
-                       default:
-                               fprintf(stderr, "Don't know how to handle exit %d\n",
-                                       vm_tf->tf_exit_reason);
-                               fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
-                                       vm_tf->tf_exit_reason);
-                               showstatus(stderr, (struct guest_thread*)&vm_thread);
-                               quit = 1;
-                               break;
-                       }
-               }
-               if (debug) fprintf(stderr, "at bottom of switch, quit is %d\n", quit);
-               if (quit)
-                       break;
-               if (consdata) {
-                       if (debug) fprintf(stderr, "inject an interrupt\n");
-                       if (debug)
-                               fprintf(stderr, "XINT 0x%x 0x%x\n", vm_tf->tf_intrinfo1,
-                                       vm_tf->tf_intrinfo2);
-                       vm_tf->tf_trap_inject = 0x80000000 | vm->virtio_irq;
-                       virtio_mmio_set_vring_irq();
-                       consdata = 0;
-                       //debug = 1;
-                       //vmctl.command = RESUME;
-               }
-               if (debug) fprintf(stderr, "NOW DO A RESUME\n");
-               copy_vmtf_to_vmctl(vm_tf, &vmctl);
-               run_vmthread(&vmctl);
-               copy_vmctl_to_vmtf(&vmctl, vm_tf);
-       }
-
-       /* later.
-       for (int i = 0; i < nr_threads-1; i++) {
-               int ret;
-               if (pthread_join(my_threads[i], &my_retvals[i]))
-                       perror("pth_join failed");
-               fprintf(stderr, "%d %d\n", i, ret);
-       }
- */
+       vm_tf = gth_to_vmtf(vm->gths[0]);
+       vm_tf->tf_cr3 = (uint64_t) p512;
+       vm_tf->tf_rip = entry;
+       vm_tf->tf_rsp = 0;
+       vm_tf->tf_rsi = (uint64_t) bp;
+       start_guest_thread(vm->gths[0]);
 
-       fflush(stdout);
-       exit(0);
+       uthread_sleep_forever();
+       return 0;
 }
index e50d71b..2153edb 100644
@@ -164,8 +164,8 @@ static void apic_write(uint64_t offset, uint32_t value)
 
 }
 
-int apic(struct guest_thread *vm_thread, uint64_t gpa, int destreg,
-         uint64_t *regp, int store)
+int __apic_access(struct guest_thread *vm_thread, uint64_t gpa, int destreg,
+                  uint64_t *regp, int store)
 {
        uint32_t offset = gpa & 0xfffff;
        /* basic sanity tests. */
index 484407c..43e26a4 100644
@@ -1,15 +1,80 @@
 /* Copyright (c) 2016 Google Inc.
  * Barret Rhoden <brho@cs.berkeley.edu>
- * See LICENSE for details. */
+ * See LICENSE for details.
+ *
+ * 2LS for virtual machines */
 
 #pragma once
 
 #include <parlib/uthread.h>
+#include <sys/queue.h>
 
 __BEGIN_DECLS
 
+/* Three types of threads.  Guests are actual guest VMs.  Controllers are
+ * threads that are paired to guests and handle their exits.  Guests and
+ * controllers are 1:1 (via *buddy).  Task threads are for the VMM itself, such
+ * as a console thread. */
+
+#define VMM_THREAD_GUEST               1
+#define VMM_THREAD_CTLR                        2
+#define VMM_THREAD_TASK                        3
+
+#define VMM_THR_STACKSIZE              8192
+
+struct guest_thread;
+struct ctlr_thread;
+struct task_thread;
+
 struct guest_thread {
        struct uthread                          uthread;
+       struct ctlr_thread                      *buddy;
+       unsigned int                            gpc_id;
 };
 
+struct ctlr_thread {
+       struct uthread                          uthread;
+       struct guest_thread                     *buddy;
+       size_t                                          stacksize;
+       void                                            *stacktop;
+};
+
+struct task_thread {
+       struct uthread                          uthread;
+       void                                            (*func)(void *);
+       void                                            *arg;
+       size_t                                          stacksize;
+       void                                            *stacktop;
+};
+
+struct virtual_machine;                        /* in vmm/vmm.h */
+struct vmm_thread {
+       union {
+               struct guest_thread             guest;
+               struct ctlr_thread              ctlr;
+               struct task_thread              task;
+       };
+       int                                                     type;
+       TAILQ_ENTRY(vmm_thread)         tq_next;
+       struct virtual_machine          *vm;
+};
+
+TAILQ_HEAD(vmm_thread_tq, vmm_thread);
+
+extern int vmm_sched_period_usec;
+
+/* Initialize a VMM for a virtual machine, which the caller fills out, except
+ * for gths.  This allocates and sets vm->gths in the struct virtual machine;
+ * do not free() the array.
+ *
+ * Set the parlib control variables (e.g. parlib_wants_to_be_mcp) before calling
+ * this initializer.
+ *
+ * Returns 0 on success, -1 o/w. */
+int vmm_init(struct virtual_machine *vm, int flags);
+/* Starts a guest thread/core. */
+void start_guest_thread(struct guest_thread *gth);
+/* Start and run a task thread. */
+int vmm_run_task(struct virtual_machine *vm, void (*func)(void *), void *arg);
+
 __END_DECLS
index 27c1d75..4f93fbd 100644
 // status; and a pointer to the virtio struct.
 struct vq {
        char *name;
-       void *(*f)(void *arg); // Start this as a thread when a matching virtio is discovered.
+       /* Start this as a thread when a matching virtio is discovered. */
+       void (*func)(void *arg);
        void *arg;
        int maxqnum; // how many things the q gets? or something. 
        int qnum; 
        int qalign;
-       pthread_t thread;
        /* filled in by virtio probing. */
        uint64_t pfn;
        uint32_t isr; // not used yet but ...
index 94bcede..0dd2274 100644
@@ -32,11 +32,30 @@ int msrio(struct guest_thread *vm_thread, struct vmm_gpcore_init *gpci,
           uint32_t opcode);
 int do_ioapic(struct guest_thread *vm_thread, uint64_t gpa,
               int destreg, uint64_t *regp, int store);
+bool handle_vmexit(struct guest_thread *gth);
+int __apic_access(struct guest_thread *vm_thread, uint64_t gpa, int destreg,
+                  uint64_t *regp, int store);
 
 /* Lookup helpers */
 
 static struct virtual_machine *gth_to_vm(struct guest_thread *gth)
 {
-       /* TODO */
-       return 0;
+       return ((struct vmm_thread*)gth)->vm;
+}
+
+static struct vm_trapframe *gth_to_vmtf(struct guest_thread *gth)
+{
+       return &gth->uthread.u_ctx.tf.vm_tf;
+}
+
+static struct vmm_gpcore_init *gth_to_gpci(struct guest_thread *gth)
+{
+       struct virtual_machine *vm = gth_to_vm(gth);
+
+       return &vm->gpcis[gth->gpc_id];
+}
+
+static struct virtual_machine *get_my_vm(void)
+{
+       return ((struct vmm_thread*)current_uthread)->vm;
 }
diff --git a/user/vmm/sched.c b/user/vmm/sched.c
new file mode 100644 (file)
index 0000000..985c521
--- /dev/null
+++ b/user/vmm/sched.c
@@ -0,0 +1,565 @@
+/* Copyright (c) 2016 Google Inc.
+ * Barret Rhoden <brho@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * 2LS for virtual machines */
+
+#include <vmm/sched.h>
+#include <vmm/vmm.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <parlib/spinlock.h>
+#include <parlib/event.h>
+#include <parlib/ucq.h>
+#include <parlib/arch/trap.h>
+#include <parlib/ros_debug.h>
+#include <benchutil/vcore_tick.h>
+
+int vmm_sched_period_usec = 1000;
+
+/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
+ * we'll need something analogous to current_uthread, so the 2LS knows which VM
+ * it is working on. */
+static struct virtual_machine *current_vm;
+
+static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
+/* Runnable queues, broken up by thread type. */
+static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
+static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
+/* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
+static atomic_t nr_unblk_tasks;
+static atomic_t nr_unblk_guests;
+/* Global evq for all syscalls.  Could make this per vcore or whatever. */
+static struct event_queue *sysc_evq;
+
+static void vmm_sched_entry(void);
+static void vmm_thread_runnable(struct uthread *uth);
+static void vmm_thread_paused(struct uthread *uth);
+static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
+static void vmm_thread_has_blocked(struct uthread *uth, int flags);
+static void vmm_thread_refl_fault(struct uthread *uth,
+                                  struct user_context *ctx);
+
+struct schedule_ops vmm_sched_ops = {
+       .sched_entry = vmm_sched_entry,
+       .thread_runnable = vmm_thread_runnable,
+       .thread_paused = vmm_thread_paused,
+       .thread_blockon_sysc = vmm_thread_blockon_sysc,
+       .thread_has_blocked = vmm_thread_has_blocked,
+       .thread_refl_fault = vmm_thread_refl_fault,
+};
+
+/* Helpers */
+static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
+                               void *data);
+static void acct_thread_blocked(struct vmm_thread *vth);
+static void acct_thread_unblocked(struct vmm_thread *vth);
+static void enqueue_vmm_thread(struct vmm_thread *vth);
+static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
+                                           int type);
+static void *__alloc_stack(size_t stacksize);
+static void __free_stack(void *stacktop, size_t stacksize);
+
+
+static void restart_thread(struct syscall *sysc)
+{
+       struct uthread *ut_restartee = (struct uthread*)sysc->u_data;
+
+       /* uthread stuff here: */
+       assert(ut_restartee);
+       assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
+       ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
+       vmm_thread_runnable(ut_restartee);
+}
+
+static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
+                               void *data)
+{
+       struct syscall *sysc;
+
+       /* I think we can make this assert now.  If not, check pthread.c. (concern
+        * was having old ev_qs firing and running this handler). */
+       assert(ev_msg);
+       sysc = ev_msg->ev_arg3;
+       assert(sysc);
+       restart_thread(sysc);
+}
+
+/* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
+ * attempt to route the notifs/IPIs to vcoreid */
+static struct event_queue *setup_sysc_evq(int vcoreid)
+{
+       struct event_queue *evq;
+       uintptr_t mmap_block;
+
+       mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
+                                    PROT_WRITE | PROT_READ,
+                                    MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
+       evq = get_eventq_raw();
+       assert(mmap_block && evq);
+       evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
+       evq->ev_vcore = vcoreid;
+       evq->ev_mbox->type = EV_MBOX_UCQ;
+       ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
+       return evq;
+}
+
+static void __attribute__((constructor)) vmm_lib_init(void)
+{
+       struct task_thread *thread0;
+
+       init_once_racy(return);
+       uthread_lib_init();
+
+       /* Note that thread0 doesn't belong to a VM.  We can set this during
+        * vmm_init() if we need to. */
+       thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
+       assert(thread0);
+       acct_thread_unblocked((struct vmm_thread*)thread0);
+       thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
+       thread0->stacktop = (void*)USTACKTOP;
+       /* for lack of a better vcore, might as well send to 0 */
+       sysc_evq = setup_sysc_evq(0);
+       register_ev_handler(EV_SYSCALL, vmm_handle_syscall, 0);
+       uthread_2ls_init((struct uthread*)thread0, &vmm_sched_ops);
+}
+
+/* The scheduling policy is encapsulated in the next few functions (from here
+ * down to sched_entry()). */
+
+static int desired_nr_vcores(void)
+{
+       /* Sanity checks on our accounting. */
+       assert(atomic_read(&nr_unblk_guests) >= 0);
+       assert(atomic_read(&nr_unblk_tasks) >= 0);
+       /* Lockless peek.  This is always an estimate.  Some of our tasks busy-wait,
+        * so it's not enough to just give us one vcore for all tasks, yet. */
+       return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
+}
+
+static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
+{
+       struct vmm_thread *vth;
+
+       vth = TAILQ_FIRST(tq);
+       if (vth)
+               TAILQ_REMOVE(tq, vth, tq_next);
+       return vth;
+}
+
+static struct vmm_thread *pick_a_thread_degraded(void)
+{
+       struct vmm_thread *vth = 0;
+       static int next_class = VMM_THREAD_GUEST;
+
+       /* We don't have a lot of cores (maybe 0), so we'll alternate which type of
+        * thread we look at first.  Basically, we're RR within a class of threads,
+        * and we'll toggle between those two classes. */
+       spin_pdr_lock(&queue_lock);
+       if (next_class == VMM_THREAD_GUEST) {
+               if (!vth)
+                       vth = __pop_first(&rnbl_guests);
+               if (!vth)
+                       vth = __pop_first(&rnbl_tasks);
+               next_class = VMM_THREAD_TASK;
+       } else {
+               if (!vth)
+                       vth = __pop_first(&rnbl_tasks);
+               if (!vth)
+                       vth = __pop_first(&rnbl_guests);
+               next_class = VMM_THREAD_GUEST;
+       }
+       spin_pdr_unlock(&queue_lock);
+       return vth;
+}
+
+/* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
+static struct vmm_thread *pick_a_thread_plenty(void)
+{
+       struct vmm_thread *vth = 0;
+
+       spin_pdr_lock(&queue_lock);
+       if (!vth)
+               vth = __pop_first(&rnbl_tasks);
+       if (!vth)
+               vth = __pop_first(&rnbl_guests);
+       spin_pdr_unlock(&queue_lock);
+       return vth;
+}
+
+static void yield_current_uth(void)
+{
+       struct vmm_thread *vth;
+
+       if (!current_uthread)
+               return;
+       vth = (struct vmm_thread*)stop_current_uthread();
+       enqueue_vmm_thread(vth);
+}
+
+static void __attribute__((noreturn)) vmm_sched_entry(void)
+{
+       struct vmm_thread *vth;
+       int nr_vcores_wanted = desired_nr_vcores();
+       bool have_enough = nr_vcores_wanted <= num_vcores();
+
+       /* TODO: this doesn't handle a lot of issues, like preemption, how to
+        * run/yield our vcores, dynamic changes in the number of runnables, where
+        * to send events, how to avoid interfering with gpcs, etc. */
+       if (have_enough) {
+               vcore_tick_disable();
+       } else {
+               vcore_tick_enable(vmm_sched_period_usec);
+               vcore_request_total(nr_vcores_wanted);
+               if (vcore_tick_poll()) {
+                       /* slightly less than ideal: we grab the queue lock twice */
+                       yield_current_uth();
+               }
+       }
+       if (current_uthread)
+               run_current_uthread();
+       if (have_enough)
+               vth = pick_a_thread_plenty();
+       else
+               vth = pick_a_thread_degraded();
+       if (!vth)
+               vcore_yield_or_restart();
+       run_uthread((struct uthread*)vth);
+}
+
+static void vmm_thread_runnable(struct uthread *uth)
+{
+       /* A thread that was blocked is now runnable.  This counts as becoming
+        * unblocked (running + runnable) */
+       acct_thread_unblocked((struct vmm_thread*)uth);
+       enqueue_vmm_thread((struct vmm_thread*)uth);
+}
+
+static void vmm_thread_paused(struct uthread *uth)
+{
+       /* The thread stopped for some reason, usually a preemption.  We'd like to
+        * just run it whenever we get a chance.  Note that it didn't become
+        * 'blocked' - it's still runnable. */
+       enqueue_vmm_thread((struct vmm_thread*)uth);
+}
+
+static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
+{
+       struct syscall *sysc = (struct syscall*)syscall;
+
+       acct_thread_blocked((struct vmm_thread*)uth);
+       sysc->u_data = uth;
+       if (!register_evq(sysc, sysc_evq)) {
+               /* Lost the race with the call being done.  The kernel won't send the
+                * event.  Just restart him. */
+               restart_thread(sysc);
+       }
+       /* GIANT WARNING: do not touch the thread after this point. */
+}
+
+static void vmm_thread_has_blocked(struct uthread *uth, int flags)
+{
+       /* The thread blocked on something like a mutex.  It's not runnable, so we
+        * don't need to put it on a list, but we do need to account for it not
+        * running.  We'll find out (via thread_runnable) when it starts up again.
+        */
+       acct_thread_blocked((struct vmm_thread*)uth);
+}
+
+static void refl_error(struct uthread *uth, unsigned int trap_nr,
+                       unsigned int err, unsigned long aux)
+{
+       printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
+              trap_nr, err, aux);
+       /* Note that uthread.c already copied out our ctx into the uth
+        * struct */
+       print_user_context(&uth->u_ctx);
+       printf("Turn on printx to spew unhandled, malignant trap info\n");
+       exit(-1);
+}
+
+static bool handle_page_fault(struct uthread *uth, unsigned int err,
+                              unsigned long aux)
+{
+       if (!(err & PF_VMR_BACKED))
+               return FALSE;
+       syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
+       __block_uthread_on_async_sysc(uth);
+       return TRUE;
+}
+
+static void vmm_thread_refl_hw_fault(struct uthread *uth,
+                                     unsigned int trap_nr,
+                                     unsigned int err, unsigned long aux)
+{
+       switch (trap_nr) {
+       case HW_TRAP_PAGE_FAULT:
+               if (!handle_page_fault(uth, err, aux))
+                       refl_error(uth, trap_nr, err, aux);
+               break;
+       default:
+               refl_error(uth, trap_nr, err, aux);
+       }
+}
+
+/* Yield callback for __ctlr_entry */
+static void __swap_to_gth(struct uthread *uth, void *dummy)
+{
+       struct ctlr_thread *cth = (struct ctlr_thread*)uth;
+
+       /* We don't re-account for block/unblock.  The ctlr and the guest are
+        * accounted together ("pass the token" back and forth). */
+       enqueue_vmm_thread((struct vmm_thread*)cth->buddy);
+}
+
+/* All ctrl threads start here, each time their guest has a fault.  They can
+ * block and unblock along the way.  Once a ctlr does its final uthread_yield,
+ * the next time it will start again from the top. */
+static void __ctlr_entry(void)
+{
+       struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
+       struct virtual_machine *vm = gth_to_vm(cth->buddy);
+
+       if (!handle_vmexit(cth->buddy)) {
+               showstatus(stderr, cth->buddy);
+               exit(0);
+       }
+       /* We want to atomically yield and start/reenqueue our buddy.  We do so in
+        * vcore context on the other side of the yield. */
+       uthread_yield(FALSE, __swap_to_gth, 0);
+}
+
+static void vmm_thread_refl_vm_fault(struct uthread *uth)
+{
+       struct guest_thread *gth = (struct guest_thread*)uth;
+       struct ctlr_thread *cth = gth->buddy;
+
+       /* The ctlr starts from the top every time we get a new fault. */
+       cth->uthread.flags |= UTHREAD_SAVED;
+       init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
+                     (uintptr_t)(cth->stacktop));
+       /* We don't re-account for block/unblock.  The ctlr and the guest are
+        * accounted together ("pass the token" back and forth). */
+       enqueue_vmm_thread((struct vmm_thread*)cth);
+}
+
+static void vmm_thread_refl_fault(struct uthread *uth,
+                                  struct user_context *ctx)
+{
+       switch (ctx->type) {
+       case ROS_HW_CTX:
+               /* Guests should only ever VM exit */
+               assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
+               vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
+                                        __arch_refl_get_err(ctx),
+                                        __arch_refl_get_aux(ctx));
+               break;
+       case ROS_VM_CTX:
+               vmm_thread_refl_vm_fault(uth);
+               break;
+       default:
+               assert(0);
+       }
+}
+
+static void destroy_guest_thread(struct guest_thread *gth)
+{
+       struct ctlr_thread *cth = gth->buddy;
+
+       __free_stack(cth->stacktop, cth->stacksize);
+       uthread_cleanup((struct uthread*)cth);
+       free(cth);
+       uthread_cleanup((struct uthread*)gth);
+       free(gth);
+}
+
+static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
+                                                unsigned int gpcoreid)
+{
+       struct guest_thread *gth;
+       struct ctlr_thread *cth;
+       /* Guests won't use TLS; they always operate in Ring V.  The controller
+        * might - not because of anything we do, but because of glibc calls. */
+       struct uth_thread_attr gth_attr = {.want_tls = FALSE};
+       struct uth_thread_attr cth_attr = {.want_tls = TRUE};
+
+       gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
+       cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
+       if (!gth || !cth) {
+               free(gth);
+               free(cth);
+               return 0;
+       }
+       gth->buddy = cth;
+       cth->buddy = gth;
+       gth->gpc_id = gpcoreid;
+       cth->stacksize = VMM_THR_STACKSIZE;
+       cth->stacktop = __alloc_stack(cth->stacksize);
+       if (!cth->stacktop) {
+               free(gth);
+               free(cth);
+               return 0;
+       }
+       gth->uthread.u_ctx.type = ROS_VM_CTX;
+       gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
+       /* No need to init the ctlr.  It gets re-init'd each time it starts. */
+       uthread_init((struct uthread*)gth, &gth_attr);
+       uthread_init((struct uthread*)cth, &cth_attr);
+       /* TODO: give it a correct FP state.  Our current one is probably fine */
+       restore_fp_state(&gth->uthread.as);
+       gth->uthread.flags |= UTHREAD_FPSAVED;
+       return gth;
+}
+
+int vmm_init(struct virtual_machine *vm, int flags)
+{
+       struct guest_thread **gths;
+
+       if (current_vm)
+               return -1;
+       current_vm = vm;
+       if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
+               return -1;
+       gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
+       if (!gths)
+               return -1;
+       for (int i = 0; i < vm->nr_gpcs; i++) {
+               gths[i] = create_guest_thread(vm, i);
+               if (!gths[i]) {
+                       for (int j = 0; j < i; j++)
+                               destroy_guest_thread(gths[j]);
+                       free(gths);
+                       return -1;
+               }
+       }
+       vm->gths = gths;
+       uthread_mcp_init();
+       return 0;
+}
+
+void start_guest_thread(struct guest_thread *gth)
+{
+       acct_thread_unblocked((struct vmm_thread*)gth);
+       enqueue_vmm_thread((struct vmm_thread*)gth);
+}
+
+static void __tth_exit_cb(struct uthread *uthread, void *junk)
+{
+       struct task_thread *tth = (struct task_thread*)uthread;
+
+       acct_thread_blocked((struct vmm_thread*)tth);
+       uthread_cleanup(uthread);
+       __free_stack(tth->stacktop, tth->stacksize);
+       free(tth);
+}
+
+static void __task_thread_run(void)
+{
+       struct task_thread *tth = (struct task_thread*)current_uthread;
+
+       tth->func(tth->arg);
+       uthread_yield(FALSE, __tth_exit_cb, 0);
+}
+
+int vmm_run_task(struct virtual_machine *vm, void (*func)(void *), void *arg)
+{
+       struct task_thread *tth;
+       struct uth_thread_attr tth_attr = {.want_tls = TRUE};
+
+       tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
+       if (!tth)
+               return -1;
+       tth->stacksize = VMM_THR_STACKSIZE;
+       tth->stacktop = __alloc_stack(tth->stacksize);
+       if (!tth->stacktop) {
+               free(tth);
+               return -1;
+       }
+       tth->func = func;
+       tth->arg = arg;
+       init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
+                     (uintptr_t)(tth->stacktop));
+       uthread_init((struct uthread*)tth, &tth_attr);
+       acct_thread_unblocked((struct vmm_thread*)tth);
+       enqueue_vmm_thread((struct vmm_thread*)tth);
+       return 0;
+}
+
+/* Helpers for tracking nr_unblk_* threads. */
+static void acct_thread_blocked(struct vmm_thread *vth)
+{
+       switch (vth->type) {
+       case VMM_THREAD_GUEST:
+       case VMM_THREAD_CTLR:
+               atomic_dec(&nr_unblk_guests);
+               break;
+       case VMM_THREAD_TASK:
+               atomic_dec(&nr_unblk_tasks);
+               break;
+       }
+}
+
+static void acct_thread_unblocked(struct vmm_thread *vth)
+{
+       switch (vth->type) {
+       case VMM_THREAD_GUEST:
+       case VMM_THREAD_CTLR:
+               atomic_inc(&nr_unblk_guests);
+               break;
+       case VMM_THREAD_TASK:
+               atomic_inc(&nr_unblk_tasks);
+               break;
+       }
+}
+
+static void enqueue_vmm_thread(struct vmm_thread *vth)
+{
+       spin_pdr_lock(&queue_lock);
+       switch (vth->type) {
+       case VMM_THREAD_GUEST:
+       case VMM_THREAD_CTLR:
+               TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
+               break;
+       case VMM_THREAD_TASK:
+               TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
+               break;
+       }
+       spin_pdr_unlock(&queue_lock);
+}
+
+static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
+{
+       struct vmm_thread *vth;
+       int ret;
+
+       ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
+                            sizeof(struct vmm_thread));
+       if (ret)
+               return 0;
+       memset(vth, 0, sizeof(struct vmm_thread));
+       vth->type = type;
+       vth->vm = vm;
+       return vth;
+}
+
+static void __free_stack(void *stacktop, size_t stacksize)
+{
+       munmap(stacktop - stacksize, stacksize);
+}
+
+static void *__alloc_stack(size_t stacksize)
+{
+       int force_a_page_fault;
+       void *stacktop;
+       void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
+                             MAP_ANONYMOUS, -1, 0);
+
+       if (stackbot == MAP_FAILED)
+               return 0;
+       stacktop = stackbot + stacksize;
+       /* Want the top of the stack populated, but not the rest of the stack;
+        * that'll grow on demand (up to stacksize, then will clobber memory). */
+       force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
+       return stacktop;
+}
index 7e472f3..ec7cb6e 100644
@@ -311,10 +311,7 @@ static void virtio_mmio_write(struct virtual_machine *vm, uint64_t gpa,
                                                          NULL, NULL, /* callbacks */
                                                          mmio.vqdev->vqs[mmio.qsel].name);
                    fprintf(stderr, "START THE THREAD. pfn is 0x%x, virtio is %p\n", mmio.pagesize, va->arg->virtio);
-                   if (pthread_create(&va->arg->thread, NULL, va->arg->f, va)) {
-                           fprintf(stderr, "pth_create failed for vq %s", va->arg->name);
-                           perror("pth_create");
-                   }
+                       vmm_run_task(vm, va->arg->func, va);
         break;
     case VIRTIO_MMIO_QUEUE_NOTIFY:
            if (value < mmio.vqdev->numvqs) {
@@ -388,10 +385,7 @@ static void virtio_mmio_write(struct virtual_machine *vm, uint64_t gpa,
                    va->arg = &mmio.vqdev->vqs[mmio.qsel];
                    va->arg->virtio = (void *)(va->arg->pfn * mmio.pagesize);
                    fprintf(stderr, "START THE THREAD. pfn is 0x%x, virtio is %p\n", mmio.pagesize, va->arg->virtio);
-                   if (pthread_create(&va->arg->thread, NULL, va->arg->f, va)) {
-                           fprintf(stderr, "pth_create failed for vq %s", va->arg->name);
-                           perror("pth_create");
-                   }
+                       vmm_run_task(vm, va->arg->func, va);
            }
            break;
 
diff --git a/user/vmm/vmexit.c b/user/vmm/vmexit.c
new file mode 100644 (file)
index 0000000..bf12f5b
--- /dev/null
+++ b/user/vmm/vmexit.c
@@ -0,0 +1,158 @@
+/* Copyright (c) 2015-2016 Google Inc.
+ * See LICENSE for details. */
+
+#include <parlib/common.h>
+#include <vmm/virtio.h>
+#include <vmm/virtio_mmio.h>
+#include <vmm/virtio_ids.h>
+#include <vmm/virtio_config.h>
+#include <vmm/vmm.h>
+#include <parlib/arch/trap.h>
+#include <stdio.h>
+
+/* TODO: need infrastructure to handle GPC wakeup properly */
+static bool consdata = FALSE;
+
+static bool handle_ept_fault(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       struct virtual_machine *vm = gth_to_vm(gth);
+       uint64_t gpa, *regp;
+       uint8_t regx;
+       int store, size;
+       int advance;
+
+       if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
+               return FALSE;
+       /* TODO use helpers for some of these addr checks.  the fee/fec ones might
+        * be wrong too. */
+       if (PG_ADDR(gpa) == vm->virtio_mmio_base) {
+               /* TODO: can the guest cause us to spawn off infinite threads? */
+               virtio_mmio(gth, gpa, regx, regp, store);
+       } else if (PG_ADDR(gpa) == 0xfec00000) {
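+               /* 0xfec00000 is the standard x86 IOAPIC MMIO base. */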
+               do_ioapic(gth, gpa, regx, regp, store);
+       } else if (PG_ADDR(gpa) == 0) {
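+               /* Access to the guest's low 4K; serve it from vm->low4k. */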
+               memmove(regp, &vm->low4k[gpa], size);
+       } else {
+               fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
+               fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
+                               vm_tf->tf_exit_reason);
+               fprintf(stderr, "Returning 0xffffffff\n");
+               showstatus(stderr, gth);
+               // Just fill the whole register for now.
+               *regp = (uint64_t) -1;
+               return FALSE;
+       }
+       vm_tf->tf_rip += advance;
+       return TRUE;
+}
+
+static bool handle_vmcall(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+       uint8_t byte;
+
+       byte = vm_tf->tf_rdi;
+       printf("%c", byte);
+       if (byte == '\n')
+               printf("%c", '%');
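+       /* Skip the 3-byte vmcall instruction (0f 01 c1). */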
+       vm_tf->tf_rip += 3;
+       return TRUE;
+}
+
+static bool handle_io(struct guest_thread *gth)
+{
+       io(gth);
+       return TRUE;
+}
+
+static bool handle_msr(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+
+       /* TODO: consider pushing the gth into msrio */
+       if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
+               /* Use event injection through vmctl to send a general protection fault
+                * vmctl.interrupt gets written to the VM-Entry Interruption-Information
+                * Field by vmx */
+               vm_tf->tf_trap_inject = VM_TRAP_VALID
+                                     | VM_TRAP_ERROR_CODE
+                                     | VM_TRAP_HARDWARE
+                                     | HW_TRAP_GP_FAULT;
+       } else {
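+               /* rdmsr/wrmsr are both 2-byte instructions (0f 32 / 0f 30). */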
+               vm_tf->tf_rip += 2;
+       }
+       return TRUE;
+}
+
+static bool handle_apic_access(struct guest_thread *gth)
+{
+       uint64_t gpa, *regp;
+       uint8_t regx;
+       int store, size;
+       int advance;
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+
+       if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
+               return FALSE;
+       if (__apic_access(gth, gpa, regx, regp, store))
+               return FALSE;
+       vm_tf->tf_rip += advance;
+       return TRUE;
+}
+
+static bool handle_halt(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+
+       while (!consdata)
+               ;
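+       /* Skip the 1-byte hlt instruction (0xf4). */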
+       vm_tf->tf_rip += 1;
+       return TRUE;
+}
+
+static bool handle_mwait(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+
+       while (!consdata)
+               ;
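+       /* Skip the 3-byte mwait instruction (0f 01 c9). */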
+       vm_tf->tf_rip += 3;
+       return TRUE;
+}
+
+/* Is this a vmm specific thing?  or generic?
+ *
+ * what do we do when we want to kill the vm?  what are our other options? */
+bool handle_vmexit(struct guest_thread *gth)
+{
+       struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
+
+       switch (vm_tf->tf_exit_reason) {
+       case EXIT_REASON_EPT_VIOLATION:
+               return handle_ept_fault(gth);
+       case EXIT_REASON_VMCALL:
+               return handle_vmcall(gth);
+       case EXIT_REASON_IO_INSTRUCTION:
+               return handle_io(gth);
+       case EXIT_REASON_MSR_WRITE:
+       case EXIT_REASON_MSR_READ:
+               return handle_msr(gth);
+       case EXIT_REASON_APIC_ACCESS:
+               return handle_apic_access(gth);
+       case EXIT_REASON_HLT:
+               return handle_halt(gth);
+       case EXIT_REASON_MWAIT_INSTRUCTION:
+               return handle_mwait(gth);
+       case EXIT_REASON_EXTERNAL_INTERRUPT:
+       case EXIT_REASON_APIC_WRITE:
+               /* TODO: just ignore these? */
+               return TRUE;
+       default:
+               fprintf(stderr, "Don't know how to handle exit %d\n",
+                       vm_tf->tf_exit_reason);
+               fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
+                       vm_tf->tf_exit_reason);
+               return FALSE;
+       }
+}