tests/vmm/vmrunkernel.c
1 #include <stdio.h>
2 #include <pthread.h>
3 #include <sys/types.h>
4 #include <sys/stat.h>
5 #include <fcntl.h>
6 #include <parlib/arch/arch.h>
7 #include <parlib/ros_debug.h>
8 #include <unistd.h>
9 #include <errno.h>
10 #include <dirent.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <ros/syscall.h>
14 #include <sys/mman.h>
15 #include <vmm/coreboot_tables.h>
16 #include <vmm/vmm.h>
17 #include <vmm/acpi/acpi.h>
18 #include <ros/arch/mmu.h>
19 #include <ros/vmm.h>
20 #include <parlib/uthread.h>
21 #include <vmm/linux_bootparam.h>
22 #include <vmm/virtio.h>
23 #include <vmm/virtio_mmio.h>
24 #include <vmm/virtio_ids.h>
25 #include <vmm/virtio_config.h>
26 #include <vmm/sched.h>
27
28 struct vmctl vmctl;
29 struct vmm_gpcore_init gpci;
30
31 /* Whoever holds the ball runs.  run_vm never actually grabs it - it is grabbed
32  * on its behalf. */
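/* A sketch of the handoff, as used below: main() allocates the_ball and locks
 * it, run_vmthread() then blocks on a second lock attempt until
 * vmm_thread_refl_fault() unlocks it at the next vmexit, at which point the
 * controller runs again with the guest state sitting in the uthread's VM
 * trapframe. */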
33 uth_mutex_t the_ball;
34 pthread_t vm_thread;
35
36 void (*old_thread_refl)(struct uthread *uth, struct user_context *ctx);
37
38 static void copy_vmtf_to_vmctl(struct vm_trapframe *vm_tf, struct vmctl *vmctl)
39 {
40         vmctl->cr3 = vm_tf->tf_cr3;
41         vmctl->gva = vm_tf->tf_guest_va;
42         vmctl->gpa = vm_tf->tf_guest_pa;
43         vmctl->exit_qual = vm_tf->tf_exit_qual;
44         if (vm_tf->tf_exit_reason == EXIT_REASON_EPT_VIOLATION)
45                 vmctl->shutdown = SHUTDOWN_EPT_VIOLATION;
46         else
47                 vmctl->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
48         vmctl->ret_code = vm_tf->tf_exit_reason;
49         vmctl->interrupt = vm_tf->tf_trap_inject;
50         vmctl->intrinfo1 = vm_tf->tf_intrinfo1;
51         vmctl->intrinfo2 = vm_tf->tf_intrinfo2;
52         /* Most of the HW TF.  Should be good enough for now */
53         vmctl->regs.tf_rax = vm_tf->tf_rax;
54         vmctl->regs.tf_rbx = vm_tf->tf_rbx;
55         vmctl->regs.tf_rcx = vm_tf->tf_rcx;
56         vmctl->regs.tf_rdx = vm_tf->tf_rdx;
57         vmctl->regs.tf_rbp = vm_tf->tf_rbp;
58         vmctl->regs.tf_rsi = vm_tf->tf_rsi;
59         vmctl->regs.tf_rdi = vm_tf->tf_rdi;
60         vmctl->regs.tf_r8  = vm_tf->tf_r8;
61         vmctl->regs.tf_r9  = vm_tf->tf_r9;
62         vmctl->regs.tf_r10 = vm_tf->tf_r10;
63         vmctl->regs.tf_r11 = vm_tf->tf_r11;
64         vmctl->regs.tf_r12 = vm_tf->tf_r12;
65         vmctl->regs.tf_r13 = vm_tf->tf_r13;
66         vmctl->regs.tf_r14 = vm_tf->tf_r14;
67         vmctl->regs.tf_r15 = vm_tf->tf_r15;
68         vmctl->regs.tf_rip = vm_tf->tf_rip;
69         vmctl->regs.tf_rflags = vm_tf->tf_rflags;
70         vmctl->regs.tf_rsp = vm_tf->tf_rsp;
71 }
72
73 static void copy_vmctl_to_vmtf(struct vmctl *vmctl, struct vm_trapframe *vm_tf)
74 {
75         vm_tf->tf_rax = vmctl->regs.tf_rax;
76         vm_tf->tf_rbx = vmctl->regs.tf_rbx;
77         vm_tf->tf_rcx = vmctl->regs.tf_rcx;
78         vm_tf->tf_rdx = vmctl->regs.tf_rdx;
79         vm_tf->tf_rbp = vmctl->regs.tf_rbp;
80         vm_tf->tf_rsi = vmctl->regs.tf_rsi;
81         vm_tf->tf_rdi = vmctl->regs.tf_rdi;
82         vm_tf->tf_r8  = vmctl->regs.tf_r8;
83         vm_tf->tf_r9  = vmctl->regs.tf_r9;
84         vm_tf->tf_r10 = vmctl->regs.tf_r10;
85         vm_tf->tf_r11 = vmctl->regs.tf_r11;
86         vm_tf->tf_r12 = vmctl->regs.tf_r12;
87         vm_tf->tf_r13 = vmctl->regs.tf_r13;
88         vm_tf->tf_r14 = vmctl->regs.tf_r14;
89         vm_tf->tf_r15 = vmctl->regs.tf_r15;
90         vm_tf->tf_rip = vmctl->regs.tf_rip;
91         vm_tf->tf_rflags = vmctl->regs.tf_rflags;
92         vm_tf->tf_rsp = vmctl->regs.tf_rsp;
93         vm_tf->tf_cr3 = vmctl->cr3;
94         vm_tf->tf_trap_inject = vmctl->interrupt;
95         /* Don't care about the rest of the fields.  The kernel only writes them */
96 }
97
98 /* callback, runs in vcore context.  this sets up our initial context.  once we
99  * become runnable again, we'll run the first bits of the vm ctx.  after that,
100  * our context will be stopped and started and will just run whatever the guest
101  * VM wants.  we'll never come back to this code or to run_vm(). */
102 static void __build_vm_ctx_cb(struct uthread *uth, void *arg)
103 {
104         struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
105         struct vmctl *vmctl = (struct vmctl*)arg;
106         struct vm_trapframe *vm_tf;
107
108         __pthread_generic_yield(pthread);
109         pthread->state = PTH_BLK_YIELDING;
110
111         memset(&uth->u_ctx, 0, sizeof(struct user_context));
112         uth->u_ctx.type = ROS_VM_CTX;
113         vm_tf = &uth->u_ctx.tf.vm_tf;
114
115         vm_tf->tf_guest_pcoreid = 0;    /* assuming only 1 guest core */
116
117         copy_vmctl_to_vmtf(vmctl, vm_tf);
118
119         /* other HW/GP regs are 0, which should be fine.  the FP state is still
120          * whatever we were running before, though this is pretty much unnecessary.
121          * we mostly don't want crazy crap in the uth->as, and a non-current_uthread
122          * VM ctx is supposed to have something in their FP state (like HW ctxs). */
123         save_fp_state(&uth->as);
124         uth->flags |= UTHREAD_FPSAVED | UTHREAD_SAVED;
125
126         uthread_runnable(uth);
127 }
128
129 static void *run_vm(void *arg)
130 {
131         struct vmctl *vmctl = (struct vmctl*)arg;
132
133         assert(vmctl->command == REG_RSP_RIP_CR3);
134         /* We need to hack our context, so that next time we run, we're a VM ctx */
135         uthread_yield(FALSE, __build_vm_ctx_cb, arg);
136 }
137
138 static void vmm_thread_refl_fault(struct uthread *uth,
139                                   struct user_context *ctx)
140 {
141         struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
142
143         /* Hack to call the original pth 2LS op */
144         if (ctx->type != ROS_VM_CTX) {
145                 old_thread_refl(uth, ctx);
146                 return;
147         }
148         __pthread_generic_yield(pthread);
149         /* normally we'd handle the vmexit here.  to work within the existing
150          * framework, we just wake the controller thread.  It'll look at our ctx
151          * then make us runnable again */
152         pthread->state = PTH_BLK_MUTEX;
153         uth_mutex_unlock(the_ball);             /* wake the run_vmthread */
154 }
155
156
157
158 /* this will start the vm thread, and return when the thread has blocked,
159  * with the right info in vmctl. */
160 static void run_vmthread(struct vmctl *vmctl)
161 {
162         struct vm_trapframe *vm_tf;
163
164         if (!vm_thread) {
165                 /* first time through, we make the vm thread.  the_ball was already
166                  * grabbed right after it was alloc'd. */
167                 if (pthread_create(&vm_thread, NULL, run_vm, vmctl)) {
168                         perror("pth_create");
169                         exit(-1);
170                 }
171                 /* hack in our own handlers for some 2LS ops */
172                 old_thread_refl = sched_ops->thread_refl_fault;
173                 sched_ops->thread_refl_fault = vmm_thread_refl_fault;
174         } else {
175                 copy_vmctl_to_vmtf(vmctl, &vm_thread->uthread.u_ctx.tf.vm_tf);
176                 uth_mutex_lock(the_ball);       /* grab it for the vm_thread */
177                 uthread_runnable((struct uthread*)vm_thread);
178         }
179         uth_mutex_lock(the_ball);
180         /* We woke due to a vm exit.  Need to unlock for the next time we're run */
181         uth_mutex_unlock(the_ball);
182         /* the vm stopped.  we can do whatever we want before rerunning it.  since
183          * we're controlling the uth, we need to handle its vmexits.  we'll fill in
184          * the vmctl, since that's the current framework. */
185         copy_vmtf_to_vmctl(&vm_thread->uthread.u_ctx.tf.vm_tf, vmctl);
186 }
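/* Illustrative call pattern (this mirrors what main() does below, using only
 * names already defined in this file):
 *
 *      vmctl.command = REG_RSP_RIP_CR3;
 *      vmctl.cr3 = (uint64_t) p512;
 *      vmctl.regs.tf_rip = entry;
 *      run_vmthread(&vmctl);           // returns at the next vmexit
 *      // inspect/patch vmctl and the VM trapframe, then call it again
 */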
187
188 /* By 1999, you could just scan the hardware
189  * and work it out. By 2005, that was no longer possible. How sad.
190  * So we have to fake ACPI to make it all work.
191  * This will be copied to memory at 0xe0000, so the kernel can find it.
192  */
193
194 /* assume they're all 256 bytes long just to make it easy.
195  * Just have pointers that point to aligned things.
196  */
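/* The tables are laid down by main() in this order, starting at 0xe0000:
 * RSDP, then the XSDT (with room for 8 entries), then the FADT, then the MADT
 * followed by the LAPIC, IOAPIC, x2APIC, and interrupt-override records
 * defined below.  Each table's checksum is generated once its final length is
 * known. */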
197
198 struct acpi_table_rsdp rsdp = {
199         .signature = "RSD PTR ",
200         .oem_id = "AKAROS",
201         .revision = 2,
202         .length = 36,
203 };
204
205 struct acpi_table_xsdt xsdt = {
206         .header = {
207                 .signature= "XSDT",
208                 // This is so stupid. Incredibly stupid.
209                 .revision = 0,
210                 .oem_id = "AKAROS",
211                 .oem_table_id = "ALPHABET",
212                 .oem_revision = 0,
213                 .asl_compiler_id = "RON ",
214                 .asl_compiler_revision = 0,
215         },
216 };
217 struct acpi_table_fadt fadt = {
218         .header = {
219                 .signature= "FADT",
220                 // This is so stupid. Incredibly stupid.
221                 .revision = 0,
222                 .oem_id = "AKAROS",
223                 .oem_table_id = "ALPHABET",
224                 .oem_revision = 0,
225                 .asl_compiler_id = "RON ",
226                 .asl_compiler_revision = 0,
227         },
228 };
229
230 /* This has to be dropped into memory, then the other crap just follows it.
231  */
232 struct acpi_table_madt madt = {
233         .header = {
234                 .signature = "APIC",
235                 .revision = 0,
236                 .oem_id = "AKAROS",
237                 .oem_table_id = "ALPHABET",
238                 .oem_revision = 0,
239                 .asl_compiler_id = "RON ",
240                 .asl_compiler_revision = 0,
241         },
242
243         .address = 0xfee00000ULL,
244 };
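/* 0xfee00000 is the architectural default local-APIC MMIO base; guest accesses
 * in that range show up as EPT-violation or APIC-access exits, which the loop
 * in main() intercepts. */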
245
246 struct acpi_madt_local_apic Apic0 = {.header = {.type = ACPI_MADT_TYPE_LOCAL_APIC, .length = sizeof(struct acpi_madt_local_apic)},
247                                      .processor_id = 0, .id = 0};
248 struct acpi_madt_io_apic Apic1 = {.header = {.type = ACPI_MADT_TYPE_IO_APIC, .length = sizeof(struct acpi_madt_io_apic)},
249                                   .id = 1, .address = 0xfec00000, .global_irq_base = 0};
250 struct acpi_madt_local_x2apic X2Apic0 = {
251         .header = {
252                 .type = ACPI_MADT_TYPE_LOCAL_X2APIC,
253                 .length = sizeof(struct acpi_madt_local_x2apic)
254         },
255         .local_apic_id = 0,
256         .uid = 0
257 };
258
259 struct acpi_madt_interrupt_override isor[] = {
260         /* I have no idea if it should be source irq 2, global 0, or global 2, source 0. Shit. */
261         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
262          .bus = 0, .source_irq = 2, .global_irq = 0, .inti_flags = 0},
263         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
264          .bus = 0, .source_irq = 1, .global_irq = 1, .inti_flags = 0},
265         //{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
266          //.bus = 0, .source_irq = 2, .global_irq = 2, .inti_flags = 0},
267         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
268          .bus = 0, .source_irq = 3, .global_irq = 3, .inti_flags = 0},
269         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
270          .bus = 0, .source_irq = 4, .global_irq = 4, .inti_flags = 0},
271         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
272          .bus = 0, .source_irq = 5, .global_irq = 5, .inti_flags = 0},
273         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
274          .bus = 0, .source_irq = 6, .global_irq = 6, .inti_flags = 0},
275         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
276          .bus = 0, .source_irq = 7, .global_irq = 7, .inti_flags = 0},
277         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
278          .bus = 0, .source_irq = 8, .global_irq = 8, .inti_flags = 0},
279         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
280          .bus = 0, .source_irq = 9, .global_irq = 9, .inti_flags = 0},
281         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
282          .bus = 0, .source_irq = 10, .global_irq = 10, .inti_flags = 0},
283         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
284          .bus = 0, .source_irq = 11, .global_irq = 11, .inti_flags = 0},
285         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
286          .bus = 0, .source_irq = 12, .global_irq = 12, .inti_flags = 0},
287         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
288          .bus = 0, .source_irq = 13, .global_irq = 13, .inti_flags = 0},
289         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
290          .bus = 0, .source_irq = 14, .global_irq = 14, .inti_flags = 0},
291         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
292          .bus = 0, .source_irq = 15, .global_irq = 15, .inti_flags = 0},
293         // VMMCP routes irq 32 to gsi 17
294         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
295          .bus = 0, .source_irq = 32, .global_irq = 17, .inti_flags = 5},
296 };
297
298
299 /* this test will run the "kernel" in the negative address space. We hope. */
300 void *low1m;
301 uint8_t low4k[4096];
302 unsigned long long stack[1024];
303 volatile int shared = 0;
304 volatile int quit = 0;
305 int mcp = 1;
306 int virtioirq = 17;
307
308 /* total hack. If the vm runs away we want to get control again. */
309 unsigned int maxresume = (unsigned int) -1;
310
311 #define MiB 0x100000u
312 #define GiB (1u<<30)
313 #define GKERNBASE (16*MiB)
314 #define KERNSIZE (128*MiB+GKERNBASE)
315 uint8_t _kernel[KERNSIZE];
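/* _kernel[] reserves host address space to back guest physical memory up
 * through GKERNBASE + 128 MiB (guest-physical addresses and this process's
 * virtual addresses are used interchangeably here); main() checks that the
 * array itself starts at or below GKERNBASE. */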
316
317 unsigned long long *p512, *p1, *p2m;
318
319 void **my_retvals;
320 int nr_threads = 4;
321 int debug = 0;
322 int resumeprompt = 0;
323 /* unlike Linux, this shared struct is for both host and guest. */
324 //      struct virtqueue *constoguest =
325 //              vring_new_virtqueue(0, 512, 8192, 0, inpages, NULL, NULL, "test");
326 uint64_t virtio_mmio_base = 0x100000000ULL;
327
328 void vapic_status_dump(FILE *f, void *vapic);
329 static void set_posted_interrupt(int vector);
330
331 #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
332 #error "Get a gcc of at least 4.1.0"
333 #else
334 #define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
335 #endif
336
337 #define LOCK_PREFIX "lock "
338 #define ADDR                            BITOP_ADDR(addr)
339 static inline int test_and_set_bit(int nr, volatile unsigned long *addr);
340
341 pthread_t timerthread_struct;
342
343 void *timer_thread(void *arg)
344 {
345         uint8_t vector;
346         uint32_t initial_count;
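        /* Assumption: the vapic page stores one 32-bit word per LAPIC
         * register, so indices 0x32 and 0x38 correspond to the LVT Timer and
         * Initial Count registers (architectural offsets 0x320 and 0x380). */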
347         while (1) {
348                 vector = ((uint32_t *)gpci.vapic_addr)[0x32] & 0xff;
349                 initial_count = ((uint32_t *)gpci.vapic_addr)[0x38];
350                 if (vector && initial_count) {
351                         set_posted_interrupt(vector);
352                         ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
353                 }
354                 uthread_usleep(100000);
355         }
356         fprintf(stderr, "SENDING TIMER\n");
357 }
358
359 void *consout(void *arg)
360 {
361         char *line, *consline, *outline;
362         static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
363         static struct scatterlist in[] = { {NULL, sizeof(line)}, };
364         static struct scatterlist iov[32];
365         struct virtio_threadarg *a = arg;
366         static unsigned int inlen, outlen, conslen;
367         struct virtqueue *v = a->arg->virtio;
368         fprintf(stderr, "talk thread ..\n");
369         uint16_t head, gaveit = 0, gotitback = 0;
370         uint32_t vv;
371         int i;
372         int num;
373
374         if (debug) {
375                 fprintf(stderr, "----------------------- TT a %p\n", a);
376                 fprintf(stderr, "talk thread ttargs %x v %x\n", a, v);
377         }
378
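        /* wait_for_vq_desc() convention, as used in the loop below:
         * iov[0..outlen) holds buffers the guest filled for us to read
         * (console output); iov[outlen..outlen+inlen) holds buffers the guest
         * expects us to fill. */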
379         for(num = 0;;num++) {
380                 //int debug = 1;
381                 /* host: use any buffers we should have been sent. */
382                 head = wait_for_vq_desc(v, iov, &outlen, &inlen);
383                 if (debug)
384                         fprintf(stderr, "CCC: vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
385                 for(i = 0; debug && i < outlen + inlen; i++)
386                         fprintf(stderr, "CCC: v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
387                 /* host: if we got an output buffer, just output it. */
388                 for(i = 0; i < outlen; i++) {
389                         num++;
390                         int j;
391                         if (debug) {
392                                 fprintf(stderr, "CCC: IOV length is %d\n", iov[i].length);
393                         }
394                         for (j = 0; j < iov[i].length; j++)
395                                 printf("%c", ((char *)iov[i].v)[j]);
396                 }
397                 fflush(stdout);
398                 if (debug)
399                         fprintf(stderr, "CCC: outlen is %d; inlen is %d\n", outlen, inlen);
400                 /* host: fill in the writeable buffers. */
401                 /* why we're getting these I don't know. */
402                 for (i = outlen; i < outlen + inlen; i++) {
403                         if (debug) fprintf(stderr, "CCC: send back empty writeable");
404                         iov[i].length = 0;
405                 }
406                 if (debug) fprintf(stderr, "CCC: call add_used\n");
407                 /* host: now ack that we used them all. */
408                 add_used(v, head, outlen+inlen);
409                 if (debug) fprintf(stderr, "CCC: DONE call add_used\n");
410         }
411         fprintf(stderr, "All done\n");
412         return NULL;
413 }
414
415 // FIXME.
416 volatile int consdata = 0;
417
418 void *consin(void *arg)
419 {
420         struct virtio_threadarg *a = arg;
421         char *line, *outline;
422         static char consline[128];
423         static struct scatterlist iov[32];
424         static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
425         static struct scatterlist in[] = { {NULL, sizeof(line)}, };
426
427         static unsigned int inlen, outlen, conslen;
428         struct virtqueue *v = a->arg->virtio;
429         fprintf(stderr, "consin thread ..\n");
430         uint16_t head, gaveit = 0, gotitback = 0;
431         uint32_t vv;
432         int i;
433         int num;
434         //char c[1];
435
436         if (debug) fprintf(stderr, "Spin on console being read, print num queues, halt\n");
437
438         for(num = 0;! quit;num++) {
439                 //int debug = 1;
440                 /* host: use any buffers we should have been sent. */
441                 head = wait_for_vq_desc(v, iov, &outlen, &inlen);
442                 if (debug)
443                         fprintf(stderr, "vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
444                 for(i = 0; debug && i < outlen + inlen; i++)
445                         fprintf(stderr, "v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
446                 if (debug)
447                         fprintf(stderr, "outlen is %d; inlen is %d\n", outlen, inlen);
448                 /* host: fill in the writeable buffers. */
449                 for (i = outlen; i < outlen + inlen; i++) {
450                         /* host: read a line. */
451                         memset(consline, 0, 128);
452                         if (read(0, consline, 1) < 0) {
453                                 exit(0);
454                         }
455                         if (debug) fprintf(stderr, "CONSIN: GOT A LINE:%s:\n", consline);
456                         if (debug) fprintf(stderr, "CONSIN: OUTLEN:%d:\n", outlen);
457                         if (strlen(consline) < 3 && consline[0] == 'q' ) {
458                                 quit = 1;
459                                 break;
460                         }
461
462                         memmove(iov[i].v, consline, strlen(consline)+ 1);
463                         iov[i].length = strlen(consline) + 1;
464                 }
465                 if (debug) fprintf(stderr, "call add_used\n");
466                 /* host: now ack that we used them all. */
467                 add_used(v, head, outlen+inlen);
468                 /* turn off consdata - the IRQ injection isn't right */
469                 //consdata = 1;
470                 if (debug) fprintf(stderr, "DONE call add_used\n");
471
472                 // Send spurious for testing (Gan)
473                 set_posted_interrupt(0xE5);
474                 virtio_mmio_set_vring_irq();
475
476                 ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
477         }
478         fprintf(stderr, "All done\n");
479         return NULL;
480 }
481
482 static struct vqdev vqdev= {
483 name: "console",
484 dev: VIRTIO_ID_CONSOLE,
485 device_features: 0, /* Can't set VIRTIO_F_VERSION_1: the Linux console driver does not support it. */
486 numvqs: 2,
487 vqs: {
488                 {name: "consin", maxqnum: 64, f: consin, arg: (void *)0},
489                 {name: "consout", maxqnum: 64, f: consout, arg: (void *)0},
490         }
491 };
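/* main() hands this device to register_virtio_mmio() at virtio_mmio_base; the
 * consin/consout handlers above service its two queues. */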
492
493 void lowmem() {
494         __asm__ __volatile__ (".section .lowmem, \"aw\"\n\tlow: \n\t.=0x1000\n\t.align 0x100000\n\t.previous\n");
495 }
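/* The asm above just emits a .lowmem section (label 'low', advanced to 0x1000
 * and aligned to 1 MiB) so the linker reserves space low in the image; calling
 * lowmem() itself executes nothing beyond the empty function body. */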
496
497 static uint8_t acpi_tb_checksum(uint8_t *buffer, uint32_t length)
498 {
499         uint8_t sum = 0;
500         uint8_t *end = buffer + length;
501         fprintf(stderr, "tbchecksum %p for %d", buffer, length);
502         while (buffer < end) {
503                 if (end - buffer < 2)
504                         fprintf(stderr, "%02x\n", sum);
505                 sum = (uint8_t)(sum + *(buffer++));
506         }
507         fprintf(stderr, " is %02x\n", sum);
508         return (sum);
509 }
510
511 static void gencsum(uint8_t *target, void *data, int len)
512 {
513         uint8_t csum;
514         // blast target to zero so it does not get counted
515         // (it might be in the struct we checksum) And, yes, it is, goodness.
516         fprintf(stderr, "gencsum %p target %p source %d bytes\n", target, data, len);
517         *target = 0;
518         csum  = acpi_tb_checksum((uint8_t *)data, len);
519         *target = ~csum + 1;
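        /* The stored byte is the two's complement of the sum of the other
         * bytes, so the completed table sums to 0 mod 256, which is what ACPI
         * requires. */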
520         fprintf(stderr, "Cmoputed is %02x\n", *target);
521 }
522
523 static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
524 {
525         int oldbit;
526
527         asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
528                      "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
529
530         return oldbit;
531 }
532
533 static void pir_dump()
534 {
535         unsigned long *pir_ptr = gpci.posted_irq_desc;
536         int i;
537         fprintf(stderr, "-------Begin PIR dump-------\n");
538         for (i = 0; i < 8; i++){
539                 fprintf(stderr, "Word %d: 0x%016lx\n", i, pir_ptr[i]);
540         }
541         fprintf(stderr, "-------End PIR dump-------\n");
542 }
543
544 static void set_posted_interrupt(int vector)
545 {
546         test_and_set_bit(vector, gpci.posted_irq_desc);
547         /* LOCKed instruction provides the mb() */
548         test_and_set_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpci.posted_irq_desc);
549 }
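/* The callers pair this with ros_syscall(SYS_vmm_poke_guest, ...) so the
 * kernel notifies the guest core and it notices the outstanding-notification
 * bit set above. */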
550
551 int main(int argc, char **argv)
552 {
553         struct boot_params *bp;
554         char *cmdline_default = "earlyprintk=vmcall,keep"
555                                     " console=hvc0"
556                                     " virtio_mmio.device=1M@0x100000000:32"
557                                     " nosmp"
558                                     " maxcpus=1"
559                                     " acpi.debug_layer=0x2"
560                                     " acpi.debug_level=0xffffffff"
561                                     " apic=debug"
562                                     " noexec=off"
563                                     " nohlt"
564                                     " init=/bin/launcher"
565                                     " lapic=notscdeadline"
566                                     " lapictimerfreq=1000000"
567                                     " pit=none";
568         char *cmdline_extra = "\0";
569         char *cmdline;
570         uint64_t *p64;
571         void *a = (void *)0xe0000;
572         struct acpi_table_rsdp *r;
573         struct acpi_table_fadt *f;
574         struct acpi_table_madt *m;
575         struct acpi_table_xsdt *x;
576         uint64_t virtiobase = 0x100000000ULL;
577         // lowmem is a bump allocated pointer to 2M at the "physbase" of memory
578         void *lowmem = (void *) 0x1000000;
579         //struct vmctl vmctl;
580         int amt;
581         int vmmflags = 0; // Disabled probably forever. VMM_VMCALL_PRINTF;
582         uint64_t entry = 0x1200000, kerneladdress = 0x1200000;
583         int nr_gpcs = 1;
584         int ret;
585         void * xp;
586         int kfd = -1;
587         static char cmd[512];
588         int i;
589         uint8_t csum;
590         void *coreboot_tables = (void *) 0x1165000;
591         void *a_page;
592         struct vm_trapframe *vm_tf;
593         uint64_t tsc_freq_khz;
594
595         the_ball = uth_mutex_alloc();
596         uth_mutex_lock(the_ball);
597
598         fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT,
599                         PML1_PTE_REACH);
600
601
602         // mmap is not working for us at present.
603         if ((uint64_t)_kernel > GKERNBASE) {
604                 fprintf(stderr, "kernel array @%p is above , GKERNBASE@%p sucks\n", _kernel, GKERNBASE);
605                 exit(1);
606         }
607         memset(_kernel, 0, sizeof(_kernel));
608         memset(lowmem, 0xff, 2*1048576);
609         memset(low4k, 0xff, 4096);
610         // avoid at all costs, requires too much instruction emulation.
611         //low4k[0x40e] = 0;
612         //low4k[0x40f] = 0xe0;
613
614         //Place mmap(Gan)
615         a_page = mmap((void *)0xfee00000, PGSIZE, PROT_READ | PROT_WRITE,
616                               MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
617         fprintf(stderr, "a_page mmap pointer %p\n", a_page);
618
619         if (a_page == (void *) -1) {
620                 perror("Could not mmap APIC");
621                 exit(1);
622         }
623         if (((uint64_t)a_page & 0xfff) != 0) {
624                 perror("APIC page mapping is not page aligned");
625                 exit(1);
626         }
627
628         memset(a_page, 0, 4096);
629         ((uint32_t *)a_page)[0x30/4] = 0x01060015;
630         //((uint32_t *)a_page)[0x30/4] = 0xDEADBEEF;
631
632
633         argc--, argv++;
634         // switches ...
635         // Sorry, I don't much like the gnu opt parsing code.
636         while (1) {
637                 if (*argv[0] != '-')
638                         break;
639                 switch(argv[0][1]) {
640                 case 'd':
641                         debug++;
642                         break;
643                 case 'v':
644                         vmmflags |= VMM_VMCALL_PRINTF;
645                         break;
646                 case 'm':
647                         argc--, argv++;
648                         maxresume = strtoull(argv[0], 0, 0);
649                         break;
650                 case 'i':
651                         argc--, argv++;
652                         virtioirq = strtoull(argv[0], 0, 0);
653                         break;
654                 case 'c':
655                         argc--, argv++;
656                         cmdline_extra = argv[0];
                        break;
657                 default:
658                         fprintf(stderr, "BMAFR\n");
659                         break;
660                 }
661                 argc--, argv++;
662         }
663         if (argc < 1) {
664                 fprintf(stderr, "Usage: %s vmimage [-n (no vmcall printf)] [coreboot_tables [loadaddress [entrypoint]]]\n", argv[0]);
665                 exit(1);
666         }
667         if (argc > 1)
668                 coreboot_tables = (void *) strtoull(argv[1], 0, 0);
669         if (argc > 2)
670                 kerneladdress = strtoull(argv[2], 0, 0);
671         if (argc > 3)
672                 entry = strtoull(argv[3], 0, 0);
673         kfd = open(argv[0], O_RDONLY);
674         if (kfd < 0) {
675                 perror(argv[0]);
676                 exit(1);
677         }
678         // read in the kernel.
679         xp = (void *)kerneladdress;
680         for(;;) {
681                 amt = read(kfd, xp, 1048576);
682                 if (amt < 0) {
683                         perror("read");
684                         exit(1);
685                 }
686                 if (amt == 0) {
687                         break;
688                 }
689                 xp += amt;
690         }
691         fprintf(stderr, "Read in %lu bytes\n", (uint64_t)xp - kerneladdress);
692         close(kfd);
693
694         // The low 1M is so we can fill in bullshit like ACPI.
695         // And, sorry, due to the STUPID format of the RSDP for now we need the low 1M.
696         low1m = mmap((int*)4096, MiB-4096, PROT_READ | PROT_WRITE,
697                          MAP_ANONYMOUS, -1, 0);
698         if (low1m != (void *)4096) {
699                 perror("Unable to mmap low 1m");
700                 exit(1);
701         }
702         memset(low1m, 0xff, MiB-4096);
703         r = a;
704         fprintf(stderr, "install rsdp to %p\n", r);
705         *r = rsdp;
706         a += sizeof(*r);
707         memmove(&r->xsdt_physical_address, &a, sizeof(a));
708         gencsum(&r->checksum, r, ACPI_RSDP_CHECKSUM_LENGTH);
709         if ((csum = acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_CHECKSUM_LENGTH)) != 0) {
710                 fprintf(stderr, "RSDP has bad checksum; summed to %x\n", csum);
711                 exit(1);
712         }
713
714         /* Check extended checksum if table version >= 2 */
715         gencsum(&r->extended_checksum, r, ACPI_RSDP_XCHECKSUM_LENGTH);
716         if ((rsdp.revision >= 2) &&
717             (acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_XCHECKSUM_LENGTH) != 0)) {
718                 fprintf(stderr, "RSDP has bad checksum v2\n");
719                 exit(1);
720         }
721
722         /* just leave a bunch of space for the xsdt. */
723         /* we need to zero the area since it has pointers. */
724         x = a;
725         a += sizeof(*x) + 8*sizeof(void *);
726         memset(x, 0, a - (void *)x);
727         fprintf(stderr, "install xsdt to %p\n", x);
728         *x = xsdt;
729         x->table_offset_entry[0] = 0;
730         x->table_offset_entry[1] = 0;
731         x->header.length = a - (void *)x;
732
733         f = a;
734         fprintf(stderr, "install fadt to %p\n", f);
735         *f = fadt;
736         x->table_offset_entry[2] = (uint64_t) f;
737         a += sizeof(*f);
738         f->header.length = a - (void *)f;
739         gencsum(&f->header.checksum, f, f->header.length);
740         if (acpi_tb_checksum((uint8_t *)f, f->header.length) != 0) {
741                 fprintf(stderr, "ffadt has bad checksum v2\n");
742                 exit(1);
743         }
744
745         m = a;
746         *m = madt;
747         x->table_offset_entry[3] = (uint64_t) m;
748         a += sizeof(*m);
749         fprintf(stderr, "install madt to %p\n", m);
750         memmove(a, &Apic0, sizeof(Apic0));
751         a += sizeof(Apic0);
752         memmove(a, &Apic1, sizeof(Apic1));
753         a += sizeof(Apic1);
754         memmove(a, &X2Apic0, sizeof(X2Apic0));
755         a += sizeof(X2Apic0);
756         memmove(a, &isor, sizeof(isor));
757         a += sizeof(isor);
758         m->header.length = a - (void *)m;
759         gencsum(&m->header.checksum, m, m->header.length);
760         if (acpi_tb_checksum((uint8_t *) m, m->header.length) != 0) {
761                 fprintf(stderr, "madt has bad checksum\n");
762                 exit(1);
763         }
764         fprintf(stderr, "allchecksums ok\n");
765
766         gencsum(&x->header.checksum, x, x->header.length);
767         if ((csum = acpi_tb_checksum((uint8_t *) x, x->header.length)) != 0) {
768                 fprintf(stderr, "XSDT has bad checksum; summed to %x\n", csum);
769                 exit(1);
770         }
771
772         hexdump(stdout, r, a-(void *)r);
773
774         a = (void *)(((unsigned long)a + 0xfff) & ~0xfff);
775         gpci.posted_irq_desc = a;
776         memset(a, 0, 4096);
777         a += 4096;
778         gpci.vapic_addr = a;
779         //vmctl.vapic = (uint64_t) a_page;
780         memset(a, 0, 4096);
781         ((uint32_t *)a)[0x30/4] = 0x01060014;
782         p64 = a;
783         // set up apic values? do we need to?
784         // qemu does this.
785         //((uint8_t *)a)[4] = 1;
786         a += 4096;
787         gpci.apic_addr = (void*)0xfee00000;
788
789         /* Allocate memory for, and zero the bootparams
790          * page before writing to it, or Linux thinks
791          * we're talking crazy.
792          */
793         a += 4096;
794         bp = a;
795         memset(bp, 0, 4096);
796
797         /* Set the kernel command line parameters */
798         a += 4096;
799         cmdline = a;
800         a += 4096;
801         bp->hdr.cmd_line_ptr = (uintptr_t) cmdline;
802         tsc_freq_khz = get_tsc_freq()/1000;
803         sprintf(cmdline, "%s tscfreq=%lld %s", cmdline_default, tsc_freq_khz,
804                 cmdline_extra);
805
806
807         /* Put the e820 memory region information in the boot_params */
808         bp->e820_entries = 3;
809         int e820i = 0;
810
811         bp->e820_map[e820i].addr = 0;
812         bp->e820_map[e820i].size = 16 * 1048576;
813         bp->e820_map[e820i++].type = E820_RESERVED;
814
815         bp->e820_map[e820i].addr = 16 * 1048576;
816         bp->e820_map[e820i].size = 128 * 1048576;
817         bp->e820_map[e820i++].type = E820_RAM;
818
819         bp->e820_map[e820i].addr = 0xf0000000;
820         bp->e820_map[e820i].size = 0x10000000;
821         bp->e820_map[e820i++].type = E820_RESERVED;
822
823         if (ros_syscall(SYS_vmm_setup, nr_gpcs, &gpci, vmmflags, 0, 0, 0) !=
824             nr_gpcs) {
825                 perror("Guest pcore setup failed");
826                 exit(1);
827         }
828
829         fprintf(stderr, "Run with %d cores and vmmflags 0x%x\n", nr_gpcs, vmmflags);
830         mcp = 1;
831         if (mcp) {
832                 my_retvals = malloc(sizeof(void*) * nr_threads);
833                 if (!my_retvals)
834                         perror("Init threads/malloc");
835
836                 pthread_can_vcore_request(FALSE);       /* 2LS won't manage vcores */
837                 pthread_need_tls(FALSE);
838                 pthread_mcp_init();                                     /* gives us one vcore */
839                 vcore_request(nr_threads - 1);          /* ghetto incremental interface */
840                 for (int i = 0; i < nr_threads; i++) {
841                         xp = __procinfo.vcoremap;
842                         fprintf(stderr, "%p\n", __procinfo.vcoremap);
843                         fprintf(stderr, "Vcore %d mapped to pcore %d\n", i,
844                                 __procinfo.vcoremap[i].pcoreid);
845                 }
846         }
847
848         ret = syscall(33, 1);
849         if (ret < 0) {
850                 perror("vm setup");
851                 exit(1);
852         }
853         ret = posix_memalign((void **)&p512, 4096, 3*4096);
854         fprintf(stderr, "memalign is %p\n", p512);
855         if (ret) {
856                 perror("ptp alloc");
857                 exit(1);
858         }
859         p1 = &p512[512];
860         p2m = &p512[1024];
861         uint64_t kernbase = 0; //0xffffffff80000000;
862         uint64_t highkernbase = 0xffffffff80000000;
863         p512[PML4(kernbase)] = (unsigned long long)p1 | 7;
864         p1[PML3(kernbase)] = /*0x87; */(unsigned long long)p2m | 7;
865         p512[PML4(highkernbase)] = (unsigned long long)p1 | 7;
866         p1[PML3(highkernbase)] = /*0x87; */(unsigned long long)p2m | 7;
867 #define _2MiB (0x200000)
868
869         for (i = 0; i < 512; i++) {
870                 p2m[PML2(kernbase + i * _2MiB)] = 0x87 | i * _2MiB;
871         }
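        /* The loop above identity-maps the first 1 GiB with 2 MiB pages
         * (0x87 = present | writable | user | page-size), and the PML4/PML3
         * entries set up just before it alias that mapping at both
         * guest-virtual 0 and the -2 GiB kernel address (highkernbase). */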
872
873         kernbase >>= (0+12);
874         kernbase <<= (0 + 12);
875         uint8_t *kernel = (void *)GKERNBASE;
876         //write_coreboot_table(coreboot_tables, ((void *)VIRTIOBASE) /*kernel*/, KERNSIZE + 1048576);
877         hexdump(stdout, coreboot_tables, 512);
878         fprintf(stderr, "kernbase for pml4 is 0x%llx and entry is %llx\n", kernbase, entry);
879         fprintf(stderr, "p512 %p p512[0] is 0x%llx p1 %p p1[0] is 0x%llx\n", p512, p512[0], p1, p1[0]);
880         vmctl.interrupt = 0;
881         vmctl.command = REG_RSP_RIP_CR3;
882         vmctl.cr3 = (uint64_t) p512;
883         vmctl.regs.tf_rip = entry;
884         vmctl.regs.tf_rsp = (uint64_t) &stack[1024];
885         vmctl.regs.tf_rsi = (uint64_t) bp;
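        /* Per the x86 Linux boot protocol, the 64-bit entry point expects %rsi
         * to hold the address of the boot_params struct; RIP, RSP, and CR3 are
         * whatever we choose to start the guest with. */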
886         if (mcp) {
887                 /* set up virtio bits, which depend on threads being enabled. */
888                 register_virtio_mmio(&vqdev, virtio_mmio_base);
889         }
890         fprintf(stderr, "threads started\n");
891         fprintf(stderr, "Writing command :%s:\n", cmd);
892
893         if (debug)
894                 vapic_status_dump(stderr, (void *)gpci.vapic_addr);
895
896         run_vmthread(&vmctl);
897
898         if (debug)
899                 vapic_status_dump(stderr, (void *)gpci.vapic_addr);
900
901         if (mcp) {
902                 /* Start up timer thread */
903                 if (pthread_create(&timerthread_struct, NULL, timer_thread, NULL)) {
904                         fprintf(stderr, "pth_create failed for timer thread.\n");
905                         perror("pth_create");
906                 }
907         }
908
909         vm_tf = &(vm_thread->uthread.u_ctx.tf.vm_tf);
910
911         while (1) {
912
913                 int c;
914                 uint8_t byte;
915                 //vmctl.command = REG_RIP;
916                 if (maxresume-- == 0) {
917                         debug = 1;
918                         resumeprompt = 1;
919                 }
920                 if (debug) {
921                         fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
922                                 vm_tf->tf_exit_reason);
923                         showstatus(stderr, (struct guest_thread*)vm_thread);
924                 }
925                 if (resumeprompt) {
926                         fprintf(stderr, "RESUME?\n");
927                         c = getchar();
928                         if (c == 'q')
929                                 break;
930                 }
931                 if (vm_tf->tf_exit_reason == EXIT_REASON_EPT_VIOLATION) {
932                         uint64_t gpa, *regp, val;
933                         uint8_t regx;
934                         int store, size;
935                         int advance;
936                         if (decode((struct guest_thread *) vm_thread, &gpa, &regx, &regp,
937                                    &store, &size, &advance)) {
938                                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
939                                         vm_tf->tf_exit_reason);
940                                 showstatus(stderr, (struct guest_thread*)vm_thread);
941                                 quit = 1;
942                                 break;
943                         }
944                         if (debug) fprintf(stderr, "%p %p %p %p %p %p\n", gpa, regx, regp, store, size, advance);
945                         if ((gpa & ~0xfffULL) == virtiobase) {
946                                 if (debug) fprintf(stderr, "DO SOME VIRTIO\n");
947                                 // Lucky for us the various virtio ops are well-defined.
948                                 virtio_mmio((struct guest_thread *)vm_thread, gpa, regx, regp,
949                                             store);
950                                 if (debug) fprintf(stderr, "store is %d:\n", store);
951                                 if (debug) fprintf(stderr, "REGP IS %016lx:\n", *regp);
952                         } else if ((gpa & 0xfee00000) == 0xfee00000) {
953                                 // until we fix our include mess, just put the proto here.
954                                 //int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
955                                 //apic(&vmctl, gpa, regx, regp, store);
956                         } else if ((gpa & 0xfec00000) == 0xfec00000) {
957                                 // until we fix our include mess, just put the proto here.
958                                 do_ioapic((struct guest_thread *)vm_thread, gpa, regx, regp,
959                                           store);
960                         } else if (gpa < 4096) {
961                                 uint64_t val = 0;
962                                 memmove(&val, &low4k[gpa], size);
963                                 hexdump(stdout, &low4k[gpa], size);
964                                 fprintf(stderr, "Low 1m, code %p read @ %p, size %d, val %p\n",
965                                         vm_tf->tf_rip, gpa, size, val);
966                                 memmove(regp, &low4k[gpa], size);
967                                 hexdump(stdout, regp, size);
968                         } else {
969                                 fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
970                                 fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
971                                         vm_tf->tf_exit_reason);
972                                 fprintf(stderr, "Returning 0xffffffff\n");
973                                 showstatus(stderr, (struct guest_thread*)vm_thread);
974                                 // Just fill the whole register for now.
975                                 *regp = (uint64_t) -1;
976                         }
977                         vm_tf->tf_rip += advance;
978                         if (debug)
979                                 fprintf(stderr, "Advance rip by %d bytes to %p\n",
980                                         advance, vm_tf->tf_rip);
981                         //vmctl.shutdown = 0;
982                         //vmctl.gpa = 0;
983                         //vmctl.command = REG_ALL;
984                 } else {
985                         switch (vm_tf->tf_exit_reason) {
986                         case  EXIT_REASON_VMCALL:
987                                 byte = vm_tf->tf_rdi;
988                                 printf("%c", byte);
989                                 if (byte == '\n') printf("%c", '%');
990                                 vm_tf->tf_rip += 3;
991                                 break;
992                         case EXIT_REASON_EXTERNAL_INTERRUPT:
993                                 //debug = 1;
994                                 if (debug)
995                                         fprintf(stderr, "XINT 0x%x 0x%x\n",
996                                                 vm_tf->tf_intrinfo1, vm_tf->tf_intrinfo2);
997                                 if (debug) pir_dump();
998                                 //vmctl.command = RESUME;
999                                 break;
1000                         case EXIT_REASON_IO_INSTRUCTION:
1001                                 fprintf(stderr, "IO @ %p\n", vm_tf->tf_rip);
1002                                 io((struct guest_thread *)vm_thread);
1003                                 //vmctl.shutdown = 0;
1004                                 //vmctl.gpa = 0;
1005                                 //vmctl.command = REG_ALL;
1006                                 break;
1007                         case EXIT_REASON_INTERRUPT_WINDOW:
1008                                 if (consdata) {
1009                                         if (debug) fprintf(stderr, "inject an interrupt\n");
1010                                         virtio_mmio_set_vring_irq();
1011                                         vm_tf->tf_trap_inject = 0x80000000 | virtioirq;
1012                                         //vmctl.command = RESUME;
1013                                         consdata = 0;
1014                                 }
1015                                 break;
1016                         case EXIT_REASON_MSR_WRITE:
1017                         case EXIT_REASON_MSR_READ:
1018                                 fprintf(stderr, "Do an msr\n");
1019                                 if (msrio((struct guest_thread *)vm_thread, &gpci,
1020                                           vm_tf->tf_exit_reason)) {
1021                                         // uh-oh, msrio failed
1022                                         // well, hand back a GP fault which is what Intel does
1023                                         fprintf(stderr, "MSR FAILED: RIP %p, shutdown 0x%x\n",
1024                                                 vm_tf->tf_rip, vm_tf->tf_exit_reason);
1025                                         showstatus(stderr, (struct guest_thread*)vm_thread);
1026
1027                                         // Use event injection through vmctl to send
1028                                         // a general protection fault
1029                                         // vmctl.interrupt gets written to the VM-Entry
1030                                         // Interruption-Information Field by vmx
1031                                         vm_tf->tf_trap_inject = VM_TRAP_VALID
1032                                                               | VM_TRAP_ERROR_CODE
1033                                                               | VM_TRAP_HARDWARE
1034                                                               | 13; // GPF
1035                                 } else {
1036                                         vm_tf->tf_rip += 2;
1037                                 }
1038                                 break;
1039                         case EXIT_REASON_MWAIT_INSTRUCTION:
1040                                 fflush(stdout);
1041                                 if (debug)fprintf(stderr, "\n================== Guest MWAIT. =======================\n");
1042                                 if (debug)fprintf(stderr, "Wait for cons data\n");
1043                                 while (!consdata)
1044                                         ;
1045                                 //debug = 1;
1046                                 if (debug)
1047                                         vapic_status_dump(stderr, gpci.vapic_addr);
1048                                 if (debug)fprintf(stderr, "Resume with consdata ...\n");
1049                                 vm_tf->tf_rip += 3;
1050                                 //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
1051                                 //showstatus(stderr, (struct guest_thread*)&vm_thread);
1052                                 break;
1053                         case EXIT_REASON_HLT:
1054                                 fflush(stdout);
1055                                 if (debug)fprintf(stderr, "\n================== Guest halted. =======================\n");
1056                                 if (debug)fprintf(stderr, "Wait for cons data\n");
1057                                 while (!consdata)
1058                                         ;
1059                                 //debug = 1;
1060                                 if (debug)fprintf(stderr, "Resume with consdata ...\n");
1061                                 vm_tf->tf_rip += 1;
1062                                 //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
1063                                 //showstatus(stderr, (struct guest_thread*)&vm_thread);
1064                                 break;
1065                         case EXIT_REASON_APIC_ACCESS:
1066                                 if (1 || debug)fprintf(stderr, "APIC READ EXIT\n");
1067
1068                                 uint64_t gpa, *regp, val;
1069                                 uint8_t regx;
1070                                 int store, size;
1071                                 int advance;
1072                                 if (decode((struct guest_thread *)vm_thread, &gpa, &regx,
1073                                            &regp, &store, &size, &advance)) {
1074                                         fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
1075                                                 vm_tf->tf_exit_reason);
1076                                         showstatus(stderr, (struct guest_thread*)vm_thread);
1077                                         quit = 1;
1078                                         break;
1079                                 }
1080
1081                                 int apic(struct guest_thread *vm_thread, uint64_t gpa,
1082                                          int destreg, uint64_t *regp, int store);
1083                                 apic((struct guest_thread *)vm_thread, gpa, regx, regp, store);
1084                                 vm_tf->tf_rip += advance;
1085                                 if (debug)
1086                                         fprintf(stderr, "Advance rip by %d bytes to %p\n",
1087                                                 advance, vm_tf->tf_rip);
1088                                 //vmctl.shutdown = 0;
1089                                 //vmctl.gpa = 0;
1090                                 //vmctl.command = REG_ALL;
1091                                 break;
1092                         case EXIT_REASON_APIC_WRITE:
1093                                 if (1 || debug)fprintf(stderr, "APIC WRITE EXIT\n");
1094                                 break;
1095                         default:
1096                                 fprintf(stderr, "Don't know how to handle exit %d\n",
1097                                         vm_tf->tf_exit_reason);
1098                                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
1099                                         vm_tf->tf_exit_reason);
1100                                 showstatus(stderr, (struct guest_thread*)vm_thread);
1101                                 quit = 1;
1102                                 break;
1103                         }
1104                 }
1105                 if (debug) fprintf(stderr, "at bottom of switch, quit is %d\n", quit);
1106                 if (quit)
1107                         break;
1108                 if (consdata) {
1109                         if (debug) fprintf(stderr, "inject an interrupt\n");
1110                         if (debug)
1111                                 fprintf(stderr, "XINT 0x%x 0x%x\n", vm_tf->tf_intrinfo1,
1112                                         vm_tf->tf_intrinfo2);
1113                         vm_tf->tf_trap_inject = 0x80000000 | virtioirq;
1114                         virtio_mmio_set_vring_irq();
1115                         consdata = 0;
1116                         //debug = 1;
1117                         //vmctl.command = RESUME;
1118                 }
1119                 if (debug) fprintf(stderr, "NOW DO A RESUME\n");
1120                 copy_vmtf_to_vmctl(vm_tf, &vmctl);
1121                 run_vmthread(&vmctl);
1122                 copy_vmctl_to_vmtf(&vmctl, vm_tf);
1123         }
1124
1125         /* later.
1126         for (int i = 0; i < nr_threads-1; i++) {
1127                 int ret;
1128                 if (pthread_join(my_threads[i], &my_retvals[i]))
1129                         perror("pth_join failed");
1130                 fprintf(stderr, "%d %d\n", i, ret);
1131         }
1132  */
1133
1134         fflush(stdout);
1135         exit(0);
1136 }