Updates from vmm-akaros
[akaros.git] / tests / vmm / vmrunkernel.c
1 #include <stdio.h>
2 #include <pthread.h>
3 #include <sys/types.h>
4 #include <sys/stat.h>
5 #include <fcntl.h>
6 #include <parlib/arch/arch.h>
7 #include <parlib/ros_debug.h>
8 #include <unistd.h>
9 #include <errno.h>
10 #include <dirent.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <ros/syscall.h>
14 #include <sys/mman.h>
15 #include <vmm/coreboot_tables.h>
16 #include <vmm/vmm.h>
17 #include <vmm/acpi/acpi.h>
18 #include <ros/arch/mmu.h>
19 #include <ros/vmm.h>
20 #include <parlib/uthread.h>
21 #include <vmm/linux_bootparam.h>
22 #include <vmm/virtio.h>
23 #include <vmm/virtio_mmio.h>
24 #include <vmm/virtio_ids.h>
25 #include <vmm/virtio_config.h>
26
27
28
29 void showstatus(FILE *f, struct vmctl *v);
30
31 int msrio(struct vmctl *vcpu, uint32_t opcode);
32
33 struct vmctl vmctl;
34 struct vmm_gpcore_init gpci;
35
36 /* Whoever holds the ball runs.  run_vm never actually grabs it - it is grabbed
37  * on its behalf. */
38 uth_mutex_t the_ball;
39 pthread_t vm_thread;
40 void (*old_thread_refl)(struct uthread *uth, struct user_context *ctx);
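/* Handoff protocol: the_ball is held on the VM thread's behalf whenever the
 * guest is running.  run_vmthread() grabs it before making the VM thread
 * runnable, then blocks on a second lock attempt until vmm_thread_refl_fault()
 * releases it at the next vmexit, at which point the exit state is copied back
 * into the vmctl for the controller loop in main(). */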
41
42 static void copy_vmtf_to_vmctl(struct vm_trapframe *vm_tf, struct vmctl *vmctl)
43 {
44         vmctl->cr3 = vm_tf->tf_cr3;
45         vmctl->gva = vm_tf->tf_guest_va;
46         vmctl->gpa = vm_tf->tf_guest_pa;
47         vmctl->exit_qual = vm_tf->tf_exit_qual;
48         if (vm_tf->tf_exit_reason == EXIT_REASON_EPT_VIOLATION)
49                 vmctl->shutdown = SHUTDOWN_EPT_VIOLATION;
50         else
51                 vmctl->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
52         vmctl->ret_code = vm_tf->tf_exit_reason;
53         vmctl->interrupt = vm_tf->tf_trap_inject;
54         vmctl->intrinfo1 = vm_tf->tf_intrinfo1;
55         vmctl->intrinfo2 = vm_tf->tf_intrinfo2;
56         /* Most of the HW TF.  Should be good enough for now */
57         vmctl->regs.tf_rax = vm_tf->tf_rax;
58         vmctl->regs.tf_rbx = vm_tf->tf_rbx;
59         vmctl->regs.tf_rcx = vm_tf->tf_rcx;
60         vmctl->regs.tf_rdx = vm_tf->tf_rdx;
61         vmctl->regs.tf_rbp = vm_tf->tf_rbp;
62         vmctl->regs.tf_rsi = vm_tf->tf_rsi;
63         vmctl->regs.tf_rdi = vm_tf->tf_rdi;
64         vmctl->regs.tf_r8  = vm_tf->tf_r8;
65         vmctl->regs.tf_r9  = vm_tf->tf_r9;
66         vmctl->regs.tf_r10 = vm_tf->tf_r10;
67         vmctl->regs.tf_r11 = vm_tf->tf_r11;
68         vmctl->regs.tf_r12 = vm_tf->tf_r12;
69         vmctl->regs.tf_r13 = vm_tf->tf_r13;
70         vmctl->regs.tf_r14 = vm_tf->tf_r14;
71         vmctl->regs.tf_r15 = vm_tf->tf_r15;
72         vmctl->regs.tf_rip = vm_tf->tf_rip;
73         vmctl->regs.tf_rflags = vm_tf->tf_rflags;
74         vmctl->regs.tf_rsp = vm_tf->tf_rsp;
75 }
76
77 static void copy_vmctl_to_vmtf(struct vmctl *vmctl, struct vm_trapframe *vm_tf)
78 {
79         vm_tf->tf_rax = vmctl->regs.tf_rax;
80         vm_tf->tf_rbx = vmctl->regs.tf_rbx;
81         vm_tf->tf_rcx = vmctl->regs.tf_rcx;
82         vm_tf->tf_rdx = vmctl->regs.tf_rdx;
83         vm_tf->tf_rbp = vmctl->regs.tf_rbp;
84         vm_tf->tf_rsi = vmctl->regs.tf_rsi;
85         vm_tf->tf_rdi = vmctl->regs.tf_rdi;
86         vm_tf->tf_r8  = vmctl->regs.tf_r8;
87         vm_tf->tf_r9  = vmctl->regs.tf_r9;
88         vm_tf->tf_r10 = vmctl->regs.tf_r10;
89         vm_tf->tf_r11 = vmctl->regs.tf_r11;
90         vm_tf->tf_r12 = vmctl->regs.tf_r12;
91         vm_tf->tf_r13 = vmctl->regs.tf_r13;
92         vm_tf->tf_r14 = vmctl->regs.tf_r14;
93         vm_tf->tf_r15 = vmctl->regs.tf_r15;
94         vm_tf->tf_rip = vmctl->regs.tf_rip;
95         vm_tf->tf_rflags = vmctl->regs.tf_rflags;
96         vm_tf->tf_rsp = vmctl->regs.tf_rsp;
97         vm_tf->tf_cr3 = vmctl->cr3;
98         vm_tf->tf_trap_inject = vmctl->interrupt;
99         /* Don't care about the rest of the fields.  The kernel only writes them */
100 }
101
102 /* callback, runs in vcore context.  this sets up our initial context.  once we
103  * become runnable again, we'll run the first bits of the vm ctx.  after that,
104  * our context will be stopped and started and will just run whatever the guest
105  * VM wants.  we'll never come back to this code or to run_vm(). */
106 static void __build_vm_ctx_cb(struct uthread *uth, void *arg)
107 {
108         struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
109         struct vmctl *vmctl = (struct vmctl*)arg;
110         struct vm_trapframe *vm_tf;
111
112         __pthread_generic_yield(pthread);
113         pthread->state = PTH_BLK_YIELDING;
114
115         memset(&uth->u_ctx, 0, sizeof(struct user_context));
116         uth->u_ctx.type = ROS_VM_CTX;
117         vm_tf = &uth->u_ctx.tf.vm_tf;
118
119         vm_tf->tf_guest_pcoreid = 0;    /* assuming only 1 guest core */
120
121         copy_vmctl_to_vmtf(vmctl, vm_tf);
122
123         /* other HW/GP regs are 0, which should be fine.  the FP state is still
124          * whatever we were running before, though this is pretty much unnecessary.
125          * we mostly don't want crazy crap in the uth->as, and a non-current_uthread
126          * VM ctx is supposed to have something in its FP state (like HW ctxs). */
127         save_fp_state(&uth->as);
128         uth->flags |= UTHREAD_FPSAVED | UTHREAD_SAVED;
129
130         uthread_runnable(uth);
131 }
132
133 static void *run_vm(void *arg)
134 {
135         struct vmctl *vmctl = (struct vmctl*)arg;
136
137         assert(vmctl->command == REG_RSP_RIP_CR3);
138         /* We need to hack our context, so that next time we run, we're a VM ctx */
139         uthread_yield(FALSE, __build_vm_ctx_cb, arg);
140 }
141
142 static void vmm_thread_refl_fault(struct uthread *uth,
143                                   struct user_context *ctx)
144 {
145         struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
146
147         /* Hack to call the original pth 2LS op */
148         if (ctx->type != ROS_VM_CTX) {
149                 old_thread_refl(uth, ctx);
150                 return;
151         }
152         __pthread_generic_yield(pthread);
153         /* normally we'd handle the vmexit here.  to work within the existing
154          * framework, we just wake the controller thread.  It'll look at our ctx
155          * then make us runnable again */
156         pthread->state = PTH_BLK_MUTEX;
157         uth_mutex_unlock(the_ball);             /* wake the run_vmthread */
158 }
159
160
161
162 /* this will start the vm thread, and return when the thread has blocked,
163  * with the right info in vmctl. */
164 static void run_vmthread(struct vmctl *vmctl)
165 {
166         struct vm_trapframe *vm_tf;
167
168         if (!vm_thread) {
169                 /* first time through, we make the vm thread.  the_ball was already
170                  * grabbed right after it was alloc'd. */
171                 if (pthread_create(&vm_thread, NULL, run_vm, vmctl)) {
172                         perror("pth_create");
173                         exit(-1);
174                 }
175                 /* hack in our own handlers for some 2LS ops */
176                 old_thread_refl = sched_ops->thread_refl_fault;
177                 sched_ops->thread_refl_fault = vmm_thread_refl_fault;
178         } else {
179                 copy_vmctl_to_vmtf(vmctl, &vm_thread->uthread.u_ctx.tf.vm_tf);
180                 uth_mutex_lock(the_ball);       /* grab it for the vm_thread */
181                 uthread_runnable((struct uthread*)vm_thread);
182         }
183         uth_mutex_lock(the_ball);
184         /* We woke due to a vm exit.  Need to unlock for the next time we're run */
185         uth_mutex_unlock(the_ball);
186         /* the vm stopped.  we can do whatever we want before rerunning it.  since
187          * we're controlling the uth, we need to handle its vmexits.  we'll fill in
188          * the vmctl, since that's the current framework. */
189         copy_vmtf_to_vmctl(&vm_thread->uthread.u_ctx.tf.vm_tf, vmctl);
190 }
191
192 /* By 1999, you could just scan the hardware
193  * and work it out. But by 2005, that was no longer possible. How sad.
194  * So we have to fake ACPI to make it all work.
195  * This will be copied to memory at 0xe0000, so the kernel can find it.
196  */
197
198 /* assume they're all 256 bytes long just to make it easy.
199  * Just have pointers that point to aligned things.
200  */
201
202 struct acpi_table_rsdp rsdp = {
203         .signature = "RSD PTR ",
204         .oem_id = "AKAROS",
205         .revision = 2,
206         .length = 36,
207 };
208
209 struct acpi_table_xsdt xsdt = {
210         .header = {
211                 .signature= "XSDT",
212                 // This is so stupid. Incredibly stupid.
213                 .revision = 0,
214                 .oem_id = "AKAROS",
215                 .oem_table_id = "ALPHABET",
216                 .oem_revision = 0,
217                 .asl_compiler_id = "RON ",
218                 .asl_compiler_revision = 0,
219         },
220 };
221 struct acpi_table_fadt fadt = {
222         .header = {
223                 .signature= "FACP",     // the FADT's table signature is "FACP"
224                 // This is so stupid. Incredibly stupid.
225                 .revision = 0,
226                 .oem_id = "AKAROS",
227                 .oem_table_id = "ALPHABET",
228                 .oem_revision = 0,
229                 .asl_compiler_id = "RON ",
230                 .asl_compiler_revision = 0,
231         },
232 };
233
234 /* This has to be dropped into memory, then the other crap just follows it.
235  */
236 struct acpi_table_madt madt = {
237         .header = {
238                 .signature = "APIC",
239                 .revision = 0,
240                 .oem_id = "AKAROS",
241                 .oem_table_id = "ALPHABET",
242                 .oem_revision = 0,
243                 .asl_compiler_id = "RON ",
244                 .asl_compiler_revision = 0,
245         },
246
247         .address = 0xfee00000ULL,
248 };
249
250 struct acpi_madt_local_apic Apic0 = {.header = {.type = ACPI_MADT_TYPE_LOCAL_APIC, .length = sizeof(struct acpi_madt_local_apic)},
251                                      .processor_id = 0, .id = 0};
252 struct acpi_madt_io_apic Apic1 = {.header = {.type = ACPI_MADT_TYPE_IO_APIC, .length = sizeof(struct acpi_madt_io_apic)},
253                                   .id = 1, .address = 0xfec00000, .global_irq_base = 0};
254 struct acpi_madt_interrupt_override isor[] = {
255         /* I have no idea if it should be source irq 2, global 0, or global 2, source 0. Shit. */
256         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
257          .bus = 0, .source_irq = 2, .global_irq = 0, .inti_flags = 0},
258         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
259          .bus = 0, .source_irq = 1, .global_irq = 1, .inti_flags = 0},
260         //{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
261          //.bus = 0, .source_irq = 2, .global_irq = 2, .inti_flags = 0},
262         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
263          .bus = 0, .source_irq = 3, .global_irq = 3, .inti_flags = 0},
264         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
265          .bus = 0, .source_irq = 4, .global_irq = 4, .inti_flags = 0},
266         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
267          .bus = 0, .source_irq = 5, .global_irq = 5, .inti_flags = 0},
268         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
269          .bus = 0, .source_irq = 6, .global_irq = 6, .inti_flags = 0},
270         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
271          .bus = 0, .source_irq = 7, .global_irq = 7, .inti_flags = 0},
272         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
273          .bus = 0, .source_irq = 8, .global_irq = 8, .inti_flags = 0},
274         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
275          .bus = 0, .source_irq = 9, .global_irq = 9, .inti_flags = 0},
276         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
277          .bus = 0, .source_irq = 10, .global_irq = 10, .inti_flags = 0},
278         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
279          .bus = 0, .source_irq = 11, .global_irq = 11, .inti_flags = 0},
280         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
281          .bus = 0, .source_irq = 12, .global_irq = 12, .inti_flags = 0},
282         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
283          .bus = 0, .source_irq = 13, .global_irq = 13, .inti_flags = 0},
284         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
285          .bus = 0, .source_irq = 14, .global_irq = 14, .inti_flags = 0},
286         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
287          .bus = 0, .source_irq = 15, .global_irq = 15, .inti_flags = 0},
288         // VMMCP routes irq 32 to gsi 17
289         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
290          .bus = 0, .source_irq = 32, .global_irq = 17, .inti_flags = 5},
291 };
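/* These tables are copied into the low 1 MiB by main(): the RSDP lands at
 * 0xe0000 (where the guest kernel scans for it) and points at the XSDT, whose
 * entries point at the FADT and MADT; the MADT is followed by the local APIC,
 * the IO-APIC, and the interrupt source overrides above.  Lengths and
 * checksums are filled in once everything is laid out. */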
292
293
294 /* this test will run the "kernel" in the negative address space. We hope. */
295 void *low1m;
296 uint8_t low4k[4096];
297 unsigned long long stack[1024];
298 volatile int shared = 0;
299 volatile int quit = 0;
300 int mcp = 1;
301 int virtioirq = 17;
302
303 /* total hack. If the vm runs away we want to get control again. */
304 unsigned int maxresume = (unsigned int) -1;
305
306 #define MiB 0x100000u
307 #define GiB (1u<<30)
308 #define GKERNBASE (16*MiB)
309 #define KERNSIZE (128*MiB+GKERNBASE)
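/* Guest "physical" memory from GKERNBASE up is backed directly by this static
 * array (guest-physical addresses equal host-virtual addresses in this setup),
 * which is why main() checks that _kernel is linked at or below GKERNBASE
 * instead of mmap()ing guest RAM. */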
310 uint8_t _kernel[KERNSIZE];
311
312 unsigned long long *p512, *p1, *p2m;
313
314 void **my_retvals;
315 int nr_threads = 4;
316 int debug = 0;
317 int resumeprompt = 0;
318 /* unlike Linux, this shared struct is for both host and guest. */
319 //      struct virtqueue *constoguest =
320 //              vring_new_virtqueue(0, 512, 8192, 0, inpages, NULL, NULL, "test");
321 uint64_t virtio_mmio_base = 0x100000000ULL;
322
323 void vapic_status_dump(FILE *f, void *vapic);
324 static void set_posted_interrupt(int vector);
325
326 #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
327 #error "Get a gcc newer than 4.1.0"
328 #else
329 #define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
330 #endif
331
332 #define LOCK_PREFIX "lock "
333 #define ADDR                            BITOP_ADDR(addr)
334 static inline int test_and_set_bit(int nr, volatile unsigned long *addr);
335
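/* Crude guest timer: post vector 0xef (which matches Linux's local APIC timer
 * vector) and poke the guest core.  This thread is only started from the
 * currently commented-out path in consin(). */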
336 void *timer_thread(void *arg)
337 {
338         while (1) {
339                 set_posted_interrupt(0xef);
340                 ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
341                 uthread_usleep(1);
342         }
343 }
344
345 void *consout(void *arg)
346 {
347         char *line, *consline, *outline;
348         static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
349         static struct scatterlist in[] = { {NULL, sizeof(line)}, };
350         static struct scatterlist iov[32];
351         struct virtio_threadarg *a = arg;
352         static unsigned int inlen, outlen, conslen;
353         struct virtqueue *v = a->arg->virtio;
354         fprintf(stderr, "talk thread ..\n");
355         uint16_t head, gaveit = 0, gotitback = 0;
356         uint32_t vv;
357         int i;
358         int num;
359
360         if (debug) {
361                 fprintf(stderr, "----------------------- TT a %p\n", a);
362                 fprintf(stderr, "talk thread ttargs %p v %p\n", a, v);
363         }
364
365         for(num = 0;;num++) {
366                 //int debug = 1;
367                 /* host: use any buffers we should have been sent. */
368                 head = wait_for_vq_desc(v, iov, &outlen, &inlen);
369                 if (debug)
370                         fprintf(stderr, "CCC: vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
371                 for(i = 0; debug && i < outlen + inlen; i++)
372                         fprintf(stderr, "CCC: v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
373                 /* host: if we got an output buffer, just output it. */
374                 for(i = 0; i < outlen; i++) {
375                         num++;
376                         int j;
377                         if (debug) {
378                                 fprintf(stderr, "CCC: IOV length is %d\n", iov[i].length);
379                         }
380                         for (j = 0; j < iov[i].length; j++)
381                                 printf("%c", ((char *)iov[i].v)[j]);
382                 }
383                 fflush(stdout);
384                 if (debug)
385                         fprintf(stderr, "CCC: outlen is %d; inlen is %d\n", outlen, inlen);
386                 /* host: fill in the writeable buffers. */
387                 /* why we're getting these I don't know. */
388                 for (i = outlen; i < outlen + inlen; i++) {
389                         if (debug) fprintf(stderr, "CCC: send back empty writeable");
390                         iov[i].length = 0;
391                 }
392                 if (debug) fprintf(stderr, "CCC: call add_used\n");
393                 /* host: now ack that we used them all. */
394                 add_used(v, head, outlen+inlen);
395                 if (debug) fprintf(stderr, "CCC: DONE call add_used\n");
396         }
397         fprintf(stderr, "All done\n");
398         return NULL;
399 }
400
401 // FIXME.
402 volatile int consdata = 0;
403
404 void *consin(void *arg)
405 {
406         struct virtio_threadarg *a = arg;
407         char *line, *outline;
408         static char consline[128];
409         static struct scatterlist iov[32];
410         static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
411         static struct scatterlist in[] = { {NULL, sizeof(line)}, };
412
413         static unsigned int inlen, outlen, conslen;
414         struct virtqueue *v = a->arg->virtio;
415         fprintf(stderr, "consin thread ..\n");
416         uint16_t head, gaveit = 0, gotitback = 0;
417         uint32_t vv;
418         int i;
419         int num;
420         //char c[1];
421         int timer_started = 0;
422         pthread_t timerthread_struct;
423
424         if (debug) fprintf(stderr, "Spin on console being read, print num queues, halt\n");
425
426         for(num = 0;! quit;num++) {
427                 //int debug = 1;
428                 /* host: use any buffers we should have been sent. */
429                 head = wait_for_vq_desc(v, iov, &outlen, &inlen);
430                 if (debug)
431                         fprintf(stderr, "vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
432                 for(i = 0; debug && i < outlen + inlen; i++)
433                         fprintf(stderr, "v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
434                 if (debug)
435                         fprintf(stderr, "outlen is %d; inlen is %d\n", outlen, inlen);
436                 /* host: fill in the writeable buffers. */
437                 for (i = outlen; i < outlen + inlen; i++) {
438                         /* host: read one byte at a time from stdin. */
439                         memset(consline, 0, 128);
440                         if (read(0, consline, 1) < 0) {
441                                 exit(0);
442                         }
443                         if (debug) fprintf(stderr, "CONSIN: GOT A LINE:%s:\n", consline);
444                         if (debug) fprintf(stderr, "CONSIN: OUTLEN:%d:\n", outlen);
445                         if (strlen(consline) < 3 && consline[0] == 'q' ) {
446                                 quit = 1;
447                                 break;
448                         }
449
450                         memmove(iov[i].v, consline, strlen(consline)+ 1);
451                         iov[i].length = strlen(consline) + 1;
452                 }
453                 if (debug) fprintf(stderr, "call add_used\n");
454                 /* host: now ack that we used them all. */
455                 add_used(v, head, outlen+inlen);
456                 /* turn off consdata - the IRQ injection isn't right */
457                 //consdata = 1;
458                 if (debug) fprintf(stderr, "DONE call add_used\n");
459
460                 // Send spurious for testing (Gan)
461                 set_posted_interrupt(0xE5);
462                 virtio_mmio_set_vring_irq();
463
464                 ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
465                 /*if (!timer_started && mcp) {
466                         // Start up timer thread
467                         if (pthread_create(&timerthread_struct, NULL, timer_thread, NULL)) {
468                                 fprintf(stderr, "pth_create failed for timer thread.");
469                                 perror("pth_create");
470                         } else {
471                                 timer_started = 1;
472                         }
473                 }*/
474         }
475         fprintf(stderr, "All done\n");
476         return NULL;
477 }
478
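/* A single virtio-mmio console device with two queues: consin is the guest's
 * receive queue (the host fills its buffers from stdin; a line starting with
 * 'q' quits), and consout is the transmit queue (the host prints its buffers
 * to stdout).  It is registered at virtio_mmio_base in main(), and the guest
 * is notified of new data via the posted-interrupt/virtioirq path. */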
479 static struct vqdev vqdev= {
480 name: "console",
481 dev: VIRTIO_ID_CONSOLE,
482 device_features: 0, /* Can't do it: linux console device does not support it. VIRTIO_F_VERSION_1*/
483 numvqs: 2,
484 vqs: {
485                 {name: "consin", maxqnum: 64, f: consin, arg: (void *)0},
486                 {name: "consout", maxqnum: 64, f: consout, arg: (void *)0},
487         }
488 };
489
490 void lowmem() {
491         __asm__ __volatile__ (".section .lowmem, \"aw\"\n\tlow: \n\t.=0x1000\n\t.align 0x100000\n\t.previous\n");
492 }
493
494 static uint8_t acpi_tb_checksum(uint8_t *buffer, uint32_t length)
495 {
496         uint8_t sum = 0;
497         uint8_t *end = buffer + length;
498         fprintf(stderr, "tbchecksum %p for %d", buffer, length);
499         while (buffer < end) {
500                 if (end - buffer < 2)
501                         fprintf(stderr, "%02x\n", sum);
502                 sum = (uint8_t)(sum + *(buffer++));
503         }
504         fprintf(stderr, " is %02x\n", sum);
505         return (sum);
506 }
507
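/* ACPI checksums are defined so that all bytes of a table sum to zero mod 256.
 * gencsum() zeroes the checksum byte (it lives inside the summed region), sums
 * the rest, and stores the two's complement.  For example, if the other bytes
 * sum to 0x37, the stored value is 0xc9: 0x37 + 0xc9 == 0x100 == 0 (mod 256).
 */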
508 static void gencsum(uint8_t *target, void *data, int len)
509 {
510         uint8_t csum;
511         // blast target to zero so it does not get counted
512         // (it might be in the struct we checksum) And, yes, it is, goodness.
513         fprintf(stderr, "gencsum %p target %p source %d bytes\n", target, data, len);
514         *target = 0;
515         csum  = acpi_tb_checksum((uint8_t *)data, len);
516         *target = ~csum + 1;
517         fprintf(stderr, "Computed is %02x\n", *target);
518 }
519
520 static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
521 {
522         int oldbit;
523
524         asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
525                      "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
526
527         return oldbit;
528 }
529
530 static void pir_dump()
531 {
532         unsigned long *pir_ptr = gpci.posted_irq_desc;
533         int i;
534         fprintf(stderr, "-------Begin PIR dump-------\n");
535         for (i = 0; i < 8; i++){
536                 fprintf(stderr, "Word %d: 0x%016lx\n", i, pir_ptr[i]);
537         }
538         fprintf(stderr, "-------End PIR dump-------\n");
539 }
540
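/* Posted-interrupt delivery: set the vector's bit in the posted-interrupt
 * descriptor's request bitmap, then set the outstanding-notification bit.
 * Callers follow this with SYS_vmm_poke_guest so the kernel sends the
 * notification and the guest core picks the vector up from the descriptor. */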
541 static void set_posted_interrupt(int vector)
542 {
543         test_and_set_bit(vector, gpci.posted_irq_desc);
544         /* LOCKed instruction provides the mb() */
545         test_and_set_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpci.posted_irq_desc);
546 }
547
548 int main(int argc, char **argv)
549 {
550         struct boot_params *bp;
551         char *cmdline;
552         uint64_t *p64;
553         void *a = (void *)0xe0000;
554         struct acpi_table_rsdp *r;
555         struct acpi_table_fadt *f;
556         struct acpi_table_madt *m;
557         struct acpi_table_xsdt *x;
558         uint64_t virtiobase = 0x100000000ULL;
559         // lowmem is a bump allocated pointer to 2M at the "physbase" of memory
560         void *lowmem = (void *) 0x1000000;
561         //struct vmctl vmctl;
562         int amt;
563         int vmmflags = 0; // Disabled probably forever. VMM_VMCALL_PRINTF;
564         uint64_t entry = 0x1200000, kerneladdress = 0x1200000;
565         int nr_gpcs = 1;
566         int ret;
567         void * xp;
568         int kfd = -1;
569         static char cmd[512];
570         int i;
571         uint8_t csum;
572         void *coreboot_tables = (void *) 0x1165000;
573         void *a_page;
574
575         the_ball = uth_mutex_alloc();
576         uth_mutex_lock(the_ball);
577
578         fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT,
579                         PML1_PTE_REACH);
580
581
582         // mmap is not working for us at present.
583         if ((uint64_t)_kernel > GKERNBASE) {
584                 fprintf(stderr, "kernel array @%p is above GKERNBASE @%p, sucks\n", _kernel, GKERNBASE);
585                 exit(1);
586         }
587         memset(_kernel, 0, sizeof(_kernel));
588         memset(lowmem, 0xff, 2*1048576);
589         memset(low4k, 0xff, 4096);
590         // avoid at all costs, requires too much instruction emulation.
591         //low4k[0x40e] = 0;
592         //low4k[0x40f] = 0xe0;
593
594         //Place mmap(Gan)
595         a_page = mmap((void *)0xfee00000, PGSIZE, PROT_READ | PROT_WRITE,
596                               MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
597         fprintf(stderr, "a_page mmap pointer %p\n", a_page);
598
599         if (a_page == (void *) -1) {
600                 perror("Could not mmap APIC");
601                 exit(1);
602         }
603         if (((uint64_t)a_page & 0xfff) != 0) {
604                 perror("APIC page mapping is not page aligned");
605                 exit(1);
606         }
607
608         memset(a_page, 0, 4096);
609         //((uint32_t *)a_page)[0x30/4] = 0x01060015;
610         ((uint32_t *)a_page)[0x30/4] = 0xDEADBEEF;
611
612
613         argc--,argv++;
614         // switches ...
615         // Sorry, I don't much like the gnu opt parsing code.
616         while (1) {
617                 if (*argv[0] != '-')
618                         break;
619                 switch(argv[0][1]) {
620                 case 'd':
621                         debug++;
622                         break;
623                 case 'v':
624                         vmmflags |= VMM_VMCALL_PRINTF;
625                         break;
626                 case 'm':
627                         argc--,argv++;
628                         maxresume = strtoull(argv[0], 0, 0);
629                         break;
630                 case 'i':
631                         argc--,argv++;
632                         virtioirq = strtoull(argv[0], 0, 0);
633                         break;
634                 default:
635                         fprintf(stderr, "BMAFR\n");
636                         break;
637                 }
638                 argc--,argv++;
639         }
640         if (argc < 1) {
641                 fprintf(stderr, "Usage: %s [-d] [-v] [-m maxresume] [-i virtioirq] vmimage [coreboot_tables [loadaddress [entrypoint]]]\n", argv[0]);
642                 exit(1);
643         }
644         if (argc > 1)
645                 coreboot_tables = (void *) strtoull(argv[1], 0, 0);
646         if (argc > 2)
647                 kerneladdress = strtoull(argv[2], 0, 0);
648         if (argc > 3)
649                 entry = strtoull(argv[3], 0, 0);
650         kfd = open(argv[0], O_RDONLY);
651         if (kfd < 0) {
652                 perror(argv[0]);
653                 exit(1);
654         }
655         // read in the kernel.
656         xp = (void *)kerneladdress;
657         for(;;) {
658                 amt = read(kfd, xp, 1048576);
659                 if (amt < 0) {
660                         perror("read");
661                         exit(1);
662                 }
663                 if (amt == 0) {
664                         break;
665                 }
666                 xp += amt;
667         }
668         fprintf(stderr, "Read in %d bytes\n", xp-kerneladdress);
669         close(kfd);
670
671         // The low 1m so we can fill in bullshit like ACPI.
672         // And, sorry, due to the STUPID format of the RSDP for now we need the low 1M.
673         low1m = mmap((int*)4096, MiB-4096, PROT_READ | PROT_WRITE,
674                          MAP_ANONYMOUS, -1, 0);
675         if (low1m != (void *)4096) {
676                 perror("Unable to mmap low 1m");
677                 exit(1);
678         }
679         memset(low1m, 0xff, MiB-4096);
680         r = a;
681         fprintf(stderr, "install rsdp to %p\n", r);
682         *r = rsdp;
683         a += sizeof(*r);
684         memmove(&r->xsdt_physical_address, &a, sizeof(a));
685         gencsum(&r->checksum, r, ACPI_RSDP_CHECKSUM_LENGTH);
686         if ((csum = acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_CHECKSUM_LENGTH)) != 0) {
687                 fprintf(stderr, "RSDP has bad checksum; summed to %x\n", csum);
688                 exit(1);
689         }
690
691         /* Check extended checksum if table version >= 2 */
692         gencsum(&r->extended_checksum, r, ACPI_RSDP_XCHECKSUM_LENGTH);
693         if ((rsdp.revision >= 2) &&
694             (acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_XCHECKSUM_LENGTH) != 0)) {
695                 fprintf(stderr, "RSDP has bad checksum v2\n");
696                 exit(1);
697         }
698
699         /* just leave a bunch of space for the xsdt. */
700         /* we need to zero the area since it has pointers. */
701         x = a;
702         a += sizeof(*x) + 8*sizeof(void *);
703         memset(x, 0, a - (void *)x);
704         fprintf(stderr, "install xsdt to %p\n", x);
705         *x = xsdt;
706         x->table_offset_entry[0] = 0;
707         x->table_offset_entry[1] = 0;
708         x->header.length = a - (void *)x;
709
710         f = a;
711         fprintf(stderr, "install fadt to %p\n", f);
712         *f = fadt;
713         x->table_offset_entry[2] = (uint64_t) f;
714         a += sizeof(*f);
715         f->header.length = a - (void *)f;
716         gencsum(&f->header.checksum, f, f->header.length);
717         if (acpi_tb_checksum((uint8_t *)f, f->header.length) != 0) {
718                 fprintf(stderr, "fadt has bad checksum\n");
719                 exit(1);
720         }
721
722         m = a;
723         *m = madt;
724         x->table_offset_entry[3] = (uint64_t) m;
725         a += sizeof(*m);
726         fprintf(stderr, "install madt to %p\n", m);
727         memmove(a, &Apic0, sizeof(Apic0));
728         a += sizeof(Apic0);
729         memmove(a, &Apic1, sizeof(Apic1));
730         a += sizeof(Apic1);
731         memmove(a, &isor, sizeof(isor));
732         a += sizeof(isor);
733         m->header.length = a - (void *)m;
734         gencsum(&m->header.checksum, m, m->header.length);
735         if (acpi_tb_checksum((uint8_t *) m, m->header.length) != 0) {
736                 fprintf(stderr, "madt has bad checksum\n");
737                 exit(1);
738         }
739         fprintf(stderr, "all checksums ok\n");
740
741         gencsum(&x->header.checksum, x, x->header.length);
742         if ((csum = acpi_tb_checksum((uint8_t *) x, x->header.length)) != 0) {
743                 fprintf(stderr, "XSDT has bad checksum; summed to %x\n", csum);
744                 exit(1);
745         }
746
747         hexdump(stdout, r, a-(void *)r);
748
749         a = (void *)(((unsigned long)a + 0xfff) & ~0xfff);
750         gpci.posted_irq_desc = a;
751         memset(a, 0, 4096);
752         a += 4096;
753         gpci.vapic_addr = a;
754         //vmctl.vapic = (uint64_t) a_page;
755         memset(a, 0, 4096);
756         ((uint32_t *)a)[0x30/4] = 0x01060014;
757         p64 = a;
758         // set up apic values? do we need to?
759         // qemu does this.
760         //((uint8_t *)a)[4] = 1;
761         a += 4096;
762         gpci.apic_addr = (void*)0xfee00000;
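        /* At this point gpci describes the guest core: posted_irq_desc is the
         * page used as the posted-interrupt descriptor, vapic_addr is the
         * virtual-APIC page (offset 0x30 is the LAPIC version register), and
         * apic_addr is the guest-physical LAPIC base. */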
763
764         /* Allocate memory for, and zero the bootparams
765          * page before writing to it, or Linux thinks
766          * we're talking crazy.
767          */
768         a += 4096;
769         bp = a;
770         memset(bp, 0, 4096);
771
772         /* Set the kernel command line parameters */
773         a += 4096;
774         cmdline = a;
775         a += 4096;
776         bp->hdr.cmd_line_ptr = (uintptr_t) cmdline;
777         sprintf(cmdline, "earlyprintk=vmcall,keep"
778                              " console=hvc0"
779                              " virtio_mmio.device=1M@0x100000000:32"
780                              " nosmp"
781                              " maxcpus=1"
782                              " acpi.debug_layer=0x2"
783                              " acpi.debug_level=0xffffffff"
784                              " apic=debug"
785                              " noexec=off"
786                              " nohlt"
787                              " init=/bin/sh"
788                              " lapic=notscdeadline"
789                              " lapictimerfreq=1000"
790                              " pit=none");
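        /* Notes on the guest command line: earlyprintk=vmcall,keep turns early
         * printk into vmcalls, handled either in the kernel (-v, i.e.
         * VMM_VMCALL_PRINTF) or by the EXIT_REASON_VMCALL case below;
         * console=hvc0 is the virtio console; virtio_mmio.device=1M@...:32
         * registers a 1 MiB virtio-mmio window at virtio_mmio_base on irq 32,
         * which the MADT override routes to GSI 17 (the default virtioirq);
         * nosmp/maxcpus=1 match the single guest core. */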
791
792
793         /* Put the e820 memory region information in the boot_params */
794         bp->e820_entries = 3;
795         int e820i = 0;
796
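        /* Three fake e820 entries: the first 16 MiB (real-mode/BIOS area plus
         * the fake ACPI and APIC pages) is reserved, the 128 MiB starting at
         * GKERNBASE and backed by _kernel is RAM, and 0xf0000000-0xffffffff is
         * reserved for the LAPIC/IOAPIC MMIO hole. */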
797         bp->e820_map[e820i].addr = 0;
798         bp->e820_map[e820i].size = 16 * 1048576;
799         bp->e820_map[e820i++].type = E820_RESERVED;
800
801         bp->e820_map[e820i].addr = 16 * 1048576;
802         bp->e820_map[e820i].size = 128 * 1048576;
803         bp->e820_map[e820i++].type = E820_RAM;
804
805         bp->e820_map[e820i].addr = 0xf0000000;
806         bp->e820_map[e820i].size = 0x10000000;
807         bp->e820_map[e820i++].type = E820_RESERVED;
808
809         if (ros_syscall(SYS_vmm_setup, nr_gpcs, &gpci, vmmflags, 0, 0, 0) !=
810             nr_gpcs) {
811                 perror("Guest pcore setup failed");
812                 exit(1);
813         }
814
815         fprintf(stderr, "Run with %d cores and vmmflags 0x%x\n", nr_gpcs, vmmflags);
816         mcp = 1;
817         if (mcp) {
818                 my_retvals = malloc(sizeof(void*) * nr_threads);
819                 if (!my_retvals)
820                         perror("Init threads/malloc");
821
822                 pthread_can_vcore_request(FALSE);       /* 2LS won't manage vcores */
823                 pthread_need_tls(FALSE);
824                 pthread_mcp_init();                                     /* gives us one vcore */
825                 vcore_request(nr_threads - 1);          /* ghetto incremental interface */
826                 for (int i = 0; i < nr_threads; i++) {
827                         xp = __procinfo.vcoremap;
828                         fprintf(stderr, "%p\n", __procinfo.vcoremap);
829                         fprintf(stderr, "Vcore %d mapped to pcore %d\n", i,
830                                 __procinfo.vcoremap[i].pcoreid);
831                 }
832         }
833
834         ret = syscall(33, 1);
835         if (ret < 0) {
836                 perror("vm setup");
837                 exit(1);
838         }
839         ret = posix_memalign((void **)&p512, 4096, 3*4096);
840         fprintf(stderr, "memalign is %p\n", p512);
841         if (ret) {
842                 perror("ptp alloc");
843                 exit(1);
844         }
845         p1 = &p512[512];
846         p2m = &p512[1024];
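        /* p512, p1, and p2m serve as the PML4, PDPT, and page directory.  The
         * loop below identity-maps the first GiB with 512 2 MiB entries
         * (0x87 = present | writable | user | page-size), and the same PDPT is
         * installed for both VA 0 and the canonical high kernel base so the
         * guest kernel can run at either address. */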
847         uint64_t kernbase = 0; //0xffffffff80000000;
848         uint64_t highkernbase = 0xffffffff80000000;
849         p512[PML4(kernbase)] = (unsigned long long)p1 | 7;
850         p1[PML3(kernbase)] = /*0x87; */(unsigned long long)p2m | 7;
851         p512[PML4(highkernbase)] = (unsigned long long)p1 | 7;
852         p1[PML3(highkernbase)] = /*0x87; */(unsigned long long)p2m | 7;
853 #define _2MiB (0x200000)
854
855         for (i = 0; i < 512; i++) {
856                 p2m[PML2(kernbase + i * _2MiB)] = 0x87 | i * _2MiB;
857         }
858
859         kernbase >>= (0+12);
860         kernbase <<= (0 + 12);
861         uint8_t *kernel = (void *)GKERNBASE;
862         //write_coreboot_table(coreboot_tables, ((void *)VIRTIOBASE) /*kernel*/, KERNSIZE + 1048576);
863         hexdump(stdout, coreboot_tables, 512);
864         fprintf(stderr, "kernbase for pml4 is 0x%llx and entry is %llx\n", kernbase, entry);
865         fprintf(stderr, "p512 %p p512[0] is 0x%llx p1 %p p1[0] is 0x%llx\n", p512, p512[0], p1, p1[0]);
866         vmctl.interrupt = 0;
867         vmctl.command = REG_RSP_RIP_CR3;
868         vmctl.cr3 = (uint64_t) p512;
869         vmctl.regs.tf_rip = entry;
870         vmctl.regs.tf_rsp = (uint64_t) &stack[1024];
871         vmctl.regs.tf_rsi = (uint64_t) bp;
872         if (mcp) {
873                 /* set up virtio bits, which depend on threads being enabled. */
874                 register_virtio_mmio(&vqdev, virtio_mmio_base);
875         }
876         fprintf(stderr, "threads started\n");
877         fprintf(stderr, "Writing command :%s:\n", cmd);
878
879         if (debug)
880                 vapic_status_dump(stderr, (void *)gpci.vapic_addr);
881
882         run_vmthread(&vmctl);
883
884         if (debug)
885                 vapic_status_dump(stderr, (void *)gpci.vapic_addr);
886
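        /* Main vmexit loop: run the guest, then act on what vmctl reports.
         * EPT violations are decoded and emulated by hand (virtio-mmio at
         * virtiobase, IO-APIC at 0xfec00000, LAPIC at 0xfee00000, reads of the
         * low 4K); "unhandled" exit reasons cover vmcall console output, MSR
         * accesses, HLT/MWAIT idling, and interrupt windows.  Pending console
         * input is injected as virtioirq before resuming. */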
887         while (1) {
888
889                 int c;
890                 uint8_t byte;
891                 vmctl.command = REG_RIP;
892                 if (maxresume-- == 0) {
893                         debug = 1;
894                         resumeprompt = 1;
895                 }
896                 if (debug) {
897                         fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
898                         showstatus(stderr, &vmctl);
899                 }
900                 if (resumeprompt) {
901                         fprintf(stderr, "RESUME?\n");
902                         c = getchar();
903                         if (c == 'q')
904                                 break;
905                 }
906                 if (vmctl.shutdown == SHUTDOWN_EPT_VIOLATION) {
907                         uint64_t gpa, *regp, val;
908                         uint8_t regx;
909                         int store, size;
910                         int advance;
911                         if (decode(&vmctl, &gpa, &regx, &regp, &store, &size, &advance)) {
912                                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
913                                 showstatus(stderr, &vmctl);
914                                 quit = 1;
915                                 break;
916                         }
917                         if (debug) fprintf(stderr, "%p %p %p %p %p %p\n", gpa, regx, regp, store, size, advance);
918                         if ((gpa & ~0xfffULL) == virtiobase) {
919                                 if (debug) fprintf(stderr, "DO SOME VIRTIO\n");
920                                 // Lucky for us the various virtio ops are well-defined.
921                                 virtio_mmio(&vmctl, gpa, regx, regp, store);
922                                 if (debug) fprintf(stderr, "store is %d:\n", store);
923                                 if (debug) fprintf(stderr, "REGP IS %016lx:\n", *regp);
924                         } else if ((gpa & 0xfee00000) == 0xfee00000) {
925                                 // until we fix our include mess, just put the proto here.
926                                 //int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
927                                 //apic(&vmctl, gpa, regx, regp, store);
928                         } else if ((gpa & 0xfec00000) == 0xfec00000) {
929                                 // until we fix our include mess, just put the proto here.
930                                 int do_ioapic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
931                                 do_ioapic(&vmctl, gpa, regx, regp, store);
932                         } else if (gpa < 4096) {
933                                 uint64_t val = 0;
934                                 memmove(&val, &low4k[gpa], size);
935                                 hexdump(stdout, &low4k[gpa], size);
936                                 fprintf(stderr, "Low 1m, code %p read @ %p, size %d, val %p\n", vmctl.regs.tf_rip, gpa, size, val);
937                                 memmove(regp, &low4k[gpa], size);
938                                 hexdump(stdout, regp, size);
939                         } else {
940                                 fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
941                                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
942                                 fprintf(stderr, "Returning 0xffffffff\n");
943                                 showstatus(stderr, &vmctl);
944                                 // Just fill the whole register for now.
945                                 *regp = (uint64_t) -1;
946                         }
947                         vmctl.regs.tf_rip += advance;
948                         if (debug) fprintf(stderr, "Advance rip by %d bytes to %p\n", advance, vmctl.regs.tf_rip);
949                         vmctl.shutdown = 0;
950                         vmctl.gpa = 0;
951                         vmctl.command = REG_ALL;
952                 } else if (vmctl.shutdown == SHUTDOWN_UNHANDLED_EXIT_REASON) {
953                         switch(vmctl.ret_code){
954                         case  EXIT_REASON_VMCALL:
955                                 byte = vmctl.regs.tf_rdi;
956                                 printf("%c", byte);
957                                 if (byte == '\n') printf("%c", '%');
958                                 vmctl.regs.tf_rip += 3;
959                                 break;
960                         case EXIT_REASON_EXTERNAL_INTERRUPT:
961                                 //debug = 1;
962                                 if (debug) fprintf(stderr, "XINT 0x%x 0x%x\n", vmctl.intrinfo1, vmctl.intrinfo2);
963                                 if (debug) pir_dump();
964                                 vmctl.command = RESUME;
965                                 break;
966                         case EXIT_REASON_IO_INSTRUCTION:
967                                 fprintf(stderr, "IO @ %p\n", vmctl.regs.tf_rip);
968                                 io(&vmctl);
969                                 vmctl.shutdown = 0;
970                                 vmctl.gpa = 0;
971                                 vmctl.command = REG_ALL;
972                                 break;
973                         case EXIT_REASON_INTERRUPT_WINDOW:
974                                 if (consdata) {
975                                         if (debug) fprintf(stderr, "inject an interrupt\n");
976                                         virtio_mmio_set_vring_irq();
977                                         vmctl.interrupt = 0x80000000 | virtioirq;
978                                         vmctl.command = RESUME;
979                                         consdata = 0;
980                                 }
981                                 break;
982                         case EXIT_REASON_MSR_WRITE:
983                         case EXIT_REASON_MSR_READ:
984                                 fprintf(stderr, "Do an msr\n");
985                                 if (msrio(&vmctl, vmctl.ret_code)) {
986                                         // uh-oh, msrio failed
987                                         // well, hand back a GP fault which is what Intel does
988                                         fprintf(stderr, "MSR FAILED: RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
989                                         showstatus(stderr, &vmctl);
990
991                                         // Use event injection through vmctl to send
992                                         // a general protection fault
993                                         // vmctl.interrupt gets written to the VM-Entry
994                                         // Interruption-Information Field by vmx
995                                         vmctl.interrupt = (1 << 31) // "Valid" bit
996                                                         | (0 << 12) // Reserved by Intel
997                                                         | (1 << 11) // Deliver-error-code bit (set if event pushes error code to stack)
998                                                         | (3 << 8)  // Event type (3 is "hardware exception")
999                                                         | 13;       // Interrupt/exception vector (13 is "general protection fault")
1000                                         run_vmthread(&vmctl);
1001                                 } else {
1002                                         vmctl.regs.tf_rip += 2;
1003                                         run_vmthread(&vmctl);
1004                                 }
1005                                 break;
1006                         case EXIT_REASON_MWAIT_INSTRUCTION:
1007                                 fflush(stdout);
1008                                 if (debug)fprintf(stderr, "\n================== Guest MWAIT. =======================\n");
1009                                 if (debug)fprintf(stderr, "Wait for cons data\n");
1010                                 while (!consdata)
1011                                         ;
1012                                 //debug = 1;
1013                                 if (debug)
1014                                         vapic_status_dump(stderr, gpci.vapic_addr);
1015                                 if (debug)fprintf(stderr, "Resume with consdata ...\n");
1016                                 vmctl.regs.tf_rip += 3;
1017                                 run_vmthread(&vmctl);
1018                                 //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
1019                                 //showstatus(stderr, &vmctl);
1020                                 break;
1021                         case EXIT_REASON_HLT:
1022                                 fflush(stdout);
1023                                 if (debug)fprintf(stderr, "\n================== Guest halted. =======================\n");
1024                                 if (debug)fprintf(stderr, "Wait for cons data\n");
1025                                 while (!consdata)
1026                                         ;
1027                                 //debug = 1;
1028                                 if (debug)fprintf(stderr, "Resume with consdata ...\n");
1029                                 vmctl.regs.tf_rip += 1;
1030                                 run_vmthread(&vmctl);
1031                                 //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
1032                                 //showstatus(stderr, &vmctl);
1033                                 break;
1034                         case EXIT_REASON_APIC_ACCESS:
1035                                 if (1 || debug) fprintf(stderr, "APIC ACCESS EXIT\n");
1036
1037                                 uint64_t gpa, *regp, val;
1038                                 uint8_t regx;
1039                                 int store, size;
1040                                 int advance;
1041                                 if (decode(&vmctl, &gpa, &regx, &regp, &store, &size, &advance)) {
1042                                         fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
1043                                         showstatus(stderr, &vmctl);
1044                                         quit = 1;
1045                                         break;
1046                                 }
1047
1048                                 int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
1049                                 apic(&vmctl, gpa, regx, regp, store);
1050                                 vmctl.regs.tf_rip += advance;
1051                                 if (debug) fprintf(stderr, "Advance rip by %d bytes to %p\n", advance, vmctl.regs.tf_rip);
1052                                 vmctl.shutdown = 0;
1053                                 vmctl.gpa = 0;
1054                                 vmctl.command = REG_ALL;
1055                                 break;
1056                         case EXIT_REASON_APIC_WRITE:
1057                                 if (1 || debug) fprintf(stderr, "APIC WRITE EXIT\n");
1058                                 break;
1059                         default:
1060                                 fprintf(stderr, "Don't know how to handle exit %d\n", vmctl.ret_code);
1061                                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
1062                                 showstatus(stderr, &vmctl);
1063                                 quit = 1;
1064                                 break;
1065                         }
1066                 }
1067                 if (debug) fprintf(stderr, "at bottom of switch, quit is %d\n", quit);
1068                 if (quit)
1069                         break;
1070                 if (consdata) {
1071                         if (debug) fprintf(stderr, "inject an interrupt\n");
1072                         if (debug) fprintf(stderr, "XINT 0x%x 0x%x\n", vmctl.intrinfo1, vmctl.intrinfo2);
1073                         vmctl.interrupt = 0x80000000 | virtioirq;
1074                         virtio_mmio_set_vring_irq();
1075                         consdata = 0;
1076                         //debug = 1;
1077                         vmctl.command = RESUME;
1078                 }
1079                 if (debug) fprintf(stderr, "NOW DO A RESUME\n");
1080                 run_vmthread(&vmctl);
1081         }
1082
1083         /* later.
1084         for (int i = 0; i < nr_threads-1; i++) {
1085                 int ret;
1086                 if (pthread_join(my_threads[i], &my_retvals[i]))
1087                         perror("pth_join failed");
1088                 fprintf(stderr, "%d %d\n", i, ret);
1089         }
1090  */
1091
1092         fflush(stdout);
1093         exit(0);
1094 }