2dd3778dd3437692b0321f53d8463dab2403b4c6
[akaros.git] / tests / vmm / vmrunkernel.c
1 #include <stdio.h> 
2 #include <pthread.h>
3 #include <sys/types.h>
4 #include <sys/stat.h>
5 #include <fcntl.h>
6 #include <parlib/arch/arch.h>
7 #include <parlib/ros_debug.h>
8 #include <unistd.h>
9 #include <errno.h>
10 #include <dirent.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <ros/syscall.h>
14 #include <sys/mman.h>
15 #include <vmm/coreboot_tables.h>
16 #include <vmm/vmm.h>
17 #include <vmm/acpi/acpi.h>
18 #include <ros/arch/mmu.h>
19 #include <ros/vmm.h>
20 #include <parlib/uthread.h>
21 #include <vmm/virtio.h>
22 #include <vmm/virtio_mmio.h>
23 #include <vmm/virtio_ids.h>
24 #include <vmm/virtio_config.h>
25
/* Forward declaration only; the definition is not in this part of the file. */
int msrio(struct vmctl *vcpu, uint32_t opcode);

/* File-scope control block.  NOTE(review): presumably the vmctl that main()
 * hands to run_vmthread() - confirm against the rest of main(). */
struct vmctl vmctl;
/* Guest-pcore init data: main() fills in posted_irq_desc, vapic_addr and
 * apic_addr and passes it to SYS_vmm_setup. */
struct vmm_gpcore_init gpci;

/* Whoever holds the ball runs.  run_vm never actually grabs it - it is grabbed
 * on its behalf. */
uth_mutex_t the_ball;
/* Thread whose context carries the guest; run_vmthread() creates it the first
 * time through, while this is still zero. */
pthread_t vm_thread;
/* The 2LS thread_refl_fault op that was installed before run_vmthread()
 * replaced it; vmm_thread_refl_fault() chains to it. */
void (*old_thread_refl)(struct uthread *uth, struct user_context *ctx);
36
37 /* callback, runs in vcore context.  this sets up our initial context.  once we
38  * become runnable again, we'll run the first bits of the vm ctx.  after that,
39  * our context will be stopped and started and will just run whatever the guest
40  * VM wants.  we'll never come back to this code or to run_vm(). */
41 static void __build_vm_ctx_cb(struct uthread *uth, void *arg)
42 {
43         struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
44         struct vmctl *vmctl = (struct vmctl*)arg;
45         struct vm_trapframe *vm_tf;
46
47         __pthread_generic_yield(pthread);
48         pthread->state = PTH_BLK_YIELDING;
49
50         memset(&uth->u_ctx, 0, sizeof(struct user_context));
51         uth->u_ctx.type = ROS_VM_CTX;
52         vm_tf = &uth->u_ctx.tf.vm_tf;
53
54         vm_tf->tf_guest_pcoreid = 0;    /* assuming only 1 guest core */
55         vm_tf->tf_cr3 = vmctl->cr3;
56         vm_tf->tf_rip = vmctl->regs.tf_rip;
57         vm_tf->tf_rsp = vmctl->regs.tf_rsp;
58
59         /* other HW/GP regs are 0, which should be fine.  the FP state is still
60          * whatever we were running before, though this is pretty much unnecessary.
61          * we mostly don't want crazy crap in the uth->as, and a non-current_uthread
62          * VM ctx is supposed to have something in their FP state (like HW ctxs). */
63         save_fp_state(&uth->as);
64         uth->flags |= UTHREAD_FPSAVED | UTHREAD_SAVED;
65
66         uthread_runnable(uth);
67 }
68
69 static void *run_vm(void *arg)
70 {
71         struct vmctl *vmctl = (struct vmctl*)arg;
72
73         assert(vmctl->command == REG_RSP_RIP_CR3);
74         /* We need to hack our context, so that next time we run, we're a VM ctx */
75         uthread_yield(FALSE, __build_vm_ctx_cb, arg);
76 }
77
78 static void vmm_thread_refl_fault(struct uthread *uth,
79                                   struct user_context *ctx)
80 {
81         struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
82
83         /* Hack to call the original pth 2LS op */
84         if (!ctx->type == ROS_VM_CTX) {
85                 old_thread_refl(uth, ctx);
86                 return;
87         }
88         __pthread_generic_yield(pthread);
89         /* normally we'd handle the vmexit here.  to work within the existing
90          * framework, we just wake the controller thread.  It'll look at our ctx
91          * then make us runnable again */
92         pthread->state = PTH_BLK_MUTEX;
93         uth_mutex_unlock(the_ball);             /* wake the run_vmthread */
94 }
95
96 static void copy_vmtf_to_vmctl(struct vm_trapframe *vm_tf, struct vmctl *vmctl)
97 {
98         vmctl->cr3 = vm_tf->tf_cr3;
99         vmctl->gva = vm_tf->tf_guest_va;
100         vmctl->gpa = vm_tf->tf_guest_pa;
101         vmctl->exit_qual = vm_tf->tf_exit_qual;
102         if (vm_tf->tf_exit_reason == EXIT_REASON_EPT_VIOLATION)
103                 vmctl->shutdown = SHUTDOWN_EPT_VIOLATION;
104         else
105                 vmctl->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
106         vmctl->ret_code = vm_tf->tf_exit_reason;
107         vmctl->interrupt = vm_tf->tf_trap_inject;
108         vmctl->intrinfo1 = vm_tf->tf_intrinfo1;
109         vmctl->intrinfo2 = vm_tf->tf_intrinfo2;
110         /* Most of the HW TF.  Should be good enough for now */
111         vmctl->regs.tf_rax = vm_tf->tf_rax;
112         vmctl->regs.tf_rbx = vm_tf->tf_rbx;
113         vmctl->regs.tf_rcx = vm_tf->tf_rcx;
114         vmctl->regs.tf_rdx = vm_tf->tf_rdx;
115         vmctl->regs.tf_rbp = vm_tf->tf_rbp;
116         vmctl->regs.tf_rsi = vm_tf->tf_rsi;
117         vmctl->regs.tf_rdi = vm_tf->tf_rdi;
118         vmctl->regs.tf_r8  = vm_tf->tf_r8;
119         vmctl->regs.tf_r9  = vm_tf->tf_r9;
120         vmctl->regs.tf_r10 = vm_tf->tf_r10;
121         vmctl->regs.tf_r11 = vm_tf->tf_r11;
122         vmctl->regs.tf_r12 = vm_tf->tf_r12;
123         vmctl->regs.tf_r13 = vm_tf->tf_r13;
124         vmctl->regs.tf_r14 = vm_tf->tf_r14;
125         vmctl->regs.tf_r15 = vm_tf->tf_r15;
126         vmctl->regs.tf_rip = vm_tf->tf_rip;
127         vmctl->regs.tf_rflags = vm_tf->tf_rflags;
128         vmctl->regs.tf_rsp = vm_tf->tf_rsp;
129 }
130
131 static void copy_vmctl_to_vmtf(struct vmctl *vmctl, struct vm_trapframe *vm_tf)
132 {
133         vm_tf->tf_rax = vmctl->regs.tf_rax;
134         vm_tf->tf_rbx = vmctl->regs.tf_rbx;
135         vm_tf->tf_rcx = vmctl->regs.tf_rcx;
136         vm_tf->tf_rdx = vmctl->regs.tf_rdx;
137         vm_tf->tf_rbp = vmctl->regs.tf_rbp;
138         vm_tf->tf_rsi = vmctl->regs.tf_rsi;
139         vm_tf->tf_rdi = vmctl->regs.tf_rdi;
140         vm_tf->tf_r8  = vmctl->regs.tf_r8;
141         vm_tf->tf_r9  = vmctl->regs.tf_r9;
142         vm_tf->tf_r10 = vmctl->regs.tf_r10;
143         vm_tf->tf_r11 = vmctl->regs.tf_r11;
144         vm_tf->tf_r12 = vmctl->regs.tf_r12;
145         vm_tf->tf_r13 = vmctl->regs.tf_r13;
146         vm_tf->tf_r14 = vmctl->regs.tf_r14;
147         vm_tf->tf_r15 = vmctl->regs.tf_r15;
148         vm_tf->tf_rip = vmctl->regs.tf_rip;
149         vm_tf->tf_rflags = vmctl->regs.tf_rflags;
150         vm_tf->tf_rsp = vmctl->regs.tf_rsp;
151         vm_tf->tf_cr3 = vmctl->cr3;
152         vm_tf->tf_trap_inject = vmctl->interrupt;
153         /* Don't care about the rest of the fields.  The kernel only writes them */
154 }
155
156 /* this will start the vm thread, and return when the thread has blocked,
157  * with the right info in vmctl. */
158 static void run_vmthread(struct vmctl *vmctl)
159 {
160         struct vm_trapframe *vm_tf;
161
162         if (!vm_thread) {
163                 /* first time through, we make the vm thread.  the_ball was already
164                  * grabbed right after it was alloc'd. */
165                 if (pthread_create(&vm_thread, NULL, run_vm, vmctl)) {
166                         perror("pth_create");
167                         exit(-1);
168                 }
169                 /* hack in our own handlers for some 2LS ops */
170                 old_thread_refl = sched_ops->thread_refl_fault;
171                 sched_ops->thread_refl_fault = vmm_thread_refl_fault;
172         } else {
173                 copy_vmctl_to_vmtf(vmctl, &vm_thread->uthread.u_ctx.tf.vm_tf);
174                 uth_mutex_lock(the_ball);       /* grab it for the vm_thread */
175                 uthread_runnable((struct uthread*)vm_thread);
176         }
177         uth_mutex_lock(the_ball);
178         /* We woke due to a vm exit.  Need to unlock for the next time we're run */
179         uth_mutex_unlock(the_ball);
180         /* the vm stopped.  we can do whatever we want before rerunning it.  since
181          * we're controlling the uth, we need to handle its vmexits.  we'll fill in
182          * the vmctl, since that's the current framework. */
183         copy_vmtf_to_vmctl(&vm_thread->uthread.u_ctx.tf.vm_tf, vmctl);
184 }
185
/* It is sad what a mess the PC world has become.  By 1999 you could simply
 * scan the hardware and work out what was there; by 2005 that was no longer
 * possible.  So we have to fake ACPI tables to make it all work.
 * The tables are copied to memory at 0xe0000, so the kernel can find them.
 */
/* Assume each table is 256 bytes long, just to make it easy.  Just have
 * pointers that point to aligned things. */
192
193 struct acpi_table_rsdp rsdp = {
194         .signature = "RSD PTR ",
195         .oem_id = "AKAROS",
196         .revision = 2,
197         .length = 36,
198 };
199
200 struct acpi_table_xsdt xsdt = {
201         .header = {
202                 .signature= "XSDT",
203                 // This is so stupid. Incredibly stupid.
204                 .revision = 0,
205                 .oem_id = "AKAROS",
206                 .oem_table_id = "ALPHABET",
207                 .oem_revision = 0,
208                 .asl_compiler_id = "RON ",
209                 .asl_compiler_revision = 0,
210         },
211 };
212 struct acpi_table_fadt fadt = {
213         .header = {
214                 .signature= "FADT",
215                 // This is so stupid. Incredibly stupid.
216                 .revision = 0,
217                 .oem_id = "AKAROS",
218                 .oem_table_id = "ALPHABET",
219                 .oem_revision = 0,
220                 .asl_compiler_id = "RON ",
221                 .asl_compiler_revision = 0,
222         },
223 };
224
225 /* This has to be dropped into memory, then the other crap just follows it.
226  */
227 struct acpi_table_madt madt = {
228         .header = {
229                 .signature = "APIC",
230                 .revision = 0,
231                 .oem_id = "AKAROS",
232                 .oem_table_id = "ALPHABET",
233                 .oem_revision = 0,
234                 .asl_compiler_id = "RON ",
235                 .asl_compiler_revision = 0,
236         },
237         
238         .address = 0xfee00000ULL,
239 };
240
241 struct acpi_madt_local_apic Apic0 = {.header = {.type = ACPI_MADT_TYPE_LOCAL_APIC, .length = sizeof(struct acpi_madt_local_apic)},
242                                      .processor_id = 0, .id = 0};
243 struct acpi_madt_io_apic Apic1 = {.header = {.type = ACPI_MADT_TYPE_IO_APIC, .length = sizeof(struct acpi_madt_io_apic)},
244                                   .id = 1, .address = 0xfec00000, .global_irq_base = 0};
245 struct acpi_madt_interrupt_override isor[] = {
246         /* I have no idea if it should be source irq 2, global 0, or global 2, source 0. Shit. */
247         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
248          .bus = 0, .source_irq = 2, .global_irq = 0, .inti_flags = 0},
249         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
250          .bus = 0, .source_irq = 1, .global_irq = 1, .inti_flags = 0},
251         //{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
252          //.bus = 0, .source_irq = 2, .global_irq = 2, .inti_flags = 0},
253         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
254          .bus = 0, .source_irq = 3, .global_irq = 3, .inti_flags = 0},
255         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
256          .bus = 0, .source_irq = 4, .global_irq = 4, .inti_flags = 0},
257         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
258          .bus = 0, .source_irq = 5, .global_irq = 5, .inti_flags = 0},
259         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
260          .bus = 0, .source_irq = 6, .global_irq = 6, .inti_flags = 0},
261         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
262          .bus = 0, .source_irq = 7, .global_irq = 7, .inti_flags = 0},
263         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
264          .bus = 0, .source_irq = 8, .global_irq = 8, .inti_flags = 0},
265         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
266          .bus = 0, .source_irq = 9, .global_irq = 9, .inti_flags = 0},
267         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
268          .bus = 0, .source_irq = 10, .global_irq = 10, .inti_flags = 0},
269         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
270          .bus = 0, .source_irq = 11, .global_irq = 11, .inti_flags = 0},
271         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
272          .bus = 0, .source_irq = 12, .global_irq = 12, .inti_flags = 0},
273         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
274          .bus = 0, .source_irq = 13, .global_irq = 13, .inti_flags = 0},
275         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
276          .bus = 0, .source_irq = 14, .global_irq = 14, .inti_flags = 0},
277         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
278          .bus = 0, .source_irq = 15, .global_irq = 15, .inti_flags = 0},
279         // VMMCP routes irq 32 to gsi 17
280         {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
281          .bus = 0, .source_irq = 32, .global_irq = 17, .inti_flags = 5},
282 };
283
284
/* this test will run the "kernel" in the negative address space. We hope. */
/* Base of the low-1MiB mapping; main() mmaps it at address 4096. */
void *low1m;
/* main() fills this with 0xff. */
uint8_t low4k[4096];
unsigned long long stack[1024];
volatile int shared = 0;
/* Set to 1 by consin() when a 'q' is read from the console; ends its loop. */
volatile int quit = 0;
/* main() sets this to 1 again before starting the threads. */
int mcp = 1;
/* Overridden by the -i command line switch. */
int virtioirq = 17;

/* total hack. If the vm runs away we want to get control again. */
/* Overridden by the -m command line switch. */
unsigned int maxresume = (unsigned int) -1;

#define MiB 0x100000u
#define GiB (1u<<30)
/* main() refuses to run if _kernel is located above GKERNBASE. */
#define GKERNBASE (16*MiB)
#define KERNSIZE (128*MiB+GKERNBASE)
/* Zeroed by main(). */
uint8_t _kernel[KERNSIZE];

unsigned long long *p512, *p1, *p2m;

/* malloc'd in main(): one void* slot per thread (nr_threads of them). */
void **my_retvals;
int nr_threads = 4;
/* Debug verbosity; each -d switch increments it. */
int debug = 0;
int resumeprompt = 0;
/* unlike Linux, this shared struct is for both host and guest. */
//      struct virtqueue *constoguest = 
//              vring_new_virtqueue(0, 512, 8192, 0, inpages, NULL, NULL, "test");
uint64_t virtio_mmio_base = 0x100000000ULL;

/* Defined outside the visible part of the file. */
void vapic_status_dump(FILE *f, void *vapic);
/* Forward declaration; needed by timer_thread() and consin(). */
static void set_posted_interrupt(int vector);
316
/* Helpers for the locked bit operation below.  BITOP_ADDR() builds the "+m"
 * (read-write memory) asm operand for the word that x points at.  The version
 * gate rejects compilers older than gcc 4.1; the error text now states the
 * version the condition actually tests for. */
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
#error "Get gcc 4.1 or newer"
#else
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#endif

/* x86 prefix that makes the following instruction atomic. */
#define LOCK_PREFIX "lock "
/* Operand for the parameter named addr of the enclosing function. */
#define ADDR                            BITOP_ADDR(addr)
static inline int test_and_set_bit(int nr, volatile unsigned long *addr);
326
327 void *timer_thread(void *arg)
328 {
329         while (1) {
330                 set_posted_interrupt(0xef);
331                 ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
332                 uthread_usleep(1);
333         }
334 }
335
336 void *consout(void *arg)
337 {
338         char *line, *consline, *outline;
339         static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
340         static struct scatterlist in[] = { {NULL, sizeof(line)}, };
341         static struct scatterlist iov[32];
342         struct virtio_threadarg *a = arg;
343         static unsigned int inlen, outlen, conslen;
344         struct virtqueue *v = a->arg->virtio;
345         fprintf(stderr, "talk thread ..\n");
346         uint16_t head, gaveit = 0, gotitback = 0;
347         uint32_t vv;
348         int i;
349         int num;
350         if (debug) {
351                 fprintf(stderr, "----------------------- TT a %p\n", a);
352                 fprintf(stderr, "talk thread ttargs %x v %x\n", a, v);
353         }
354         
355         for(num = 0;;num++) {
356                 //int debug = 1;
357                 /* host: use any buffers we should have been sent. */
358                 head = wait_for_vq_desc(v, iov, &outlen, &inlen);
359                 if (debug)
360                         fprintf(stderr, "CCC: vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
361                 for(i = 0; debug && i < outlen + inlen; i++)
362                         fprintf(stderr, "CCC: v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
363                 /* host: if we got an output buffer, just output it. */
364                 for(i = 0; i < outlen; i++) {
365                         num++;
366                         int j;
367                         if (debug) {
368                                 fprintf(stderr, "CCC: IOV length is %d\n", iov[i].length);
369                         }
370                         for (j = 0; j < iov[i].length; j++)
371                                 printf("%c", ((char *)iov[i].v)[j]);
372                 }
373                 fflush(stdout);
374                 if (debug)
375                         fprintf(stderr, "CCC: outlen is %d; inlen is %d\n", outlen, inlen);
376                 /* host: fill in the writeable buffers. */
377                 /* why we're getting these I don't know. */
378                 for (i = outlen; i < outlen + inlen; i++) {
379                         if (debug) fprintf(stderr, "CCC: send back empty writeable");
380                         iov[i].length = 0;
381                 }
382                 if (debug) fprintf(stderr, "CCC: call add_used\n");
383                 /* host: now ack that we used them all. */
384                 add_used(v, head, outlen+inlen);
385                 if (debug) fprintf(stderr, "CCC: DONE call add_used\n");
386         }
387         fprintf(stderr, "All done\n");
388         return NULL;
389 }
390
// FIXME. 
// Nothing in the visible code sets this: the only assignment, in consin(), is
// commented out with the note that the IRQ injection isn't right.
volatile int consdata = 0;
393
394 void *consin(void *arg)
395 {
396         struct virtio_threadarg *a = arg;
397         char *line, *outline;
398         static char consline[128];
399         static struct scatterlist iov[32];
400         static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
401         static struct scatterlist in[] = { {NULL, sizeof(line)}, };
402
403         static unsigned int inlen, outlen, conslen;
404         struct virtqueue *v = a->arg->virtio;
405         fprintf(stderr, "consin thread ..\n");
406         uint16_t head, gaveit = 0, gotitback = 0;
407         uint32_t vv;
408         int i;
409         int num;
410         //char c[1];
411         int timer_started = 0;
412         pthread_t timerthread_struct;
413
414         if (debug) fprintf(stderr, "Spin on console being read, print num queues, halt\n");
415
416         for(num = 0;! quit;num++) {
417                 //int debug = 1;
418                 /* host: use any buffers we should have been sent. */
419                 head = wait_for_vq_desc(v, iov, &outlen, &inlen);
420                 if (debug)
421                         fprintf(stderr, "vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
422                 for(i = 0; debug && i < outlen + inlen; i++)
423                         fprintf(stderr, "v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
424                 if (debug)
425                         fprintf(stderr, "outlen is %d; inlen is %d\n", outlen, inlen);
426                 /* host: fill in the writeable buffers. */
427                 for (i = outlen; i < outlen + inlen; i++) {
428                         /* host: read a line. */
429                         memset(consline, 0, 128);
430                         if (read(0, consline, 1) < 0) {
431                                 exit(0);
432                         } 
433                         if (debug) fprintf(stderr, "CONSIN: GOT A LINE:%s:\n", consline);
434                         if (debug) fprintf(stderr, "CONSIN: OUTLEN:%d:\n", outlen);
435                         if (strlen(consline) < 3 && consline[0] == 'q' ) {
436                                 quit = 1;
437                                 break;
438                         }
439
440                         memmove(iov[i].v, consline, strlen(consline)+ 1);
441                         iov[i].length = strlen(consline) + 1;
442                 }
443                 if (debug) fprintf(stderr, "call add_used\n");
444                 /* host: now ack that we used them all. */
445                 add_used(v, head, outlen+inlen);
446                 /* turn off consdata - the IRQ injection isn't right */
447                 //consdata = 1;
448                 if (debug) fprintf(stderr, "DONE call add_used\n");
449
450                 // Send spurious for testing (Gan)
451                 set_posted_interrupt(0xE5);
452                 virtio_mmio_set_vring_irq();
453
454                 ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
455                 /*if (!timer_started && mcp) {
456                         // Start up timer thread
457                         if (pthread_create(&timerthread_struct, NULL, timer_thread, NULL)) {
458                                 fprintf(stderr, "pth_create failed for timer thread.");
459                                 perror("pth_create");
460                         } else {
461                                 timer_started = 1;
462                         }
463                 }*/
464         }
465         fprintf(stderr, "All done\n");
466         return NULL;
467 }
468
469 static struct vqdev vqdev= {
470 name: "console",
471 dev: VIRTIO_ID_CONSOLE,
472 device_features: 0, /* Can't do it: linux console device does not support it. VIRTIO_F_VERSION_1*/
473 numvqs: 2,
474 vqs: {
475                 {name: "consin", maxqnum: 64, f: consin, arg: (void *)0},
476                 {name: "consout", maxqnum: 64, f: consout, arg: (void *)0},
477         }
478 };
479
/* Carrier for a block of assembler directives; the function body itself emits
 * no instructions of its own.  The directives switch to a writable,
 * allocatable section named .lowmem, define the label "low", move the location
 * counter to 0x1000, align to 0x100000 and switch back to the previous
 * section.
 * NOTE(review): presumably this reserves the low-memory region through the
 * link step - confirm against the linker setup. */
void lowmem() {
        __asm__ __volatile__ (
                ".section .lowmem, \"aw\"\n\t"
                "low: \n\t"
                ".=0x1000\n\t"
                ".align 0x100000\n\t"
                ".previous\n");
}
483
/* Sum a table byte-wise, modulo 256, the way ACPI checksums are verified.
 *
 * buffer: first byte of the table (not modified).
 * length: number of bytes to sum; 0 yields 0.
 *
 * Returns the 8-bit sum.  A table with a valid checksum sums to 0.
 *
 * Traces to stderr: the buffer address and length, the running sum just
 * before the final byte is added, and the final sum. */
static uint8_t acpi_tb_checksum(const uint8_t *buffer, uint32_t length)
{
        uint8_t sum = 0;
        const uint8_t *end = buffer + length;

        /* %p wants a void pointer and the length is unsigned. */
        fprintf(stderr, "tbchecksum %p for %u", (const void *)buffer,
                (unsigned int)length);
        while (buffer < end) {
                if (end - buffer < 2)
                        fprintf(stderr, "%02x\n", sum);
                sum = (uint8_t)(sum + *(buffer++));
        }
        fprintf(stderr, " is %02x\n", sum);
        return sum;
}
497
/* Compute an ACPI checksum byte and store it.
 *
 * target: where the checksum byte goes; it may lie inside data (and in this
 *         file it does).
 * data:   start of the region to checksum.
 * len:    number of bytes in the region.
 *
 * On return *target holds the value that makes the byte-wise sum of the region
 * equal 0 modulo 256, provided target is inside the region.  Traces to stderr. */
static void gencsum(uint8_t *target, void *data, int len)
{
        uint8_t csum;

        fprintf(stderr, "gencsum %p target %p source %d bytes\n",
                (void *)target, data, len);
        /* Zero the target first so that its old value is not counted. */
        *target = 0;
        csum = acpi_tb_checksum((uint8_t *)data, len);
        /* Two's complement of the sum: sum + checksum == 0 (mod 256). */
        *target = (uint8_t)(~csum + 1);
        fprintf(stderr, "Computed is %02x\n", *target);
}
509
/* Atomically set bit nr in the bit array starting at addr (x86 only).
 *
 * Returns 0 if the bit was previously clear and a non-zero value (-1) if it
 * was already set. */
static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
{
        int oldbit;

        /* "lock bts" sets the bit and leaves its old value in the carry flag;
         * "sbb %0,%0" then turns the carry flag into 0 or -1 in oldbit. */
        asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
                     "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");

        return oldbit;
}
519
520 static void pir_dump()
521 {
522         unsigned long *pir_ptr = gpci.posted_irq_desc;
523         int i;
524         fprintf(stderr, "-------Begin PIR dump-------\n");
525         for (i = 0; i < 8; i++){
526                 fprintf(stderr, "Byte %d: 0x%016x\n", i, pir_ptr[i]);
527         }
528         fprintf(stderr, "-------End PIR dump-------\n");
529 }
530
531 static void set_posted_interrupt(int vector)
532 {
533         test_and_set_bit(vector, gpci.posted_irq_desc);
534         /* LOCKed instruction provides the mb() */
535         test_and_set_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpci.posted_irq_desc);
536 }
537
538 int main(int argc, char **argv)
539 {
540         uint64_t *p64;
541         void *a = (void *)0xe0000;
542         struct acpi_table_rsdp *r;
543         struct acpi_table_fadt *f;
544         struct acpi_table_madt *m;
545         struct acpi_table_xsdt *x;
546         uint64_t virtiobase = 0x100000000ULL;
547         // lowmem is a bump allocated pointer to 2M at the "physbase" of memory 
548         void *lowmem = (void *) 0x1000000;
549         //struct vmctl vmctl;
550         int amt;
551         int vmmflags = 0; // Disabled probably forever. VMM_VMCALL_PRINTF;
552         uint64_t entry = 0x1200000, kerneladdress = 0x1200000;
553         int nr_gpcs = 1;
554         int ret;
555         void * xp;
556         int kfd = -1;
557         static char cmd[512];
558         int i;
559         uint8_t csum;
560         void *coreboot_tables = (void *) 0x1165000;
561         void *a_page;
562
563         the_ball = uth_mutex_alloc();
564         uth_mutex_lock(the_ball);
565
566         fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT,
567                         PML1_PTE_REACH);
568
569         // mmap is not working for us at present.
570         if ((uint64_t)_kernel > GKERNBASE) {
571                 fprintf(stderr, "kernel array @%p is above , GKERNBASE@%p sucks\n", _kernel, GKERNBASE);
572                 exit(1);
573         }
574         memset(_kernel, 0, sizeof(_kernel));
575         memset(lowmem, 0xff, 2*1048576);
576         memset(low4k, 0xff, 4096);
577         // avoid at all costs, requires too much instruction emulation.
578         //low4k[0x40e] = 0;
579         //low4k[0x40f] = 0xe0;
580
581         //Place mmap(Gan)
582         a_page = mmap((void *)0xfee00000, PGSIZE, PROT_READ | PROT_WRITE,
583                               MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
584         fprintf(stderr, "a_page mmap pointer %p", a_page);
585
586         if (a_page == (void *) -1) {
587                 perror("Could not mmap APIC");
588                 exit(1);
589         }
590         if (((uint64_t)a_page & 0xfff) != 0) {
591                 perror("APIC page mapping is not page aligned");
592                 exit(1);
593         }
594
595         memset(a_page, 0, 4096);
596         //((uint32_t *)a_page)[0x30/4] = 0x01060015;
597         ((uint32_t *)a_page)[0x30/4] = 0xDEADBEEF;
598
599
600         argc--,argv++;
601         // switches ...
602         // Sorry, I don't much like the gnu opt parsing code.
603         while (1) {
604                 if (*argv[0] != '-')
605                         break;
606                 switch(argv[0][1]) {
607                 case 'd':
608                         debug++;
609                         break;
610                 case 'v':
611                         vmmflags |= VMM_VMCALL_PRINTF;
612                         break;
613                 case 'm':
614                         argc--,argv++;
615                         maxresume = strtoull(argv[0], 0, 0);
616                         break;
617                 case 'i':
618                         argc--,argv++;
619                         virtioirq = strtoull(argv[0], 0, 0);
620                         break;
621                 default:
622                         fprintf(stderr, "BMAFR\n");
623                         break;
624                 }
625                 argc--,argv++;
626         }
627         if (argc < 1) {
628                 fprintf(stderr, "Usage: %s vmimage [-n (no vmcall printf)] [coreboot_tables [loadaddress [entrypoint]]]\n", argv[0]);
629                 exit(1);
630         }
631         if (argc > 1)
632                 coreboot_tables = (void *) strtoull(argv[1], 0, 0);
633         if (argc > 2)
634                 kerneladdress = strtoull(argv[2], 0, 0);
635         if (argc > 3)
636                 entry = strtoull(argv[3], 0, 0);
637         kfd = open(argv[0], O_RDONLY);
638         if (kfd < 0) {
639                 perror(argv[0]);
640                 exit(1);
641         }
642         // read in the kernel.
643         xp = (void *)kerneladdress;
644         for(;;) {
645                 amt = read(kfd, xp, 1048576);
646                 if (amt < 0) {
647                         perror("read");
648                         exit(1);
649                 }
650                 if (amt == 0) {
651                         break;
652                 }
653                 xp += amt;
654         }
655         fprintf(stderr, "Read in %d bytes\n", xp-kerneladdress);
656         close(kfd);
657
658         // The low 1m so we can fill in bullshit like ACPI. */
659         // And, sorry, due to the STUPID format of the RSDP for now we need the low 1M.
660         low1m = mmap((int*)4096, MiB-4096, PROT_READ | PROT_WRITE,
661                          MAP_ANONYMOUS, -1, 0);
662         if (low1m != (void *)4096) {
663                 perror("Unable to mmap low 1m");
664                 exit(1);
665         }
666         memset(low1m, 0xff, MiB-4096);
667         r = a;
668         fprintf(stderr, "install rsdp to %p\n", r);
669         *r = rsdp;
670         a += sizeof(*r);
671         memmove(&r->xsdt_physical_address, &a, sizeof(a));
672         gencsum(&r->checksum, r, ACPI_RSDP_CHECKSUM_LENGTH);
673         if ((csum = acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_CHECKSUM_LENGTH)) != 0) {
674                 fprintf(stderr, "RSDP has bad checksum; summed to %x\n", csum);
675                 exit(1);
676         }
677
678         /* Check extended checksum if table version >= 2 */
679         gencsum(&r->extended_checksum, r, ACPI_RSDP_XCHECKSUM_LENGTH);
680         if ((rsdp.revision >= 2) &&
681             (acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_XCHECKSUM_LENGTH) != 0)) {
682                 fprintf(stderr, "RSDP has bad checksum v2\n");
683                 exit(1);
684         }
685
686         /* just leave a bunch of space for the xsdt. */
687         /* we need to zero the area since it has pointers. */
688         x = a;
689         a += sizeof(*x) + 8*sizeof(void *);
690         memset(x, 0, a - (void *)x);
691         fprintf(stderr, "install xsdt to %p\n", x);
692         *x = xsdt;
693         x->table_offset_entry[0] = 0;
694         x->table_offset_entry[1] = 0;
695         x->header.length = a - (void *)x;
696
697         f = a;
698         fprintf(stderr, "install fadt to %p\n", f);
699         *f = fadt;
700         x->table_offset_entry[2] = (uint64_t) f;
701         a += sizeof(*f);
702         f->header.length = a - (void *)f;
703         gencsum(&f->header.checksum, f, f->header.length);
704         if (acpi_tb_checksum((uint8_t *)f, f->header.length) != 0) {
705                 fprintf(stderr, "ffadt has bad checksum v2\n");
706                 exit(1);
707         }
708
709         m = a;
710         *m = madt;
711         x->table_offset_entry[3] = (uint64_t) m;
712         a += sizeof(*m);
713         fprintf(stderr, "install madt to %p\n", m);
714         memmove(a, &Apic0, sizeof(Apic0));
715         a += sizeof(Apic0);
716         memmove(a, &Apic1, sizeof(Apic1));
717         a += sizeof(Apic1);
718         memmove(a, &isor, sizeof(isor));
719         a += sizeof(isor);
720         m->header.length = a - (void *)m;
721         gencsum(&m->header.checksum, m, m->header.length);
722         if (acpi_tb_checksum((uint8_t *) m, m->header.length) != 0) {
723                 fprintf(stderr, "madt has bad checksum v2\n");
724                 exit(1);
725         }
726         fprintf(stderr, "allchecksums ok\n");
727
728         gencsum(&x->header.checksum, x, x->header.length);
729         if ((csum = acpi_tb_checksum((uint8_t *) x, x->header.length)) != 0) {
730                 fprintf(stderr, "XSDT has bad checksum; summed to %x\n", csum);
731                 exit(1);
732         }
733
734         hexdump(stdout, r, a-(void *)r);
735
736         a = (void *)(((unsigned long)a + 0xfff) & ~0xfff);
737         gpci.posted_irq_desc = a;
738         memset(a, 0, 4096);
739         a += 4096;
740         gpci.vapic_addr = a;
741         //vmctl.vapic = (uint64_t) a_page;      
742         memset(a, 0, 4096);
743         ((uint32_t *)a)[0x30/4] = 0x01060014;
744         p64 = a;
745         // set up apic values? do we need to?
746         // qemu does this.
747         //((uint8_t *)a)[4] = 1;
748         a += 4096;
749         gpci.apic_addr = (void*)0xfee00000;
750
751         if (ros_syscall(SYS_vmm_setup, nr_gpcs, &gpci, vmmflags, 0, 0, 0) !=
752             nr_gpcs) {
753                 perror("Guest pcore setup failed");
754                 exit(1);
755         }
756
757         fprintf(stderr, "Run with %d cores and vmmflags 0x%x\n", nr_gpcs, vmmflags);
758         mcp = 1;
759         if (mcp) {
760                 my_retvals = malloc(sizeof(void*) * nr_threads);
761                 if (!my_retvals)
762                         perror("Init threads/malloc");
763
764                 pthread_can_vcore_request(FALSE);       /* 2LS won't manage vcores */
765                 pthread_need_tls(FALSE);
766                 pthread_mcp_init();                                     /* gives us one vcore */
767                 vcore_request(nr_threads - 1);          /* ghetto incremental interface */
768                 for (int i = 0; i < nr_threads; i++) {
769                         xp = __procinfo.vcoremap;
770                         fprintf(stderr, "%p\n", __procinfo.vcoremap);
771                         fprintf(stderr, "Vcore %d mapped to pcore %d\n", i,
772                                 __procinfo.vcoremap[i].pcoreid);
773                 }
774         }
775
776         ret = syscall(33, 1);
777         if (ret < 0) {
778                 perror("vm setup");
779                 exit(1);
780         }
781         ret = posix_memalign((void **)&p512, 4096, 3*4096);
782         fprintf(stderr, "memalign is %p\n", p512);
783         if (ret) {
784                 perror("ptp alloc");
785                 exit(1);
786         }
787         p1 = &p512[512];
788         p2m = &p512[1024];
789         uint64_t kernbase = 0; //0xffffffff80000000;
790         uint64_t highkernbase = 0xffffffff80000000;
791         p512[PML4(kernbase)] = (unsigned long long)p1 | 7;
792         p1[PML3(kernbase)] = /*0x87; */(unsigned long long)p2m | 7;
793         p512[PML4(highkernbase)] = (unsigned long long)p1 | 7;
794         p1[PML3(highkernbase)] = /*0x87; */(unsigned long long)p2m | 7;
795 #define _2MiB (0x200000)
796
797         for (i = 0; i < 512; i++) {
798                 p2m[PML2(kernbase + i * _2MiB)] = 0x87 | i * _2MiB;
799         }
800
801         kernbase >>= (0+12);
802         kernbase <<= (0 + 12);
803         uint8_t *kernel = (void *)GKERNBASE;
804         //write_coreboot_table(coreboot_tables, ((void *)VIRTIOBASE) /*kernel*/, KERNSIZE + 1048576);
805         hexdump(stdout, coreboot_tables, 512);
806         fprintf(stderr, "kernbase for pml4 is 0x%llx and entry is %llx\n", kernbase, entry);
807         fprintf(stderr, "p512 %p p512[0] is 0x%lx p1 %p p1[0] is 0x%x\n", p512, p512[0], p1, p1[0]);
808         vmctl.interrupt = 0;
809         vmctl.command = REG_RSP_RIP_CR3;
810         vmctl.cr3 = (uint64_t) p512;
811         vmctl.regs.tf_rip = entry;
812         vmctl.regs.tf_rsp = (uint64_t) &stack[1024];
813         if (mcp) {
814                 /* set up virtio bits, which depend on threads being enabled. */
815                 register_virtio_mmio(&vqdev, virtio_mmio_base);
816         }
817         fprintf(stderr, "threads started\n");
818         fprintf(stderr, "Writing command :%s:\n", cmd);
819         
820         if (debug)
821                 vapic_status_dump(stderr, (void *)gpci.vapic_addr);
822
823         run_vmthread(&vmctl);
824
825         if (debug)
826                 vapic_status_dump(stderr, (void *)gpci.vapic_addr);
827
828         while (1) {
829                 void showstatus(FILE *f, struct vmctl *v);
830                 int c;
831                 uint8_t byte;
832                 vmctl.command = REG_RIP;
833                 if (maxresume-- == 0) {
834                         debug = 1;
835                         resumeprompt = 1;
836                 }
837                 if (debug) {
838                         fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
839                         showstatus(stderr, &vmctl);
840                 }
841                 if (resumeprompt) {
842                         fprintf(stderr, "RESUME?\n");
843                         c = getchar();
844                         if (c == 'q')
845                                 break;
846                 }
847                 if (vmctl.shutdown == SHUTDOWN_EPT_VIOLATION) {
848                         uint64_t gpa, *regp, val;
849                         uint8_t regx;
850                         int store, size;
851                         int advance;
852                         if (decode(&vmctl, &gpa, &regx, &regp, &store, &size, &advance)) {
853                                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
854                                 showstatus(stderr, &vmctl);
855                                 quit = 1;
856                                 break;
857                         }
858                         if (debug) fprintf(stderr, "%p %p %p %p %p %p\n", gpa, regx, regp, store, size, advance);
859                         if ((gpa & ~0xfffULL) == virtiobase) {
860                                 if (debug) fprintf(stderr, "DO SOME VIRTIO\n");
861                                 // Lucky for us the various virtio ops are well-defined.
862                                 virtio_mmio(&vmctl, gpa, regx, regp, store);
863                                 if (debug) fprintf(stderr, "store is %d:\n", store);
864                                 if (debug) fprintf(stderr, "REGP IS %16x:\n", *regp);
865                         } else if ((gpa & 0xfee00000) == 0xfee00000) {
866                                 // until we fix our include mess, just put the proto here.
867                                 //int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
868                                 //apic(&vmctl, gpa, regx, regp, store);
869                         } else if ((gpa & 0xfec00000) == 0xfec00000) {
870                                 // until we fix our include mess, just put the proto here.
871                                 int do_ioapic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
872                                 do_ioapic(&vmctl, gpa, regx, regp, store);
873                         } else if (gpa < 4096) {
874                                 uint64_t val = 0;
875                                 memmove(&val, &low4k[gpa], size);
876                                 hexdump(stdout, &low4k[gpa], size);
877                                 fprintf(stderr, "Low 1m, code %p read @ %p, size %d, val %p\n", vmctl.regs.tf_rip, gpa, size, val);
878                                 memmove(regp, &low4k[gpa], size);
879                                 hexdump(stdout, regp, size);
880                         } else {
881                                 fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
882                                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
883                                 fprintf(stderr, "Returning 0xffffffff\n");
884                                 showstatus(stderr, &vmctl);
885                                 // Just fill the whole register for now.
886                                 *regp = (uint64_t) -1;
887                         }
888                         vmctl.regs.tf_rip += advance;
889                         if (debug) fprintf(stderr, "Advance rip by %d bytes to %p\n", advance, vmctl.regs.tf_rip);
890                         vmctl.shutdown = 0;
891                         vmctl.gpa = 0;
892                         vmctl.command = REG_ALL;
893                 } else if (vmctl.shutdown == SHUTDOWN_UNHANDLED_EXIT_REASON) {
894                         switch(vmctl.ret_code){
895                         case  EXIT_REASON_VMCALL:
896                                 byte = vmctl.regs.tf_rdi;
897                                 printf("%c", byte);
898                                 if (byte == '\n') printf("%c", '%');
899                                 vmctl.regs.tf_rip += 3;
900                                 break;
901                         case EXIT_REASON_EXTERNAL_INTERRUPT:
902                                 //debug = 1;
903                                 if (debug) fprintf(stderr, "XINT 0x%x 0x%x\n", vmctl.intrinfo1, vmctl.intrinfo2);
904                                 if (debug) pir_dump();
905                                 vmctl.command = RESUME;
906                                 break;
907                         case EXIT_REASON_IO_INSTRUCTION:
908                                 fprintf(stderr, "IO @ %p\n", vmctl.regs.tf_rip);
909                                 io(&vmctl);
910                                 vmctl.shutdown = 0;
911                                 vmctl.gpa = 0;
912                                 vmctl.command = REG_ALL;
913                                 break;
914                         case EXIT_REASON_INTERRUPT_WINDOW:
915                                 if (consdata) {
916                                         if (debug) fprintf(stderr, "inject an interrupt\n");
917                                         virtio_mmio_set_vring_irq();
918                                         vmctl.interrupt = 0x80000000 | virtioirq;
919                                         vmctl.command = RESUME;
920                                         consdata = 0;
921                                 }
922                                 break;
923                         case EXIT_REASON_MSR_WRITE:
924                         case EXIT_REASON_MSR_READ:
925                                 fprintf(stderr, "Do an msr\n");
926                                 quit = msrio(&vmctl, vmctl.ret_code);
927                                 if (quit) {
928                                         fprintf(stderr, "MSR FAILED: RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
929                                         showstatus(stderr, &vmctl);
930                                 }
931                                 break;
932                         case EXIT_REASON_MWAIT_INSTRUCTION:
933                           fflush(stdout);
934                                 if (debug)fprintf(stderr, "\n================== Guest MWAIT. =======================\n");
935                                 if (debug)fprintf(stderr, "Wait for cons data\n");
936                                 while (!consdata)
937                                         ;
938                                 //debug = 1;
939                                 if (debug)
940                                         vapic_status_dump(stderr, gpci.vapic_addr);
941                                 if (debug)fprintf(stderr, "Resume with consdata ...\n");
942                                 vmctl.regs.tf_rip += 3;
943                                 run_vmthread(&vmctl);
944                                 //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
945                                 //showstatus(stderr, &vmctl);
946                                 break;
947                         case EXIT_REASON_HLT:
948                                 fflush(stdout);
949                                 if (debug)fprintf(stderr, "\n================== Guest halted. =======================\n");
950                                 if (debug)fprintf(stderr, "Wait for cons data\n");
951                                 while (!consdata)
952                                         ;
953                                 //debug = 1;
954                                 if (debug)fprintf(stderr, "Resume with consdata ...\n");
955                                 vmctl.regs.tf_rip += 1;
956                                 run_vmthread(&vmctl);
957                                 //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
958                                 //showstatus(stderr, &vmctl);
959                                 break;
960                         case EXIT_REASON_APIC_ACCESS:                           
961                                 if (1 || debug)fprintf(stderr, "APIC READ EXIT\n");
962                                 
963                                 uint64_t gpa, *regp, val;
964                                 uint8_t regx;
965                                 int store, size;
966                                 int advance;
967                                 if (decode(&vmctl, &gpa, &regx, &regp, &store, &size, &advance)) {
968                                         fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
969                                         showstatus(stderr, &vmctl);
970                                         quit = 1;
971                                         break;
972                                 }
973
974                                 int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
975                                 apic(&vmctl, gpa, regx, regp, store);
976                                 vmctl.regs.tf_rip += advance;
977                                 if (debug) fprintf(stderr, "Advance rip by %d bytes to %p\n", advance, vmctl.regs.tf_rip);
978                                 vmctl.shutdown = 0;
979                                 vmctl.gpa = 0;
980                                 vmctl.command = REG_ALL;
981                                 break;
982                         case EXIT_REASON_APIC_WRITE:
983                                 if (1 || debug)fprintf(stderr, "APIC WRITE EXIT\n");
984                                 break;
985                         default:
986                                 fprintf(stderr, "Don't know how to handle exit %d\n", vmctl.ret_code);
987                                 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
988                                 showstatus(stderr, &vmctl);
989                                 quit = 1;
990                                 break;
991                         }
992                 }
993                 if (debug) fprintf(stderr, "at bottom of switch, quit is %d\n", quit);
994                 if (quit)
995                         break;
996                 if (consdata) {
997                         if (debug) fprintf(stderr, "inject an interrupt\n");
998                         if (debug) fprintf(stderr, "XINT 0x%x 0x%x\n", vmctl.intrinfo1, vmctl.intrinfo2);
999                         vmctl.interrupt = 0x80000000 | virtioirq;
1000                         virtio_mmio_set_vring_irq();
1001                         consdata = 0;
1002                         //debug = 1;
1003                         vmctl.command = RESUME;
1004                 }
1005                 if (debug) fprintf(stderr, "NOW DO A RESUME\n");
1006                 run_vmthread(&vmctl);
1007         }
1008
1009         /* later. 
1010         for (int i = 0; i < nr_threads-1; i++) {
1011                 int ret;
1012                 if (pthread_join(my_threads[i], &my_retvals[i]))
1013                         perror("pth_join failed");
1014                 fprintf(stderr, "%d %d\n", i, ret);
1015         }
1016  */
1017
1018         fflush(stdout);
1019         exit(0);
1020 }