6 #include <parlib/arch/arch.h>
7 #include <parlib/ros_debug.h>
13 #include <ros/syscall.h>
15 #include <vmm/coreboot_tables.h>
17 #include <vmm/acpi/acpi.h>
18 #include <ros/arch/mmu.h>
20 #include <parlib/uthread.h>
21 #include <vmm/linux_bootparam.h>
22 #include <vmm/virtio.h>
23 #include <vmm/virtio_mmio.h>
24 #include <vmm/virtio_ids.h>
25 #include <vmm/virtio_config.h>
26 #include <vmm/sched.h>
29 struct vmm_gpcore_init gpci;
31 /* Whoever holds the ball runs. run_vm never actually grabs it - it is grabbed
36 void (*old_thread_refl)(struct uthread *uth, struct user_context *ctx);
38 static void copy_vmtf_to_vmctl(struct vm_trapframe *vm_tf, struct vmctl *vmctl)
40 vmctl->cr3 = vm_tf->tf_cr3;
41 vmctl->gva = vm_tf->tf_guest_va;
42 vmctl->gpa = vm_tf->tf_guest_pa;
43 vmctl->exit_qual = vm_tf->tf_exit_qual;
44 if (vm_tf->tf_exit_reason == EXIT_REASON_EPT_VIOLATION)
45 vmctl->shutdown = SHUTDOWN_EPT_VIOLATION;
47 vmctl->shutdown = SHUTDOWN_UNHANDLED_EXIT_REASON;
48 vmctl->ret_code = vm_tf->tf_exit_reason;
49 vmctl->interrupt = vm_tf->tf_trap_inject;
50 vmctl->intrinfo1 = vm_tf->tf_intrinfo1;
51 vmctl->intrinfo2 = vm_tf->tf_intrinfo2;
52 /* Most of the HW TF. Should be good enough for now */
53 vmctl->regs.tf_rax = vm_tf->tf_rax;
54 vmctl->regs.tf_rbx = vm_tf->tf_rbx;
55 vmctl->regs.tf_rcx = vm_tf->tf_rcx;
56 vmctl->regs.tf_rdx = vm_tf->tf_rdx;
57 vmctl->regs.tf_rbp = vm_tf->tf_rbp;
58 vmctl->regs.tf_rsi = vm_tf->tf_rsi;
59 vmctl->regs.tf_rdi = vm_tf->tf_rdi;
60 vmctl->regs.tf_r8 = vm_tf->tf_r8;
61 vmctl->regs.tf_r9 = vm_tf->tf_r9;
62 vmctl->regs.tf_r10 = vm_tf->tf_r10;
63 vmctl->regs.tf_r11 = vm_tf->tf_r11;
64 vmctl->regs.tf_r12 = vm_tf->tf_r12;
65 vmctl->regs.tf_r13 = vm_tf->tf_r13;
66 vmctl->regs.tf_r14 = vm_tf->tf_r14;
67 vmctl->regs.tf_r15 = vm_tf->tf_r15;
68 vmctl->regs.tf_rip = vm_tf->tf_rip;
69 vmctl->regs.tf_rflags = vm_tf->tf_rflags;
70 vmctl->regs.tf_rsp = vm_tf->tf_rsp;
73 static void copy_vmctl_to_vmtf(struct vmctl *vmctl, struct vm_trapframe *vm_tf)
75 vm_tf->tf_rax = vmctl->regs.tf_rax;
76 vm_tf->tf_rbx = vmctl->regs.tf_rbx;
77 vm_tf->tf_rcx = vmctl->regs.tf_rcx;
78 vm_tf->tf_rdx = vmctl->regs.tf_rdx;
79 vm_tf->tf_rbp = vmctl->regs.tf_rbp;
80 vm_tf->tf_rsi = vmctl->regs.tf_rsi;
81 vm_tf->tf_rdi = vmctl->regs.tf_rdi;
82 vm_tf->tf_r8 = vmctl->regs.tf_r8;
83 vm_tf->tf_r9 = vmctl->regs.tf_r9;
84 vm_tf->tf_r10 = vmctl->regs.tf_r10;
85 vm_tf->tf_r11 = vmctl->regs.tf_r11;
86 vm_tf->tf_r12 = vmctl->regs.tf_r12;
87 vm_tf->tf_r13 = vmctl->regs.tf_r13;
88 vm_tf->tf_r14 = vmctl->regs.tf_r14;
89 vm_tf->tf_r15 = vmctl->regs.tf_r15;
90 vm_tf->tf_rip = vmctl->regs.tf_rip;
91 vm_tf->tf_rflags = vmctl->regs.tf_rflags;
92 vm_tf->tf_rsp = vmctl->regs.tf_rsp;
93 vm_tf->tf_cr3 = vmctl->cr3;
94 vm_tf->tf_trap_inject = vmctl->interrupt;
95 /* Don't care about the rest of the fields. The kernel only writes them */
98 /* callback, runs in vcore context. this sets up our initial context. once we
99 * become runnable again, we'll run the first bits of the vm ctx. after that,
100 * our context will be stopped and started and will just run whatever the guest
101 * VM wants. we'll never come back to this code or to run_vm(). */
102 static void __build_vm_ctx_cb(struct uthread *uth, void *arg)
104 struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
105 struct vmctl *vmctl = (struct vmctl*)arg;
106 struct vm_trapframe *vm_tf;
108 __pthread_generic_yield(pthread);
109 pthread->state = PTH_BLK_YIELDING;
111 memset(&uth->u_ctx, 0, sizeof(struct user_context));
112 uth->u_ctx.type = ROS_VM_CTX;
113 vm_tf = &uth->u_ctx.tf.vm_tf;
115 vm_tf->tf_guest_pcoreid = 0; /* assuming only 1 guest core */
117 copy_vmctl_to_vmtf(vmctl, vm_tf);
119 /* other HW/GP regs are 0, which should be fine. the FP state is still
120 * whatever we were running before, though this is pretty much unnecessary.
121 * we mostly don't want crazy crap in the uth->as, and a non-current_uthread
122 * VM ctx is supposed to have something in their FP state (like HW ctxs). */
123 save_fp_state(&uth->as);
124 uth->flags |= UTHREAD_FPSAVED | UTHREAD_SAVED;
126 uthread_runnable(uth);
129 static void *run_vm(void *arg)
131 struct vmctl *vmctl = (struct vmctl*)arg;
133 assert(vmctl->command == REG_RSP_RIP_CR3);
134 /* We need to hack our context, so that next time we run, we're a VM ctx */
135 uthread_yield(FALSE, __build_vm_ctx_cb, arg);
138 static void vmm_thread_refl_fault(struct uthread *uth,
139 struct user_context *ctx)
141 struct pthread_tcb *pthread = (struct pthread_tcb*)uth;
143 /* Hack to call the original pth 2LS op */
144 if (!ctx->type == ROS_VM_CTX) {
145 old_thread_refl(uth, ctx);
148 __pthread_generic_yield(pthread);
149 /* normally we'd handle the vmexit here. to work within the existing
150 * framework, we just wake the controller thread. It'll look at our ctx
151 * then make us runnable again */
152 pthread->state = PTH_BLK_MUTEX;
153 uth_mutex_unlock(the_ball); /* wake the run_vmthread */
/* this will start the vm thread, and return when the thread has blocked,
 * with the right info in vmctl. */
static void run_vmthread(struct vmctl *vmctl)
/* NOTE(review): lines appear to be missing from this view of the file (the
 * function's opening brace, the once-only-init guard, and the error-exit
 * path of pthread_create); the visible code is kept byte-identical. */
struct vm_trapframe *vm_tf;
/* first time through, we make the vm thread. the_ball was already
 * grabbed right after it was alloc'd. */
if (pthread_create(&vm_thread, NULL, run_vm, vmctl)) {
perror("pth_create");
/* hack in our own handlers for some 2LS ops */
old_thread_refl = sched_ops->thread_refl_fault;
sched_ops->thread_refl_fault = vmm_thread_refl_fault;
/* Seed the VM thread's context from the caller-provided vmctl. */
copy_vmctl_to_vmtf(vmctl, &vm_thread->uthread.u_ctx.tf.vm_tf);
uth_mutex_lock(the_ball); /* grab it for the vm_thread */
uthread_runnable((struct uthread*)vm_thread);
/* Block here until vmm_thread_refl_fault() unlocks the_ball on a vmexit. */
uth_mutex_lock(the_ball);
/* We woke due to a vm exit. Need to unlock for the next time we're run */
uth_mutex_unlock(the_ball);
/* the vm stopped. we can do whatever we want before rerunning it. since
 * we're controlling the uth, we need to handle its vmexits. we'll fill in
 * the vmctl, since that's the current framework. */
copy_vmtf_to_vmctl(&vm_thread->uthread.u_ctx.tf.vm_tf, vmctl);
/* By 1999 an OS could discover the platform just by scanning the hardware,
 * but by 2005 that was no longer possible, so we have to fake ACPI tables
 * to make it all work for the guest.
 * This will be copied to memory at 0xe0000, so the kernel can find it.
/* Assume each table is 256 bytes long, just to keep the layout simple;
 * the pointers then always refer to aligned locations.
/* Template ACPI tables.  These get filled in further and copied into guest
 * low memory (at 0xe0000) in main(), where the guest kernel's ACPI probe
 * expects to find them.  NOTE(review): several initializer and closing
 * lines are missing from this view of the file; the visible fields are
 * kept byte-identical. */
struct acpi_table_rsdp rsdp = {
.signature = "RSD PTR ",
/* XSDT: the 64-bit root table; its entries will point at the FADT/MADT. */
struct acpi_table_xsdt xsdt = {
/* Placeholder OEM/compiler identification; ACPI requires these fields. */
.oem_table_id = "ALPHABET",
.asl_compiler_id = "RON ",
.asl_compiler_revision = 0,
struct acpi_table_fadt fadt = {
/* Placeholder OEM/compiler identification; ACPI requires these fields. */
.oem_table_id = "ALPHABET",
.asl_compiler_id = "RON ",
.asl_compiler_revision = 0,
/* This has to be dropped into memory, then the other crap just follows it.
struct acpi_table_madt madt = {
.oem_table_id = "ALPHABET",
.asl_compiler_id = "RON ",
.asl_compiler_revision = 0,
/* Local APIC base address advertised to the guest. */
.address = 0xfee00000ULL,
/* One local APIC (processor 0) and one IOAPIC at the conventional
 * 0xfec00000 base. */
struct acpi_madt_local_apic Apic0 = {.header = {.type = ACPI_MADT_TYPE_LOCAL_APIC, .length = sizeof(struct acpi_madt_local_apic)},
.processor_id = 0, .id = 0};
struct acpi_madt_io_apic Apic1 = {.header = {.type = ACPI_MADT_TYPE_IO_APIC, .length = sizeof(struct acpi_madt_io_apic)},
.id = 1, .address = 0xfec00000, .global_irq_base = 0};
struct acpi_madt_local_x2apic X2Apic0 = {
.type = ACPI_MADT_TYPE_LOCAL_X2APIC,
.length = sizeof(struct acpi_madt_local_x2apic)
/* Interrupt source overrides: mostly identity-map ISA IRQs 1..15, except
 * IRQ 2 (the cascade), which is routed to GSI 0 below. */
struct acpi_madt_interrupt_override isor[] = {
/* Unverified: it may need to be source 0 -> global 2 instead of source 2 ->
 * global 0.  TODO(review): confirm against the ACPI MADT specification. */
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 2, .global_irq = 0, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 1, .global_irq = 1, .inti_flags = 0},
//{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
//.bus = 0, .source_irq = 2, .global_irq = 2, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 3, .global_irq = 3, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 4, .global_irq = 4, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 5, .global_irq = 5, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 6, .global_irq = 6, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 7, .global_irq = 7, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 8, .global_irq = 8, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 9, .global_irq = 9, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 10, .global_irq = 10, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 11, .global_irq = 11, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 12, .global_irq = 12, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 13, .global_irq = 13, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 14, .global_irq = 14, .inti_flags = 0},
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 15, .global_irq = 15, .inti_flags = 0},
// VMMCP routes irq 32 to gsi 17
{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
.bus = 0, .source_irq = 32, .global_irq = 17, .inti_flags = 5},
299 /* this test will run the "kernel" in the negative address space. We hope. */
302 unsigned long long stack[1024];
303 volatile int shared = 0;
304 volatile int quit = 0;
308 /* total hack. If the vm runs away we want to get control again. */
309 unsigned int maxresume = (unsigned int) -1;
311 #define MiB 0x100000u
313 #define GKERNBASE (16*MiB)
314 #define KERNSIZE (128*MiB+GKERNBASE)
315 uint8_t _kernel[KERNSIZE];
317 unsigned long long *p512, *p1, *p2m;
322 int resumeprompt = 0;
323 /* unlike Linux, this shared struct is for both host and guest. */
324 // struct virtqueue *constoguest =
325 // vring_new_virtqueue(0, 512, 8192, 0, inpages, NULL, NULL, "test");
326 uint64_t virtio_mmio_base = 0x100000000ULL;
328 void vapic_status_dump(FILE *f, void *vapic);
329 static void set_posted_interrupt(int vector);
331 #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
332 #error "Get a gcc newer than 4.4.0"
334 #define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
337 #define LOCK_PREFIX "lock "
338 #define ADDR BITOP_ADDR(addr)
339 static inline int test_and_set_bit(int nr, volatile unsigned long *addr);
341 pthread_t timerthread_struct;
/* Host timer thread: polls the guest's virtual APIC page for a programmed
 * LAPIC timer (nonzero vector and initial count) and, when armed, posts the
 * timer interrupt and pokes the guest awake via SYS_vmm_poke_guest.
 * NOTE(review): the surrounding loop and some declarations are missing from
 * this view; visible code kept byte-identical. */
void *timer_thread(void *arg)
uint32_t initial_count;
/* Read vector/initial-count from the vAPIC page.  Assumes the LVT timer
 * vector lives at uint32_t index 0x32 and initial count at 0x38 of
 * gpci.vapic_addr -- TODO(review): confirm these offsets. */
vector = ((uint32_t *)gpci.vapic_addr)[0x32] & 0xff;
initial_count = ((uint32_t *)gpci.vapic_addr)[0x38];
/* Only fire if the guest actually programmed the timer. */
if (vector && initial_count) {
set_posted_interrupt(vector);
ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
/* ~100ms tick; coarse, but adequate for this test VMM. */
uthread_usleep(100000);
fprintf(stderr, "SENDING TIMER\n");
/* Host-side virtio console output thread: waits for buffers the guest posts
 * on the console "out" virtqueue, writes readable buffers to our stdout,
 * and acks everything back with add_used().
 * NOTE(review): loop braces and some declarations are missing from this
 * view; visible code kept byte-identical. */
void *consout(void *arg)
char *line, *consline, *outline;
/* NOTE(review): sizeof(outline)/sizeof(line) are pointer sizes here, not
 * buffer sizes -- looks suspicious, but kept as-is. */
static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
static struct scatterlist in[] = { {NULL, sizeof(line)}, };
static struct scatterlist iov[32];
struct virtio_threadarg *a = arg;
static unsigned int inlen, outlen, conslen;
struct virtqueue *v = a->arg->virtio;
fprintf(stderr, "talk thread ..\n");
uint16_t head, gaveit = 0, gotitback = 0;
fprintf(stderr, "----------------------- TT a %p\n", a);
fprintf(stderr, "talk thread ttargs %x v %x\n", a, v);
for(num = 0;;num++) {
/* host: use any buffers we should have been sent. */
head = wait_for_vq_desc(v, iov, &outlen, &inlen);
fprintf(stderr, "CCC: vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
for(i = 0; debug && i < outlen + inlen; i++)
fprintf(stderr, "CCC: v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
/* host: if we got an output buffer, just output it. */
for(i = 0; i < outlen; i++) {
fprintf(stderr, "CCC: IOV length is %d\n", iov[i].length);
/* Emit the guest's bytes one at a time to stdout. */
for (j = 0; j < iov[i].length; j++)
printf("%c", ((char *)iov[i].v)[j]);
fprintf(stderr, "CCC: outlen is %d; inlen is %d\n", outlen, inlen);
/* host: fill in the writeable buffers. */
/* why we're getting these I don't know. */
for (i = outlen; i < outlen + inlen; i++) {
if (debug) fprintf(stderr, "CCC: send back empty writeable");
if (debug) fprintf(stderr, "CCC: call add_used\n");
/* host: now ack that we used them all. */
add_used(v, head, outlen+inlen);
if (debug) fprintf(stderr, "CCC: DONE call add_used\n");
fprintf(stderr, "All done\n");
416 volatile int consdata = 0;
/* Host-side virtio console input thread: reads one byte at a time from our
 * stdin and hands it to the guest through the console "in" virtqueue, then
 * notifies the guest via a posted interrupt and SYS_vmm_poke_guest.
 * NOTE(review): loop braces and some declarations are missing from this
 * view; visible code kept byte-identical. */
void *consin(void *arg)
struct virtio_threadarg *a = arg;
char *line, *outline;
static char consline[128];
static struct scatterlist iov[32];
/* NOTE(review): sizeof(outline)/sizeof(line) are pointer sizes here, not
 * buffer sizes -- looks suspicious, but kept as-is. */
static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
static struct scatterlist in[] = { {NULL, sizeof(line)}, };
static unsigned int inlen, outlen, conslen;
struct virtqueue *v = a->arg->virtio;
fprintf(stderr, "consin thread ..\n");
uint16_t head, gaveit = 0, gotitback = 0;
if (debug) fprintf(stderr, "Spin on console being read, print num queues, halt\n");
/* Run until something sets the global 'quit' flag. */
for(num = 0;! quit;num++) {
/* host: use any buffers we should have been sent. */
head = wait_for_vq_desc(v, iov, &outlen, &inlen);
fprintf(stderr, "vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
for(i = 0; debug && i < outlen + inlen; i++)
fprintf(stderr, "v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
fprintf(stderr, "outlen is %d; inlen is %d\n", outlen, inlen);
/* host: fill in the writeable buffers. */
for (i = outlen; i < outlen + inlen; i++) {
/* host: read a line. */
memset(consline, 0, 128);
if (read(0, consline, 1) < 0) {
if (debug) fprintf(stderr, "CONSIN: GOT A LINE:%s:\n", consline);
if (debug) fprintf(stderr, "CONSIN: OUTLEN:%d:\n", outlen);
/* A lone 'q' on input quits (handling truncated in this view). */
if (strlen(consline) < 3 && consline[0] == 'q' ) {
/* Copy the byte(s) into the guest's writeable buffer, NUL included. */
memmove(iov[i].v, consline, strlen(consline)+ 1);
iov[i].length = strlen(consline) + 1;
if (debug) fprintf(stderr, "call add_used\n");
/* host: now ack that we used them all. */
add_used(v, head, outlen+inlen);
/* turn off consdata - the IRQ injection isn't right */
if (debug) fprintf(stderr, "DONE call add_used\n");
// Send spurious for testing (Gan)
set_posted_interrupt(0xE5);
virtio_mmio_set_vring_irq();
ros_syscall(SYS_vmm_poke_guest, 0, 0, 0, 0, 0, 0);
fprintf(stderr, "All done\n");
/* Virtio-over-MMIO console device: one input and one output queue, each
 * serviced by its own host thread (consin/consout).
 * NOTE(review): closing lines of this initializer are missing from view. */
static struct vqdev vqdev= {
dev: VIRTIO_ID_CONSOLE,
device_features: 0, /* Can't do it: linux console device does not support it. VIRTIO_F_VERSION_1*/
{name: "consin", maxqnum: 64, f: consin, arg: (void *)0},
{name: "consout", maxqnum: 64, f: consout, arg: (void *)0},
494 __asm__ __volatile__ (".section .lowmem, \"aw\"\n\tlow: \n\t.=0x1000\n\t.align 0x100000\n\t.previous\n");
/* Sums 'length' bytes starting at 'buffer' modulo 256.  An ACPI table is
 * valid when all of its bytes, including the checksum field, sum to zero;
 * a nonzero return therefore means "bad checksum".
 *
 * @buffer:  start of the region to sum
 * @length:  number of bytes to sum
 * @return:  the 8-bit sum of the region
 */
static uint8_t acpi_tb_checksum(uint8_t *buffer, uint32_t length)
{
	uint8_t sum = 0;
	uint8_t *end = buffer + length;

	fprintf(stderr, "tbchecksum %p for %u", buffer, length);
	while (buffer < end) {
		/* Debug aid: dump the running sum just before the last byte. */
		if (end - buffer < 2)
			fprintf(stderr, "%02x\n", sum);
		sum = (uint8_t)(sum + *(buffer++));
	}
	fprintf(stderr, " is %02x\n", sum);
	return sum;
}
/* Computes and installs an ACPI-style checksum for the 'len' bytes at
 * 'data', storing it through 'target' so that the whole region (including
 * the checksum byte itself) sums to zero.
 *
 * @target:  address of the checksum byte (typically inside 'data')
 * @data:    start of the region being checksummed
 * @len:     length of the region in bytes
 */
static void gencsum(uint8_t *target, void *data, int len)
{
	uint8_t csum;

	// blast target to zero so it does not get counted
	// (it might be in the struct we checksum) And, yes, it is, goodness.
	fprintf(stderr, "gencsum %p target %p source %d bytes\n", target, data, len);
	*target = 0;
	csum = acpi_tb_checksum((uint8_t *)data, len);
	/* Two's complement of the sum: region + checksum byte now totals 0. */
	*target = ~csum + 1;
	fprintf(stderr, "Computed is %02x\n", *target);
}
523 static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
527 asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
528 "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
533 static void pir_dump()
535 unsigned long *pir_ptr = gpci.posted_irq_desc;
537 fprintf(stderr, "-------Begin PIR dump-------\n");
538 for (i = 0; i < 8; i++){
539 fprintf(stderr, "Byte %d: 0x%016x\n", i, pir_ptr[i]);
541 fprintf(stderr, "-------End PIR dump-------\n");
544 static void set_posted_interrupt(int vector)
546 test_and_set_bit(vector, gpci.posted_irq_desc);
547 /* LOCKed instruction provides the mb() */
548 test_and_set_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpci.posted_irq_desc);
551 int main(int argc, char **argv)
553 struct boot_params *bp;
554 char *cmdline_default = "earlyprintk=vmcall,keep"
556 " virtio_mmio.device=1M@0x100000000:32"
559 " acpi.debug_layer=0x2"
560 " acpi.debug_level=0xffffffff"
564 " init=/bin/launcher"
565 " lapic=notscdeadline"
566 " lapictimerfreq=1000000"
568 char *cmdline_extra = "\0";
571 void *a = (void *)0xe0000;
572 struct acpi_table_rsdp *r;
573 struct acpi_table_fadt *f;
574 struct acpi_table_madt *m;
575 struct acpi_table_xsdt *x;
576 uint64_t virtiobase = 0x100000000ULL;
577 // lowmem is a bump allocated pointer to 2M at the "physbase" of memory
578 void *lowmem = (void *) 0x1000000;
579 //struct vmctl vmctl;
581 int vmmflags = 0; // Disabled probably forever. VMM_VMCALL_PRINTF;
582 uint64_t entry = 0x1200000, kerneladdress = 0x1200000;
587 static char cmd[512];
590 void *coreboot_tables = (void *) 0x1165000;
592 struct vm_trapframe *vm_tf;
593 uint64_t tsc_freq_khz;
595 the_ball = uth_mutex_alloc();
596 uth_mutex_lock(the_ball);
598 fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT,
602 // mmap is not working for us at present.
603 if ((uint64_t)_kernel > GKERNBASE) {
604 fprintf(stderr, "kernel array @%p is above , GKERNBASE@%p sucks\n", _kernel, GKERNBASE);
607 memset(_kernel, 0, sizeof(_kernel));
608 memset(lowmem, 0xff, 2*1048576);
609 memset(low4k, 0xff, 4096);
610 // avoid at all costs, requires too much instruction emulation.
612 //low4k[0x40f] = 0xe0;
615 a_page = mmap((void *)0xfee00000, PGSIZE, PROT_READ | PROT_WRITE,
616 MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
617 fprintf(stderr, "a_page mmap pointer %p\n", a_page);
619 if (a_page == (void *) -1) {
620 perror("Could not mmap APIC");
623 if (((uint64_t)a_page & 0xfff) != 0) {
624 perror("APIC page mapping is not page aligned");
628 memset(a_page, 0, 4096);
629 ((uint32_t *)a_page)[0x30/4] = 0x01060015;
630 //((uint32_t *)a_page)[0x30/4] = 0xDEADBEEF;
635 // Sorry, I don't much like the gnu opt parsing code.
644 vmmflags |= VMM_VMCALL_PRINTF;
648 maxresume = strtoull(argv[0], 0, 0);
652 virtioirq = strtoull(argv[0], 0, 0);
656 cmdline_extra = argv[0];
658 fprintf(stderr, "BMAFR\n");
664 fprintf(stderr, "Usage: %s vmimage [-n (no vmcall printf)] [coreboot_tables [loadaddress [entrypoint]]]\n", argv[0]);
668 coreboot_tables = (void *) strtoull(argv[1], 0, 0);
670 kerneladdress = strtoull(argv[2], 0, 0);
672 entry = strtoull(argv[3], 0, 0);
673 kfd = open(argv[0], O_RDONLY);
678 // read in the kernel.
679 xp = (void *)kerneladdress;
681 amt = read(kfd, xp, 1048576);
691 fprintf(stderr, "Read in %d bytes\n", xp-kerneladdress);
694 // The low 1m so we can fill in bullshit like ACPI. */
695 // And, sorry, due to the STUPID format of the RSDP for now we need the low 1M.
696 low1m = mmap((int*)4096, MiB-4096, PROT_READ | PROT_WRITE,
697 MAP_ANONYMOUS, -1, 0);
698 if (low1m != (void *)4096) {
699 perror("Unable to mmap low 1m");
702 memset(low1m, 0xff, MiB-4096);
704 fprintf(stderr, "install rsdp to %p\n", r);
707 memmove(&r->xsdt_physical_address, &a, sizeof(a));
708 gencsum(&r->checksum, r, ACPI_RSDP_CHECKSUM_LENGTH);
709 if ((csum = acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_CHECKSUM_LENGTH)) != 0) {
710 fprintf(stderr, "RSDP has bad checksum; summed to %x\n", csum);
714 /* Check extended checksum if table version >= 2 */
715 gencsum(&r->extended_checksum, r, ACPI_RSDP_XCHECKSUM_LENGTH);
716 if ((rsdp.revision >= 2) &&
717 (acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_XCHECKSUM_LENGTH) != 0)) {
718 fprintf(stderr, "RSDP has bad checksum v2\n");
722 /* just leave a bunch of space for the xsdt. */
723 /* we need to zero the area since it has pointers. */
725 a += sizeof(*x) + 8*sizeof(void *);
726 memset(x, 0, a - (void *)x);
727 fprintf(stderr, "install xsdt to %p\n", x);
729 x->table_offset_entry[0] = 0;
730 x->table_offset_entry[1] = 0;
731 x->header.length = a - (void *)x;
734 fprintf(stderr, "install fadt to %p\n", f);
736 x->table_offset_entry[2] = (uint64_t) f;
738 f->header.length = a - (void *)f;
739 gencsum(&f->header.checksum, f, f->header.length);
740 if (acpi_tb_checksum((uint8_t *)f, f->header.length) != 0) {
741 fprintf(stderr, "ffadt has bad checksum v2\n");
747 x->table_offset_entry[3] = (uint64_t) m;
749 fprintf(stderr, "install madt to %p\n", m);
750 memmove(a, &Apic0, sizeof(Apic0));
752 memmove(a, &Apic1, sizeof(Apic1));
754 memmove(a, &X2Apic0, sizeof(X2Apic0));
755 a += sizeof(X2Apic0);
756 memmove(a, &isor, sizeof(isor));
758 m->header.length = a - (void *)m;
759 gencsum(&m->header.checksum, m, m->header.length);
760 if (acpi_tb_checksum((uint8_t *) m, m->header.length) != 0) {
761 fprintf(stderr, "madt has bad checksum v2\n");
764 fprintf(stderr, "allchecksums ok\n");
766 gencsum(&x->header.checksum, x, x->header.length);
767 if ((csum = acpi_tb_checksum((uint8_t *) x, x->header.length)) != 0) {
768 fprintf(stderr, "XSDT has bad checksum; summed to %x\n", csum);
772 hexdump(stdout, r, a-(void *)r);
774 a = (void *)(((unsigned long)a + 0xfff) & ~0xfff);
775 gpci.posted_irq_desc = a;
779 //vmctl.vapic = (uint64_t) a_page;
781 ((uint32_t *)a)[0x30/4] = 0x01060014;
783 // set up apic values? do we need to?
785 //((uint8_t *)a)[4] = 1;
787 gpci.apic_addr = (void*)0xfee00000;
789 /* Allocate memory for, and zero the bootparams
790 * page before writing to it, or Linux thinks
791 * we're talking crazy.
797 /* Set the kernel command line parameters */
801 bp->hdr.cmd_line_ptr = (uintptr_t) cmdline;
802 tsc_freq_khz = get_tsc_freq()/1000;
803 sprintf(cmdline, "%s tscfreq=%lld %s", cmdline_default, tsc_freq_khz,
807 /* Put the e820 memory region information in the boot_params */
808 bp->e820_entries = 3;
811 bp->e820_map[e820i].addr = 0;
812 bp->e820_map[e820i].size = 16 * 1048576;
813 bp->e820_map[e820i++].type = E820_RESERVED;
815 bp->e820_map[e820i].addr = 16 * 1048576;
816 bp->e820_map[e820i].size = 128 * 1048576;
817 bp->e820_map[e820i++].type = E820_RAM;
819 bp->e820_map[e820i].addr = 0xf0000000;
820 bp->e820_map[e820i].size = 0x10000000;
821 bp->e820_map[e820i++].type = E820_RESERVED;
823 if (ros_syscall(SYS_vmm_setup, nr_gpcs, &gpci, vmmflags, 0, 0, 0) !=
825 perror("Guest pcore setup failed");
829 fprintf(stderr, "Run with %d cores and vmmflags 0x%x\n", nr_gpcs, vmmflags);
832 my_retvals = malloc(sizeof(void*) * nr_threads);
834 perror("Init threads/malloc");
836 pthread_can_vcore_request(FALSE); /* 2LS won't manage vcores */
837 pthread_need_tls(FALSE);
838 pthread_mcp_init(); /* gives us one vcore */
839 vcore_request(nr_threads - 1); /* ghetto incremental interface */
840 for (int i = 0; i < nr_threads; i++) {
841 xp = __procinfo.vcoremap;
842 fprintf(stderr, "%p\n", __procinfo.vcoremap);
843 fprintf(stderr, "Vcore %d mapped to pcore %d\n", i,
844 __procinfo.vcoremap[i].pcoreid);
848 ret = syscall(33, 1);
853 ret = posix_memalign((void **)&p512, 4096, 3*4096);
854 fprintf(stderr, "memalign is %p\n", p512);
861 uint64_t kernbase = 0; //0xffffffff80000000;
862 uint64_t highkernbase = 0xffffffff80000000;
863 p512[PML4(kernbase)] = (unsigned long long)p1 | 7;
864 p1[PML3(kernbase)] = /*0x87; */(unsigned long long)p2m | 7;
865 p512[PML4(highkernbase)] = (unsigned long long)p1 | 7;
866 p1[PML3(highkernbase)] = /*0x87; */(unsigned long long)p2m | 7;
867 #define _2MiB (0x200000)
869 for (i = 0; i < 512; i++) {
870 p2m[PML2(kernbase + i * _2MiB)] = 0x87 | i * _2MiB;
874 kernbase <<= (0 + 12);
875 uint8_t *kernel = (void *)GKERNBASE;
876 //write_coreboot_table(coreboot_tables, ((void *)VIRTIOBASE) /*kernel*/, KERNSIZE + 1048576);
877 hexdump(stdout, coreboot_tables, 512);
878 fprintf(stderr, "kernbase for pml4 is 0x%llx and entry is %llx\n", kernbase, entry);
879 fprintf(stderr, "p512 %p p512[0] is 0x%lx p1 %p p1[0] is 0x%x\n", p512, p512[0], p1, p1[0]);
881 vmctl.command = REG_RSP_RIP_CR3;
882 vmctl.cr3 = (uint64_t) p512;
883 vmctl.regs.tf_rip = entry;
884 vmctl.regs.tf_rsp = (uint64_t) &stack[1024];
885 vmctl.regs.tf_rsi = (uint64_t) bp;
887 /* set up virtio bits, which depend on threads being enabled. */
888 register_virtio_mmio(&vqdev, virtio_mmio_base);
890 fprintf(stderr, "threads started\n");
891 fprintf(stderr, "Writing command :%s:\n", cmd);
894 vapic_status_dump(stderr, (void *)gpci.vapic_addr);
896 run_vmthread(&vmctl);
899 vapic_status_dump(stderr, (void *)gpci.vapic_addr);
902 /* Start up timer thread */
903 if (pthread_create(&timerthread_struct, NULL, timer_thread, NULL)) {
904 fprintf(stderr, "pth_create failed for timer thread.");
905 perror("pth_create");
909 vm_tf = &(vm_thread->uthread.u_ctx.tf.vm_tf);
915 //vmctl.command = REG_RIP;
916 if (maxresume-- == 0) {
921 fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
922 vm_tf->tf_exit_reason);
923 showstatus(stderr, (struct guest_thread*)&vm_thread);
926 fprintf(stderr, "RESUME?\n");
931 if (vm_tf->tf_exit_reason == EXIT_REASON_EPT_VIOLATION) {
932 uint64_t gpa, *regp, val;
936 if (decode((struct guest_thread *) vm_thread, &gpa, ®x, ®p,
937 &store, &size, &advance)) {
938 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
939 vm_tf->tf_exit_reason);
940 showstatus(stderr, (struct guest_thread*)&vm_thread);
944 if (debug) fprintf(stderr, "%p %p %p %p %p %p\n", gpa, regx, regp, store, size, advance);
945 if ((gpa & ~0xfffULL) == virtiobase) {
946 if (debug) fprintf(stderr, "DO SOME VIRTIO\n");
947 // Lucky for us the various virtio ops are well-defined.
948 virtio_mmio((struct guest_thread *)vm_thread, gpa, regx, regp,
950 if (debug) fprintf(stderr, "store is %d:\n", store);
951 if (debug) fprintf(stderr, "REGP IS %16x:\n", *regp);
952 } else if ((gpa & 0xfee00000) == 0xfee00000) {
953 // until we fix our include mess, just put the proto here.
954 //int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
955 //apic(&vmctl, gpa, regx, regp, store);
956 } else if ((gpa & 0xfec00000) == 0xfec00000) {
957 // until we fix our include mess, just put the proto here.
958 do_ioapic((struct guest_thread *)vm_thread, gpa, regx, regp,
960 } else if (gpa < 4096) {
962 memmove(&val, &low4k[gpa], size);
963 hexdump(stdout, &low4k[gpa], size);
964 fprintf(stderr, "Low 1m, code %p read @ %p, size %d, val %p\n",
965 vm_tf->tf_rip, gpa, size, val);
966 memmove(regp, &low4k[gpa], size);
967 hexdump(stdout, regp, size);
969 fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
970 fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
971 vm_tf->tf_exit_reason);
972 fprintf(stderr, "Returning 0xffffffff\n");
973 showstatus(stderr, (struct guest_thread*)&vm_thread);
974 // Just fill the whole register for now.
975 *regp = (uint64_t) -1;
977 vm_tf->tf_rip += advance;
979 fprintf(stderr, "Advance rip by %d bytes to %p\n",
980 advance, vm_tf->tf_rip);
981 //vmctl.shutdown = 0;
983 //vmctl.command = REG_ALL;
985 switch (vm_tf->tf_exit_reason) {
986 case EXIT_REASON_VMCALL:
987 byte = vm_tf->tf_rdi;
989 if (byte == '\n') printf("%c", '%');
992 case EXIT_REASON_EXTERNAL_INTERRUPT:
995 fprintf(stderr, "XINT 0x%x 0x%x\n",
996 vm_tf->tf_intrinfo1, vm_tf->tf_intrinfo2);
997 if (debug) pir_dump();
998 //vmctl.command = RESUME;
1000 case EXIT_REASON_IO_INSTRUCTION:
1001 fprintf(stderr, "IO @ %p\n", vm_tf->tf_rip);
1002 io((struct guest_thread *)vm_thread);
1003 //vmctl.shutdown = 0;
1005 //vmctl.command = REG_ALL;
1007 case EXIT_REASON_INTERRUPT_WINDOW:
1009 if (debug) fprintf(stderr, "inject an interrupt\n");
1010 virtio_mmio_set_vring_irq();
1011 vm_tf->tf_trap_inject = 0x80000000 | virtioirq;
1012 //vmctl.command = RESUME;
1016 case EXIT_REASON_MSR_WRITE:
1017 case EXIT_REASON_MSR_READ:
1018 fprintf(stderr, "Do an msr\n");
1019 if (msrio((struct guest_thread *)vm_thread, &gpci,
1020 vm_tf->tf_exit_reason)) {
1021 // uh-oh, msrio failed
1022 // well, hand back a GP fault which is what Intel does
1023 fprintf(stderr, "MSR FAILED: RIP %p, shutdown 0x%x\n",
1024 vm_tf->tf_rip, vm_tf->tf_exit_reason);
1025 showstatus(stderr, (struct guest_thread*)&vm_thread);
1027 // Use event injection through vmctl to send
1028 // a general protection fault
1029 // vmctl.interrupt gets written to the VM-Entry
1030 // Interruption-Information Field by vmx
1031 vm_tf->tf_trap_inject = VM_TRAP_VALID
1032 | VM_TRAP_ERROR_CODE
1039 case EXIT_REASON_MWAIT_INSTRUCTION:
1041 if (debug)fprintf(stderr, "\n================== Guest MWAIT. =======================\n");
1042 if (debug)fprintf(stderr, "Wait for cons data\n");
1047 vapic_status_dump(stderr, gpci.vapic_addr);
1048 if (debug)fprintf(stderr, "Resume with consdata ...\n");
1050 //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
1051 //showstatus(stderr, (struct guest_thread*)&vm_thread);
1053 case EXIT_REASON_HLT:
1055 if (debug)fprintf(stderr, "\n================== Guest halted. =======================\n");
1056 if (debug)fprintf(stderr, "Wait for cons data\n");
1060 if (debug)fprintf(stderr, "Resume with consdata ...\n");
1062 //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
1063 //showstatus(stderr, (struct guest_thread*)&vm_thread);
1065 case EXIT_REASON_APIC_ACCESS:
1066 if (1 || debug)fprintf(stderr, "APIC READ EXIT\n");
1068 uint64_t gpa, *regp, val;
1072 if (decode((struct guest_thread *)vm_thread, &gpa, ®x,
1073 ®p, &store, &size, &advance)) {
1074 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
1075 vm_tf->tf_exit_reason);
1076 showstatus(stderr, (struct guest_thread*)&vm_thread);
1081 int apic(struct guest_thread *vm_thread, uint64_t gpa,
1082 int destreg, uint64_t *regp, int store);
1083 apic((struct guest_thread *)vm_thread, gpa, regx, regp, store);
1084 vm_tf->tf_rip += advance;
1086 fprintf(stderr, "Advance rip by %d bytes to %p\n",
1087 advance, vm_tf->tf_rip);
1088 //vmctl.shutdown = 0;
1090 //vmctl.command = REG_ALL;
1092 case EXIT_REASON_APIC_WRITE:
1093 if (1 || debug)fprintf(stderr, "APIC WRITE EXIT\n");
1096 fprintf(stderr, "Don't know how to handle exit %d\n",
1097 vm_tf->tf_exit_reason);
1098 fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
1099 vm_tf->tf_exit_reason);
1100 showstatus(stderr, (struct guest_thread*)&vm_thread);
1105 if (debug) fprintf(stderr, "at bottom of switch, quit is %d\n", quit);
1109 if (debug) fprintf(stderr, "inject an interrupt\n");
1111 fprintf(stderr, "XINT 0x%x 0x%x\n", vm_tf->tf_intrinfo1,
1112 vm_tf->tf_intrinfo2);
1113 vm_tf->tf_trap_inject = 0x80000000 | virtioirq;
1114 virtio_mmio_set_vring_irq();
1117 //vmctl.command = RESUME;
1119 if (debug) fprintf(stderr, "NOW DO A RESUME\n");
1120 copy_vmtf_to_vmctl(vm_tf, &vmctl);
1121 run_vmthread(&vmctl);
1122 copy_vmctl_to_vmtf(&vmctl, vm_tf);
1126 for (int i = 0; i < nr_threads-1; i++) {
1128 if (pthread_join(my_threads[i], &my_retvals[i]))
1129 perror("pth_join failed");
1130 fprintf(stderr, "%d %d\n", i, ret);