vmrunkernel: allow loading of an SMBIOS table
[akaros.git] / tests / vmm / vmrunkernel.c
index 5a94e54..91bef23 100644 (file)
@@ -1,4 +1,4 @@
-#include <stdio.h> 
+#include <stdio.h>
 #include <pthread.h>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -6,36 +6,53 @@
 #include <parlib/arch/arch.h>
 #include <parlib/ros_debug.h>
 #include <unistd.h>
+#include <gelf.h>
 #include <errno.h>
+#include <libelf.h>
 #include <dirent.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ros/syscall.h>
 #include <sys/mman.h>
-#include <vmm/coreboot_tables.h>
 #include <vmm/vmm.h>
 #include <vmm/acpi/acpi.h>
+#include <vmm/acpi/vmm_simple_dsdt.h>
 #include <ros/arch/mmu.h>
+#include <ros/arch/membar.h>
 #include <ros/vmm.h>
 #include <parlib/uthread.h>
+#include <vmm/linux_bootparam.h>
+#include <getopt.h>
+
 #include <vmm/virtio.h>
+#include <vmm/virtio_blk.h>
 #include <vmm/virtio_mmio.h>
 #include <vmm/virtio_ids.h>
 #include <vmm/virtio_config.h>
+#include <vmm/virtio_console.h>
+#include <vmm/virtio_net.h>
+#include <vmm/virtio_lguest_console.h>
+
+#include <vmm/sched.h>
+#include <sys/eventfd.h>
+#include <sys/uio.h>
 
-int msrio(struct vmctl *vcpu, uint32_t opcode);
+struct virtual_machine local_vm, *vm = &local_vm;
 
-struct vmctl vmctl;
+struct vmm_gpcore_init gpci;
 
-/* Kind of sad what a total clusterf the pc world is. By 1999, you could just scan the hardware 
- * and work it out. But 2005, that was no longer possible. How sad. 
- * so we have to fake acpi to make it all work. !@#$!@#$#.
+/* By 1999, you could just scan the hardware
+ * and work it out. But by 2005, that was no longer possible. How sad.
+ * So we have to fake ACPI to make it all work.
  * This will be copied to memory at 0xe0000, so the kernel can find it.
  */
-/* assume they're all 256 bytes long just to make it easy. Just have pointers that point to aligned things. */
+
+/* assume they're all 256 bytes long just to make it easy.
+ * Just have pointers that point to aligned things.
+ */
 
 struct acpi_table_rsdp rsdp = {
-       .signature = "RSD PTR ",
+       .signature = ACPI_SIG_RSDP,
        .oem_id = "AKAROS",
        .revision = 2,
        .length = 36,
@@ -43,9 +60,8 @@ struct acpi_table_rsdp rsdp = {
 
 struct acpi_table_xsdt xsdt = {
        .header = {
-               .signature= "XSDT",
-               // This is so stupid. Incredibly stupid.
-               .revision = 0,
+               .signature = ACPI_SIG_DSDT,
+               .revision = 2,
                .oem_id = "AKAROS",
                .oem_table_id = "ALPHABET",
                .oem_revision = 0,
@@ -55,9 +71,8 @@ struct acpi_table_xsdt xsdt = {
 };
 struct acpi_table_fadt fadt = {
        .header = {
-               .signature= "FADT",
-               // This is so stupid. Incredibly stupid.
-               .revision = 0,
+               .signature = ACPI_SIG_FADT,
+               .revision = 2,
                .oem_id = "AKAROS",
                .oem_table_id = "ALPHABET",
                .oem_revision = 0,
@@ -66,84 +81,62 @@ struct acpi_table_fadt fadt = {
        },
 };
 
+
 /* This has to be dropped into memory, then the other crap just follows it.
  */
 struct acpi_table_madt madt = {
        .header = {
-               .signature = "APIC",
-               .revision = 0,
+               .signature = ACPI_SIG_MADT,
+               .revision = 2,
                .oem_id = "AKAROS",
                .oem_table_id = "ALPHABET",
                .oem_revision = 0,
                .asl_compiler_id = "RON ",
                .asl_compiler_revision = 0,
        },
-       
+
        .address = 0xfee00000ULL,
+       .flags = 0,
 };
 
 struct acpi_madt_local_apic Apic0 = {.header = {.type = ACPI_MADT_TYPE_LOCAL_APIC, .length = sizeof(struct acpi_madt_local_apic)},
-                                    .processor_id = 0, .id = 0};
+                                     .processor_id = 0, .id = 0, .lapic_flags = 1};
 struct acpi_madt_io_apic Apic1 = {.header = {.type = ACPI_MADT_TYPE_IO_APIC, .length = sizeof(struct acpi_madt_io_apic)},
-                                 .id = 1, .address = 0xfec00000, .global_irq_base = 0};
+                                  .id = 0, .address = 0xfec00000, .global_irq_base = 0};
+/* MADT local x2APIC entry with local APIC id 0 and uid 0. */
+struct acpi_madt_local_x2apic X2Apic0 = {
+       .header.type = ACPI_MADT_TYPE_LOCAL_X2APIC,
+       .header.length = sizeof(struct acpi_madt_local_x2apic),
+       .local_apic_id = 0,
+       .uid = 0,
+};
+
 struct acpi_madt_interrupt_override isor[] = {
-       /* I have no idea if it should be source irq 2, global 0, or global 2, source 0. Shit. */
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 2, .global_irq = 0, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 1, .global_irq = 1, .inti_flags = 0},
-       //{.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        //.bus = 0, .source_irq = 2, .global_irq = 2, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 3, .global_irq = 3, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 4, .global_irq = 4, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 5, .global_irq = 5, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 6, .global_irq = 6, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 7, .global_irq = 7, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 8, .global_irq = 8, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 9, .global_irq = 9, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 10, .global_irq = 10, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 11, .global_irq = 11, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 12, .global_irq = 12, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 13, .global_irq = 13, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 14, .global_irq = 14, .inti_flags = 0},
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 15, .global_irq = 15, .inti_flags = 0},
-       // VMMCP routes irq 32 to gsi 17
-       {.header = {.type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, .length = sizeof(struct acpi_madt_interrupt_override)},
-        .bus = 0, .source_irq = 32, .global_irq = 17, .inti_flags = 5},
+       /* From the ACPI Specification Version 6.1:
+        * For example, if your machine has the ISA Programmable Interrupt Timer
+        * (PIT) connected to ISA IRQ 0, but in APIC mode, it is connected to I/O
+        * APIC interrupt input 2, then you would need an Interrupt Source Override
+        * where the source entry is ‘0’ and the Global System Interrupt is ‘2.’
+        */
 };
 
 
 /* this test will run the "kernel" in the negative address space. We hope. */
 void *low1m;
-uint8_t low4k[4096];
-unsigned long long stack[1024];
 volatile int shared = 0;
 volatile int quit = 0;
-int mcp = 1;
-int virtioirq = 17;
 
 /* total hack. If the vm runs away we want to get control again. */
 unsigned int maxresume = (unsigned int) -1;
 
-#define MiB 0x100000u
-#define GiB (1u<<30)
-#define GKERNBASE (16*MiB)
-#define KERNSIZE (128*MiB+GKERNBASE)
-uint8_t _kernel[KERNSIZE];
-
+#define MiB 0x100000ull
+#define GiB (1ull << 30)
+#define MinMemory (16*MiB)
+void *kernel;
+unsigned long long memsize = GiB;
+uintptr_t memstart = MinMemory;
+uintptr_t stack;
 unsigned long long *p512, *p1, *p2m;
 
 void **my_retvals;
@@ -151,12 +144,10 @@ int nr_threads = 4;
 int debug = 0;
 int resumeprompt = 0;
 /* unlike Linux, this shared struct is for both host and guest. */
-//     struct virtqueue *constoguest = 
+//     struct virtqueue *constoguest =
 //             vring_new_virtqueue(0, 512, 8192, 0, inpages, NULL, NULL, "test");
-uint64_t virtio_mmio_base = 0x100000000ULL;
 
 void vapic_status_dump(FILE *f, void *vapic);
-static void set_posted_interrupt(int vector);
 
 #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
 #error "Get a gcc newer than 4.4.0"
@@ -167,160 +158,131 @@ static void set_posted_interrupt(int vector);
 #define LOCK_PREFIX "lock "
 #define ADDR                           BITOP_ADDR(addr)
 static inline int test_and_set_bit(int nr, volatile unsigned long *addr);
+static int default_nic = 1;
 
-void *timer_thread(void *arg)
-{
-       int fd = open("#cons/vmctl", O_RDWR), ret;
+pthread_t timerthread_struct;
 
+/* timer_thread periodically injects the guest's timer interrupt.
+ *
+ * Forever: read 32-bit words 0x32 and 0x38 of the page at gpci.vapic_addr,
+ * take the low byte of the first as the vector and the second as the initial
+ * count, and if both are non-zero call vmm_interrupt_guest() for guest core 0
+ * with that vector; then sleep via uthread_usleep(100000).
+ * NOTE(review): the two words are presumably the LVT timer and timer
+ * initial-count registers -- confirm the index units against the vapic layout.
+ *
+ * The loop has no exit, so this function never returns; 'arg' is unused.
+ */
+void timer_thread(void *arg)
+{
+       uint8_t vector;
+       uint32_t initial_count;
        while (1) {
-               set_posted_interrupt(0xef);
-               pwrite(fd, &vmctl, sizeof(vmctl), 1<<12);
-               uthread_usleep(1);
+               vector = ((uint32_t *)gpci.vapic_addr)[0x32] & 0xff;
+               initial_count = ((uint32_t *)gpci.vapic_addr)[0x38];
+               /* Only poke the guest once it has programmed both fields. */
+               if (vector && initial_count)
+                       vmm_interrupt_guest(vm, 0, vector);
+               uthread_usleep(100000);
        }
 }
 
-void *consout(void *arg)
-{
-       char *line, *consline, *outline;
-       static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
-       static struct scatterlist in[] = { {NULL, sizeof(line)}, };
-       static struct scatterlist iov[32];
-       struct virtio_threadarg *a = arg;
-       static unsigned int inlen, outlen, conslen;
-       struct virtqueue *v = a->arg->virtio;
-       fprintf(stderr, "talk thread ..\n");
-       uint16_t head, gaveit = 0, gotitback = 0;
-       uint32_t vv;
-       int i;
-       int num;
-       if (debug) {
-               fprintf(stderr, "----------------------- TT a %p\n", a);
-               fprintf(stderr, "talk thread ttargs %x v %x\n", a, v);
-       }
-       
-       for(num = 0;;num++) {
-               //int debug = 1;
-               /* host: use any buffers we should have been sent. */
-               head = wait_for_vq_desc(v, iov, &outlen, &inlen);
-               if (debug)
-                       fprintf(stderr, "CCC: vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
-               for(i = 0; debug && i < outlen + inlen; i++)
-                       fprintf(stderr, "CCC: v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
-               /* host: if we got an output buffer, just output it. */
-               for(i = 0; i < outlen; i++) {
-                       num++;
-                       int j;
-                       if (debug) {
-                               fprintf(stderr, "CCC: IOV length is %d\n", iov[i].length);
-                       }
-                       for (j = 0; j < iov[i].length; j++)
-                               printf("%c", ((char *)iov[i].v)[j]);
-               }
-               fflush(stdout);
-               if (debug)
-                       fprintf(stderr, "CCC: outlen is %d; inlen is %d\n", outlen, inlen);
-               /* host: fill in the writeable buffers. */
-               /* why we're getting these I don't know. */
-               for (i = outlen; i < outlen + inlen; i++) {
-                       if (debug) fprintf(stderr, "CCC: send back empty writeable");
-                       iov[i].length = 0;
-               }
-               if (debug) fprintf(stderr, "CCC: call add_used\n");
-               /* host: now ack that we used them all. */
-               add_used(v, head, outlen+inlen);
-               if (debug) fprintf(stderr, "CCC: DONE call add_used\n");
-       }
-       fprintf(stderr, "All done\n");
-       return NULL;
-}
 
-// FIXME. 
+// FIXME.
 volatile int consdata = 0;
 
-void *consin(void *arg)
+/* TODO: pass a core id to poke_guest */
+static void virtio_poke_guest(uint8_t vec)
 {
-       struct virtio_threadarg *a = arg;
-       char *line, *outline;
-       static char consline[128];
-       static struct scatterlist iov[32];
-       static struct scatterlist out[] = { {NULL, sizeof(outline)}, };
-       static struct scatterlist in[] = { {NULL, sizeof(line)}, };
-
-       static unsigned int inlen, outlen, conslen;
-       struct virtqueue *v = a->arg->virtio;
-       fprintf(stderr, "consin thread ..\n");
-       uint16_t head, gaveit = 0, gotitback = 0;
-       uint32_t vv;
-       int i;
-       int num;
-       //char c[1];
-       int timer_started = 0;
-       pthread_t timerthread_struct;
-
-       int fd = open("#cons/vmctl", O_RDWR), ret;
-       
-       if (debug) fprintf(stderr, "Spin on console being read, print num queues, halt\n");
-
-       for(num = 0;! quit;num++) {
-               //int debug = 1;
-               /* host: use any buffers we should have been sent. */
-               head = wait_for_vq_desc(v, iov, &outlen, &inlen);
-               if (debug)
-                       fprintf(stderr, "vq desc head %d, gaveit %d gotitback %d\n", head, gaveit, gotitback);
-               for(i = 0; debug && i < outlen + inlen; i++)
-                       fprintf(stderr, "v[%d/%d] v %p len %d\n", i, outlen + inlen, iov[i].v, iov[i].length);
-               if (debug)
-                       fprintf(stderr, "outlen is %d; inlen is %d\n", outlen, inlen);
-               /* host: fill in the writeable buffers. */
-               for (i = outlen; i < outlen + inlen; i++) {
-                       /* host: read a line. */
-                       memset(consline, 0, 128);
-                       if (read(0, consline, 1) < 0) {
-                               exit(0);
-                       } 
-                       if (debug) fprintf(stderr, "CONSIN: GOT A LINE:%s:\n", consline);
-                       if (debug) fprintf(stderr, "CONSIN: OUTLEN:%d:\n", outlen);
-                       if (strlen(consline) < 3 && consline[0] == 'q' ) {
-                               quit = 1;
-                               break;
-                       }
+       vmm_interrupt_guest(vm, 0, vec);
+}
 
-                       memmove(iov[i].v, consline, strlen(consline)+ 1);
-                       iov[i].length = strlen(consline) + 1;
-               }
-               if (debug) fprintf(stderr, "call add_used\n");
-               /* host: now ack that we used them all. */
-               add_used(v, head, outlen+inlen);
-               consdata = 1;
-               if (debug) fprintf(stderr, "DONE call add_used\n");
-
-               // Send spurious for testing (Gan)
-               set_posted_interrupt(0xE5);
-               virtio_mmio_set_vring_irq();
-
-               pwrite(fd, &vmctl, sizeof(vmctl), 1<<12);
-               if (!timer_started && mcp) {
-                       /* Start up timer thread */
-                       if (pthread_create(&timerthread_struct, NULL, timer_thread, NULL)) {
-                               fprintf(stderr, "pth_create failed for timer thread.");
-                               perror("pth_create");
-                       } else {
-                               timer_started = 1;
-                       }
-               }
+static struct virtio_mmio_dev cons_mmio_dev = {
+       .poke_guest = virtio_poke_guest,
+};
+
+static struct virtio_console_config cons_cfg;
+static struct virtio_console_config cons_cfg_d;
+
+static struct virtio_vq_dev cons_vqdev = { /* virtio console device, two queues */
+       .name = "console",
+       .dev_id = VIRTIO_ID_CONSOLE,
+       .dev_feat =
+       (1ULL << VIRTIO_F_VERSION_1) | (1 << VIRTIO_RING_F_INDIRECT_DESC),
+       .num_vqs = 2, /* cons_receiveq and cons_transmitq, listed in .vqs */
+       .cfg = &cons_cfg,
+       .cfg_d = &cons_cfg_d,
+       .cfg_sz = sizeof(struct virtio_console_config),
+       .transport_dev = &cons_mmio_dev, /* its poke_guest is virtio_poke_guest */
+       .vqs = {
+               {
+                       .name = "cons_receiveq",
+                       .qnum_max = 64,
+                       .srv_fn = cons_receiveq_fn,
+                       .vqdev = &cons_vqdev /* back-pointer to this device */
+               },
+               {
+                       .name = "cons_transmitq",
+                       .qnum_max = 64,
+                       .srv_fn = cons_transmitq_fn,
+                       .vqdev = &cons_vqdev /* back-pointer to this device */
+               },
        }
-       fprintf(stderr, "All done\n");
-       return NULL;
-}
+};
+
+static struct virtio_mmio_dev net_mmio_dev = {
+       .poke_guest = virtio_poke_guest,
+};
+
+static struct virtio_net_config net_cfg = {
+       .max_virtqueue_pairs = 1
+};
+static struct virtio_net_config net_cfg_d = {
+       .max_virtqueue_pairs = 1
+};
+
+static struct virtio_vq_dev net_vqdev = { /* virtio network device, two queues */
+       .name = "network",
+       .dev_id = VIRTIO_ID_NET,
+       .dev_feat = (1ULL << VIRTIO_F_VERSION_1 | 1 << VIRTIO_NET_F_MAC),
+
+       .num_vqs = 2, /* net_receiveq and net_transmitq, listed in .vqs */
+       .cfg = &net_cfg,
+       .cfg_d = &net_cfg_d,
+       .cfg_sz = sizeof(struct virtio_net_config),
+       .transport_dev = &net_mmio_dev, /* its poke_guest is virtio_poke_guest */
+       .vqs = {
+               {
+                       .name = "net_receiveq",
+                       .qnum_max = 64,
+                       .srv_fn = net_receiveq_fn,
+                       .vqdev = &net_vqdev /* back-pointer to this device */
+               },
+               {
+                       .name = "net_transmitq",
+                       .qnum_max = 64,
+                       .srv_fn = net_transmitq_fn,
+                       .vqdev = &net_vqdev /* back-pointer to this device */
+               },
+       }
+};
+
+static struct virtio_mmio_dev blk_mmio_dev = {
+       .poke_guest = virtio_poke_guest,
+};
+
+static struct virtio_blk_config blk_cfg = {
+};
+
+static struct virtio_blk_config blk_cfg_d = {
+};
 
-static struct vqdev vqdev= {
-name: "console",
-dev: VIRTIO_ID_CONSOLE,
-device_features: 0, /* Can't do it: linux console device does not support it. VIRTIO_F_VERSION_1*/
-numvqs: 2,
-vqs: {
-               {name: "consin", maxqnum: 64, f: consin, arg: (void *)0},
-               {name: "consout", maxqnum: 64, f: consout, arg: (void *)0},
+static struct virtio_vq_dev blk_vqdev = { /* virtio block device, one queue */
+       .name = "block",
+       .dev_id = VIRTIO_ID_BLOCK,
+       .dev_feat = (1ULL << VIRTIO_F_VERSION_1),
+
+       .num_vqs = 1, /* only blk_request, listed in .vqs */
+       .cfg = &blk_cfg,
+       .cfg_d = &blk_cfg_d,
+       .cfg_sz = sizeof(struct virtio_blk_config),
+       .transport_dev = &blk_mmio_dev, /* its poke_guest is virtio_poke_guest */
+       .vqs = {
+               {
+                       .name = "blk_request",
+                       .qnum_max = 64,
+                       .srv_fn = blk_request,
+                       .vqdev = &blk_vqdev /* back-pointer to this device */
+               },
        }
 };
 
@@ -345,8 +307,8 @@ static uint8_t acpi_tb_checksum(uint8_t *buffer, uint32_t length)
 static void gencsum(uint8_t *target, void *data, int len)
 {
        uint8_t csum;
-       // blast target to zero so it does not get counted (it might be in the struct we checksum) 
-       // And, yes, it is, goodness.
+       // blast target to zero so it does not get counted
+       // (it might be in the struct we checksum) And, yes, it is, goodness.
        fprintf(stderr, "gencsum %p target %p source %d bytes\n", target, data, len);
        *target = 0;
        csum  = acpi_tb_checksum((uint8_t *)data, len);
@@ -359,91 +321,214 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
        int oldbit;
 
        asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
-                    "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+                    "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
 
        return oldbit;
 }
 
-static void pir_dump()
+/* load_kernel loads an ELF file as a kernel.
+ *
+ * Each PT_LOAD program header that has at least one of PF_R, PF_W or PF_X set
+ * is read from the file straight to the address in its p_paddr field.
+ * *kernstart is lowered and *kernend is raised as needed so that they bracket
+ * [p_paddr, p_paddr + p_memsz) of every segment visited; the caller seeds
+ * both before calling.
+ *
+ * Returns the ELF entry point (e_entry), or 0 if the file cannot be opened,
+ * parsed, or fully read.
+ */
+uintptr_t
+load_kernel(char *filename, uintptr_t *kernstart, uintptr_t *kernend)
+{
+       Elf64_Ehdr *ehdr;
+       Elf *elf;
+       size_t phnum = 0;
+       Elf64_Phdr *hdrs;
+       uintptr_t entry;
+       int fd;
+
+       elf_version(EV_CURRENT);
+       fd = open(filename, O_RDONLY);
+       if (fd < 0) {
+               fprintf(stderr, "Can't open %s: %r\n", filename);
+               return 0;
+       }
+
+       elf = elf_begin(fd, ELF_C_READ, NULL);
+       if (elf == NULL) {
+               fprintf(stderr, "%s: cannot read %s ELF file.\n", __func__, filename);
+               close(fd);
+               return 0;
+       }
+
+       ehdr = elf64_getehdr(elf);
+       if (ehdr == NULL) {
+               fprintf(stderr, "%s: cannot get exec header of %s.\n",
+                       __func__, filename);
+               goto fail;
+       }
+       fprintf(stderr, "%s ELF entry point is %p\n", filename,
+               (void *)ehdr->e_entry);
+
+       if (elf_getphdrnum(elf, &phnum) < 0) {
+               fprintf(stderr, "%s: cannot get program header num of %s.\n",
+                       __func__, filename);
+               goto fail;
+       }
+       fprintf(stderr, "%s has %zu program headers\n", filename, phnum);
+
+       hdrs = elf64_getphdr(elf);
+       if (hdrs == NULL) {
+               fprintf(stderr, "%s: cannot get program headers of %s.\n",
+                       __func__, filename);
+               goto fail;
+       }
+
+       for (size_t i = 0; i < phnum; i++) {
+               size_t tot;
+               Elf64_Phdr *h = &hdrs[i];
+               uintptr_t pa;
+
+               /* p_type and p_flags are 32-bit fields; widen them to match %lx. */
+               fprintf(stderr,
+                       "%zu: type 0x%lx flags 0x%lx  offset 0x%lx vaddr 0x%lx paddr 0x%lx size 0x%lx  memsz 0x%lx align 0x%lx\n",
+                       i,
+                       (unsigned long)h->p_type,       /* Segment type */
+                       (unsigned long)h->p_flags,      /* Segment flags */
+                       h->p_offset,            /* Segment file offset */
+                       h->p_vaddr,             /* Segment virtual address */
+                       h->p_paddr,             /* Segment physical address */
+                       h->p_filesz,            /* Segment size in file */
+                       h->p_memsz,             /* Segment size in memory */
+                       h->p_align              /* Segment alignment */);
+               if (h->p_type != PT_LOAD)
+                       continue;
+               if ((h->p_flags & (PF_R | PF_W | PF_X)) == 0)
+                       continue;
+
+               pa = h->p_paddr;
+               if (*kernstart > pa)
+                       *kernstart = pa;
+               if (*kernend < pa + h->p_memsz)
+                       *kernend = pa + h->p_memsz;
+               fprintf(stderr,
+                       "Read header %zu @offset %p to %p (elf PA is %p) %zu bytes:",
+                       i, (void *)h->p_offset, (void *)pa, (void *)h->p_paddr,
+                       (size_t)h->p_filesz);
+               /* pread may return short counts; keep going until the whole
+                * segment is in, or until EOF/error. */
+               tot = 0;
+               while (tot < h->p_filesz) {
+                       ssize_t amt = pread(fd, (void *)(pa + tot),
+                                           h->p_filesz - tot,
+                                           h->p_offset + tot);
+
+                       if (amt < 1)
+                               break;
+                       tot += amt;
+               }
+               fprintf(stderr, "read a total of %zu bytes\n", tot);
+               if (tot < h->p_filesz) {
+                       fprintf(stderr, "%s: got %zu bytes, wanted %zu bytes\n",
+                               filename, tot, (size_t)h->p_filesz);
+                       goto fail;
+               }
+       }
+
+       /* ehdr points into memory owned by the Elf descriptor: copy the entry
+        * point out before elf_end() releases it. */
+       entry = ehdr->e_entry;
+       elf_end(elf);
+       close(fd);
+       return entry;
+fail:
+       elf_end(elf);
+       close(fd);
+       return 0;
+}
+
+/* TODO: put this in a library somewhere
+ *
+ * cat copies the contents of 'file' into memory starting at 'where', issuing
+ * read() calls of up to 4096 bytes until read() returns 0.  Returns the total
+ * number of bytes copied, or -1 if the open or any read fails.
+ * NOTE(review): nothing here bounds the copy; the caller must guarantee that
+ * 'where' has room for the whole file. */
+int cat(char *file, char *where)
 {
-       unsigned long *pir_ptr = (unsigned long *)vmctl.pir;
-       int i;
-       fprintf(stderr, "-------Begin PIR dump-------\n");
-       for (i = 0; i < 8; i++){
-               fprintf(stderr, "Byte %d: 0x%016x\n", i, pir_ptr[i]);
+       int fd;
+       int amt, tot = 0;
+
+       fd = open(file, O_RDONLY);
+       if (fd < 0)
+               return -1;
+
+       while (amt = read(fd, where, 4096)) {
+               if (amt < 0) {
+                       close(fd);
+                       return -1;
+               }
+               tot += amt;
+               where += amt;
        }
-       fprintf(stderr, "-------End PIR dump-------\n");
+       close(fd);
+       return tot;
 }
 
-static void set_posted_interrupt(int vector)
+int smbios(char *smbiostable, void *esegment)
 {
-       unsigned long *bit_vec;
-       int bit_offset;
-       int i, j;
-       unsigned long *pir = (unsigned long *)vmctl.pir;
-       // Move to the correct location to set our bit.
-       bit_vec = pir + vector/(sizeof(unsigned long)*8);
-       bit_offset = vector%(sizeof(unsigned long)*8);
-       if(debug) fprintf(stderr, "%s: Pre set PIR dump\n", __func__);
-       if(debug) pir_dump();
-       if(debug) vapic_status_dump(stderr, (void *)vmctl.vapic);
-       if(debug) fprintf(stderr, "%s: Setting pir bit offset %d at 0x%p\n", __func__,
-                       bit_offset, bit_vec);
-       test_and_set_bit(bit_offset, bit_vec);
-
-       // Set outstanding notification bit
-       /*bit_vec = pir + 4;
-       fprintf(stderr, "%s: Setting pir bit offset 0 at 0x%p", __func__,
-                       bit_vec);
-       test_and_set_bit(0, bit_vec);*/
-
-       if(debug) pir_dump();
+       int amt;
+
+       amt = cat(smbiostable, esegment);
+       if (amt < 0) {
+               fprintf(stderr, "%s: %r\n", smbiostable);
+               exit(1);
+       }
+
+       return amt;
 }
 
 int main(int argc, char **argv)
 {
-       uint64_t *p64;
+       struct boot_params *bp;
+       char cmdline_default[512] = {0};
+       char *cmdline_extra = "\0";
+       char *cmdline;
        void *a = (void *)0xe0000;
        struct acpi_table_rsdp *r;
        struct acpi_table_fadt *f;
        struct acpi_table_madt *m;
        struct acpi_table_xsdt *x;
-       uint64_t virtiobase = 0x100000000ULL;
-       // lowmem is a bump allocated pointer to 2M at the "physbase" of memory 
-       void *lowmem = (void *) 0x1000000;
-       //struct vmctl vmctl;
-       int amt;
        int vmmflags = 0; // Disabled probably forever. VMM_VMCALL_PRINTF;
-       uint64_t entry = 0x1200000, kerneladdress = 0x1200000;
-       int nr_gpcs = 1;
-       int fd = open("#cons/vmctl", O_RDWR), ret;
-       void * xp;
-       int kfd = -1;
-       static char cmd[512];
-       int i;
+       uint64_t entry = 0;
+       int ret;
+       uintptr_t size;
        uint8_t csum;
-       void *coreboot_tables = (void *) 0x1165000;
        void *a_page;
-fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT, PML1_PTE_REACH);
-
-       // mmap is not working for us at present.
-       if ((uint64_t)_kernel > GKERNBASE) {
-               fprintf(stderr, "kernel array @%p is above , GKERNBASE@%p sucks\n", _kernel, GKERNBASE);
+       struct vm_trapframe *vm_tf;
+       uint64_t tsc_freq_khz;
+       char *cmdlinep;
+       int cmdlinesz, len, cmdline_fd;
+       char *disk_image_file = NULL;
+       int c;
+       struct stat stat_result;
+       int num_read;
+       int option_index;
+       uintptr_t kernstart = (uintptr_t)~1, kernend = 0;
+       char *smbiostable = NULL;
+
+       static struct option long_options[] = {
+               {"debug",         no_argument,       0, 'd'},
+               {"vmm_vmcall",    no_argument,       0, 'v'},
+               {"maxresume",     required_argument, 0, 'R'},
+               {"memsize",       required_argument, 0, 'm'},
+               {"memstart",      required_argument, 0, 'M'},
+               {"stack",         required_argument, 0, 'S'},
+               {"cmdline_extra", required_argument, 0, 'c'},
+               {"greedy",        no_argument,       0, 'g'},
+               {"scp",           no_argument,       0, 's'},
+               {"image_file",    required_argument, 0, 'f'},
+               {"cmdline",       required_argument, 0, 'k'},
+               {"nic",           required_argument, 0, 'n'},
+               {"smbiostable",   required_argument, 0, 't'},
+               {"help",          no_argument,       0, 'h'},
+               {0, 0, 0, 0}
+       };
+
+       fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT,
+               PML1_PTE_REACH);
+
+       if ((uintptr_t)__procinfo.program_end >= MinMemory) {
+               fprintf(stderr,
+                       "Panic: vmrunkernel binary extends into guest memory\n");
                exit(1);
        }
-       memset(_kernel, 0, sizeof(_kernel));
-       memset(lowmem, 0xff, 2*1048576);
-       memset(low4k, 0xff, 4096);
-       // avoid at all costs, requires too much instruction emulation.
-       //low4k[0x40e] = 0;
-       //low4k[0x40f] = 0xe0;
+
+       vm->low4k = malloc(PGSIZE);
+       memset(vm->low4k, 0xff, PGSIZE);
+       vm->low4k[0x40e] = 0;
+       vm->low4k[0x40f] = 0;
 
        //Place mmap(Gan)
        a_page = mmap((void *)0xfee00000, PGSIZE, PROT_READ | PROT_WRITE,
-                             MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
-       fprintf(stderr, "a_page mmap pointer %p", a_page);
+                     MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
+       fprintf(stderr, "a_page mmap pointer %p\n", a_page);
 
-       if (a_page == (void *) -1) {
+       if (a_page != (void *)0xfee00000) {
                perror("Could not mmap APIC");
                exit(1);
        }
@@ -452,22 +537,12 @@ fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT, PML1_PTE_REACH);
                exit(1);
        }
 
-       memset(a_page, 0, 4096);
-       //((uint32_t *)a_page)[0x30/4] = 0x01060015;
-       ((uint32_t *)a_page)[0x30/4] = 0xDEADBEEF;
-
+       ((uint32_t *)a_page)[0x30/4] = 0x01060015;
+       //((uint32_t *)a_page)[0x30/4] = 0xDEADBEEF;
 
-       if (fd < 0) {
-               perror("#cons/sysctl");
-               exit(1);
-       }
-       argc--,argv++;
-       // switches ...
-       // Sorry, I don't much like the gnu opt parsing code.
-       while (1) {
-               if (*argv[0] != '-')
-                       break;
-               switch(argv[0][1]) {
+       while ((c = getopt_long(argc, argv, "dvm:M:S:c:gsf:k:n:t:hR:",
+                               long_options, &option_index)) != -1) {
+               switch (c) {
                case 'd':
                        debug++;
                        break;
@@ -475,64 +550,118 @@ fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT, PML1_PTE_REACH);
                        vmmflags |= VMM_VMCALL_PRINTF;
                        break;
                case 'm':
-                       argc--,argv++;
-                       maxresume = strtoull(argv[0], 0, 0);
+                       memsize = strtoull(optarg, 0, 0);
                        break;
-               case 'i':
-                       argc--,argv++;
-                       virtioirq = strtoull(argv[0], 0, 0);
+               case 'M':
+                       memstart = strtoull(optarg, 0, 0);
                        break;
-               default:
-                       fprintf(stderr, "BMAFR\n");
+               case 'S':
+                       stack = strtoull(optarg, 0, 0);
+                       break;
+               case 'R':
+                       maxresume = strtoull(optarg, 0, 0);
+                       break;
+               case 'c':
+                       cmdline_extra = optarg;
+               case 'g':       /* greedy */
+                       parlib_never_yield = TRUE;
+                       break;
+               case 's':       /* scp */
+                       parlib_wants_to_be_mcp = FALSE;
+                       break;
+               case 'f':       /* file to pass to blk_init */
+                       disk_image_file = optarg;
+                       break;
+               case 'k':       /* specify file to get cmdline args from */
+                       cmdline_fd = open(optarg, O_RDONLY);
+                       if (cmdline_fd < 0) {
+                               fprintf(stderr, "failed to open file: %s\n", optarg);
+                               exit(1);
+                       }
+                       if (stat(optarg, &stat_result) == -1) {
+                               fprintf(stderr, "stat of %s failed\n", optarg);
+                               exit(1);
+                       }
+                       len = stat_result.st_size;
+                       if (len > 512) {
+                               fprintf(stderr, "command line options exceed 512 bytes!");
+                               exit(1);
+                       }
+                       num_read = read(cmdline_fd, cmdline_default, len);
+                       if (num_read != len) {
+                               fprintf(stderr, "read failed len was : %d, num_read was: %d\n",
+                                       len, num_read);
+                               exit(1);
+                       }
+                       close(cmdline_fd);
                        break;
+               case 't':
+                       smbiostable = optarg;
+                       break;
+               case 'n':
+                       default_nic = strtoull(optarg, 0, 0);
+                       break;
+               case 'h':
+               default:
+                       // Sadly, the getopt_long struct does
+                       // not have a pointer to help text.
+                       for (int i = 0;
+                            i < sizeof(long_options)/sizeof(long_options[0]) - 1;
+                            i++) {
+                               struct option *l = &long_options[i];
+
+                               fprintf(stderr, "%s or %c%s\n", l->name, l->val,
+                                       l->has_arg ? " <arg>" : "");
+                       }
+                       exit(0);
                }
-               argc--,argv++;
        }
+       if (strlen(cmdline_default) == 0) {
+               fprintf(stderr, "WARNING: No command line parameter file specified.\n");
+       }
+       argc -= optind;
+       argv += optind;
        if (argc < 1) {
-               fprintf(stderr, "Usage: %s vmimage [-n (no vmcall printf)] [coreboot_tables [loadaddress [entrypoint]]]\n", argv[0]);
+               fprintf(stderr, "Usage: %s vmimage [-n (no vmcall printf)]\n", argv[0]);
                exit(1);
        }
-       if (argc > 1)
-               coreboot_tables = (void *) strtoull(argv[1], 0, 0);
-       if (argc > 2)
-               kerneladdress = strtoull(argv[2], 0, 0);
-       if (argc > 3)
-               entry = strtoull(argv[3], 0, 0);
-       kfd = open(argv[0], O_RDONLY);
-       if (kfd < 0) {
-               perror(argv[0]);
+
+       if ((uintptr_t)(memstart + memsize) >= (uintptr_t)BRK_START) {
+               fprintf(stderr,
+                       "memstart 0x%lx memsize 0x%lx -> 0x%lx is too large; overlaps BRK_START at %p\n",
+                       memstart, memsize, memstart + memsize, BRK_START);
                exit(1);
        }
-       // read in the kernel.
-       xp = (void *)kerneladdress;
-       for(;;) {
-               amt = read(kfd, xp, 1048576);
-               if (amt < 0) {
-                       perror("read");
-                       exit(1);
-               }
-               if (amt == 0) {
-                       break;
-               }
-               xp += amt;
+
+       kernel = mmap((void *)memstart, memsize,
+                     PROT_READ | PROT_WRITE | PROT_EXEC,
+                     MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
+       if (kernel != (void *)memstart) {
+               fprintf(stderr, "Could not mmap 0x%lx bytes at 0x%lx\n",
+                       memsize, memstart);
+               exit(1);
+       }
+
+       entry = load_kernel(argv[0], &kernstart, &kernend);
+       if (entry == 0) {
+               fprintf(stderr, "Unable to load kernel %s\n", argv[0]);
+               exit(1);
        }
-       fprintf(stderr, "Read in %d bytes\n", xp-kerneladdress);
-       close(kfd);
+
 
        // The low 1m so we can fill in bullshit like ACPI.
        // And, sorry, due to the STUPID format of the RSDP for now we need the low 1M.
        low1m = mmap((int*)4096, MiB-4096, PROT_READ | PROT_WRITE,
-                        MAP_ANONYMOUS, -1, 0);
+                    MAP_ANONYMOUS, -1, 0);
        if (low1m != (void *)4096) {
                perror("Unable to mmap low 1m");
                exit(1);
        }
-       memset(low1m, 0xff, MiB-4096);
        r = a;
        fprintf(stderr, "install rsdp to %p\n", r);
        *r = rsdp;
        a += sizeof(*r);
-       memmove(&r->xsdt_physical_address, &a, sizeof(a));
+       r->xsdt_physical_address = (uint64_t)a;
        gencsum(&r->checksum, r, ACPI_RSDP_CHECKSUM_LENGTH);
        if ((csum = acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_CHECKSUM_LENGTH)) != 0) {
                fprintf(stderr, "RSDP has bad checksum; summed to %x\n", csum);
@@ -561,12 +690,18 @@ fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT, PML1_PTE_REACH);
        f = a;
        fprintf(stderr, "install fadt to %p\n", f);
        *f = fadt;
-       x->table_offset_entry[2] = (uint64_t) f;
+       x->table_offset_entry[0] = (uint64_t)f; // fadt MUST be first in xsdt!
        a += sizeof(*f);
        f->header.length = a - (void *)f;
+
+       f->Xdsdt = (uint64_t) a;
+       fprintf(stderr, "install dsdt to %p\n", a);
+       memcpy(a, &DSDT_DSDTTBL_Header, 36);
+       a += 36;
+
        gencsum(&f->header.checksum, f, f->header.length);
        if (acpi_tb_checksum((uint8_t *)f, f->header.length) != 0) {
-               fprintf(stderr, "ffadt has bad checksum v2\n");
+               fprintf(stderr, "fadt has bad checksum v2\n");
                exit(1);
        }
 
@@ -579,15 +714,17 @@ fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT, PML1_PTE_REACH);
        a += sizeof(Apic0);
        memmove(a, &Apic1, sizeof(Apic1));
        a += sizeof(Apic1);
+       memmove(a, &X2Apic0, sizeof(X2Apic0));
+       a += sizeof(X2Apic0);
        memmove(a, &isor, sizeof(isor));
        a += sizeof(isor);
        m->header.length = a - (void *)m;
+
        gencsum(&m->header.checksum, m, m->header.length);
        if (acpi_tb_checksum((uint8_t *) m, m->header.length) != 0) {
                fprintf(stderr, "madt has bad checksum v2\n");
                exit(1);
        }
-       fprintf(stderr, "allchecksums ok\n");
 
        gencsum(&x->header.checksum, x, x->header.length);
        if ((csum = acpi_tb_checksum((uint8_t *) x, x->header.length)) != 0) {
@@ -595,297 +732,173 @@ fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT, PML1_PTE_REACH);
                exit(1);
        }
 
+       fprintf(stderr, "allchecksums ok\n");
+
        hexdump(stdout, r, a-(void *)r);
 
        a = (void *)(((unsigned long)a + 0xfff) & ~0xfff);
-       vmctl.pir = (uint64_t) a;
+       gpci.posted_irq_desc = a;
        memset(a, 0, 4096);
        a += 4096;
-       vmctl.vapic = (uint64_t) a;
-       //vmctl.vapic = (uint64_t) a_page;      
+       gpci.vapic_addr = a;
        memset(a, 0, 4096);
        ((uint32_t *)a)[0x30/4] = 0x01060014;
-       p64 = a;
        // set up apic values? do we need to?
        // qemu does this.
        //((uint8_t *)a)[4] = 1;
        a += 4096;
+       gpci.apic_addr = (void*)0xfee00000;
 
-       if (ros_syscall(SYS_setup_vmm, nr_gpcs, vmmflags, 0, 0, 0, 0) != nr_gpcs) {
-               perror("Guest pcore setup failed");
-               exit(1);
+       /* Allocate memory for, and zero the bootparams
+        * page before writing to it, or Linux thinks
+        * we're talking crazy.
+        */
+       a += 4096;
+       bp = a;
+       memset(bp, 0, 4096);
+
+       /* Put the e820 memory region information in the boot_params */
+       bp->e820_entries = 5;
+       int e820i = 0;
+
+       /* Give it just a tiny bit of memory -- 60k -- at low memory. */
+       bp->e820_map[e820i].addr = 0;
+       bp->e820_map[e820i].size = 4 * 1024;
+       bp->e820_map[e820i++].type = E820_RESERVED;
+
+       bp->e820_map[e820i].addr = 4 * 1024;
+       bp->e820_map[e820i].size = 64 * 1024 - 4 * 1024;
+       bp->e820_map[e820i++].type = E820_RAM;
+
+       bp->e820_map[e820i].addr = 64 * 1024;
+       bp->e820_map[e820i].size = memstart - 64 * 1024;
+       bp->e820_map[e820i++].type = E820_RESERVED;
+
+       bp->e820_map[e820i].addr = memstart;
+       bp->e820_map[e820i].size = memsize;
+       bp->e820_map[e820i++].type = E820_RAM;
+
+       bp->e820_map[e820i].addr = 0xf0000000;
+       bp->e820_map[e820i].size = 0x10000000;
+       bp->e820_map[e820i++].type = E820_RESERVED;
+
+       /* The MMIO address of the console device is really the address of an
+        * unbacked EPT page: accesses to this page will cause a page fault that
+        * traps to the host, which will examine the fault, see it was for the
+        * known MMIO address, and fulfill the MMIO read or write on the guest's
+        * behalf accordingly. We place the virtio space at the first 512 GiB
+        * boundary at or above the end of the last e820 region to avoid a full
+        * page table walk. */
+       uint64_t virtio_mmio_base_addr = ROUNDUP((bp->e820_map[e820i - 1].addr +
+                                                 bp->e820_map[e820i - 1].size),
+                                                512 * GiB);
+
+       cons_mmio_dev.addr =
+               virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_CONSOLE_DEV;
+       cons_mmio_dev.vqdev = &cons_vqdev;
+       vm->virtio_mmio_devices[VIRTIO_MMIO_CONSOLE_DEV] = &cons_mmio_dev;
+
+       net_mmio_dev.addr =
+               virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_NETWORK_DEV;
+       net_mmio_dev.vqdev = &net_vqdev;
+       vm->virtio_mmio_devices[VIRTIO_MMIO_NETWORK_DEV] = &net_mmio_dev;
+
+       if (disk_image_file != NULL) {
+               blk_mmio_dev.addr =
+                       virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_BLOCK_DEV;
+               blk_mmio_dev.vqdev = &blk_vqdev;
+               vm->virtio_mmio_devices[VIRTIO_MMIO_BLOCK_DEV] = &blk_mmio_dev;
+               blk_init_fn(&blk_vqdev, disk_image_file);
        }
+       net_init_fn(&net_vqdev, default_nic);
 
-       fprintf(stderr, "Run with %d cores and vmmflags 0x%x\n", nr_gpcs, vmmflags);
-       mcp = 1;
-       if (mcp) {
-               my_retvals = malloc(sizeof(void*) * nr_threads);
-               if (!my_retvals)
-                       perror("Init threads/malloc");
-
-               pthread_can_vcore_request(FALSE);       /* 2LS won't manage vcores */
-               pthread_need_tls(FALSE);
-               pthread_mcp_init();                                     /* gives us one vcore */
-               vcore_request(nr_threads - 1);          /* ghetto incremental interface */
-               for (int i = 0; i < nr_threads; i++) {
-                       xp = __procinfo.vcoremap;
-                       fprintf(stderr, "%p\n", __procinfo.vcoremap);
-                       fprintf(stderr, "Vcore %d mapped to pcore %d\n", i,
-                               __procinfo.vcoremap[i].pcoreid);
-               }
+       /* Set the kernel command line parameters */
+       a += 4096;
+       cmdline = a;
+       a += 4096;
+
+       if (smbiostable) {
+               fprintf(stderr, "Using SMBIOS table %s\n", smbiostable);
+               a += smbios(smbiostable, a);
        }
 
-       ret = syscall(33, 1);
-       if (ret < 0) {
-               perror("vm setup");
-               exit(1);
+       bp->hdr.cmd_line_ptr = (uintptr_t) cmdline;
+
+       tsc_freq_khz = get_tsc_freq()/1000;
+       len = snprintf(cmdline, 4096, "%s tscfreq=%lld %s", cmdline_default,
+                      tsc_freq_khz, cmdline_extra);
+
+       cmdlinesz = 4096 - len;
+       cmdlinep = cmdline + len;
+
+       for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
+               if (vm->virtio_mmio_devices[i] == NULL)
+                       continue;
+
+               /* Append all the virtio mmio base addresses. */
+
+               /* Since the lower number irqs are no longer being used, the irqs
+                * can now be assigned starting from 0.
+                */
+               vm->virtio_mmio_devices[i]->irq = i;
+               len = snprintf(cmdlinep, cmdlinesz,
+                              " virtio_mmio.device=1K@0x%llx:%lld",
+                              vm->virtio_mmio_devices[i]->addr,
+                              vm->virtio_mmio_devices[i]->irq);
+               if (len >= cmdlinesz) {
+                       fprintf(stderr, "Too many arguments to the linux command line.");
+                       exit(1);
+               }
+               cmdlinesz -= len;
+               cmdlinep += len;
        }
-       ret = posix_memalign((void **)&p512, 4096, 3*4096);
-       fprintf(stderr, "memalign is %p\n", p512);
+
+       vm->nr_gpcs = 1;
+       vm->gpcis = &gpci;
+       ret = vmm_init(vm, vmmflags);
+       assert(!ret);
+
+       /* Allocate 3 pages for page table pages: a page of 512 GiB
+        * PTEs with only one entry filled to point to a page of 1 GiB
+        * PTEs; a page of 1 GiB PTEs with only one entry filled to
+        * point to a page of 2 MiB PTEs; and a page of 2 MiB PTEs,
+        * only a subset of which will be filled. */
+       ret = posix_memalign((void **)&p512, PGSIZE, 3 * PGSIZE);
        if (ret) {
                perror("ptp alloc");
                exit(1);
        }
-       p1 = &p512[512];
-       p2m = &p512[1024];
-       uint64_t kernbase = 0; //0xffffffff80000000;
-       uint64_t highkernbase = 0xffffffff80000000;
-       p512[PML4(kernbase)] = (unsigned long long)p1 | 7;
-       p1[PML3(kernbase)] = /*0x87; */(unsigned long long)p2m | 7;
-       p512[PML4(highkernbase)] = (unsigned long long)p1 | 7;
-       p1[PML3(highkernbase)] = /*0x87; */(unsigned long long)p2m | 7;
-#define _2MiB (0x200000)
-
-       for (i = 0; i < 512; i++) {
-               p2m[PML2(kernbase + i * _2MiB)] = 0x87 | i * _2MiB;
-       }
-
-       kernbase >>= (0+12);
-       kernbase <<= (0 + 12);
-       uint8_t *kernel = (void *)GKERNBASE;
-       //write_coreboot_table(coreboot_tables, ((void *)VIRTIOBASE) /*kernel*/, KERNSIZE + 1048576);
-       hexdump(stdout, coreboot_tables, 512);
-       fprintf(stderr, "kernbase for pml4 is 0x%llx and entry is %llx\n", kernbase, entry);
-       fprintf(stderr, "p512 %p p512[0] is 0x%lx p1 %p p1[0] is 0x%x\n", p512, p512[0], p1, p1[0]);
-       vmctl.interrupt = 0;
-       vmctl.command = REG_RSP_RIP_CR3;
-       vmctl.cr3 = (uint64_t) p512;
-       vmctl.regs.tf_rip = entry;
-       vmctl.regs.tf_rsp = (uint64_t) &stack[1024];
-       if (mcp) {
-               /* set up virtio bits, which depend on threads being enabled. */
-               register_virtio_mmio(&vqdev, virtio_mmio_base);
+
+       /* Set up a 1:1 ("identity") page mapping from guest virtual
+        * to guest physical using the (host virtual)
+        * `kernstart`. This mapping is used for only a short
+        * time, until the guest sets up its own page tables. Be aware
+        * that the values stored in the table are physical addresses.
+        * This is subtle and mistakes are easily disguised due to the
+        * identity mapping, so take care when manipulating these
+        * mappings. */
+       p1 = &p512[NPTENTRIES];
+       p2m = &p512[2 * NPTENTRIES];
+
+       size = kernend - kernstart;
+       fprintf(stderr, "Map %p for %zu bytes\n", kernstart, size);
+       p512[PML4(kernstart)] = (uint64_t)p1 | PTE_KERN_RW;
+       p1[PML3(kernstart)] = (uint64_t)p2m | PTE_KERN_RW;
+       for (uintptr_t i = 0; i < size; i += PML2_PTE_REACH) {
+               p2m[PML2(kernstart + i)] =
+                       (uint64_t)(kernstart + i) | PTE_KERN_RW | PTE_PS;
        }
-       fprintf(stderr, "threads started\n");
-       fprintf(stderr, "Writing command :%s:\n", cmd);
-       
-       if(debug) vapic_status_dump(stderr, (void *)vmctl.vapic);
 
-       ret = pwrite(fd, &vmctl, sizeof(vmctl), 0);
+       fprintf(stderr, "p512 %p p512[0] is 0x%lx p1 %p p1[0] is 0x%x\n", p512, p512[0], p1, p1[0]);
 
-       if(debug) vapic_status_dump(stderr, (void *)vmctl.vapic);
+       vmm_run_task(vm, timer_thread, 0);
 
-       if (ret != sizeof(vmctl)) {
-               perror(cmd);
-       }
-       while (1) {
-               void showstatus(FILE *f, struct vmctl *v);
-               int c;
-               uint8_t byte;
-               vmctl.command = REG_RIP;
-               if (maxresume-- == 0) {
-                       debug = 1;
-                       resumeprompt = 1;
-               }
-               if (debug) {
-                       fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                       showstatus(stderr, &vmctl);
-               }
-               if (resumeprompt) {
-                       fprintf(stderr, "RESUME?\n");
-                       c = getchar();
-                       if (c == 'q')
-                               break;
-               }
-               if (vmctl.shutdown == SHUTDOWN_EPT_VIOLATION) {
-                       uint64_t gpa, *regp, val;
-                       uint8_t regx;
-                       int store, size;
-                       int advance;
-                       if (decode(&vmctl, &gpa, &regx, &regp, &store, &size, &advance)) {
-                               fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                               showstatus(stderr, &vmctl);
-                               quit = 1;
-                               break;
-                       }
-                       if (debug) fprintf(stderr, "%p %p %p %p %p %p\n", gpa, regx, regp, store, size, advance);
-                       if ((gpa & ~0xfffULL) == virtiobase) {
-                               if (debug) fprintf(stderr, "DO SOME VIRTIO\n");
-                               // Lucky for us the various virtio ops are well-defined.
-                               virtio_mmio(&vmctl, gpa, regx, regp, store);
-                               if (debug) fprintf(stderr, "store is %d:\n", store);
-                               if (debug) fprintf(stderr, "REGP IS %16x:\n", *regp);
-                       } else if ((gpa & 0xfee00000) == 0xfee00000) {
-                               // until we fix our include mess, just put the proto here.
-                               //int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
-                               //apic(&vmctl, gpa, regx, regp, store);
-                       } else if ((gpa & 0xfec00000) == 0xfec00000) {
-                               // until we fix our include mess, just put the proto here.
-                               int do_ioapic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
-                               do_ioapic(&vmctl, gpa, regx, regp, store);
-                       } else if (gpa < 4096) {
-                               uint64_t val = 0;
-                               memmove(&val, &low4k[gpa], size);
-                               hexdump(stdout, &low4k[gpa], size);
-                               fprintf(stderr, "Low 1m, code %p read @ %p, size %d, val %p\n", vmctl.regs.tf_rip, gpa, size, val);
-                               memmove(regp, &low4k[gpa], size);
-                               hexdump(stdout, regp, size);
-                       } else {
-                               fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
-                               fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                               fprintf(stderr, "Returning 0xffffffff\n");
-                               showstatus(stderr, &vmctl);
-                               // Just fill the whole register for now.
-                               *regp = (uint64_t) -1;
-                       }
-                       vmctl.regs.tf_rip += advance;
-                       if (debug) fprintf(stderr, "Advance rip by %d bytes to %p\n", advance, vmctl.regs.tf_rip);
-                       vmctl.shutdown = 0;
-                       vmctl.gpa = 0;
-                       vmctl.command = REG_ALL;
-               } else if (vmctl.shutdown == SHUTDOWN_UNHANDLED_EXIT_REASON) {
-                       switch(vmctl.ret_code){
-                       case  EXIT_REASON_VMCALL:
-                               byte = vmctl.regs.tf_rdi;
-                               printf("%c", byte);
-                               if (byte == '\n') printf("%c", '%');
-                               vmctl.regs.tf_rip += 3;
-                               break;
-                       case EXIT_REASON_EXTERNAL_INTERRUPT:
-                               //debug = 1;
-                               if (debug) fprintf(stderr, "XINT 0x%x 0x%x\n", vmctl.intrinfo1, vmctl.intrinfo2);
-                               if (debug) pir_dump();
-                               vmctl.command = RESUME;
-                               break;
-                       case EXIT_REASON_IO_INSTRUCTION:
-                               fprintf(stderr, "IO @ %p\n", vmctl.regs.tf_rip);
-                               io(&vmctl);
-                               vmctl.shutdown = 0;
-                               vmctl.gpa = 0;
-                               vmctl.command = REG_ALL;
-                               break;
-                       case EXIT_REASON_INTERRUPT_WINDOW:
-                               if (consdata) {
-                                       if (debug) fprintf(stderr, "inject an interrupt\n");
-                                       virtio_mmio_set_vring_irq();
-                                       vmctl.interrupt = 0x80000000 | virtioirq;
-                                       vmctl.command = RESUME;
-                                       consdata = 0;
-                               }
-                               break;
-                       case EXIT_REASON_MSR_WRITE:
-                       case EXIT_REASON_MSR_READ:
-                               fprintf(stderr, "Do an msr\n");
-                               quit = msrio(&vmctl, vmctl.ret_code);
-                               if (quit) {
-                                       fprintf(stderr, "MSR FAILED: RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                                       showstatus(stderr, &vmctl);
-                               }
-                               break;
-                       case EXIT_REASON_MWAIT_INSTRUCTION:
-                         fflush(stdout);
-                               if (debug)fprintf(stderr, "\n================== Guest MWAIT. =======================\n");
-                               if (debug)fprintf(stderr, "Wait for cons data\n");
-                               while (!consdata)
-                                       ;
-                               //debug = 1;
-                               if(debug) vapic_status_dump(stderr, (void *)vmctl.vapic);
-                               if (debug)fprintf(stderr, "Resume with consdata ...\n");
-                               vmctl.regs.tf_rip += 3;
-                               ret = pwrite(fd, &vmctl, sizeof(vmctl), 0);
-                               if (ret != sizeof(vmctl)) {
-                                       perror(cmd);
-                               }
-                               //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                               //showstatus(stderr, &vmctl);
-                               break;
-                       case EXIT_REASON_HLT:
-                               fflush(stdout);
-                               if (debug)fprintf(stderr, "\n================== Guest halted. =======================\n");
-                               if (debug)fprintf(stderr, "Wait for cons data\n");
-                               while (!consdata)
-                                       ;
-                               //debug = 1;
-                               if (debug)fprintf(stderr, "Resume with consdata ...\n");
-                               vmctl.regs.tf_rip += 1;
-                               ret = pwrite(fd, &vmctl, sizeof(vmctl), 0);
-                               if (ret != sizeof(vmctl)) {
-                                       perror(cmd);
-                               }
-                               //fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                               //showstatus(stderr, &vmctl);
-                               break;
-                       case EXIT_REASON_APIC_ACCESS:                           
-                               if (1 || debug)fprintf(stderr, "APIC READ EXIT\n");
-                               
-                               uint64_t gpa, *regp, val;
-                               uint8_t regx;
-                               int store, size;
-                               int advance;
-                               if (decode(&vmctl, &gpa, &regx, &regp, &store, &size, &advance)) {
-                                       fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                                       showstatus(stderr, &vmctl);
-                                       quit = 1;
-                                       break;
-                               }
-
-                               int apic(struct vmctl *v, uint64_t gpa, int destreg, uint64_t *regp, int store);
-                               apic(&vmctl, gpa, regx, regp, store);
-                               vmctl.regs.tf_rip += advance;
-                               if (debug) fprintf(stderr, "Advance rip by %d bytes to %p\n", advance, vmctl.regs.tf_rip);
-                               vmctl.shutdown = 0;
-                               vmctl.gpa = 0;
-                               vmctl.command = REG_ALL;
-                               break;
-                       case EXIT_REASON_APIC_WRITE:
-                               if (1 || debug)fprintf(stderr, "APIC WRITE EXIT\n");
-                               break;
-                       default:
-                               fprintf(stderr, "Don't know how to handle exit %d\n", vmctl.ret_code);
-                               fprintf(stderr, "RIP %p, shutdown 0x%x\n", vmctl.regs.tf_rip, vmctl.shutdown);
-                               showstatus(stderr, &vmctl);
-                               quit = 1;
-                               break;
-                       }
-               }
-               if (debug) fprintf(stderr, "at bottom of switch, quit is %d\n", quit);
-               if (quit)
-                       break;
-               if (consdata) {
-                       if (debug) fprintf(stderr, "inject an interrupt\n");
-                       if (debug) fprintf(stderr, "XINT 0x%x 0x%x\n", vmctl.intrinfo1, vmctl.intrinfo2);
-                       vmctl.interrupt = 0x80000000 | virtioirq;
-                       virtio_mmio_set_vring_irq();
-                       consdata = 0;
-                       //debug = 1;
-                       vmctl.command = RESUME;
-               }
-               if (debug) fprintf(stderr, "NOW DO A RESUME\n");
-               ret = pwrite(fd, &vmctl, sizeof(vmctl), 0);
-               if (ret != sizeof(vmctl)) {
-                       perror(cmd);
-               }
-       }
-
-       /* later. 
-       for (int i = 0; i < nr_threads-1; i++) {
-               int ret;
-               if (pthread_join(my_threads[i], &my_retvals[i]))
-                       perror("pth_join failed");
-               fprintf(stderr, "%d %d\n", i, ret);
-       }
- */
+       vm_tf = gth_to_vmtf(vm->gths[0]);
+       vm_tf->tf_cr3 = (uint64_t) p512;
+       vm_tf->tf_rip = entry;
+       vm_tf->tf_rsp = stack;
+       vm_tf->tf_rsi = (uint64_t) bp;
+       start_guest_thread(vm->gths[0]);
 
-       fflush(stdout);
-       exit(0);
+       uthread_sleep_forever();
+       return 0;
 }