VMM: perform per-guest-pcore init at setup (XCC)
author Barret Rhoden <brho@cs.berkeley.edu>
Mon, 25 Jan 2016 19:23:15 +0000 (14:23 -0500)
committer Barret Rhoden <brho@cs.berkeley.edu>
Tue, 2 Feb 2016 22:43:52 +0000 (17:43 -0500)
The EOI bitmap, PIR, VAPIC, and APIC pages all need to be set up per
VMCS/vcpu/guest pcore.  We had been doing this during REG_RSP_RIP_CR3
handling, which was shorthand for "first time through."  Now, we do it while
initializing the entire proc VMM.

Most of the changes involve passing the guest pcore init info down through
the syscall and init paths.  I also made a helper for setting up the VMCS
pages, since the PIR, VAPIC, and APIC all follow the same pattern.  That
helper is also able to fail, which wasn't considered before.

Reinstall your kernel headers (vmctl changed).

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
kern/arch/riscv/ros/vmm.h
kern/arch/x86/ros/vmm.h
kern/arch/x86/vmm/intel/vmx.c
kern/arch/x86/vmm/vmm.c
kern/arch/x86/vmm/vmm.h
kern/include/ros/vmm.h
kern/src/syscall.c
tests/vmm/vmrunkernel.c

index 8e662bb..a8c612e 100644 (file)
@@ -5,3 +5,6 @@
  * RISC-V VMM kernel headers */
 
 #pragma once
+
+struct vmm_gpcore_init {       /* placeholder: no fields defined for RISC-V */
+};
index d839026..04624fc 100644 (file)
@@ -7,3 +7,11 @@
 #pragma once
 
 #include <ros/arch/vmx.h>
+
+/* Initialization data provided by the userspace part of the VMM when setting
+ * up a guest physical core (vmx vcpu).  Fields are user addresses of pages. */
+struct vmm_gpcore_init {
+       void                                    *pir_addr;      /* posted interrupt descriptor */
+       void                                    *vapic_addr;    /* virtual APIC page */
+       void                                    *apic_addr;     /* APIC access page */
+};
index cb100e3..369fa58 100644 (file)
@@ -960,16 +960,46 @@ construct_eptp(physaddr_t root_hpa)
        return eptp;
 }
 
+/* Helper: some fields of the VMCS need a physical page address, e.g. the VAPIC
+ * page.  We have the user address.  This converts proc p's user address u_addr
+ * to a phys addr and writes it to 'field'.  Returns 0 on success, -1 o/w. */
+static int vmcs_set_pgaddr(struct proc *p, void *u_addr, unsigned long field)
+{
+       uintptr_t kva;
+       physaddr_t paddr;
+
+       /* Enforce page alignment: round down; a 0 kva is treated as unmapped */
+       kva = uva2kva(p, ROUNDDOWN(u_addr, PGSIZE));
+       if (!kva) {
+               set_error(EINVAL, "Unmapped pgaddr %p for VMCS", u_addr);
+               return -1;
+       }
+       paddr = PADDR(kva);
+       /* TODO: need to pin the page.  A munmap would actually be okay (though
+        * probably we should kill the process), but we need to keep the page from
+        * being reused.  A refcnt would do the trick, which we decref when we
+        * destroy the guest core/vcpu. */
+       assert(!PGOFF(paddr));
+       vmcs_writel(field, paddr);
+       /* Pages are inserted twice.  Once, with the full paddr.  The next field,
+        * field + 1 (presumably its *_HIGH half), gets the paddr's upper 32 bits. */
+       vmcs_writel(field + 1, paddr >> 32);
+       return 0;
+}
+
 /**
- * vmx_setup_initial_guest_state - configures the initial state of guest registers
+ * vmx_setup_initial_guest_state - configures the initial state of guest
+ * registers and the VMCS.  Returns 0 on success, -1 o/w.
  */
-static void
-vmx_setup_initial_guest_state(void)
+static int vmx_setup_initial_guest_state(struct proc *p,
+                                         struct vmm_gpcore_init *gpci)
 {
        unsigned long tmpl;
        unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
                X86_CR4_PGE | X86_CR4_OSFXSR;
        uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
+       int ret = 0;
+
 #if 0
        do
                we need it if (boot_cpu_has(X86_FEATURE_PCID))
@@ -1060,7 +1090,25 @@ vmx_setup_initial_guest_state(void)
 
        /* Initialize posted interrupt notification vector */
        vmcs_write16(POSTED_NOTIFICATION_VEC, I_VMMCP_POSTED);
-       }
+
+       /* Clear the EOI exit bitmap */
+       vmcs_writel(EOI_EXIT_BITMAP0, 0);
+       vmcs_writel(EOI_EXIT_BITMAP0_HIGH, 0);
+       vmcs_writel(EOI_EXIT_BITMAP1, 0);
+       vmcs_writel(EOI_EXIT_BITMAP1_HIGH, 0);
+       vmcs_writel(EOI_EXIT_BITMAP2, 0);
+       vmcs_writel(EOI_EXIT_BITMAP2_HIGH, 0);
+       vmcs_writel(EOI_EXIT_BITMAP3, 0);
+       vmcs_writel(EOI_EXIT_BITMAP3_HIGH, 0);
+
+       /* Initialize parts based on the users info.  If one of them fails, we'll do
+        * the others but then error out. */
+       ret |= vmcs_set_pgaddr(p, gpci->pir_addr, POSTED_INTR_DESC_ADDR);
+       ret |= vmcs_set_pgaddr(p, gpci->vapic_addr, VIRTUAL_APIC_PAGE_ADDR);
+       ret |= vmcs_set_pgaddr(p, gpci->apic_addr, APIC_ACCESS_ADDR);
+
+       return ret;
+}
 
 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
                                            uint32_t msr) {
@@ -1463,8 +1511,11 @@ static void vmx_setup_vmcs(struct vmx_vcpu *vcpu) {
  *
  * Returns: A new VCPU structure
  */
-struct vmx_vcpu *vmx_create_vcpu(struct proc *p) {
+struct vmx_vcpu *vmx_create_vcpu(struct proc *p, struct vmm_gpcore_init *gpci)
+{
        struct vmx_vcpu *vcpu = kmalloc(sizeof(struct vmx_vcpu), KMALLOC_WAIT);
+       int ret;
+
        if (!vcpu) {
                return NULL;
        }
@@ -1481,10 +1532,11 @@ struct vmx_vcpu *vmx_create_vcpu(struct proc *p) {
 
        vmx_get_cpu(vcpu);
        vmx_setup_vmcs(vcpu);
-       vmx_setup_initial_guest_state();
+       ret = vmx_setup_initial_guest_state(p, gpci);
        vmx_put_cpu(vcpu);
 
-       return vcpu;
+       if (!ret)
+               return vcpu;
 
 fail_vmcs:
        kfree(vcpu);
@@ -1819,45 +1871,6 @@ int vmx_launch(struct vmctl *v) {
                vmcs_writel(GUEST_RSP, v->regs.tf_rsp);
                vmcs_writel(GUEST_CR3, v->cr3);
                vcpu->regs = v->regs;
-
-               pir_kva = uva2kva(current_proc, (void *)v->pir);
-               pir_physical = (uint64_t)PADDR(pir_kva);
-
-               vmcs_writel(POSTED_INTR_DESC_ADDR, pir_physical);
-               vmcs_writel(POSTED_INTR_DESC_ADDR_HIGH, pir_physical>>32);
-               printk("POSTED_INTR_DESC_ADDR_HIGH %ld\n", vmcs_readl(POSTED_INTR_DESC_ADDR_HIGH));
-               if (pir_physical & 0xfff) {
-                       printk("Low order 12 bits of pir address is not 0, value: %p\n", pir_physical);
-               }
-
-               vapic_kva = uva2kva(current_proc, (void *)v->vapic);
-               vapic_physical = (uint64_t)PADDR(vapic_kva);
-
-               vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, vapic_physical);
-               vmcs_writel(VIRTUAL_APIC_PAGE_ADDR_HIGH, vapic_physical>>32);
-               if (vapic_physical & 0xfff) {
-                       printk("Low order 12 bits of vapic address is not 0, value: %p\n", vapic_physical);
-               }
-
-               printk("VAPIC PHYSICAL ADDRESS: %p\n", vapic_physical);
-
-               apic_kva = uva2kva(current_proc, (void *)0xfee00000);
-               apic_physical = (uint64_t)PADDR(apic_kva);
-
-               vmcs_writel(APIC_ACCESS_ADDR, apic_physical);
-               vmcs_writel(APIC_ACCESS_ADDR_HIGH, apic_physical>>32);
-
-               // Clear the EOI exit bitmap(Gan)
-               vmcs_writel(EOI_EXIT_BITMAP0, 0);
-               vmcs_writel(EOI_EXIT_BITMAP0_HIGH, 0);
-               vmcs_writel(EOI_EXIT_BITMAP1, 0);
-               vmcs_writel(EOI_EXIT_BITMAP1_HIGH, 0);
-               vmcs_writel(EOI_EXIT_BITMAP2, 0);
-               vmcs_writel(EOI_EXIT_BITMAP2_HIGH, 0);
-               vmcs_writel(EOI_EXIT_BITMAP3, 0);
-               vmcs_writel(EOI_EXIT_BITMAP3_HIGH, 0);
-
-               printk("v->apic %p v->pir %p\n", (void *)v->vapic, (void *)v->pir);
                // fallthrough
        case REG_RIP:
                printd("REG_RIP %p\n", v->regs.tf_rip);
index d75e48a..9c79757 100644 (file)
@@ -18,6 +18,7 @@
 #include "intel/vmx.h"
 #include "vmm.h"
 #include <trap.h>
+#include <umem.h>
 
 /* TODO: have better cpuid info storage and checks */
 bool x86_supports_vmx = FALSE;
@@ -93,10 +94,13 @@ int vm_run(struct vmctl *v)
 
 /* Initializes a process to run virtual machine contexts, returning the number
  * initialized, optionally setting errno */
-int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores, int flags)
+int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores,
+                    struct vmm_gpcore_init *u_gpcis, int flags)
 {
        struct vmm *vmm = &p->vmm;
        unsigned int i;
+       struct vmm_gpcore_init gpci;
+
        if (flags & ~VMM_ALL_FLAGS) {
                set_errstr("%s: flags is 0x%lx, VMM_ALL_FLAGS is 0x%lx\n", __func__,
                           flags, VMM_ALL_FLAGS);
@@ -104,7 +108,6 @@ int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores, int flags)
                return 0;
        }
        vmm->flags = flags;
-
        if (!x86_supports_vmx) {
                set_errno(ENODEV);
                return 0;
@@ -121,7 +124,12 @@ int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores, int flags)
        vmm->amd = 0;
        vmm->guest_pcores = kzmalloc(sizeof(void*) * nr_guest_pcores, KMALLOC_WAIT);
        for (i = 0; i < nr_guest_pcores; i++) {
-               vmm->guest_pcores[i] = vmx_create_vcpu(p);
+               if (copy_from_user(&gpci, &u_gpcis[i],
+                                  sizeof(struct vmm_gpcore_init))) {
+                       set_error(EINVAL, "Bad pointer %p for gps", u_gpcis);
+                       break;
+               }
+               vmm->guest_pcores[i] = vmx_create_vcpu(p, &gpci);
                /* If we failed, we'll clean it up when the process dies */
                if (!vmm->guest_pcores[i]) {
                        set_errno(ENOMEM);
index d36abf0..4dfeb9d 100644 (file)
@@ -46,7 +46,8 @@ struct vmm {
 void vmm_init(void);
 void vmm_pcpu_init(void);
 
-int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores, int flags);
+int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores,
+                    struct vmm_gpcore_init *gpcis, int flags);
 void __vmm_struct_cleanup(struct proc *p);
 
 int vm_post_interrupt(struct vmctl *v);
@@ -54,7 +55,7 @@ int vm_run(struct vmctl *);
 int intel_vmx_start(int id);
 int intel_vmx_setup(int nvmcs);
 
-struct vmx_vcpu *vmx_create_vcpu(struct proc *p);
+struct vmx_vcpu *vmx_create_vcpu(struct proc *p, struct vmm_gpcore_init *gpci);
 void vmx_destroy_vcpu(struct vmx_vcpu *vcpu);
 uint64_t construct_eptp(physaddr_t root_hpa);
 void ept_flush(uint64_t eptp);
index 40ce423..926d285 100644 (file)
@@ -35,8 +35,5 @@ struct vmctl {
        uint32_t interrupt;
        uint32_t intrinfo1;
        uint32_t intrinfo2;
-       // These two page-sized entities must be page-aligned
-       uint64_t pir;
-       uint64_t vapic;
        struct hw_trapframe regs;
 };
index 925fb00..db74250 100644 (file)
@@ -1322,9 +1322,9 @@ static int sys_pop_ctx(struct proc *p, struct user_context *ctx)
 /* Initializes a process to run virtual machine contexts, returning the number
  * initialized, optionally setting errno */
 static int sys_setup_vmm(struct proc *p, unsigned int nr_guest_pcores,
-                         int flags)
+                         struct vmm_gpcore_init *gpcis, int flags)
 {
-       return vmm_struct_init(p, nr_guest_pcores, flags);
+       return vmm_struct_init(p, nr_guest_pcores, gpcis, flags);
 }
 
 /* Pokes the ksched for the given resource for target_pid.  If the target pid
index 90c9165..859dc33 100644 (file)
@@ -26,6 +26,7 @@
 int msrio(struct vmctl *vcpu, uint32_t opcode);
 
 struct vmctl vmctl;
+struct vmm_gpcore_init gpci;
 
 /* Kind of sad what a total clusterf the pc world is. By 1999, you could just scan the hardware 
  * and work it out. But 2005, that was no longer possible. How sad. 
@@ -366,7 +367,7 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
 
 static void pir_dump()
 {
-       unsigned long *pir_ptr = (unsigned long *)vmctl.pir;
+       unsigned long *pir_ptr = (unsigned long *)gpci.pir_addr;
        int i;
        fprintf(stderr, "-------Begin PIR dump-------\n");
        for (i = 0; i < 8; i++){
@@ -380,13 +381,14 @@ static void set_posted_interrupt(int vector)
        unsigned long *bit_vec;
        int bit_offset;
        int i, j;
-       unsigned long *pir = (unsigned long *)vmctl.pir;
+       unsigned long *pir = (unsigned long *)gpci.pir_addr;
        // Move to the correct location to set our bit.
        bit_vec = pir + vector/(sizeof(unsigned long)*8);
        bit_offset = vector%(sizeof(unsigned long)*8);
        if(debug) fprintf(stderr, "%s: Pre set PIR dump\n", __func__);
        if(debug) pir_dump();
-       if(debug) vapic_status_dump(stderr, (void *)vmctl.vapic);
+       if (debug)
+               vapic_status_dump(stderr, gpci.vapic_addr);
        if(debug) fprintf(stderr, "%s: Setting pir bit offset %d at 0x%p\n", __func__,
                        bit_offset, bit_vec);
        test_and_set_bit(bit_offset, bit_vec);
@@ -601,10 +603,10 @@ int main(int argc, char **argv)
        hexdump(stdout, r, a-(void *)r);
 
        a = (void *)(((unsigned long)a + 0xfff) & ~0xfff);
-       vmctl.pir = (uint64_t) a;
+       gpci.pir_addr = a;
        memset(a, 0, 4096);
        a += 4096;
-       vmctl.vapic = (uint64_t) a;
+       gpci.vapic_addr = a;
        //vmctl.vapic = (uint64_t) a_page;      
        memset(a, 0, 4096);
        ((uint32_t *)a)[0x30/4] = 0x01060014;
@@ -613,8 +615,10 @@ int main(int argc, char **argv)
        // qemu does this.
        //((uint8_t *)a)[4] = 1;
        a += 4096;
+       gpci.apic_addr = (void*)0xfee00000;
 
-       if (ros_syscall(SYS_setup_vmm, nr_gpcs, vmmflags, 0, 0, 0, 0) != nr_gpcs) {
+       if (ros_syscall(SYS_setup_vmm, nr_gpcs, &gpci, vmmflags, 0, 0, 0) !=
+           nr_gpcs) {
                perror("Guest pcore setup failed");
                exit(1);
        }
@@ -682,11 +686,13 @@ int main(int argc, char **argv)
        fprintf(stderr, "threads started\n");
        fprintf(stderr, "Writing command :%s:\n", cmd);
        
-       if(debug) vapic_status_dump(stderr, (void *)vmctl.vapic);
+       if (debug)
+               vapic_status_dump(stderr, (void *)gpci.vapic_addr);
 
        ret = pwrite(fd, &vmctl, sizeof(vmctl), 0);
 
-       if(debug) vapic_status_dump(stderr, (void *)vmctl.vapic);
+       if (debug)
+               vapic_status_dump(stderr, (void *)gpci.vapic_addr);
 
        if (ret != sizeof(vmctl)) {
                perror(cmd);
@@ -802,7 +808,8 @@ int main(int argc, char **argv)
                                while (!consdata)
                                        ;
                                //debug = 1;
-                               if(debug) vapic_status_dump(stderr, (void *)vmctl.vapic);
+                               if (debug)
+                                       vapic_status_dump(stderr, gpci.vapic_addr);
                                if (debug)fprintf(stderr, "Resume with consdata ...\n");
                                vmctl.regs.tf_rip += 3;
                                ret = pwrite(fd, &vmctl, sizeof(vmctl), 0);