Fixes ELF panic
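
Demand paging a file-backed page (e.g. an ELF's text or data) could block in
the page fault handler, which the old TODO noted would break since smp_idle
restarts the proc.  __hpf_load_page() now saves an SCP's context and sets it
to PROC_WAITING before blocking on the page map, then wakes it afterwards.

Other changes:
- VMR modifications bump p->vmr_history, so populate loops that drop the
  vmr_lock to block can detect concurrent changes (instead of memcmp'ing a
  copied VMR).
- The MAP_POPULATE loops in do_mmap() are factored into populate_anon_va()
  and populate_pm_va(), which the new populate_va() also uses.
- File mmaps made before uthread_slim_init() get MAP_POPULATE | MAP_LOCKED,
  instead of populating all mmaps when !CONFIG_DEMAND_PAGING.
- map_page_at_addr() eats the page ref when the PTE is already present.
- munmap() rounds unaligned lengths up to a page.
- vmap_pmem() handles unaligned paddrs, and vmap_pmem_nocache() maps with
  PTE_NOCACHE.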
[akaros.git] / kern / src / mm.c
index ead6c43..e9e5e53 100644 (file)
@@ -250,6 +250,7 @@ void unmap_and_destroy_vmrs(struct proc *p)
        /* this only gets called from __proc_free, so there should be no sync
         * concerns.  still, better safe than sorry. */
        spin_lock(&p->vmr_lock);
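+       /* bump the history counter while holding the lock; populate loops that
+        * dropped the lock to block will notice the change and bail */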
+       p->vmr_history++;
        spin_lock(&p->pte_lock);
        TAILQ_FOREACH(vmr_i, &p->vm_regions, vm_link) {
                /* note this CB sets the PTE = 0, regardless of if it was P or not */
@@ -484,6 +485,8 @@ static int map_page_at_addr(struct proc *p, struct page *page, uintptr_t addr,
         * in which case we should just return. */
        if (PAGE_PRESENT(*pte)) {
                spin_unlock(&p->pte_lock);
+               /* callers expect us to eat the ref if we succeed. */
+               page_decref(page);
                return 0;
        }
        /* preserve the dirty bit - pm removal could be looking concurrently */
@@ -507,6 +510,77 @@ static int __copy_and_swap_pmpg(struct proc *p, struct page **pp)
        return 0;
 }
 
+/* Hold the VMR lock when you call this - it'll assume the entire VA range is
+ * mappable, which isn't true if there are concurrent changes to the VMRs. */
+static int populate_anon_va(struct proc *p, uintptr_t va, unsigned long nr_pgs,
+                            int pte_prot)
+{
+       struct page *page;
+       int ret;
+       for (long i = 0; i < nr_pgs; i++) {
+               if (upage_alloc(p, &page, TRUE))
+                       return -ENOMEM;
+               /* could imagine doing a memwalk instead of a for loop */
+               ret = map_page_at_addr(p, page, va + i * PGSIZE, pte_prot);
+               if (ret) {
+                       page_decref(page);
+                       return ret;
+               }
+       }
+       return 0;
+}
+
+/* This will periodically unlock the vmr lock. */
+static int populate_pm_va(struct proc *p, uintptr_t va, unsigned long nr_pgs,
+                          int pte_prot, struct page_map *pm, size_t offset,
+                          int flags, bool exec)
+{
+       int ret = 0;
+       unsigned long pm_idx0 = offset >> PGSHIFT;
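+       /* snapshot the VMR history counter; if it changes while we block with the
+        * lock dropped, the VMRs were modified out from under us */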
+       int vmr_history = ACCESS_ONCE(p->vmr_history);
+       struct page *page;
+
+       /* locking rules: we enter and exit holding the vmr lock, but may drop and
+        * reacquire it if we have to block on a page load. */
+       for (long i = 0; i < nr_pgs; i++) {
+               ret = pm_load_page_nowait(pm, pm_idx0 + i, &page);
+               if (ret) {
+                       if (ret != -EAGAIN)
+                               break;
+                       spin_unlock(&p->vmr_lock);
+                       /* might block here, can't hold the spinlock */
+                       ret = pm_load_page(pm, pm_idx0 + i, &page);
+                       spin_lock(&p->vmr_lock);
+                       if (ret)
+                               break;
+                       /* while we were sleeping, the VMRs could have changed on us. */
+                       if (vmr_history != ACCESS_ONCE(p->vmr_history)) {
+                               pm_put_page(page);
+                               printk("[kernel] FYI: VMR changed during populate\n");
+                               break;
+                       }
+               }
+               if (flags & MAP_PRIVATE) {
+                       ret = __copy_and_swap_pmpg(p, &page);
+                       if (ret) {
+                               pm_put_page(page);
+                               break;
+                       }
+               }
+               /* if this is an executable page, we might have to flush the
+                * instruction cache if our HW requires it.
+        * TODO: is this still needed?  Andrew put this in a while ago. */
+               if (exec)
+                       icache_flush_page(0, page2kva(page));
+               ret = map_page_at_addr(p, page, va + i * PGSIZE, pte_prot);
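+               /* the VMR's presence in the PM keeps PM pages alive, so drop our ref;
+                * for private copies, the ref was transferred to the PTE */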
+               if (atomic_read(&page->pg_flags) & PG_PAGEMAP)
+                       pm_put_page(page);
+               if (ret)
+                       break;
+       }
+       return ret;
+}
+
 void *do_mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
               struct file *file, size_t offset)
 {
@@ -515,6 +589,7 @@ void *do_mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
 
        /* read/write vmr lock (will change the tree) */
        spin_lock(&p->vmr_lock);
+       p->vmr_history++;
        /* Sanity check, for callers that bypass mmap().  We want addr for anon
         * memory to start above the break limit (BRK_END), but not 0.  Keep this in
         * sync with BRK_END in mmap(). */
@@ -522,9 +597,13 @@ void *do_mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
                addr = BRK_END;
        assert(!PGOFF(offset));
 
-#ifndef CONFIG_DEMAND_PAGING
-       flags |= MAP_POPULATE;
-#endif
+       /* MCPs will need their code and data pinned.  This check will start to fail
+        * after uthread_slim_init(), at which point userspace should have enough
+        * control over its mmaps (i.e. no longer done by LD or load_elf) that it
+        * can ask for pinned and populated pages.  Except for dl_opens(). */
+       struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[0];
+       if (file && (atomic_read(&vcpd->flags) & VC_SCP_NOVCCTX))
+               flags |= MAP_POPULATE | MAP_LOCKED;
        /* Need to make sure nothing is in our way when we want a FIXED location.
         * We just need to split on the end points (if they exist), and then remove
         * everything in between.  __do_munmap() will do this.  Careful, this means
@@ -580,67 +659,15 @@ void *do_mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
        if (flags & MAP_POPULATE && prot != PROT_NONE) {
                int pte_prot = (prot & PROT_WRITE) ? PTE_USER_RW :
                           (prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : 0;
-               int num_pages = len / PGSIZE;
+               unsigned long nr_pgs = len >> PGSHIFT;
                int ret = 0;
-               struct page *a_page;
                if (!file) {
-                       for (int i = 0; i < num_pages; i++) {
-                               if (upage_alloc(p, &a_page, TRUE)) {
-                                       ret = -ENOMEM;
-                                       break;
-                               }
-                               a_page->pg_tree_slot = 0;
-                               /* could imagine doing a memwalk instead of a for loop */
-                               ret = map_page_at_addr(p, a_page, addr + i * PGSIZE, pte_prot);
-                               if (ret) {
-                                       page_decref(a_page);
-                                       break;
-                               }
-                       }
+                       ret = populate_anon_va(p, addr, nr_pgs, pte_prot);
                } else {
-                       /* our refcnt on the file keeps the pm alive when we unlock */
-                       struct page_map *pm = file->f_mapping;
-                       unsigned long pm_idx0 = offset >> PGSHIFT;
-                       struct vm_region vmr_copy;
-                       memcpy(&vmr_copy, vmr, sizeof(struct vm_region));
-                       for (int i = 0; i < num_pages; i++) {
-                               ret = pm_load_page_nowait(pm, pm_idx0 + i, &a_page);
-                               if (ret) {
-                                       if (ret != -EAGAIN)
-                                               break;
-                                       spin_unlock(&p->vmr_lock);
-                                       /* might block here, can't hold the spinlock */
-                                       ret = pm_load_page(pm, pm_idx0 + i, &a_page);
-                                       spin_lock(&p->vmr_lock);
-                                       if (ret)
-                                               break;
-                                       /* ugly - while we were sleeping, our VMR could have changed
-                                        * on us.  should be okay with weird ABA races too. */
-                                       vmr = find_vmr(p, addr + i * PGSIZE);
-                                       if (memcmp(&vmr_copy, vmr, sizeof(struct vm_region))) {
-                                               pm_put_page(a_page);
-                                               printk("[kernel] FYI: VMR changed during populate\n");
-                                               break;
-                                       }
-                               }
-                               if (flags & MAP_PRIVATE) {
-                                       ret = __copy_and_swap_pmpg(p, &a_page);
-                                       if (ret) {
-                                               pm_put_page(a_page);
-                                               break;
-                                       }
-                               }
-                               /* if this is an executable page, we might have to flush the
-                                * instruction cache if our HW requires it.
-                                * TODO: is this still needed?  andrew put this in a while ago*/
-                               if (prot & PROT_EXEC)
-                                       icache_flush_page(0, page2kva(a_page));
-                               ret = map_page_at_addr(p, a_page, addr + i * PGSIZE, pte_prot);
-                               if (atomic_read(&a_page->pg_flags) & PG_PAGEMAP)
-                                       pm_put_page(a_page);
-                               if (ret)
-                                       break;
-                       }
+                       /* Note: this will unlock if it blocks.  Our refcnt on the file
+                        * keeps the pm alive when we unlock. */
+                       ret = populate_pm_va(p, addr, nr_pgs, pte_prot, file->f_mapping,
+                                            offset, flags, prot & PROT_EXEC);
                }
                if (ret == -ENOMEM) {
                        spin_unlock(&p->vmr_lock);
@@ -669,6 +696,7 @@ int mprotect(struct proc *p, uintptr_t addr, size_t len, int prot)
        }
        /* read/write lock, will probably change the tree and settings */
        spin_lock(&p->vmr_lock);
+       p->vmr_history++;
        int ret = __do_mprotect(p, addr, len, prot);
        spin_unlock(&p->vmr_lock);
        return ret;
@@ -720,6 +748,8 @@ int munmap(struct proc *p, uintptr_t addr, size_t len)
        printd("munmap(addr %x, len %x)\n", addr, len);
        if (!len)
                return 0;
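+       /* callers can pass unaligned lengths; round up so the trailing partial page
+        * gets unmapped too */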
+       len = ROUNDUP(len, PGSIZE);
+
        if ((addr % PGSIZE) || (addr < MMAP_LOWEST_VA)) {
                set_errno(EINVAL);
                return -1;
@@ -731,6 +761,7 @@ int munmap(struct proc *p, uintptr_t addr, size_t len)
        }
        /* read/write: changing the vmrs (trees, properties, and whatnot) */
        spin_lock(&p->vmr_lock);
+       p->vmr_history++;
        int ret = __do_munmap(p, addr, len);
        spin_unlock(&p->vmr_lock);
        return ret;
@@ -822,6 +853,59 @@ static void __put_page(struct page *page)
                page_decref(page);
 }
 
+static int __hpf_load_page(struct proc *p, struct page_map *pm,
+                           unsigned long idx, struct page **page, bool first)
+{
+       int ret = 0;
+       int coreid = core_id();
+       struct per_cpu_info *pcpui = &per_cpu_info[coreid];
+       bool wake_scp = FALSE;
+       spin_lock(&p->proc_lock);
+       switch (p->state) {
+               case (PROC_RUNNING_S):
+                       wake_scp = TRUE;
+                       __proc_set_state(p, PROC_WAITING);
+                       /* it's possible for HPF to loop a few times; we can only save the
+                        * context the first time, o/w we'd clobber the earlier save. */
+                       if (first) {
+                               __proc_save_context_s(p, pcpui->cur_ctx);
+                               __proc_save_fpu_s(p);
+                               /* We clear the owner, since userspace doesn't run here
+                                * anymore, but we won't abandon since the fault handler
+                                * still runs in our process. */
+                               clear_owning_proc(coreid);
+                       }
+                       /* other notes: we don't currently need to tell the ksched
+                        * we switched from running to waiting, though we probably
+                        * will later for more generic scheds. */
+                       break;
+               case (PROC_RUNNABLE_M):
+               case (PROC_RUNNING_M):
+                       spin_unlock(&p->proc_lock);
+                       return -EAGAIN; /* will get reflected back to userspace */
+               case (PROC_DYING):
+                       spin_unlock(&p->proc_lock);
+                       return -EINVAL;
+               default:
+                       /* shouldn't be in PROC_WAITING under the current yield style.  if
+                        * this becomes an issue, we can branch on is_mcp(). */
+                       printk("HPF: unexpected state (%s)\n", procstate2str(p->state));
+                       spin_unlock(&p->proc_lock);
+                       return -EINVAL;
+       }
+       spin_unlock(&p->proc_lock);
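+       /* now that the context is saved (if needed) and the proc_lock is dropped,
+        * it's safe to block on the page load */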
+       ret = pm_load_page(pm, idx, page);
+       if (wake_scp)
+               proc_wakeup(p);
+       if (ret) {
+               printk("load failed with ret %d\n", ret);
+               return ret;
+       }
+       /* need to put our old ref, next time around HPF will get another. */
+       pm_put_page(*page);
+       return 0;
+}
+
 /* Returns 0 on success, or an appropriate -error code. 
  *
  * Notes: if your TLB caches negative results, you'll need to flush the
@@ -837,6 +921,7 @@ int handle_page_fault(struct proc *p, uintptr_t va, int prot)
        unsigned int f_idx;     /* index of the missing page in the file */
        pte_t *pte;
        int ret = 0;
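+       /* only save the SCP's context the first time through the refault loop */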
+       bool first = TRUE;
        va = ROUNDDOWN(va,PGSIZE);
 
        if (prot != PROT_READ && prot != PROT_WRITE && prot != PROT_EXEC)
@@ -889,15 +974,9 @@ refault:
                        /* keep the file alive after we unlock */
                        kref_get(&vmr->vm_file->f_kref, 1);
                        spin_unlock(&p->vmr_lock);
-
-                       /* TODO: here is where we handle SCP vs MCP vs whatever.
-                        * - do some prep
-                        * - prefetch it for userspace, optionally for MCPs
-                        * - this will break if we actually block, since smp_idle will
-                        *   restart the proc
-                        *   */
-                       ret = pm_load_page(vmr->vm_file->f_mapping, f_idx, &a_page);
-
+                       ret = __hpf_load_page(p, vmr->vm_file->f_mapping, f_idx, &a_page,
+                                             first);
+                       first = FALSE;
                        kref_put(&vmr->vm_file->f_kref);
                        if (ret)
                                return ret;
@@ -911,7 +990,7 @@ refault:
                if ((vmr->vm_flags & MAP_PRIVATE)) {
                        ret = __copy_and_swap_pmpg(p, &a_page);
                        if (ret)
-                               goto out;
+                               goto out_put_pg;
                }
                /* if this is an executable page, we might have to flush the instruction
                 * cache if our HW requires it. */
@@ -923,19 +1002,68 @@ refault:
        int pte_prot = (vmr->vm_prot & PROT_WRITE) ? PTE_USER_RW :
                       (vmr->vm_prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : 0;
        ret = map_page_at_addr(p, a_page, va, pte_prot);
-       if (ret)
-               goto out;
+       /* fall through, even for errors */
+out_put_pg:
        /* the VMR's existence in the PM (via the mmap) allows us to have PTE point
         * to a_page without it magically being reallocated.  For non-PM memory
         * (anon memory or private pages) we transferred the ref to the PTE. */
        if (atomic_read(&a_page->pg_flags) & PG_PAGEMAP)
                pm_put_page(a_page);
-       ret = 0;
 out:
        spin_unlock(&p->vmr_lock);
        return ret;
 }
 
+/* Attempts to populate the pages, as if there were page faults.  Bails on
+ * errors, and returns the number of pages populated. */
+unsigned long populate_va(struct proc *p, uintptr_t va, unsigned long nr_pgs)
+{
+       struct vm_region *vmr;
+       unsigned long nr_pgs_this_vmr;
+       unsigned long nr_filled = 0;
+       int pte_prot;
+       int ret;
+
+       /* we can screw around with ways to limit the find_vmr calls (e.g. do the
+        * next one in line if we didn't unlock), but I don't expect us to do this
+        * for more than a single VMR in most cases. */
+       spin_lock(&p->vmr_lock);
+       while (nr_pgs) {
+               vmr = find_vmr(p, va);
+               if (!vmr)
+                       break;
+               if (vmr->vm_prot == PROT_NONE)
+                       break;
+               pte_prot = (vmr->vm_prot & PROT_WRITE) ? PTE_USER_RW :
+                          (vmr->vm_prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : 0;
+               nr_pgs_this_vmr = MIN(nr_pgs, (vmr->vm_end - va) >> PGSHIFT);
+               if (!vmr->vm_file) {
+                       if (populate_anon_va(p, va, nr_pgs_this_vmr, pte_prot)) {
+                               /* on any error, we can just bail.  we might be underestimating
+                                * nr_filled. */
+                               break;
+                       }
+               } else {
+                       /* need to keep the file alive in case we unlock/block */
+                       kref_get(&vmr->vm_file->f_kref, 1);
+                       ret = populate_pm_va(p, va, nr_pgs_this_vmr, pte_prot,
+                                            vmr->vm_file->f_mapping,
+                                            vmr->vm_foff + (va - vmr->vm_base),
+                                            vmr->vm_flags, vmr->vm_prot & PROT_EXEC);
+                       kref_put(&vmr->vm_file->f_kref);
+                       if (ret) {
+                               /* we might have failed if the underlying file doesn't cover the
+                                * mmap window, depending on how we'll deal with truncation. */
+                               break;
+                       }
+               }
+               nr_filled += nr_pgs_this_vmr;
+               va += nr_pgs_this_vmr << PGSHIFT;
+               nr_pgs -= nr_pgs_this_vmr;
+       }
+       spin_unlock(&p->vmr_lock);
+       return nr_filled;
+}
+
 /* Kernel Dynamic Memory Mappings */
 uintptr_t dyn_vmap_llim = KERN_DYN_TOP;
 spinlock_t dyn_vmap_lock = SPINLOCK_INITIALIZER;
@@ -1027,21 +1155,37 @@ int unmap_vmap_segment(uintptr_t vaddr, unsigned long num_pages)
        return 0;
 }
 
-uintptr_t vmap_pmem(uintptr_t paddr, size_t nr_bytes)
+/* This can handle unaligned paddrs */
+static uintptr_t vmap_pmem_flags(uintptr_t paddr, size_t nr_bytes, int flags)
 {
        uintptr_t vaddr;
-       unsigned long nr_pages = ROUNDUP(nr_bytes, PGSIZE) >> PGSHIFT;
+       unsigned long nr_pages;
        assert(nr_bytes && paddr);
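+       /* paddr may be unaligned; count the leading pgoff so we map enough pages to
+        * cover nr_bytes */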
+       nr_bytes += PGOFF(paddr);
+       nr_pages = ROUNDUP(nr_bytes, PGSIZE) >> PGSHIFT;
        vaddr = get_vmap_segment(nr_pages);
        if (!vaddr) {
                warn("Unable to get a vmap segment");   /* probably a bug */
                return 0;
        }
-       if (map_vmap_segment(vaddr, paddr, nr_pages, PTE_P | PTE_KERN_RW)) {
+       /* it's not strictly necessary to drop paddr's pgoff, but it might save some
+        * vmap heartache in the future. */
+       if (map_vmap_segment(vaddr, PG_ADDR(paddr), nr_pages,
+                            PTE_P | PTE_KERN_RW | flags)) {
                warn("Unable to map a vmap segment");   /* probably a bug */
                return 0;
        }
-       return vaddr;
+       return vaddr + PGOFF(paddr);
+}
+
+uintptr_t vmap_pmem(uintptr_t paddr, size_t nr_bytes)
+{
+       return vmap_pmem_flags(paddr, nr_bytes, 0);
+}
+
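+/* Like vmap_pmem(), but the mapping is uncached (e.g. for device memory). */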
+uintptr_t vmap_pmem_nocache(uintptr_t paddr, size_t nr_bytes)
+{
+       return vmap_pmem_flags(paddr, nr_bytes, PTE_NOCACHE);
 }
 
 int vunmap_vmem(uintptr_t vaddr, size_t nr_bytes)