akaros/kern/src/mm.c
   1/* Copyright (c) 2009, 2010 The Regents of the University of California
   2 * Barret Rhoden <brho@cs.berkeley.edu>
   3 * See LICENSE for details.
   4 *
   5 * Virtual memory management functions.  Creation, modification, etc, of virtual
   6 * memory regions (VMRs) as well as mmap(), mprotect(), and munmap().
   7 *
   8 * In general, error checking / bounds checks are done in the main function
   9 * (e.g. mmap()), and the work is done in a do_ function (e.g. do_mmap()).
  10 * Versions of those functions that are called when the vmr lock is already held
  11 * begin with __ (e.g. __do_munmap()).
  12 *
  13 * Note that if we were called from kern/src/syscall.c, we probably don't have
  14 * an edible reference to p. */
  15
  16#include <ros/common.h>
  17#include <pmap.h>
  18#include <mm.h>
  19#include <process.h>
  20#include <stdio.h>
  21#include <syscall.h>
  22#include <slab.h>
  23#include <kmalloc.h>
  24#include <smp.h>
  25#include <profiler.h>
  26#include <umem.h>
  27#include <ns.h>
  28#include <tree_file.h>
  29
  30/* These are the only mmap flags that are saved in the VMR.  If we implement
  31 * more of the mmap interface, we may need to grow this. */
  32#define MAP_PERSIST_FLAGS       (MAP_SHARED | MAP_PRIVATE | MAP_ANONYMOUS)
  33
  34struct kmem_cache *vmr_kcache;
  35
  36static int __vmr_free_pgs(struct proc *p, pte_t pte, void *va, void *arg);
  37static int populate_pm_va(struct proc *p, uintptr_t va, unsigned long nr_pgs,
  38                          int pte_prot, struct page_map *pm, size_t offset,
  39                          int flags, bool exec);
  40
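/* Returns the page map backing the foc.  Only chans are supported, and the
 * foc must already have been mmapped by its device (i.e. foc->fsf is set). */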
  41static struct page_map *foc_to_pm(struct file_or_chan *foc)
  42{
  43        switch (foc->type) {
  44        case F_OR_C_CHAN:
  45                assert(foc->fsf);
  46                return foc->fsf->pm;
  47        }
  48        panic("unknown F_OR_C type");
  49}
  50
  51static struct page_map *vmr_to_pm(struct vm_region *vmr)
  52{
  53        return foc_to_pm(vmr->__vm_foc);
  54}
  55
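/* Returns a printable name for the foc: the fs_file's name if the device gave
 * us one, o/w the chan's path. */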
  56char *foc_to_name(struct file_or_chan *foc)
  57{
  58        switch (foc->type) {
  59        case F_OR_C_CHAN:
  60                if (foc->fsf)
  61                        return foc->fsf->dir.name;
  62                else
  63                        return foc->chan->name->s;
  64        }
  65        panic("unknown F_OR_C type");
  66}
  67
  68char *foc_abs_path(struct file_or_chan *foc)
  69{
  70        switch (foc->type) {
  71        case F_OR_C_CHAN:
  72                /* Not sure, but I'd like to know if we have externally visible
  73                 * chans that have no name. */
  74                assert(foc->chan->name);
  75                assert(foc->chan->name->s);
  76                return foc->chan->name->s;
  77        }
  78        panic("unknown F_OR_C type");
  79}
  80
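/* Reads up to amt bytes from the foc at offset off into buf.  Returns the
 * number of bytes read, or -1 on error (non-file qids and device errors are
 * both reported as -1). */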
  81ssize_t foc_read(struct file_or_chan *foc, void *buf, size_t amt, off64_t off)
  82{
  83        ERRSTACK(1);
  84        off64_t fake_off = off;
  85        ssize_t ret = -1;
  86
  87        switch (foc->type) {
  88        case F_OR_C_CHAN:
  89                if (!qid_is_file(foc->chan->qid))
  90                        return -1;
  91                if (!waserror())
  92                        ret = devtab[foc->chan->type].read(foc->chan, buf, amt,
  93                                                           off);
  94                poperror();
  95                return ret;
  96        }
  97        panic("unknown F_OR_C type");
  98}
  99
 100static void __foc_free_rcu(struct rcu_head *head)
 101{
 102        struct file_or_chan *foc = container_of(head, struct file_or_chan, rcu);
 103
 104        switch (foc->type) {
 105        case F_OR_C_CHAN:
 106                cclose(foc->chan);
 107                break;
 108        default:
 109                panic("unknown F_OR_C type, %d", foc->type);
 110        }
 111        kfree(foc);
 112}
 113
 114static void foc_release(struct kref *kref)
 115{
 116        struct file_or_chan *foc = container_of(kref, struct file_or_chan,
 117                                                kref);
 118
 119        /* A lot of places decref while holding a spinlock, but we can't free
 120         * then, since the cclose() might block. */
 121        call_rcu(&foc->rcu, __foc_free_rcu);
 122}
 123
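/* Allocates a zeroed foc with a single kref; the caller fills in the type and
 * chan.  Returns NULL if the atomic allocation fails. */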
 124static struct file_or_chan *foc_alloc(void)
 125{
 126        struct file_or_chan *foc;
 127
 128        foc = kzmalloc(sizeof(struct file_or_chan), MEM_ATOMIC);
 129        if (!foc)
 130                return NULL;
 131        kref_init(&foc->kref, foc_release, 1);
 132        return foc;
 133}
 134
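/* Opens path and wraps the resulting chan in a new foc.  Returns NULL on
 * error (allocation failure or namec() throwing). */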
 135struct file_or_chan *foc_open(char *path, int omode, int perm)
 136{
 137        ERRSTACK(1);
 138        struct file_or_chan *foc = foc_alloc();
 139
 140        if (!foc)
 141                return NULL;
 142        if (waserror()) {
 143                kfree(foc);
 144                poperror();
 145                return NULL;
 146        }
 147        foc->chan = namec(path, Aopen, omode, perm, NULL);
 148        foc->type = F_OR_C_CHAN;
 149        poperror();
 150        return foc;
 151}
 152
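/* Wraps the chan behind fd in a new foc, taking an extra chan ref via
 * fdtochan().  Returns NULL on error (bad FD or allocation failure). */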
 153struct file_or_chan *fd_to_foc(struct fd_table *fdt, int fd)
 154{
 155        ERRSTACK(1);
 156        struct file_or_chan *foc = foc_alloc();
 157
 158        if (!foc)
 159                return NULL;
 160        if (waserror()) {
 161                kfree(foc);
 162                poperror();
 163                return NULL;
 164        }
 165        /* We're not checking mode here (-1).  mm code checks later. */
 166        foc->chan = fdtochan(fdt, fd, -1, true, true);
 167        foc->type = F_OR_C_CHAN;
 168        poperror();
 169        return foc;
 170}
 171
 172void foc_incref(struct file_or_chan *foc)
 173{
 174        kref_get(&foc->kref, 1);
 175}
 176
 177void foc_decref(struct file_or_chan *foc)
 178{
 179        kref_put(&foc->kref);
 180}
 181
 182void *foc_pointer(struct file_or_chan *foc)
 183{
 184        if (!foc)
 185                return NULL;
 186        switch (foc->type) {
 187        case F_OR_C_CHAN:
 188                return foc->chan;
 189        default:
 190                panic("unknown F_OR_C type, %d", foc->type);
 191        }
 192}
 193
 194size_t foc_get_len(struct file_or_chan *foc)
 195{
 196        switch (foc->type) {
 197        case F_OR_C_CHAN:
 198                assert(foc->fsf);
 199                return foc->fsf->dir.length;
 200        }
 201        panic("unknown F_OR_C type, %d", foc->type);
 202}
 203
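/* Returns TRUE if the chan was opened with enough access for a mapping with
 * 'prot'.  The chan must be readable; PROT_WRITE is not required for
 * MAP_PRIVATE mappings, since stores never reach the backing file. */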
 204static bool check_chan_perms(struct vm_region *vmr, struct chan *chan, int prot)
 205{
 206        /* glibc isn't opening its files O_EXEC */
 207        prot &= ~PROT_EXEC;
 208        if (!(chan->mode & O_READ))
 209                return false;
 210        if (vmr->vm_flags & MAP_PRIVATE)
 211                prot &= ~PROT_WRITE;
 212        return (chan->mode & prot) == prot;
 213}
 214
 215static bool check_foc_perms(struct vm_region *vmr, struct file_or_chan *foc,
 216                            int prot)
 217{
 218        switch (foc->type) {
 219        case F_OR_C_CHAN:
 220                return check_chan_perms(vmr, foc->chan, prot);
 221        }
 222        panic("unknown F_OR_C type");
 223}
 224
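/* Asks the foc's device to set up an mmap of the VMR: checks perms, then
 * calls the device's mmap op, which provides the fs_file (and thus the page
 * map) backing the region.  Returns 0 on success, -1 o/w. */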
 225static int foc_dev_mmap(struct file_or_chan *foc, struct vm_region *vmr,
 226                        int prot, int flags)
 227{
 228        if (!check_foc_perms(vmr, foc, prot))
 229                return -1;
 230        switch (foc->type) {
 231        case F_OR_C_CHAN:
 232                if (!devtab[foc->chan->type].mmap) {
 233                        set_error(ENODEV, "device does not support mmap");
 234                        return -1;
 235                }
 236                foc->fsf = devtab[foc->chan->type].mmap(foc->chan, vmr, prot,
 237                                                        flags);
 238                return foc->fsf ? 0 : -1;
 239        }
 240        panic("unknown F_OR_C type, %d", foc->type);
 241}
 242
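/* One-time setup: creates the slab cache used for struct vm_region
 * allocations. */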
 243void vmr_init(void)
 244{
 245        vmr_kcache = kmem_cache_create("vm_regions",
 246                                       sizeof(struct vm_region),
 247                                       __alignof__(struct vm_region), 0, NULL,
 248                                       0, 0, NULL);
 249}
 250
 251static struct vm_region *vmr_zalloc(void)
 252{
 253        struct vm_region *vmr;
 254
 255        vmr = kmem_cache_alloc(vmr_kcache, MEM_WAIT);
 256        memset(vmr, 0, sizeof(struct vm_region));
 257        return vmr;
 258}
 259
 260static void vmr_free(struct vm_region *vmr)
 261{
 262        kmem_cache_free(vmr_kcache, vmr);
 263}
 264
 265/* The caller will set the prot, flags, file, and offset.  We find a spot for it
 266 * in p's address space, set proc, base, and end.  Caller holds p's vmr_lock.
 267 *
 268 * TODO: take a look at solari's vmem alloc.  And consider keeping these in a
 269 * tree of some sort for easier lookups. */
 270static bool vmr_insert(struct vm_region *vmr, struct proc *p, uintptr_t va,
 271                       size_t len)
 272{
 273        struct vm_region *vm_i, *vm_next;
 274        uintptr_t gap_end;
 275        bool ret = false;
 276
 277        assert(!PGOFF(va));
 278        assert(!PGOFF(len));
 279        assert(__is_user_addr((void*)va, len, UMAPTOP));
 280        /* Is there room before the first one: */
 281        vm_i = TAILQ_FIRST(&p->vm_regions);
 282        /* This works for now, but if all we have is BRK_END ones, we'll start
 283         * growing backwards (TODO) */
 284        if (!vm_i || (va + len <= vm_i->vm_base)) {
 285                vmr->vm_base = va;
 286                TAILQ_INSERT_HEAD(&p->vm_regions, vmr, vm_link);
 287                ret = true;
 288        } else {
 289                TAILQ_FOREACH(vm_i, &p->vm_regions, vm_link) {
 290                        vm_next = TAILQ_NEXT(vm_i, vm_link);
 291                        gap_end = vm_next ? vm_next->vm_base : UMAPTOP;
 292                        /* skip til we get past the 'hint' va */
 293                        if (va >= gap_end)
 294                                continue;
 295                        /* Find a gap that is big enough */
 296                        if (gap_end - vm_i->vm_end >= len) {
 297                                /* if we can put it at va, let's do that.  o/w,
 298                                 * put it so it fits */
 299                                if ((gap_end >= va + len) &&
 300                                    (va >= vm_i->vm_end))
 301                                        vmr->vm_base = va;
 302                                else
 303                                        vmr->vm_base = vm_i->vm_end;
 304                                TAILQ_INSERT_AFTER(&p->vm_regions, vm_i, vmr,
 305                                                   vm_link);
 306                                ret = true;
 307                                break;
 308                        }
 309                }
 310        }
 311        /* Finalize the creation, if we got one */
 312        if (ret) {
 313                vmr->vm_proc = p;
 314                vmr->vm_end = vmr->vm_base + len;
 315        }
 316        if (!ret)
 317                warn("Not making a VMR, wanted %p, + %p = %p", va, len, va +
 318                     len);
 319        return ret;
 320}
 321
 322/* Split a VMR at va, returning the new VMR.  It is set up the same way, with
 323 * file offsets fixed accordingly.  'va' is the beginning of the new one, and
 324 * must be page aligned. */
 325static struct vm_region *split_vmr(struct vm_region *old_vmr, uintptr_t va)
 326{
 327        struct vm_region *new_vmr;
 328
 329        assert(!PGOFF(va));
 330        if ((old_vmr->vm_base >= va) || (old_vmr->vm_end <= va))
 331                return 0;
 332        new_vmr = kmem_cache_alloc(vmr_kcache, 0);
 333        assert(new_vmr);
 334        TAILQ_INSERT_AFTER(&old_vmr->vm_proc->vm_regions, old_vmr, new_vmr,
 335                           vm_link);
 336        new_vmr->vm_proc = old_vmr->vm_proc;
 337        new_vmr->vm_base = va;
 338        new_vmr->vm_end = old_vmr->vm_end;
 339        old_vmr->vm_end = va;
 340        new_vmr->vm_prot = old_vmr->vm_prot;
 341        new_vmr->vm_flags = old_vmr->vm_flags;
 342        if (vmr_has_file(old_vmr)) {
 343                foc_incref(old_vmr->__vm_foc);
 344                new_vmr->__vm_foc = old_vmr->__vm_foc;
 345                new_vmr->vm_foff = old_vmr->vm_foff +
 346                                      old_vmr->vm_end - old_vmr->vm_base;
 347                pm_add_vmr(vmr_to_pm(old_vmr), new_vmr);
 348        } else {
 349                new_vmr->__vm_foc = NULL;
 350                new_vmr->vm_foff = 0;
 351        }
 352        return new_vmr;
 353}
 354
 355/* Called by the unmapper, just cleans up.  Whoever calls this will need to sort
 356 * out the page table entries. */
 357static void destroy_vmr(struct vm_region *vmr)
 358{
 359        if (vmr_has_file(vmr)) {
 360                pm_remove_vmr(vmr_to_pm(vmr), vmr);
 361                foc_decref(vmr->__vm_foc);
 362        }
 363        TAILQ_REMOVE(&vmr->vm_proc->vm_regions, vmr, vm_link);
 364        vmr_free(vmr);
 365}
 366
 367/* Merges two vm regions.  For now, it will check to make sure they are the
 368 * same.  The second one will be destroyed. */
 369static int merge_vmr(struct vm_region *first, struct vm_region *second)
 370{
 371        assert(first->vm_proc == second->vm_proc);
 372        if ((first->vm_end != second->vm_base) ||
 373            (first->vm_prot != second->vm_prot) ||
 374            (first->vm_flags != second->vm_flags) ||
 375            (first->__vm_foc != second->__vm_foc))
 376                return -1;
 377        if (vmr_has_file(first) && (second->vm_foff != first->vm_foff +
 378                                    first->vm_end - first->vm_base))
 379                return -1;
 380        first->vm_end = second->vm_end;
 381        destroy_vmr(second);
 382        return 0;
 383}
 384
 385/* Attempts to merge vmr with adjacent VMRs, returning a ptr to be used for vmr.
 386 * It could be the same struct vmr, or possibly another one (usually lower in
  387 * the address space). */
 388static struct vm_region *merge_me(struct vm_region *vmr)
 389{
 390        struct vm_region *vmr_temp;
 391
 392        /* Merge will fail if it cannot do it.  If it succeeds, the second VMR
 393         * is destroyed, so we need to be a bit careful. */
 394        vmr_temp = TAILQ_PREV(vmr, vmr_tailq, vm_link);
 395        if (vmr_temp)
 396                if (!merge_vmr(vmr_temp, vmr))
 397                        vmr = vmr_temp;
 398        vmr_temp = TAILQ_NEXT(vmr, vm_link);
 399        if (vmr_temp)
 400                merge_vmr(vmr, vmr_temp);
 401        return vmr;
 402}
 403
 404/* Grows the vm region up to (and not including) va.  Fails if another is in the
 405 * way, etc. */
 406static int grow_vmr(struct vm_region *vmr, uintptr_t va)
 407{
 408        assert(!PGOFF(va));
 409        struct vm_region *next = TAILQ_NEXT(vmr, vm_link);
 410        if (next && next->vm_base < va)
 411                return -1;
 412        if (va <= vmr->vm_end)
 413                return -1;
 414        vmr->vm_end = va;
 415        return 0;
 416}
 417
 418/* Shrinks the vm region down to (and not including) va.  Whoever calls this
 419 * will need to sort out the page table entries. */
 420static int shrink_vmr(struct vm_region *vmr, uintptr_t va)
 421{
 422        assert(!PGOFF(va));
 423        if ((va < vmr->vm_base) || (va > vmr->vm_end))
 424                return -1;
 425        vmr->vm_end = va;
 426        return 0;
 427}
 428
 429/* Given a va and a proc (later an mm, possibly), returns the owning vmr, or 0
 430 * if there is none. */
 431static struct vm_region *find_vmr(struct proc *p, uintptr_t va)
 432{
 433        struct vm_region *vmr;
 434
  435        /* ugly linear search */
 436        TAILQ_FOREACH(vmr, &p->vm_regions, vm_link) {
 437                if ((vmr->vm_base <= va) && (vmr->vm_end > va))
 438                        return vmr;
 439        }
 440        return 0;
 441}
 442
 443/* Finds the first vmr after va (including the one holding va), or 0 if there is
 444 * none. */
 445static struct vm_region *find_first_vmr(struct proc *p, uintptr_t va)
 446{
 447        struct vm_region *vmr;
 448
  449        /* ugly linear search */
 450        TAILQ_FOREACH(vmr, &p->vm_regions, vm_link) {
 451                if ((vmr->vm_base <= va) && (vmr->vm_end > va))
 452                        return vmr;
 453                if (vmr->vm_base > va)
 454                        return vmr;
 455        }
 456        return 0;
 457}
 458
 459/* Makes sure that no VMRs cross either the start or end of the given region
  460 * [va, va + len), splitting any VMRs that straddle those endpoints. */
 461static void isolate_vmrs(struct proc *p, uintptr_t va, size_t len)
 462{
 463        struct vm_region *vmr;
 464        if ((vmr = find_vmr(p, va)))
 465                split_vmr(vmr, va);
 466        /* TODO: don't want to do another find (linear search) */
 467        if ((vmr = find_vmr(p, va + len)))
 468                split_vmr(vmr, va + len);
 469}
 470
 471void unmap_and_destroy_vmrs(struct proc *p)
 472{
 473        struct vm_region *vmr_i, *vmr_temp;
 474
 475        /* this only gets called from __proc_free, so there should be no sync
 476         * concerns.  still, better safe than sorry. */
 477        spin_lock(&p->vmr_lock);
 478        p->vmr_history++;
 479        spin_lock(&p->pte_lock);
 480        TAILQ_FOREACH(vmr_i, &p->vm_regions, vm_link) {
 481                /* note this CB sets the PTE = 0, regardless of if it was P or
 482                 * not */
 483                env_user_mem_walk(p, (void*)vmr_i->vm_base,
 484                                  vmr_i->vm_end - vmr_i->vm_base,
 485                                  __vmr_free_pgs, 0);
 486        }
 487        spin_unlock(&p->pte_lock);
 488        /* need the safe style, since destroy_vmr modifies the list.  also, we
 489         * want to do this outside the pte lock, since it grabs the pm lock. */
 490        TAILQ_FOREACH_SAFE(vmr_i, &p->vm_regions, vm_link, vmr_temp)
 491                destroy_vmr(vmr_i);
 492        spin_unlock(&p->vmr_lock);
 493}
 494
 495/* Helper: copies the contents of pages from p to new p.  For pages that aren't
 496 * present, once we support swapping or CoW, we can do something more
 497 * intelligent.  0 on success, -ERROR on failure.  Can't handle jumbos. */
 498static int copy_pages(struct proc *p, struct proc *new_p, uintptr_t va_start,
 499                      uintptr_t va_end)
 500{
 501        int ret;
 502
 503        /* Sanity checks.  If these fail, we had a screwed up VMR.
 504         * Check for: alignment, wraparound, or userspace addresses */
 505        if ((PGOFF(va_start)) ||
 506            (PGOFF(va_end)) ||
 507            (va_end < va_start) ||/* now, start > UMAPTOP -> end > UMAPTOP */
 508            (va_end > UMAPTOP)) {
 509                warn("VMR mapping is probably screwed up (%p - %p)", va_start,
 510                     va_end);
 511                return -EINVAL;
 512        }
 513        int copy_page(struct proc *p, pte_t pte, void *va, void *arg) {
 514                struct proc *new_p = (struct proc*)arg;
 515                struct page *pp;
 516
 517                if (pte_is_unmapped(pte))
 518                        return 0;
 519                /* pages could be !P, but right now that's only for file backed
 520                 * VMRs undergoing page removal, which isn't the caller of
 521                 * copy_pages. */
 522                if (pte_is_mapped(pte)) {
 523                        /* TODO: check for jumbos */
 524                        if (upage_alloc(new_p, &pp, 0))
 525                                return -ENOMEM;
 526                        memcpy(page2kva(pp), KADDR(pte_get_paddr(pte)), PGSIZE);
 527                        if (page_insert(new_p->env_pgdir, pp, va,
 528                                        pte_get_settings(pte))) {
 529                                page_decref(pp);
 530                                return -ENOMEM;
 531                        }
 532                } else if (pte_is_paged_out(pte)) {
 533                        /* TODO: (SWAP) will need to either make a copy or
 534                         * CoW/refcnt the backend store.  For now, this PTE will
 535                         * be the same as the original PTE */
 536                        panic("Swapping not supported!");
 537                } else {
 538                        panic("Weird PTE %p in %s!", pte_print(pte),
 539                              __FUNCTION__);
 540                }
 541                return 0;
 542        }
 543        spin_lock(&p->pte_lock);        /* walking and changing PTEs */
 544        ret = env_user_mem_walk(p, (void*)va_start, va_end - va_start,
 545                                &copy_page, new_p);
 546        spin_unlock(&p->pte_lock);
 547        return ret;
 548}
 549
 550static int fill_vmr(struct proc *p, struct proc *new_p, struct vm_region *vmr)
 551{
 552        int ret = 0;
 553
 554        if (!vmr_has_file(vmr) || (vmr->vm_flags & MAP_PRIVATE)) {
 555                /* We don't support ANON + SHARED yet */
 556                assert(!(vmr->vm_flags & MAP_SHARED));
 557                ret = copy_pages(p, new_p, vmr->vm_base, vmr->vm_end);
 558        } else {
 559                /* non-private file, i.e. page cacheable.  we have to honor
 560                 * MAP_LOCKED, (but we might be able to ignore MAP_POPULATE). */
 561                if (vmr->vm_flags & MAP_LOCKED) {
 562                        /* need to keep the file alive in case we unlock/block
 563                         */
 564                        foc_incref(vmr->__vm_foc);
 565                        /* math is a bit nasty if vm_base isn't page aligned */
 566                        assert(!PGOFF(vmr->vm_base));
 567                        ret = populate_pm_va(new_p, vmr->vm_base,
 568                                             (vmr->vm_end - vmr->vm_base) >>
 569                                                                       PGSHIFT,
 570                                             vmr->vm_prot, vmr_to_pm(vmr),
 571                                             vmr->vm_foff, vmr->vm_flags,
 572                                             vmr->vm_prot & PROT_EXEC);
 573                        foc_decref(vmr->__vm_foc);
 574                }
 575        }
 576        if (ret < 0) {
 577                /* Need to undo any mappings we left behind.  We don't know
 578                 * where we died, but we can just blast the entire region */
 579                spin_lock(&new_p->pte_lock);
 580                env_user_mem_walk(new_p, (void*)vmr->vm_base,
 581                                  vmr->vm_end - vmr->vm_base,
 582                                  __vmr_free_pgs, NULL);
 583                spin_unlock(&new_p->pte_lock);
 584        }
 585        return ret;
 586}
 587
 588/* This will make new_p have the same VMRs as p, and it will make sure all
 589 * physical pages are copied over, with the exception of MAP_SHARED files.
 590 * MAP_SHARED files that are also MAP_LOCKED will be attached to the process -
 591 * presumably they are in the page cache since the parent locked them.  This is
 592 * all pretty nasty.
 593 *
 594 * This is used by fork().
 595 *
 596 * Note that if you are working on a VMR that is a file, you'll want to be
 597 * careful about how it is mapped (SHARED, PRIVATE, etc). */
 598int duplicate_vmrs(struct proc *p, struct proc *new_p)
 599{
 600        int ret = 0;
 601        struct vm_region *vmr, *vm_i;
 602
 603        TAILQ_FOREACH(vm_i, &p->vm_regions, vm_link) {
 604                vmr = kmem_cache_alloc(vmr_kcache, 0);
 605                if (!vmr)
 606                        return -ENOMEM;
 607                vmr->vm_proc = new_p;
 608                vmr->vm_base = vm_i->vm_base;
 609                vmr->vm_end = vm_i->vm_end;
 610                vmr->vm_prot = vm_i->vm_prot;
 611                vmr->vm_flags = vm_i->vm_flags;
 612                vmr->__vm_foc = vm_i->__vm_foc;
 613                vmr->vm_foff = vm_i->vm_foff;
 614                if (vmr_has_file(vm_i)) {
 615                        foc_incref(vm_i->__vm_foc);
 616                        pm_add_vmr(vmr_to_pm(vm_i), vmr);
 617                }
 618                ret = fill_vmr(p, new_p, vmr);
 619                if (ret) {
 620                        if (vmr_has_file(vm_i)) {
 621                                pm_remove_vmr(vmr_to_pm(vm_i), vmr);
 622                                foc_decref(vm_i->__vm_foc);
 623                        }
 624                        vmr_free(vmr);
 625                        return ret;
 626                }
 627                TAILQ_INSERT_TAIL(&new_p->vm_regions, vmr, vm_link);
 628        }
 629        return 0;
 630}
 631
 632void print_vmrs(struct proc *p)
 633{
 634        int count = 0;
 635        struct vm_region *vmr;
 636
 637        print_lock();
 638        printk("VM Regions for proc %d\n", p->pid);
 639        printk("NR:"
 640               "                                     Range:"
 641               "       Prot,"
 642               "      Flags,"
 643               "               File,"
 644               "                Off\n");
 645        TAILQ_FOREACH(vmr, &p->vm_regions, vm_link)
 646                printk("%02d: (%p - %p): 0x%08x, 0x%08x, %p, %p\n", count++,
 647                       vmr->vm_base, vmr->vm_end, vmr->vm_prot, vmr->vm_flags,
 648                       foc_pointer(vmr->__vm_foc), vmr->vm_foff);
 649        print_unlock();
 650}
 651
 652void enumerate_vmrs(struct proc *p, void (*func)(struct vm_region *vmr,
 653                                                 void *opaque), void *opaque)
 654{
 655        struct vm_region *vmr;
 656
 657        spin_lock(&p->vmr_lock);
 658        TAILQ_FOREACH(vmr, &p->vm_regions, vm_link)
 659                func(vmr, opaque);
 660        spin_unlock(&p->vmr_lock);
 661}
 662
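/* Exactly one of MAP_PRIVATE or MAP_SHARED must be set. */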
 663static bool mmap_flags_priv_ok(int flags)
 664{
 665        return (flags & (MAP_PRIVATE | MAP_SHARED)) == MAP_PRIVATE ||
 666               (flags & (MAP_PRIVATE | MAP_SHARED)) == MAP_SHARED;
 667}
 668
 669static bool prot_is_valid(int prot)
 670{
 671        /* Remember PROT_NONE (0) is valid. */
 672        return !(prot & ~PROT_VALID_PROTS);
 673}
 674
 675static bool prot_has_access(int prot)
 676{
 677        return prot & (PROT_READ | PROT_WRITE | PROT_EXEC);
 678}
 679
 680/* Error values aren't quite comprehensive - check man mmap() once we do better
 681 * with the FS.
 682 *
 683 * The mmap call's offset is in units of PGSIZE (like Linux's mmap2()), but
 684 * internally, the offset is tracked in bytes.  The reason for the PGSIZE is for
 685 * 32bit apps to enumerate large files, but a full 64bit system won't need that.
 686 * We track things internally in bytes since that is how file pointers work, vmr
 687 * bases and ends, and similar math.  While it's not a hard change, there's no
 688 * need for it, and ideally we'll be a fully 64bit system before we deal with
 689 * files that large. */
 690void *mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
 691           int fd, size_t offset)
 692{
 693        struct file_or_chan *file = NULL;
 694        void *result;
 695
 696        offset <<= PGSHIFT;
 697        printd("mmap(addr %x, len %x, prot %x, flags %x, fd %x, off %x)\n",
 698               addr, len, prot, flags, fd, offset);
 699        if (!mmap_flags_priv_ok(flags)) {
 700                set_errno(EINVAL);
 701                return MAP_FAILED;
 702        }
 703        if (!prot_is_valid(prot)) {
 704                set_error(EINVAL, "invalid prot 0x%x (%x)", prot,
 705                          PROT_VALID_PROTS);
 706                return MAP_FAILED;
 707        }
 708        if (!len) {
 709                set_errno(EINVAL);
 710                return MAP_FAILED;
 711        }
 712        if (!(flags & MAP_ANON) && (fd >= 0)) {
 713                file = fd_to_foc(&p->open_files, fd);
 714                if (!file) {
 715                        set_errno(EBADF);
 716                        result = MAP_FAILED;
 717                        goto out_ref;
 718                }
 719        }
 720        /* Check for overflow.  This helps do_mmap and populate_va, among
 721         * others. */
 722        if (offset + len < offset) {
 723                set_errno(EINVAL);
 724                result = MAP_FAILED;
 725                goto out_ref;
 726        }
 727        /* If they don't care where to put it, we'll start looking after the
 728         * break.  We could just have userspace handle this (in glibc's mmap),
 729         * so we don't need to know about BRK_END, but this will work for now
 730         * (and may avoid bugs).  Note that this limits mmap(0) a bit.  Keep
 731         * this in sync with do_mmap()'s check.  (Both are necessary).  */
 732        if (addr == 0)
 733                addr = BRK_END;
 734        /* Still need to enforce this: */
 735        addr = MAX(addr, MMAP_LOWEST_VA);
 736        /* Need to check addr + len, after we do our addr adjustments */
 737        if (!__is_user_addr((void*)addr, len, UMAPTOP)) {
 738                set_errno(EINVAL);
 739                result = MAP_FAILED;
 740                goto out_ref;
 741        }
 742        if (PGOFF(addr)) {
 743                set_errno(EINVAL);
 744                result = MAP_FAILED;
 745                goto out_ref;
 746        }
 747        result = do_mmap(p, addr, len, prot, flags, file, offset);
 748out_ref:
 749        if (file)
 750                foc_decref(file);
 751        return result;
 752}
 753
 754/* Helper, maps in page at addr, but only if nothing is mapped there.  Returns
 755 * 0 on success.  Will take ownership of non-pagemap pages, including on error
 756 * cases.  This just means we free it on error, and notionally store it in the
 757 * PTE on success, which will get freed later.
 758 *
 759 * It's possible that a page has already been mapped here, in which case we'll
  760 * treat that as success.  So when we return 0, *a* page is mapped here, but not
 761 * necessarily the one you passed in. */
 762static int map_page_at_addr(struct proc *p, struct page *page, uintptr_t addr,
 763                            int pte_prot)
 764{
 765        pte_t pte;
 766
 767        spin_lock(&p->pte_lock);        /* walking and changing PTEs */
 768        /* find offending PTE (prob don't read this in).  This might alloc an
 769         * intermediate page table page. */
 770        pte = pgdir_walk(p->env_pgdir, (void*)addr, TRUE);
 771        if (!pte_walk_okay(pte)) {
 772                spin_unlock(&p->pte_lock);
 773                if (!page_is_pagemap(page))
 774                        page_decref(page);
 775                return -ENOMEM;
 776        }
 777        /* a spurious, valid PF is possible due to a legit race: the page might
 778         * have been faulted in by another core already (and raced on the memory
 779         * lock), in which case we should just return. */
 780        if (pte_is_present(pte)) {
 781                spin_unlock(&p->pte_lock);
 782                if (!page_is_pagemap(page))
 783                        page_decref(page);
 784                return 0;
 785        }
 786        /* I used to allow clobbering an old entry (contrary to the
 787         * documentation), but it's probably a sign of another bug. */
 788        assert(!pte_is_mapped(pte));
 789        /* preserve the dirty bit - pm removal could be looking concurrently */
 790        pte_prot |= (pte_is_dirty(pte) ? PTE_D : 0);
 791        /* We have a ref to page (for non PMs), which we are storing in the PTE
 792         */
 793        pte_write(pte, page2pa(page), pte_prot);
 794        spin_unlock(&p->pte_lock);
 795        return 0;
 796}
 797
 798/* Helper: copies *pp's contents to a new page, replacing your page pointer.  If
 799 * this succeeds, you'll have a non-PM page, which matters for how you put it.*/
 800static int __copy_and_swap_pmpg(struct proc *p, struct page **pp)
 801{
 802        struct page *new_page, *old_page = *pp;
 803
 804        if (upage_alloc(p, &new_page, FALSE))
 805                return -ENOMEM;
 806        memcpy(page2kva(new_page), page2kva(old_page), PGSIZE);
 807        pm_put_page(old_page);
 808        *pp = new_page;
 809        return 0;
 810}
 811
 812/* Hold the VMR lock when you call this - it'll assume the entire VA range is
 813 * mappable, which isn't true if there are concurrent changes to the VMRs. */
 814static int populate_anon_va(struct proc *p, uintptr_t va, unsigned long nr_pgs,
 815                            int pte_prot)
 816{
 817        struct page *page;
 818        int ret;
 819
 820        for (long i = 0; i < nr_pgs; i++) {
 821                if (upage_alloc(p, &page, TRUE))
 822                        return -ENOMEM;
 823                /* could imagine doing a memwalk instead of a for loop */
 824                ret = map_page_at_addr(p, page, va + i * PGSIZE, pte_prot);
 825                if (ret)
 826                        return ret;
 827        }
 828        return 0;
 829}
 830
 831/* This will periodically unlock the vmr lock. */
 832static int populate_pm_va(struct proc *p, uintptr_t va, unsigned long nr_pgs,
 833                          int pte_prot, struct page_map *pm, size_t offset,
 834                          int flags, bool exec)
 835{
 836        int ret = 0;
 837        unsigned long pm_idx0 = offset >> PGSHIFT;
 838        int vmr_history = ACCESS_ONCE(p->vmr_history);
 839        struct page *page;
 840
 841        /* This is a racy check - see the comments in fs_file.c.  Also, we're
 842         * not even attempting to populate the va, though we could do a partial
 843         * if necessary. */
 844        if (pm_idx0 + nr_pgs > nr_pages(fs_file_get_length(pm->pm_file)))
 845                return -ESPIPE;
 846        /* locking rules: start the loop holding the vmr lock, enter and exit
 847         * the entire func holding the lock. */
 848        for (long i = 0; i < nr_pgs; i++) {
 849                ret = pm_load_page_nowait(pm, pm_idx0 + i, &page);
 850                if (ret) {
 851                        if (ret != -EAGAIN)
 852                                break;
 853                        spin_unlock(&p->vmr_lock);
 854                        /* might block here, can't hold the spinlock */
 855                        ret = pm_load_page(pm, pm_idx0 + i, &page);
 856                        spin_lock(&p->vmr_lock);
 857                        if (ret)
 858                                break;
 859                        /* while we were sleeping, the VMRs could have changed
 860                         * on us. */
 861                        if (vmr_history != ACCESS_ONCE(p->vmr_history)) {
 862                                pm_put_page(page);
 863                                printk("[kernel] "
 864                                       "FYI: VMR changed during populate\n");
 865                                break;
 866                        }
 867                }
 868                if (flags & MAP_PRIVATE) {
 869                        ret = __copy_and_swap_pmpg(p, &page);
 870                        if (ret) {
 871                                pm_put_page(page);
 872                                break;
 873                        }
 874                }
 875                /* if this is an executable page, we might have to flush the
 876                 * instruction cache if our HW requires it.
 877                 * TODO: is this still needed?  andrew put this in a while ago*/
 878                if (exec)
 879                        icache_flush_page(0, page2kva(page));
 880                /* The page could be either in the PM, or a private, now-anon
 881                 * page. */
 882                ret = map_page_at_addr(p, page, va + i * PGSIZE, pte_prot);
 883                if (page_is_pagemap(page))
 884                        pm_put_page(page);
 885                if (ret)
 886                        break;
 887        }
 888        return ret;
 889}
 890
 891void *do_mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
 892              struct file_or_chan *file, size_t offset)
 893{
 894        len = ROUNDUP(len, PGSIZE);
 895        struct vm_region *vmr, *vmr_temp;
 896
 897        assert(mmap_flags_priv_ok(flags));
 898        assert(prot_is_valid(prot));
 899
 900        vmr = vmr_zalloc();
 901
 902        /* Sanity check, for callers that bypass mmap().  We want addr for anon
 903         * memory to start above the break limit (BRK_END), but not 0.  Keep
 904         * this in sync with BRK_END in mmap(). */
 905        if (addr == 0)
 906                addr = BRK_END;
 907        assert(!PGOFF(offset));
 908        /* MCPs will need their code and data pinned.  This check will start to
 909         * fail after uthread_slim_init(), at which point userspace should have
 910         * enough control over its mmaps (i.e. no longer done by LD or load_elf)
 911         * that it can ask for pinned and populated pages.  Except for
 912         * dl_opens(). */
 913        struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[0];
 914
 915        if (file && (atomic_read(&vcpd->flags) & VC_SCP_NOVCCTX))
 916                flags |= MAP_POPULATE | MAP_LOCKED;
 917        vmr->vm_prot = prot;
 918        vmr->vm_foff = offset;
 919        vmr->vm_flags = flags & MAP_PERSIST_FLAGS;
 920        /* We grab the file early, so we can block.  This is all hokey.  The VMR
 921         * isn't ready yet, so the PM code will ignore it. */
 922        if (file) {
 923                /* Prep the FS and make sure it can mmap the file.  The
 924                 * device/FS checks perms, and does whatever else it needs to
 925                 * make the mmap work. */
 926                if (foc_dev_mmap(file, vmr, prot, flags & MAP_PERSIST_FLAGS)) {
 927                        vmr_free(vmr);
 928                        set_errno(EACCES);      /* not quite */
 929                        return MAP_FAILED;
 930                }
 931                /* TODO: push the PM stuff into the chan/fs_file. */
 932                pm_add_vmr(foc_to_pm(file), vmr);
 933                foc_incref(file);
 934                vmr->__vm_foc = file;
 935                /* TODO: consider locking the file while checking (not as
  936                 * mandatory as in handle_page_fault()) */
 937                if (nr_pages(offset + len) > nr_pages(foc_get_len(file))) {
 938                        /* We're allowing them to set up the VMR, though if they
 939                         * attempt to fault in any pages beyond the file's
 940                         * limit, they'll fail.  Since they might not access the
 941                         * region, we need to make sure POPULATE is off.  FYI,
 942                         * 64 bit glibc shared libs map in an extra 2MB of
 943                         * unaligned space between their RO and RW sections, but
 944                         * then immediately mprotect it to PROT_NONE. */
 945                        flags &= ~MAP_POPULATE;
 946                }
 947        }
 948        /* read/write vmr lock (will change the tree) */
 949        spin_lock(&p->vmr_lock);
 950        p->vmr_history++;
 951        /* Need to make sure nothing is in our way when we want a FIXED
 952         * location.  We just need to split on the end points (if they exist),
 953         * and then remove everything in between.  __do_munmap() will do this.
 954         * Careful, this means an mmap can be an implied munmap() (not my
 955         * call...). */
 956        if (flags & MAP_FIXED)
 957                __do_munmap(p, addr, len);
 958        if (!vmr_insert(vmr, p, addr, len)) {
 959                spin_unlock(&p->vmr_lock);
 960                if (vmr_has_file(vmr)) {
 961                        pm_remove_vmr(vmr_to_pm(vmr), vmr);
 962                        foc_decref(vmr->__vm_foc);
 963                }
 964                vmr_free(vmr);
 965                set_error(ENOMEM, "probably tried to mmap beyond UMAPTOP");
 966                /* Slightly weird semantics: if we fail and had munmapped the
 967                 * space, they will have a hole in their VM now. */
 968                return MAP_FAILED;
 969        }
 970        addr = vmr->vm_base;
 971        vmr->vm_ready = true;
 972
 973        vmr = merge_me(vmr);            /* attempts to merge with neighbors */
 974
 975        if (flags & MAP_POPULATE && prot_has_access(prot)) {
 976                int pte_prot = (prot & PROT_WRITE) ? PTE_USER_RW :
 977                           (prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : 0;
 978                unsigned long nr_pgs = len >> PGSHIFT;
 979                int ret = 0;
 980                if (!file) {
 981                        ret = populate_anon_va(p, addr, nr_pgs, pte_prot);
 982                } else {
 983                        /* Note: this will unlock if it blocks.  our refcnt on
 984                         * the file keeps the pm alive when we unlock */
 985                        ret = populate_pm_va(p, addr, nr_pgs, pte_prot,
 986                                             foc_to_pm(file), offset, flags,
 987                                             prot & PROT_EXEC);
 988                }
 989                if (ret == -ENOMEM) {
 990                        spin_unlock(&p->vmr_lock);
 991                        printk("[kernel] ENOMEM, killing %d\n", p->pid);
 992                        proc_destroy(p);
 993                        /* this will never make it back to userspace */
 994                        return MAP_FAILED;
 995                }
 996        }
 997        spin_unlock(&p->vmr_lock);
 998
 999        profiler_notify_mmap(p, addr, len, prot, flags, file, offset);
1000
1001        return (void*)addr;
1002}
1003
1004int mprotect(struct proc *p, uintptr_t addr, size_t len, int prot)
1005{
1006        int ret;
1007
1008        printd("mprotect: (addr %p, len %p, prot 0x%x)\n", addr, len, prot);
1009        if (!prot_is_valid(prot)) {
1010                set_error(EINVAL, "invalid prot 0x%x (%x)", prot,
1011                          PROT_VALID_PROTS);
1012                return -1;
1013        }
1014        if (!len)
1015                return 0;
1016        len = ROUNDUP(len, PGSIZE);
1017        if (PGOFF(addr)) {
1018                set_errno(EINVAL);
1019                return -1;
1020        }
1021        if (!__is_user_addr((void*)addr, len, UMAPTOP)) {
1022                set_errno(ENOMEM);
1023                return -1;
1024        }
1025        /* read/write lock, will probably change the tree and settings */
1026        spin_lock(&p->vmr_lock);
1027        p->vmr_history++;
1028        ret = __do_mprotect(p, addr, len, prot);
1029        spin_unlock(&p->vmr_lock);
1030        return ret;
1031}
1032
1033/* This does not care if the region is not mapped.  POSIX says you should return
1034 * ENOMEM if any part of it is unmapped.  Can do this later if we care, based on
1035 * the VMRs, not the actual page residency. */
1036int __do_mprotect(struct proc *p, uintptr_t addr, size_t len, int prot)
1037{
1038        struct vm_region *vmr, *next_vmr;
1039        pte_t pte;
1040        bool shootdown_needed = FALSE;
1041        bool file_access_failure = FALSE;
1042        int pte_prot = (prot & PROT_WRITE) ? PTE_USER_RW :
1043                       (prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : PTE_NONE;
1044
1045        assert(prot_is_valid(prot));
1046        /* TODO: this is aggressively splitting, when we might not need to if
1047         * the prots are the same as the previous.  Plus, there are three
1048         * excessive scans. */
1049        isolate_vmrs(p, addr, len);
1050        vmr = find_first_vmr(p, addr);
1051        while (vmr && vmr->vm_base < addr + len) {
1052                if (vmr->vm_prot == prot)
1053                        goto next_vmr;
1054                if (vmr_has_file(vmr) &&
1055                    !check_foc_perms(vmr, vmr->__vm_foc, prot)) {
1056                        file_access_failure = TRUE;
1057                        goto next_vmr;
1058                }
1059                vmr->vm_prot = prot;
1060                spin_lock(&p->pte_lock);        /* walking and changing PTEs */
1061                /* TODO: use a memwalk.  At a minimum, we need to change every
1062                 * existing PTE that won't trigger a PF (meaning, present PTEs)
1063                 * to have the new prot.  The others will fault on access, and
1064                 * we'll change the PTE then.  In the off chance we have a
1065                 * mapped but not present PTE, we might as well change it too,
1066                 * since we're already here. */
1067                for (uintptr_t va = vmr->vm_base; va < vmr->vm_end;
1068                     va += PGSIZE) {
1069                        pte = pgdir_walk(p->env_pgdir, (void*)va, 0);
1070                        if (pte_walk_okay(pte) && pte_is_mapped(pte)) {
1071                                pte_replace_perm(pte, pte_prot);
1072                                shootdown_needed = TRUE;
1073                        }
1074                }
1075                spin_unlock(&p->pte_lock);
1076next_vmr:
1077                /* Note that this merger could cause us to not look at the next
1078                 * one, since we merged with it.  That's ok, since in that case,
1079                 * the next one already has the right prots.  Also note that
1080                 * every VMR in the region, including the ones at the endpoints,
1081                 * attempted to merge left and right. */
1082                vmr = merge_me(vmr);
1083                next_vmr = TAILQ_NEXT(vmr, vm_link);
1084                vmr = next_vmr;
1085        }
1086        if (shootdown_needed)
1087                proc_tlbshootdown(p, addr, addr + len);
1088        if (file_access_failure) {
1089                set_errno(EACCES);
1090                return -1;
1091        }
1092        return 0;
1093}
1094
1095int munmap(struct proc *p, uintptr_t addr, size_t len)
1096{
1097        int ret;
1098
1099        printd("munmap(addr %x, len %x)\n", addr, len);
1100        if (!len)
1101                return 0;
1102        len = ROUNDUP(len, PGSIZE);
1103        if (PGOFF(addr)) {
1104                set_errno(EINVAL);
1105                return -1;
1106        }
1107        if (!__is_user_addr((void*)addr, len, UMAPTOP)) {
1108                set_errno(EINVAL);
1109                return -1;
1110        }
1111        /* read/write: changing the vmrs (trees, properties, and whatnot) */
1112        spin_lock(&p->vmr_lock);
1113        p->vmr_history++;
1114        ret = __do_munmap(p, addr, len);
1115        spin_unlock(&p->vmr_lock);
1116        return ret;
1117}
1118
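/* Mem-walk callback for __do_munmap: for each present PTE, syncs the hardware
 * dirty bit into the struct page, clears PTE_P (leaving the PPN so the page
 * can be freed in a later pass), and notes that a TLB shootdown is needed. */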
1119static int __munmap_pte(struct proc *p, pte_t pte, void *va, void *arg)
1120{
1121        bool *shootdown_needed = (bool*)arg;
1122        struct page *page;
1123
1124        /* could put in some checks here for !P and also !0 */
1125        if (!pte_is_present(pte)) /* unmapped (== 0) *ptes are also not PTE_P */
1126                return 0;
1127        if (pte_is_dirty(pte)) {
1128                page = pa2page(pte_get_paddr(pte));
1129                atomic_or(&page->pg_flags, PG_DIRTY);
1130        }
1131        pte_clear_present(pte);
1132        *shootdown_needed = TRUE;
1133        return 0;
1134}
1135
1136/* If our page is actually in the PM, we don't do anything.  All a page map
1137 * really needs is for our VMR to no longer track it (vmr being in the pm's
1138 * list) and to not point at its pages (mark it 0, dude).
1139 *
1140 * But private mappings mess with that a bit.  Luckily, we can tell by looking
1141 * at a page whether the specific page is in the PM or not.  If it isn't, we
1142 * still need to free our "VMR local" copy.
1143 *
1144 * For pages in a PM, we're racing with PM removers.  Both of us sync with the
1145 * mm lock, so once we hold the lock, it's a matter of whether or not the PTE is
1146 * 0 or not.  If it isn't, then we're still okay to look at the page.  Consider
1147 * the PTE a weak ref on the page.  So long as you hold the mm lock, you can
1148 * look at the PTE and know the page isn't being freed. */
1149static int __vmr_free_pgs(struct proc *p, pte_t pte, void *va, void *arg)
1150{
1151        struct page *page;
1152        if (pte_is_unmapped(pte))
1153                return 0;
1154        page = pa2page(pte_get_paddr(pte));
1155        pte_clear(pte);
1156        if (!page_is_pagemap(page))
1157                page_decref(page);
1158        return 0;
1159}
1160
1161int __do_munmap(struct proc *p, uintptr_t addr, size_t len)
1162{
1163        struct vm_region *vmr, *next_vmr, *first_vmr;
1164        bool shootdown_needed = FALSE;
1165
1166        /* TODO: this will be a bit slow, since we end up doing three linear
1167         * searches (two in isolate, one in find_first). */
1168        isolate_vmrs(p, addr, len);
1169        first_vmr = find_first_vmr(p, addr);
1170        vmr = first_vmr;
1171        spin_lock(&p->pte_lock);        /* changing PTEs */
1172        while (vmr && vmr->vm_base < addr + len) {
1173                /* It's important that we call __munmap_pte and sync the
1174                 * PG_DIRTY bit before we unhook the VMR from the PM (in
1175                 * destroy_vmr). */
1176                env_user_mem_walk(p, (void*)vmr->vm_base,
1177                                  vmr->vm_end - vmr->vm_base, __munmap_pte,
1178                                  &shootdown_needed);
1179                vmr = TAILQ_NEXT(vmr, vm_link);
1180        }
1181        spin_unlock(&p->pte_lock);
 1182        /* we haven't freed the pages yet; still using the PTEs to store
 1183         * them.  There should be no races with inserts/faults, since we still
1184         * hold the mm lock since the previous CB. */
1185        if (shootdown_needed)
1186                proc_tlbshootdown(p, addr, addr + len);
1187        vmr = first_vmr;
1188        while (vmr && vmr->vm_base < addr + len) {
1189                /* there is rarely more than one VMR in this loop.  o/w, we'll
1190                 * need to gather up the vmrs and destroy outside the pte_lock.
1191                 */
1192                spin_lock(&p->pte_lock);        /* changing PTEs */
1193                env_user_mem_walk(p, (void*)vmr->vm_base,
1194                                  vmr->vm_end - vmr->vm_base, __vmr_free_pgs,
1195                                  0);
1196                spin_unlock(&p->pte_lock);
1197                next_vmr = TAILQ_NEXT(vmr, vm_link);
1198                destroy_vmr(vmr);
1199                vmr = next_vmr;
1200        }
1201        return 0;
1202}
1203
1204/* Helper - drop the page differently based on where it is from */
1205static void __put_page(struct page *page)
1206{
1207        if (page_is_pagemap(page))
1208                pm_put_page(page);
1209        else
1210                page_decref(page);
1211}
1212
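/* Blocking helper for __hpf: for an SCP, saves its context, marks it WAITING,
 * and (the first time through) gives up core ownership while we do the
 * possibly-blocking page load, waking the proc afterwards.  MCPs get -EAGAIN,
 * which is reflected back to userspace. */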
1213static int __hpf_load_page(struct proc *p, struct page_map *pm,
1214                           unsigned long idx, struct page **page, bool first)
1215{
1216        int ret = 0;
1217        int coreid = core_id();
1218        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
1219        bool wake_scp = FALSE;
1220        spin_lock(&p->proc_lock);
1221        switch (p->state) {
1222        case (PROC_RUNNING_S):
1223                wake_scp = TRUE;
1224                __proc_set_state(p, PROC_WAITING);
1225                /* it's possible for HPF to loop a few times; we can only save
1226                 * the first time, o/w we could clobber. */
1227                if (first) {
1228                        __proc_save_context_s(p);
1229                        __proc_save_fpu_s(p);
1230                        /* We clear the owner, since userspace doesn't run here
1231                         * anymore, but we won't abandon since the fault handler
1232                         * still runs in our process. */
1233                        clear_owning_proc(coreid);
1234                }
1235                /* other notes: we don't currently need to tell the ksched
1236                 * we switched from running to waiting, though we probably
1237                 * will later for more generic scheds. */
1238                break;
1239        case (PROC_RUNNABLE_M):
1240        case (PROC_RUNNING_M):
1241                spin_unlock(&p->proc_lock);
1242                return -EAGAIN; /* will get reflected back to userspace */
1243        case (PROC_DYING):
1244        case (PROC_DYING_ABORT):
1245                spin_unlock(&p->proc_lock);
1246                return -EINVAL;
1247        default:
1248                /* shouldn't have any waitings, under the current yield style.
1249                 * if this becomes an issue, we can branch on is_mcp(). */
 1250                printk("HPF unexpected state (%s)", procstate2str(p->state));
1251                spin_unlock(&p->proc_lock);
1252                return -EINVAL;
1253        }
1254        spin_unlock(&p->proc_lock);
1255        ret = pm_load_page(pm, idx, page);
1256        if (wake_scp)
1257                proc_wakeup(p);
1258        if (ret) {
1259                printk("load failed with ret %d\n", ret);
1260                return ret;
1261        }
1262        /* need to put our old ref, next time around HPF will get another. */
1263        pm_put_page(*page);
1264        return 0;
1265}
1266
1267/* Returns 0 on success, or an appropriate -error code.
1268 *
1269 * Notes: if your TLB caches negative results, you'll need to flush the
1270 * appropriate tlb entry.  Also, you could have a weird race where a present PTE
1271 * faulted for a different reason (was mprotected on another core), and the
1272 * shootdown is on its way.  Userspace should have waited for the mprotect to
1273 * return before trying to write (or whatever), so we don't care and will fault
1274 * them. */
1275static int __hpf(struct proc *p, uintptr_t va, int prot, bool file_ok)
1276{
1277        struct vm_region *vmr;
1278        struct file_or_chan *file;
1279        struct page *a_page;
1280        unsigned int f_idx;     /* index of the missing page in the file */
1281        int ret = 0;
1282        bool first = TRUE;
 1283        va = ROUNDDOWN(va, PGSIZE);
1284
1285refault:
1286        /* read access to the VMRs TODO: RCU */
1287        spin_lock(&p->vmr_lock);
1288        /* Check the vmr's protection */
1289        vmr = find_vmr(p, va);
1290        if (!vmr) {                     /* not mapped at all */
1291                printd("fault: %p not mapped\n", va);
1292                ret = -EFAULT;
1293                goto out;
1294        }
1295        if (!(vmr->vm_prot & prot)) {   /* wrong prots for this vmr */
1296                ret = -EPERM;
1297                goto out;
1298        }
1299        if (!vmr_has_file(vmr)) {
1300                /* No file - just want anonymous memory */
1301                if (upage_alloc(p, &a_page, TRUE)) {
1302                        ret = -ENOMEM;
1303                        goto out;
1304                }
1305        } else {
1306                if (!file_ok) {
1307                        ret = -EACCES;
1308                        goto out;
1309                }
1310                file = vmr->__vm_foc;
1311                /* If this fails, either something got screwed up with the VMR,
1312                 * or the permissions changed after mmap/mprotect.  Either way,
1313                 * I want to know (though it's not critical). */
1314                if (!check_foc_perms(vmr, file, prot))
1315                        printk("[kernel] "
1316                               "possible issue with VMR prots on file %s!\n",
1317                               foc_to_name(file));
1318                /* Load the file's page in the page cache.
1319                 * TODO: (BLK) Note, we are holding the mem lock!  We need to
 1320                 * rewrite this stuff so we aren't holding the lock as excessively
1321                 * as we are, and such that we can block and resume later. */
1322                assert(!PGOFF(va - vmr->vm_base + vmr->vm_foff));
1323                f_idx = (va - vmr->vm_base + vmr->vm_foff) >> PGSHIFT;
1324                /* This is a racy check - see the comments in fs_file.c */
1325                if (f_idx + 1 > nr_pages(foc_get_len(file))) {
1326                        ret = -ESPIPE; /* linux sends a SIGBUS at access time */
1327                        goto out;
1328                }
1329                ret = pm_load_page_nowait(foc_to_pm(file), f_idx, &a_page);
1330                if (ret) {
1331                        if (ret != -EAGAIN)
1332                                goto out;
1333                        /* keep the file alive after we unlock */
1334                        foc_incref(file);
1335                        spin_unlock(&p->vmr_lock);
1336                        ret = __hpf_load_page(p, foc_to_pm(file), f_idx,
1337                                              &a_page, first);
1338                        first = FALSE;
1339                        foc_decref(file);
1340                        if (ret)
1341                                return ret;
1342                        goto refault;
1343                }
1344                /* If we want a private map, we'll preemptively give you a new
1345                 * page.  We used to just care if it was private and writable,
1346                 * but we ran into issues with libc changing its mapping
1347                 * (map private, then mprotect to writable...)  In the future,
1348                 * we want to CoW this anyway, so it's not a big deal. */
1349                if ((vmr->vm_flags & MAP_PRIVATE)) {
1350                        ret = __copy_and_swap_pmpg(p, &a_page);
1351                        if (ret)
1352                                goto out_put_pg;
1353                }
1354                /* if this is an executable page, we might have to flush the
1355                 * instruction cache if our HW requires it. */
1356                if (vmr->vm_prot & PROT_EXEC)
1357                        icache_flush_page((void*)va, page2kva(a_page));
1358        }
1359        /* Update the page table.  TODO: careful with MAP_PRIVATE etc.; we
1360         * might do this separately (file vs. no file). */
1361        int pte_prot = (vmr->vm_prot & PROT_WRITE) ? PTE_USER_RW :
1362                       (vmr->vm_prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : 0;
1363        ret = map_page_at_addr(p, a_page, va, pte_prot);
1364        /* fall through, even for errors */
1365out_put_pg:
1366        /* the VMR's existence in the PM (via the mmap) allows the PTE to
1367         * point to a_page without it magically being reallocated.  For non-PM
1368         * memory (anon memory or private pages) we transferred the ref to the
1369         * PTE. */
1370        if (page_is_pagemap(a_page))
1371                pm_put_page(a_page);
1372out:
1373        spin_unlock(&p->vmr_lock);
1374        return ret;
1375}
1376
1377int handle_page_fault(struct proc *p, uintptr_t va, int prot)
1378{
1379        return __hpf(p, va, prot, TRUE);
1380}
1381
1382int handle_page_fault_nofile(struct proc *p, uintptr_t va, int prot)
1383{
1384        return __hpf(p, va, prot, FALSE);
1385}
1386
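/* Example (sketch, not part of the original file): how an arch-level fault
 * handler might feed a user page fault into the HPF path above.  The
 * 'was_write' / 'was_exec' flags are assumed to be decoded from arch-specific
 * fault info; a caller that must not block on file-backed I/O would use
 * handle_page_fault_nofile() instead.  Illustration only, not wired into any
 * caller. */
static int example_user_fault(struct proc *p, uintptr_t fault_va,
                              bool was_write, bool was_exec)
{
        int prot = was_exec ? PROT_EXEC : was_write ? PROT_WRITE : PROT_READ;
        int err = handle_page_fault(p, fault_va, prot);

        if (err)
                printk("[kernel] unhandled user fault at %p (%d)\n",
                       (void*)fault_va, err);
        return err;
}
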
1387/* Attempts to populate the pages, as if there were page faults.  Bails on
1388 * errors, and returns the number of pages populated.  */
1389unsigned long populate_va(struct proc *p, uintptr_t va, unsigned long nr_pgs)
1390{
1391        struct vm_region *vmr, vmr_copy;
1392        struct file_or_chan *file;
1393        unsigned long nr_pgs_this_vmr;
1394        unsigned long nr_filled = 0;
1395        struct page *page;
1396        int pte_prot;
1397        int ret;
1398
1399        /* we can screw around with ways to limit the find_vmr calls (can do the
1400         * next in line if we didn't unlock, etc.), but I don't expect us to do
1401         * this for more than a single VMR in most cases. */
1402        spin_lock(&p->vmr_lock);
1403        while (nr_pgs) {
1404                vmr = find_vmr(p, va);
1405                if (!vmr)
1406                        break;
1407                if (!prot_has_access(vmr->vm_prot))
1408                        break;
1409                pte_prot = (vmr->vm_prot & PROT_WRITE) ? PTE_USER_RW :
1410                           (vmr->vm_prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO
1411                                                                  : 0;
1412                nr_pgs_this_vmr = MIN(nr_pgs, (vmr->vm_end - va) >> PGSHIFT);
1413                if (!vmr_has_file(vmr)) {
1414                        if (populate_anon_va(p, va, nr_pgs_this_vmr, pte_prot))
1415                        {
1416                                /* on any error, we can just bail.  we might be
1417                                 * underestimating nr_filled. */
1418                                break;
1419                        }
1420                } else {
1421                        file = vmr->__vm_foc;
1422                        /* need to keep the file alive in case we unlock/block
1423                         */
1424                        foc_incref(file);
1425                        /* Regarding foff + (va - base): va - base < len, and
1426                         * foff + len does not overflow. */
1427                        ret = populate_pm_va(p, va, nr_pgs_this_vmr, pte_prot,
1428                                             foc_to_pm(file),
1429                                             vmr->vm_foff + (va - vmr->vm_base),
1430                                             vmr->vm_flags,
1431                                             vmr->vm_prot & PROT_EXEC);
1432                        foc_decref(file);
1433                        if (ret) {
1434                                /* we might have failed if the underlying file
1435                                 * doesn't cover the mmap window, depending on
1436                                 * how we'll deal with truncation. */
1437                                break;
1438                        }
1439                }
1440                nr_filled += nr_pgs_this_vmr;
1441                va += nr_pgs_this_vmr << PGSHIFT;
1442                nr_pgs -= nr_pgs_this_vmr;
1443        }
1444        spin_unlock(&p->vmr_lock);
1445        return nr_filled;
1446}
1447
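/* Example (sketch, not part of the original file): eagerly faulting in a
 * freshly mmapped region, roughly what a MAP_POPULATE-style caller would do.
 * 'addr' and 'len' are assumed to be page-aligned, as do_mmap() would hand
 * back.  Illustration only, not wired into any caller. */
static void example_populate_region(struct proc *p, uintptr_t addr, size_t len)
{
        unsigned long nr_pgs = len >> PGSHIFT;
        unsigned long nr_filled = populate_va(p, addr, nr_pgs);

        if (nr_filled != nr_pgs)
                printd("only populated %lu/%lu pages at %p\n", nr_filled,
                       nr_pgs, (void*)addr);
}
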
1448/* Kernel Dynamic Memory Mappings */
1449
1450static struct arena *vmap_addr_arena;
1451struct arena *vmap_arena;
1452static spinlock_t vmap_lock = SPINLOCK_INITIALIZER;
1453struct vmap_free_tracker {
1454        void                            *addr;
1455        size_t                          nr_bytes;
1456};
1457static struct vmap_free_tracker *vmap_to_free;
1458static size_t vmap_nr_to_free;
1459/* This value tunes the ratio of global TLB shootdowns to __vmap_free()s. */
1460#define VMAP_MAX_TO_FREE 1000
1461
1462/* We don't immediately return the addrs to their source (vmap_addr_arena).
1463 * Instead, we hold on to them until we have a suitable amount, then free them
1464 * in a batch.  This amortizes the cost of the global TLB shootdown.  We can
1465 * explore other tricks in the future too (like RCU for a certain index in the
1466 * vmap_to_free array). */
1467static void __vmap_free(struct arena *source, void *obj, size_t size)
1468{
1469        struct vmap_free_tracker *vft;
1470
1471        spin_lock(&vmap_lock);
1472        /* All objs get *unmapped* immediately, but we shoot down later.  Note
1473         * that it is OK (but slightly dangerous) for the kernel to reuse the
1474         * paddrs pointed to by the vaddrs before a TLB shootdown. */
1475        unmap_segment(boot_pgdir, (uintptr_t)obj, size);
1476        if (vmap_nr_to_free < VMAP_MAX_TO_FREE) {
1477                vft = &vmap_to_free[vmap_nr_to_free++];
1478                vft->addr = obj;
1479                vft->nr_bytes = size;
1480                spin_unlock(&vmap_lock);
1481                return;
1482        }
1483        tlb_shootdown_global();
1484        for (size_t i = 0; i < vmap_nr_to_free; i++) {
1485                vft = &vmap_to_free[i];
1486                arena_free(source, vft->addr, vft->nr_bytes);
1487        }
1488        /* don't forget to free the one passed in */
1489        arena_free(source, obj, size);
1490        vmap_nr_to_free = 0;
1491        spin_unlock(&vmap_lock);
1492}
1493
1494void vmap_init(void)
1495{
1496        vmap_addr_arena = arena_create("vmap_addr", (void*)KERN_DYN_BOT,
1497                                       KERN_DYN_TOP - KERN_DYN_BOT,
1498                                       PGSIZE, NULL, NULL, NULL, 0, MEM_WAIT);
1499        vmap_arena = arena_create("vmap", NULL, 0, PGSIZE, arena_alloc,
1500                                  __vmap_free, vmap_addr_arena, 0, MEM_WAIT);
1501        vmap_to_free = kmalloc(sizeof(struct vmap_free_tracker)
1502                               * VMAP_MAX_TO_FREE, MEM_WAIT);
1503        /* This ensures the boot_pgdir's top-most PML (PML4) has entries
1504         * pointing to PML3s that cover the dynamic mapping range.  Now, it's
1505         * safe to create processes that copy from boot_pgdir and still
1506         * dynamically change the kernel mappings. */
1507        arch_add_intermediate_pts(boot_pgdir, KERN_DYN_BOT,
1508                                  KERN_DYN_TOP - KERN_DYN_BOT);
1509}
1510
1511uintptr_t get_vmap_segment(size_t nr_bytes)
1512{
1513        uintptr_t ret;
1514
1515        ret = (uintptr_t)arena_alloc(vmap_arena, nr_bytes, MEM_ATOMIC);
1516        assert(ret);
1517        return ret;
1518}
1519
1520void put_vmap_segment(uintptr_t vaddr, size_t nr_bytes)
1521{
1522        arena_free(vmap_arena, (void*)vaddr, nr_bytes);
1523}
1524
1525/* Map a virtual address chunk to physical addresses.  Make sure you got a vmap
1526 * segment before actually trying to do the mapping.
1527 *
1528 * Careful with more than one 'page', since it will assume your physical pages
1529 * are also contiguous.  Most callers will only use one page.
1530 *
1531 * Finally, note that this does not care whether or not there are real pages
1532 * being mapped, and will not attempt to incref your page (if there is such a
1533 * thing).  Handle your own refcnting for pages. */
1534int map_vmap_segment(uintptr_t vaddr, uintptr_t paddr, unsigned long num_pages,
1535                     int perm)
1536{
1537#ifdef CONFIG_X86
1538        perm |= PTE_G;
1539#endif
1540        spin_lock(&vmap_lock);
1541        map_segment(boot_pgdir, vaddr, num_pages * PGSIZE, paddr, perm,
1542                    arch_max_jumbo_page_shift());
1543        spin_unlock(&vmap_lock);
1544        return 0;
1545}
1546
1547/* This can handle unaligned paddrs */
1548static uintptr_t vmap_pmem_flags(uintptr_t paddr, size_t nr_bytes, int flags)
1549{
1550        uintptr_t vaddr;
1551        unsigned long nr_pages;
1552
1553        assert(nr_bytes && paddr);
1554        nr_bytes += PGOFF(paddr);
1555        nr_pages = ROUNDUP(nr_bytes, PGSIZE) >> PGSHIFT;
1556        vaddr = get_vmap_segment(nr_bytes);
1557        if (!vaddr) {
1558                warn("Unable to get a vmap segment");   /* probably a bug */
1559                return 0;
1560        }
1561        /* it's not strictly necessary to drop paddr's pgoff, but it might save
1562         * some vmap heartache in the future. */
1563        if (map_vmap_segment(vaddr, PG_ADDR(paddr), nr_pages,
1564                             PTE_KERN_RW | flags)) {
1565                warn("Unable to map a vmap segment");   /* probably a bug */
1566                return 0;
1567        }
1568        return vaddr + PGOFF(paddr);
1569}
1570
1571uintptr_t vmap_pmem(uintptr_t paddr, size_t nr_bytes)
1572{
1573        return vmap_pmem_flags(paddr, nr_bytes, 0);
1574}
1575
1576uintptr_t vmap_pmem_nocache(uintptr_t paddr, size_t nr_bytes)
1577{
1578        return vmap_pmem_flags(paddr, nr_bytes, PTE_NOCACHE);
1579}
1580
1581uintptr_t vmap_pmem_writecomb(uintptr_t paddr, size_t nr_bytes)
1582{
1583        return vmap_pmem_flags(paddr, nr_bytes, PTE_WRITECOMB);
1584}
1585
1586int vunmap_vmem(uintptr_t vaddr, size_t nr_bytes)
1587{
1588        nr_bytes += PGOFF(vaddr);
1589        put_vmap_segment(PG_ADDR(vaddr), nr_bytes);
1590        return 0;
1591}
1592
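/* Example (sketch, not part of the original file): a driver-style caller that
 * maps a device's MMIO window uncached, reads one 32-bit register, and tears
 * the mapping back down.  'mmio_paddr', 'mmio_len', and 'reg_off' are
 * hypothetical stand-ins for real device resources.  Illustration only. */
static uint32_t example_read_mmio_reg(uintptr_t mmio_paddr, size_t mmio_len,
                                      size_t reg_off)
{
        uintptr_t vaddr = vmap_pmem_nocache(mmio_paddr, mmio_len);
        uint32_t val;

        if (!vaddr)
                return 0;
        val = *(volatile uint32_t*)(vaddr + reg_off);
        vunmap_vmem(vaddr, mmio_len);
        return val;
}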