akaros/kern/src/mm.c
   1/* Copyright (c) 2009, 2010 The Regents of the University of California
   2 * Barret Rhoden <brho@cs.berkeley.edu>
   3 * See LICENSE for details.
   4 *
   5 * Virtual memory management functions.  Creation, modification, etc, of virtual
   6 * memory regions (VMRs) as well as mmap(), mprotect(), and munmap().
   7 *
   8 * In general, error checking / bounds checks are done in the main function
   9 * (e.g. mmap()), and the work is done in a do_ function (e.g. do_mmap()).
  10 * Versions of those functions that are called when the vmr lock is already held
  11 * begin with __ (e.g. __do_munmap()).
  12 *
  13 * Note that if we were called from kern/src/syscall.c, we probably don't have
  14 * an edible reference to p. */
  15
  16#include <ros/common.h>
  17#include <pmap.h>
  18#include <mm.h>
  19#include <process.h>
  20#include <stdio.h>
  21#include <syscall.h>
  22#include <slab.h>
  23#include <kmalloc.h>
  24#include <smp.h>
  25#include <profiler.h>
  26#include <umem.h>
  27#include <ns.h>
  28#include <tree_file.h>
  29
  30/* These are the only mmap flags that are saved in the VMR.  If we implement
  31 * more of the mmap interface, we may need to grow this. */
  32#define MAP_PERSIST_FLAGS       (MAP_SHARED | MAP_PRIVATE | MAP_ANONYMOUS)
  33
  34struct kmem_cache *vmr_kcache;
  35
  36static int __vmr_free_pgs(struct proc *p, pte_t pte, void *va, void *arg);
  37static int populate_pm_va(struct proc *p, uintptr_t va, unsigned long nr_pgs,
  38                          int pte_prot, struct page_map *pm, size_t offset,
  39                          int flags, bool exec);
  40
  41static struct page_map *foc_to_pm(struct file_or_chan *foc)
  42{
  43        switch (foc->type) {
  44        case F_OR_C_CHAN:
  45                assert(foc->fsf);
  46                return foc->fsf->pm;
  47        }
  48        panic("unknown F_OR_C type");
  49}
  50
  51static struct page_map *vmr_to_pm(struct vm_region *vmr)
  52{
  53        return foc_to_pm(vmr->__vm_foc);
  54}
  55
  56char *foc_to_name(struct file_or_chan *foc)
  57{
  58        switch (foc->type) {
  59        case F_OR_C_CHAN:
  60                if (foc->fsf)
  61                        return foc->fsf->dir.name;
  62                else
  63                        return foc->chan->name->s;
  64        }
  65        panic("unknown F_OR_C type");
  66}
  67
  68char *foc_abs_path(struct file_or_chan *foc)
  69{
  70        switch (foc->type) {
  71        case F_OR_C_CHAN:
  72                /* Not sure, but I'd like to know if we have externally visible
  73                 * chans that have no name. */
  74                assert(foc->chan->name);
  75                assert(foc->chan->name->s);
  76                return foc->chan->name->s;
  77        }
  78        panic("unknown F_OR_C type");
  79}
  80
  81ssize_t foc_read(struct file_or_chan *foc, void *buf, size_t amt, off64_t off)
  82{
  83        ERRSTACK(1);
  84        off64_t fake_off = off;
  85        ssize_t ret = -1;
  86
  87        switch (foc->type) {
  88        case F_OR_C_CHAN:
  89                if (!qid_is_file(foc->chan->qid))
  90                        return -1;
  91                if (!waserror())
  92                        ret = devtab[foc->chan->type].read(foc->chan, buf, amt,
  93                                                           off);
  94                poperror();
  95                return ret;
  96        }
  97        panic("unknown F_OR_C type");
  98}
  99
 100static void __foc_free_rcu(struct rcu_head *head)
 101{
 102        struct file_or_chan *foc = container_of(head, struct file_or_chan, rcu);
 103
 104        switch (foc->type) {
 105        case F_OR_C_CHAN:
 106                cclose(foc->chan);
 107                break;
 108        default:
 109                panic("unknown F_OR_C type, %d", foc->type);
 110        }
 111        kfree(foc);
 112}
 113
 114static void foc_release(struct kref *kref)
 115{
 116        struct file_or_chan *foc = container_of(kref, struct file_or_chan,
 117                                                kref);
 118
 119        /* A lot of places decref while holding a spinlock, but we can't free
 120         * then, since the cclose() might block. */
 121        call_rcu(&foc->rcu, __foc_free_rcu);
 122}
 123
 124static struct file_or_chan *foc_alloc(void)
 125{
 126        struct file_or_chan *foc;
 127
 128        foc = kzmalloc(sizeof(struct file_or_chan), MEM_ATOMIC);
 129        if (!foc)
 130                return NULL;
 131        kref_init(&foc->kref, foc_release, 1);
 132        return foc;
 133}
 134
 135struct file_or_chan *foc_open(char *path, int omode, int perm)
 136{
 137        ERRSTACK(1);
 138        struct file_or_chan *foc = foc_alloc();
 139
 140        if (!foc)
 141                return NULL;
 142        if (waserror()) {
 143                kfree(foc);
 144                poperror();
 145                return NULL;
 146        }
 147        foc->chan = namec(path, Aopen, omode, perm, NULL);
 148        foc->type = F_OR_C_CHAN;
 149        poperror();
 150        return foc;
 151}
 152
 153struct file_or_chan *fd_to_foc(struct fd_table *fdt, int fd)
 154{
 155        ERRSTACK(1);
 156        struct file_or_chan *foc = foc_alloc();
 157
 158        if (!foc)
 159                return NULL;
 160        if (waserror()) {
 161                kfree(foc);
 162                poperror();
 163                return NULL;
 164        }
 165        /* We're not checking mode here (-1).  mm code checks later. */
 166        foc->chan = fdtochan(fdt, fd, -1, true, true);
 167        foc->type = F_OR_C_CHAN;
 168        poperror();
 169        return foc;
 170}
 171
 172void foc_incref(struct file_or_chan *foc)
 173{
 174        kref_get(&foc->kref, 1);
 175}
 176
 177void foc_decref(struct file_or_chan *foc)
 178{
 179        kref_put(&foc->kref);
 180}
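     /* Typical file_or_chan lifecycle, as used by mmap() below: foc_open() or
      * fd_to_foc() returns a foc holding one kref.  do_mmap() grabs its own
      * ref (foc_incref) when it stores the foc in a VMR, and the caller drops
      * the original ref once do_mmap() returns.  The final foc_decref() runs
      * foc_release(), which defers the cclose() to RCU, so it is safe to
      * decref while holding a spinlock. */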
 181
 182void *foc_pointer(struct file_or_chan *foc)
 183{
 184        if (!foc)
 185                return NULL;
 186        switch (foc->type) {
 187        case F_OR_C_CHAN:
 188                return foc->chan;
 189        default:
 190                panic("unknown F_OR_C type, %d", foc->type);
 191        }
 192}
 193
 194size_t foc_get_len(struct file_or_chan *foc)
 195{
 196        switch (foc->type) {
 197        case F_OR_C_CHAN:
 198                assert(foc->fsf);
 199                return foc->fsf->dir.length;
 200        }
 201        panic("unknown F_OR_C type, %d", foc->type);
 202}
 203
 204static bool check_chan_perms(struct vm_region *vmr, struct chan *chan, int prot)
 205{
 206        /* glibc isn't opening its files O_EXEC */
 207        prot &= ~PROT_EXEC;
 208        if (!(chan->mode & O_READ))
 209                return false;
 210        if (vmr->vm_flags & MAP_PRIVATE)
 211                prot &= ~PROT_WRITE;
 212        return (chan->mode & prot) == prot;
 213}
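     /* For example, a MAP_PRIVATE mapping of an O_READ-only chan may ask for
      * PROT_READ | PROT_WRITE: writes only touch the process's private copy,
      * so PROT_WRITE is dropped before the mode check.  A MAP_SHARED mapping
      * with PROT_WRITE needs the chan to be open for writing as well. */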
 214
 215static bool check_foc_perms(struct vm_region *vmr, struct file_or_chan *foc,
 216                            int prot)
 217{
 218        switch (foc->type) {
 219        case F_OR_C_CHAN:
 220                return check_chan_perms(vmr, foc->chan, prot);
 221        }
 222        panic("unknown F_OR_C type");
 223}
 224
 225static int foc_dev_mmap(struct file_or_chan *foc, struct vm_region *vmr,
 226                        int prot, int flags)
 227{
 228        if (!check_foc_perms(vmr, foc, prot))
 229                return -1;
 230        switch (foc->type) {
 231        case F_OR_C_CHAN:
 232                if (!devtab[foc->chan->type].mmap) {
 233                        set_error(ENODEV, "device does not support mmap");
 234                        return -1;
 235                }
 236                foc->fsf = devtab[foc->chan->type].mmap(foc->chan, vmr, prot,
 237                                                        flags);
 238                return foc->fsf ? 0 : -1;
 239        }
 240        panic("unknown F_OR_C type, %d", foc->type);
 241}
 242
 243void vmr_init(void)
 244{
 245        vmr_kcache = kmem_cache_create("vm_regions",
 246                                       sizeof(struct vm_region),
 247                                       __alignof__(struct vm_region), 0, NULL,
 248                                       0, 0, NULL);
 249}
 250
 251static struct vm_region *vmr_zalloc(void)
 252{
 253        struct vm_region *vmr;
 254
 255        vmr = kmem_cache_alloc(vmr_kcache, MEM_WAIT);
 256        memset(vmr, 0, sizeof(struct vm_region));
 257        return vmr;
 258}
 259
 260static void vmr_free(struct vm_region *vmr)
 261{
 262        kmem_cache_free(vmr_kcache, vmr);
 263}
 264
 265/* The caller will set the prot, flags, file, and offset.  We find a spot for it
 266 * in p's address space, set proc, base, and end.  Caller holds p's vmr_lock.
 267 *
 268 * TODO: take a look at solari's vmem alloc.  And consider keeping these in a
 269 * tree of some sort for easier lookups. */
 270static bool vmr_insert(struct vm_region *vmr, struct proc *p, uintptr_t va,
 271                       size_t len)
 272{
 273        struct vm_region *vm_i, *vm_next;
 274        uintptr_t gap_end;
 275        bool ret = false;
 276
 277        assert(!PGOFF(va));
 278        assert(!PGOFF(len));
 279        assert(__is_user_addr((void*)va, len, UMAPTOP));
 280        /* Is there room before the first one: */
 281        vm_i = TAILQ_FIRST(&p->vm_regions);
 282        /* This works for now, but if all we have is BRK_END ones, we'll start
 283         * growing backwards (TODO) */
 284        if (!vm_i || (va + len <= vm_i->vm_base)) {
 285                vmr->vm_base = va;
 286                TAILQ_INSERT_HEAD(&p->vm_regions, vmr, vm_link);
 287                ret = true;
 288        } else {
 289                TAILQ_FOREACH(vm_i, &p->vm_regions, vm_link) {
 290                        vm_next = TAILQ_NEXT(vm_i, vm_link);
 291                        gap_end = vm_next ? vm_next->vm_base : UMAPTOP;
 292                        /* skip til we get past the 'hint' va */
 293                        if (va >= gap_end)
 294                                continue;
 295                        /* Find a gap that is big enough */
 296                        if (gap_end - vm_i->vm_end >= len) {
 297                                /* if we can put it at va, let's do that.  o/w,
 298                                 * put it so it fits */
 299                                if ((gap_end >= va + len) &&
 300                                    (va >= vm_i->vm_end))
 301                                        vmr->vm_base = va;
 302                                else
 303                                        vmr->vm_base = vm_i->vm_end;
 304                                TAILQ_INSERT_AFTER(&p->vm_regions, vm_i, vmr,
 305                                                   vm_link);
 306                                ret = true;
 307                                break;
 308                        }
 309                }
 310        }
 311        /* Finalize the creation, if we got one */
 312        if (ret) {
 313                vmr->vm_proc = p;
 314                vmr->vm_end = vmr->vm_base + len;
 315        }
 316        if (!ret)
 317                warn("Not making a VMR, wanted %p, + %p = %p", va, len, va +
 318                     len);
 319        return ret;
 320}
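     /* Placement example: with existing regions [0x100000, 0x200000) and
      * [0x400000, 0x500000), a request for va 0x250000, len 0x10000 lands at
      * the hint, since the gap ending at 0x400000 holds it.  A hint of
      * 0x150000 overlaps the first region, so the VMR goes at that region's
      * vm_end (0x200000) instead, provided the gap is still big enough. */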
 321
 322/* Split a VMR at va, returning the new VMR.  It is set up the same way, with
 323 * file offsets fixed accordingly.  'va' is the beginning of the new one, and
 324 * must be page aligned. */
 325static struct vm_region *split_vmr(struct vm_region *old_vmr, uintptr_t va)
 326{
 327        struct vm_region *new_vmr;
 328
 329        assert(!PGOFF(va));
 330        if ((old_vmr->vm_base >= va) || (old_vmr->vm_end <= va))
 331                return 0;
 332        new_vmr = kmem_cache_alloc(vmr_kcache, 0);
 333        assert(new_vmr);
 334        TAILQ_INSERT_AFTER(&old_vmr->vm_proc->vm_regions, old_vmr, new_vmr,
 335                           vm_link);
 336        new_vmr->vm_proc = old_vmr->vm_proc;
 337        new_vmr->vm_base = va;
 338        new_vmr->vm_end = old_vmr->vm_end;
 339        old_vmr->vm_end = va;
 340        new_vmr->vm_prot = old_vmr->vm_prot;
 341        new_vmr->vm_flags = old_vmr->vm_flags;
 342        if (vmr_has_file(old_vmr)) {
 343                foc_incref(old_vmr->__vm_foc);
 344                new_vmr->__vm_foc = old_vmr->__vm_foc;
 345                new_vmr->vm_foff = old_vmr->vm_foff +
 346                                      old_vmr->vm_end - old_vmr->vm_base;
 347                pm_add_vmr(vmr_to_pm(old_vmr), new_vmr);
 348        } else {
 349                new_vmr->__vm_foc = NULL;
 350                new_vmr->vm_foff = 0;
 351        }
 352        return new_vmr;
 353}
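     /* Split example: a VMR covering [0x1000, 0x5000) with vm_foff 0, split at
      * va 0x3000, becomes [0x1000, 0x3000) (foff 0) plus a new VMR
      * [0x3000, 0x5000) with vm_foff 0x2000, so file offsets keep lining up
      * with virtual addresses. */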
 354
 355/* Called by the unmapper, just cleans up.  Whoever calls this will need to sort
 356 * out the page table entries. */
 357static void destroy_vmr(struct vm_region *vmr)
 358{
 359        if (vmr_has_file(vmr)) {
 360                pm_remove_vmr(vmr_to_pm(vmr), vmr);
 361                foc_decref(vmr->__vm_foc);
 362        }
 363        TAILQ_REMOVE(&vmr->vm_proc->vm_regions, vmr, vm_link);
 364        vmr_free(vmr);
 365}
 366
  367/* Merges two vm regions.  They must be adjacent, with the same prot, flags,
  368 * file, and contiguous file offsets.  The second one will be destroyed. */
 369static int merge_vmr(struct vm_region *first, struct vm_region *second)
 370{
 371        assert(first->vm_proc == second->vm_proc);
 372        if ((first->vm_end != second->vm_base) ||
 373            (first->vm_prot != second->vm_prot) ||
 374            (first->vm_flags != second->vm_flags) ||
 375            (first->__vm_foc != second->__vm_foc))
 376                return -1;
 377        if (vmr_has_file(first) && (second->vm_foff != first->vm_foff +
 378                                    first->vm_end - first->vm_base))
 379                return -1;
 380        first->vm_end = second->vm_end;
 381        destroy_vmr(second);
 382        return 0;
 383}
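     /* E.g. the two halves from the split example above ([0x1000, 0x3000) foff
      * 0 and [0x3000, 0x5000) foff 0x2000) pass these checks and merge back
      * into a single [0x1000, 0x5000) region. */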
 384
 385/* Attempts to merge vmr with adjacent VMRs, returning a ptr to be used for vmr.
 386 * It could be the same struct vmr, or possibly another one (usually lower in
  387 * the address space). */
 388static struct vm_region *merge_me(struct vm_region *vmr)
 389{
 390        struct vm_region *vmr_temp;
 391
 392        /* Merge will fail if it cannot do it.  If it succeeds, the second VMR
 393         * is destroyed, so we need to be a bit careful. */
 394        vmr_temp = TAILQ_PREV(vmr, vmr_tailq, vm_link);
 395        if (vmr_temp)
 396                if (!merge_vmr(vmr_temp, vmr))
 397                        vmr = vmr_temp;
 398        vmr_temp = TAILQ_NEXT(vmr, vm_link);
 399        if (vmr_temp)
 400                merge_vmr(vmr, vmr_temp);
 401        return vmr;
 402}
 403
 404/* Grows the vm region up to (and not including) va.  Fails if another is in the
 405 * way, etc. */
 406static int grow_vmr(struct vm_region *vmr, uintptr_t va)
 407{
 408        assert(!PGOFF(va));
 409        struct vm_region *next = TAILQ_NEXT(vmr, vm_link);
 410        if (next && next->vm_base < va)
 411                return -1;
 412        if (va <= vmr->vm_end)
 413                return -1;
 414        vmr->vm_end = va;
 415        return 0;
 416}
 417
 418/* Shrinks the vm region down to (and not including) va.  Whoever calls this
 419 * will need to sort out the page table entries. */
 420static int shrink_vmr(struct vm_region *vmr, uintptr_t va)
 421{
 422        assert(!PGOFF(va));
 423        if ((va < vmr->vm_base) || (va > vmr->vm_end))
 424                return -1;
 425        vmr->vm_end = va;
 426        return 0;
 427}
 428
 429/* Given a va and a proc (later an mm, possibly), returns the owning vmr, or 0
 430 * if there is none. */
 431static struct vm_region *find_vmr(struct proc *p, uintptr_t va)
 432{
 433        struct vm_region *vmr;
 434
  435        /* ugly linear search */
 436        TAILQ_FOREACH(vmr, &p->vm_regions, vm_link) {
 437                if ((vmr->vm_base <= va) && (vmr->vm_end > va))
 438                        return vmr;
 439        }
 440        return 0;
 441}
 442
 443/* Finds the first vmr after va (including the one holding va), or 0 if there is
 444 * none. */
 445static struct vm_region *find_first_vmr(struct proc *p, uintptr_t va)
 446{
 447        struct vm_region *vmr;
 448
  449        /* ugly linear search */
 450        TAILQ_FOREACH(vmr, &p->vm_regions, vm_link) {
 451                if ((vmr->vm_base <= va) && (vmr->vm_end > va))
 452                        return vmr;
 453                if (vmr->vm_base > va)
 454                        return vmr;
 455        }
 456        return 0;
 457}
 458
 459/* Makes sure that no VMRs cross either the start or end of the given region
 460 * [va, va + len), splitting any VMRs that are on the endpoints. */
 461static void isolate_vmrs(struct proc *p, uintptr_t va, size_t len)
 462{
 463        struct vm_region *vmr;
 464        if ((vmr = find_vmr(p, va)))
 465                split_vmr(vmr, va);
 466        /* TODO: don't want to do another find (linear search) */
 467        if ((vmr = find_vmr(p, va + len)))
 468                split_vmr(vmr, va + len);
 469}
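     /* E.g. munmapping [0x2000, 0x3000) out of a lone VMR [0x1000, 0x5000)
      * first splits it into [0x1000, 0x2000), [0x2000, 0x3000), and
      * [0x3000, 0x5000), so the middle piece can be torn down on its own. */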
 470
 471void unmap_and_destroy_vmrs(struct proc *p)
 472{
 473        struct vm_region *vmr_i, *vmr_temp;
 474
 475        /* this only gets called from __proc_free, so there should be no sync
 476         * concerns.  still, better safe than sorry. */
 477        spin_lock(&p->vmr_lock);
 478        p->vmr_history++;
 479        spin_lock(&p->pte_lock);
 480        TAILQ_FOREACH(vmr_i, &p->vm_regions, vm_link) {
  481                /* note this CB sets the PTE = 0, regardless of whether it
  482                 * was P or not */
 483                env_user_mem_walk(p, (void*)vmr_i->vm_base,
 484                                  vmr_i->vm_end - vmr_i->vm_base,
 485                                  __vmr_free_pgs, 0);
 486        }
 487        spin_unlock(&p->pte_lock);
 488        /* need the safe style, since destroy_vmr modifies the list.  also, we
 489         * want to do this outside the pte lock, since it grabs the pm lock. */
 490        TAILQ_FOREACH_SAFE(vmr_i, &p->vm_regions, vm_link, vmr_temp)
 491                destroy_vmr(vmr_i);
 492        spin_unlock(&p->vmr_lock);
 493}
 494
 495/* Helper: copies the contents of pages from p to new p.  For pages that aren't
 496 * present, once we support swapping or CoW, we can do something more
 497 * intelligent.  0 on success, -ERROR on failure.  Can't handle jumbos. */
 498static int copy_pages(struct proc *p, struct proc *new_p, uintptr_t va_start,
 499                      uintptr_t va_end)
 500{
 501        int ret;
 502
 503        /* Sanity checks.  If these fail, we had a screwed up VMR.
 504         * Check for: alignment, wraparound, or userspace addresses */
 505        if ((PGOFF(va_start)) ||
 506            (PGOFF(va_end)) ||
 507            (va_end < va_start) ||/* now, start > UMAPTOP -> end > UMAPTOP */
 508            (va_end > UMAPTOP)) {
 509                warn("VMR mapping is probably screwed up (%p - %p)", va_start,
 510                     va_end);
 511                return -EINVAL;
 512        }
 513        int copy_page(struct proc *p, pte_t pte, void *va, void *arg) {
 514                struct proc *new_p = (struct proc*)arg;
 515                struct page *pp;
 516
 517                if (pte_is_unmapped(pte))
 518                        return 0;
 519                /* pages could be !P, but right now that's only for file backed
 520                 * VMRs undergoing page removal, which isn't the caller of
 521                 * copy_pages. */
 522                if (pte_is_mapped(pte)) {
 523                        /* TODO: check for jumbos */
 524                        if (upage_alloc(new_p, &pp, 0))
 525                                return -ENOMEM;
 526                        memcpy(page2kva(pp), KADDR(pte_get_paddr(pte)), PGSIZE);
 527                        if (page_insert(new_p->env_pgdir, pp, va,
 528                                        pte_get_settings(pte))) {
 529                                page_decref(pp);
 530                                return -ENOMEM;
 531                        }
 532                } else if (pte_is_paged_out(pte)) {
 533                        /* TODO: (SWAP) will need to either make a copy or
 534                         * CoW/refcnt the backend store.  For now, this PTE will
 535                         * be the same as the original PTE */
 536                        panic("Swapping not supported!");
 537                } else {
 538                        panic("Weird PTE %p in %s!", pte_print(pte),
 539                              __FUNCTION__);
 540                }
 541                return 0;
 542        }
 543        spin_lock(&p->pte_lock);        /* walking and changing PTEs */
 544        ret = env_user_mem_walk(p, (void*)va_start, va_end - va_start,
 545                                &copy_page, new_p);
 546        spin_unlock(&p->pte_lock);
 547        return ret;
 548}
 549
 550static int fill_vmr(struct proc *p, struct proc *new_p, struct vm_region *vmr)
 551{
 552        int ret = 0;
 553
 554        if (!vmr_has_file(vmr) || (vmr->vm_flags & MAP_PRIVATE)) {
 555                /* We don't support ANON + SHARED yet */
 556                assert(!(vmr->vm_flags & MAP_SHARED));
 557                ret = copy_pages(p, new_p, vmr->vm_base, vmr->vm_end);
 558        } else {
 559                /* non-private file, i.e. page cacheable.  we have to honor
  560                 * MAP_LOCKED (but we might be able to ignore MAP_POPULATE). */
 561                if (vmr->vm_flags & MAP_LOCKED) {
 562                        /* need to keep the file alive in case we unlock/block
 563                         */
 564                        foc_incref(vmr->__vm_foc);
 565                        /* math is a bit nasty if vm_base isn't page aligned */
 566                        assert(!PGOFF(vmr->vm_base));
 567                        ret = populate_pm_va(new_p, vmr->vm_base,
 568                                             (vmr->vm_end - vmr->vm_base) >>
 569                                                                       PGSHIFT,
 570                                             vmr->vm_prot, vmr_to_pm(vmr),
 571                                             vmr->vm_foff, vmr->vm_flags,
 572                                             vmr->vm_prot & PROT_EXEC);
 573                        foc_decref(vmr->__vm_foc);
 574                }
 575        }
 576        return ret;
 577}
 578
 579/* This will make new_p have the same VMRs as p, and it will make sure all
 580 * physical pages are copied over, with the exception of MAP_SHARED files.
 581 * MAP_SHARED files that are also MAP_LOCKED will be attached to the process -
 582 * presumably they are in the page cache since the parent locked them.  This is
 583 * all pretty nasty.
 584 *
 585 * This is used by fork().
 586 *
 587 * Note that if you are working on a VMR that is a file, you'll want to be
 588 * careful about how it is mapped (SHARED, PRIVATE, etc). */
 589int duplicate_vmrs(struct proc *p, struct proc *new_p)
 590{
 591        int ret = 0;
 592        struct vm_region *vmr, *vm_i;
 593
 594        TAILQ_FOREACH(vm_i, &p->vm_regions, vm_link) {
 595                vmr = kmem_cache_alloc(vmr_kcache, 0);
 596                if (!vmr)
 597                        return -ENOMEM;
 598                vmr->vm_proc = new_p;
 599                vmr->vm_base = vm_i->vm_base;
 600                vmr->vm_end = vm_i->vm_end;
 601                vmr->vm_prot = vm_i->vm_prot;
 602                vmr->vm_flags = vm_i->vm_flags;
 603                vmr->__vm_foc = vm_i->__vm_foc;
 604                vmr->vm_foff = vm_i->vm_foff;
 605                if (vmr_has_file(vm_i)) {
 606                        foc_incref(vm_i->__vm_foc);
 607                        pm_add_vmr(vmr_to_pm(vm_i), vmr);
 608                }
 609                ret = fill_vmr(p, new_p, vmr);
 610                if (ret) {
 611                        if (vmr_has_file(vm_i)) {
 612                                pm_remove_vmr(vmr_to_pm(vm_i), vmr);
 613                                foc_decref(vm_i->__vm_foc);
 614                        }
 615                        vmr_free(vmr);
 616                        return ret;
 617                }
 618                TAILQ_INSERT_TAIL(&new_p->vm_regions, vmr, vm_link);
 619        }
 620        return 0;
 621}
 622
 623void print_vmrs(struct proc *p)
 624{
 625        int count = 0;
 626        struct vm_region *vmr;
 627
 628        print_lock();
 629        printk("VM Regions for proc %d\n", p->pid);
 630        printk("NR:"
 631               "                                     Range:"
 632               "       Prot,"
 633               "      Flags,"
 634               "               File,"
 635               "                Off\n");
 636        TAILQ_FOREACH(vmr, &p->vm_regions, vm_link)
 637                printk("%02d: (%p - %p): 0x%08x, 0x%08x, %p, %p\n", count++,
 638                       vmr->vm_base, vmr->vm_end, vmr->vm_prot, vmr->vm_flags,
 639                       foc_pointer(vmr->__vm_foc), vmr->vm_foff);
 640        print_unlock();
 641}
 642
 643void enumerate_vmrs(struct proc *p, void (*func)(struct vm_region *vmr,
 644                                                 void *opaque), void *opaque)
 645{
 646        struct vm_region *vmr;
 647
 648        spin_lock(&p->vmr_lock);
 649        TAILQ_FOREACH(vmr, &p->vm_regions, vm_link)
 650                func(vmr, opaque);
 651        spin_unlock(&p->vmr_lock);
 652}
 653
 654static bool mmap_flags_priv_ok(int flags)
 655{
 656        return (flags & (MAP_PRIVATE | MAP_SHARED)) == MAP_PRIVATE ||
 657               (flags & (MAP_PRIVATE | MAP_SHARED)) == MAP_SHARED;
 658}
 659
 660static bool prot_is_valid(int prot)
 661{
 662        /* Remember PROT_NONE (0) is valid. */
 663        return !(prot & ~PROT_VALID_PROTS);
 664}
 665
 666static bool prot_has_access(int prot)
 667{
 668        return prot & (PROT_READ | PROT_WRITE | PROT_EXEC);
 669}
 670
 671/* Error values aren't quite comprehensive - check man mmap() once we do better
 672 * with the FS.
 673 *
 674 * The mmap call's offset is in units of PGSIZE (like Linux's mmap2()), but
 675 * internally, the offset is tracked in bytes.  The reason for the PGSIZE is for
   676 * 32bit apps to address large files, but a full 64bit system won't need that.
 677 * We track things internally in bytes since that is how file pointers work, vmr
 678 * bases and ends, and similar math.  While it's not a hard change, there's no
 679 * need for it, and ideally we'll be a fully 64bit system before we deal with
 680 * files that large. */
 681void *mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
 682           int fd, size_t offset)
 683{
 684        struct file_or_chan *file = NULL;
 685        void *result;
 686
 687        offset <<= PGSHIFT;
 688        printd("mmap(addr %x, len %x, prot %x, flags %x, fd %x, off %x)\n",
 689               addr, len, prot, flags, fd, offset);
 690        if (!mmap_flags_priv_ok(flags)) {
 691                set_errno(EINVAL);
 692                return MAP_FAILED;
 693        }
 694        if (!prot_is_valid(prot)) {
 695                set_error(EINVAL, "invalid prot 0x%x (%x)", prot,
 696                          PROT_VALID_PROTS);
 697                return MAP_FAILED;
 698        }
 699        if (!len) {
 700                set_errno(EINVAL);
 701                return MAP_FAILED;
 702        }
 703        if (!(flags & MAP_ANON) && (fd >= 0)) {
 704                file = fd_to_foc(&p->open_files, fd);
 705                if (!file) {
 706                        set_errno(EBADF);
 707                        result = MAP_FAILED;
 708                        goto out_ref;
 709                }
 710        }
 711        /* Check for overflow.  This helps do_mmap and populate_va, among
 712         * others. */
 713        if (offset + len < offset) {
 714                set_errno(EINVAL);
 715                result = MAP_FAILED;
 716                goto out_ref;
 717        }
 718        /* If they don't care where to put it, we'll start looking after the
 719         * break.  We could just have userspace handle this (in glibc's mmap),
 720         * so we don't need to know about BRK_END, but this will work for now
 721         * (and may avoid bugs).  Note that this limits mmap(0) a bit.  Keep
 722         * this in sync with do_mmap()'s check.  (Both are necessary).  */
 723        if (addr == 0)
 724                addr = BRK_END;
 725        /* Still need to enforce this: */
 726        addr = MAX(addr, MMAP_LOWEST_VA);
 727        /* Need to check addr + len, after we do our addr adjustments */
 728        if (!__is_user_addr((void*)addr, len, UMAPTOP)) {
 729                set_errno(EINVAL);
 730                result = MAP_FAILED;
 731                goto out_ref;
 732        }
 733        if (PGOFF(addr)) {
 734                set_errno(EINVAL);
 735                result = MAP_FAILED;
 736                goto out_ref;
 737        }
 738        result = do_mmap(p, addr, len, prot, flags, file, offset);
 739out_ref:
 740        if (file)
 741                foc_decref(file);
 742        return result;
 743}
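     /* Offset example: a 32bit app mapping a file at byte offset 8 GiB passes
      * offset 0x200000 here (in PGSIZE units, assuming 4 KiB pages); the shift
      * above turns that back into bytes for do_mmap() and the VMR's vm_foff. */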
 744
 745/* Helper, maps in page at addr, but only if nothing is mapped there.  Returns
 746 * 0 on success.  Will take ownership of non-pagemap pages, including on error
 747 * cases.  This just means we free it on error, and notionally store it in the
 748 * PTE on success, which will get freed later.
 749 *
 750 * It's possible that a page has already been mapped here, in which case we'll
 751 * treat as success.  So when we return 0, *a* page is mapped here, but not
 752 * necessarily the one you passed in. */
 753static int map_page_at_addr(struct proc *p, struct page *page, uintptr_t addr,
 754                            int pte_prot)
 755{
 756        pte_t pte;
 757
 758        spin_lock(&p->pte_lock);        /* walking and changing PTEs */
 759        /* find offending PTE (prob don't read this in).  This might alloc an
 760         * intermediate page table page. */
 761        pte = pgdir_walk(p->env_pgdir, (void*)addr, TRUE);
 762        if (!pte_walk_okay(pte)) {
 763                spin_unlock(&p->pte_lock);
 764                if (!page_is_pagemap(page))
 765                        page_decref(page);
 766                return -ENOMEM;
 767        }
 768        /* a spurious, valid PF is possible due to a legit race: the page might
 769         * have been faulted in by another core already (and raced on the memory
 770         * lock), in which case we should just return. */
 771        if (pte_is_present(pte)) {
 772                spin_unlock(&p->pte_lock);
 773                if (!page_is_pagemap(page))
 774                        page_decref(page);
 775                return 0;
 776        }
 777        /* I used to allow clobbering an old entry (contrary to the
 778         * documentation), but it's probably a sign of another bug. */
 779        assert(!pte_is_mapped(pte));
 780        /* preserve the dirty bit - pm removal could be looking concurrently */
 781        pte_prot |= (pte_is_dirty(pte) ? PTE_D : 0);
 782        /* We have a ref to page (for non PMs), which we are storing in the PTE
 783         */
 784        pte_write(pte, page2pa(page), pte_prot);
 785        spin_unlock(&p->pte_lock);
 786        return 0;
 787}
 788
 789/* Helper: copies *pp's contents to a new page, replacing your page pointer.  If
 790 * this succeeds, you'll have a non-PM page, which matters for how you put it.*/
 791static int __copy_and_swap_pmpg(struct proc *p, struct page **pp)
 792{
 793        struct page *new_page, *old_page = *pp;
 794
 795        if (upage_alloc(p, &new_page, FALSE))
 796                return -ENOMEM;
 797        memcpy(page2kva(new_page), page2kva(old_page), PGSIZE);
 798        pm_put_page(old_page);
 799        *pp = new_page;
 800        return 0;
 801}
 802
 803/* Hold the VMR lock when you call this - it'll assume the entire VA range is
 804 * mappable, which isn't true if there are concurrent changes to the VMRs. */
 805static int populate_anon_va(struct proc *p, uintptr_t va, unsigned long nr_pgs,
 806                            int pte_prot)
 807{
 808        struct page *page;
 809        int ret;
 810
 811        for (long i = 0; i < nr_pgs; i++) {
 812                if (upage_alloc(p, &page, TRUE))
 813                        return -ENOMEM;
 814                /* could imagine doing a memwalk instead of a for loop */
 815                ret = map_page_at_addr(p, page, va + i * PGSIZE, pte_prot);
 816                if (ret)
 817                        return ret;
 818        }
 819        return 0;
 820}
 821
 822/* This will periodically unlock the vmr lock. */
 823static int populate_pm_va(struct proc *p, uintptr_t va, unsigned long nr_pgs,
 824                          int pte_prot, struct page_map *pm, size_t offset,
 825                          int flags, bool exec)
 826{
 827        int ret = 0;
 828        unsigned long pm_idx0 = offset >> PGSHIFT;
 829        int vmr_history = ACCESS_ONCE(p->vmr_history);
 830        struct page *page;
 831
 832        /* This is a racy check - see the comments in fs_file.c.  Also, we're
 833         * not even attempting to populate the va, though we could do a partial
 834         * if necessary. */
 835        if (pm_idx0 + nr_pgs > nr_pages(fs_file_get_length(pm->pm_file)))
 836                return -ESPIPE;
 837        /* locking rules: start the loop holding the vmr lock, enter and exit
 838         * the entire func holding the lock. */
 839        for (long i = 0; i < nr_pgs; i++) {
 840                ret = pm_load_page_nowait(pm, pm_idx0 + i, &page);
 841                if (ret) {
 842                        if (ret != -EAGAIN)
 843                                break;
 844                        spin_unlock(&p->vmr_lock);
 845                        /* might block here, can't hold the spinlock */
 846                        ret = pm_load_page(pm, pm_idx0 + i, &page);
 847                        spin_lock(&p->vmr_lock);
 848                        if (ret)
 849                                break;
 850                        /* while we were sleeping, the VMRs could have changed
 851                         * on us. */
 852                        if (vmr_history != ACCESS_ONCE(p->vmr_history)) {
 853                                pm_put_page(page);
 854                                printk("[kernel] "
 855                                       "FYI: VMR changed during populate\n");
 856                                break;
 857                        }
 858                }
 859                if (flags & MAP_PRIVATE) {
 860                        ret = __copy_and_swap_pmpg(p, &page);
 861                        if (ret) {
 862                                pm_put_page(page);
 863                                break;
 864                        }
 865                }
 866                /* if this is an executable page, we might have to flush the
 867                 * instruction cache if our HW requires it.
 868                 * TODO: is this still needed?  andrew put this in a while ago*/
 869                if (exec)
 870                        icache_flush_page(0, page2kva(page));
 871                /* The page could be either in the PM, or a private, now-anon
 872                 * page. */
 873                ret = map_page_at_addr(p, page, va + i * PGSIZE, pte_prot);
 874                if (page_is_pagemap(page))
 875                        pm_put_page(page);
 876                if (ret)
 877                        break;
 878        }
 879        return ret;
 880}
 881
 882void *do_mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
 883              struct file_or_chan *file, size_t offset)
 884{
 885        len = ROUNDUP(len, PGSIZE);
 886        struct vm_region *vmr, *vmr_temp;
 887
 888        assert(mmap_flags_priv_ok(flags));
 889        assert(prot_is_valid(prot));
 890
 891        vmr = vmr_zalloc();
 892
 893        /* Sanity check, for callers that bypass mmap().  We want addr for anon
 894         * memory to start above the break limit (BRK_END), but not 0.  Keep
 895         * this in sync with BRK_END in mmap(). */
 896        if (addr == 0)
 897                addr = BRK_END;
 898        assert(!PGOFF(offset));
 899        /* MCPs will need their code and data pinned.  This check will start to
 900         * fail after uthread_slim_init(), at which point userspace should have
 901         * enough control over its mmaps (i.e. no longer done by LD or load_elf)
 902         * that it can ask for pinned and populated pages.  Except for
 903         * dl_opens(). */
 904        struct preempt_data *vcpd = &p->procdata->vcore_preempt_data[0];
 905
 906        if (file && (atomic_read(&vcpd->flags) & VC_SCP_NOVCCTX))
 907                flags |= MAP_POPULATE | MAP_LOCKED;
 908        vmr->vm_prot = prot;
 909        vmr->vm_foff = offset;
 910        vmr->vm_flags = flags & MAP_PERSIST_FLAGS;
 911        /* We grab the file early, so we can block.  This is all hokey.  The VMR
 912         * isn't ready yet, so the PM code will ignore it. */
 913        if (file) {
 914                /* Prep the FS and make sure it can mmap the file.  The
 915                 * device/FS checks perms, and does whatever else it needs to
 916                 * make the mmap work. */
 917                if (foc_dev_mmap(file, vmr, prot, flags & MAP_PERSIST_FLAGS)) {
 918                        vmr_free(vmr);
 919                        set_errno(EACCES);      /* not quite */
 920                        return MAP_FAILED;
 921                }
 922                /* TODO: push the PM stuff into the chan/fs_file. */
 923                pm_add_vmr(foc_to_pm(file), vmr);
 924                foc_incref(file);
 925                vmr->__vm_foc = file;
 926                /* TODO: consider locking the file while checking (not as
  927                 * mandatory as in handle_page_fault()) */
 928                if (nr_pages(offset + len) > nr_pages(foc_get_len(file))) {
 929                        /* We're allowing them to set up the VMR, though if they
 930                         * attempt to fault in any pages beyond the file's
 931                         * limit, they'll fail.  Since they might not access the
 932                         * region, we need to make sure POPULATE is off.  FYI,
 933                         * 64 bit glibc shared libs map in an extra 2MB of
 934                         * unaligned space between their RO and RW sections, but
 935                         * then immediately mprotect it to PROT_NONE. */
 936                        flags &= ~MAP_POPULATE;
 937                }
 938        }
 939        /* read/write vmr lock (will change the tree) */
 940        spin_lock(&p->vmr_lock);
 941        p->vmr_history++;
 942        /* Need to make sure nothing is in our way when we want a FIXED
 943         * location.  We just need to split on the end points (if they exist),
 944         * and then remove everything in between.  __do_munmap() will do this.
 945         * Careful, this means an mmap can be an implied munmap() (not my
 946         * call...). */
 947        if (flags & MAP_FIXED)
 948                __do_munmap(p, addr, len);
 949        if (!vmr_insert(vmr, p, addr, len)) {
 950                spin_unlock(&p->vmr_lock);
 951                if (vmr_has_file(vmr)) {
 952                        pm_remove_vmr(vmr_to_pm(vmr), vmr);
 953                        foc_decref(vmr->__vm_foc);
 954                }
 955                vmr_free(vmr);
 956                set_error(ENOMEM, "probably tried to mmap beyond UMAPTOP");
 957                /* Slightly weird semantics: if we fail and had munmapped the
 958                 * space, they will have a hole in their VM now. */
 959                return MAP_FAILED;
 960        }
 961        addr = vmr->vm_base;
 962        vmr->vm_ready = true;
 963
 964        vmr = merge_me(vmr);            /* attempts to merge with neighbors */
 965
 966        if (flags & MAP_POPULATE && prot_has_access(prot)) {
 967                int pte_prot = (prot & PROT_WRITE) ? PTE_USER_RW :
 968                           (prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : 0;
 969                unsigned long nr_pgs = len >> PGSHIFT;
 970                int ret = 0;
 971                if (!file) {
 972                        ret = populate_anon_va(p, addr, nr_pgs, pte_prot);
 973                } else {
 974                        /* Note: this will unlock if it blocks.  our refcnt on
 975                         * the file keeps the pm alive when we unlock */
 976                        ret = populate_pm_va(p, addr, nr_pgs, pte_prot,
 977                                             foc_to_pm(file), offset, flags,
 978                                             prot & PROT_EXEC);
 979                }
 980                if (ret == -ENOMEM) {
 981                        spin_unlock(&p->vmr_lock);
 982                        printk("[kernel] ENOMEM, killing %d\n", p->pid);
 983                        proc_destroy(p);
 984                        /* this will never make it back to userspace */
 985                        return MAP_FAILED;
 986                }
 987        }
 988        spin_unlock(&p->vmr_lock);
 989
 990        profiler_notify_mmap(p, addr, len, prot, flags, file, offset);
 991
 992        return (void*)addr;
 993}
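     /* Rough sketch of the common anonymous case: addr 0, len 2 * PGSIZE,
      * PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, and no file gets a
      * VMR at or above BRK_END.  The pages themselves are only allocated on
      * fault (handle_page_fault() below), unless MAP_POPULATE was set, in
      * which case populate_anon_va() fills them in before we return. */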
 994
 995int mprotect(struct proc *p, uintptr_t addr, size_t len, int prot)
 996{
 997        int ret;
 998
 999        printd("mprotect: (addr %p, len %p, prot 0x%x)\n", addr, len, prot);
1000        if (!prot_is_valid(prot)) {
1001                set_error(EINVAL, "invalid prot 0x%x (%x)", prot,
1002                          PROT_VALID_PROTS);
1003                return -1;
1004        }
1005        if (!len)
1006                return 0;
1007        len = ROUNDUP(len, PGSIZE);
1008        if (PGOFF(addr)) {
1009                set_errno(EINVAL);
1010                return -1;
1011        }
1012        if (!__is_user_addr((void*)addr, len, UMAPTOP)) {
1013                set_errno(ENOMEM);
1014                return -1;
1015        }
1016        /* read/write lock, will probably change the tree and settings */
1017        spin_lock(&p->vmr_lock);
1018        p->vmr_history++;
1019        ret = __do_mprotect(p, addr, len, prot);
1020        spin_unlock(&p->vmr_lock);
1021        return ret;
1022}
1023
1024/* This does not care if the region is not mapped.  POSIX says you should return
1025 * ENOMEM if any part of it is unmapped.  Can do this later if we care, based on
1026 * the VMRs, not the actual page residency. */
1027int __do_mprotect(struct proc *p, uintptr_t addr, size_t len, int prot)
1028{
1029        struct vm_region *vmr, *next_vmr;
1030        pte_t pte;
1031        bool shootdown_needed = FALSE;
1032        bool file_access_failure = FALSE;
1033        int pte_prot = (prot & PROT_WRITE) ? PTE_USER_RW :
1034                       (prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : PTE_NONE;
1035
1036        assert(prot_is_valid(prot));
1037        /* TODO: this is aggressively splitting, when we might not need to if
1038         * the prots are the same as the previous.  Plus, there are three
1039         * excessive scans. */
1040        isolate_vmrs(p, addr, len);
1041        vmr = find_first_vmr(p, addr);
1042        while (vmr && vmr->vm_base < addr + len) {
1043                if (vmr->vm_prot == prot)
1044                        goto next_vmr;
1045                if (vmr_has_file(vmr) &&
1046                    !check_foc_perms(vmr, vmr->__vm_foc, prot)) {
1047                        file_access_failure = TRUE;
1048                        goto next_vmr;
1049                }
1050                vmr->vm_prot = prot;
1051                spin_lock(&p->pte_lock);        /* walking and changing PTEs */
1052                /* TODO: use a memwalk.  At a minimum, we need to change every
1053                 * existing PTE that won't trigger a PF (meaning, present PTEs)
1054                 * to have the new prot.  The others will fault on access, and
1055                 * we'll change the PTE then.  In the off chance we have a
1056                 * mapped but not present PTE, we might as well change it too,
1057                 * since we're already here. */
1058                for (uintptr_t va = vmr->vm_base; va < vmr->vm_end;
1059                     va += PGSIZE) {
1060                        pte = pgdir_walk(p->env_pgdir, (void*)va, 0);
1061                        if (pte_walk_okay(pte) && pte_is_mapped(pte)) {
1062                                pte_replace_perm(pte, pte_prot);
1063                                shootdown_needed = TRUE;
1064                        }
1065                }
1066                spin_unlock(&p->pte_lock);
1067next_vmr:
1068                /* Note that this merger could cause us to not look at the next
1069                 * one, since we merged with it.  That's ok, since in that case,
1070                 * the next one already has the right prots.  Also note that
1071                 * every VMR in the region, including the ones at the endpoints,
1072                 * attempted to merge left and right. */
1073                vmr = merge_me(vmr);
1074                next_vmr = TAILQ_NEXT(vmr, vm_link);
1075                vmr = next_vmr;
1076        }
1077        if (shootdown_needed)
1078                proc_tlbshootdown(p, addr, addr + len);
1079        if (file_access_failure) {
1080                set_errno(EACCES);
1081                return -1;
1082        }
1083        return 0;
1084}
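     /* The prot -> PTE translation above maps PROT_WRITE-able regions to
      * PTE_USER_RW, read/exec-only to PTE_USER_RO, and PROT_NONE to PTE_NONE.
      * Present pages pick up the new perms right away and get shot down;
      * absent ones pick up vm_prot on their next fault. */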
1085
1086int munmap(struct proc *p, uintptr_t addr, size_t len)
1087{
1088        int ret;
1089
1090        printd("munmap(addr %x, len %x)\n", addr, len);
1091        if (!len)
1092                return 0;
1093        len = ROUNDUP(len, PGSIZE);
1094        if (PGOFF(addr)) {
1095                set_errno(EINVAL);
1096                return -1;
1097        }
1098        if (!__is_user_addr((void*)addr, len, UMAPTOP)) {
1099                set_errno(EINVAL);
1100                return -1;
1101        }
1102        /* read/write: changing the vmrs (trees, properties, and whatnot) */
1103        spin_lock(&p->vmr_lock);
1104        p->vmr_history++;
1105        ret = __do_munmap(p, addr, len);
1106        spin_unlock(&p->vmr_lock);
1107        return ret;
1108}
1109
1110static int __munmap_pte(struct proc *p, pte_t pte, void *va, void *arg)
1111{
1112        bool *shootdown_needed = (bool*)arg;
1113        struct page *page;
1114
1115        /* could put in some checks here for !P and also !0 */
1116        if (!pte_is_present(pte)) /* unmapped (== 0) *ptes are also not PTE_P */
1117                return 0;
1118        if (pte_is_dirty(pte)) {
1119                page = pa2page(pte_get_paddr(pte));
1120                atomic_or(&page->pg_flags, PG_DIRTY);
1121        }
1122        pte_clear_present(pte);
1123        *shootdown_needed = TRUE;
1124        return 0;
1125}
1126
1127/* If our page is actually in the PM, we don't do anything.  All a page map
1128 * really needs is for our VMR to no longer track it (vmr being in the pm's
1129 * list) and to not point at its pages (mark it 0, dude).
1130 *
1131 * But private mappings mess with that a bit.  Luckily, we can tell by looking
1132 * at a page whether the specific page is in the PM or not.  If it isn't, we
1133 * still need to free our "VMR local" copy.
1134 *
1135 * For pages in a PM, we're racing with PM removers.  Both of us sync with the
1136 * mm lock, so once we hold the lock, it's a matter of whether or not the PTE is
1137 * 0 or not.  If it isn't, then we're still okay to look at the page.  Consider
1138 * the PTE a weak ref on the page.  So long as you hold the mm lock, you can
1139 * look at the PTE and know the page isn't being freed. */
1140static int __vmr_free_pgs(struct proc *p, pte_t pte, void *va, void *arg)
1141{
1142        struct page *page;
1143        if (pte_is_unmapped(pte))
1144                return 0;
1145        page = pa2page(pte_get_paddr(pte));
1146        pte_clear(pte);
1147        if (!page_is_pagemap(page))
1148                page_decref(page);
1149        return 0;
1150}
1151
1152int __do_munmap(struct proc *p, uintptr_t addr, size_t len)
1153{
1154        struct vm_region *vmr, *next_vmr, *first_vmr;
1155        bool shootdown_needed = FALSE;
1156
1157        /* TODO: this will be a bit slow, since we end up doing three linear
1158         * searches (two in isolate, one in find_first). */
1159        isolate_vmrs(p, addr, len);
1160        first_vmr = find_first_vmr(p, addr);
1161        vmr = first_vmr;
1162        spin_lock(&p->pte_lock);        /* changing PTEs */
1163        while (vmr && vmr->vm_base < addr + len) {
1164                /* It's important that we call __munmap_pte and sync the
1165                 * PG_DIRTY bit before we unhook the VMR from the PM (in
1166                 * destroy_vmr). */
1167                env_user_mem_walk(p, (void*)vmr->vm_base,
1168                                  vmr->vm_end - vmr->vm_base, __munmap_pte,
1169                                  &shootdown_needed);
1170                vmr = TAILQ_NEXT(vmr, vm_link);
1171        }
1172        spin_unlock(&p->pte_lock);
 1173        /* we haven't freed the pages yet; still using the PTEs to store
 1174         * them.  There should be no races with inserts/faults, since we've
 1175         * held the mm lock since the previous CB. */
1176        if (shootdown_needed)
1177                proc_tlbshootdown(p, addr, addr + len);
1178        vmr = first_vmr;
1179        while (vmr && vmr->vm_base < addr + len) {
1180                /* there is rarely more than one VMR in this loop.  o/w, we'll
1181                 * need to gather up the vmrs and destroy outside the pte_lock.
1182                 */
1183                spin_lock(&p->pte_lock);        /* changing PTEs */
1184                env_user_mem_walk(p, (void*)vmr->vm_base,
1185                                  vmr->vm_end - vmr->vm_base, __vmr_free_pgs,
1186                                  0);
1187                spin_unlock(&p->pte_lock);
1188                next_vmr = TAILQ_NEXT(vmr, vm_link);
1189                destroy_vmr(vmr);
1190                vmr = next_vmr;
1191        }
1192        return 0;
1193}
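     /* Teardown order above: clear PTE_P and sync PG_DIRTY while the VMRs are
      * still attached to their page maps, shoot down the TLB, and only then
      * free the pages and destroy_vmr() each region. */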
1194
1195/* Helper - drop the page differently based on where it is from */
1196static void __put_page(struct page *page)
1197{
1198        if (page_is_pagemap(page))
1199                pm_put_page(page);
1200        else
1201                page_decref(page);
1202}
1203
1204static int __hpf_load_page(struct proc *p, struct page_map *pm,
1205                           unsigned long idx, struct page **page, bool first)
1206{
1207        int ret = 0;
1208        int coreid = core_id();
1209        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
1210        bool wake_scp = FALSE;
1211        spin_lock(&p->proc_lock);
1212        switch (p->state) {
1213        case (PROC_RUNNING_S):
1214                wake_scp = TRUE;
1215                __proc_set_state(p, PROC_WAITING);
1216                /* it's possible for HPF to loop a few times; we can only save
1217                 * the first time, o/w we could clobber. */
1218                if (first) {
1219                        __proc_save_context_s(p);
1220                        __proc_save_fpu_s(p);
1221                        /* We clear the owner, since userspace doesn't run here
1222                         * anymore, but we won't abandon since the fault handler
1223                         * still runs in our process. */
1224                        clear_owning_proc(coreid);
1225                }
1226                /* other notes: we don't currently need to tell the ksched
1227                 * we switched from running to waiting, though we probably
1228                 * will later for more generic scheds. */
1229                break;
1230        case (PROC_RUNNABLE_M):
1231        case (PROC_RUNNING_M):
1232                spin_unlock(&p->proc_lock);
1233                return -EAGAIN; /* will get reflected back to userspace */
1234        case (PROC_DYING):
1235        case (PROC_DYING_ABORT):
1236                spin_unlock(&p->proc_lock);
1237                return -EINVAL;
1238        default:
1239                /* shouldn't have any waitings, under the current yield style.
1240                 * if this becomes an issue, we can branch on is_mcp(). */
 1241                printk("HPF unexpected state (%s)", procstate2str(p->state));
1242                spin_unlock(&p->proc_lock);
1243                return -EINVAL;
1244        }
1245        spin_unlock(&p->proc_lock);
1246        ret = pm_load_page(pm, idx, page);
1247        if (wake_scp)
1248                proc_wakeup(p);
1249        if (ret) {
1250                printk("load failed with ret %d\n", ret);
1251                return ret;
1252        }
1253        /* need to put our old ref, next time around HPF will get another. */
1254        pm_put_page(*page);
1255        return 0;
1256}
1257
1258/* Returns 0 on success, or an appropriate -error code.
1259 *
1260 * Notes: if your TLB caches negative results, you'll need to flush the
1261 * appropriate tlb entry.  Also, you could have a weird race where a present PTE
1262 * faulted for a different reason (was mprotected on another core), and the
1263 * shootdown is on its way.  Userspace should have waited for the mprotect to
1264 * return before trying to write (or whatever), so we don't care and will fault
1265 * them. */
1266static int __hpf(struct proc *p, uintptr_t va, int prot, bool file_ok)
1267{
1268        struct vm_region *vmr;
1269        struct file_or_chan *file;
1270        struct page *a_page;
1271        unsigned int f_idx;     /* index of the missing page in the file */
1272        int ret = 0;
1273        bool first = TRUE;
1274        va = ROUNDDOWN(va,PGSIZE);
1275
1276refault:
1277        /* read access to the VMRs TODO: RCU */
1278        spin_lock(&p->vmr_lock);
1279        /* Check the vmr's protection */
1280        vmr = find_vmr(p, va);
1281        if (!vmr) {                     /* not mapped at all */
1282                printd("fault: %p not mapped\n", va);
1283                ret = -EFAULT;
1284                goto out;
1285        }
1286        if (!(vmr->vm_prot & prot)) {   /* wrong prots for this vmr */
1287                ret = -EPERM;
1288                goto out;
1289        }
1290        if (!vmr_has_file(vmr)) {
1291                /* No file - just want anonymous memory */
1292                if (upage_alloc(p, &a_page, TRUE)) {
1293                        ret = -ENOMEM;
1294                        goto out;
1295                }
1296        } else {
1297                if (!file_ok) {
1298                        ret = -EACCES;
1299                        goto out;
1300                }
1301                file = vmr->__vm_foc;
1302                /* If this fails, either something got screwed up with the VMR,
1303                 * or the permissions changed after mmap/mprotect.  Either way,
1304                 * I want to know (though it's not critical). */
1305                if (!check_foc_perms(vmr, file, prot))
1306                        printk("[kernel] "
1307                               "possible issue with VMR prots on file %s!\n",
1308                               foc_to_name(file));
1309                /* Load the file's page in the page cache.
1310                 * TODO: (BLK) Note, we are holding the mem lock!  We need to
 1311                 * rewrite this stuff so we aren't holding the lock as excessively
1312                 * as we are, and such that we can block and resume later. */
1313                assert(!PGOFF(va - vmr->vm_base + vmr->vm_foff));
1314                f_idx = (va - vmr->vm_base + vmr->vm_foff) >> PGSHIFT;
1315                /* This is a racy check - see the comments in fs_file.c */
1316                if (f_idx + 1 > nr_pages(foc_get_len(file))) {
1317                        ret = -ESPIPE; /* linux sends a SIGBUS at access time */
1318                        goto out;
1319                }
1320                ret = pm_load_page_nowait(foc_to_pm(file), f_idx, &a_page);
1321                if (ret) {
1322                        if (ret != -EAGAIN)
1323                                goto out;
1324                        /* keep the file alive after we unlock */
1325                        foc_incref(file);
1326                        spin_unlock(&p->vmr_lock);
1327                        ret = __hpf_load_page(p, foc_to_pm(file), f_idx,
1328                                              &a_page, first);
1329                        first = FALSE;
1330                        foc_decref(file);
1331                        if (ret)
1332                                return ret;
1333                        goto refault;
1334                }
1335                /* If we want a private map, we'll preemptively give you a new
1336                 * page.  We used to just care if it was private and writable,
1337                 * but we ran into issues with libc changing its mapping
1338                 * (map private, then mprotect to writable...).  In the future,
1339                 * we want to CoW this anyway, so it's not a big deal. */
1340                if ((vmr->vm_flags & MAP_PRIVATE)) {
1341                        ret = __copy_and_swap_pmpg(p, &a_page);
1342                        if (ret)
1343                                goto out_put_pg;
1344                }
1345                /* if this is an executable page, we might have to flush the
1346                 * instruction cache if our HW requires it. */
1347                if (vmr->vm_prot & PROT_EXEC)
1348                        icache_flush_page((void*)va, page2kva(a_page));
1349        }
1350        /* update the page table.  TODO: careful with MAP_PRIVATE etc.; we
1351         * might do this separately (file vs. no file). */
1352        int pte_prot = (vmr->vm_prot & PROT_WRITE) ? PTE_USER_RW :
1353                       (vmr->vm_prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO : 0;
1354        ret = map_page_at_addr(p, a_page, va, pte_prot);
1355        /* fall through, even for errors */
1356out_put_pg:
1357        /* the VMR's existence in the PM (via the mmap) allows us to have the
1358         * PTE point to a_page without it magically being reallocated.  For
1359         * non-PM memory (anon memory or private pages) we transferred the ref
1360         * to the PTE. */
1361        if (page_is_pagemap(a_page))
1362                pm_put_page(a_page);
1363out:
1364        spin_unlock(&p->vmr_lock);
1365        return ret;
1366}
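
/* A sketch for illustration only (not an existing helper in this file): both
 * __hpf() above and populate_va() below open-code the same vm_prot -> PTE
 * permission conversion.  Something like the function below could factor it
 * out; the name vm_prot_to_pte_prot() is hypothetical and assumes only the
 * PROT_* and PTE_USER_* flags already used in this file. */
static inline int vm_prot_to_pte_prot(int vm_prot)
{
        /* Writable VMRs get user RW PTEs; read- or exec-only VMRs get user RO
         * PTEs; PROT_NONE yields no user permissions at all. */
        if (vm_prot & PROT_WRITE)
                return PTE_USER_RW;
        if (vm_prot & (PROT_READ | PROT_EXEC))
                return PTE_USER_RO;
        return 0;
}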
1367
1368int handle_page_fault(struct proc *p, uintptr_t va, int prot)
1369{
1370        return __hpf(p, va, prot, TRUE);
1371}
1372
1373int handle_page_fault_nofile(struct proc *p, uintptr_t va, int prot)
1374{
1375        return __hpf(p, va, prot, FALSE);
1376}
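
/* A minimal usage sketch, not taken from the arch code: a trap handler would
 * translate its fault information into a PROT_* access type and pick the
 * _nofile variant when it cannot afford to block on file I/O.  The function
 * name and parameters here are assumptions for illustration. */
static int example_fault_dispatch(struct proc *p, uintptr_t fault_va,
                                  bool is_write, bool can_block)
{
        int prot = is_write ? PROT_WRITE : PROT_READ;

        if (can_block)
                return handle_page_fault(p, fault_va, prot);
        /* -EACCES from the _nofile variant means the faulting VMR is
         * file-backed and would have required page cache I/O. */
        return handle_page_fault_nofile(p, fault_va, prot);
}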
1377
1378/* Attempts to populate the pages, as if there were page faults.  Bails on
1379 * errors, and returns the number of pages populated.  */
1380unsigned long populate_va(struct proc *p, uintptr_t va, unsigned long nr_pgs)
1381{
1382        struct vm_region *vmr, vmr_copy;
1383        struct file_or_chan *file;
1384        unsigned long nr_pgs_this_vmr;
1385        unsigned long nr_filled = 0;
1386        struct page *page;
1387        int pte_prot;
1388        int ret;
1389
1390        /* we can screw around with ways to limit the find_vmr calls (can do the
1391         * next in line if we didn't unlock, etc.), but I don't expect us to do
1392         * this for more than a single VMR in most cases. */
1393        spin_lock(&p->vmr_lock);
1394        while (nr_pgs) {
1395                vmr = find_vmr(p, va);
1396                if (!vmr)
1397                        break;
1398                if (!prot_has_access(vmr->vm_prot))
1399                        break;
1400                pte_prot = (vmr->vm_prot & PROT_WRITE) ? PTE_USER_RW :
1401                           (vmr->vm_prot & (PROT_READ|PROT_EXEC)) ? PTE_USER_RO
1402                                                                  : 0;
1403                nr_pgs_this_vmr = MIN(nr_pgs, (vmr->vm_end - va) >> PGSHIFT);
1404                if (!vmr_has_file(vmr)) {
1405                        if (populate_anon_va(p, va, nr_pgs_this_vmr, pte_prot))
1406                        {
1407                                /* on any error, we can just bail.  we might be
1408                                 * underestimating nr_filled. */
1409                                break;
1410                        }
1411                } else {
1412                        file = vmr->__vm_foc;
1413                        /* need to keep the file alive in case we unlock/block
1414                         */
1415                        foc_incref(file);
1416                        /* Regarding foff + (va - base): va - base < len, and
1417                         * foff + len does not overflow */
1418                        ret = populate_pm_va(p, va, nr_pgs_this_vmr, pte_prot,
1419                                             foc_to_pm(file),
1420                                             vmr->vm_foff + (va - vmr->vm_base),
1421                                             vmr->vm_flags,
1422                                             vmr->vm_prot & PROT_EXEC);
1423                        foc_decref(file);
1424                        if (ret) {
1425                                /* we might have failed if the underlying file
1426                                 * doesn't cover the mmap window, depending on
1427                                 * how we'll deal with truncation. */
1428                                break;
1429                        }
1430                }
1431                nr_filled += nr_pgs_this_vmr;
1432                va += nr_pgs_this_vmr << PGSHIFT;
1433                nr_pgs -= nr_pgs_this_vmr;
1434        }
1435        spin_unlock(&p->vmr_lock);
1436        return nr_filled;
1437}
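
/* A minimal usage sketch (the wrapper below is hypothetical): a caller such as
 * a MAP_POPULATE-style path could prefault a user range by converting it to
 * whole pages and checking whether everything was filled. */
static bool example_prepopulate(struct proc *p, uintptr_t addr, size_t len)
{
        uintptr_t start = ROUNDDOWN(addr, PGSIZE);
        unsigned long nr_pgs = (ROUNDUP(addr + len, PGSIZE) - start) >> PGSHIFT;

        /* populate_va() bails on the first error, so a short count means part
         * of the range was unmapped, inaccessible, or failed to fill. */
        return populate_va(p, start, nr_pgs) == nr_pgs;
}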
1438
1439/* Kernel Dynamic Memory Mappings */
1440
1441static struct arena *vmap_addr_arena;
1442struct arena *vmap_arena;
1443static spinlock_t vmap_lock = SPINLOCK_INITIALIZER;
1444struct vmap_free_tracker {
1445        void                            *addr;
1446        size_t                          nr_bytes;
1447};
1448static struct vmap_free_tracker *vmap_to_free;
1449static size_t vmap_nr_to_free;
1450/* This value tunes the ratio of global TLB shootdowns to __vmap_free()s. */
1451#define VMAP_MAX_TO_FREE 1000
1452
1453/* We don't immediately return the addrs to their source (vmap_addr_arena).
1454 * Instead, we hold on to them until we have a suitable amount, then free them
1455 * in a batch.  This amortizes the cost of the TLB global shootdown.  We can
1456 * explore other tricks in the future too (like RCU for a certain index in the
1457 * vmap_to_free array). */
1458static void __vmap_free(struct arena *source, void *obj, size_t size)
1459{
1460        struct vmap_free_tracker *vft;
1461
1462        spin_lock(&vmap_lock);
1463        /* All objs get *unmapped* immediately, but we shoot down later.  Note
1464         * that it is OK (but slightly dangerous) for the kernel to reuse the
1465         * paddrs pointed to by the vaddrs before a TLB shootdown. */
1466        unmap_segment(boot_pgdir, (uintptr_t)obj, size);
1467        if (vmap_nr_to_free < VMAP_MAX_TO_FREE) {
1468                vft = &vmap_to_free[vmap_nr_to_free++];
1469                vft->addr = obj;
1470                vft->nr_bytes = size;
1471                spin_unlock(&vmap_lock);
1472                return;
1473        }
1474        tlb_shootdown_global();
1475        for (int i = 0; i < vmap_nr_to_free; i++) {
1476                vft = &vmap_to_free[i];
1477                arena_free(source, vft->addr, vft->nr_bytes);
1478        }
1479        /* don't forget to free the one passed in */
1480        arena_free(source, obj, size);
1481        vmap_nr_to_free = 0;
1482        spin_unlock(&vmap_lock);
1483}
1484
1485void vmap_init(void)
1486{
1487        vmap_addr_arena = arena_create("vmap_addr", (void*)KERN_DYN_BOT,
1488                                       KERN_DYN_TOP - KERN_DYN_BOT,
1489                                       PGSIZE, NULL, NULL, NULL, 0, MEM_WAIT);
1490        vmap_arena = arena_create("vmap", NULL, 0, PGSIZE, arena_alloc,
1491                                  __vmap_free, vmap_addr_arena, 0, MEM_WAIT);
1492        vmap_to_free = kmalloc(sizeof(struct vmap_free_tracker)
1493                               * VMAP_MAX_TO_FREE, MEM_WAIT);
1494        /* This ensures the boot_pgdir's top-most PML (PML4) has entries
1495         * pointing to PML3s that cover the dynamic mapping range.  Now, it's
1496         * safe to create processes that copy from boot_pgdir and still
1497         * dynamically change the kernel mappings. */
1498        arch_add_intermediate_pts(boot_pgdir, KERN_DYN_BOT,
1499                                  KERN_DYN_TOP - KERN_DYN_BOT);
1500}
1501
1502uintptr_t get_vmap_segment(size_t nr_bytes)
1503{
1504        uintptr_t ret;
1505
1506        ret = (uintptr_t)arena_alloc(vmap_arena, nr_bytes, MEM_ATOMIC);
1507        assert(ret);
1508        return ret;
1509}
1510
1511void put_vmap_segment(uintptr_t vaddr, size_t nr_bytes)
1512{
1513        arena_free(vmap_arena, (void*)vaddr, nr_bytes);
1514}
1515
1516/* Map a virtual address chunk to physical addresses.  Make sure you got a vmap
1517 * segment before actually trying to do the mapping.
1518 *
1519 * Careful with more than one 'page', since it will assume your physical pages
1520 * are also contiguous.  Most callers will only use one page.
1521 *
1522 * Finally, note that this does not care whether or not there are real pages
1523 * being mapped, and will not attempt to incref your page (if there is such a
1524 * thing).  Handle your own refcnting for pages. */
1525int map_vmap_segment(uintptr_t vaddr, uintptr_t paddr, unsigned long num_pages,
1526                     int perm)
1527{
1528#ifdef CONFIG_X86
1529        perm |= PTE_G;
1530#endif
1531        spin_lock(&vmap_lock);
1532        map_segment(boot_pgdir, vaddr, num_pages * PGSIZE, paddr, perm,
1533                    arch_max_jumbo_page_shift());
1534        spin_unlock(&vmap_lock);
1535        return 0;
1536}
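
/* A sketch of the caveat above (hypothetical helper, and it assumes the usual
 * page2pa() macro): to map physically discontiguous pages, call
 * map_vmap_segment() once per page rather than once with num_pages > 1. */
static uintptr_t example_vmap_page_list(struct page **pages, size_t nr_pages,
                                        int perm)
{
        uintptr_t vaddr = get_vmap_segment(nr_pages * PGSIZE);

        if (!vaddr)
                return 0;
        for (size_t i = 0; i < nr_pages; i++)
                map_vmap_segment(vaddr + i * PGSIZE, page2pa(pages[i]), 1,
                                 perm);
        return vaddr;
}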
1537
1538/* This can handle unaligned paddrs */
1539static uintptr_t vmap_pmem_flags(uintptr_t paddr, size_t nr_bytes, int flags)
1540{
1541        uintptr_t vaddr;
1542        unsigned long nr_pages;
1543
1544        assert(nr_bytes && paddr);
1545        nr_bytes += PGOFF(paddr);
1546        nr_pages = ROUNDUP(nr_bytes, PGSIZE) >> PGSHIFT;
1547        vaddr = get_vmap_segment(nr_bytes);
1548        if (!vaddr) {
1549                warn("Unable to get a vmap segment");   /* probably a bug */
1550                return 0;
1551        }
1552        /* it's not strictly necessary to drop paddr's pgoff, but it might save
1553         * some vmap heartache in the future. */
1554        if (map_vmap_segment(vaddr, PG_ADDR(paddr), nr_pages,
1555                             PTE_KERN_RW | flags)) {
1556                warn("Unable to map a vmap segment");   /* probably a bug */
1557                return 0;
1558        }
1559        return vaddr + PGOFF(paddr);
1560}
1561
1562uintptr_t vmap_pmem(uintptr_t paddr, size_t nr_bytes)
1563{
1564        return vmap_pmem_flags(paddr, nr_bytes, 0);
1565}
1566
1567uintptr_t vmap_pmem_nocache(uintptr_t paddr, size_t nr_bytes)
1568{
1569        return vmap_pmem_flags(paddr, nr_bytes, PTE_NOCACHE);
1570}
1571
1572uintptr_t vmap_pmem_writecomb(uintptr_t paddr, size_t nr_bytes)
1573{
1574        return vmap_pmem_flags(paddr, nr_bytes, PTE_WRITECOMB);
1575}
1576
1577int vunmap_vmem(uintptr_t vaddr, size_t nr_bytes)
1578{
1579        nr_bytes += PGOFF(vaddr);
1580        put_vmap_segment(PG_ADDR(vaddr), nr_bytes);
1581        return 0;
1582}
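
/* A minimal usage sketch (the register address and helper are assumptions for
 * illustration): vmap_pmem_nocache() preserves the sub-page offset of paddr,
 * so the returned vaddr is usable directly, and vunmap_vmem() takes that same
 * unaligned vaddr and length back. */
static uint32_t example_read_mmio_reg(uintptr_t reg_paddr)
{
        uintptr_t vaddr = vmap_pmem_nocache(reg_paddr, sizeof(uint32_t));
        uint32_t val;

        if (!vaddr)
                return 0;
        val = *(volatile uint32_t *)vaddr;
        vunmap_vmem(vaddr, sizeof(uint32_t));
        return val;
}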
1583