akaros/kern/src/pagemap.c
/* Copyright (c) 2014 The Regents of the University of California
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Page mapping: maps an object (inode or block dev) in page size chunks.
 * Analogous to Linux's "struct address_space". */

#include <pmap.h>
#include <atomic.h>
#include <radix.h>
#include <kref.h>
#include <assert.h>
#include <stdio.h>
#include <pagemap.h>
#include <rcu.h>

void pm_add_vmr(struct page_map *pm, struct vm_region *vmr)
{
        /* Note that the VMR being reverse-mapped by the PM is protected by the
         * PM's lock.  We clearly need exclusive access here, but so does
         * removal, so while removal holds this lock, it delays munmaps and
         * keeps the VMR connected. */
        spin_lock(&pm->pm_lock);
        TAILQ_INSERT_TAIL(&pm->pm_vmrs, vmr, vm_pm_link);
        spin_unlock(&pm->pm_lock);
}

void pm_remove_vmr(struct page_map *pm, struct vm_region *vmr)
{
        spin_lock(&pm->pm_lock);
        TAILQ_REMOVE(&pm->pm_vmrs, vmr, vm_pm_link);
        spin_unlock(&pm->pm_lock);
}

/* The void *s stored in PM slots look like this:
 *
 * |--11--|--1--|----52 or 20 bits--|
 * | ref  | flag|    ppn of page    |
 *              \  <--- meta shift -/
 *
 * The setter funcs return the void * that the caller should store back into
 * the slot; they don't change slot_val in place (it's a value, not the
 * address). */

#ifdef CONFIG_64BIT
# define PM_FLAGS_SHIFT 52
#else
# define PM_FLAGS_SHIFT 20
#endif
#define PM_REFCNT_SHIFT (PM_FLAGS_SHIFT + 1)

#define PM_UNUSED_FLAG (1UL << PM_FLAGS_SHIFT)
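
/* For illustration (a sketch, not used by the code): on a 64-bit build,
 * PM_FLAGS_SHIFT is 52 and PM_REFCNT_SHIFT is 53, so a slot holding physical
 * page number 0x1234 with two outstanding slot refs and the flag bit clear
 * would be:
 *
 *      (2UL << PM_REFCNT_SHIFT) | 0x1234 == 0x0040000000001234
 *
 * The low PM_FLAGS_SHIFT bits hold the ppn, bit PM_FLAGS_SHIFT is the flag,
 * and the remaining high bits count outstanding references on the slot. */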

static int pm_slot_check_refcnt(void *slot_val)
{
        return (unsigned long)slot_val >> PM_REFCNT_SHIFT;
}

static void *pm_slot_inc_refcnt(void *slot_val)
{
        void *ret;

        ret = (void*)((unsigned long)slot_val + (1UL << PM_REFCNT_SHIFT));
        /* Catches previously negative refcnts */
        assert(pm_slot_check_refcnt(ret) > 0);
        return ret;
}

static void *pm_slot_dec_refcnt(void *slot_val)
{
        assert(pm_slot_check_refcnt(slot_val) > 0);
        return (void*)((unsigned long)slot_val - (1UL << PM_REFCNT_SHIFT));
}

static struct page *pm_slot_get_page(void *slot_val)
{
        if (!slot_val)
                return 0;
        return ppn2page((unsigned long)slot_val & ((1UL << PM_FLAGS_SHIFT) - 1));
}

static void *pm_slot_set_page(void *slot_val, struct page *pg)
{
        assert(pg != pages);    /* we should never alloc page 0, for sanity */
        return (void*)(page2ppn(pg) | ((unsigned long)slot_val &
                                       ~((1UL << PM_FLAGS_SHIFT) - 1)));
}
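
/* Sketch of the intended usage (mirrors the lookup path below): since these
 * helpers return a new value rather than mutating the slot, callers
 * read-modify-CAS the result back in, e.g.:
 *
 *      do {
 *              old_slot_val = ACCESS_ONCE(*tree_slot);
 *              slot_val = pm_slot_inc_refcnt(old_slot_val);
 *      } while (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val));
 */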

/* Initializes a PM.  Host should be an fs_file.  The reference this stores is
 * uncounted. */
void pm_init(struct page_map *pm, struct page_map_operations *op, void *host)
{
        pm->pm_file = host;
        radix_tree_init(&pm->pm_tree);
        pm->pm_num_pages = 0;
        pm->pm_op = op;
        qlock_init(&pm->pm_qlock);
        spinlock_init(&pm->pm_lock);
        TAILQ_INIT(&pm->pm_vmrs);
}
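
/* Sketch of a typical setup (hypothetical names; assumes the host object
 * embeds a struct page_map and the FS supplies readpage/writepage ops):
 *
 *      static struct page_map_operations my_pm_ops = {
 *              .readpage = my_readpage,
 *              .writepage = my_writepage,
 *      };
 *
 *      pm_init(&my_file->pm, &my_pm_ops, my_file);
 *
 * After this, pm_load_page() fills pages on demand via my_readpage(), and
 * pm_writeback_pages() flushes dirty pages via my_writepage(). */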

/* Looks up the index'th page in the page map, returning a refcnt'd reference
 * that needs to be dropped with pm_put_page, or 0 if it was not in the map. */
static struct page *pm_find_page(struct page_map *pm, unsigned long index)
{
        void **tree_slot;
        void *old_slot_val, *slot_val;
        struct page *page = 0;

        /* We use rcu to protect our radix walk, specifically the tree_slot
         * pointer.  We get our own 'pm refcnt' on the slot itself, which
         * doesn't need RCU. */
        rcu_read_lock();
        /* We're syncing with removal.  If we grab the page (which we only do
         * when page != 0), we CAS an upped slot ref back into the slot.
         * Removers only yank a page when the slot refcnt is 0, and they do so
         * with a CAS, so either they see our ref and back off, or our CAS
         * fails, we retry, and we see that the page is gone. */
        tree_slot = radix_lookup_slot(&pm->pm_tree, index);
        if (!tree_slot)
                goto out;
        do {
                old_slot_val = ACCESS_ONCE(*tree_slot);
                slot_val = old_slot_val;
                page = pm_slot_get_page(slot_val);
                if (!page)
                        goto out;
                slot_val = pm_slot_inc_refcnt(slot_val); /* not a page kref */
        } while (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val));
        assert(page->pg_tree_slot == tree_slot);
out:
        rcu_read_unlock();
        return page;
}

/* Attempts to insert the page into the page_map.  Returns 0 for success,
 * -EEXIST if a page was already mapped at that index, or -ENOMEM if we ran
 * out of memory.
 *
 * On success, callers *lose* their page ref, but get a PM slot ref.  This slot
 * ref is sufficient to keep the page alive (slot ref protects the page ref).
 *
 * Makes no assumptions about the quality of the data loaded; that's up to the
 * caller. */
static int pm_insert_page(struct page_map *pm, unsigned long index,
                          struct page *page)
{
        int ret;
        void **tree_slot;
        void *slot_val = 0;

        page->pg_mapping = pm;  /* debugging */
        page->pg_index = index;
        /* No one should be looking at the tree slot until we unlock the qlock.
         * The only other one who looks is removal, which requires the PM
         * qlock. */
        page->pg_tree_slot = (void*)0xdeadbeef; /* poison */
        slot_val = pm_slot_inc_refcnt(slot_val);
        /* passing the page ref from the caller to the slot */
        slot_val = pm_slot_set_page(slot_val, page);
        qlock(&pm->pm_qlock);
        ret = radix_insert(&pm->pm_tree, index, slot_val, &tree_slot);
        if (ret) {
                qunlock(&pm->pm_qlock);
                return ret;
        }
        page->pg_tree_slot = tree_slot;
        pm->pm_num_pages++;
        qunlock(&pm->pm_qlock);
        return 0;
}

/* Decrefs the PM slot ref (usage of a PM page).  The PM's page ref remains. */
void pm_put_page(struct page *page)
{
        void **tree_slot = page->pg_tree_slot;

        assert(tree_slot);
        assert(pm_slot_get_page(*tree_slot) == page);
        assert(pm_slot_check_refcnt(*tree_slot) > 0);
        /* decref, don't care about CASing */
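        /* Subtracting 1 << PM_REFCNT_SHIFT with a plain atomic add only
         * touches the refcnt field: our own slot ref keeps the count positive
         * until this decrement, so it can't borrow into the flag or ppn bits
         * below. */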
        atomic_add((atomic_t*)tree_slot, -(1UL << PM_REFCNT_SHIFT));
}

/* Makes sure the index'th page of the mapped object is loaded in the page
 * cache and returns its location via **pp.
 *
 * You'll get a pm-slot refcnt back, which you need to put when you're done. */
int pm_load_page(struct page_map *pm, unsigned long index, struct page **pp)
{
        struct page *page;
        int error;

        page = pm_find_page(pm, index);
        while (!page) {
                if (kpage_alloc(&page))
                        return -ENOMEM;
                /* important that PG_UPTODATE is not set.  once we put it in
                 * the PM, others can find it, and we still need to fill it. */
                atomic_set(&page->pg_flags, PG_LOCKED | PG_PAGEMAP);
                /* The sem needs to be initted before anyone can try to lock it,
                 * meaning before it is in the page cache.  We also want it
                 * locked preemptively, by setting signals = 0. */
                sem_init(&page->pg_sem, 0);
                error = pm_insert_page(pm, index, page);
                switch (error) {
                case 0:
                        goto load_locked_page;
                        break;
                case -EEXIST:
                        /* the page was mapped already (benign race), just get
                         * rid of our page and try again (the only case that
                         * uses the while) */
                        atomic_set(&page->pg_flags, 0);
                        page_decref(page);
                        page = pm_find_page(pm, index);
                        break;
                default:
                        atomic_set(&page->pg_flags, 0);
                        page_decref(page);
                        return error;
                }
        }
        assert(page);
        assert(pm_slot_check_refcnt(*page->pg_tree_slot));
        assert(pm_slot_get_page(*page->pg_tree_slot) == page);
        if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
                *pp = page;
                printd("pm %p FOUND page %p, addr %p, idx %d\n", pm, page,
                       page2kva(page), index);
                return 0;
        }
        lock_page(page);
        /* double-check.  if we blocked on lock_page, it was probably for
         * someone else loading.  plus, we can't load a page more than once (it
         * could clobber newer writes) */
        if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
                unlock_page(page);
                *pp = page;
                return 0;
        }
        /* fall through */
load_locked_page:
        error = pm->pm_op->readpage(pm, page);
        assert(!error);
        assert(atomic_read(&page->pg_flags) & PG_UPTODATE);
        unlock_page(page);
        *pp = page;
        printd("pm %p LOADS page %p, addr %p, idx %d\n", pm, page,
               page2kva(page), index);
        return 0;
}
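
/* Sketch of the typical caller pattern (hypothetical; error handling elided),
 * e.g. to copy file page 'idx' of a pm into a kernel buffer 'buf':
 *
 *      struct page *page;
 *
 *      if (!pm_load_page(pm, idx, &page)) {
 *              memcpy(buf, page2kva(page), PGSIZE);
 *              pm_put_page(page);
 *      }
 *
 * The pm-slot ref returned by pm_load_page keeps the page in the cache until
 * the matching pm_put_page. */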

int pm_load_page_nowait(struct page_map *pm, unsigned long index,
                        struct page **pp)
{
        struct page *page = pm_find_page(pm, index);

        if (!page)
                return -EAGAIN;
        if (!(atomic_read(&page->pg_flags) & PG_UPTODATE)) {
                /* TODO: could have a read_nowait pm_op */
                pm_put_page(page);
                return -EAGAIN;
        }
        *pp = page;
        return 0;
}

static bool vmr_has_page_idx(struct vm_region *vmr, unsigned long pg_idx)
{
        unsigned long nr_pgs = (vmr->vm_end - vmr->vm_base) >> PGSHIFT;
        unsigned long start_pg = vmr->vm_foff >> PGSHIFT;

        if (!vmr->vm_ready)
                return false;
        return ((start_pg <= pg_idx) && (pg_idx < start_pg + nr_pgs));
}

/* Runs CB on every PTE in the VMR that corresponds to the file's pg_idx, for up
 * to max_nr_pgs. */
static void vmr_for_each(struct vm_region *vmr, unsigned long pg_idx,
                         unsigned long max_nr_pgs, mem_walk_callback_t callback)
{
        uintptr_t start_va;
        off64_t file_off = pg_idx << PGSHIFT;
        size_t len = max_nr_pgs << PGSHIFT;

        if (file_off < vmr->vm_foff) {
                len -= vmr->vm_foff - file_off;
                file_off = vmr->vm_foff;
        }

        start_va = vmr->vm_base + (file_off - vmr->vm_foff);
        if (start_va < vmr->vm_base) {
                warn("wraparound! %p %p %p %p", start_va, vmr->vm_base,
                     vmr->vm_foff, pg_idx);
                return;
        }
        if (start_va >= vmr->vm_end)
                return;

        len = MIN(len, vmr->vm_end - start_va);
        if (!len)
                return;
        env_user_mem_walk(vmr->vm_proc, (void*)start_va, len, callback, vmr);
}
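
/* For illustration of the arithmetic above: with vm_base = 0x1000000,
 * vm_foff = 2 * PGSIZE, and pg_idx = 5, the file offset is 5 * PGSIZE, so
 * start_va = 0x1000000 + 3 * PGSIZE, and len is max_nr_pgs pages clamped to
 * the end of the VMR. */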

static bool pm_has_vmr_with_page(struct page_map *pm, unsigned long pg_idx)
{
        struct vm_region *vmr_i;

        spin_lock(&pm->pm_lock);
        TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
                if (vmr_has_page_idx(vmr_i, pg_idx)) {
                        spin_unlock(&pm->pm_lock);
                        return true;
                }
        }
        spin_unlock(&pm->pm_lock);
        return false;
}

static bool __remove_or_zero_cb(void **slot, unsigned long tree_idx, void *arg)
{
        struct page_map *pm = arg;
        struct page *page;
        void *old_slot_val, *slot_val;

        old_slot_val = ACCESS_ONCE(*slot);
        slot_val = old_slot_val;
        page = pm_slot_get_page(slot_val);
        /* We shouldn't have an item in the tree without a page, unless there's
         * another removal.  Currently, this CB is called with a qlock. */
        assert(page);
        /* Don't even bother with VMRs that might have faulted in the page */
        if (pm_has_vmr_with_page(pm, tree_idx)) {
                memset(page2kva(page), 0, PGSIZE);
                return false;
        }
        /* Syncing with lookups, writebacks, etc. - anyone who gets a ref on a
         * PM leaf/page (e.g. pm_load_page / pm_find_page). */
        slot_val = pm_slot_set_page(slot_val, NULL);
        if (pm_slot_check_refcnt(slot_val) ||
                !atomic_cas_ptr(slot, old_slot_val, slot_val)) {
                memset(page2kva(page), 0, PGSIZE);
                return false;
        }
        /* We yanked the page out.  The radix tree still has an item until we
         * return true, but this is fine.  Future lock-free lookups will now
         * fail (since the page is 0), and insertions will block on the qlock. */
        atomic_set(&page->pg_flags, 0); /* cause/catch bugs */
        page_decref(page);
        return true;
}

void pm_remove_or_zero_pages(struct page_map *pm, unsigned long start_idx,
                             unsigned long nr_pgs)
{
        unsigned long end_idx = start_idx + nr_pgs;

        assert(end_idx > start_idx);
        qlock(&pm->pm_qlock);
        radix_for_each_slot_in_range(&pm->pm_tree, start_idx, end_idx,
                                     __remove_or_zero_cb, pm);
        qunlock(&pm->pm_qlock);
}
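
/* Note on pm_remove_or_zero_pages(): pages in the range that no VMR could have
 * faulted in are dropped from the cache entirely; pages that might be mapped
 * (or whose slots are in use) are zeroed in place instead, so mappers see
 * zeros rather than stale data.  This is the sort of helper a caller would
 * reach for when truncating or hole-punching a file (callers are outside this
 * file). */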

static int __pm_mark_and_clear_dirty(struct proc *p, pte_t pte, void *va,
                                     void *arg)
{
        struct page *page = pa2page(pte_get_paddr(pte));
        struct vm_region *vmr = arg;

        if (!pte_is_present(pte) || !pte_is_dirty(pte))
                return 0;
        if (!(atomic_read(&page->pg_flags) & PG_DIRTY))
                atomic_or(&page->pg_flags, PG_DIRTY);
        pte_clear_dirty(pte);
        vmr->vm_shootdown_needed = true;
        return 0;
}

/* Dirty PTE bits get transferred to the struct page itself, and the PTEs have
 * their dirty bits cleared.  VMRs that need a shootdown are marked.  Note this
 * only touches PTEs and marks VMRs that did some of the dirtying. */
static void mark_and_clear_dirty_ptes(struct page_map *pm)
{
        struct vm_region *vmr_i;
        pte_t pte;

        spin_lock(&pm->pm_lock);
        TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
                if (!(vmr_i->vm_prot & PROT_WRITE))
                        continue;
                /* Only care about shared mappings, not private.  Private
                 * mappings have a reference to the file, but the pages are not
                 * in the page cache - they hang directly off the PTEs (for
                 * now). */
                if (!(vmr_i->vm_flags & MAP_SHARED))
                        continue;
                spin_lock(&vmr_i->vm_proc->pte_lock);
                vmr_for_each(vmr_i, 0, ULONG_MAX, __pm_mark_and_clear_dirty);
                spin_unlock(&vmr_i->vm_proc->pte_lock);
        }
        spin_unlock(&pm->pm_lock);
}

static void shootdown_vmrs(struct page_map *pm)
{
        struct vm_region *vmr_i;

        /* The VMR flag shootdown_needed is owned by the PM.  Each VMR is hooked
         * to at most one file, so there's no issue there.  We might have a proc
         * that has multiple non-private VMRs in the same file, but it shouldn't
         * be a big enough issue to worry about. */
        spin_lock(&pm->pm_lock);
        TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
                if (vmr_i->vm_shootdown_needed) {
                        vmr_i->vm_shootdown_needed = false;
                        proc_tlbshootdown(vmr_i->vm_proc, 0, 0);
                }
        }
        spin_unlock(&pm->pm_lock);
}

/* Send any queued WBs that haven't been sent yet. */
static void flush_queued_writebacks(struct page_map *pm)
{
        /* TODO (WB) */
}

/* Batches up pages to be written back, preferably as one big op.  If we have a
 * bunch outstanding, we'll send them. */
static void queue_writeback(struct page_map *pm, struct page *page)
{
        /* TODO (WB): add a bulk op (instead of only writepage()), collect
         * extents, and send them to the device.  Probably do something similar
         * for reads. */
        pm->pm_op->writepage(pm, page);
}

static bool __writeback_cb(void **slot, unsigned long tree_idx, void *arg)
{
        struct page_map *pm = arg;
        struct page *page = pm_slot_get_page(*slot);

        /* We're qlocked, so all items should have pages. */
        assert(page);
        if (atomic_read(&page->pg_flags) & PG_DIRTY) {
                atomic_and(&page->pg_flags, ~PG_DIRTY);
                queue_writeback(pm, page);
        }
        return false;
}

/* Every dirty page gets written back, regardless of whether it's in a VMR or
 * not.  All the dirty bits get cleared too, before writing back. */
void pm_writeback_pages(struct page_map *pm)
{
        qlock(&pm->pm_qlock);
        mark_and_clear_dirty_ptes(pm);
        shootdown_vmrs(pm);
        radix_for_each_slot(&pm->pm_tree, __writeback_cb, pm);
        flush_queued_writebacks(pm);
        qunlock(&pm->pm_qlock);
}

static bool __flush_unused_cb(void **slot, unsigned long tree_idx, void *arg)
{
        struct page_map *pm = arg;
        struct page *page = pm_slot_get_page(*slot);
        void *old_slot_val, *slot_val;

        /* We're qlocked, so all items should have pages. */
        assert(page);
        old_slot_val = ACCESS_ONCE(*slot);
        slot_val = old_slot_val;
        /* Under any contention, we just skip it */
        if (pm_slot_check_refcnt(slot_val))
                return false;
        assert(pm_slot_get_page(slot_val) == page);
        slot_val = pm_slot_set_page(slot_val, NULL);
        if (!atomic_cas_ptr(slot, old_slot_val, slot_val))
                return false;
        /* At this point, we yanked the page.  Any concurrent wait-free users
         * that want to get this page will fail (pm_find_page /
         * pm_load_page_nowait).  They will block on the qlock that we hold when
         * they try to insert a page (as part of pm_load_page, for both reading
         * and writing).  We can still bail out and everything will be fine, so
         * long as we put the page back.
         *
         * We can't tell from looking at the page if it was actually faulted
         * into the VMR; we just know it was possible (currently).  Also, we
         * need to do this check after removing the page from the PM slot, since
         * the mm faulting code (hpf) will attempt a non-blocking PM lookup. */
        if (pm_has_vmr_with_page(pm, tree_idx)) {
                slot_val = pm_slot_set_page(slot_val, page);
                /* No one should be writing to it.  We hold the qlock, and any
                 * readers should not have increffed while the page was NULL. */
                WRITE_ONCE(*slot, slot_val);
                return false;
        }
        /* Need to check PG_DIRTY *after* checking VMRs.  Otherwise we could
         * check PG_DIRTY, pause, then see no VMRs - while in the meantime a
         * VMR munmapped and wrote its dirty bit back to the page. */
        if (atomic_read(&page->pg_flags) & PG_DIRTY) {
                /* If we want to batch these, we'll also have to batch the
                 * freeing, which isn't a big deal.  Just do it before freeing
                 * and before unlocking the PM; we don't want someone to load
                 * the page from the backing store and get an old value. */
                pm->pm_op->writepage(pm, page);
        }
        /* All clear - the page is unused and (now) clean. */
        atomic_set(&page->pg_flags, 0); /* catch bugs */
        page_decref(page);
        return true;
}

/* Unused pages (not currently involved in a read, write, or mmap) are pruned.
 * Dirty pages are written back first.
 *
 * We ignore anything mapped in a VMR.  Not bothering with unmapping or
 * shootdowns or anything.  At least for now. */
void pm_free_unused_pages(struct page_map *pm)
{
        qlock(&pm->pm_qlock);
        radix_for_each_slot(&pm->pm_tree, __flush_unused_cb, pm);
        qunlock(&pm->pm_qlock);
}

static bool __destroy_cb(void **slot, unsigned long tree_idx, void *arg)
{
        struct page *page = pm_slot_get_page(*slot);

        /* Should be no users or need to sync */
        assert(pm_slot_check_refcnt(*slot) == 0);
        atomic_set(&page->pg_flags, 0); /* catch bugs */
        page_decref(page);
        return true;
}

void pm_destroy(struct page_map *pm)
{
        radix_for_each_slot(&pm->pm_tree, __destroy_cb, pm);
        radix_tree_destroy(&pm->pm_tree);
}

void print_page_map_info(struct page_map *pm)
{
        struct vm_region *vmr_i;

        printk("Page Map %p\n", pm);
        printk("\tNum pages: %lu\n", pm->pm_num_pages);
        spin_lock(&pm->pm_lock);
        TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
                printk("\tVMR proc %d: (%p - %p): 0x%08x, 0x%08x, %p, %p\n",
                       vmr_i->vm_proc->pid, vmr_i->vm_base, vmr_i->vm_end,
                       vmr_i->vm_prot, vmr_i->vm_flags,
                       foc_pointer(vmr_i->__vm_foc), vmr_i->vm_foff);
        }
        spin_unlock(&pm->pm_lock);
}

void pm_page_asserter(struct page *page, char *str)
{
        void **tree_slot = page->pg_tree_slot;

        if (!page_is_pagemap(page))
                return;
        assert(tree_slot);
        assert(pm_slot_get_page(*tree_slot) == page);
        assert(pm_slot_check_refcnt(*tree_slot) > 0);
}