1 /* Copyright (c) 2014 The Regents of the University of California
2 * Barret Rhoden <brho@cs.berkeley.edu>
3 * See LICENSE for details.
5 * Page mapping: maps an object (inode or block dev) in page size chunks.
 * Analogous to Linux's "struct address space" */
15 void pm_add_vmr(struct page_map *pm, struct vm_region *vmr)
17 /* note that the VMR being reverse-mapped by the PM is protected by the PM's
18 * lock. we clearly need a write lock here, but removal also needs a write
19 * lock, so later when removal holds this, it delays munmaps and keeps the
21 spin_lock(&pm->pm_lock);
22 TAILQ_INSERT_TAIL(&pm->pm_vmrs, vmr, vm_pm_link);
23 spin_unlock(&pm->pm_lock);
26 void pm_remove_vmr(struct page_map *pm, struct vm_region *vmr)
28 spin_lock(&pm->pm_lock);
29 TAILQ_REMOVE(&pm->pm_vmrs, vmr, vm_pm_link);
30 spin_unlock(&pm->pm_lock);
/* PM slot values (the void*s stored in the radix tree) are packed words:
 *
 * |--11--|--1--|----52 or 20 bits--|
 * | ref  | flag|   ppn of page     |
 * \ <--- meta shift -/
 *
 * The setter funcs return the void* that should update slot_val; they don't
 * change slot_val in place (it's a val, not the addr). */
/* NOTE(review): the two PM_FLAGS_SHIFT definitions are presumably selected by
 * a 64-bit vs 32-bit preprocessor conditional elided from this view - the ppn
 * field is 52 bits on 64-bit and 20 bits on 32-bit.  Confirm against the full
 * source. */
# define PM_FLAGS_SHIFT 52
# define PM_FLAGS_SHIFT 20
/* The bits above the flag bit hold the slot refcnt (uses of this slot) */
#define PM_REFCNT_SHIFT (PM_FLAGS_SHIFT + 1)
/* Single flag bit: the page is marked for removal */
#define PM_REMOVAL (1UL << PM_FLAGS_SHIFT)
51 static bool pm_slot_check_removal(void *slot_val)
53 return (unsigned long)slot_val & PM_REMOVAL ? TRUE : FALSE;
56 static void *pm_slot_set_removal(void *slot_val)
58 return (void*)((unsigned long)slot_val | PM_REMOVAL);
61 static void *pm_slot_clear_removal(void *slot_val)
63 return (void*)((unsigned long)slot_val & ~PM_REMOVAL);
66 static int pm_slot_check_refcnt(void *slot_val)
68 return (unsigned long)slot_val >> PM_REFCNT_SHIFT;
71 static void *pm_slot_inc_refcnt(void *slot_val)
73 return (void*)((unsigned long)slot_val + (1UL << PM_REFCNT_SHIFT));
76 static void *pm_slot_dec_refcnt(void *slot_val)
78 return (void*)((unsigned long)slot_val - (1UL << PM_REFCNT_SHIFT));
/* Extracts the page from a slot value: the low PM_FLAGS_SHIFT bits hold the
 * ppn; the meta bits (refcnt and removal flag) are masked off.
 * NOTE(review): lines elided from this view likely short-circuit a 0 (empty)
 * slot before the ppn2page conversion - confirm against the full source. */
static struct page *pm_slot_get_page(void *slot_val)
	return ppn2page((unsigned long)slot_val & ((1UL << PM_FLAGS_SHIFT) - 1));
88 static void *pm_slot_set_page(void *slot_val, struct page *pg)
90 assert(pg != pages); /* we should never alloc page 0, for sanity */
91 return (void*)(page2ppn(pg) | ((unsigned long)slot_val &
92 ~((1UL << PM_FLAGS_SHIFT) - 1)));
/* Initializes a PM.  Host should be an *inode or a *bdev (doesn't matter). The
 * reference this stores is uncounted. */
void pm_init(struct page_map *pm, struct page_map_operations *op, void *host)
	pm->pm_bdev = host;		/* note the uncounted ref */
	radix_tree_init(&pm->pm_tree);
	pm->pm_num_pages = 0;		/* no pages in a new pm */
	/* NOTE(review): a line elided from this view presumably stores 'op'
	 * (pm->pm_op = op) - confirm against the full source. */
	spinlock_init(&pm->pm_lock);
	TAILQ_INIT(&pm->pm_vmrs);
	/* removal flag: only one remover at a time (see pm_remove_contig) */
	atomic_set(&pm->pm_removal, 0);
/* Looks up the index'th page in the page map, returning a refcnt'd reference
 * that need to be dropped with pm_put_page, or 0 if it was not in the map. */
static struct page *pm_find_page(struct page_map *pm, unsigned long index)
	void *old_slot_val, *slot_val;
	struct page *page = 0;
	/* Read walking the PM tree TODO: (RCU) */
	spin_lock(&pm->pm_lock);
	/* We're syncing with removal.  The deal is that if we grab the page (and
	 * we'd only do that if the page != 0), we up the slot ref and clear
	 * removal.  A remover will only remove it if removal is still set.  If we
	 * grab and release while removal is in progress, even though we no longer
	 * hold the ref, we have unset removal.  Also, to prevent removal where we
	 * get a page well before the removal process, the removal won't even bother
	 * when the slot refcnt is upped. */
	tree_slot = radix_lookup_slot(&pm->pm_tree, index);
	/* CAS loop: re-read the slot and retry if a racer changed it under us */
		old_slot_val = ACCESS_ONCE(*tree_slot);
		slot_val = old_slot_val;
		page = pm_slot_get_page(slot_val);
		/* grabbing the page: clear removal and take a slot ref together */
		slot_val = pm_slot_clear_removal(slot_val);
		slot_val = pm_slot_inc_refcnt(slot_val);	/* not a page kref */
	} while (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val));
	/* sanity: the page's back-pointer must match the slot we found it in */
	assert(page->pg_tree_slot == tree_slot);
	spin_unlock(&pm->pm_lock);
/* Attempts to insert the page into the page_map, returns 0 for success, or an
 * error code if there was one already (EEXIST) or we ran out of memory.
 *
 * On success, callers *lose* their page ref, but get a PM slot ref.  This slot
 * ref is sufficient to keep the page alive (slot ref protects the page ref)..
 *
 * Makes no assumptions about the quality of the data loaded, that's up to the
 * caller. */
static int pm_insert_page(struct page_map *pm, unsigned long index,
	/* write locking the PM */
	spin_lock(&pm->pm_lock);
	page->pg_mapping = pm;	/* debugging */
	page->pg_index = index;
	/* no one should be looking at the tree slot til we stop write locking. the
	 * only other one who looks is removal, who requires a PM write lock. */
	page->pg_tree_slot = (void*)0xdeadbeef;	/* poison */
	slot_val = pm_slot_inc_refcnt(slot_val);
	/* passing the page ref from the caller to the slot */
	slot_val = pm_slot_set_page(slot_val, page);
	/* shouldn't need a CAS or anything for the slot write, since we hold the
	 * write lock.  o/w, we'd need to get the slot and CAS instead of insert. */
	ret = radix_insert(&pm->pm_tree, index, slot_val, &tree_slot);
	/* error path: radix_insert failed (e.g. EEXIST/ENOMEM) */
	spin_unlock(&pm->pm_lock);
	/* success: record the slot so pm_put_page/removal can find it */
	page->pg_tree_slot = tree_slot;
	spin_unlock(&pm->pm_lock);
/* Decrefs the PM slot ref (usage of a PM page).  The PM's page ref remains. */
void pm_put_page(struct page *page)
	void **tree_slot = page->pg_tree_slot;

	/* decref, don't care about CASing: subtracting one slot ref directly on
	 * the packed slot word (the refcnt lives above PM_REFCNT_SHIFT) */
	atomic_add((atomic_t*)tree_slot, -(1UL << PM_REFCNT_SHIFT));
/* Makes sure the index'th page of the mapped object is loaded in the page cache
 * and returns its location via **pp.
 *
 * You'll get a pm-slot refcnt back, which you need to put when you're done. */
int pm_load_page(struct page_map *pm, unsigned long index, struct page **pp)
	page = pm_find_page(pm, index);
	/* miss path: allocate a fresh page and try to insert it */
		if (kpage_alloc(&page))
		/* important that UP_TO_DATE is not set.  once we put it in the PM,
		 * others can find it, and we still need to fill it. */
		atomic_set(&page->pg_flags, PG_LOCKED | PG_PAGEMAP);
		page->pg_sem.nr_signals = 0;	/* preemptively locking */
		error = pm_insert_page(pm, index, page);
		/* we inserted it: we hold the (locked) page and must fill it */
			goto load_locked_page;
		/* the page was mapped already (benign race), just get rid of
		 * our page and try again (the only case that uses the while) */
		page = pm_find_page(pm, index);
	assert(page && pm_slot_check_refcnt(*page->pg_tree_slot));
	if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
		printd("pm %p FOUND page %p, addr %p, idx %d\n", pm, page,
		       page2kva(page), index);
	/* double-check.  if we blocked on lock_page, it was probably for someone
	 * else loading.  plus, we can't load a page more than once (it could
	 * clobber newer writes) */
	if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
	/* actually read the backing store into the page */
	error = pm->pm_op->readpage(pm, page);
	assert(atomic_read(&page->pg_flags) & PG_UPTODATE);
	printd("pm %p LOADS page %p, addr %p, idx %d\n", pm, page,
	       page2kva(page), index);
/* Non-blocking variant of pm_load_page: looks up the page but will not sleep
 * to read it in.  NOTE(review): the not-found and not-up-to-date return paths
 * are elided from this view - presumably they fail with an error instead of
 * blocking; confirm against the full source. */
int pm_load_page_nowait(struct page_map *pm, unsigned long index,
	struct page *page = pm_find_page(pm, index);
	if (!(atomic_read(&page->pg_flags) & PG_UPTODATE)) {
		/* TODO: could have a read_nowait pm_op */
265 static bool vmr_has_page_idx(struct vm_region *vmr, unsigned long pg_idx)
267 unsigned long nr_pgs = (vmr->vm_end - vmr->vm_base) >> PGSHIFT;
268 unsigned long start_pg = vmr->vm_foff >> PGSHIFT;
269 return ((start_pg <= pg_idx) && (pg_idx < start_pg + nr_pgs));
/* Converts a page-map index to the virtual address it maps to within vmr.
 * Callers must ensure the index is covered by the VMR (vmr_has_page_idx).
 * NOTE(review): the return statement is elided from this view - presumably
 * 'return (void*)va;' - confirm against the full source. */
static void *vmr_idx_to_va(struct vm_region *vmr, unsigned long pg_idx)
	uintptr_t va = vmr->vm_base + ((pg_idx << PGSHIFT) - vmr->vm_foff);
	assert(va < vmr->vm_end);
279 static unsigned long vmr_get_end_idx(struct vm_region *vmr)
281 return ((vmr->vm_end - vmr->vm_base) + vmr->vm_foff) >> PGSHIFT;
284 static void vmr_for_each(struct vm_region *vmr, unsigned long pg_idx,
285 unsigned long max_nr_pgs, mem_walk_callback_t callback)
287 void *start_va = vmr_idx_to_va(vmr, pg_idx);
288 size_t len = vmr->vm_end - (uintptr_t)start_va;
289 len = MIN(len, max_nr_pgs << PGSHIFT);
290 /* TODO: start using pml_for_each, across all arches */
291 env_user_mem_walk(vmr->vm_proc, start_va, len, callback, 0);
/* These next two helpers are called on a VMR's range of VAs corresponding to
 * pages in a PM undergoing removal.
297 * In general, it is safe to mark !P or 0 a PTE so long as the page the PTE
298 * points to belongs to a PM. We'll refault, find the page, and rebuild the
299 * PTE. This allows us to handle races like: {pm marks !p, {fault, find page,
300 * abort removal, write new pte}, pm clears pte}.
302 * In that race, HPF is writing the PTE, which removal code subsequently looks
303 * at to determine if the underlying page is dirty. We need to make sure no one
304 * clears dirty bits unless they handle the WB (or discard). HPF preserves the
305 * dirty bit for this reason. */
/* mem-walk callback (first removal pass): revokes 'present' on any PTE whose
 * backing page is flagged PG_REMOVAL, so subsequent accesses fault and can be
 * detected/refaulted. */
static int __pm_mark_not_present(struct proc *p, pte_t pte, void *va, void *arg)
	/* mapped includes present.  Any PTE pointing to a page (mapped) will get
	 * flagged for removal and have its access prots revoked.  We need to deal
	 * with mapped-but-maybe-not-present in case of a dirtied file that was
	 * mprotected to PROT_NONE (which is not present) */
	if (pte_is_unmapped(pte))
	page = pa2page(pte_get_paddr(pte));
	/* only touch pages the remover marked; others are left alone */
	if (atomic_read(&page->pg_flags) & PG_REMOVAL)
		pte_clear_present(pte);
/* mem-walk callback (second removal pass, writable VMRs): records the PTE's
 * dirty bit into the page's PG_DIRTY flag, then (in code elided from this
 * view) completes the unmap of PTEs for pages marked PG_REMOVAL. */
static int __pm_mark_dirty_pgs_unmap(struct proc *p, pte_t pte, void *va,
	/* we're not checking for 'present' or not, since we marked them !P earlier.
	 * but the CB is still called on everything in the range.  we can tell the
	 * formerly-valid PTEs from the completely unmapped, since the latter are
	 * unmapped, while the former have other things in them, but just are !P. */
	if (pte_is_unmapped(pte))
	page = pa2page(pte_get_paddr(pte));
	/* need to check for removal again, just like in mark_not_present */
	if (atomic_read(&page->pg_flags) & PG_REMOVAL) {
		/* propagate hardware dirty info to the page before the PTE goes away */
		if (pte_is_dirty(pte))
			atomic_or(&page->pg_flags, PG_DIRTY);
/* mem-walk callback (second removal pass, read-only VMRs): completes the unmap
 * for PTEs whose pages are marked PG_REMOVAL; no dirty-bit handling needed.
 * NOTE(review): the actual PTE-clearing line is elided from this view. */
static int __pm_mark_unmap(struct proc *p, pte_t pte, void *va, void *arg)
	if (pte_is_unmapped(pte))
	page = pa2page(pte_get_paddr(pte));
	if (atomic_read(&page->pg_flags) & PG_REMOVAL)
/* Issues a full TLB shootdown for each proc collected in proc_ptrs.
 * NOTE(review): the line resetting *arr_idx back to 0 (implied by the name and
 * by callers reusing the array) is elided from this view - confirm. */
static void shootdown_and_reset_ptrstore(void *proc_ptrs[], int *arr_idx)
	for (int i = 0; i < *arr_idx; i++)
		proc_tlbshootdown((struct proc*)proc_ptrs[i], 0, 0);
/* Attempts to remove pages from the pm, from [index, index + nr_pgs).  Returns
 * the number of pages removed.  There can only be one remover at a time per PM
 * - others will return 0.
 *
 * Overall plan: (1) flag candidate pages/slots with REMOVAL, (2) mark their
 * PTEs !P in all VMRs and shoot down TLBs, (3) harvest dirty bits and unmap,
 * (4) write back dirty pages, (5) delete still-flagged slots from the tree. */
int pm_remove_contig(struct page_map *pm, unsigned long index,
                     unsigned long nr_pgs)
	void *old_slot_val, *slot_val;
	struct vm_region *vmr_i;
	bool pm_has_pinned_vmrs = FALSE;
	/* using this for both procs and later WBs */
	#define PTR_ARR_LEN 10
	void *ptr_store[PTR_ARR_LEN];
	int ptr_free_idx = 0;

	/* could also call a simpler remove if nr_pgs == 1 */
	/* only one remover at a time (since we walk the PM multiple times as our
	 * 'working list', and need the REMOVAL flag to tell us which pages we're
	 * working on.  with more than one remover, we'd be confused and would need
	 * extra sync.) */
	if (atomic_swap(&pm->pm_removal, 1)) {
		/* We got a 1 back, so someone else is already removing */
	/* TODO: RCU: we're read walking the PM tree and write walking the VMR list.
	 * the reason for the write lock is since we need to prevent new VMRs or the
	 * changing of a VMR to being pinned.  o/w, we could fail to unmap and check
	 * the pages properly. */
	spin_lock(&pm->pm_lock);
	assert(index + nr_pgs > index);	/* til we figure out who validates */
	/* check for any pinned VMRs.  if we have none, then we can skip some loops
	 * below. */
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
		if (vmr_i->vm_flags & MAP_LOCKED)
			pm_has_pinned_vmrs = TRUE;
	/* this pass, we mark pages for removal */
	for (i = index; i < index + nr_pgs; i++) {
		if (pm_has_pinned_vmrs) {
			/* for pinned pages, we don't even want to attempt to remove them */
			TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
				/* once we've found a pinned page, we can skip over the rest of
				 * the range of pages mapped by this vmr - even if the vmr
				 * hasn't actually faulted them in yet. */
				if ((vmr_i->vm_flags & MAP_LOCKED) &&
				    (vmr_has_page_idx(vmr_i, i))) {
					i = vmr_get_end_idx(vmr_i) - 1;	/* loop will +1 */
					goto next_loop_mark_rm;
		/* TODO: would like a radix_next_slot() iterator (careful with skipping
		 * chunks of the loop) */
		tree_slot = radix_lookup_slot(&pm->pm_tree, i);
		/* CAS loop: flag the slot for removal, retrying on races */
			old_slot_val = ACCESS_ONCE(*tree_slot);
			slot_val = old_slot_val;
			page = pm_slot_get_page(slot_val);
			/* syncing with lookups, writebacks, etc.  only one remover per pm in
			 * general.  any new ref-getter (WB, lookup, etc) will clear removal,
			 * causing us to abort later. */
			if (pm_slot_check_refcnt(slot_val))
			/* it's possible that removal is already set, if we happened to repeat a
			 * loop (due to running out of space in the proc arr) */
			slot_val = pm_slot_set_removal(slot_val);
			if (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val))
		/* mark the page itself.  this isn't used for syncing - just out of
		 * convenience for ourselves (memwalk callbacks are easier).  need the
		 * atomic in case a new user comes in and tries mucking with the flags*/
		atomic_or(&page->pg_flags, PG_REMOVAL);
	/* second pass, over VMRs instead of pages.  we remove the marked pages from
	 * all VMRs, collecting the procs for batch shootdowns.  not sure how often
	 * we'll have more than one VMR (for a PM) per proc.  shared libs tend to
	 * have a couple, so we'll still batch things up */
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
		/* might have some pinned VMRs that only map part of the file we aren't
		 * messing with (so they didn't trigger earlier). */
		if (vmr_i->vm_flags & MAP_LOCKED)
		/* Private mappings: for each page, either PMs have a separate copy
		 * hanging off their PTE (in which case they aren't using the PM page,
		 * and the actual page in use won't have PG_REMOVAL set, and the CB will
		 * ignore it), or they are still using the shared version.  In which
		 * case they haven't written it yet, and we can remove it.  If they
		 * concurrently are performing a write fault to CoW the page, they will
		 * incref and clear REMOVAL, thereby aborting the remove anyways.
		 *
		 * Though if the entire mapping is unique-copies of private pages, we
		 * won't need a shootdown.  mem_walk can't handle this yet though. */
		if (!vmr_has_page_idx(vmr_i, index))
		spin_lock(&vmr_i->vm_proc->pte_lock);
		/* all PTEs for pages marked for removal are marked !P for the entire
		 * range.  it's possible we'll remove some extra PTEs (races with
		 * loaders, etc), but those pages will remain in the PM and should get
		 * soft-faulted back in. */
		vmr_for_each(vmr_i, index, nr_pgs, __pm_mark_not_present);
		spin_unlock(&vmr_i->vm_proc->pte_lock);
		/* batching TLB shootdowns for a given proc (continue if found).
		 * the proc stays alive while we hold a read lock on the PM tree,
		 * since the VMR can't get yanked out yet. */
		for (i = 0; i < ptr_free_idx; i++) {
			if (ptr_store[i] == vmr_i->vm_proc)
		if (i != ptr_free_idx)
		/* flush the batch if the array is full, then record this proc */
		if (ptr_free_idx == PTR_ARR_LEN)
			shootdown_and_reset_ptrstore(ptr_store, &ptr_free_idx);
		ptr_store[ptr_free_idx++] = vmr_i->vm_proc;
	/* Need to shootdown so that all TLBs have the page marked absent.  Then we
	 * can check the dirty bit, now that concurrent accesses will fault.  btw,
	 * we have a lock ordering: pm (RCU) -> proc lock (state, vcmap, etc) */
	shootdown_and_reset_ptrstore(ptr_store, &ptr_free_idx);
	/* Now that we've shotdown, we can check for dirtiness.  One downside to
	 * this approach is we check every VMR for a page, even once we know the
	 * page is dirty.  We also need to unmap the pages (set ptes to 0) for any
	 * that we previously marked not present (complete the unmap).  We're racing
	 * with munmap here, which treats the PTE as a weak ref on a page. */
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
		if (vmr_i->vm_flags & MAP_LOCKED)
		if (!vmr_has_page_idx(vmr_i, index))
		spin_lock(&vmr_i->vm_proc->pte_lock);
		/* writable VMRs might have dirtied the page; read-only can't have */
		if (vmr_i->vm_prot & PROT_WRITE)
			vmr_for_each(vmr_i, index, nr_pgs, __pm_mark_dirty_pgs_unmap);
			vmr_for_each(vmr_i, index, nr_pgs, __pm_mark_unmap);
		spin_unlock(&vmr_i->vm_proc->pte_lock);
	/* Now we'll go through from the PM again and deal with pages that are
	 * dirty. */
	for (/* i set already */; i < index + nr_pgs; i++) {
		/* TODO: consider putting in the pinned check & advance again.  Careful,
		 * since we could unlock on a handle_dirty loop, and skipping could skip
		 * over a new VMR, but those pages would still be marked for removal.
		 * It's not wrong, currently, to have spurious REMOVALs. */
		tree_slot = radix_lookup_slot(&pm->pm_tree, i);
		page = pm_slot_get_page(*tree_slot);
		/* only operate on pages we marked earlier */
		if (!(atomic_read(&page->pg_flags) & PG_REMOVAL))
		/* if someone has used it since we grabbed it, we lost the race and
		 * won't remove it later.  no sense writing it back now either. */
		if (!pm_slot_check_removal(*tree_slot)) {
			/* since we set PG_REMOVAL, we're the ones to clear it */
			atomic_and(&page->pg_flags, ~PG_REMOVAL);
		/* this dirty flag could also be set by write()s, not just VMRs */
		if (atomic_read(&page->pg_flags) & PG_DIRTY) {
			/* need to bail out.  after we WB, we'll restart this big loop where
			 * we left off ('i' is still set) */
			if (ptr_free_idx == PTR_ARR_LEN)
			ptr_store[ptr_free_idx++] = page;
			/* once we've decided to WB, we can clear the dirty flag.  might
			 * have an extra WB later, but we won't miss new data */
			atomic_and(&page->pg_flags, ~PG_DIRTY);
	/* we're unlocking, meaning VMRs and the radix tree can be changed, but we
	 * are still the only remover.  still can have new refs that clear REMOVAL */
	spin_unlock(&pm->pm_lock);
	/* could batch these up, etc. */
	for (int j = 0; j < ptr_free_idx; j++)
		pm->pm_op->writepage(pm, (struct page*)ptr_store[j]);
	spin_lock(&pm->pm_lock);
	/* bailed out of the dirty check loop earlier, need to finish and WB.  i is
	 * still set to where we failed and left off in the big loop. */
	if (i < index + nr_pgs)
	/* TODO: RCU - we need a write lock here (the current spinlock is fine) */
	/* All dirty pages were WB, anything left as REMOVAL can be removed */
	for (i = index; i < index + nr_pgs; i++) {
		/* TODO: consider putting in the pinned check & advance again */
		tree_slot = radix_lookup_slot(&pm->pm_tree, i);
			old_slot_val = ACCESS_ONCE(*tree_slot);
			slot_val = old_slot_val;
			page = pm_slot_get_page(*tree_slot);
		if (!(atomic_read(&page->pg_flags) & PG_REMOVAL))
		/* syncing with lookups, writebacks, etc.  if someone has used it since
		 * we started removing, they would have cleared the slot's REMOVAL (but
		 * not PG_REMOVAL), though the refcnt could be back down to 0 again. */
		if (!pm_slot_check_removal(slot_val)) {
			/* since we set PG_REMOVAL, we're the ones to clear it */
			atomic_and(&page->pg_flags, ~PG_REMOVAL);
		if (pm_slot_check_refcnt(slot_val))
			warn("Unexpected refcnt in PM remove!");
		/* Note that we keep slot REMOVAL set, so the radix tree thinks it's
		 * still an item (artifact of that implementation). */
		slot_val = pm_slot_set_page(slot_val, 0);
		if (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val)) {
			/* lost the race; the new user owns the page now */
			atomic_and(&page->pg_flags, ~PG_REMOVAL);
		/* at this point, we're free at last!  When we update the radix tree, it
		 * still thinks it has an item.  This is fine.  Lookups will now fail
		 * (since the page is 0), and insertions will block on the write lock.*/
		atomic_set(&page->pg_flags, 0);	/* cause/catch bugs */
		radix_delete(&pm->pm_tree, i);
	pm->pm_num_pages -= nr_removed;
	spin_unlock(&pm->pm_lock);
	/* open the PM up for the next remover */
	atomic_set(&pm->pm_removal, 0);
596 void print_page_map_info(struct page_map *pm)
598 struct vm_region *vmr_i;
599 printk("Page Map %p\n", pm);
600 printk("\tNum pages: %lu\n", pm->pm_num_pages);
601 spin_lock(&pm->pm_lock);
602 TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
603 printk("\tVMR proc %d: (%p - %p): 0x%08x, 0x%08x, %p, %p\n",
604 vmr_i->vm_proc->pid, vmr_i->vm_base, vmr_i->vm_end,
605 vmr_i->vm_prot, vmr_i->vm_flags, vmr_i->vm_file, vmr_i->vm_foff);
607 spin_unlock(&pm->pm_lock);