akaros/kern/arch/x86/pmap64.c
   1/* Copyright (c) 2013 The Regents of the University of California
   2 * Barret Rhoden <brho@cs.berkeley.edu>
   3 * See LICENSE for details.
   4 *
   5 * 64 bit virtual memory / address space management (and a touch of pmem).
   6 *
   7 * TODO:
    8 * - better testing: check my helper funcs, a variety of inserts/segments,
    9 * remove it all, etc (esp with jumbos).  Check permissions and the
   10 * existence of mappings.
  11 * - mapping segments doesn't support having a PTE already present
  12 * - mtrrs break big machines
  13 * - jumbo pages are only supported at the VM layer, not PM (a jumbo is 2^9
  14 * little pages, for example)
  15 * - usermemwalk and freeing might need some help (in higher layers of the
  16 * kernel). */
  17
  18#include <arch/x86.h>
  19#include <arch/arch.h>
  20#include <arch/mmu.h>
  21#include <arch/apic.h>
  22#include <error.h>
  23#include <sys/queue.h>
  24#include <atomic.h>
  25#include <string.h>
  26#include <assert.h>
  27#include <pmap.h>
  28#include <env.h>
  29#include <stdio.h>
  30#include <kmalloc.h>
  31#include <page_alloc.h>
  32#include <umem.h>
  33
  34extern char boot_pml4[], gdt64[], gdt64desc[];
  35pgdir_t boot_pgdir;
  36physaddr_t boot_cr3;
  37segdesc_t *gdt;
  38pseudodesc_t gdt_pd;
  39
  40#define PG_WALK_SHIFT_MASK              0x00ff  /* first byte = target shift */
  41#define PG_WALK_CREATE                  0x0100
  42
  43kpte_t *pml_walk(kpte_t *pml, uintptr_t va, int flags);
  44typedef int (*kpte_cb_t)(kpte_t *kpte, uintptr_t kva, int pml_shift,
  45                        bool visited_subs, void *arg);
  46int pml_for_each(kpte_t *pml, uintptr_t start, size_t len, kpte_cb_t callback,
  47                 void *arg);
  48/* Helpers for PML for-each walks */
  49static inline bool pte_is_final(pte_t pte, int pml_shift)
  50{
  51        return (pml_shift == PML1_SHIFT) || pte_is_jumbo(pte);
  52}
  53
  54static inline bool pte_is_intermediate(pte_t pte, int pml_shift)
  55{
  56        return !pte_is_final(pte, pml_shift);
  57}
  58
  59/* Helper: gets the kpte_t pointer which is the base of the PML4 from pgdir */
  60static kpte_t *pgdir_get_kpt(pgdir_t pgdir)
  61{
  62        return pgdir.kpte;
  63}
  64
  65/* Helper: returns true if we do not need to walk the page table any further.
  66 *
  67 * The caller may or may not know if a jumbo is desired.  pml_shift determines
  68 * which layer we are at in the page walk, and flags contains the target level
  69 * we're looking for, like a jumbo or a default.
  70 *
  71 * Regardless of the desired target, if we find a jumbo page, we're also done.
  72 */
  73static bool walk_is_complete(kpte_t *kpte, int pml_shift, int flags)
  74{
  75        if ((pml_shift == (flags & PG_WALK_SHIFT_MASK)) || (*kpte & PTE_PS))
  76                return TRUE;
  77        return FALSE;
  78}
  79
  80/* PTE_ADDR should only be used on a PTE that has a physical address of the next
  81 * PML inside.  i.e., not a final PTE in the page table walk. */
  82static kpte_t *kpte2pml(kpte_t kpte)
  83{
  84        return (kpte_t*)KADDR(PTE_ADDR(kpte));
  85}
  86
  87static kpte_t *__pml_walk(kpte_t *pml, uintptr_t va, int flags, int pml_shift)
  88{
  89        kpte_t *kpte;
  90        epte_t *epte;
  91        void *new_pml_kva;
  92
  93        kpte = &pml[PMLx(va, pml_shift)];
  94        epte = kpte_to_epte(kpte);
  95        if (walk_is_complete(kpte, pml_shift, flags))
  96                return kpte;
  97        if (!kpte_is_present(kpte)) {
  98                if (!(flags & PG_WALK_CREATE))
  99                        return NULL;
  100                new_pml_kva = kpages_alloc(2 * PGSIZE, MEM_WAIT);
  101                /* Might want better error handling (we're probably out of
  102                 * memory) */
  103                if (!new_pml_kva)
  104                        return NULL;
  105                memset(new_pml_kva, 0, PGSIZE * 2);
 106                /* We insert the new PT into the PML with U and W perms.
 107                 * Permissions on page table walks are anded together (if any of
 108                 * them are !User, the translation is !User).  We put the perms
 109                 * on the last entry, not the intermediates. */
 110                *kpte = PADDR(new_pml_kva) | PTE_P | PTE_U | PTE_W;
 111                /* For a dose of paranoia, we'll avoid mapping intermediate
 112                 * eptes when we know we're using an address that should never
  113                 * be ept-accessible. */
 114                if (va < ULIM) {
 115                        /* The physaddr of the new_pml is one page higher than
 116                         * the KPT page.
 117                         * A few other things:
 118                         * - for the same reason that we have U and X set on all
 119                         *   intermediate PTEs, we now set R, X, and W for the
 120                         *   EPTE.
 121                         * - All EPTEs have U perms
  122                         * - We can't use epte_write since we're working on
 123                         *   intermediate PTEs, and they don't have the memory
 124                         *   type set. */
 125                        *epte = (PADDR(new_pml_kva) + PGSIZE) | EPTE_R | EPTE_X
 126                                | EPTE_W;
 127                }
 128        }
 129        return __pml_walk(kpte2pml(*kpte), va, flags, pml_shift - BITS_PER_PML);
 130}
 131
 132/* Returns a pointer to the page table entry corresponding to va.  Flags has
 133 * some options and selects which level of the page table we're happy with
 134 * stopping at.  Normally, this is PML1 for a normal page (e.g. flags =
 135 * PML1_SHIFT), but could be for a jumbo page (PML3 or PML2 entry).
 136 *
  137 * Flags also controls whether or not intermediate page tables are created.
  138 * This is useful for when we are checking whether or not a mapping
 139 * exists, but aren't interested in creating intermediate tables that will not
 140 * get filled.  When we want to create intermediate pages (i.e. we're looking
 141 * for the PTE to insert a page), pass in PG_WALK_CREATE with flags.
 142 *
 143 * Returns 0 on error or absence of a PTE for va. */
 144kpte_t *pml_walk(kpte_t *pml, uintptr_t va, int flags)
 145{
 146        return __pml_walk(pml, va, flags, PML4_SHIFT);
 147}
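
/* Editor's note: a minimal usage sketch, not part of the original file.  It
 * shows how the flags argument combines a target shift with PG_WALK_CREATE;
 * the helper name is hypothetical.  Passing just PML1_SHIFT asks for a 4 KB
 * PTE without building missing intermediate tables, so a NULL return means
 * nothing (not even a jumbo) covers va. */
static kpte_t *example_lookup_kpte(pgdir_t pgdir, uintptr_t va)
{
        /* An existing jumbo mapping would come back as its PML2/PML3 entry. */
        return pml_walk(pgdir_get_kpt(pgdir), va, PML1_SHIFT);
}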
 148
 149/* Helper: determines how much va needs to be advanced until it is aligned to
 150 * pml_shift. */
 151static uintptr_t amt_til_aligned(uintptr_t va, int pml_shift)
 152{
  153        /* find the lower bits of va, subtract them from 1 << pml_shift to
  154         * see what we would need to add to get aligned.  va might already be
  155         * aligned, yielding a full 1 << pml_shift, so we mask that off again. */
 156        return ((1UL << pml_shift) - (va & ((1UL << pml_shift) - 1))) &
 157               ((1UL << pml_shift) - 1);
 158}
 159
 160/* Helper: determines how much of size we can take, in chunks of pml_shift */
 161static uintptr_t amt_of_aligned_bytes(uintptr_t size, int pml_shift)
 162{
 163        /* creates a mask all 1s from MSB down to (including) shift */
 164        return (~((1UL << pml_shift) - 1)) & size;
 165}
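
/* Editor's note: an illustrative check of the two helpers above, not in the
 * original file.  The addresses are made up; PML2_SHIFT chunks are 2 MB. */
static void example_alignment_math(void)
{
        /* 0x201000 needs another 0x1ff000 bytes to reach the next 2 MB boundary */
        assert(amt_til_aligned(0x201000, PML2_SHIFT) == 0x1ff000);
        /* of a 5 MB span, only 4 MB can be covered by whole 2 MB chunks */
        assert(amt_of_aligned_bytes(0x500000, PML2_SHIFT) == 0x400000);
}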
 166
 167/* Helper: Advance kpte, given old_pte.  Will do pml walks when necessary. */
 168static kpte_t *get_next_pte(kpte_t *old_pte, kpte_t *pgdir, uintptr_t va,
 169                            int flags)
 170{
 171        /* PTEs (undereferenced) are addresses within page tables.  so long as
 172         * we stay inside the PML, we can just advance via pointer arithmetic.
 173         * if we advance old_pte and it points to the beginning of a page
 174         * (offset == 0), we've looped outside of our original PML, and need to
 175         * get a new one. */
 176        old_pte++;
 177        if (!PGOFF(old_pte))
 178                return pml_walk(pgdir, va, flags);
 179        return old_pte;
 180}
 181
 182/* Helper: maps pages from va to pa for size bytes, all for a given page size */
 183static void map_my_pages(kpte_t *pgdir, uintptr_t va, size_t size,
 184                         physaddr_t pa, int perm, int pml_shift)
 185{
 186        /* set to trigger a pml walk on the first get_next */
 187        kpte_t *kpte = (kpte_t*)PGSIZE - 1;
 188        size_t pgsize = 1UL << pml_shift;
 189
 190        for (size_t i = 0; i < size; i += pgsize, va += pgsize,
 191             pa += pgsize) {
 192                kpte = get_next_pte(kpte, pgdir, va,
 193                                    PG_WALK_CREATE | pml_shift);
 194                assert(kpte);
 195                pte_write(kpte, pa, perm | (pml_shift != PML1_SHIFT ? PTE_PS
 196                                                                    : 0));
  197                printd("Wrote *kpte %p, for va %p to pa %p, covers %p\n",
  198                       *kpte, va, pa, pgsize);
 199        }
 200}
 201
 202/* Maps all pages possible from va->pa, up to size, preferring to use pages of
 203 * type pml_shift (size == (1 << shift)).  Assumes that it is possible to map va
 204 * to pa at the given shift. */
 205static uintptr_t __map_segment(kpte_t *pgdir, uintptr_t va, size_t size,
 206                               physaddr_t pa, int perm, int pml_shift)
 207{
 208        printd("__map_segment, va %p, size %p, pa %p, shift %d\n", va, size,
 209               pa, pml_shift);
 210        uintptr_t amt_to_submap, amt_to_map, amt_mapped = 0;
 211
 212        amt_to_submap = amt_til_aligned(va, pml_shift);
 213        amt_to_submap = MIN(amt_to_submap, size);
 214        if (amt_to_submap) {
 215                amt_mapped = __map_segment(pgdir, va, amt_to_submap, pa, perm,
 216                                           pml_shift - BITS_PER_PML);
 217                va += amt_mapped;
 218                pa += amt_mapped;
 219                size -= amt_mapped;
 220        }
 221        /* Now we're either aligned and ready to map, or size == 0 */
 222        amt_to_map = amt_of_aligned_bytes(size, pml_shift);
 223        if (amt_to_map) {
 224                map_my_pages(pgdir, va, amt_to_map, pa, perm, pml_shift);
 225                va += amt_to_map;
 226                pa += amt_to_map;
 227                size -= amt_to_map;
 228                amt_mapped += amt_to_map;
 229        }
 230        /* Map whatever is left over */
 231        if (size)
 232                amt_mapped += __map_segment(pgdir, va, size, pa, perm,
 233                                            pml_shift - BITS_PER_PML);
 234        return amt_mapped;
 235}
 236
 237/* Returns the maximum pml shift possible between a va->pa mapping.  It is the
 238 * number of least-significant bits the two addresses have in common.  For
 239 * instance, if the two pages are 0x456000 and 0x156000, this returns 20.  For
 240 * regular pages, it will be at least 12 (every page ends in 0x000).
 241 *
  242 * The max pml shift possible for a va->pa mapping is determined by the
  243 * lowest bit that differs between va and pa.
 244 *
 245 * We can optimize this a bit, since we know the first 12 bits are the same, and
 246 * we won't go higher than max_pml_shift. */
 247static int max_possible_shift(uintptr_t va, uintptr_t pa)
 248{
 249        int shift = 0;
 250        if (va == pa)
 251                return sizeof(uintptr_t) * 8;
 252        while ((va & 1) == (pa & 1)) {
 253                va >>= 1;
 254                pa >>= 1;
 255                shift++;
 256        }
 257        return shift;
 258}
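
/* Editor's note: a quick illustration of the comment's example, not part of
 * the original file.  0x456000 ^ 0x156000 == 0x300000, so the lowest differing
 * bit is bit 20 and at most a 2^20-aligned (1 MB) mapping could line up. */
static void example_max_shift(void)
{
        assert(max_possible_shift(0x456000, 0x156000) == 20);
        /* identical addresses share every bit */
        assert(max_possible_shift(0x123000, 0x123000) == sizeof(uintptr_t) * 8);
}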
 259
 260/* Map [va, va+size) of virtual (linear) address space to physical [pa, pa+size)
 261 * in the page table rooted at pgdir.  Size is a multiple of PGSIZE.  Use
 262 * permission bits perm for the entries.  Set pml_shift to the shift of the
 263 * largest page size you're willing to use.
 264 *
 265 * Doesn't handle having pages currently mapped yet, and while supporting that
 266 * is relatively easy, doing an insertion of small pages into an existing jumbo
 267 * would be trickier.  Might have the vmem region code deal with this.
 268 *
 269 * Don't use this to set the PAT flag on jumbo pages in perm, unless you are
  270 * absolutely sure you won't map regular pages.  */
 271void map_segment(pgdir_t pgdir, uintptr_t va, size_t size, physaddr_t pa,
 272                 int perm, int pml_shift)
 273{
 274        int max_shift_possible;
 275        if (PGOFF(va) || PGOFF(pa) || PGOFF(size))
 276                panic("Asked to map with bad alignment.  va %p, pa %p, size %p\n",
 277                      va, pa, size);
 278        /* Given the max_page_size, try and use larger pages.  We'll figure out
 279         * the largest possible jumbo page, up to whatever we were asked for. */
 280        if (pml_shift != PGSHIFT) {
 281                max_shift_possible = max_possible_shift(va, pa);
 282                max_shift_possible = MIN(max_shift_possible,
 283                                         arch_max_jumbo_page_shift());
 284                /* Assumes we were given a proper PML shift 12, 21, 30, etc */
 285                while (pml_shift > max_shift_possible)
 286                        pml_shift -= BITS_PER_PML;
 287        }
 288        assert((pml_shift == PML1_SHIFT) ||
 289               (pml_shift == PML2_SHIFT) ||
 290               (pml_shift == PML3_SHIFT));
 291        __map_segment(pgdir_get_kpt(pgdir), va, size, pa, perm, pml_shift);
 292}
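
/* Editor's note: a usage sketch, not part of the original file.  The va/pa/len
 * arguments are hypothetical and must be page aligned (map_segment panics
 * otherwise); the permission bits and the jumbo-shift helper are the same ones
 * vm_init() uses below. */
static void example_map_kernel_region(uintptr_t va, physaddr_t pa, size_t len)
{
        /* Let map_segment pick the largest page size that the hardware and the
         * va/pa alignment allow, up to arch_max_jumbo_page_shift(). */
        map_segment(boot_pgdir, va, len, pa, PTE_KERN_RW | PTE_G,
                    arch_max_jumbo_page_shift());
}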
 293
 294/* For every PTE in [start, start + len), call callback(kpte, shift,
 295 * etc), including the not present PTEs.  pml_shift is the shift/size of pml.
 296 *
 297 * This will recurse down into sub PMLs, and perform the CB in a
 298 * depth-first-search.  The CB will be told which level of the paging it is at,
 299 * via 'shift'.
 300 *
  301 * The CB will also run on intermediate PTEs, i.e. PTEs that point to page
  302 * tables (and not to (jumbo) pages).  If the CB returns anything other than
  303 * 0, we'll abort and propagate that back out from for_each. */
 304static int __pml_for_each(kpte_t *pml,  uintptr_t start, size_t len,
 305                          kpte_cb_t callback, void *arg, int pml_shift)
 306{
 307        int ret;
 308        bool visited_all_subs;
 309        kpte_t *kpte_s, *kpte_e, *kpte_i;
 310        uintptr_t kva, pgsize = 1UL << pml_shift;
 311
 312        if (!len)
 313                return 0;
 314        kpte_s = &pml[PMLx(start, pml_shift)];
 315        /* Later, we'll loop up to and including kpte_e.  Since start + len
 316         * might not be page aligned, we'll need to include the final kpte.  If
  317         * it is aligned, we don't want to visit it, so we subtract one so that the
 318         * aligned case maps to the index below its normal kpte. */
 319        kpte_e = &pml[PMLx(start + len - 1, pml_shift)];
 320        /* tracks the virt addr kpte_i works on, rounded for this PML */
 321        kva = ROUNDDOWN(start, pgsize);
 322        printd("start %p PMLx(S) %d, end-inc %p PMLx(E) %d shift %d, kva %p\n",
 323               start, PMLx(start, pml_shift), start + len - 1,
 324               PMLx(start + len - 1, pml_shift), pml_shift, kva);
 325        for (kpte_i = kpte_s; kpte_i <= kpte_e; kpte_i++, kva += pgsize) {
 326                visited_all_subs = FALSE;
 327                /* Complete only on the last level (PML1_SHIFT) or on a jumbo */
 328                if (kpte_is_present(kpte_i) &&
 329                    (!walk_is_complete(kpte_i, pml_shift, PML1_SHIFT))) {
 330                        /* only pass truncated end points (e.g. start may not be
 331                         * page aligned) when we're on the first (or last) item.
 332                         * For the middle entries, we want the subpmls to
 333                         * process the full range they are responsible for:
 334                         * [kva, kva + pgsize). */
 335                        uintptr_t sub_start = MAX(kva, start);
 336                        size_t sub_len = MIN(start + len - sub_start,
 337                                             kva + pgsize - sub_start);
 338
 339                        ret = __pml_for_each(kpte2pml(*kpte_i), sub_start,
 340                                             sub_len, callback, arg,
 341                                             pml_shift - BITS_PER_PML);
 342                        if (ret)
 343                                return ret;
  344                        /* based on sub_{start,len}, we can tell if our sub
  345                         * visited all of its PTEs. */
 346                        if ((sub_start == kva) && (sub_len == pgsize))
 347                                visited_all_subs = TRUE;
 348                }
 349                if ((ret = callback(kpte_i, kva, pml_shift, visited_all_subs,
 350                                    arg)))
 351                        return ret;
 352        }
 353        return 0;
 354}
 355
 356int pml_for_each(kpte_t *pml, uintptr_t start, size_t len, kpte_cb_t callback,
 357                 void *arg)
 358{
 359        return __pml_for_each(pml, start, len, callback, arg, PML4_SHIFT);
 360}
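
/* Editor's note: an illustrative pml_for_each() client, not in the original
 * file.  The callback runs on every entry, including intermediates, so it
 * filters down to present leaf mappings (4 KB PTEs and jumbos). */
static int example_count_cb(kpte_t *kpte, uintptr_t kva, int shift,
                            bool visited_subs, void *arg)
{
        size_t *count = arg;

        if (kpte_is_present(kpte) && pte_is_final(kpte, shift))
                (*count)++;
        return 0;
}

static size_t example_count_mappings(pgdir_t pgdir, uintptr_t start, size_t len)
{
        size_t count = 0;

        pml_for_each(pgdir_get_kpt(pgdir), start, len, example_count_cb,
                     &count);
        return count;
}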
 361
 362/* Unmaps [va, va + size) from pgdir, freeing any intermediate page tables for
 363 * non-kernel mappings.  This does not free the actual memory pointed to by the
 364 * page tables, nor does it flush the TLB. */
 365int unmap_segment(pgdir_t pgdir, uintptr_t va, size_t size)
 366{
 367        int pt_free_cb(kpte_t *kpte, uintptr_t kva, int shift,
 368                       bool visited_subs, void *data)
 369        {
 370                if (!kpte_is_present(kpte))
 371                        return 0;
 372                if (pte_is_final(kpte, shift)) {
 373                        pte_clear(kpte);
 374                        return 0;
 375                }
 376                /* Never remove intermediate pages for any kernel mappings.
 377                 * This is also important for x86 so that we don't accidentally
 378                 * free any of the boot PMLs, which aren't two-page alloc'd from
 379                 * kpages_arena. */
 380                if (kva >= ULIM)
 381                        return 0;
 382                /* If we haven't visited all of our subs, we might still have
 383                 * some mappings hanging off this page table. */
 384                if (!visited_subs) {
 385                        kpte_t *kpte_i = kpte2pml(*kpte);/* first kpte == pml */
 386                        /* make sure we have no PTEs in use */
 387                        for (int i = 0; i < NPTENTRIES; i++, kpte_i++) {
 388                                if (*kpte_i)
 389                                        return 0;
 390                        }
 391                }
 392                kpages_free(KADDR(PTE_ADDR(*kpte)), 2 * PGSIZE);
 393                pte_clear(kpte);
 394                return 0;
 395        }
 396        return pml_for_each(pgdir_get_kpt(pgdir), va, size, pt_free_cb, 0);
 397}
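
/* Editor's note: a sketch of the caller's side of the contract above, not part
 * of the original file.  unmap_segment() leaves the TLB alone; callers such as
 * env_pagetable_free() below pair the unmap with a flush.  Whether a simple
 * tlbflush() or a cross-core shootdown is needed depends on the caller. */
static void example_unmap_and_flush(pgdir_t pgdir, uintptr_t va, size_t size)
{
        unmap_segment(pgdir, va, size);
        tlbflush();
}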
 398
 399/* Older interface for page table walks - will return the PTE corresponding to
 400 * VA.  If create is 1, it'll create intermediate tables.  This can return jumbo
 401 * PTEs, but only if they already exist.  Otherwise, (with create), it'll walk
 402 * to the lowest PML.  If the walk fails due to a lack of intermediate tables or
 403 * memory, this returns 0 (subject to change based on pte_t). */
 404pte_t pgdir_walk(pgdir_t pgdir, const void *va, int create)
  405{
  406        int flags = PML1_SHIFT;
  407
  408        if (create == 1)
  409                flags |= PG_WALK_CREATE;
  410        return pml_walk(pgdir_get_kpt(pgdir), (uintptr_t)va, flags);
 411}
 412
 413static int pml_perm_walk(kpte_t *pml, const void *va, int pml_shift)
 414{
 415        kpte_t *kpte;
 416        int perms_here;
 417
 418        kpte = &pml[PMLx(va, pml_shift)];
 419        if (!kpte_is_present(kpte))
 420                return 0;
 421        perms_here = *kpte & PTE_PERM;
 422        if (walk_is_complete(kpte, pml_shift, PML1_SHIFT))
 423                return perms_here;
 424        return pml_perm_walk(kpte2pml(*kpte), va, pml_shift - BITS_PER_PML) &
 425               perms_here;
 426}
 427
 428/* Returns the effective permissions for PTE_U, PTE_W, and PTE_P on a given
 429 * virtual address.  Note we need to consider the composition of every PTE in
 430 * the page table walk (we bit-and all of them together) */
 431int get_va_perms(pgdir_t pgdir, const void *va)
 432{
 433        return pml_perm_walk(pgdir_get_kpt(pgdir), va, PML4_SHIFT);
 434}
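
/* Editor's note: a small sketch of the composed-permission check described
 * above, not in the original file.  A va is only user-writable if PTE_P, PTE_U
 * and PTE_W all survive the bit-and across every level of the walk. */
static bool example_user_can_write(pgdir_t pgdir, const void *va)
{
        int perms = get_va_perms(pgdir, va);

        return (perms & (PTE_P | PTE_U | PTE_W)) == (PTE_P | PTE_U | PTE_W);
}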
 435
 436#define check_sym_va(sym, addr)                                                \
 437({                                                                             \
 438        if ((sym) != (addr))                                                   \
 439                panic("Error: " #sym " is %p, should be " #addr, sym);         \
 440})
 441
 442static void check_syms_va(void)
 443{
 444        /* Make sure our symbols are up to date (see arch/ros/mmu64.h) */
 445        check_sym_va(KERN_LOAD_ADDR, 0xffffffffc0000000);
 446        check_sym_va(IOAPIC_BASE,    0xffffffffbff00000);
 447        check_sym_va(VPT_TOP,        0xffffff0000000000);
 448        check_sym_va(VPT,            0xfffffe8000000000);
 449        check_sym_va(KERN_VMAP_TOP,  0xfffffe8000000000);
 450        check_sym_va(KERNBASE,       0xffff800000000000);
 451        check_sym_va(ULIM,           0x0000800000000000);
 452        check_sym_va(UVPT,           0x00007f8000000000);
 453        check_sym_va(UGINFO,         0x00007f7fffe00000);
 454        check_sym_va(UINFO,          0x00007f7fffc00000);
 455        check_sym_va(UWLIM,          0x00007f7fffc00000);
 456        check_sym_va(UDATA,          0x00007f7fffa00000);
 457        check_sym_va(UGDATA,         0x00007f7fff9ff000);
 458        check_sym_va(UMAPTOP,        0x00007f7fff9ff000);
 459        check_sym_va(USTACKTOP,      0x00007f7fff9ff000);
 460        check_sym_va(BRK_END,        0x0000300000000000);
 461}
 462
 463/* Initializes anything related to virtual memory.  Paging is already on, but we
 464 * have a slimmed down page table. */
 465void vm_init(void)
 466{
 467        int max_jumbo_shift;
 468        kpte_t *boot_kpt = KADDR(get_boot_pml4());
 469
 470        boot_cr3 = get_boot_pml4();
 471        boot_pgdir.kpte = boot_kpt;
 472        boot_pgdir.eptp = 0;
 473        gdt = KADDR(get_gdt64());
 474
 475        /* We need to limit our mappings on machines that don't support 1GB
 476         * pages */
 477        max_jumbo_shift = arch_max_jumbo_page_shift();
 478        check_syms_va();
 479        /* KERNBASE mapping: we already have 512 GB complete (one full
 480         * PML3_REACH).  It's okay if we have extra, just need to make sure we
 481         * reach max_paddr. */
 482        if (KERNBASE + PML3_REACH < (uintptr_t)KADDR(max_paddr)) {
 483                map_segment(boot_pgdir, KERNBASE + PML3_REACH,
 484                            max_paddr - PML3_REACH, 0x0 + PML3_REACH,
 485                            PTE_KERN_RW | PTE_G, max_jumbo_shift);
 486        }
 487        /* For the LAPIC and IOAPIC, we use PAT (but not *the* PAT flag) to make
 488         * these type UC */
 489        map_segment(boot_pgdir, IOAPIC_BASE, APIC_SIZE, IOAPIC_PBASE,
 490                    PTE_NOCACHE | PTE_KERN_RW | PTE_G, max_jumbo_shift);
 491        /* VPT mapping: recursive PTE inserted at the VPT spot */
 492        boot_kpt[PML4(VPT)] = PADDR(boot_kpt) | PTE_KERN_RW;
 493        /* same for UVPT, accessible by userspace (RO). */
 494        boot_kpt[PML4(UVPT)] = PADDR(boot_kpt) | PTE_USER_RO;
  495        /* set up core 0's MTRRs now (mostly for debugging) */
 496        setup_default_mtrrs(0);
 497        /* Our current gdt_pd (gdt64desc) is pointing to a physical address for
 498         * the GDT.  We need to switch over to pointing to one with a virtual
 499         * address, so we can later unmap the low memory */
 500        gdt_pd = (pseudodesc_t) {sizeof(segdesc_t) * SEG_COUNT - 1,
 501                                 (uintptr_t)gdt};
 502        asm volatile("lgdt %0" : : "m"(gdt_pd));
 503}
 504
 505void x86_cleanup_bootmem(void)
 506{
 507        /* the boot page tables weren't alloc'd the same as other pages, so
 508         * we'll need to do some hackery to 'free' them.  This doesn't actually
  509         * free anything - it just unmaps but leaves 2 KPTs (4 pages) sitting
 510         * around. */
 511        //unmap_segment(boot_pgdir, 0, PML3_PTE_REACH); // want to do this
 512        boot_pgdir.kpte[0] = 0;
 513        tlb_flush_global();
 514}
 515
 516/* Walks len bytes from start, executing 'callback' on every PTE, passing it a
 517 * specific VA and whatever arg is passed in.  Note, this cannot handle jumbo
 518 * pages.
 519 *
 520 * This is just a clumsy wrapper around the more powerful pml_for_each, which
 521 * can handle jumbo and intermediate pages. */
 522int env_user_mem_walk(struct proc *p, void *start, size_t len,
 523                      mem_walk_callback_t callback, void *arg)
 524{
 525        struct tramp_package {
 526                struct proc *p;
 527                mem_walk_callback_t cb;
 528                void *cb_arg;
 529        };
 530        int trampoline_cb(kpte_t *kpte, uintptr_t kva, int shift,
 531                          bool visited_subs, void *data)
 532        {
 533                struct tramp_package *tp = (struct tramp_package*)data;
 534                assert(tp->cb);
 535                /* memwalk CBs don't know how to handle intermediates or jumbos
 536                 */
 537                if (shift != PML1_SHIFT)
 538                        return 0;
 539                return tp->cb(tp->p, kpte, (void*)kva, tp->cb_arg);
 540        }
 541
 542        struct tramp_package local_tp;
 543        local_tp.p = p;
 544        local_tp.cb = callback;
 545        local_tp.cb_arg = arg;
 546        return pml_for_each(pgdir_get_kpt(p->env_pgdir), (uintptr_t)start, len,
 547                           trampoline_cb, &local_tp);
 548}
 549
 550/* Frees (decrefs) all pages of the process's page table, including the page
 551 * directory.  Does not free the memory that is actually mapped. */
 552void env_pagetable_free(struct proc *p)
 553{
 554        unmap_segment(p->env_pgdir, 0, UVPT - 0);
 555        /* the page directory is not a PTE, so it never was freed */
 556        kpages_free(pgdir_get_kpt(p->env_pgdir), 2 * PGSIZE);
 557        tlbflush();
 558}
 559
 560/* Remove the inner page tables along va's walk.  The internals are more
 561 * powerful.  We'll eventually want better arch-indep VM functions. */
 562error_t pagetable_remove(pgdir_t pgdir, void *va)
 563{
 564        return unmap_segment(pgdir, (uintptr_t)va, PGSIZE);
 565}
 566
 567void page_check(void)
 568{
 569}
 570
  571/* Similar to the kernel's page table walk, but walks the guest page tables for a
 572 * guest_va.  Takes a proc and user virtual (guest physical) address for the
 573 * PML, returning the actual PTE (copied out of userspace). */
 574static kpte_t __guest_pml_walk(struct proc *p, kpte_t *u_pml, uintptr_t gva,
 575                               int flags, int pml_shift)
 576{
 577        kpte_t pte;
 578
 579        if (memcpy_from_user(p, &pte, &u_pml[PMLx(gva, pml_shift)],
 580                             sizeof(kpte_t))) {
 581                warn("Buggy pml %p, tried %p\n", u_pml,
 582                     &u_pml[PMLx(gva, pml_shift)]);
 583                return 0;
 584        }
 585        if (walk_is_complete(&pte, pml_shift, flags))
 586                return pte;
 587        if (!kpte_is_present(&pte))
 588                return 0;
 589        return __guest_pml_walk(p, (kpte_t*)PTE_ADDR(pte), gva, flags,
 590                                pml_shift - BITS_PER_PML);
 591}
 592
 593uintptr_t gva2gpa(struct proc *p, uintptr_t cr3, uintptr_t gva)
 594{
 595        kpte_t pte;
 596        int shift = PML1_SHIFT;
 597
 598        pte = __guest_pml_walk(p, (kpte_t*)cr3, gva, shift, PML4_SHIFT);
 599        if (!pte)
 600                return 0;
 601        /* TODO: Jumbos mess with us.  We need to know the shift the walk did.
 602         * This is a little nasty, but will work til we make Akaros more
 603         * jumbo-aware. */
 604        while (pte & PTE_PS) {
 605                shift += BITS_PER_PML;
 606                pte = __guest_pml_walk(p, (kpte_t*)cr3, gva, shift, PML4_SHIFT);
 607                if (!pte)
 608                        return 0;
 609        }
  610        return (pte & ~((1UL << shift) - 1)) | (gva & ((1UL << shift) - 1));
 611}
 612
 613/* Sets up the page directory, based on boot_copy.
 614 *
 615 * For x86, to support VMs, all processes will have an EPT and a KPT.  Ideally,
 616 * we'd use the same actual PT for both, but we can't thanks to the EPT design.
 617 * Although they are not the same actual PT, they have the same contents.
 618 *
 619 * The KPT-EPT invariant is that the KPT and EPT hold the same mappings from
 620 * [0,UVPT), so long as some lock is held.  Right now, the lock is the pte_lock,
 621 * but it could be a finer-grained lock (e.g. on lower level PTs) in the future.
 622 *
 623 * Part of the reason for the invariant is so that a pgdir walk on the process's
 624 * address space will get the 'same' PTE for both the KPT and the EPT.  For
 625 * instance, if a page is present in the KPT, a pte is present and points to the
 626 * same physical page in the EPT.  Likewise, both the KPT and EPT agree on jumbo
 627 * mappings.
 628 *
 629 * I went with UVPT for the upper limit of equality btw the KPT and EPT for a
 630 * couple reasons: I wanted something static (technically the physaddr width is
 631 * runtime dependent), and we'll never actually PF high enough for it to make a
 632 * difference.  Plus, the UVPT is something that would need to be changed for
 633 * the EPT too, if we supported it at all.
 634 *
 635 * Each page table page is actually two contiguous pages.  The lower is the KPT.
 636 * The upper is the EPT.  Order-1 page allocs are a little harder, but the
 637 * tradeoff is simplicity in all of the pm code.  Given a KPTE, we can find an
 638 * EPTE with no hassle.  Note that this two-page business is a tax on *all*
 639 * processes, which is less than awesome.
 640 *
 641 * Another note is that the boot page tables are *not* double-pages.  The EPT
 642 * won't cover those spaces (e.g. kernbase mapping), so it's not necessary, and
 643 * it's a pain in the ass to get it to work (can't align to 2*PGSIZE without
 644 * grub complaining, and we might run into issues with freeing memory in the
 645 * data segment). */
 646int arch_pgdir_setup(pgdir_t boot_copy, pgdir_t *new_pd)
 647{
 648        kpte_t *kpt;
 649        epte_t *ept;
 650
 651        kpt = kpages_alloc(2 * PGSIZE, MEM_WAIT);
 652        memcpy(kpt, boot_copy.kpte, PGSIZE);
 653        ept = kpte_to_epte(kpt);
 654        memset(ept, 0, PGSIZE);
 655
 656        /* This bit of paranoia slows process creation a little, but makes sure
 657         * that there is nothing below ULIM in boot_pgdir.  Any PML4 entries
 658         * copied from boot_pgdir (e.g. the kernel's memory) will be *shared*
 659         * among all processes, including *everything* under the PML4 entries
 660         * reach (e.g.  PML4_PTE_REACH = 512 GB) and any activity would need to
 661         * be synchronized.
 662         *
 663         * We could do this once at boot time, but that would miss out on
 664         * potential changes to the boot_pgdir at runtime.
 665         *
 666         * We could also just memset that region to 0.  For now, I want to catch
 667         * whatever mappings exist, since they are probably bugs. */
 668        for (int i = 0; i < PML4(ULIM - 1); i++)
 669                assert(kpt[i] == 0);
 670
 671        /* VPT and UVPT map the proc's page table, with different permissions.*/
 672        kpt[PML4(VPT)]  = build_kpte(PADDR(kpt), PTE_KERN_RW);
 673        kpt[PML4(UVPT)] = build_kpte(PADDR(kpt), PTE_USER_RO);
 674
 675        new_pd->kpte = kpt;
 676        new_pd->eptp = construct_eptp(PADDR(ept));
 677        return 0;
 678}
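
/* Editor's note: an illustrative sketch of the two-page layout described in
 * the comment above, not the real helper (the code uses kpte_to_epte()).
 * Because each page table page is allocated as two contiguous pages, KPT low
 * and EPT high, a KPTE's matching EPTE sits exactly one page above it. */
static epte_t *example_epte_for(kpte_t *kpte)
{
        return (epte_t*)((uintptr_t)kpte + PGSIZE);
}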
 679
 680physaddr_t arch_pgdir_get_cr3(pgdir_t pd)
 681{
 682        return PADDR(pd.kpte);
 683}
 684
 685void arch_pgdir_clear(pgdir_t *pd)
 686{
 687        pd->kpte = 0;
 688        pd->eptp = 0;
 689}
 690
 691/* Returns the page shift of the largest jumbo supported */
 692int arch_max_jumbo_page_shift(void)
 693{
 694        uint32_t edx;
 695        cpuid(0x80000001, 0x0, 0, 0, 0, &edx);
 696        return edx & (1 << 26) ? PML3_SHIFT : PML2_SHIFT;
 697}
 698
 699/* Adds empty intermediate PTs to the top-most PML in pgdir for the given range.
 700 * On a 4-PML system, this will add entries to PML4, consisting of a bunch of
 701 * empty PML3s, such that [va, va+len) has intermediate tables in pgdir.
 702 *
 703 * A few related notes:
 704 *
 705 * The boot_pgdir is where we do the original kernel mappings.  All of the PML4
 706 * entries are filled in, pointing to intermediate PML3s.  All other pgdirs copy
 707 * the kernel mapping, which means they have the same content.  That content
 708 * never changes at runtime.  What changes is the contents of the PML3s and
 709 * below, which are pointed to by all pgdirs.
 710 *
 711 * The proc pgdirs do not have KPT or EPT mappings above ULIM, so if the
 712 * intermediate PTs have EPT entries, it's just a waste of memory, but not a
 713 * mapping the user could exploit.
 714 *
 715 * On occasion, there might be code that maps things into boot_pgdir below ULIM,
 716 * though right now this is just an out-of-branch "mmap a page at 0" debugging
 717 * hack. */
 718void arch_add_intermediate_pts(pgdir_t pgdir, uintptr_t va, size_t len)
 719{
 720        kpte_t *pml4 = pgdir_get_kpt(pgdir);
 721        kpte_t *kpte;
 722        epte_t *epte;
 723        void *new_pml_kva;
 724
 725        for (size_t i = 0; i < len; i += PML4_PTE_REACH, va += PML4_PTE_REACH) {
 726                kpte = &pml4[PML4(va)];
 727                epte = kpte_to_epte(kpte);
 728                if (kpte_is_present(kpte))
 729                        continue;
 730                new_pml_kva = kpages_zalloc(2 * PGSIZE, MEM_WAIT);
 731                /* We insert the same as for __pml_walk. */
 732                *kpte = PADDR(new_pml_kva) | PTE_P | PTE_U | PTE_W;
 733                if (va < ULIM)
 734                        *epte = (PADDR(new_pml_kva) + PGSIZE) | EPTE_R | EPTE_X
 735                                | EPTE_W;
 736        }
 737}
 738
 739/* Debugging */
 740static int print_pte(kpte_t *kpte, uintptr_t kva, int shift, bool visited_subs,
 741                     void *data)
 742{
 743        if (kpte_is_unmapped(kpte))
 744                return 0;
 745        print_lock();
 746        switch (shift) {
 747                case (PML1_SHIFT):
 748                        printk("\t");
 749                        /* fall-through */
 750                case (PML2_SHIFT):
 751                        printk("\t");
 752                        /* fall-through */
 753                case (PML3_SHIFT):
 754                        printk("\t");
 755        }
 756        printk("KVA: %p, PTE val %p, shift %d, visit %d%s\n", kva, *kpte, shift,
 757               visited_subs, (*kpte & PTE_PS ? " (jumbo)" : ""));
 758        print_unlock();
 759        return 0;
 760}
 761
 762void debug_print_pgdir(kpte_t *pgdir)
 763{
  764        if (!pgdir)
 765                pgdir = KADDR(rcr3());
 766        printk("Printing the entire page table set for %p, DFS\n", pgdir);
 767        /* Need to be careful we avoid VPT/UVPT, o/w we'll recurse */
 768        pml_for_each(pgdir, 0, UVPT, print_pte, 0);
 769        if (arch_max_jumbo_page_shift() < PML3_SHIFT)
 770                printk("(skipping kernbase mapping - too many entries)\n");
 771        else
 772                pml_for_each(pgdir, KERNBASE, VPT - KERNBASE, print_pte, 0);
 773        pml_for_each(pgdir, VPT_TOP, MAX_VADDR - VPT_TOP, print_pte, 0);
 774}
 775
 776/* Debug helper - makes sure the KPT == EPT for [0, UVPT) */
 777int debug_check_kpt_ept(void)
 778{
 779        int db_cb(kpte_t *kpte, uintptr_t kva, int shift, bool visited_subs,
 780                  void *data)
 781        {
 782                epte_t *epte = kpte_to_epte(kpte);
 783                char *reason;
 784                int pa_offset = 0;
 785
 786                if (kpte_is_present(kpte) != epte_is_present(epte)) {
 787                        reason = "present bit";
 788                        goto fail;
 789                }
 790                if (kpte_is_mapped(kpte) != epte_is_mapped(epte)) {
 791                        reason = "mapped or not";
 792                        goto fail;
 793                }
 794                if (kpte_is_jumbo(kpte) != epte_is_jumbo(epte)) {
 795                        reason = "jumbo";
 796                        goto fail;
 797                }
 798                /* Intermediate PTEs have the EPTE pointing to PADDR + PGSIZE */
 799                if (pte_is_present(kpte) && pte_is_intermediate(kpte, shift))
 800                        pa_offset = PGSIZE;
 801                if (kpte_get_paddr(kpte) + pa_offset != epte_get_paddr(epte)) {
 802                        reason = "paddr";
 803                        goto fail;
 804                }
 805                if ((kpte_get_settings(kpte) & PTE_PERM) !=
 806                    (epte_get_settings(epte) & PTE_PERM)) {
 807                        reason = "permissions";
 808                        goto fail;
 809                }
 810                return 0;
 811
 812fail:
 813                panic("kpte %p (%p) epte %p (%p) kva %p shift %d: %s",
 814                       kpte, *kpte, epte, *epte, kva, shift, reason);
 815                return -1;
 816        }
 817        return pml_for_each(current->env_pgdir.kpte, 0, UVPT - 0, db_cb, 0);
 818}
 819