Added demand paging support
author    Andrew Waterman <waterman@ros-dev.(none)>
          Sat, 27 Mar 2010 04:06:31 +0000 (21:06 -0700)
committer Kevin Klues <klueska@cs.berkeley.edu>
          Thu, 3 Nov 2011 00:35:38 +0000 (17:35 -0700)
mmap'd pages are now faulted in on demand.  Metadata about paged-out
pages is stored in the unused bits of the PTE while the PTE is invalid,
in the form of a pointer to a pfault_info_t.
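
For reference, here is a minimal sketch of one plausible encoding.  The
real definitions are arch-specific and not part of this patch; only the
macro names below appear in the diff, and the bit layout shown is an
assumption (it relies on pfault_info_t pointers being nonzero and
word-aligned, so PTE_P, bit 0, is clear in a paged-out PTE):

    /* Hypothetical encoding, not the actual arch headers.  A PTE is
     * "paged out" when it is nonzero but not present; the invalid PTE
     * then holds a pfault_info_t* verbatim. */
    #define PAGE_PRESENT(pte)     ((pte) & PTE_P)
    #define PAGE_UNMAPPED(pte)    ((pte) == 0)
    #define PAGE_PAGED_OUT(pte)   (!PAGE_PRESENT(pte) && !PAGE_UNMAPPED(pte))
    #define PTE2PFAULT_INFO(pte)  ((pfault_info_t*)(pte))
    #define PFAULT_INFO2PTE(pfi)  ((pte_t)(pfi))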

The main thing to keep in mind is that it's often no longer sufficient
to check user PTEs against PTE_P to determine their state.  Use the
macros PAGE_PRESENT, PAGE_PAGED_OUT, and PAGE_UNMAPPED instead;
exactly one of the three is true of any given PTE at any time.
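
For example, a sketch mirroring the user_page_free() callback in
kern/src/env.c below (release_pte is an illustrative name;
page_decref(), ppn2page(), and PTE2PPN() are existing kernel helpers):

    /* Tear down one user PTE, handling all three mutually exclusive
     * states.  Illustrative only. */
    static void release_pte(pte_t* pte)
    {
            if (PAGE_PRESENT(*pte)) {
                    /* a physical page is mapped; drop the ref on it */
                    page_decref(ppn2page(PTE2PPN(*pte)));
                    *pte = 0;
            }
            else if (PAGE_PAGED_OUT(*pte)) {
                    /* no physical page yet; free the fault metadata
                     * (this also decrefs any backing file) */
                    pfault_info_free(PTE2PFAULT_INFO(*pte));
                    *pte = 0;
            }
            /* else PAGE_UNMAPPED(*pte): nothing to do */
    }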

On any call to mmap with fd != -1, a new struct file is instantiated.
It is reference-counted, with one reference held per mmap'd page.  When
a page is faulted in, or when a still-paged-out page is freed, the file
is decref'd.  Allocating and freeing a pfault_info_t automatically
increfs/decrefs the associated file, if any.
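
A sketch of the resulting refcount protocol around one mmap() call,
using only functions that appear in this patch (error handling omitted;
mmap_refcount_sketch is an illustrative name, not kernel code):

    static void* mmap_refcount_sketch(struct proc* p, uintptr_t addr,
                                      size_t len, int prot, int flags,
                                      int fd, size_t offset)
    {
            struct file* f = file_open_from_fd(p, fd); /* refcnt = 1 */
            /* do_mmap() calls pfault_info_alloc(f) once per page, so
             * every page's pfault_info_t holds its own reference... */
            void* ret = do_mmap(p, addr, len, prot, flags, f, offset);
            /* ...and the transient reference is dropped afterwards.
             * Later, fault-in or unmap calls pfault_info_free(), which
             * decrefs; the file is closed when the count hits zero. */
            file_decref(f);
            return ret;
    }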

14 files changed:
kern/arch/i686/env.c
kern/arch/i686/pmap.c
kern/arch/sparc/env.c
kern/arch/sparc/pmap.c
kern/include/frontend.h
kern/include/mm.h
kern/src/env.c
kern/src/frontend.c
kern/src/init.c
kern/src/manager.c
kern/src/mm.c
kern/src/pmap.c
kern/src/process.c
kern/src/syscall.c

diff --git a/kern/arch/i686/env.c b/kern/arch/i686/env.c
index 864b2da..4a858a2 100644
@@ -112,7 +112,7 @@ int env_user_mem_walk(env_t* e, void* start, size_t len,
                                      PTX(end) : NPTENTRIES );
                int ret;
                for (pteno = pteno_start; pteno < pteno_end; pteno++) {
-                       if (pt[pteno] & PTE_P)
+                       if (!PAGE_UNMAPPED(pt[pteno]))
                                if((ret = callback(e, &pt[pteno], PGADDR(pdeno, pteno, 0), arg)))
                                        return ret;
                }
diff --git a/kern/arch/i686/pmap.c b/kern/arch/i686/pmap.c
index dd7d7cf..fd75b91 100644
@@ -640,7 +640,7 @@ void *get_free_va_range(pde_t *pgdir, uintptr_t addr, size_t len)
                for(char* b = a; b < a+len; b += PGSIZE)
                {
                        pte_t* pte = pgdir_walk(pgdir,b,0);
-                       if(pte && (*pte & PTE_P))
+                       if(pte && !PAGE_UNMAPPED(*pte))
                        {
                                a = b;
                                break;
diff --git a/kern/arch/sparc/env.c b/kern/arch/sparc/env.c
index 1eed111..6e199d2 100644
@@ -126,7 +126,7 @@ env_user_mem_walk(env_t* e, void* start, size_t len,
                        int l3x_end = l1x == l1x_end-1 && l2x == l2x_end-1 && L3X(end) ?
                                      L3X(end) : NL3ENTRIES;
                        for(int l3x = l3x_start, ret; l3x < l3x_end; l3x++)
-                               if(l3pt[l3x] & PTE_PTE)
+                               if(!PAGE_UNMAPPED(l3pt[l3x]))
                                        if((ret = callback(e,&l3pt[l3x],PGADDR(l1x,l2x,l3x,0),arg)))
                                                return ret;
                }
diff --git a/kern/arch/sparc/pmap.c b/kern/arch/sparc/pmap.c
index e175e9d..4121a8d 100644
@@ -131,7 +131,7 @@ void *get_free_va_range(pde_t *pgdir, uintptr_t addr, size_t len)
                for(char* b = a; b < a+len; b += PGSIZE)
                {
                        pte_t* pte = pgdir_walk(pgdir,b,0);
-                       if(pte && (*pte & PTE_P))
+                       if(pte && !PAGE_UNMAPPED(*pte))
                        {
                                a = b;
                                break;
diff --git a/kern/include/frontend.h b/kern/include/frontend.h
index 9e5ed4d..2cec60b 100644
@@ -38,6 +38,7 @@ char* user_strdup_errno(struct proc* p, const char* va, int max);
 int memcpy_to_user_errno(struct proc* p, void* dst, const void* src, int len);
 void* kmalloc_errno(int len);
 
+void file_init(void);
 error_t file_read_page(struct file* f, physaddr_t pa, size_t pgoff);
 struct file* file_open(const char* path, int oflag, int mode);
 struct file* file_open_from_fd(struct proc* p, int fd);
diff --git a/kern/include/mm.h b/kern/include/mm.h
index 7b557f9..8ecf159 100644
@@ -14,6 +14,7 @@
 #include <process.h>
 #include <atomic.h>
 #include <sys/queue.h>
+#include <slab.h>
 
 /* Memory region for a process, consisting of linear(virtual) addresses.  This
  * is what the kernel allocates a process, and the physical mapping can be done
@@ -60,6 +61,21 @@ struct vm_region {
 };
 TAILQ_HEAD(vm_region_list, vm_region); // Declares 'struct memregion_list'
 
+// at least for now, we aren't using vm regions. we're storing pointers
+// to pfault_info_t inside the PTEs in an arch-specific way.
+struct file;
+typedef struct pfault_info {
+       struct file* file; // or NULL for zero-fill
+       size_t pgoff; // offset into file
+       size_t read_len; // amount of file to read into this page (zero-fill rest)
+       int perm;
+} pfault_info_t;
+
+void mmap_init(void);
+
+pfault_info_t* pfault_info_alloc(struct file* file);
+void pfault_info_free(pfault_info_t* pfi);
+
 struct mm {
        spinlock_t mm_lock;
        // per-process memory management stuff
diff --git a/kern/src/env.c b/kern/src/env.c
index 8dd177d..60180eb 100644
@@ -21,6 +21,7 @@
 #include <stdio.h>
 #include <schedule.h>
 #include <kmalloc.h>
+#include <mm.h>
 
 #include <ros/syscall.h>
 #include <ros/error.h>
@@ -348,9 +349,17 @@ void env_user_mem_free(env_t* e, void* start, size_t len)
        assert((uintptr_t)start + len <= UVPT); //since this keeps fucking happening
        int user_page_free(env_t* e, pte_t* pte, void* va, void* arg)
        {
-               page_t* page = ppn2page(PTE2PPN(*pte));
-               *pte = 0;
-               page_decref(page);
+               if(PAGE_PRESENT(*pte))
+               {
+                       page_t* page = ppn2page(PTE2PPN(*pte));
+                       *pte = 0;
+                       page_decref(page);
+               }
+               else // PAGE_PAGED_OUT(*pte)
+               {
+                       pfault_info_free(PTE2PFAULT_INFO(*pte));
+                       *pte = 0;
+               }
                return 0;
        }
 
diff --git a/kern/src/frontend.c b/kern/src/frontend.c
index d7bc5d9..e3fccb2 100644
@@ -13,6 +13,7 @@
 #include <frontend.h>
 #include <syscall.h>
 #include <smp.h>
+#include <slab.h>
 
 volatile int magic_mem[10];
 
@@ -124,6 +125,13 @@ void* kmalloc_errno(int len)
        return kva;
 }
 
+struct kmem_cache* struct_file_cache;
+void file_init()
+{
+       struct_file_cache = kmem_cache_create("struct_file",
+                                             sizeof(struct file), 8, 0, 0, 0);
+}
+
 error_t file_read_page(struct file* f, physaddr_t pa, size_t pgoff)
 {
        int ret = frontend_syscall(0,APPSERVER_SYSCALL_pread,f->fd,pa,PGSIZE,
@@ -136,10 +144,16 @@ error_t file_read_page(struct file* f, physaddr_t pa, size_t pgoff)
 struct file* file_open_from_fd(struct proc* p, int fd)
 {
        struct file* f = NULL;
-       if(!(f = kmalloc(sizeof(struct file),0)))
+       if(!(f = kmem_cache_alloc(struct_file_cache,0)))
                goto out;
 
        f->fd = frontend_syscall(p->pid,APPSERVER_SYSCALL_kdup,fd,0,0,0,NULL);
+       if(f->fd == -1)
+       {
+               kmem_cache_free(struct_file_cache,f);
+               f = NULL;
+               goto out;
+       }
        spinlock_init(&f->lock);
        f->refcnt = 1;
 
@@ -162,11 +176,17 @@ struct file* file_open(const char* path, int oflag, int mode)
                path = memcpy(malloced,path,len);
        }
 
-       if(!(f = kmalloc(sizeof(struct file),0)))
+       if(!(f = kmem_cache_alloc(struct_file_cache,0)))
                goto out;
 
        f->fd = frontend_syscall(0,APPSERVER_SYSCALL_open,PADDR(path),
                                 oflag,mode,0,NULL);
+       if(f->fd == -1)
+       {
+               kmem_cache_free(struct_file_cache,f);
+               f = NULL;
+               goto out;
+       }
        spinlock_init(&f->lock);
        f->refcnt = 1;
 
@@ -191,7 +211,7 @@ void file_decref(struct file* f)
        {
                int ret = frontend_syscall(0,APPSERVER_SYSCALL_close,f->fd,0,0,0,NULL);
                assert(ret == 0);
-               kfree(f);
+               kmem_cache_free(struct_file_cache,f);
        }
        else
                spin_unlock(&f->lock);
diff --git a/kern/src/init.c b/kern/src/init.c
index 0c6b8bc..b1b9d9a 100644
@@ -28,6 +28,8 @@
 #include <testing.h>
 #include <kmalloc.h>
 #include <hashtable.h>
+#include <mm.h>
+#include <frontend.h>
 
 #include <arch/init.h>
 #include <arch/bitmask.h>
@@ -71,6 +73,8 @@ void kernel_init(multiboot_info_t *mboot_info)
        hashtable_init();
        cache_color_alloc_init();       // Inits data structs
        colored_page_alloc_init();      // Allocates colors for agnostic processes
+       mmap_init();
+       file_init();
        page_check();
 
        idt_init();
diff --git a/kern/src/manager.c b/kern/src/manager.c
index 1ba1c80..b49ae0b 100644
@@ -217,12 +217,10 @@ void manager_waterman()
                char* envp[] = {"LD_LIBRARY_PATH=/lib",0};
                procinfo_pack_args(p->env_procinfo,argv,envp);
 
-               printk("loading busybox\n");
                struct file* f = file_open("/bin/busybox",0,0);
                assert(f != NULL);
                assert(load_elf(p,f) == 0);
                file_decref(f);
-               printk("loaded busybox\n");
 
                __proc_set_state(p, PROC_RUNNABLE_S);
                proc_run(p);
diff --git a/kern/src/mm.c b/kern/src/mm.c
index 346e222..cf3dc65 100644
@@ -13,6 +13,8 @@
 #include <process.h>
 #include <stdio.h>
 #include <syscall.h>
+#include <slab.h>
+#include <kmalloc.h>
 
 void *mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
            int fd, size_t offset)
@@ -28,11 +30,20 @@ void *mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
                return (void*)-1;
        }
 
-       struct file* file = file_open_from_fd(p,fd);
-       if(!file)
-               return (void*)-1;
+       struct file* file = NULL;
+       if(fd != -1)
+       {
+               file = file_open_from_fd(p,fd);
+               if(!file)
+                       return (void*)-1;
+       }
+
+       void* result = do_mmap(p,addr,len,prot,flags,file,offset);
+
+       if(file)
+               file_decref(file);
 
-       return do_mmap(p,addr,len,prot,flags,file,offset);
+       return result;
 }
 
 void *do_mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
@@ -66,32 +77,65 @@ void *do_mmap(struct proc *p, uintptr_t addr, size_t len, int prot, int flags,
                assert(addr + num_pages*PGSIZE <= USTACKBOT);
        }
 
-       page_t *a_page;
-       for (int i = 0; i < num_pages; i++) {
-               if (upage_alloc(p, &a_page, 1))
-                       goto mmap_abort;
+       // get a list of pfault_info_t's and pte's a priori,
+       // because if their allocation fails, we could end up
+       // in an inconsistent state
 
-               // This is dumb--should not read until faulted in.
-               // This is just to get it correct at first
-               if(!(flags & MAP_ANON))
-               {
-                       if(file_read_page(file,page2pa(a_page),offset+i) < 0)
-                               goto mmap_abort;
+       pfault_info_t** pfis = kmalloc(sizeof(pfault_info_t*)*num_pages,0);
+       pte_t** ptes = kmalloc(sizeof(pte_t*)*num_pages,0);
+       if(!pfis || !ptes)
+       {
+               kfree(ptes);
+               kfree(pfis);
+               goto mmap_abort;
+       }
 
-                       // zero-fill end of last page
-                       if(len % PGSIZE && i == num_pages-1)
-                               memset(page2kva(a_page)+len%PGSIZE,0,PGSIZE-len%PGSIZE);
-               }
+       for(int i = 0; i < num_pages; i++)
+       {
+               pfis[i] = pfault_info_alloc(file);
+               ptes[i] = pgdir_walk(p->env_pgdir,(char*)addr+i*PGSIZE,1);
 
-               // TODO: TLB shootdown if replacing an old mapping
-               // TODO: handle all PROT flags
-               if (page_insert(p->env_pgdir, a_page, (void*SNT)(addr + i*PGSIZE),
-                               (prot & PROT_WRITE) ? PTE_USER_RW : PTE_USER_RO)) {
-                       page_free(a_page);
+               // cleanup allocated pfault_info_t's on allocation failure
+               if(!pfis[i] || !ptes[i])
+               {
+                       int free_until = pfis[i] ? i+1 : i;
+                       for(int j = 0; j < free_until; j++)
+                               pfault_info_free(pfis[j]);
+
+                       kfree(ptes);
+                       kfree(pfis);
                        goto mmap_abort;
                }
        }
 
+       // make the lazy mapping finally
+       int perm = (prot & PROT_WRITE) ? PTE_USER_RW :
+                  (prot & (PROT_READ|PROT_EXEC))  ? PTE_USER_RO : 0;
+       for(int i = 0; i < num_pages; i++)
+       {
+               // free an old page that was present here
+               if(PAGE_PRESENT(*ptes[i]))
+                       page_decref(ppn2page(PTE2PPN(*ptes[i])));
+               // free the pfault_info for a page that wasn't faulted-in yet
+               else if(PAGE_PAGED_OUT(*ptes[i]))
+                       pfault_info_free(PTE2PFAULT_INFO(*ptes[i]));
+
+               pfis[i]->file = file;
+               pfis[i]->pgoff = offset+i;
+               pfis[i]->read_len = PGSIZE;
+               // zero-fill end of last page
+               if(i == num_pages-1 && len % PGSIZE)
+                       pfis[i]->read_len = len % PGSIZE;
+               pfis[i]->perm = perm;
+               *ptes[i] = PFAULT_INFO2PTE(pfis[i]);
+
+               // uncomment the line below to simulate aggressive loading
+               //assert(handle_page_fault(p,(char*)addr+i*PGSIZE,PROT_READ) == 0);
+       }
+
+       kfree(ptes);
+       kfree(pfis);
+
        // TODO: release the appropriate mm_lock
        spin_unlock_irqsave(&p->proc_lock);
        return (void*SAFE)TC(addr);
@@ -133,7 +177,15 @@ int mprotect(struct proc* p, void* addr, size_t len, int prot)
        for(char* a = (char*)addr; a < end; a += PGSIZE)
        {
                pte_t* pte = pgdir_walk(p->env_pgdir,a,0);
-               if(pte && *pte & PTE_P)
+
+               // unmapped page? error out, behavior undefined (per POSIX)
+               if(!pte || PAGE_UNMAPPED(*pte))
+               {
+                       set_errno(current_tf,ENOMEM);
+                       return -1;
+               }
+               // common case: the page is present
+               else if(PAGE_PRESENT(*pte))
                {
                        // TODO: do munmap() in munmap(), instead of mprotect()
                        if(prot & PROT_UNMAP)
@@ -143,12 +195,20 @@ int mprotect(struct proc* p, void* addr, size_t len, int prot)
                                page_decref(page);
                        }
                        else
+                       {
                                *pte = (*pte & ~PTE_PERM) | newperm;
+                       }
                }
-               else
+               // or, the page might be mapped, but not yet faulted-in
+               else // PAGE_PAGED_OUT(*pte)
                {
-                       set_errno(current_tf,ENOMEM);
-                       return -1;
+                       if(prot & PROT_UNMAP)
+                       {
+                               pfault_info_free(PTE2PFAULT_INFO(*pte));
+                               *pte = 0;
+                       }
+                       else
+                               PTE2PFAULT_INFO(*pte)->perm = newperm;
                }
        }
 
@@ -169,11 +229,102 @@ int handle_page_fault(struct proc* p, uintptr_t va, int prot)
        int ret = -1;
        va = ROUNDDOWN(va,PGSIZE);
 
-       spin_lock_irqsave(&p->proc_lock);
+       if(prot != PROT_READ && prot != PROT_WRITE && prot != PROT_EXEC)
+               panic("bad prot!");
+
+       //spin_lock_irqsave(&p->proc_lock);
+
+       /// find offending PTE
+       pte_t* ppte = pgdir_walk(p->env_pgdir,(void*)va,0);
+       // if PTE is NULL, this is a fault that should kill the process
+       if(!ppte)
+               goto out;
+
+       pte_t pte = *ppte;
+
+       // if PTE is present, why did we fault?
+       if(PAGE_PRESENT(pte))
+       {
+               // a race is possible: the page might have been faulted in by
+               // another core already, in which case we should just return.
+               // otherwise, it's a fault that should kill the user
+               switch(prot)
+               {
+                       case PROT_READ:
+                       case PROT_EXEC:
+                               if(pte == PTE_USER_RO || pte == PTE_USER_RW)
+                                       ret = 0;
+                               goto out;
+                       case PROT_WRITE:
+                               if(pte == PTE_USER_RW)
+                                       ret = 0;
+                               goto out;
+               }
+               // can't get here
+       }
 
+       // if the page isn't present, kill the user
+       if(PAGE_UNMAPPED(pte))
+               goto out;
+
+       // now, we know that PAGE_PAGED_OUT(pte) is true
+       pfault_info_t* info = PTE2PFAULT_INFO(pte);
+
+       // allocate a page; maybe zero-fill it
+       int zerofill = info->file == NULL;
+       page_t* a_page;
+       if(upage_alloc(p, &a_page, zerofill))
+               goto out;
+
+       // if this isn't a zero-filled page, read it in from file
+       if(!zerofill)
+       {
+               int read_len = file_read_page(info->file,page2pa(a_page),info->pgoff);
+               if(read_len < 0)
+               {
+                       page_free(a_page);
+                       goto out;
+               }
+
+               // if we read too much, zero that part out
+               if(info->read_len < read_len)
+                       memset(page2kva(a_page)+info->read_len,0,read_len-info->read_len);
+       }
+
+       // update the page table
+       if(page_insert(p->env_pgdir, a_page, (void*)va, info->perm))
+       {
+               page_free(a_page);
+               goto out;
+       }
+
+       pfault_info_free(info);
+       ret = 0;
 
 out:
-       spin_unlock_irqsave(&p->proc_lock);
+       //spin_unlock_irqsave(&p->proc_lock);
+       tlbflush();
        return ret;
 }
 
+struct kmem_cache* pfault_info_cache;
+void mmap_init(void)
+{
+       pfault_info_cache = kmem_cache_create("pfault_info",
+                                             sizeof(pfault_info_t), 8, 0, 0, 0);
+}
+
+pfault_info_t* pfault_info_alloc(struct file* file)
+{
+       if(file)
+               file_incref(file);
+       return kmem_cache_alloc(pfault_info_cache,0);
+}
+
+void pfault_info_free(pfault_info_t* pfi)
+{
+       if(pfi->file)
+               file_decref(pfi->file);
+       kmem_cache_free(pfault_info_cache,pfi);
+}
+
diff --git a/kern/src/pmap.c b/kern/src/pmap.c
index f9bed2c..2d3373d 100644
@@ -32,6 +32,7 @@
 #include <kclock.h>
 #include <process.h>
 #include <stdio.h>
+#include <mm.h>
 
 /**
  * @brief Global variable used to store erroneous virtual addresses as the
@@ -426,8 +427,13 @@ error_t memcpy_from_user(env_t* env, void* COUNT(len) dest,
        for(i = 0; i < num_pages; i++)
        {
                pte = pgdir_walk(env->env_pgdir, start+i*PGSIZE, 0);
-               if(!pte || (*pte & perm) != perm)
+               if(!pte)
                        return -EFAULT;
+               if((*pte & PTE_P) && (*pte & PTE_USER_RO) != PTE_USER_RO)
+                       return -EFAULT;
+               if(!(*pte & PTE_P))
+                       if(handle_page_fault(env,(uintptr_t)start+i*PGSIZE,PROT_READ))
+                               return -EFAULT;
 
                void*COUNT(PGSIZE) kpage = KADDR(PTE_ADDR(*pte));
                const void* src_start = i > 0 ? kpage : kpage+(va-start);
@@ -481,8 +487,13 @@ error_t memcpy_to_user(env_t* env, void*DANGEROUS va,
        for(i = 0; i < num_pages; i++)
        {
                pte = pgdir_walk(env->env_pgdir, start+i*PGSIZE, 0);
-               if(!pte || (*pte & perm) != perm)
+               if(!pte)
+                       return -EFAULT;
+               if((*pte & PTE_P) && (*pte & PTE_USER_RW) != PTE_USER_RW)
                        return -EFAULT;
+               if(!(*pte & PTE_P))
+                       if(handle_page_fault(env,(uintptr_t)start+i*PGSIZE,PROT_WRITE))
+                               return -EFAULT;
 
                void*COUNT(PGSIZE) kpage = KADDR(PTE_ADDR(*pte));
                void* dst_start = i > 0 ? kpage : kpage+(va-start);
diff --git a/kern/src/process.c b/kern/src/process.c
index e72296d..285788a 100644
@@ -302,7 +302,7 @@ static void __proc_free(struct proc *p)
 {
        physaddr_t pa;
 
-       printk("[PID %d] freeing proc: %d\n", current ? current->pid : 0, p->pid);
+       printd("[PID %d] freeing proc: %d\n", current ? current->pid : 0, p->pid);
        // All parts of the kernel should have decref'd before __proc_free is called
        assert(p->env_refcnt == 0);
 
diff --git a/kern/src/syscall.c b/kern/src/syscall.c
index 9c86834..28d6556 100644
@@ -325,16 +325,34 @@ static ssize_t sys_fork(env_t* e)
        {
                env_t* env = (env_t*)arg;
 
-               page_t* pp;
-               if(upage_alloc(env,&pp,0))
-                       return -1;
-               if(page_insert(env->env_pgdir,pp,va,*pte & PTE_PERM))
+               if(PAGE_PRESENT(*pte))
                {
-                       page_decref(pp);
-                       return -1;
+                       page_t* pp;
+                       if(upage_alloc(env,&pp,0))
+                               return -1;
+                       if(page_insert(env->env_pgdir,pp,va,*pte & PTE_PERM))
+                       {
+                               page_decref(pp);
+                               return -1;
+                       }
+
+                       pagecopy(page2kva(pp),ppn2kva(PTE2PPN(*pte)));
+               }
+               else // PAGE_PAGED_OUT(*pte)
+               {
+                       pte_t* newpte = pgdir_walk(env->env_pgdir,va,1);
+                       if(!newpte)
+                               return -1;
+
+                       struct file* file = PTE2PFAULT_INFO(*pte)->file;
+                       pfault_info_t* newpfi = pfault_info_alloc(file);
+                       if(!newpfi)
+                               return -1;
+
+                       *newpfi = *PTE2PFAULT_INFO(*pte);
+                       *newpte = PFAULT_INFO2PTE(newpfi);
                }
 
-               pagecopy(page2kva(pp),ppn2kva(PTE2PPN(*pte)));
                return 0;
        }