Userspace slab allocator
author     Barret Rhoden <brho@cs.berkeley.edu>
Tue, 11 Dec 2012 00:42:36 +0000 (16:42 -0800)
committer  Barret Rhoden <brho@cs.berkeley.edu>
Tue, 11 Dec 2012 00:42:36 +0000 (16:42 -0800)
Note that the cache locks are MCS-PDR locks, which for now call
malloc() when initializing.  So you can't currently use the slabs for
anything you need in order to implement malloc().
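
For reference, basic usage from a parlib program looks like the sketch
below.  It is only an illustration against the API in
user/parlib/include/slab.h: "struct conn" and "conn_cache" are made-up
names, and the first kmem_cache_create() call is what triggers the
one-time kmem_cache_init() (and the malloc()-using MCS-PDR lock init
mentioned above).

    #include <slab.h>
    #include <stdio.h>

    /* Hypothetical fixed-size object we want to cache. */
    struct conn {
            int fd;
            int state;
    };

    int main(void)
    {
            /* The first create() bootstraps the slab system. */
            struct kmem_cache *conn_cache =
                    kmem_cache_create("conn_cache", sizeof(struct conn),
                                      __alignof__(struct conn), 0, NULL, NULL);
            struct conn *c = kmem_cache_alloc(conn_cache, 0);

            c->fd = -1;
            c->state = 0;
            printf("conn object at %p\n", c);
            kmem_cache_free(conn_cache, c);
            kmem_cache_destroy(conn_cache);
            return 0;
    }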

tests/slab.c [new file with mode: 0644]
user/parlib/include/slab.h [new file with mode: 0644]
user/parlib/slab.c [new file with mode: 0644]

diff --git a/tests/slab.c b/tests/slab.c
new file mode 100644 (file)
index 0000000..4cd1926
--- /dev/null
@@ -0,0 +1,38 @@
+#include <slab.h>
+#include <stdio.h>
+
+static void test_single_cache(int iters, size_t size, int align, int flags,
+                              void (*ctor)(void *, size_t),
+                              void (*dtor)(void *, size_t))
+{
+       struct kmem_cache *test_cache;
+       void *objects[iters];
+       test_cache = kmem_cache_create("test_cache", size, align, flags, ctor, dtor);
+       printf("Testing Kmem Cache:\n");
+       print_kmem_cache(test_cache);
+       for (int i = 0; i < iters; i++) {
+               objects[i] = kmem_cache_alloc(test_cache, 0);
+               printf("Buffer %d addr = %p\n", i, objects[i]);
+       }
+       for (int i = 0; i < iters; i++) {
+               kmem_cache_free(test_cache, objects[i]);
+       }
+       kmem_cache_destroy(test_cache);
+       printf("\n\n\n\n");
+}
+
+void a_ctor(void *buf, size_t size)
+{
+       printf("constructin tests\n");
+}
+void a_dtor(void *buf, size_t size)
+{
+       printf("destructin tests\n");
+}
+
+int main(void)
+{
+       test_single_cache(10, 128, 512, 0, 0, 0);
+       test_single_cache(10, 128, 4, 0, a_ctor, a_dtor);
+       test_single_cache(10, 1024, 16, 0, 0, 0);
+}
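
The ctor/dtor hooks above only printf.  In the cache they run once per
object when a slab is grown or torn down (see kmem_cache_grow() and
kmem_slab_destroy() in user/parlib/slab.c), not on every alloc/free, so
they are meant for one-time object setup.  A small sketch with a
hypothetical object type (nothing below is part of this patch):

    #include <slab.h>
    #include <string.h>
    #include <sys/queue.h>

    /* Hypothetical cached object: a request header with a list entry. */
    struct req {
            TAILQ_ENTRY(req) link;
            size_t len;
            char tag[32];
    };

    /* Runs once per object when its slab is created. */
    static void req_ctor(void *obj, size_t size)
    {
            struct req *r = obj;

            memset(r->tag, 0, sizeof(r->tag));
            r->len = 0;
    }

    /* Runs once per object when its slab is destroyed. */
    static void req_dtor(void *obj, size_t size)
    {
            /* Nothing to release for this type. */
    }

    /* ... kmem_cache_create("req_cache", sizeof(struct req),
     *                       __alignof__(struct req), 0, req_ctor, req_dtor); */
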
diff --git a/user/parlib/include/slab.h b/user/parlib/include/slab.h
new file mode 100644 (file)
index 0000000..b133166
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2009 The Regents of the University of California
+ * Barret Rhoden <brho@cs.berkeley.edu>
+ * Kevin Klues <klueska@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * Slab allocator, based on the SunOS 5.4 allocator paper.
+ *
+ * There is a list of kmem_cache, which are the caches of objects of a given
+ * size.  This list is sorted in order of size.  Each kmem_cache has three
+ * lists of slabs: full, partial, and empty.  
+ *
+ * For large objects, the kmem_slabs point to bufctls, which have the address
+ * of their large buffers.  These slabs can consist of more than one contiguous
+ * page.
+ *
+ * For small objects, the slabs do not use the bufctls.  Instead, they point to
+ * the next free object in the slab.  The free objects themselves hold the
+ * address of the next free item.  The slab structure is stored at the end of
+ * the page.  There is only one page per slab.
+ *
+ * TODO: Note that this is a minor pain in the ass, and worth thinking about
+ * before implementing.  To keep the constructor's state valid, we can't just
+ * overwrite things, so we need to add an extra 4-8 bytes per object for the
+ * pointer, and then pass over that data when we return the actual object's
+ * address.  This also might fuck with alignment.
+ *
+ * Ported directly from the kernel's slab allocator. */
+
+#ifndef PARLIB_SLAB_H
+#define PARLIB_SLAB_H
+
+#include <ros/common.h>
+#include <ros/arch/mmu.h>
+#include <sys/queue.h>
+#include <arch/atomic.h>
+#include <mcs.h>
+
+/* Back in the day, their cutoff for "large objects" was 512B, based on
+ * measurements and on not wanting more than 1/8 of internal fragmentation. */
+#define NUM_BUF_PER_SLAB 8
+#define SLAB_LARGE_CUTOFF (PGSIZE / NUM_BUF_PER_SLAB)
+
+struct kmem_slab;
+
+/* Control block for buffers for large-object slabs */
+struct kmem_bufctl {
+       TAILQ_ENTRY(kmem_bufctl) link;
+       void *buf_addr;
+       struct kmem_slab *my_slab;
+};
+TAILQ_HEAD(kmem_bufctl_list, kmem_bufctl);
+
+/* Slabs contain the objects.  Can be either full, partial, or empty,
+ * determined by checking the number of objects busy vs total.  For large
+ * slabs, the bufctl list is used to find a free buffer.  For small, the void*
+ * is used instead. */
+struct kmem_slab {
+       TAILQ_ENTRY(kmem_slab) link;
+       size_t obj_size;
+       size_t num_busy_obj;
+       size_t num_total_obj;
+       union {
+               struct kmem_bufctl_list bufctl_freelist;
+               void *free_small_obj;
+       };
+};
+TAILQ_HEAD(kmem_slab_list, kmem_slab);
+
+/* Actual cache */
+struct kmem_cache {
+       SLIST_ENTRY(kmem_cache) link;
+       struct mcs_pdr_lock cache_lock;
+       const char *name;
+       size_t obj_size;
+       int align;
+       int flags;
+       struct kmem_slab_list full_slab_list;
+       struct kmem_slab_list partial_slab_list;
+       struct kmem_slab_list empty_slab_list;
+       void (*ctor)(void *, size_t);
+       void (*dtor)(void *, size_t);
+       unsigned long nr_cur_alloc;
+};
+
+/* List of all kmem_caches, sorted in order of size */
+SLIST_HEAD(kmem_cache_list, kmem_cache);
+extern struct kmem_cache_list kmem_caches;
+
+/* Cache management */
+struct kmem_cache *kmem_cache_create(const char *name, size_t obj_size,
+                                     int align, int flags,
+                                     void (*ctor)(void *, size_t),
+                                     void (*dtor)(void *, size_t));
+void kmem_cache_destroy(struct kmem_cache *cp);
+/* Front end: clients of caches use these */
+void *kmem_cache_alloc(struct kmem_cache *cp, int flags);
+void kmem_cache_free(struct kmem_cache *cp, void *buf);
+/* Back end: internal functions */
+void kmem_cache_init(void);
+void kmem_cache_reap(struct kmem_cache *cp);
+
+/* Debug */
+void print_kmem_cache(struct kmem_cache *kc);
+void print_kmem_slab(struct kmem_slab *slab);
+
+#endif /* PARLIB_SLAB_H */
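
As a concrete check on the small-object layout described in the header
comment (next-free pointer appended to each object, kmem_slab struct
stored at the end of a single page), the per-slab capacity falls out of
two lines of arithmetic.  The sketch below just mirrors the sizing in
kmem_cache_grow(); it assumes PGSIZE is 4096 and that ROUNDUP/PGSIZE
come in through slab.h the same way slab.c gets them, and the exact
count shifts with the pointer size and struct padding.

    #include <slab.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Each object gets room for the next-free pointer appended, then
             * is rounded up to the requested alignment; the kmem_slab struct
             * sits at the end of the page. */
            size_t obj_size = 128, align = 16;
            size_t padded = ROUNDUP(obj_size + sizeof(uintptr_t), align);
            size_t per_slab = (PGSIZE - sizeof(struct kmem_slab)) / padded;

            /* With a 4096-byte page this is a 144-byte slot, so roughly 28
             * objects per page-sized slab. */
            printf("padded obj: %d, objs per slab: %d\n", (int)padded,
                   (int)per_slab);
            return 0;
    }
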
diff --git a/user/parlib/slab.c b/user/parlib/slab.c
new file mode 100644 (file)
index 0000000..72356fd
--- /dev/null
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2009 The Regents of the University of California
+ * Barret Rhoden <brho@cs.berkeley.edu>
+ * Kevin Klues <klueska@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * Slab allocator, based on the SunOS 5.4 allocator paper.
+ *
+ * Note that we don't have a hash table to map a buf back to its bufctl for
+ * large objects, so we use the same trick as for small objects: store the
+ * bufctl pointer just past the object.  Fix this with TODO (BUF).
+ *
+ * Ported directly from the kernel's slab allocator. */
+
+#include <slab.h>
+#include <stdio.h>
+#include <assert.h>
+#include <sys/mman.h>
+
+struct kmem_cache_list kmem_caches;
+struct mcs_pdr_lock kmem_caches_lock;
+
+/* Backend/internal functions, defined later.  Grab the lock before calling
+ * these. */
+static void kmem_cache_grow(struct kmem_cache *cp);
+
+/* Cache of the kmem_cache objects, needed for bootstrapping */
+struct kmem_cache kmem_cache_cache;
+struct kmem_cache *kmem_slab_cache, *kmem_bufctl_cache;
+
+static void __kmem_cache_create(struct kmem_cache *kc, const char *name,
+                                size_t obj_size, int align, int flags,
+                                void (*ctor)(void *, size_t),
+                                void (*dtor)(void *, size_t))
+{
+       assert(kc);
+       assert(align);
+       mcs_pdr_init(&kc->cache_lock);
+       kc->name = name;
+       kc->obj_size = obj_size;
+       kc->align = align;
+       kc->flags = flags;
+       TAILQ_INIT(&kc->full_slab_list);
+       TAILQ_INIT(&kc->partial_slab_list);
+       TAILQ_INIT(&kc->empty_slab_list);
+       kc->ctor = ctor;
+       kc->dtor = dtor;
+       kc->nr_cur_alloc = 0;
+       
+       /* put it in the cache list based on its size */
+       struct kmem_cache *i, *prev = NULL;
+       mcs_pdr_lock(&kmem_caches_lock);
+       /* find the kmem_cache before us in the list.  yes, this is O(n). */
+       SLIST_FOREACH(i, &kmem_caches, link) {
+               if (i->obj_size < kc->obj_size)
+                       prev = i;
+               else
+                       break;
+       }
+       if (prev)
+               SLIST_INSERT_AFTER(prev, kc, link);
+       else
+               SLIST_INSERT_HEAD(&kmem_caches, kc, link);
+       mcs_pdr_unlock(&kmem_caches_lock);
+}
+
+void kmem_cache_init(void)
+{
+       mcs_pdr_init(&kmem_caches_lock);
+       SLIST_INIT(&kmem_caches);
+       /* We need to call the __ version directly to bootstrap the global
+        * kmem_cache_cache. */
+       __kmem_cache_create(&kmem_cache_cache, "kmem_cache",
+                           sizeof(struct kmem_cache),
+                           __alignof__(struct kmem_cache), 0, NULL, NULL);
+       /* Build the slab and bufctl caches */
+       kmem_slab_cache = kmem_cache_create("kmem_slab", sizeof(struct kmem_slab),
+                              __alignof__(struct kmem_slab), 0, NULL, NULL); 
+       kmem_bufctl_cache = kmem_cache_create("kmem_bufctl",
+                                sizeof(struct kmem_bufctl),
+                                __alignof__(struct kmem_bufctl), 0, NULL, NULL); 
+}
+
+/* Cache management */
+struct kmem_cache *kmem_cache_create(const char *name, size_t obj_size,
+                                     int align, int flags,
+                                     void (*ctor)(void *, size_t),
+                                     void (*dtor)(void *, size_t))
+{
+       static atomic_t initialized = FALSE;
+       /* Init the slab system.  Every caller swaps in TRUE; whoever wins the
+        * race (among multiple initializers) does the real init. */
+       if (!atomic_swap(&initialized, TRUE))
+               kmem_cache_init();
+
+       struct kmem_cache *kc = kmem_cache_alloc(&kmem_cache_cache, 0);
+       __kmem_cache_create(kc, name, obj_size, align, flags, ctor, dtor);
+       return kc;
+}
+
+static void kmem_slab_destroy(struct kmem_cache *cp, struct kmem_slab *a_slab)
+{
+       if (cp->obj_size <= SLAB_LARGE_CUTOFF) {
+               /* Deconstruct all the objects, if necessary */
+               if (cp->dtor) {
+                       void *buf = a_slab->free_small_obj;
+                       for (int i = 0; i < a_slab->num_total_obj; i++) {
+                               cp->dtor(buf, cp->obj_size);
+                               buf += a_slab->obj_size;
+                       }
+               }
+               munmap(ROUNDDOWN(a_slab, PGSIZE), PGSIZE);
+       } else {
+               struct kmem_bufctl *i;
+               void *page_start = (void*)-1;
+               // compute how many pages are allocated, given a power of two allocator
+               size_t num_pages = ROUNDUPPWR2(a_slab->num_total_obj * a_slab->obj_size)
+                                               / PGSIZE;
+               TAILQ_FOREACH(i, &a_slab->bufctl_freelist, link) {
+                       // Track the lowest buffer address, which is the start of the buffer
+                       page_start = MIN(page_start, i->buf_addr);
+                       /* Deconstruct all the objects, if necessary */
+                       if (cp->dtor) // TODO: (BUF)
+                               cp->dtor(i->buf_addr, cp->obj_size);
+                       kmem_cache_free(kmem_bufctl_cache, i);
+               }
+               // free the pages for the slab's buffer
+               munmap(page_start, num_pages * PGSIZE);
+               // free the slab object
+               kmem_cache_free(kmem_slab_cache, a_slab);
+       }
+}
+
+/* Once you call destroy, never use this cache again... o/w there may be weird
+ * races, and other serious issues.  */
+void kmem_cache_destroy(struct kmem_cache *cp)
+{
+       struct kmem_slab *a_slab, *next;
+
+       mcs_pdr_lock(&cp->cache_lock);
+       assert(TAILQ_EMPTY(&cp->full_slab_list));
+       assert(TAILQ_EMPTY(&cp->partial_slab_list));
+       /* Clean out the empty list.  We can't use a regular FOREACH here, since the
+        * link element is stored in the slab struct, which is stored on the page
+        * that we are freeing. */
+       a_slab = TAILQ_FIRST(&cp->empty_slab_list);
+       while (a_slab) {
+               next = TAILQ_NEXT(a_slab, link);
+               kmem_slab_destroy(cp, a_slab);
+               a_slab = next;
+       }
+       mcs_pdr_lock(&kmem_caches_lock);
+       SLIST_REMOVE(&kmem_caches, cp, kmem_cache, link);
+       mcs_pdr_unlock(&kmem_caches_lock);
+       mcs_pdr_unlock(&cp->cache_lock);
+       kmem_cache_free(&kmem_cache_cache, cp);
+}
+
+/* Front end: clients of caches use these */
+void *kmem_cache_alloc(struct kmem_cache *cp, int flags)
+{
+       void *retval = NULL;
+       mcs_pdr_lock(&cp->cache_lock);
+       // look at partial list
+       struct kmem_slab *a_slab = TAILQ_FIRST(&cp->partial_slab_list);
+       //      if none, go to empty list and get an empty and make it partial
+       if (!a_slab) {
+               if (TAILQ_EMPTY(&cp->empty_slab_list))
+                       // TODO: think about non-sleeping flags
+                       kmem_cache_grow(cp);
+               // move to partial list
+               a_slab = TAILQ_FIRST(&cp->empty_slab_list);
+               TAILQ_REMOVE(&cp->empty_slab_list, a_slab, link);
+               TAILQ_INSERT_HEAD(&cp->partial_slab_list, a_slab, link);
+       } 
+       // have a partial now (a_slab), get an item, return item
+       if (cp->obj_size <= SLAB_LARGE_CUTOFF) {
+               retval = a_slab->free_small_obj;
+               /* the pointer to the next free object is stored just past the
+                * object's obj_size bytes; follow it to advance free_small_obj */
+               a_slab->free_small_obj = *(uintptr_t**)(a_slab->free_small_obj +
+                                                       cp->obj_size);
+       } else {
+               // rip the first bufctl out of the partial slab's buf list
+               struct kmem_bufctl *a_bufctl = TAILQ_FIRST(&a_slab->bufctl_freelist);
+               TAILQ_REMOVE(&a_slab->bufctl_freelist, a_bufctl, link);
+               retval = a_bufctl->buf_addr;
+       }
+       a_slab->num_busy_obj++;
+       // Check if we are full, if so, move to the full list
+       if (a_slab->num_busy_obj == a_slab->num_total_obj) {
+               TAILQ_REMOVE(&cp->partial_slab_list, a_slab, link);
+               TAILQ_INSERT_HEAD(&cp->full_slab_list, a_slab, link);
+       }
+       cp->nr_cur_alloc++;
+       mcs_pdr_unlock(&cp->cache_lock);
+       return retval;
+}
+
+static inline struct kmem_bufctl *buf2bufctl(void *buf, size_t offset)
+{
+       // TODO: hash table for back reference (BUF)
+       return *((struct kmem_bufctl**)(buf + offset));
+}
+
+void kmem_cache_free(struct kmem_cache *cp, void *buf)
+{
+       struct kmem_slab *a_slab;
+       struct kmem_bufctl *a_bufctl;
+
+       mcs_pdr_lock(&cp->cache_lock);
+       if (cp->obj_size <= SLAB_LARGE_CUTOFF) {
+               // find its slab
+               a_slab = (struct kmem_slab*)(ROUNDDOWN(buf, PGSIZE) + PGSIZE -
+                                            sizeof(struct kmem_slab));
+               /* write location of next free small obj to the space at the end of the
+                * buffer, then list buf as the next free small obj */
+               *(uintptr_t**)(buf + cp->obj_size) = a_slab->free_small_obj;
+               a_slab->free_small_obj = buf;
+       } else {
+               /* Give the bufctl back to the parent slab */
+               // TODO: (BUF) change the interface to not take an offset
+               a_bufctl = buf2bufctl(buf, cp->obj_size);
+               a_slab = a_bufctl->my_slab;
+               TAILQ_INSERT_HEAD(&a_slab->bufctl_freelist, a_bufctl, link);
+       }
+       a_slab->num_busy_obj--;
+       cp->nr_cur_alloc--;
+       // if it was full, move it to partial
+       if (a_slab->num_busy_obj + 1 == a_slab->num_total_obj) {
+               TAILQ_REMOVE(&cp->full_slab_list, a_slab, link);
+               TAILQ_INSERT_HEAD(&cp->partial_slab_list, a_slab, link);
+       } else if (!a_slab->num_busy_obj) {
+               // if there are none busy, move it from partial to empty
+               TAILQ_REMOVE(&cp->partial_slab_list, a_slab, link);
+               TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link);
+       }
+       mcs_pdr_unlock(&cp->cache_lock);
+}
+
+/* Back end: internal functions */
+/* When this returns, the cache has at least one slab in the empty list.  If
+ * page_alloc fails, there are some serious issues.  This only grows by one slab
+ * at a time.
+ *
+ * Grab the cache lock before calling this.
+ *
+ * TODO: think about page colouring issues with kernel memory allocation. */
+static void kmem_cache_grow(struct kmem_cache *cp)
+{
+       struct kmem_slab *a_slab;
+       struct kmem_bufctl *a_bufctl;
+       void *a_page;
+       if (cp->obj_size <= SLAB_LARGE_CUTOFF) {
+               // Just get a single page for small slabs
+               a_page = mmap(0, PGSIZE, PROT_READ | PROT_WRITE,
+                             MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
+               assert(a_page != MAP_FAILED);
+               // the slab struct is stored at the end of the page
+               a_slab = (struct kmem_slab*)(a_page + PGSIZE -
+                                            sizeof(struct kmem_slab));
+               // Need to add room for the next free item pointer in the object buffer.
+               a_slab->obj_size = ROUNDUP(cp->obj_size + sizeof(uintptr_t), cp->align);
+               a_slab->num_busy_obj = 0;
+               a_slab->num_total_obj = (PGSIZE - sizeof(struct kmem_slab)) /
+                                       a_slab->obj_size;
+               // TODO: consider staggering this IAW section 4.3
+               a_slab->free_small_obj = a_page;
+               /* Walk and create the free list.  Each object stores the location
+                * of the next free object just past its obj_size; the last is NULL. */
+               void *buf = a_slab->free_small_obj;
+               for (int i = 0; i < a_slab->num_total_obj - 1; i++) {
+                       // Initialize the object, if necessary
+                       if (cp->ctor)
+                               cp->ctor(buf, cp->obj_size);
+                       *(uintptr_t**)(buf + cp->obj_size) = buf + a_slab->obj_size;
+                       buf += a_slab->obj_size;
+               }
+               *((uintptr_t**)(buf + cp->obj_size)) = NULL;
+       } else {
+               a_slab = kmem_cache_alloc(kmem_slab_cache, 0);
+               // TODO: hash table for back reference (BUF)
+               a_slab->obj_size = ROUNDUP(cp->obj_size + sizeof(uintptr_t), cp->align);
+               // alloc n pages, such that it can hold at least 8 items
+               size_t num_pgs = ROUNDUP(NUM_BUF_PER_SLAB * a_slab->obj_size, PGSIZE) /
+                                          PGSIZE;
+               // mmap a power-of-two number of pages, to match num_total_obj
+               // below and the munmap math in kmem_slab_destroy
+               void *buf = mmap(0, ROUNDUPPWR2(num_pgs) * PGSIZE,
+                                PROT_READ | PROT_WRITE,
+                                MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
+               assert(buf != MAP_FAILED);
+               a_slab->num_busy_obj = 0;
+               a_slab->num_total_obj = ROUNDUPPWR2(num_pgs)*PGSIZE / a_slab->obj_size;
+               TAILQ_INIT(&a_slab->bufctl_freelist);
+               /* for each buffer, set up a bufctl and point to the buffer */
+               for (int i = 0; i < a_slab->num_total_obj; i++) {
+                       // Initialize the object, if necessary
+                       if (cp->ctor)
+                               cp->ctor(buf, cp->obj_size);
+                       a_bufctl = kmem_cache_alloc(kmem_bufctl_cache, 0);      
+                       TAILQ_INSERT_HEAD(&a_slab->bufctl_freelist, a_bufctl, link);
+                       a_bufctl->buf_addr = buf;
+                       a_bufctl->my_slab = a_slab;
+                       // TODO: (BUF) write the bufctl reference at the bottom of the buffer.
+                       *(struct kmem_bufctl**)(buf + cp->obj_size) = a_bufctl;
+                       buf += a_slab->obj_size;
+               }
+       }
+       // add a_slab to the empty_list
+       TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link);
+}
+
+/* This deallocs every slab from the empty list.  TODO: think a bit more about
+ * this.  We can do things like not free all of the empty slabs, to prevent
+ * thrashing.  See 3.4 in the paper. */
+void kmem_cache_reap(struct kmem_cache *cp)
+{
+       struct kmem_slab *a_slab, *next;
+       
+       // Destroy all empty slabs.  Refer to the notes about the while loop
+       mcs_pdr_lock(&cp->cache_lock);
+       a_slab = TAILQ_FIRST(&cp->empty_slab_list);
+       while (a_slab) {
+               next = TAILQ_NEXT(a_slab, link);
+               kmem_slab_destroy(cp, a_slab);
+               a_slab = next;
+       }
+       mcs_pdr_unlock(&cp->cache_lock);
+}
+
+void print_kmem_cache(struct kmem_cache *cp)
+{
+       mcs_pdr_lock(&cp->cache_lock);
+       printf("\nPrinting kmem_cache:\n---------------------\n");
+       printf("Name: %s\n", cp->name);
+       printf("Objsize: %d\n", cp->obj_size);
+       printf("Align: %d\n", cp->align);
+       printf("Flags: 0x%08x\n", cp->flags);
+       printf("Constructor: 0x%08x\n", cp->ctor);
+       printf("Destructor: 0x%08x\n", cp->dtor);
+       printf("Slab Full: 0x%08x\n", cp->full_slab_list);
+       printf("Slab Partial: 0x%08x\n", cp->partial_slab_list);
+       printf("Slab Empty: 0x%08x\n", cp->empty_slab_list);
+       printf("Current Allocations: %d\n", cp->nr_cur_alloc);
+       mcs_pdr_unlock(&cp->cache_lock);
+}
+
+void print_kmem_slab(struct kmem_slab *slab)
+{
+       printf("\nPrinting kmem_slab:\n---------------------\n");
+       printf("Objsize: %d (%p)\n", slab->obj_size, slab->obj_size);
+       printf("NumBusy: %d\n", slab->num_busy_obj);
+       printf("Num_total: %d\n", slab->num_total_obj);
+       if (slab->obj_size + sizeof(uintptr_t) < SLAB_LARGE_CUTOFF) {
+               printf("Free Small obj: 0x%08x\n", slab->free_small_obj);
+               void *buf = slab->free_small_obj;
+               for (int i = 0; i < slab->num_total_obj; i++) {
+                       printf("Addr of buf: 0x%08x, Addr of next: 0x%08x\n", buf,
+                              *((uintptr_t**)buf));
+                       buf += slab->obj_size;
+               }
+       } else {
+               printf("This is a big slab!\n");
+       }
+}
+
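
The TODO (BUF) notes above are about the buf-to-bufctl back reference:
for large objects, the bufctl pointer is currently written just past
each buffer, mirroring the small-object trick.  The SunOS paper's
alternative is a hash table keyed on the buffer address.  The fragment
below is only a sketch of that idea, not code from this patch:
kmem_bufctl_htable, bufctl_hash, and htable_lookup are made-up names.
An allocated bufctl is off its slab's freelist, so its link field could
be reused for the hash chain; kmem_cache_alloc() would insert the
bufctl when handing out the buffer, and kmem_cache_free() would look it
up here instead of reading past the object.

    #include <stdint.h>
    #include <slab.h>

    #define BUFCTL_HASH_SZ 256

    /* Hypothetical table: one bucket of in-use bufctls per hash value. */
    struct kmem_bufctl_htable {
            struct kmem_bufctl_list buckets[BUFCTL_HASH_SZ];
    };

    static size_t bufctl_hash(void *buf)
    {
            /* Large objects are at least SLAB_LARGE_CUTOFF bytes apart, so
             * the low bits of the address carry little information. */
            return ((uintptr_t)buf / SLAB_LARGE_CUTOFF) % BUFCTL_HASH_SZ;
    }

    static struct kmem_bufctl *htable_lookup(struct kmem_bufctl_htable *ht,
                                             void *buf)
    {
            struct kmem_bufctl *i;

            TAILQ_FOREACH(i, &ht->buckets[bufctl_hash(buf)], link)
                    if (i->buf_addr == buf)
                            return i;
            return NULL;
    }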