akaros/kern/src/slab.c
   1/* Copyright (c) 2009 The Regents of the University of California
   2 * See LICENSE for details.
   3 *
   4 * Slab allocator, based on the SunOS 5.4 allocator paper.
   5 *
   6 * Barret Rhoden <brho@cs.berkeley.edu>
   7 * Kevin Klues <klueska@cs.berkeley.edu>
   8 *
   9 * Copyright (c) 2016 Google Inc
  10 *
  11 * Upgraded and extended to support magazines, based on Bonwick and Adams's
  12 * "Magazines and Vmem" paper.
  13 *
  14 * Barret Rhoden <brho@cs.berkeley.edu>
  15 *
  16 * FAQ:
  17 * - What sort of allocator do we need for the kmem_pcpu_caches?  In general,
  18 *   the base allocator.  All slabs/caches depend on the pcpu_caches for any
  19 *   allocation, so we need something that does not rely on slabs.  We could use
  20 *   generic kpages, if we knew that we weren't: qcaches for a kpages_arena, the
  21 *   slab kcache, or the bufctl kcache.  This is the same set of restrictions
  22 *   for the hash table allocations.
  23 * - Why doesn't the magazine cache deadlock on itself?  Because magazines are
  24 *   only allocated during the free path of another cache.  There are no
  25 *   magazine allocations during a cache's allocation.
  26 * - Does the magazine cache need to be statically allocated?  Maybe not, but it
  27 *   doesn't hurt.  We need to set it up at some point.  We can use other caches
  28 *   for allocations before the mag cache is initialized, but we can't free.
  29 * - Does the magazine cache need to pull from the base arena?  Similar to the
  30 *   static allocation question - by default, maybe not, but it is safer.  And
  31 *   yes, due to other design choices.  We could initialize it after kpages is
  32 *   allocated and use a kpages_arena, but that would require us to not free a
  33 *   page before or during kpages_arena_init().  A related note is where the
  34 *   first magazines in a pcpu_cache come from.  I'm currently going with "raw
  35 *   slab alloc from the magazine cache", which means magazines need to work
  36 *   when we're setting up the qcache's for kpages_arena.  That creates a
  37 *   dependency, which means kpages depends on mags, which means mags can only
  38 *   depend on base.  If we ever use slabs for non-base arena btags, we'll also
  39 *   have this dependency between kpages and mags.
  40 * - The paper talks about full and empty magazines.  Why does our code talk
  41 *   about not_empty and empty?  The way we'll do our magazine resizing is to
   42 *   just increment the pcpu_cache's magsize.  Then we'll eventually start
  43 *   filling the magazines to their new capacity (during frees, btw).  During
  44 *   this time, a mag that was previously full will technically be not-empty,
  45 *   but not full.  The correctness of the magazine code is still OK, I think,
  46 *   since when they say 'full', they require 'not empty' in most cases.  In
  47 *   short, 'not empty' is more accurate, though it makes sense to say 'full'
  48 *   when explaining the basic idea for their paper.
  49 * - Due to a resize, what happens when the depot gives a pcpu cache a magazine
  50 *   with *more* rounds than ppc->magsize?  The allocation path doesn't care
  51 *   about magsize - it just looks at nr_rounds.  So that's fine.  On the free
  52 *   path, we might mistakenly think that a mag has no more room.  In that case,
  53 *   we'll just hand it to the depot and it'll be a 'not-empty' mag.  Eventually
  54 *   it'll get filled up, or it just won't matter.  'magsize' is basically an
  55 *   instruction to the pcpu_cache: "fill to X, please."
  56 * - Why is nr_rounds tracked in the magazine and not the pcpu cache?  The paper
  57 *   uses the pcpu cache, but doesn't say whether or not the mag tracks it too.
  58 *   We track it in the mag since not all mags have the same size (e.g.  during
  59 *   a resize operation).  For performance (avoid an occasional cache miss), we
  60 *   could consider tracking it in the pcpu_cache.  Might save a miss now and
  61 *   then.
  62 * - Why do we just disable IRQs for the pcpu_cache?  The paper explicitly talks
  63 *   about using locks instead of disabling IRQs, since disabling IRQs can be
   64 *   expensive.  First off, we only disable IRQs when there is a 1:1 mapping
   65 *   of cores to pccs.  If we were to use a spinlock, we'd be disabling IRQs
   66 *   anyway, since we do allocations from IRQ context.  The other reason to
   67 *   lock is when changing the pcpu state during a magazine resize.  I have two
   68 *   ways to do this: racily write and set pcc->magsize, or have the pccs poll
   69 *   when they check the depot during free.  Neither approach requires someone
   70 *   else to grab a pcc lock.
  71 *
  72 * TODO:
  73 * - Add reclaim function.
  74 * - When resizing, do we want to go through the depot and consolidate
  75 *   magazines?  (probably not a big deal.  maybe we'd deal with it when we
  76 *   clean up our excess mags.)
  77 * - Could do some working set tracking.  Like max/min over an interval, with
  78 *   resetting (in the depot, used for reclaim and maybe aggressive freeing).
  79 * - Debugging info
  80 */
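
/* Quick reference for the layering described above.  The alloc path tries, in
 * order (the free path is the mirror image):
 *
 *   kmem_cache_alloc()
 *      -> pcc->loaded magazine (pop a round)
 *      -> pcc->prev magazine (swap if it has rounds)
 *      -> the depot's not_empty list (exchanged for our empty prev)
 *      -> __kmem_alloc_from_slab(): partial/empty slab lists, growing via
 *         kmem_cache_grow() from the source arena if needed
 */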
  81
  82#include <slab.h>
  83#include <stdio.h>
  84#include <assert.h>
  85#include <pmap.h>
  86#include <kmalloc.h>
  87#include <hash.h>
  88#include <arena.h>
  89#include <hashtable.h>
  90
  91#define SLAB_POISON ((void*)0xdead1111)
  92
  93/* Tunables.  I don't know which numbers to pick yet.  Maybe we play with it at
  94 * runtime.  Though once a mag increases, it'll never decrease. */
  95uint64_t resize_timeout_ns = 1000000000;
  96unsigned int resize_threshold = 1;
  97
  98/* Protected by the arenas_and_slabs_lock. */
  99struct kmem_cache_tailq all_kmem_caches =
 100                TAILQ_HEAD_INITIALIZER(all_kmem_caches);
 101
 102static void kmc_track(struct kmem_cache *kc)
 103{
 106        qlock(&arenas_and_slabs_lock);
 107        TAILQ_INSERT_TAIL(&all_kmem_caches, kc, all_kmc_link);
 108        qunlock(&arenas_and_slabs_lock);
 109}
 110
 111static void kmc_untrack(struct kmem_cache *kc)
 112{
 113        qlock(&arenas_and_slabs_lock);
 114        TAILQ_REMOVE(&all_kmem_caches, kc, all_kmc_link);
 115        qunlock(&arenas_and_slabs_lock);
 116}
 117
 118/* Backend/internal functions, defined later.  Grab the lock before calling
 119 * these. */
 120static bool kmem_cache_grow(struct kmem_cache *cp);
 121static void *__kmem_alloc_from_slab(struct kmem_cache *cp, int flags);
 122static void __kmem_free_to_slab(struct kmem_cache *cp, void *buf);
 123
 124/* Forward declarations for trace hooks */
 125static void kmem_trace_ht_init(struct kmem_trace_ht *ht);
 126static void kmem_trace_free(struct kmem_cache *kc, void *obj);
 127static void kmem_trace_alloc(struct kmem_cache *kc, void *obj);
 128static void kmem_trace_warn_notempty(struct kmem_cache *kc);
 129
 130/* Cache of the kmem_cache objects, needed for bootstrapping */
 131struct kmem_cache kmem_cache_cache[1];
 132struct kmem_cache kmem_slab_cache[1];
 133struct kmem_cache kmem_bufctl_cache[1];
 134struct kmem_cache kmem_magazine_cache[1];
 135struct kmem_cache kmem_trace_cache[1];
 136
 137static bool __use_bufctls(struct kmem_cache *cp)
 138{
 139        return cp->flags & __KMC_USE_BUFCTL;
 140}
 141
 142/* Using a layer of indirection for the pcpu caches, in case we want to use
 143 * clustered objects, only per-NUMA-domain caches, or something like that. */
 144unsigned int kmc_nr_pcpu_caches(void)
 145{
 146        return num_cores;
 147}
 148
 149static struct kmem_pcpu_cache *get_my_pcpu_cache(struct kmem_cache *kc)
 150{
 151        return &kc->pcpu_caches[core_id()];
 152}
 153
 154/* In our current model, there is one pcc per core.  If we had multiple cores
 155 * that could use the pcc, such as with per-NUMA caches, then we'd need a
 156 * spinlock.  Since we do allocations from IRQ context, we still need to disable
 157 * IRQs. */
 158static void lock_pcu_cache(struct kmem_pcpu_cache *pcc)
 159{
 160        disable_irqsave(&pcc->irq_state);
 161}
 162
 163static void unlock_pcu_cache(struct kmem_pcpu_cache *pcc)
 164{
 165        enable_irqsave(&pcc->irq_state);
 166}
 167
 168static void lock_depot(struct kmem_depot *depot)
 169{
 170        uint64_t time;
 171
 172        if (spin_trylock_irqsave(&depot->lock))
 173                return;
 174        /* The lock is contended.  When we finally get the lock, we'll up the
 175         * contention count and see if we've had too many contentions over time.
 176         *
 177         * The idea is that if there are bursts of contention worse than X
 178         * contended acquisitions in Y nsec, then we'll grow the magazines.
 179         * This might not be that great of an approach - every thread gets one
 180         * count, regardless of how long they take.
 181         *
 182         * We read the time before locking so that we don't artificially grow
 183         * the window too much.  Say the lock is heavily contended and we take a
 184         * long time to get it.  Perhaps X threads try to lock it immediately,
  185         * but it takes over Y nsec for the Xth thread to actually get the
 186         * lock.  We might then think the burst wasn't big enough. */
 187        time = nsec();
 188        spin_lock_irqsave(&depot->lock);
 189        /* If there are no not-empty mags, we're probably fighting for the lock
 190         * not because the magazines aren't big enough, but because there aren't
 191         * enough mags in the system yet. */
 192        if (!depot->nr_not_empty)
 193                return;
 194        if (time - depot->busy_start > resize_timeout_ns) {
 195                depot->busy_count = 0;
 196                depot->busy_start = time;
 197        }
 198        depot->busy_count++;
 199        if (depot->busy_count > resize_threshold) {
 200                depot->busy_count = 0;
 201                depot->magsize = MIN(KMC_MAG_MAX_SZ, depot->magsize + 1);
 202                /* That's all we do - the pccs will eventually notice and up
 203                 * their magazine sizes. */
 204        }
 205}
 206
 207static void unlock_depot(struct kmem_depot *depot)
 208{
 209        spin_unlock_irqsave(&depot->lock);
 210}
 211
 212static void depot_init(struct kmem_depot *depot)
 213{
 214        spinlock_init_irqsave(&depot->lock);
 215        SLIST_INIT(&depot->not_empty);
 216        SLIST_INIT(&depot->empty);
 217        depot->magsize = KMC_MAG_MIN_SZ;
 218        depot->nr_not_empty = 0;
 219        depot->nr_empty = 0;
 220        depot->busy_count = 0;
 221        depot->busy_start = 0;
 222}
 223
 224static bool mag_is_empty(struct kmem_magazine *mag)
 225{
 226        return mag->nr_rounds == 0;
 227}
 228
 229/* Helper, swaps the loaded and previous mags.  Hold the pcc lock. */
 230static void __swap_mags(struct kmem_pcpu_cache *pcc)
 231{
 232        struct kmem_magazine *temp;
 233
 234        temp = pcc->prev;
 235        pcc->prev = pcc->loaded;
 236        pcc->loaded = temp;
 237}
 238
 239/* Helper, returns a magazine to the depot.  Hold the depot lock. */
 240static void __return_to_depot(struct kmem_cache *kc, struct kmem_magazine *mag)
 241{
 242        struct kmem_depot *depot = &kc->depot;
 243
 244        if (mag_is_empty(mag)) {
 245                SLIST_INSERT_HEAD(&depot->empty, mag, link);
 246                depot->nr_empty++;
 247        } else {
 248                SLIST_INSERT_HEAD(&depot->not_empty, mag, link);
 249                depot->nr_not_empty++;
 250        }
 251}
 252
 253/* Helper, removes the contents of the magazine, giving them back to the slab
 254 * layer. */
 255static void drain_mag(struct kmem_cache *kc, struct kmem_magazine *mag)
 256{
 257        for (int i = 0; i < mag->nr_rounds; i++) {
 258                if (kc->dtor)
 259                        kc->dtor(mag->rounds[i], kc->priv);
 260                __kmem_free_to_slab(kc, mag->rounds[i]);
 261        }
 262        mag->nr_rounds = 0;
 263}
 264
 265static struct kmem_pcpu_cache *build_pcpu_caches(void)
 266{
 267        struct kmem_pcpu_cache *pcc;
 268
 269        pcc = base_alloc(NULL,
 270                         sizeof(struct kmem_pcpu_cache) * kmc_nr_pcpu_caches(),
 271                         MEM_WAIT);
 272        for (int i = 0; i < kmc_nr_pcpu_caches(); i++) {
 273                pcc[i].irq_state = 0;
 274                pcc[i].magsize = KMC_MAG_MIN_SZ;
 275                pcc[i].loaded = __kmem_alloc_from_slab(kmem_magazine_cache,
 276                                                       MEM_WAIT);
 277                pcc[i].prev = __kmem_alloc_from_slab(kmem_magazine_cache,
 278                                                     MEM_WAIT);
 279                pcc[i].nr_allocs_ever = 0;
 280        }
 281        return pcc;
 282}
 283
 284void __kmem_cache_create(struct kmem_cache *kc, const char *name,
 285                         size_t obj_size, int align, int flags,
 286                         struct arena *source,
 287                         int (*ctor)(void *, void *, int),
 288                         void (*dtor)(void *, void *), void *priv)
 289{
 290        assert(kc);
 291        /* Our alignment is independent of our source's quantum.  We pull from
 292         * our source, which gives us quantum-multiple/aligned chunks, but our
 293         * alignment and object size is our own business.  Mostly.
  294          * alignment and object size are our own business.  Mostly.
 295         * There is one guarantee we must make:
 296         * - If aligned-obj_size (ALIGN(obj_size, align)) is a multiple of our
 297         *   source's quantum, then all objects we return are
 298         *   quantum-multiple-aligned (addresses are multiples of quantum).
 299         *
 300         * The main restriction for us is that when we get a slab from our
 301         * source, we need to hand out objects at the beginning of the slab
 302         * (where we are source quantum-aligned).
 303         *
 304         * As an example, if our source quantum is 15, and we give out 45 byte
 305         * objects, we must give out e.g. [15,60), but not [10,55).  This really
 306         * only comes up for qcaches for arenas that aren't memory, since all
 307         * memory users will be going with power-of-two alignment.  And
 308         * typically the slabs will have their own alignment.  e.g.
 309         * alignof(struct foo), with a PGSIZE-quantum source.
 310         *
 311         * Our objects are always aligned to 'align', regardless of our source's
 312         * alignment/quantum.  Similarly, if our source's quantum is a multiple
 313         * of aligned-obj_size, then all objects we return are
 314         * obj_size-multiple-aligned. */
 315        assert(IS_PWR2(align));
 316        /* Every allocation is aligned, and every allocation is the same
 317         * size, so we might as well align-up obj_size. */
 318        obj_size = ALIGN(obj_size, align);
 319        spinlock_init_irqsave(&kc->cache_lock);
 320        strlcpy(kc->name, name, KMC_NAME_SZ);
 321        /* We might want some sort of per-call site NUMA-awareness in the
 322         * future. */
 323        source = source ?: kpages_arena;
 324        kc->source = source;
 325        kc->obj_size = obj_size;
 326        kc->align = align;
 327        kc->flags = flags;
  328        /* KMC_NOTOUCH must use bufctls, even for small objects, so that we
  329         * never use the object's memory for bookkeeping.  Non-bufctl objects
  330         * must hold a pointer, which forms the linked list of free objects. */
 331        if (obj_size < sizeof(void*) || obj_size > SLAB_LARGE_CUTOFF
 332            || flags & KMC_NOTOUCH) {
 333                kc->flags |= __KMC_USE_BUFCTL;
 334        } else {
 335                /* pro-touch (non-bufctl) slabs must get a page-aligned slab
 336                 * from the source.  quantum < PGSIZE won't guarantee that.
 337                 * quantum > PGSIZE is a waste and a programmer error. */
 338                if (kc->source->quantum != PGSIZE) {
 339                        warn("KC %s is 'pro-touch', but source arena %s has non-PGSIZE quantum %d",
 340                             kc->name, source->name, source->quantum);
 341                        kc->flags |= __KMC_USE_BUFCTL;
 342                }
 343        }
 344        /* Note that import_amt is only used for bufctls.  The alternative puts
 345         * the slab at the end of a PGSIZE chunk, and fills the page with
 346         * objects.  The reliance on PGSIZE is used when finding a slab for a
 347         * given buffer.
 348         *
 349         * Also note that import_amt can be ignored for qcaches too.  If the
 350         * object is small and pro-touch, we'll still try and get a page from
 351         * the source, even if that is very large.  Consider a source with
 352         * qcache_max = 5, quantum = 1.  It's actually fine - we may waste a
 353         * little (unused allocations), but we save on not having bufctls. */
 354        if (flags & KMC_QCACHE)
 355                kc->import_amt = ROUNDUPPWR2(3 * source->qcache_max);
 356        else
 357                kc->import_amt = ROUNDUP(NUM_BUF_PER_SLAB * obj_size,
 358                                         ROUNDUP(PGSIZE, source->quantum));
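        /* Worked example, assuming NUM_BUF_PER_SLAB is 8 and PGSIZE is 4096: a
         * bufctl cache with obj_size 192 and a PGSIZE-quantum source gets
         * import_amt = ROUNDUP(8 * 192, 4096) = 4096, i.e. one page per slab,
         * holding about 21 objects. */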
 359        TAILQ_INIT(&kc->full_slab_list);
 360        TAILQ_INIT(&kc->partial_slab_list);
 361        TAILQ_INIT(&kc->empty_slab_list);
 362        kc->ctor = ctor;
 363        kc->dtor = dtor;
 364        kc->priv = priv;
 365        kc->nr_cur_alloc = 0;
 366        kc->nr_direct_allocs_ever = 0;
 367        kc->alloc_hash = kc->static_hash;
 368        hash_init_hh(&kc->hh);
 369        for (int i = 0; i < kc->hh.nr_hash_lists; i++)
 370                SLIST_INIT(&kc->static_hash[i]);
 371        depot_init(&kc->depot);
 372        kmem_trace_ht_init(&kc->trace_ht);
  373        /* We do this last, since this will call into the magazine cache - which
 374         * we could be creating on this call! */
 375        kc->pcpu_caches = build_pcpu_caches();
 376        add_importing_slab(kc->source, kc);
 377        kmc_track(kc);
 378}
 379
 380static int __mag_ctor(void *obj, void *priv, int flags)
 381{
 382        struct kmem_magazine *mag = (struct kmem_magazine*)obj;
 383
 384        mag->nr_rounds = 0;
 385        return 0;
 386}
 387
 388void kmem_cache_init(void)
 389{
 390        /* magazine must be first - all caches, including mags, will do a slab
 391         * alloc from the mag cache. */
 392        static_assert(sizeof(struct kmem_magazine) <= SLAB_LARGE_CUTOFF);
 393        __kmem_cache_create(kmem_magazine_cache, "kmem_magazine",
 394                            sizeof(struct kmem_magazine),
 395                            __alignof__(struct kmem_magazine), KMC_NOTRACE,
 396                            base_arena, __mag_ctor, NULL, NULL);
 397        __kmem_cache_create(kmem_cache_cache, "kmem_cache",
 398                            sizeof(struct kmem_cache),
 399                            __alignof__(struct kmem_cache), 0, base_arena,
 400                            NULL, NULL, NULL);
 401        __kmem_cache_create(kmem_slab_cache, "kmem_slab",
 402                            sizeof(struct kmem_slab),
 403                            __alignof__(struct kmem_slab), KMC_NOTRACE,
 404                            base_arena, NULL, NULL, NULL);
 405        __kmem_cache_create(kmem_bufctl_cache, "kmem_bufctl",
 406                            sizeof(struct kmem_bufctl),
 407                            __alignof__(struct kmem_bufctl), KMC_NOTRACE,
 408                            base_arena, NULL, NULL, NULL);
 409        __kmem_cache_create(kmem_trace_cache, "kmem_trace",
 410                            sizeof(struct kmem_trace),
 411                            __alignof__(struct kmem_trace), KMC_NOTRACE,
 412                            base_arena, NULL, NULL, NULL);
 413}
 414
 415/* Cache management */
 416struct kmem_cache *kmem_cache_create(const char *name, size_t obj_size,
 417                                     int align, int flags,
 418                                     struct arena *source,
 419                                     int (*ctor)(void *, void *, int),
 420                                     void (*dtor)(void *, void *),
 421                                     void *priv)
 422{
 423        struct kmem_cache *kc = kmem_cache_alloc(kmem_cache_cache, MEM_WAIT);
 424
 425        __kmem_cache_create(kc, name, obj_size, align, flags, source, ctor,
 426                            dtor, priv);
 427        return kc;
 428}
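
/* Usage sketch, illustrative only: nothing calls this, and 'demo_obj' /
 * kmc_usage_demo() are hypothetical.  A NULL source defaults to kpages_arena.
 * Callers that cannot block would pass MEM_ATOMIC and check for NULL instead
 * of using MEM_WAIT. */
static void __attribute__((unused)) kmc_usage_demo(void)
{
        struct demo_obj {
                long x;
                long y;
        };
        struct kmem_cache *kc;
        struct demo_obj *d;

        kc = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
                               __alignof__(struct demo_obj), 0, NULL,
                               NULL, NULL, NULL);
        d = kmem_cache_alloc(kc, MEM_WAIT);
        d->x = 1;
        kmem_cache_free(kc, d);
        kmem_cache_destroy(kc);
}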
 429
 430/* Helper during destruction.  No one should be touching the allocator anymore.
 431 * We just need to hand objects back to the depot, which will hand them to the
 432 * slab.  Locking is just a formality here. */
 433static void drain_pcpu_caches(struct kmem_cache *kc)
 434{
 435        struct kmem_pcpu_cache *pcc;
 436
 437        for (int i = 0; i < kmc_nr_pcpu_caches(); i++) {
 438                pcc = &kc->pcpu_caches[i];
 439                lock_pcu_cache(pcc);
 440                lock_depot(&kc->depot);
 441                __return_to_depot(kc, pcc->loaded);
 442                __return_to_depot(kc, pcc->prev);
 443                unlock_depot(&kc->depot);
 444                pcc->loaded = SLAB_POISON;
 445                pcc->prev = SLAB_POISON;
 446                unlock_pcu_cache(pcc);
 447        }
 448}
 449
 450static void depot_destroy(struct kmem_cache *kc)
 451{
 452        struct kmem_magazine *mag_i;
 453        struct kmem_depot *depot = &kc->depot;
 454
 455        lock_depot(depot);
 456        while ((mag_i = SLIST_FIRST(&depot->not_empty))) {
 457                SLIST_REMOVE_HEAD(&depot->not_empty, link);
 458                drain_mag(kc, mag_i);
 459                kmem_cache_free(kmem_magazine_cache, mag_i);
 460        }
 461        while ((mag_i = SLIST_FIRST(&depot->empty))) {
 462                SLIST_REMOVE_HEAD(&depot->empty, link);
 463                assert(mag_i->nr_rounds == 0);
 464                kmem_cache_free(kmem_magazine_cache, mag_i);
 465        }
 466        unlock_depot(depot);
 467}
 468
 469static void kmem_slab_destroy(struct kmem_cache *cp, struct kmem_slab *a_slab)
 470{
 471        if (!__use_bufctls(cp)) {
 472                arena_free(cp->source, a_slab->source_obj, PGSIZE);
 473        } else {
 474                struct kmem_bufctl *i, *temp;
 475
 476                SLIST_FOREACH_SAFE(i, &a_slab->bufctl_freelist, link, temp) {
 477                        /* This is a little dangerous, but we can skip removing,
 478                         * since we init the freelist when we reuse the slab. */
 479                        kmem_cache_free(kmem_bufctl_cache, i);
 480                }
 481                arena_free(cp->source, a_slab->source_obj, cp->import_amt);
 482                kmem_cache_free(kmem_slab_cache, a_slab);
 483        }
 484}
 485
 486/* Once you call destroy, never use this cache again... o/w there may be weird
 487 * races, and other serious issues.  */
 488void __kmem_cache_destroy(struct kmem_cache *cp)
 489{
 490        struct kmem_slab *a_slab, *next;
 491
 492        kmc_untrack(cp);
 493        del_importing_slab(cp->source, cp);
 494        drain_pcpu_caches(cp);
 495        depot_destroy(cp);
 496        spin_lock_irqsave(&cp->cache_lock);
 497        /* This is a little debatable.  We leak the cache and whatnot, but even
 498         * worse, someone has the object still, and they might free it, after
 499         * we've already torn down the depot.  At best this is a marginal way to
 500         * continue.  See similar code in arena.c. */
 501        if (!TAILQ_EMPTY(&cp->full_slab_list) ||
 502            !TAILQ_EMPTY(&cp->partial_slab_list)) {
 503                warn("KMC %s has unfreed items!  Will not destroy.", cp->name);
 504                spin_unlock_irqsave(&cp->cache_lock);
 505                return;
 506        }
 507        /* Clean out the empty list.  We can't use a regular FOREACH here, since
 508         * the link element is stored in the slab struct, which is stored on the
 509         * page that we are freeing. */
 510        a_slab = TAILQ_FIRST(&cp->empty_slab_list);
 511        while (a_slab) {
 512                next = TAILQ_NEXT(a_slab, link);
 513                kmem_slab_destroy(cp, a_slab);
 514                a_slab = next;
 515        }
 516        if (cp->alloc_hash != cp->static_hash)
 517                base_free(NULL, cp->alloc_hash,
 518                          array_size(cp->hh.nr_hash_lists,
 519                                     sizeof(struct kmem_bufctl_slist)));
 520        spin_unlock_irqsave(&cp->cache_lock);
 521        kmem_trace_warn_notempty(cp);
 522}
 523
 524void kmem_cache_destroy(struct kmem_cache *cp)
 525{
 526        __kmem_cache_destroy(cp);
 527        kmem_cache_free(kmem_cache_cache, cp);
 528}
 529
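/* Helper, grows the bufctl hash table once the load gets too high.  Called on
 * every tracked alloc; if the MEM_ATOMIC grow fails, we just stay at the
 * current size and try again on a later alloc. */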
 530static void __try_hash_resize(struct kmem_cache *cp)
 531{
 532        struct kmem_bufctl_slist *new_tbl, *old_tbl;
 533        struct kmem_bufctl *bc_i;
 534        unsigned int new_tbl_nr_lists, old_tbl_nr_lists;
 535        size_t new_tbl_sz, old_tbl_sz;
 536        size_t hash_idx;
 537
 538        if (!hash_needs_more(&cp->hh))
 539                return;
 540        new_tbl_nr_lists = hash_next_nr_lists(&cp->hh);
 541        new_tbl_sz = new_tbl_nr_lists * sizeof(struct kmem_bufctl_slist);
 542        /* TODO: we only need to pull from base if our arena is a base or we are
 543         * inside a kpages arena (keep in mind there could be more than one of
 544         * those, depending on how we do NUMA allocs).  This might help with
 545         * fragmentation.  To know this, we'll need the caller to pass us a
 546         * flag. */
 547        new_tbl = base_zalloc(NULL, new_tbl_sz, ARENA_INSTANTFIT | MEM_ATOMIC);
 548        if (!new_tbl)
 549                return;
 550        old_tbl = cp->alloc_hash;
 551        old_tbl_nr_lists = cp->hh.nr_hash_lists;
 552        old_tbl_sz = old_tbl_nr_lists * sizeof(struct kmem_bufctl_slist);
 553        cp->alloc_hash = new_tbl;
 554        hash_incr_nr_lists(&cp->hh);
 555        for (int i = 0; i < old_tbl_nr_lists; i++) {
 556                while ((bc_i = SLIST_FIRST(&old_tbl[i]))) {
 557                        SLIST_REMOVE_HEAD(&old_tbl[i], link);
 558                        hash_idx = hash_ptr(bc_i->buf_addr,
 559                                            cp->hh.nr_hash_bits);
 560                        SLIST_INSERT_HEAD(&cp->alloc_hash[hash_idx], bc_i,
 561                                          link);
 562                }
 563        }
 564        hash_reset_load_limit(&cp->hh);
 565        if (old_tbl != cp->static_hash)
 566                base_free(NULL, old_tbl, old_tbl_sz);
 567}
 568
 569/* Helper, tracks the allocation of @bc in the hash table */
 570static void __track_alloc(struct kmem_cache *cp, struct kmem_bufctl *bc)
 571{
 572        size_t hash_idx;
 573
 574        hash_idx = hash_ptr(bc->buf_addr, cp->hh.nr_hash_bits);
 575        SLIST_INSERT_HEAD(&cp->alloc_hash[hash_idx], bc, link);
 576        cp->hh.nr_items++;
 577        __try_hash_resize(cp);
 578}
 579
 580/* Helper, looks up and removes the bufctl corresponding to buf. */
 581static struct kmem_bufctl *__yank_bufctl(struct kmem_cache *cp, void *buf)
 582{
 583        struct kmem_bufctl *bc_i, **pp;
 584        struct kmem_bufctl_slist *slist;
 585        size_t hash_idx;
 586
 587        hash_idx = hash_ptr(buf, cp->hh.nr_hash_bits);
 588        slist = &cp->alloc_hash[hash_idx];
 589        SLIST_FOREACH_PREVPTR(bc_i, pp, slist, link) {
 590                if (bc_i->buf_addr != buf)
 591                        continue;
 592                *pp = SLIST_NEXT(bc_i, link);   /* Removes bc_i */
 593                return bc_i;
 594        }
 595        panic("Could not find buf %p in cache %s!", buf, cp->name);
 596}
 597
 598/* Alloc, bypassing the magazines and depot */
 599static void *__kmem_alloc_from_slab(struct kmem_cache *cp, int flags)
 600{
 601        void *retval;
 602
 603        spin_lock_irqsave(&cp->cache_lock);
 604        // look at partial list
 605        struct kmem_slab *a_slab = TAILQ_FIRST(&cp->partial_slab_list);
 606        //  if none, go to empty list and get an empty and make it partial
 607        if (!a_slab) {
 608                if (TAILQ_EMPTY(&cp->empty_slab_list) && !kmem_cache_grow(cp)) {
 609                        spin_unlock_irqsave(&cp->cache_lock);
 610                        goto out_oom;
 611                }
 612                // move to partial list
 613                a_slab = TAILQ_FIRST(&cp->empty_slab_list);
 614                TAILQ_REMOVE(&cp->empty_slab_list, a_slab, link);
 615                TAILQ_INSERT_HEAD(&cp->partial_slab_list, a_slab, link);
 616        }
 617        // have a partial now (a_slab), get an item, return item
 618        if (!__use_bufctls(cp)) {
 619                retval = a_slab->free_small_obj;
 620                /* the next free_small_obj address is stored at the beginning of
 621                 * the current free_small_obj. */
 622                a_slab->free_small_obj = *(uintptr_t**)(a_slab->free_small_obj);
 623        } else {
 624                // rip the first bufctl out of the partial slab's buf list
 625                struct kmem_bufctl *a_bufctl =
 626                        SLIST_FIRST(&a_slab->bufctl_freelist);
 627
 628                SLIST_REMOVE_HEAD(&a_slab->bufctl_freelist, link);
 629                __track_alloc(cp, a_bufctl);
 630                retval = a_bufctl->buf_addr;
 631        }
 632        a_slab->num_busy_obj++;
 633        // Check if we are full, if so, move to the full list
 634        if (a_slab->num_busy_obj == a_slab->num_total_obj) {
 635                TAILQ_REMOVE(&cp->partial_slab_list, a_slab, link);
 636                TAILQ_INSERT_HEAD(&cp->full_slab_list, a_slab, link);
 637        }
 638        cp->nr_cur_alloc++;
 639        cp->nr_direct_allocs_ever++;
 640        spin_unlock_irqsave(&cp->cache_lock);
 641        if (cp->ctor) {
 642                if (cp->ctor(retval, cp->priv, flags)) {
  643                        warn("Ctor %p failed, probably a bug!", cp->ctor);
 644                        __kmem_free_to_slab(cp, retval);
 645                        goto out_oom;
 646                }
 647        }
 648        return retval;
 649out_oom:
 650        if (flags & MEM_ATOMIC)
 651                return NULL;
  652        /* Old code didn't set any MEM_ flags; '0' typically meant MEM_ATOMIC. */
 653        if (!(flags & MEM_FLAGS))
 654                return NULL;
 655        if (flags & MEM_ERROR)
 656                error(ENOMEM, ERROR_FIXME);
 657        else
 658                panic("[German Accent]: OOM for a small slab growth!!!");
 659}
 660
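/* Frontend alloc path: try the loaded magazine, then prev, then a not-empty
 * magazine from the depot, and finally fall back to the slab layer.  With
 * MEM_ATOMIC (or no MEM_ flags at all), failure returns NULL; otherwise OOM
 * errors out or panics in __kmem_alloc_from_slab(). */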
 661void *kmem_cache_alloc(struct kmem_cache *kc, int flags)
 662{
 663        struct kmem_pcpu_cache *pcc = get_my_pcpu_cache(kc);
 664        struct kmem_depot *depot = &kc->depot;
 665        struct kmem_magazine *mag;
 666        void *ret;
 667
 668        lock_pcu_cache(pcc);
 669try_alloc:
 670        if (pcc->loaded->nr_rounds) {
 671                ret = pcc->loaded->rounds[pcc->loaded->nr_rounds - 1];
 672                pcc->loaded->nr_rounds--;
 673                pcc->nr_allocs_ever++;
 674                unlock_pcu_cache(pcc);
 675                kmem_trace_alloc(kc, ret);
 676                return ret;
 677        }
 678        if (!mag_is_empty(pcc->prev)) {
 679                __swap_mags(pcc);
 680                goto try_alloc;
 681        }
 682        /* Note the lock ordering: pcc -> depot */
 683        lock_depot(depot);
 684        mag = SLIST_FIRST(&depot->not_empty);
 685        if (mag) {
 686                SLIST_REMOVE_HEAD(&depot->not_empty, link);
 687                depot->nr_not_empty--;
 688                __return_to_depot(kc, pcc->prev);
 689                unlock_depot(depot);
 690                pcc->prev = pcc->loaded;
 691                pcc->loaded = mag;
 692                goto try_alloc;
 693        }
 694        unlock_depot(depot);
 695        unlock_pcu_cache(pcc);
 696        ret = __kmem_alloc_from_slab(kc, flags);
 697        kmem_trace_alloc(kc, ret);
 698        return ret;
 699}
 700
 701void *kmem_cache_zalloc(struct kmem_cache *kc, int flags)
 702{
 703        void *obj = kmem_cache_alloc(kc, flags);
 704
 705        if (!obj)
 706                return NULL;
 707        memset(obj, 0, kc->obj_size);
 708        return obj;
 709}
 710
  711/* Returns an object to the slab layer.  The caller must have already run the
  712 * dtor, since objects in the slabs are stored unconstructed. */
 713static void __kmem_free_to_slab(struct kmem_cache *cp, void *buf)
 714{
 715        struct kmem_slab *a_slab;
 716        struct kmem_bufctl *a_bufctl;
 717
 718        spin_lock_irqsave(&cp->cache_lock);
 719        if (!__use_bufctls(cp)) {
 720                // find its slab
 721                a_slab = (struct kmem_slab*)(ROUNDDOWN((uintptr_t)buf, PGSIZE) +
 722                                             PGSIZE - sizeof(struct kmem_slab));
 723                /* write location of next free small obj to the space at the
 724                 * beginning of the buffer, then list buf as the next free small
 725                 * obj */
 726                *(uintptr_t**)buf = a_slab->free_small_obj;
 727                a_slab->free_small_obj = buf;
 728        } else {
 729                /* Give the bufctl back to the parent slab */
 730                a_bufctl = __yank_bufctl(cp, buf);
 731                a_slab = a_bufctl->my_slab;
 732                SLIST_INSERT_HEAD(&a_slab->bufctl_freelist, a_bufctl, link);
 733        }
 734        a_slab->num_busy_obj--;
 735        cp->nr_cur_alloc--;
 736        // if it was full, move it to partial
 737        if (a_slab->num_busy_obj + 1 == a_slab->num_total_obj) {
 738                TAILQ_REMOVE(&cp->full_slab_list, a_slab, link);
 739                TAILQ_INSERT_HEAD(&cp->partial_slab_list, a_slab, link);
 740        } else if (!a_slab->num_busy_obj) {
  741                // if there are none left, move from partial to empty
 742                TAILQ_REMOVE(&cp->partial_slab_list, a_slab, link);
 743                TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link);
 744        }
 745        spin_unlock_irqsave(&cp->cache_lock);
 746}
 747
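/* Frontend free path: put the object in the loaded magazine, then prev, then
 * an empty magazine exchanged from the depot.  If all of those are full, try
 * to allocate a fresh magazine (MEM_ATOMIC), and only as a last resort run the
 * dtor and hand the object back to the slab layer. */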
 748void kmem_cache_free(struct kmem_cache *kc, void *buf)
 749{
 750        struct kmem_pcpu_cache *pcc = get_my_pcpu_cache(kc);
 751        struct kmem_depot *depot = &kc->depot;
 752        struct kmem_magazine *mag;
 753
 754        assert(buf);    /* catch bugs */
 755        kmem_trace_free(kc, buf);
 756        lock_pcu_cache(pcc);
 757try_free:
 758        if (pcc->loaded->nr_rounds < pcc->magsize) {
 759                pcc->loaded->rounds[pcc->loaded->nr_rounds] = buf;
 760                pcc->loaded->nr_rounds++;
 761                unlock_pcu_cache(pcc);
 762                return;
 763        }
  764        /* The paper checks 'is empty' here.  But we actually just care if it
  765         * has room left, not that prev is completely empty.  Prev can have room
  766         * without being empty after a magazine resize. */
 767        if (pcc->prev->nr_rounds < pcc->magsize) {
 768                __swap_mags(pcc);
 769                goto try_free;
 770        }
 771        lock_depot(depot);
 772        /* Here's where the resize magic happens.  We'll start using it for the
 773         * next magazine. */
 774        pcc->magsize = depot->magsize;
 775        mag = SLIST_FIRST(&depot->empty);
 776        if (mag) {
 777                SLIST_REMOVE_HEAD(&depot->empty, link);
 778                depot->nr_empty--;
 779                __return_to_depot(kc, pcc->prev);
 780                unlock_depot(depot);
 781                pcc->prev = pcc->loaded;
 782                pcc->loaded = mag;
 783                goto try_free;
 784        }
 785        unlock_depot(depot);
 786        /* Need to unlock, in case we end up calling back into ourselves. */
 787        unlock_pcu_cache(pcc);
 788        /* don't want to wait on a free.  if this fails, we can still just give
 789         * it to the slab layer. */
 790        mag = kmem_cache_alloc(kmem_magazine_cache, MEM_ATOMIC);
 791        if (mag) {
 792                assert(mag->nr_rounds == 0);
 793                lock_depot(depot);
 794                SLIST_INSERT_HEAD(&depot->empty, mag, link);
 795                depot->nr_empty++;
 796                unlock_depot(depot);
 797                lock_pcu_cache(pcc);
 798                goto try_free;
 799        }
 800        if (kc->dtor)
 801                kc->dtor(buf, kc->priv);
 802        __kmem_free_to_slab(kc, buf);
 803}
 804
 805/* Back end: internal functions */
  806/* When this returns TRUE, the cache has at least one slab in the empty list.
  807 * If the arena alloc fails, there are serious issues and we return FALSE.
  808 * This only grows by one slab at a time.
 809 *
 810 * Grab the cache lock before calling this.
 811 *
 812 * TODO: think about page colouring issues with kernel memory allocation. */
 813static bool kmem_cache_grow(struct kmem_cache *cp)
 814{
 815        struct kmem_slab *a_slab;
 816        struct kmem_bufctl *a_bufctl;
 817
 818        if (!__use_bufctls(cp)) {
 819                void *a_page;
 820
 821                a_page = arena_alloc(cp->source, PGSIZE, MEM_ATOMIC);
 822                if (!a_page)
 823                        return FALSE;
  824                /* The slab struct is stored at the end of the page.  Keep it
  825                 * there, so that our first object is page aligned, and thus
  826                 * aligned to all smaller alignments.  If align > PGSIZE, then
  827                 * obj_size > PGSIZE too, and we'd be using bufctls anyway. */
 828                a_slab = (struct kmem_slab*)(a_page + PGSIZE
 829                                             - sizeof(struct kmem_slab));
 830                a_slab->source_obj = a_page;
 831                a_slab->num_busy_obj = 0;
 832                a_slab->num_total_obj = (PGSIZE - sizeof(struct kmem_slab)) /
 833                                        cp->obj_size;
 834                a_slab->free_small_obj = a_page;
  835                /* Walk and create the free list.  Each item stores the
  836                 * location of the next one at the beginning of the block; the
  837                 * last one points to NULL. */
 838                void *buf = a_slab->free_small_obj;
 839
 840                for (int i = 0; i < a_slab->num_total_obj - 1; i++) {
 841                        *(uintptr_t**)buf = buf + cp->obj_size;
 842                        buf += cp->obj_size;
 843                }
 844                *((uintptr_t**)buf) = NULL;
 845        } else {
 846                void *buf;
 847                uintptr_t delta;
 848
 849                a_slab = kmem_cache_alloc(kmem_slab_cache, MEM_ATOMIC);
 850                if (!a_slab)
 851                        return FALSE;
 852                buf = arena_alloc(cp->source, cp->import_amt, MEM_ATOMIC);
 853                if (!buf)
 854                        goto err_slab;
 855                a_slab->source_obj = buf;
 856                buf = ALIGN(buf, cp->align);
 857                delta = buf - a_slab->source_obj;
 858                if (delta >= cp->import_amt) {
 859                        /* Shouldn't happen - the import_amt should always be
 860                         * enough for at least two objects, with obj_size >=
 861                         * align.  Maybe if a qcache had an alignment (which
 862                         * they don't). */
 863                        warn("Delta %p >= import_amt %p! (buf %p align %p)",
 864                             delta, cp->import_amt, a_slab->source_obj,
 865                             cp->align);
 866                        goto err_source_obj;
 867                }
 868                a_slab->num_busy_obj = 0;
 869                a_slab->num_total_obj = (cp->import_amt - delta) / cp->obj_size;
 870                SLIST_INIT(&a_slab->bufctl_freelist);
 871                /* for each buffer, set up a bufctl and point to the buffer */
 872                for (int i = 0; i < a_slab->num_total_obj; i++) {
 873                        a_bufctl = kmem_cache_alloc(kmem_bufctl_cache,
 874                                                    MEM_ATOMIC);
 875                        if (!a_bufctl) {
 876                                struct kmem_bufctl *i, *temp;
 877
 878                                SLIST_FOREACH_SAFE(i, &a_slab->bufctl_freelist,
 879                                                   link, temp) {
 880                                        kmem_cache_free(kmem_bufctl_cache, i);
 881                                }
 882                                goto err_source_obj;
 883                        }
 884                        SLIST_INSERT_HEAD(&a_slab->bufctl_freelist, a_bufctl,
 885                                          link);
 886                        a_bufctl->buf_addr = buf;
 887                        a_bufctl->my_slab = a_slab;
 888                        buf += cp->obj_size;
 889                }
 890        }
 891        // add a_slab to the empty_list
 892        TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link);
 893
 894        return TRUE;
 895
 896err_source_obj:
 897        arena_free(cp->source, a_slab->source_obj, cp->import_amt);
 898err_slab:
 899        kmem_cache_free(kmem_slab_cache, a_slab);
 900        return FALSE;
 901}
 902
 903/* This deallocs every slab from the empty list.  TODO: think a bit more about
 904 * this.  We can do things like not free all of the empty lists to prevent
 905 * thrashing.  See 3.4 in the paper. */
 906void kmem_cache_reap(struct kmem_cache *cp)
 907{
 908        struct kmem_slab *a_slab, *next;
 909
  910        // Destroy all empty slabs; see the loop note in __kmem_cache_destroy()
 911        spin_lock_irqsave(&cp->cache_lock);
 912        a_slab = TAILQ_FIRST(&cp->empty_slab_list);
 913        while (a_slab) {
 914                next = TAILQ_NEXT(a_slab, link);
 915                kmem_slab_destroy(cp, a_slab);
 916                a_slab = next;
 917        }
        TAILQ_INIT(&cp->empty_slab_list); /* all of its slabs were just freed */
  918        spin_unlock_irqsave(&cp->cache_lock);
 919}
 920
 921
 922/* Tracing */
 923
 924static void kmem_trace_ht_foreach(struct kmem_trace_ht *ht,
 925                                  void (*f)(struct kmem_trace *, void *),
 926                                  void *arg)
 927{
 928        struct kmem_trace *tr;
 929        struct hlist_node *temp;
 930
 931        spin_lock_irqsave(&ht->lock);
 932        for (int i = 0; i < ht->hh.nr_hash_lists; i++)
 933                hlist_for_each_entry_safe(tr, temp, &ht->ht[i], hash)
 934                        f(tr, arg);
 935        spin_unlock_irqsave(&ht->lock);
 936}
 937
 938static void __kmem_trace_print(struct kmem_trace *tr, void *arg)
 939{
 940        struct sized_alloc *sza = arg;
 941
 942        sza_printf(sza, "Obj %p, from %s:\n----\n", tr->obj, tr->str);
 943        sza_print_backtrace_list(sza, tr->pcs, tr->nr_pcs);
 944        sza_printf(sza, "\n");
 945}
 946
 947static void __kmem_trace_bytes_needed(struct kmem_trace *tr, void *arg)
 948{
 949        size_t *x = arg;
 950
 951        /* Just a guess of how much room we'll need, plus fudge. */
 952        *x += tr->nr_pcs * 80 + 100;
 953}
 954
 955struct sized_alloc *kmem_trace_print(struct kmem_cache *kc)
 956{
 957        struct sized_alloc *sza;
 958        size_t amt = 100;
 959
 960        kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_bytes_needed, &amt);
 961        sza = sized_kzmalloc(amt, MEM_WAIT);
 962        sza_printf(sza, "Dumping outstanding allocs from %s\n", kc->name);
 963        sza_printf(sza, "-------------------------\n");
 964        kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_print, sza);
 965        return sza;
 966}
 967
 968static void __kmem_trace_drop(struct kmem_trace *tr, void *arg)
 969{
 970        hlist_del(&tr->hash);
 971        kmem_cache_free(kmem_trace_cache, tr);
 972}
 973
 974void kmem_trace_reset(struct kmem_cache *kc)
 975{
 976        kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_drop, NULL);
 977}
 978
 979/* It's a bug to ever have a left-over trace when we delete a KMC, and probably
 980 * will never happen.  If it does, we can expand the debugging info. */
 981static void __kmem_trace_warn_and_drop(struct kmem_trace *tr, void *arg)
 982{
 983        warn("KMC had an object! (%p)", tr->obj);
 984        __kmem_trace_drop(tr, NULL);
 985}
 986
 987static void kmem_trace_warn_notempty(struct kmem_cache *kc)
 988{
 989        kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_warn_and_drop, NULL);
 990}
 991
 992int kmem_trace_start(struct kmem_cache *kc)
 993{
 994        spin_lock_irqsave(&kc->cache_lock);
 995        if (kc->flags & KMC_NOTRACE) {
 996                spin_unlock_irqsave(&kc->cache_lock);
 997                return -1;
 998        }
 999        WRITE_ONCE(kc->flags, kc->flags | __KMC_TRACED | __KMC_EVER_TRACED);
1000        spin_unlock_irqsave(&kc->cache_lock);
1001        return 0;
1002}
1003
1004/* Note that the tracers locklessly-peek at the flags, and we may have an
1005 * allocation complete its trace after we stop.  You could conceivably stop,
1006 * reset/remove all traces, and then have a trace appear still. */
1007void kmem_trace_stop(struct kmem_cache *kc)
1008{
1009        spin_lock_irqsave(&kc->cache_lock);
1010        WRITE_ONCE(kc->flags, kc->flags & ~__KMC_TRACED);
1011        spin_unlock_irqsave(&kc->cache_lock);
1012}
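
/* Sketch of a typical tracing sequence.  What the caller does with the
 * sized_alloc returned by kmem_trace_print() (printing it, freeing it) is up
 * to the caller:
 *
 *      kmem_trace_start(kc);
 *      ... run the workload of interest ...
 *      sza = kmem_trace_print(kc);     // outstanding allocs, with backtraces
 *      kmem_trace_stop(kc);
 *      kmem_trace_reset(kc);           // drop any traces that remain
 */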
1013
1014static void kmem_trace_ht_init(struct kmem_trace_ht *ht)
1015{
1016        spinlock_init(&ht->lock);
1017        ht->ht = ht->static_ht;
1018        hash_init_hh(&ht->hh);
1019        for (int i = 0; i < ht->hh.nr_hash_lists; i++)
1020                INIT_HLIST_HEAD(&ht->ht[i]);
1021}
1022
1023static void kmem_trace_ht_insert(struct kmem_trace_ht *ht,
1024                                 struct kmem_trace *tr)
1025{
1026        unsigned long hash_val = __generic_hash(tr->obj);
1027        struct hlist_head *bucket;
1028
1029        spin_lock_irqsave(&ht->lock);
 1030        bucket = &ht->ht[hash_val % ht->hh.nr_hash_lists];
1031        hlist_add_head(&tr->hash, bucket);
1032        spin_unlock_irqsave(&ht->lock);
1033}
1034
1035static struct kmem_trace *kmem_trace_ht_remove(struct kmem_trace_ht *ht,
1036                                               void *obj)
1037{
1038        struct kmem_trace *tr;
1039        unsigned long hash_val = __generic_hash(obj);
1040        struct hlist_head *bucket;
1041
1042        spin_lock_irqsave(&ht->lock);
 1043        bucket = &ht->ht[hash_val % ht->hh.nr_hash_lists];
1044        hlist_for_each_entry(tr, bucket, hash) {
1045                if (tr->obj == obj) {
1046                        hlist_del(&tr->hash);
1047                        break;
1048                }
1049        }
1050        spin_unlock_irqsave(&ht->lock);
1051        return tr;
1052}
1053
1054static void kmem_trace_free(struct kmem_cache *kc, void *obj)
1055{
1056        struct kmem_trace *tr;
1057
1058        /* Even if we turn off tracing, we still want to free traces that we may
1059         * have collected earlier.  Otherwise, those objects will never get
1060         * removed from the trace, and could lead to confusion if they are
1061         * reallocated and traced again.  Still, we don't want to pay the cost
1062         * on every free for untraced KCs. */
1063        if (!(READ_ONCE(kc->flags) & __KMC_EVER_TRACED))
1064                return;
1065        tr = kmem_trace_ht_remove(&kc->trace_ht, obj);
1066        if (tr)
1067                kmem_cache_free(kmem_trace_cache, tr);
1068}
1069
1070static void trace_context(struct kmem_trace *tr)
1071{
1072        if (is_ktask(current_kthread)) {
1073                snprintf(tr->str, sizeof(tr->str), "ktask %s",
1074                         current_kthread->name);
1075        } else if (current) {
1076                /* When we're handling a page fault, knowing the user PC helps
1077                 * determine the source.  A backtrace is nice, but harder to do.
1078                 * Since we're deep in MM code and holding locks, we can't use
1079                 * copy_from_user, which uses the page-fault fixups.  If you
1080                 * need to get the BT, stash it in the genbuf in
1081                 * handle_page_fault(). */
1082                snprintf(tr->str, sizeof(tr->str), "PID %d %s: %s, %p",
1083                         current->pid, current->progname,
1084                         current_kthread->name,
1085                         current_ctx ? get_user_ctx_pc(current_ctx) : 0);
1086        } else {
1087                snprintf(tr->str, sizeof(tr->str), "(none)");
1088        }
1089        tr->str[sizeof(tr->str) - 1] = 0;
1090}
1091
1092static void kmem_trace_alloc(struct kmem_cache *kc, void *obj)
1093{
1094        struct kmem_trace *tr;
1095
1096        if (!(READ_ONCE(kc->flags) & __KMC_TRACED))
1097                return;
1098        tr = kmem_cache_alloc(kmem_trace_cache, MEM_ATOMIC);
1099        if (!tr)
1100                return;
1101        tr->obj = obj;
1102        tr->nr_pcs = backtrace_list(get_caller_pc(), get_caller_fp(), tr->pcs,
1103                                    ARRAY_SIZE(tr->pcs));
1104        trace_context(tr);
1105        kmem_trace_ht_insert(&kc->trace_ht, tr);
1106}
1107