mm/slab.c

   1 /*
   2  * linux/mm/slab.c
   3  * Written by Mark Hemment, 1996/97.
   4  * (markhe@nextd.demon.co.uk)
   5  *
   6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7  *
   8  * Major cleanup, different bufctl logic, per-cpu arrays
   9  *      (c) 2000 Manfred Spraul
  10  *
  11  * Cleanup, make the head arrays unconditional, preparation for NUMA
  12  *      (c) 2002 Manfred Spraul
  13  *
  14  * An implementation of the Slab Allocator as described in outline in;
  15  *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16  *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17  * or with a little more detail in;
  18  *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19  *      Jeff Bonwick (Sun Microsystems).
  20  *      Presented at: USENIX Summer 1994 Technical Conference
  21  *
  22  * The memory is organized in caches, one cache for each object type.
  23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
   24  * Each cache consists of many slabs (they are small (usually one
  25  * page long) and always contiguous), and each slab contains multiple
  26  * initialized objects.
  27  *
   28  * This means that your constructor is used only for newly allocated
   29  * slabs and you must pass objects with the same initializations to
   30  * kmem_cache_free.
  31  *
   32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
   33  * normal). If you need a special memory type, then you must create a new
   34  * cache for that memory type.
  35  *
  36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37  *   full slabs with 0 free objects
  38  *   partial slabs
  39  *   empty slabs with no allocated objects
  40  *
  41  * If partial slabs exist, then new allocations come from these slabs,
  42  * otherwise from empty slabs or new slabs are allocated.
  43  *
  44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46  *
  47  * Each cache has a short per-cpu head array, most allocs
  48  * and frees go into that array, and if that array overflows, then 1/2
  49  * of the entries in the array are given back into the global cache.
  50  * The head array is strictly LIFO and should improve the cache hit rates.
  51  * On SMP, it additionally reduces the spinlock operations.
  52  *
  53  * The c_cpuarray may not be read with enabled local interrupts -
  54  * it's changed with a smp_call_function().
  55  *
  56  * SMP synchronization:
  57  *  constructors and destructors are called without any locking.
  58  *  Several members in struct kmem_cache and struct slab never change, they
  59  *      are accessed without any locking.
  60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61  *      and local interrupts are disabled so slab code is preempt-safe.
  62  *  The non-constant members are protected with a per-cache irq spinlock.
  63  *
  64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65  * in 2000 - many ideas in the current implementation are derived from
  66  * his patch.
  67  *
  68  * Further notes from the original documentation:
  69  *
  70  * 11 April '97.  Started multi-threading - markhe
  71  *      The global cache-chain is protected by the mutex 'cache_chain_mutex'.
  72  *      The sem is only needed when accessing/extending the cache-chain, which
  73  *      can never happen inside an interrupt (kmem_cache_create(),
  74  *      kmem_cache_shrink() and kmem_cache_reap()).
  75  *
  76  *      At present, each engine can be growing a cache.  This should be blocked.
  77  *
  78  * 15 March 2005. NUMA slab allocator.
  79  *      Shai Fultheim <shai@scalex86.org>.
  80  *      Shobhit Dayal <shobhit@calsoftinc.com>
  81  *      Alok N Kataria <alokk@calsoftinc.com>
  82  *      Christoph Lameter <christoph@lameter.com>
  83  *
  84  *      Modified the slab allocator to be node aware on NUMA systems.
  85  *      Each node has its own list of partial, free and full slabs.
  86  *      All object allocations for a node occur from node specific slab lists.
  87  */
  88
  89 #include        <linux/config.h>
  90 #include        <linux/slab.h>
  91 #include        <linux/mm.h>
  92 #include        <linux/poison.h>
  93 #include        <linux/swap.h>
  94 #include        <linux/cache.h>
  95 #include        <linux/interrupt.h>
  96 #include        <linux/init.h>
  97 #include        <linux/compiler.h>
  98 #include        <linux/cpuset.h>
  99 #include        <linux/seq_file.h>
 100 #include        <linux/notifier.h>
 101 #include        <linux/kallsyms.h>
 102 #include        <linux/cpu.h>
 103 #include        <linux/sysctl.h>
 104 #include        <linux/module.h>
 105 #include        <linux/rcupdate.h>
 106 #include        <linux/string.h>
 107 #include        <linux/nodemask.h>
 108 #include        <linux/mempolicy.h>
 109 #include        <linux/mutex.h>
 110
 111 #include        <asm/uaccess.h>
 112 #include        <asm/cacheflush.h>
 113 #include        <asm/tlbflush.h>
 114 #include        <asm/page.h>
 115
 116 /*
 117  * DEBUG        - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
 118  *                SLAB_RED_ZONE & SLAB_POISON.
 119  *                0 for faster, smaller code (especially in the critical paths).
 120  *
 121  * STATS        - 1 to collect stats for /proc/slabinfo.
 122  *                0 for faster, smaller code (especially in the critical paths).
 123  *
 124  * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 125  */
 126
 127 #ifdef CONFIG_DEBUG_SLAB
 128 #define DEBUG           1
 129 #define STATS           1
 130 #define FORCED_DEBUG    1
 131 #else
 132 #define DEBUG           0
 133 #define STATS           0
 134 #define FORCED_DEBUG    0
 135 #endif
 136
 137 /* Shouldn't this be in a header file somewhere? */
 138 #define BYTES_PER_WORD          sizeof(void *)
 139
 140 #ifndef cache_line_size
 141 #define cache_line_size()       L1_CACHE_BYTES
 142 #endif
 143
 144 #ifndef ARCH_KMALLOC_MINALIGN
 145 /*
 146  * Enforce a minimum alignment for the kmalloc caches.
 147  * Usually, the kmalloc caches are cache_line_size() aligned, except when
 148  * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 149  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 150  * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
 151  * Note that this flag disables some debug features.
 152  */
 153 #define ARCH_KMALLOC_MINALIGN 0
 154 #endif
 155
 156 #ifndef ARCH_SLAB_MINALIGN
 157 /*
 158  * Enforce a minimum alignment for all caches.
 159  * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 160  * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 161  * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 162  * some debug features.
 163  */
 164 #define ARCH_SLAB_MINALIGN 0
 165 #endif
 166
 167 #ifndef ARCH_KMALLOC_FLAGS
 168 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 169 #endif
 170
 171 /* Legal flag mask for kmem_cache_create(). */
 172 #if DEBUG
 173 # define CREATE_MASK    (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
 174                          SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 175                          SLAB_CACHE_DMA | \
 176                          SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 177                          SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 178                          SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 179 #else
 180 # define CREATE_MASK    (SLAB_HWCACHE_ALIGN | \
 181                          SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 182                          SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 183                          SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 184 #endif
 185
 186 /*
 187  * kmem_bufctl_t:
 188  *
  189  * Bufctls are used for linking objs within a slab by means of
  190  * linked offsets.
 191  *
 192  * This implementation relies on "struct page" for locating the cache &
 193  * slab an object belongs to.
 194  * This allows the bufctl structure to be small (one int), but limits
 195  * the number of objects a slab (not a cache) can contain when off-slab
 196  * bufctls are used. The limit is the size of the largest general cache
 197  * that does not use off-slab slabs.
  198  * For 32bit archs with 4 kB pages, this is 56.
 199  * This is not serious, as it is only for large objects, when it is unwise
 200  * to have too many per slab.
  201  * Note: This limit can be raised by introducing a general cache whose size
  202  * is less than 512 (PAGE_SIZE>>3), but greater than 256.
 203  */
 204
      /*
       * kmem_bufctl_t is a plain unsigned int.  The three highest values of
       * its range are given names (BUFCTL_END, BUFCTL_FREE, BUFCTL_ACTIVE);
       * SLAB_LIMIT is the next value down and is what cache_estimate() uses
       * to cap the number of objects in one slab.
       * NOTE(review): the precise meaning of the three named values depends
       * on their users, which are not visible in this part of the file.
       */
  205 typedef unsigned int kmem_bufctl_t;
  206 #define BUFCTL_END      (((kmem_bufctl_t)(~0U))-0)
  207 #define BUFCTL_FREE     (((kmem_bufctl_t)(~0U))-1)
  208 #define BUFCTL_ACTIVE   (((kmem_bufctl_t)(~0U))-2)
  209 #define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-3)
 210
 211 /*
 212  * struct slab
 213  *
  214  * Manages the objs in a slab. Placed either at the beginning of mem allocated
  215  * for a slab, or allocated from a general cache.
  216  * Slabs are chained into three lists: fully used, partial, fully free slabs.
 217  */
  218 struct slab {
      /* chains the slab into one of the full/partial/free slab lists */
  219         struct list_head list;
      /* NOTE(review): colour offset of this slab; units/base not visible here */
  220         unsigned long colouroff;
      /* address of object 0: index_to_obj() adds buffer_size * idx to it */
  221         void *s_mem;            /* including colour offset */
  222         unsigned int inuse;     /* num of objs active in slab */
      /* NOTE(review): presumably the index of the first free object -- confirm */
  223         kmem_bufctl_t free;
      /* NOTE(review): presumably the NUMA node this slab belongs to -- confirm */
  224         unsigned short nodeid;
  225 };
 226
 227 /*
 228  * struct slab_rcu
 229  *
 230  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 231  * arrange for kmem_freepages to be called via RCU.  This is useful if
 232  * we need to approach a kernel structure obliquely, from its address
 233  * obtained without the usual locking.  We can lock the structure to
 234  * stabilize it and check it's still at the given address, only if we
 235  * can be sure that the memory has not been meanwhile reused for some
 236  * other kind of object (which our subsystem's lock might corrupt).
 237  *
 238  * rcu_read_lock before reading the address, then rcu_read_unlock after
 239  * taking the spinlock within the structure expected at that address.
 240  *
 241  * We assume struct slab_rcu can overlay struct slab when destroying.
 242  */
  243 struct slab_rcu {
      /* RCU callback head used to defer the free */
  244         struct rcu_head head;
      /*
       * NOTE(review): presumably the cache and the page address that the
       * deferred kmem_freepages call needs -- confirm against slab_destroy().
       */
  245         struct kmem_cache *cachep;
  246         void *addr;
  247 };
 248
 249 /*
 250  * struct array_cache
 251  *
 252  * Purpose:
 253  * - LIFO ordering, to hand out cache-warm objects from _alloc
 254  * - reduce the number of linked list operations
 255  * - reduce spinlock operations
 256  *
 257  * The limit is stored in the per-cpu structure to reduce the data cache
 258  * footprint.
 259  *
 260  */
  261 struct array_cache {
      /* NOTE(review): presumably the number of pointers currently in entry[] */
  262         unsigned int avail;
      /* the boot-time instances set this to BOOT_CPUCACHE_ENTRIES */
  263         unsigned int limit;
      /* NOTE(review): presumably objects moved per refill/flush -- confirm */
  264         unsigned int batchcount;
      /* NOTE(review): presumably a recently-used marker for the reaper -- confirm */
  265         unsigned int touched;
      /* NOTE(review): which instances actually take this lock is not visible here */
  266         spinlock_t lock;
  267         void *entry[0]; /*
  268                          * Must have this definition in here for the proper
  269                          * alignment of array_cache. Also simplifies accessing
  270                          * the entries.
  271                          * [0] is for gcc 2.95. It should really be [].
  272                          */
  273 };
 274
 275 /*
 276  * bootstrap: The caches do not work without cpuarrays anymore, but the
 277  * cpuarrays are allocated from the generic caches...
 278  */
      /* Number of entry slots in the statically allocated boot-time array caches. */
  279 #define BOOT_CPUCACHE_ENTRIES   1
      /*
       * An array_cache followed by BOOT_CPUCACHE_ENTRIES pointers of storage.
       * entries[] is declared right after cache, whose last member is the
       * zero-length entry[]; initarray_cache and initarray_generic are the
       * instances defined in this file.
       */
  280 struct arraycache_init {
  281         struct array_cache cache;
  282         void *entries[BOOT_CPUCACHE_ENTRIES];
  283 };
 284
 285 /*
 286  * The slab lists for all objects.
 287  */
      /*
       * One of these is reachable per node through kmem_cache.nodelists[].
       * kmem_list3_init() empties the three lists, initialises list_lock and
       * clears shared, alien, colour_next, free_objects and free_touched; it
       * leaves free_limit and next_reap alone.
       */
  288 struct kmem_list3 {
  289         struct list_head slabs_partial; /* partial list first, better asm code */
  290         struct list_head slabs_full;
  291         struct list_head slabs_free;
      /* NOTE(review): presumably the count of free objects on this node's slabs */
  292         unsigned long free_objects;
      /* NOTE(review): presumably the threshold of free objects kept -- confirm */
  293         unsigned int free_limit;
  294         unsigned int colour_next;       /* Per-node cache coloring */
  295         spinlock_t list_lock;
  296         struct array_cache *shared;     /* shared per node */
  297         struct array_cache **alien;     /* on other nodes */
  298         unsigned long next_reap;        /* updated without locking */
  299         int free_touched;               /* updated without locking */
  300 };
 301
 302 /*
 303  * Need this for bootstrapping a per node allocator.
 304  */
      /*
       * Statically allocated kmem_list3 structures (__initdata) for use before
       * the allocator can allocate its own: 2 * MAX_NUMNODES + 1 of them.
       * NOTE(review): CACHE_CACHE, SIZE_AC and SIZE_L3 look like base indices
       * into initkmem_list3[] (one slot, then MAX_NUMNODES slots each) --
       * confirm at the use sites, which are not in this part of the file.
       */
  305 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
  306 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
  307 #define CACHE_CACHE 0
  308 #define SIZE_AC 1
  309 #define SIZE_L3 (1 + MAX_NUMNODES)
 310
 311 /*
 312  * This function must be completely optimized away if a constant is passed to
 313  * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 314  */
 315 static __always_inline int index_of(const size_t size)
 316 {
 317         extern void __bad_size(void);
 318
 319         if (__builtin_constant_p(size)) {
 320                 int i = 0;
 321
 322 #define CACHE(x) \
 323         if (size <=x) \
 324                 return i; \
 325         else \
 326                 i++;
 327 #include "linux/kmalloc_sizes.h"
 328 #undef CACHE
 329                 __bad_size();
 330         } else
 331                 __bad_size();
 332         return 0;
 333 }
 334
      /*
       * Starts out as 1.  NOTE(review): presumably cleared once early boot
       * setup is finished; the code that changes it is not in this chunk.
       */
  335 static int slab_early_init = 1;
  336
      /*
       * Positions, in kmalloc_sizes.h order (the same order malloc_sizes[] is
       * built in), of the first general cache large enough for a
       * struct arraycache_init and for a struct kmem_list3 respectively.
       */
  337 #define INDEX_AC index_of(sizeof(struct arraycache_init))
  338 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
 339
 340 static void kmem_list3_init(struct kmem_list3 *parent)
 341 {
 342         INIT_LIST_HEAD(&parent->slabs_full);
 343         INIT_LIST_HEAD(&parent->slabs_partial);
 344         INIT_LIST_HEAD(&parent->slabs_free);
 345         parent->shared = NULL;
 346         parent->alien = NULL;
 347         parent->colour_next = 0;
 348         spin_lock_init(&parent->list_lock);
 349         parent->free_objects = 0;
 350         parent->free_touched = 0;
 351 }
 352
 353 #define MAKE_LIST(cachep, listp, slab, nodeid)                          \
 354         do {                                                            \
 355                 INIT_LIST_HEAD(listp);                                  \
 356                 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
 357         } while (0)
 358
 359 #define MAKE_ALL_LISTS(cachep, ptr, nodeid)                             \
 360         do {                                                            \
 361         MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);  \
 362         MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
 363         MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);  \
 364         } while (0)
 365
 366 /*
 367  * struct kmem_cache
 368  *
 369  * manages a cache.
 370  */
 371
      /*
       * Descriptor of one cache.  Members are grouped by how they are used
       * (see the numbered section comments); the statistics and debug members
       * exist only when STATS / DEBUG are non-zero.
       */
  372 struct kmem_cache {
  373 /* 1) per-cpu data, touched during every alloc/free */
      /* indexed by smp_processor_id() in cpu_cache_get() */
  374         struct array_cache *array[NR_CPUS];
  375 /* 2) Cache tunables. Protected by cache_chain_mutex */
  376         unsigned int batchcount;
  377         unsigned int limit;
  378         unsigned int shared;
  379
      /* per-object stride: index_to_obj()/obj_to_index() multiply/divide by it */
  380         unsigned int buffer_size;
  381 /* 3) touched by every alloc & free from the backend */
  382         struct kmem_list3 *nodelists[MAX_NUMNODES];
  383
  384         unsigned int flags;             /* constant flags */
  385         unsigned int num;               /* # of objs per slab */
  386
  387 /* 4) cache_grow/shrink */
  388         /* order of pgs per slab (2^n) */
  389         unsigned int gfporder;
  390
  391         /* force GFP flags, e.g. GFP_DMA */
  392         gfp_t gfpflags;
  393
  394         size_t colour;                  /* cache colouring range */
  395         unsigned int colour_off;        /* colour offset */
      /*
       * NOTE(review): presumably the cache that struct slab comes from when
       * the management structure is kept off-slab -- confirm.
       */
  396         struct kmem_cache *slabp_cache;
      /* NOTE(review): presumably the size of the slab management area -- confirm */
  397         unsigned int slab_size;
  398         unsigned int dflags;            /* dynamic flags */
  399
  400         /* constructor func */
  401         void (*ctor) (void *, struct kmem_cache *, unsigned long);
  402
  403         /* de-constructor func */
  404         void (*dtor) (void *, struct kmem_cache *, unsigned long);
  405
  406 /* 5) cache creation/removal */
      /* printed by __slab_error() */
  407         const char *name;
  408         struct list_head next;
  409
  410 /* 6) statistics */
      /* updated only through the STATS_* macros */
  411 #if STATS
  412         unsigned long num_active;
  413         unsigned long num_allocations;
  414         unsigned long high_mark;
  415         unsigned long grown;
  416         unsigned long reaped;
  417         unsigned long errors;
  418         unsigned long max_freeable;
  419         unsigned long node_allocs;
  420         unsigned long node_frees;
  421         unsigned long node_overflow;
  422         atomic_t allochit;
  423         atomic_t allocmiss;
  424         atomic_t freehit;
  425         atomic_t freemiss;
  426 #endif
  427 #if DEBUG
  428         /*
  429          * If debugging is enabled, then the allocator can add additional
  430          * fields and/or padding to every object. buffer_size contains the total
  431          * object size including these internal fields, the following two
  432          * variables contain the offset to the user object and its size.
  433          */
  434         int obj_offset;
  435         int obj_size;
  436 #endif
  437 };
 438
      /*
       * Flag bit tested in kmem_cache.flags: cache_estimate() treats a cache
       * with this bit set as keeping its slab management structure off the
       * slab.  OFF_SLAB() tests the bit on a cache pointer.
       */
  439 #define CFLGS_OFF_SLAB          (0x80000000UL)
  440 #define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 441
 442 #define BATCHREFILL_LIMIT       16
 443 /*
  444  * Optimization question: fewer reaps means less probability for unnecessary
 445  * cpucache drain/refill cycles.
 446  *
 447  * OTOH the cpuarrays can contain lots of objects,
 448  * which could lock up otherwise freeable slabs.
 449  */
 450 #define REAPTIMEOUT_CPUC        (2*HZ)
 451 #define REAPTIMEOUT_LIST3       (4*HZ)
 452
 453 #if STATS
 454 #define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 455 #define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 456 #define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 457 #define STATS_INC_GROWN(x)      ((x)->grown++)
 458 #define STATS_INC_REAPED(x)     ((x)->reaped++)
 459 #define STATS_SET_HIGH(x)                                               \
 460         do {                                                            \
 461                 if ((x)->num_active > (x)->high_mark)                   \
 462                         (x)->high_mark = (x)->num_active;               \
 463         } while (0)
 464 #define STATS_INC_ERR(x)        ((x)->errors++)
 465 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
 466 #define STATS_INC_NODEFREES(x)  ((x)->node_frees++)
 467 #define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
 468 #define STATS_SET_FREEABLE(x, i)                                        \
 469         do {                                                            \
 470                 if ((x)->max_freeable < i)                              \
 471                         (x)->max_freeable = i;                          \
 472         } while (0)
 473 #define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 474 #define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 475 #define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 476 #define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 477 #else
 478 #define STATS_INC_ACTIVE(x)     do { } while (0)
 479 #define STATS_DEC_ACTIVE(x)     do { } while (0)
 480 #define STATS_INC_ALLOCED(x)    do { } while (0)
 481 #define STATS_INC_GROWN(x)      do { } while (0)
 482 #define STATS_INC_REAPED(x)     do { } while (0)
 483 #define STATS_SET_HIGH(x)       do { } while (0)
 484 #define STATS_INC_ERR(x)        do { } while (0)
 485 #define STATS_INC_NODEALLOCS(x) do { } while (0)
 486 #define STATS_INC_NODEFREES(x)  do { } while (0)
 487 #define STATS_INC_ACOVERFLOW(x)   do { } while (0)
 488 #define STATS_SET_FREEABLE(x, i) do { } while (0)
 489 #define STATS_INC_ALLOCHIT(x)   do { } while (0)
 490 #define STATS_INC_ALLOCMISS(x)  do { } while (0)
 491 #define STATS_INC_FREEHIT(x)    do { } while (0)
 492 #define STATS_INC_FREEMISS(x)   do { } while (0)
 493 #endif
 494
 495 #if DEBUG
 496
 497 /*
 498  * memory layout of objects:
 499  * 0            : objp
 500  * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 501  *              the end of an object is aligned with the end of the real
 502  *              allocation. Catches writes behind the end of the allocation.
 503  * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 504  *              redzone word.
 505  * cachep->obj_offset: The real object.
 506  * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 507  * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
 508  *                                      [BYTES_PER_WORD long]
 509  */
      /*
       * DEBUG builds: return cachep->obj_offset, the offset from the start of
       * a buffer to the user object inside it.
       */
  510 static int obj_offset(struct kmem_cache *cachep)
  511 {
  512         return cachep->obj_offset;
  513 }
 514
      /*
       * DEBUG builds: return cachep->obj_size, the size of the user object
       * (buffer_size is the total including the debug fields).
       */
  515 static int obj_size(struct kmem_cache *cachep)
  516 {
  517         return cachep->obj_size;
  518 }
 519
 520 static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
 521 {
 522         BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 523         return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
 524 }
 525
 526 static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
 527 {
 528         BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 529         if (cachep->flags & SLAB_STORE_USER)
 530                 return (unsigned long *)(objp + cachep->buffer_size -
 531                                          2 * BYTES_PER_WORD);
 532         return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
 533 }
 534
 535 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 536 {
 537         BUG_ON(!(cachep->flags & SLAB_STORE_USER));
 538         return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
 539 }
 540
 541 #else
 542
 543 #define obj_offset(x)                   0
 544 #define obj_size(cachep)                (cachep->buffer_size)
 545 #define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long *)NULL;})
 546 #define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long *)NULL;})
 547 #define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
 548
 549 #endif
 550
 551 /*
 552  * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
 553  * order.
 554  */
 555 #if defined(CONFIG_LARGE_ALLOCS)
 556 #define MAX_OBJ_ORDER   13      /* up to 32Mb */
 557 #define MAX_GFP_ORDER   13      /* up to 32Mb */
 558 #elif defined(CONFIG_MMU)
 559 #define MAX_OBJ_ORDER   5       /* 32 pages */
 560 #define MAX_GFP_ORDER   5       /* 32 pages */
 561 #else
 562 #define MAX_OBJ_ORDER   8       /* up to 1Mb */
 563 #define MAX_GFP_ORDER   8       /* up to 1Mb */
 564 #endif
 565
 566 /*
 567  * Do not go above this order unless 0 objects fit into the slab.
 568  */
 569 #define BREAK_GFP_ORDER_HI      1
 570 #define BREAK_GFP_ORDER_LO      0
 571 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 572
 573 /*
 574  * Functions for storing/retrieving the cachep and or slab from the page
 575  * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
 576  * these are used to find the cache which an obj belongs to.
 577  */
      /*
       * Record @cache in @page by storing the pointer in page->lru.next;
       * page_get_cache() reads it back from there.
       */
  578 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
  579 {
  580         page->lru.next = (struct list_head *)cache;
  581 }
 582
 583 static inline struct kmem_cache *page_get_cache(struct page *page)
 584 {
 585         if (unlikely(PageCompound(page)))
 586                 page = (struct page *)page_private(page);
 587         BUG_ON(!PageSlab(page));
 588         return (struct kmem_cache *)page->lru.next;
 589 }
 590
      /*
       * Record @slab in @page by storing the pointer in page->lru.prev;
       * page_get_slab() reads it back from there.
       */
  591 static inline void page_set_slab(struct page *page, struct slab *slab)
  592 {
  593         page->lru.prev = (struct list_head *)slab;
  594 }
 595
 596 static inline struct slab *page_get_slab(struct page *page)
 597 {
 598         if (unlikely(PageCompound(page)))
 599                 page = (struct page *)page_private(page);
 600         BUG_ON(!PageSlab(page));
 601         return (struct slab *)page->lru.prev;
 602 }
 603
 604 static inline struct kmem_cache *virt_to_cache(const void *obj)
 605 {
 606         struct page *page = virt_to_page(obj);
 607         return page_get_cache(page);
 608 }
 609
 610 static inline struct slab *virt_to_slab(const void *obj)
 611 {
 612         struct page *page = virt_to_page(obj);
 613         return page_get_slab(page);
 614 }
 615
 616 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
 617                                  unsigned int idx)
 618 {
 619         return slab->s_mem + cache->buffer_size * idx;
 620 }
 621
 622 static inline unsigned int obj_to_index(struct kmem_cache *cache,
 623                                         struct slab *slab, void *obj)
 624 {
 625         return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
 626 }
 627
 628 /*
 629  * These are the default caches for kmalloc. Custom caches can have other sizes.
 630  */
      /*
       * One entry per CACHE(x) line of kmalloc_sizes.h, with only cs_size
       * filled in here, followed by a terminating entry of size ULONG_MAX.
       * __find_general_cachep() depends on that last entry to stop its search.
       */
  631 struct cache_sizes malloc_sizes[] = {
  632 #define CACHE(x) { .cs_size = (x) },
  633 #include <linux/kmalloc_sizes.h>
  634         CACHE(ULONG_MAX)
  635 #undef CACHE
  636 };
  637 EXPORT_SYMBOL(malloc_sizes);
 638
 639 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
      /* Name pair for one general cache size: the normal and the DMA variant. */
  640 struct cache_names {
  641         char *name;
  642         char *name_dma;
  643 };
  644
      /*
       * Built from the same kmalloc_sizes.h list as malloc_sizes[]: for each
       * size x the names are "size-x" and "size-x(DMA)".  The table is
       * __initdata and ends with an entry whose name is NULL.
       */
  645 static struct cache_names __initdata cache_names[] = {
  646 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
  647 #include <linux/kmalloc_sizes.h>
  648         {NULL,}
  649 #undef CACHE
  650 };
 651
 652 static struct arraycache_init initarray_cache __initdata =
 653     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 654 static struct arraycache_init initarray_generic =
 655     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 656
 657 /* internal cache of cache description objs */
      /*
       * Statically defined cache whose objects are struct kmem_cache
       * themselves.  Only the members below are set at compile time; the
       * tunables match the boot-time array cache (limit BOOT_CPUCACHE_ENTRIES,
       * batchcount 1).
       */
  658 static struct kmem_cache cache_cache = {
  659         .batchcount = 1,
  660         .limit = BOOT_CPUCACHE_ENTRIES,
  661         .shared = 1,
  662         .buffer_size = sizeof(struct kmem_cache),
  663         .name = "kmem_cache",
  664 #if DEBUG
  665         .obj_size = sizeof(struct kmem_cache),
  666 #endif
  667 };
 668
 669 /* Guard access to the cache-chain. */
 670 static DEFINE_MUTEX(cache_chain_mutex);
 671 static struct list_head cache_chain;
 672
 673 /*
 674  * vm_enough_memory() looks at this to determine how many slab-allocated pages
 675  * are possibly freeable under pressure
 676  *
 677  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
 678  */
 679 atomic_t slab_reclaim_pages;
 680
 681 /*
 682  * chicken and egg problem: delay the per-cpu array allocation
 683  * until the general caches are up.
 684  */
      /*
       * Bootstrap progress of the allocator; zero-initialised, i.e. NONE.
       * slab_is_available() reports true only once the value is FULL.
       * NOTE(review): PARTIAL_AC and PARTIAL_L3 look like intermediate stages
       * tied to the INDEX_AC / INDEX_L3 general caches -- confirm in the init
       * code, which is not in this part of the file.
       */
  685 static enum {
  686         NONE,
  687         PARTIAL_AC,
  688         PARTIAL_L3,
  689         FULL
  690 } g_cpucache_up;
 691
 692 /*
 693  * used by boot code to determine if it can use slab based allocator
 694  */
      /* Returns non-zero exactly when g_cpucache_up has reached FULL. */
  695 int slab_is_available(void)
  696 {
  697         return g_cpucache_up == FULL;
  698 }
 699
 700 static DEFINE_PER_CPU(struct work_struct, reap_work);
 701
 702 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 703                         int node);
 704 static void enable_cpucache(struct kmem_cache *cachep);
 705 static void cache_reap(void *unused);
 706 static int __node_shrink(struct kmem_cache *cachep, int node);
 707
      /*
       * Return @cachep's array_cache for the CPU that smp_processor_id()
       * reports.  The file header notes that the per-cpu arrays must not be
       * read with local interrupts enabled.
       */
  708 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
  709 {
  710         return cachep->array[smp_processor_id()];
  711 }
 712
 713 static inline struct kmem_cache *__find_general_cachep(size_t size,
 714                                                         gfp_t gfpflags)
 715 {
 716         struct cache_sizes *csizep = malloc_sizes;
 717
 718 #if DEBUG
 719         /* This happens if someone tries to call
 720          * kmem_cache_create(), or __kmalloc(), before
 721          * the generic caches are initialized.
 722          */
 723         BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
 724 #endif
 725         while (size > csizep->cs_size)
 726                 csizep++;
 727
 728         /*
 729          * Really subtle: The last entry with cs->cs_size==ULONG_MAX
 730          * has cs_{dma,}cachep==NULL. Thus no special case
 731          * for large kmalloc calls required.
 732          */
 733         if (unlikely(gfpflags & GFP_DMA))
 734                 return csizep->cs_dmacachep;
 735         return csizep->cs_cachep;
 736 }
 737
      /*
       * Exported, out-of-line entry point for __find_general_cachep(): returns
       * whatever that helper returns for @size and @gfpflags.
       */
  738 struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
  739 {
  740         return __find_general_cachep(size, gfpflags);
  741 }
  742 EXPORT_SYMBOL(kmem_find_general_cachep);
 743
 744 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 745 {
 746         return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
 747 }
 748
 749 /*
 750  * Calculate the number of objects and left-over bytes for a given buffer size.
 751  */
 752 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 753                            size_t align, int flags, size_t *left_over,
 754                            unsigned int *num)
 755 {
 756         int nr_objs;
 757         size_t mgmt_size;
 758         size_t slab_size = PAGE_SIZE << gfporder;
 759
 760         /*
 761          * The slab management structure can be either off the slab or
 762          * on it. For the latter case, the memory allocated for a
 763          * slab is used for:
 764          *
 765          * - The struct slab
 766          * - One kmem_bufctl_t for each object
 767          * - Padding to respect alignment of @align
 768          * - @buffer_size bytes for each object
 769          *
 770          * If the slab management structure is off the slab, then the
 771          * alignment will already be calculated into the size. Because
 772          * the slabs are all pages aligned, the objects will be at the
 773          * correct alignment when allocated.
 774          */
 775         if (flags & CFLGS_OFF_SLAB) {
 776                 mgmt_size = 0;
 777                 nr_objs = slab_size / buffer_size;
 778
 779                 if (nr_objs > SLAB_LIMIT)
 780                         nr_objs = SLAB_LIMIT;
 781         } else {
 782                 /*
 783                  * Ignore padding for the initial guess. The padding
 784                  * is at most @align-1 bytes, and @buffer_size is at
 785                  * least @align. In the worst case, this result will
 786                  * be one greater than the number of objects that fit
 787                  * into the memory allocation when taking the padding
 788                  * into account.
 789                  */
 790                 nr_objs = (slab_size - sizeof(struct slab)) /
 791                           (buffer_size + sizeof(kmem_bufctl_t));
 792
 793                 /*
 794                  * This calculated number will be either the right
 795                  * amount, or one greater than what we want.
 796                  */
 797                 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
 798                        > slab_size)
 799                         nr_objs--;
 800
 801                 if (nr_objs > SLAB_LIMIT)
 802                         nr_objs = SLAB_LIMIT;
 803
 804                 mgmt_size = slab_mgmt_size(nr_objs, align);
 805         }
 806         *num = nr_objs;
 807         *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 808 }
 809
 810 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
 811
 812 static void __slab_error(const char *function, struct kmem_cache *cachep,
 813                         char *msg)
 814 {
 815         printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 816                function, cachep->name, msg);
 817         dump_stack();
 818 }
 819
 820 #ifdef CONFIG_NUMA
/*
 * Special reaping functions for NUMA systems, called from cache_reap().
 * They flush the alien caches (which hold objects that were freed on a
 * node other than the one they were allocated from) in round-robin order,
 * and flush remote per-cpu pagesets by calling drain_node_pages().
 */
 827 static DEFINE_PER_CPU(unsigned long, reap_node);
 828
 829 static void init_reap_node(int cpu)
 830 {
 831         int node;
 832
 833         node = next_node(cpu_to_node(cpu), node_online_map);
 834         if (node == MAX_NUMNODES)
 835                 node = first_node(node_online_map);
 836
 837         __get_cpu_var(reap_node) = node;
 838 }
 839
 840 static void next_reap_node(void)
 841 {
 842         int node = __get_cpu_var(reap_node);
 843
 844         /*
 845          * Also drain per cpu pages on remote zones
 846          */
 847         if (node != numa_node_id())
 848                 drain_node_pages(node);
 849
 850         node = next_node(node, node_online_map);
 851         if (unlikely(node >= MAX_NUMNODES))
 852                 node = first_node(node_online_map);
 853         __get_cpu_var(reap_node) = node;
 854 }
 855
 856 #else
/*
 * Without CONFIG_NUMA there is no reap cursor to maintain, so both helpers
 * expand to empty statements.  Note that the parameter of next_reap_node
 * is literally named "void"; a call written as next_reap_node() supplies
 * a single empty macro argument.
 */
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
 859 #endif
 860
 861 /*
 862  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 863  * via the workqueue/eventd.
 864  * Add the CPU number into the expiration time to minimize the possibility of
 865  * the CPUs getting into lockstep and contending for the global cache chain
 866  * lock.
 867  */
 868 static void __devinit start_cpu_timer(int cpu)
 869 {
 870         struct work_struct *reap_work = &per_cpu(reap_work, cpu);
 871
 872         /*
 873          * When this gets called from do_initcalls via cpucache_init(),
 874          * init_workqueues() has already run, so keventd will be setup
 875          * at that time.
 876          */
 877         if (keventd_up() && reap_work->func == NULL) {
 878                 init_reap_node(cpu);
 879                 INIT_WORK(reap_work, cache_reap, NULL);
 880                 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 881         }
 882 }
 883
 884 static struct array_cache *alloc_arraycache(int node, int entries,
 885                                             int batchcount)
 886 {
 887         int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 888         struct array_cache *nc = NULL;
 889
 890         nc = kmalloc_node(memsize, GFP_KERNEL, node);
 891         if (nc) {
 892                 nc->avail = 0;
 893                 nc->limit = entries;
 894                 nc->batchcount = batchcount;
 895                 nc->touched = 0;
 896                 spin_lock_init(&nc->lock);
 897         }
 898         return nc;
 899 }
 900
 901 /*
 902  * Transfer objects in one arraycache to another.
 903  * Locking must be handled by the caller.
 904  *
 905  * Return the number of entries transferred.
 906  */
 907 static int transfer_objects(struct array_cache *to,
 908                 struct array_cache *from, unsigned int max)
 909 {
 910         /* Figure out how many entries to transfer */
 911         int nr = min(min(from->avail, max), to->limit - to->avail);
 912
 913         if (!nr)
 914                 return 0;
 915
 916         memcpy(to->entry + to->avail, from->entry + from->avail -nr,
 917                         sizeof(void *) *nr);
 918
 919         from->avail -= nr;
 920         to->avail += nr;
 921         to->touched = 1;
 922         return nr;
 923 }
 924
 925 #ifdef CONFIG_NUMA
/*
 * Forward declarations of NUMA allocation helpers.  Both are static, so
 * their definitions live elsewhere in this translation unit.
 */
static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 928
 929 static struct array_cache **alloc_alien_cache(int node, int limit)
 930 {
 931         struct array_cache **ac_ptr;
 932         int memsize = sizeof(void *) * MAX_NUMNODES;
 933         int i;
 934
 935         if (limit > 1)
 936                 limit = 12;
 937         ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
 938         if (ac_ptr) {
 939                 for_each_node(i) {
 940                         if (i == node || !node_online(i)) {
 941                                 ac_ptr[i] = NULL;
 942                                 continue;
 943                         }
 944                         ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
 945                         if (!ac_ptr[i]) {
 946                                 for (i--; i <= 0; i--)
 947                                         kfree(ac_ptr[i]);
 948                                 kfree(ac_ptr);
 949                                 return NULL;
 950                         }
 951                 }
 952         }
 953         return ac_ptr;
 954 }
 955
 956 static void free_alien_cache(struct array_cache **ac_ptr)
 957 {
 958         int i;
 959
 960         if (!ac_ptr)
 961                 return;
 962         for_each_node(i)
 963             kfree(ac_ptr[i]);
 964         kfree(ac_ptr);
 965 }
 966
 967 static void __drain_alien_cache(struct kmem_cache *cachep,
 968                                 struct array_cache *ac, int node)
 969 {
 970         struct kmem_list3 *rl3 = cachep->nodelists[node];
 971
 972         if (ac->avail) {
 973                 spin_lock(&rl3->list_lock);
 974                 /*
 975                  * Stuff objects into the remote nodes shared array first.
 976                  * That way we could avoid the overhead of putting the objects
 977                  * into the free lists and getting them back later.
 978                  */
 979                 if (rl3->shared)
 980                         transfer_objects(rl3->shared, ac, ac->limit);
 981
 982                 free_block(cachep, ac->entry, ac->avail, node);
 983                 ac->avail = 0;
 984                 spin_unlock(&rl3->list_lock);
 985         }
 986 }
 987
 988 /*
 989  * Called from cache_reap() to regularly drain alien caches round robin.
 990  */
 991 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
 992 {
 993         int node = __get_cpu_var(reap_node);
 994
 995         if (l3->alien) {
 996                 struct array_cache *ac = l3->alien[node];
 997
 998                 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
 999                         __drain_alien_cache(cachep, ac, node);
1000                         spin_unlock_irq(&ac->lock);
1001                 }
1002         }
1003 }
1004
1005 static void drain_alien_cache(struct kmem_cache *cachep,
1006                                 struct array_cache **alien)
1007 {
1008         int i = 0;
1009         struct array_cache *ac;
1010         unsigned long flags;
1011
1012         for_each_online_node(i) {
1013                 ac = alien[i];
1014                 if (ac) {
1015                         spin_lock_irqsave(&ac->lock, flags);
1016                         __drain_alien_cache(cachep, ac, i);
1017                         spin_unlock_irqrestore(&ac->lock, flags);
1018                 }
1019         }
1020 }
1021
1022 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1023 {
1024         struct slab *slabp = virt_to_slab(objp);
1025         int nodeid = slabp->nodeid;
1026         struct kmem_list3 *l3;
1027         struct array_cache *alien = NULL;
1028
1029         /*
1030          * Make sure we are not freeing a object from another node to the array
1031          * cache on this cpu.
1032          */
1033         if (likely(slabp->nodeid == numa_node_id()))
1034                 return 0;
1035
1036         l3 = cachep->nodelists[numa_node_id()];
1037         STATS_INC_NODEFREES(cachep);
1038         if (l3->alien && l3->alien[nodeid]) {
1039                 alien = l3->alien[nodeid];
1040                 spin_lock(&alien->lock);
1041                 if (unlikely(alien->avail == alien->limit)) {
1042                         STATS_INC_ACOVERFLOW(cachep);
1043                         __drain_alien_cache(cachep, alien, nodeid);
1044                 }
1045                 alien->entry[alien->avail++] = objp;
1046                 spin_unlock(&alien->lock);
1047         } else {
1048                 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1049                 free_block(cachep, &objp, 1, nodeid);
1050                 spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1051         }
1052         return 1;
1053 }
1054
1055 #else
1056
/* Without CONFIG_NUMA there are no alien caches; draining is a no-op. */
#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, l3) do { } while (0)

/*
 * Non-NUMA stand-in: nothing is allocated.  A fixed non-NULL dummy value
 * is returned so that callers testing the result for NULL see success.
 * The value must never be dereferenced.
 */
static inline struct array_cache **alloc_alien_cache(int node, int limit)
{
	struct array_cache **dummy = (struct array_cache **) 0x01020304ul;

	return dummy;
}

/* Non-NUMA stand-in: there is nothing to release. */
static inline void free_alien_cache(struct array_cache **ac_ptr)
{
}

/*
 * Non-NUMA stand-in: always returns 0, i.e. the object was not handled
 * here.
 */
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	return 0;
}
1073
1074 #endif
1075
1076 static int __devinit cpuup_callback(struct notifier_block *nfb,
1077                                     unsigned long action, void *hcpu)
1078 {
1079         long cpu = (long)hcpu;
1080         struct kmem_cache *cachep;
1081         struct kmem_list3 *l3 = NULL;
1082         int node = cpu_to_node(cpu);
1083         int memsize = sizeof(struct kmem_list3);
1084
1085         switch (action) {
1086         case CPU_UP_PREPARE:
1087                 mutex_lock(&cache_chain_mutex);
1088                 /*
1089                  * We need to do this right in the beginning since
1090                  * alloc_arraycache's are going to use this list.
1091                  * kmalloc_node allows us to add the slab to the right
1092                  * kmem_list3 and not this cpu's kmem_list3
1093                  */
1094
1095                 list_for_each_entry(cachep, &cache_chain, next) {
1096                         /*
1097                          * Set up the size64 kmemlist for cpu before we can
1098                          * begin anything. Make sure some other cpu on this
1099                          * node has not already allocated this
1100                          */
1101                         if (!cachep->nodelists[node]) {
1102                                 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1103                                 if (!l3)
1104                                         goto bad;
1105                                 kmem_list3_init(l3);
1106                                 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1107                                     ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1108
1109                                 /*
1110                                  * The l3s don't come and go as CPUs come and
1111                                  * go.  cache_chain_mutex is sufficient
1112                                  * protection here.
1113                                  */
1114                                 cachep->nodelists[node] = l3;
1115                         }
1116
1117                         spin_lock_irq(&cachep->nodelists[node]->list_lock);
1118                         cachep->nodelists[node]->free_limit =
1119                                 (1 + nr_cpus_node(node)) *
1120                                 cachep->batchcount + cachep->num;
1121                         spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1122                 }
1123
1124                 /*
1125                  * Now we can go ahead with allocating the shared arrays and
1126                  * array caches
1127                  */
1128                 list_for_each_entry(cachep, &cache_chain, next) {
1129                         struct array_cache *nc;
1130                         struct array_cache *shared;
1131                         struct array_cache **alien;
1132
1133                         nc = alloc_arraycache(node, cachep->limit,
1134                                                 cachep->batchcount);
1135                         if (!nc)
1136                                 goto bad;
1137                         shared = alloc_arraycache(node,
1138                                         cachep->shared * cachep->batchcount,
1139                                         0xbaadf00d);
1140                         if (!shared)
1141                                 goto bad;
1142
1143                         alien = alloc_alien_cache(node, cachep->limit);
1144                         if (!alien)
1145                                 goto bad;
1146                         cachep->array[cpu] = nc;
1147                         l3 = cachep->nodelists[node];
1148                         BUG_ON(!l3);
1149
1150                         spin_lock_irq(&l3->list_lock);
1151                         if (!l3->shared) {
1152                                 /*
1153                                  * We are serialised from CPU_DEAD or
1154                                  * CPU_UP_CANCELLED by the cpucontrol lock
1155                                  */
1156                                 l3->shared = shared;
1157                                 shared = NULL;
1158                         }
1159 #ifdef CONFIG_NUMA
1160                         if (!l3->alien) {
1161                                 l3->alien = alien;
1162                                 alien = NULL;
1163                         }
1164 #endif
1165                         spin_unlock_irq(&l3->list_lock);
1166                         kfree(shared);
1167                         free_alien_cache(alien);
1168                 }
1169                 mutex_unlock(&cache_chain_mutex);
1170                 break;
1171         case CPU_ONLINE:
1172                 start_cpu_timer(cpu);
1173                 break;
1174 #ifdef CONFIG_HOTPLUG_CPU
1175         case CPU_DEAD:
1176                 /*
1177                  * Even if all the cpus of a node are down, we don't free the
1178                  * kmem_list3 of any cache. This to avoid a race between
1179                  * cpu_down, and a kmalloc allocation from another cpu for
1180                  * memory from the node of the cpu going down.  The list3
1181                  * structure is usually allocated from kmem_cache_create() and
1182                  * gets destroyed at kmem_cache_destroy().
1183                  */
1184                 /* fall thru */
1185         case CPU_UP_CANCELED:
1186                 mutex_lock(&cache_chain_mutex);
1187                 list_for_each_entry(cachep, &cache_chain, next) {
1188                         struct array_cache *nc;
1189                         struct array_cache *shared;
1190                         struct array_cache **alien;
1191                         cpumask_t mask;
1192
1193                         mask = node_to_cpumask(node);
1194                         /* cpu is dead; no one can alloc from it. */
1195                         nc = cachep->array[cpu];
1196                         cachep->array[cpu] = NULL;
1197                         l3 = cachep->nodelists[node];
1198
1199                         if (!l3)
1200                                 goto free_array_cache;
1201
1202                         spin_lock_irq(&l3->list_lock);
1203
1204                         /* Free limit for this kmem_list3 */
1205                         l3->free_limit -= cachep->batchcount;
1206                         if (nc)
1207                                 free_block(cachep, nc->entry, nc->avail, node);
1208
1209                         if (!cpus_empty(mask)) {
1210                                 spin_unlock_irq(&l3->list_lock);
1211                                 goto free_array_cache;
1212                         }
1213
1214                         shared = l3->shared;
1215                         if (shared) {
1216                                 free_block(cachep, l3->shared->entry,
1217                                            l3->shared->avail, node);
1218                                 l3->shared = NULL;
1219                         }
1220
1221                         alien = l3->alien;
1222                         l3->alien = NULL;
1223
1224                         spin_unlock_irq(&l3->list_lock);
1225
1226                         kfree(shared);
1227                         if (alien) {
1228                                 drain_alien_cache(cachep, alien);
1229                                 free_alien_cache(alien);
1230                         }
1231 free_array_cache:
1232                         kfree(nc);
1233                 }
1234                 /*
1235                  * In the previous loop, all the objects were freed to
1236                  * the respective cache's slabs,  now we can go ahead and
1237                  * shrink each nodelist to its limit.
1238                  */
1239                 list_for_each_entry(cachep, &cache_chain, next) {
1240                         l3 = cachep->nodelists[node];
1241                         if (!l3)
1242                                 continue;
1243                         spin_lock_irq(&l3->list_lock);
1244                         /* free slabs belonging to this node */
1245                         __node_shrink(cachep, node);
1246                         spin_unlock_irq(&l3->list_lock);
1247                 }
1248                 mutex_unlock(&cache_chain_mutex);
1249                 break;
1250 #endif
1251         }
1252         return NOTIFY_OK;
1253 bad:
1254         mutex_unlock(&cache_chain_mutex);
1255         return NOTIFY_BAD;
1256 }
1257
1258 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
1259
1260 /*
1261  * swap the static kmem_list3 with kmalloced memory
1262  */
1263 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1264                         int nodeid)
1265 {
1266         struct kmem_list3 *ptr;
1267
1268         BUG_ON(cachep->nodelists[nodeid] != list);
1269         ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1270         BUG_ON(!ptr);
1271
1272         local_irq_disable();
1273         memcpy(ptr, list, sizeof(struct kmem_list3));
1274         MAKE_ALL_LISTS(cachep, ptr, nodeid);
1275         cachep->nodelists[nodeid] = ptr;
1276         local_irq_enable();
1277 }
1278
1279 /*
1280  * Initialisation.  Called after the page allocator have been initialised and
1281  * before smp_init().
1282  */
1283 void __init kmem_cache_init(void)
1284 {
1285         size_t left_over;
1286         struct cache_sizes *sizes;
1287         struct cache_names *names;
1288         int i;
1289         int order;
1290
1291         for (i = 0; i < NUM_INIT_LISTS; i++) {
1292                 kmem_list3_init(&initkmem_list3[i]);
1293                 if (i < MAX_NUMNODES)
1294                         cache_cache.nodelists[i] = NULL;
1295         }
1296
1297         /*
1298          * Fragmentation resistance on low memory - only use bigger
1299          * page orders on machines with more than 32MB of memory.
1300          */
1301         if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1302                 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1303
1304         /* Bootstrap is tricky, because several objects are allocated
1305          * from caches that do not exist yet:
1306          * 1) initialize the cache_cache cache: it contains the struct
1307          *    kmem_cache structures of all caches, except cache_cache itself:
1308          *    cache_cache is statically allocated.
1309          *    Initially an __init data area is used for the head array and the
1310          *    kmem_list3 structures, it's replaced with a kmalloc allocated
1311          *    array at the end of the bootstrap.
1312          * 2) Create the first kmalloc cache.
1313          *    The struct kmem_cache for the new cache is allocated normally.
1314          *    An __init data area is used for the head array.
1315          * 3) Create the remaining kmalloc caches, with minimally sized
1316          *    head arrays.
1317          * 4) Replace the __init data head arrays for cache_cache and the first
1318          *    kmalloc cache with kmalloc allocated arrays.
1319          * 5) Replace the __init data for kmem_list3 for cache_cache and
1320          *    the other cache's with kmalloc allocated memory.
1321          * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1322          */
1323
1324         /* 1) create the cache_cache */
1325         INIT_LIST_HEAD(&cache_chain);
1326         list_add(&cache_cache.next, &cache_chain);
1327         cache_cache.colour_off = cache_line_size();
1328         cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1329         cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1330
1331         cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1332                                         cache_line_size());
1333
1334         for (order = 0; order < MAX_ORDER; order++) {
1335                 cache_estimate(order, cache_cache.buffer_size,
1336                         cache_line_size(), 0, &left_over, &cache_cache.num);
1337                 if (cache_cache.num)
1338                         break;
1339         }
1340         BUG_ON(!cache_cache.num);
1341         cache_cache.gfporder = order;
1342         cache_cache.colour = left_over / cache_cache.colour_off;
1343         cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1344                                       sizeof(struct slab), cache_line_size());
1345
1346         /* 2+3) create the kmalloc caches */
1347         sizes = malloc_sizes;
1348         names = cache_names;
1349
1350         /*
1351          * Initialize the caches that provide memory for the array cache and the
1352          * kmem_list3 structures first.  Without this, further allocations will
1353          * bug.
1354          */
1355
1356         sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1357                                         sizes[INDEX_AC].cs_size,
1358                                         ARCH_KMALLOC_MINALIGN,
1359                                         ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1360                                         NULL, NULL);
1361
1362         if (INDEX_AC != INDEX_L3) {
1363                 sizes[INDEX_L3].cs_cachep =
1364                         kmem_cache_create(names[INDEX_L3].name,
1365                                 sizes[INDEX_L3].cs_size,
1366                                 ARCH_KMALLOC_MINALIGN,
1367                                 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1368                                 NULL, NULL);
1369         }
1370
1371         slab_early_init = 0;
1372
1373         while (sizes->cs_size != ULONG_MAX) {
1374                 /*
1375                  * For performance, all the general caches are L1 aligned.
1376                  * This should be particularly beneficial on SMP boxes, as it
1377                  * eliminates "false sharing".
1378                  * Note for systems short on memory removing the alignment will
1379                  * allow tighter packing of the smaller caches.
1380                  */
1381                 if (!sizes->cs_cachep) {
1382                         sizes->cs_cachep = kmem_cache_create(names->name,
1383                                         sizes->cs_size,
1384                                         ARCH_KMALLOC_MINALIGN,
1385                                         ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1386                                         NULL, NULL);
1387                 }
1388
1389                 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1390                                         sizes->cs_size,
1391                                         ARCH_KMALLOC_MINALIGN,
1392                                         ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1393                                                 SLAB_PANIC,
1394                                         NULL, NULL);
1395                 sizes++;
1396                 names++;
1397         }
1398         /* 4) Replace the bootstrap head arrays */
1399         {
1400                 void *ptr;
1401
1402                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1403
1404                 local_irq_disable();
1405                 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1406                 memcpy(ptr, cpu_cache_get(&cache_cache),
1407                        sizeof(struct arraycache_init));
1408                 cache_cache.array[smp_processor_id()] = ptr;
1409                 local_irq_enable();
1410
1411                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1412
1413                 local_irq_disable();
1414                 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1415                        != &initarray_generic.cache);
1416                 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1417                        sizeof(struct arraycache_init));
1418                 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1419                     ptr;
1420                 local_irq_enable();
1421         }
1422         /* 5) Replace the bootstrap kmem_list3's */
1423         {
1424                 int node;
1425                 /* Replace the static kmem_list3 structures for the boot cpu */
1426                 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1427                           numa_node_id());
1428
1429                 for_each_online_node(node) {
1430                         init_list(malloc_sizes[INDEX_AC].cs_cachep,
1431                                   &initkmem_list3[SIZE_AC + node], node);
1432
1433                         if (INDEX_AC != INDEX_L3) {
1434                                 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1435                                           &initkmem_list3[SIZE_L3 + node],
1436                                           node);
1437                         }
1438                 }
1439         }
1440
1441         /* 6) resize the head arrays to their final sizes */
1442         {
1443                 struct kmem_cache *cachep;
1444                 mutex_lock(&cache_chain_mutex);
1445                 list_for_each_entry(cachep, &cache_chain, next)
1446                         enable_cpucache(cachep);
1447                 mutex_unlock(&cache_chain_mutex);
1448         }
1449
1450         /* Done! */
1451         g_cpucache_up = FULL;
1452
1453         /*
1454          * Register a cpu startup notifier callback that initializes
1455          * cpu_cache_get for all new cpus
1456          */
1457         register_cpu_notifier(&cpucache_notifier);
1458
1459         /*
1460          * The reap timers are started later, with a module init call: That part
1461          * of the kernel is not yet operational.
1462          */
1463 }
1464
1465 static int __init cpucache_init(void)
1466 {
1467         int cpu;
1468
1469         /*
1470          * Register the timers that return unneeded pages to the page allocator
1471          */
1472         for_each_online_cpu(cpu)
1473                 start_cpu_timer(cpu);
1474         return 0;
1475 }
1476 __initcall(cpucache_init);
1477
1478 /*
1479  * Interface to system's page allocator. No need to hold the cache-lock.
1480  *
1481  * If we requested dmaable memory, we will get it. Even if we
1482  * did not request dmaable memory, we might get it, but that
1483  * would be relatively rare and ignorable.
1484  */
1485 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1486 {
1487         struct page *page;
1488         int nr_pages;
1489         int i;
1490
1491 #ifndef CONFIG_MMU
1492         /*
1493          * Nommu uses slab's for process anonymous memory allocations, and thus
1494          * requires __GFP_COMP to properly refcount higher order allocations
1495          */
1496         flags |= __GFP_COMP;
1497 #endif
1498         flags |= cachep->gfpflags;
1499
1500         page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1501         if (!page)
1502                 return NULL;
1503
1504         nr_pages = (1 << cachep->gfporder);
1505         if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1506                 atomic_add(nr_pages, &slab_reclaim_pages);
1507         add_page_state(nr_slab, nr_pages);
1508         for (i = 0; i < nr_pages; i++)
1509                 __SetPageSlab(page + i);
1510         return page_address(page);
1511 }
1512
1513 /*
1514  * Interface to system's page release.
1515  */
1516 static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1517 {
1518         unsigned long i = (1 << cachep->gfporder);
1519         struct page *page = virt_to_page(addr);
1520         const unsigned long nr_freed = i;
1521
1522         while (i--) {
1523                 BUG_ON(!PageSlab(page));
1524                 __ClearPageSlab(page);
1525                 page++;
1526         }
1527         sub_page_state(nr_slab, nr_freed);
1528         if (current->reclaim_state)
1529                 current->reclaim_state->reclaimed_slab += nr_freed;
1530         free_pages((unsigned long)addr, cachep->gfporder);
1531         if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1532                 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1533 }
1534
1535 static void kmem_rcu_free(struct rcu_head *head)
1536 {
1537         struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1538         struct kmem_cache *cachep = slab_rcu->cachep;
1539
1540         kmem_freepages(cachep, slab_rcu->addr);
1541         if (OFF_SLAB(cachep))
1542                 kmem_cache_free(cachep->slabp_cache, slab_rcu);
1543 }
1544
1545 #if DEBUG
1546
1547 #ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Record debugging information inside an object's payload.
 *
 * Written in order: the marker 0x12345678, @caller, the current CPU
 * number, then values found on the stack (starting at the slot holding
 * @caller) that look like kernel text addresses, for as long as space
 * remains, and finally the marker 0x87654321.
 *
 * Objects smaller than five unsigned longs are left untouched.
 */
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
			    unsigned long caller)
{
	int size = obj_size(cachep);
	unsigned long *sptr = &caller;

	/* Skip to the start of the real object. */
	addr = (unsigned long *)((char *)addr + obj_offset(cachep));

	if (size < 5 * sizeof(unsigned long))
		return;

	*addr++ = 0x12345678;
	*addr++ = caller;
	*addr++ = smp_processor_id();
	size -= 3 * sizeof(unsigned long);

	while (!kstack_end(sptr)) {
		unsigned long svalue = *sptr++;

		if (!kernel_text_address(svalue))
			continue;

		*addr++ = svalue;
		size -= sizeof(unsigned long);
		/* Keep one slot free for the end marker. */
		if (size <= sizeof(unsigned long))
			break;
	}

	*addr++ = 0x87654321;
}
1579 #endif
1580
1581 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1582 {
1583         int size = obj_size(cachep);
1584         addr = &((char *)addr)[obj_offset(cachep)];
1585
1586         memset(addr, val, size);
1587         *(unsigned char *)(addr + size - 1) = POISON_END;
1588 }
1589
1590 static void dump_line(char *data, int offset, int limit)
1591 {
1592         int i;
1593         printk(KERN_ERR "%03x:", offset);
1594         for (i = 0; i < limit; i++)
1595                 printk(" %02x", (unsigned char)data[offset + i]);
1596         printk("\n");
1597 }
1598 #endif
1599
1600 #if DEBUG
1601
1602 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1603 {
1604         int i, size;
1605         char *realobj;
1606
1607         if (cachep->flags & SLAB_RED_ZONE) {
1608                 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1609                         *dbg_redzone1(cachep, objp),
1610                         *dbg_redzone2(cachep, objp));
1611         }
1612
1613         if (cachep->flags & SLAB_STORE_USER) {
1614                 printk(KERN_ERR "Last user: [<%p>]",
1615                         *dbg_userword(cachep, objp));
1616                 print_symbol("(%s)",
1617                                 (unsigned long)*dbg_userword(cachep, objp));
1618                 printk("\n");
1619         }
1620         realobj = (char *)objp + obj_offset(cachep);
1621         size = obj_size(cachep);
1622         for (i = 0; i < size && lines; i += 16, lines--) {
1623                 int limit;
1624                 limit = 16;
1625                 if (i + limit > size)
1626                         limit = size - i;
1627                 dump_line(realobj, i, limit);
1628         }
1629 }
1630
1631 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1632 {
1633         char *realobj;
1634         int size, i;
1635         int lines = 0;
1636
1637         realobj = (char *)objp + obj_offset(cachep);
1638         size = obj_size(cachep);
1639
1640         for (i = 0; i < size; i++) {
1641                 char exp = POISON_FREE;
1642                 if (i == size - 1)
1643                         exp = POISON_END;
1644                 if (realobj[i] != exp) {
1645                         int limit;
1646                         /* Mismatch ! */
1647                         /* Print header */
1648                         if (lines == 0) {
1649                                 printk(KERN_ERR
1650                                         "Slab corruption: start=%p, len=%d\n",
1651                                         realobj, size);
1652                                 print_objinfo(cachep, objp, 0);
1653                         }
1654                         /* Hexdump the affected line */
1655                         i = (i / 16) * 16;
1656                         limit = 16;
1657                         if (i + limit > size)
1658                                 limit = size - i;
1659                         dump_line(realobj, i, limit);
1660                         i += 16;
1661                         lines++;
1662                         /* Limit to 5 lines */
1663                         if (lines > 5)
1664                                 break;
1665                 }
1666         }
1667         if (lines != 0) {
1668                 /* Print some data about the neighboring objects, if they
1669                  * exist:
1670                  */
1671                 struct slab *slabp = virt_to_slab(objp);
1672                 unsigned int objnr;
1673
1674                 objnr = obj_to_index(cachep, slabp, objp);
1675                 if (objnr) {
1676                         objp = index_to_obj(cachep, slabp, objnr - 1);
1677                         realobj = (char *)objp + obj_offset(cachep);
1678                         printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1679                                realobj, size);
1680                         print_objinfo(cachep, objp, 2);
1681                 }
1682                 if (objnr + 1 < cachep->num) {
1683                         objp = index_to_obj(cachep, slabp, objnr + 1);
1684                         realobj = (char *)objp + obj_offset(cachep);
1685                         printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1686                                realobj, size);
1687                         print_objinfo(cachep, objp, 2);
1688                 }
1689         }
1690 }
1691 #endif
1692
1693 #if DEBUG
1694 /**
1695  * slab_destroy_objs - destroy a slab and its objects
1696  * @cachep: cache pointer being destroyed
1697  * @slabp: slab pointer being destroyed
1698  *
1699  * Call the registered destructor for each object in a slab that is being
1700  * destroyed.
1701  */
1702 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1703 {
1704         int i;
1705         for (i = 0; i < cachep->num; i++) {
1706                 void *objp = index_to_obj(cachep, slabp, i);
1707
1708                 if (cachep->flags & SLAB_POISON) {
1709 #ifdef CONFIG_DEBUG_PAGEALLOC
1710                         if (cachep->buffer_size % PAGE_SIZE == 0 &&
1711                                         OFF_SLAB(cachep))
1712                                 kernel_map_pages(virt_to_page(objp),
1713                                         cachep->buffer_size / PAGE_SIZE, 1);
1714                         else
1715                                 check_poison_obj(cachep, objp);
1716 #else
1717                         check_poison_obj(cachep, objp);
1718 #endif
1719                 }
1720                 if (cachep->flags & SLAB_RED_ZONE) {
1721                         if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1722                                 slab_error(cachep, "start of a freed object "
1723                                            "was overwritten");
1724                         if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1725                                 slab_error(cachep, "end of a freed object "
1726                                            "was overwritten");
1727                 }
1728                 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1729                         (cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1730         }
1731 }
1732 #else
1733 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1734 {
1735         if (cachep->dtor) {
1736                 int i;
1737                 for (i = 0; i < cachep->num; i++) {
1738                         void *objp = index_to_obj(cachep, slabp, i);
1739                         (cachep->dtor) (objp, cachep, 0);
1740                 }
1741         }
1742 }
1743 #endif
1744
1745 /**
1746  * slab_destroy - destroy and release all objects in a slab
1747  * @cachep: cache pointer being destroyed
1748  * @slabp: slab pointer being destroyed
1749  *
1750  * Destroy all the objs in a slab, and release the mem back to the system.
1751  * Before calling the slab must have been unlinked from the cache.  The
1752  * cache-lock is not held/needed.
1753  */
1754 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1755 {
1756         void *addr = slabp->s_mem - slabp->colouroff;
1757
1758         slab_destroy_objs(cachep, slabp);
1759         if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1760                 struct slab_rcu *slab_rcu;
1761
1762                 slab_rcu = (struct slab_rcu *)slabp;
1763                 slab_rcu->cachep = cachep;
1764                 slab_rcu->addr = addr;
1765                 call_rcu(&slab_rcu->head, kmem_rcu_free);
1766         } else {
1767                 kmem_freepages(cachep, addr);
1768                 if (OFF_SLAB(cachep))
1769                         kmem_cache_free(cachep->slabp_cache, slabp);
1770         }
1771 }
1772
1773 /*
1774  * For setting up all the kmem_list3s for cache whose buffer_size is same as
1775  * size of kmem_list3.
1776  */
1777 static void set_up_list3s(struct kmem_cache *cachep, int index)
1778 {
1779         int node;
1780
1781         for_each_online_node(node) {
1782                 cachep->nodelists[node] = &initkmem_list3[index + node];
1783                 cachep->nodelists[node]->next_reap = jiffies +
1784                     REAPTIMEOUT_LIST3 +
1785                     ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1786         }
1787 }
1788
1789 /**
1790  * calculate_slab_order - calculate size (page order) of slabs
1791  * @cachep: pointer to the cache that is being created
1792  * @size: size of objects to be created in this cache.
1793  * @align: required alignment for the objects.
1794  * @flags: slab allocation flags
1795  *
1796  * Also calculates the number of objects per slab.
1797  *
1798  * This could be made much more intelligent.  For now, try to avoid using
1799  * high order pages for slabs.  When the gfp() functions are more friendly
1800  * towards high-order requests, this should be changed.
1801  */
1802 static size_t calculate_slab_order(struct kmem_cache *cachep,
1803                         size_t size, size_t align, unsigned long flags)
1804 {
1805         unsigned long offslab_limit;
1806         size_t left_over = 0;
1807         int gfporder;
1808
1809         for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1810                 unsigned int num;
1811                 size_t remainder;
1812
1813                 cache_estimate(gfporder, size, align, flags, &remainder, &num);
1814                 if (!num)
1815                         continue;
1816
1817                 if (flags & CFLGS_OFF_SLAB) {
1818                         /*
1819                          * Max number of objs-per-slab for caches which
1820                          * use off-slab slabs. Needed to avoid a possible
1821                          * looping condition in cache_grow().
1822                          */
1823                         offslab_limit = size - sizeof(struct slab);
1824                         offslab_limit /= sizeof(kmem_bufctl_t);
1825
1826                         if (num > offslab_limit)
1827                                 break;
1828                 }
1829
1830                 /* Found something acceptable - save it away */
1831                 cachep->num = num;
1832                 cachep->gfporder = gfporder;
1833                 left_over = remainder;
1834
1835                 /*
1836                  * A VFS-reclaimable slab tends to have most allocations
1837                  * as GFP_NOFS and we really don't want to have to be allocating
1838                  * higher-order pages when we are unable to shrink dcache.
1839                  */
1840                 if (flags & SLAB_RECLAIM_ACCOUNT)
1841                         break;
1842
1843                 /*
1844                  * Large number of objects is good, but very large slabs are
1845                  * currently bad for the gfp()s.
1846                  */
1847                 if (gfporder >= slab_break_gfp_order)
1848                         break;
1849
1850                 /*
1851                  * Acceptable internal fragmentation?
1852                  */
1853                 if (left_over * 8 <= (PAGE_SIZE << gfporder))
1854                         break;
1855         }
1856         return left_over;
1857 }
1858
1859 static void setup_cpu_cache(struct kmem_cache *cachep)
1860 {
1861         if (g_cpucache_up == FULL) {
1862                 enable_cpucache(cachep);
1863                 return;
1864         }
1865         if (g_cpucache_up == NONE) {
1866                 /*
1867                  * Note: the first kmem_cache_create must create the cache
1868                  * that's used by kmalloc(24), otherwise the creation of
1869                  * further caches will BUG().
1870                  */
1871                 cachep->array[smp_processor_id()] = &initarray_generic.cache;
1872
1873                 /*
1874                  * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
1875                  * the first cache, then we need to set up all its list3s,
1876                  * otherwise the creation of further caches will BUG().
1877                  */
1878                 set_up_list3s(cachep, SIZE_AC);
1879                 if (INDEX_AC == INDEX_L3)
1880                         g_cpucache_up = PARTIAL_L3;
1881                 else
1882                         g_cpucache_up = PARTIAL_AC;
1883         } else {
1884                 cachep->array[smp_processor_id()] =
1885                         kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1886
1887                 if (g_cpucache_up == PARTIAL_AC) {
1888                         set_up_list3s(cachep, SIZE_L3);
1889                         g_cpucache_up = PARTIAL_L3;
1890                 } else {
1891                         int node;
1892                         for_each_online_node(node) {
1893                                 cachep->nodelists[node] =
1894                                     kmalloc_node(sizeof(struct kmem_list3),
1895                                                 GFP_KERNEL, node);
1896                                 BUG_ON(!cachep->nodelists[node]);
1897                                 kmem_list3_init(cachep->nodelists[node]);
1898                         }
1899                 }
1900         }
1901         cachep->nodelists[numa_node_id()]->next_reap =
1902                         jiffies + REAPTIMEOUT_LIST3 +
1903                         ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1904
1905         cpu_cache_get(cachep)->avail = 0;
1906         cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1907         cpu_cache_get(cachep)->batchcount = 1;
1908         cpu_cache_get(cachep)->touched = 0;
1909         cachep->batchcount = 1;
1910         cachep->limit = BOOT_CPUCACHE_ENTRIES;
1911 }
1912
/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * @dtor: A destructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called from interrupt context, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting unloaded.
 *
 * A @dtor without a @ctor, a NULL @name, an out-of-range @size, unknown
 * @flags, or a @dtor combined with %SLAB_DESTROY_BY_RCU are treated as
 * usage bugs and trigger BUG().  If %SLAB_PANIC is set in @flags, a failure
 * to create the cache panics instead of returning NULL.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
        unsigned long flags,
        void (*ctor)(void*, struct kmem_cache *, unsigned long),
        void (*dtor)(void*, struct kmem_cache *, unsigned long))
{
        size_t left_over, slab_size, ralign;
        struct kmem_cache *cachep = NULL, *pc;

        /*
         * Sanity checks... these are all serious usage bugs.
         */
        if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
            (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
                printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
                                name);
                BUG();
        }

        /*
         * Prevent CPUs from coming and going.
         * lock_cpu_hotplug() nests outside cache_chain_mutex
         */
        lock_cpu_hotplug();

        mutex_lock(&cache_chain_mutex);

        /*
         * Walk all existing caches looking for a name clash.  Each stored
         * name is probed with __get_user() first because it may no longer
         * be readable (see below).
         */
        list_for_each_entry(pc, &cache_chain, next) {
                mm_segment_t old_fs = get_fs();
                char tmp;
                int res;

                /*
                 * This happens when the module gets unloaded and doesn't
                 * destroy its slab cache and no-one else reuses the vmalloc
                 * area of the module.  Print a warning.
                 */
                set_fs(KERNEL_DS);
                res = __get_user(tmp, pc->name);
                set_fs(old_fs);
                if (res) {
                        printk("SLAB: cache with size %d has lost its name\n",
                               pc->buffer_size);
                        continue;
                }

                if (!strcmp(pc->name, name)) {
                        printk("kmem_cache_create: duplicate cache %s\n", name);
                        dump_stack();
                        /* cachep is still NULL here, so NULL is returned. */
                        goto oops;
                }
        }

#if DEBUG
        WARN_ON(strchr(name, ' '));     /* It confuses parsers */
        if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
                /* No constructor, but initial state check requested */
                printk(KERN_ERR "%s: No con, but init state check "
                       "requested - %s\n", __FUNCTION__, name);
                flags &= ~SLAB_DEBUG_INITIAL;
        }
#if FORCED_DEBUG
        /*
         * Enable redzoning and last user accounting, except for caches with
         * large objects, if the increased size would increase the object size
         * above the next power of two: caches with object sizes just above a
         * power of two have a significant amount of internal fragmentation.
         */
        if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
                flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
        if (!(flags & SLAB_DESTROY_BY_RCU))
                flags |= SLAB_POISON;
#endif
        /* Poisoning and RCU-deferred freeing are mutually exclusive. */
        if (flags & SLAB_DESTROY_BY_RCU)
                BUG_ON(flags & SLAB_POISON);
#endif
        /* A destructor cannot be combined with RCU-deferred freeing. */
        if (flags & SLAB_DESTROY_BY_RCU)
                BUG_ON(dtor);

        /*
         * Always checks flags, a caller might be expecting debug support which
         * isn't available.
         */
        BUG_ON(flags & ~CREATE_MASK);

        /*
         * Check that size is in terms of words.  This is needed to avoid
         * unaligned accesses for some archs when redzoning is used, and makes
         * sure any on-slab bufctl's are also correctly aligned.
         */
        if (size & (BYTES_PER_WORD - 1)) {
                size += (BYTES_PER_WORD - 1);
                size &= ~(BYTES_PER_WORD - 1);
        }

        /* calculate the final buffer alignment: */

        /* 1) arch recommendation: can be overridden for debug */
        if (flags & SLAB_HWCACHE_ALIGN) {
                /*
                 * Default alignment: as specified by the arch code.  Except if
                 * an object is really small, then squeeze multiple objects into
                 * one cacheline.
                 */
                ralign = cache_line_size();
                while (size <= ralign / 2)
                        ralign /= 2;
        } else {
                ralign = BYTES_PER_WORD;
        }
        /* 2) arch mandated alignment: disables debug if necessary */
        if (ralign < ARCH_SLAB_MINALIGN) {
                ralign = ARCH_SLAB_MINALIGN;
                if (ralign > BYTES_PER_WORD)
                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
        }
        /* 3) caller mandated alignment: disables debug if necessary */
        if (ralign < align) {
                ralign = align;
                if (ralign > BYTES_PER_WORD)
                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
        }
        /*
         * 4) Store it. Note that the debug code below can reduce
         *    the alignment to BYTES_PER_WORD.
         */
        align = ralign;

        /* Get cache's description obj. */
        cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
        if (!cachep)
                goto oops;

#if DEBUG
        /* Remember the caller-visible object size before debug padding. */
        cachep->obj_size = size;

        if (flags & SLAB_RED_ZONE) {
                /* redzoning only works with word aligned caches */
                align = BYTES_PER_WORD;

                /* add space for red zone words */
                cachep->obj_offset += BYTES_PER_WORD;
                size += 2 * BYTES_PER_WORD;
        }
        if (flags & SLAB_STORE_USER) {
                /* user store requires word alignment and
                 * one word storage behind the end of the real
                 * object.
                 */
                align = BYTES_PER_WORD;
                size += BYTES_PER_WORD;
        }
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
        /*
         * Pad qualifying objects out to a full page, placing the real
         * object at the end of that page.
         */
        if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
            && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
                cachep->obj_offset += PAGE_SIZE - size;
                size = PAGE_SIZE;
        }
#endif
#endif

        /*
         * Determine if the slab management is 'on' or 'off' slab.
         * (bootstrapping cannot cope with offslab caches so don't do
         * it too early on.)
         */
        if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
                /*
                 * Size is large, assume best to place the slab management obj
                 * off-slab (should allow better packing of objs).
                 */
                flags |= CFLGS_OFF_SLAB;

        size = ALIGN(size, align);

        left_over = calculate_slab_order(cachep, size, align, flags);

        /* No page order could hold an object: give the descriptor back. */
        if (!cachep->num) {
                printk("kmem_cache_create: couldn't create cache %s.\n", name);
                kmem_cache_free(&cache_cache, cachep);
                cachep = NULL;
                goto oops;
        }
        slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
                          + sizeof(struct slab), align);

        /*
         * If the slab has been placed off-slab, and we have enough space then
         * move it on-slab. This is at the expense of any extra colouring.
         */
        if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
                flags &= ~CFLGS_OFF_SLAB;
                left_over -= slab_size;
        }

        if (flags & CFLGS_OFF_SLAB) {
                /* really off slab. No need for manual alignment */
                slab_size =
                    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
        }

        cachep->colour_off = cache_line_size();
        /* Offset must be a multiple of the alignment. */
        if (cachep->colour_off < align)
                cachep->colour_off = align;
        /* Number of distinct colour offsets that fit in the left-over space. */
        cachep->colour = left_over / cachep->colour_off;
        cachep->slab_size = slab_size;
        cachep->flags = flags;
        cachep->gfpflags = 0;
        if (flags & SLAB_CACHE_DMA)
                cachep->gfpflags |= GFP_DMA;
        cachep->buffer_size = size;

        /* Off-slab descriptors are allocated from a general cache. */
        if (flags & CFLGS_OFF_SLAB)
                cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
        cachep->ctor = ctor;
        cachep->dtor = dtor;
        cachep->name = name;


        setup_cpu_cache(cachep);

        /* cache setup completed, link it into the list */
        list_add(&cachep->next, &cache_chain);
        /*
         * Common exit for success and failure; cachep is NULL on every
         * failure path that jumps here.
         */
oops:
        if (!cachep && (flags & SLAB_PANIC))
                panic("kmem_cache_create(): failed to create slab `%s'\n",
                      name);
        mutex_unlock(&cache_chain_mutex);
        unlock_cpu_hotplug();
        return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
2175
#if DEBUG
/* BUG unless local interrupts are currently disabled. */
static void check_irq_off(void)
{
        BUG_ON(!irqs_disabled());
}

/* BUG unless local interrupts are currently enabled. */
static void check_irq_on(void)
{
        BUG_ON(irqs_disabled());
}

/*
 * On SMP builds, assert that interrupts are off and that the list_lock of
 * the current node's kmem_list3 is held.  Does nothing on UP builds.
 */
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
        struct kmem_list3 *l3;

        check_irq_off();
        l3 = cachep->nodelists[numa_node_id()];
        assert_spin_locked(&l3->list_lock);
#endif
}

/*
 * Same as check_spinlock_acquired(), but for the kmem_list3 of an explicit
 * @node rather than the current one.
 */
static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
        struct kmem_list3 *l3;

        check_irq_off();
        l3 = cachep->nodelists[node];
        assert_spin_locked(&l3->list_lock);
#endif
}

#else
/* Without DEBUG all of the checks compile away to nothing. */
#define check_irq_off() do { } while(0)
#define check_irq_on()  do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
2209
2210 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2211                         struct array_cache *ac,
2212                         int force, int node);
2213
2214 static void do_drain(void *arg)
2215 {
2216         struct kmem_cache *cachep = arg;
2217         struct array_cache *ac;
2218         int node = numa_node_id();
2219
2220         check_irq_off();
2221         ac = cpu_cache_get(cachep);
2222         spin_lock(&cachep->nodelists[node]->list_lock);
2223         free_block(cachep, ac->entry, ac->avail, node);
2224         spin_unlock(&cachep->nodelists[node]->list_lock);
2225         ac->avail = 0;
2226 }
2227
2228 static void drain_cpu_caches(struct kmem_cache *cachep)
2229 {
2230         struct kmem_list3 *l3;
2231         int node;
2232
2233         on_each_cpu(do_drain, cachep, 1, 1);
2234         check_irq_on();
2235         for_each_online_node(node) {
2236                 l3 = cachep->nodelists[node];
2237                 if (l3 && l3->alien)
2238                         drain_alien_cache(cachep, l3->alien);
2239         }
2240
2241         for_each_online_node(node) {
2242                 l3 = cachep->nodelists[node];
2243                 if (l3)
2244                         drain_array(cachep, l3, l3->shared, 1, node);
2245         }
2246 }
2247
2248 static int __node_shrink(struct kmem_cache *cachep, int node)
2249 {
2250         struct slab *slabp;
2251         struct kmem_list3 *l3 = cachep->nodelists[node];
2252         int ret;
2253
2254         for (;;) {
2255                 struct list_head *p;
2256
2257                 p = l3->slabs_free.prev;
2258                 if (p == &l3->slabs_free)
2259                         break;
2260
2261                 slabp = list_entry(l3->slabs_free.prev, struct slab, list);
2262 #if DEBUG
2263                 BUG_ON(slabp->inuse);
2264 #endif
2265                 list_del(&slabp->list);
2266
2267                 l3->free_objects -= cachep->num;
2268                 spin_unlock_irq(&l3->list_lock);
2269                 slab_destroy(cachep, slabp);
2270                 spin_lock_irq(&l3->list_lock);
2271         }
2272         ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
2273         return ret;
2274 }
2275
2276 static int __cache_shrink(struct kmem_cache *cachep)
2277 {
2278         int ret = 0, i = 0;
2279         struct kmem_list3 *l3;
2280
2281         drain_cpu_caches(cachep);
2282
2283         check_irq_on();
2284         for_each_online_node(i) {
2285                 l3 = cachep->nodelists[i];
2286                 if (l3) {
2287                         spin_lock_irq(&l3->list_lock);
2288                         ret += __node_shrink(cachep, i);
2289                         spin_unlock_irq(&l3->list_lock);
2290                 }
2291         }
2292         return (ret ? 1 : 0);
2293 }
2294
/**
 * kmem_cache_shrink - Shrink a cache.
 * @cachep: The cache to shrink.
 *
 * Releases as many slabs as possible for a cache.
 * To help debugging, a zero exit status indicates all slabs were released.
 *
 * Must not be called with a NULL @cachep or from interrupt context; either
 * triggers BUG().
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
        BUG_ON(!cachep);
        BUG_ON(in_interrupt());

        return __cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);
2309
2310 /**
2311  * kmem_cache_destroy - delete a cache
2312  * @cachep: the cache to destroy
2313  *
2314  * Remove a struct kmem_cache object from the slab cache.
2315  * Returns 0 on success.
2316  *
2317  * It is expected this function will be called by a module when it is
2318  * unloaded.  This will remove the cache completely, and avoid a duplicate
2319  * cache being allocated each time a module is loaded and unloaded, if the
2320  * module doesn't have persistent in-kernel storage across loads and unloads.
2321  *
2322  * The cache must be empty before calling this function.
2323  *
2324  * The caller must guarantee that noone will allocate memory from the cache
2325  * during the kmem_cache_destroy().
2326  */
2327 int kmem_cache_destroy(struct kmem_cache *cachep)
2328 {
2329         int i;
2330         struct kmem_list3 *l3;
2331
2332         BUG_ON(!cachep || in_interrupt());
2333
2334         /* Don't let CPUs to come and go */
2335         lock_cpu_hotplug();
2336
2337         /* Find the cache in the chain of caches. */
2338         mutex_lock(&cache_chain_mutex);
2339         /*
2340          * the chain is never empty, cache_cache is never destroyed
2341          */
2342         list_del(&cachep->next);
2343         mutex_unlock(&cache_chain_mutex);
2344
2345         if (__cache_shrink(cachep)) {
2346                 slab_error(cachep, "Can't free all objects");
2347                 mutex_lock(&cache_chain_mutex);
2348                 list_add(&cachep->next, &cache_chain);
2349                 mutex_unlock(&cache_chain_mutex);
2350                 unlock_cpu_hotplug();
2351                 return 1;
2352         }
2353
2354         if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2355                 synchronize_rcu();
2356
2357         for_each_online_cpu(i)
2358             kfree(cachep->array[i]);
2359
2360         /* NUMA: free the list3 structures */
2361         for_each_online_node(i) {
2362                 l3 = cachep->nodelists[i];
2363                 if (l3) {
2364                         kfree(l3->shared);
2365                         free_alien_cache(l3->alien);
2366                         kfree(l3);
2367                 }
2368         }
2369         kmem_cache_free(&cache_cache, cachep);
2370         unlock_cpu_hotplug();
2371         return 0;
2372 }
2373 EXPORT_SYMBOL(kmem_cache_destroy);
2374
2375 /* Get the memory for a slab management obj. */
2376 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2377                                    int colour_off, gfp_t local_flags,
2378                                    int nodeid)
2379 {
2380         struct slab *slabp;
2381
2382         if (OFF_SLAB(cachep)) {
2383                 /* Slab management obj is off-slab. */
2384                 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2385                                               local_flags, nodeid);
2386                 if (!slabp)
2387                         return NULL;
2388         } else {
2389                 slabp = objp + colour_off;
2390                 colour_off += cachep->slab_size;
2391         }
2392         slabp->inuse = 0;
2393         slabp->colouroff = colour_off;
2394         slabp->s_mem = objp + colour_off;
2395         slabp->nodeid = nodeid;
2396         return slabp;
2397 }
2398
2399 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2400 {
2401         return (kmem_bufctl_t *) (slabp + 1);
2402 }
2403
2404 static void cache_init_objs(struct kmem_cache *cachep,
2405                             struct slab *slabp, unsigned long ctor_flags)
2406 {
2407         int i;
2408
2409         for (i = 0; i < cachep->num; i++) {
2410                 void *objp = index_to_obj(cachep, slabp, i);
2411 #if DEBUG
2412                 /* need to poison the objs? */
2413                 if (cachep->flags & SLAB_POISON)
2414                         poison_obj(cachep, objp, POISON_FREE);
2415                 if (cachep->flags & SLAB_STORE_USER)
2416                         *dbg_userword(cachep, objp) = NULL;
2417
2418                 if (cachep->flags & SLAB_RED_ZONE) {
2419                         *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2420                         *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2421                 }
2422                 /*
2423                  * Constructors are not allowed to allocate memory from the same
2424                  * cache which they are a constructor for.  Otherwise, deadlock.
2425                  * They must also be threaded.
2426                  */
2427                 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2428                         cachep->ctor(objp + obj_offset(cachep), cachep,
2429                                      ctor_flags);
2430
2431                 if (cachep->flags & SLAB_RED_ZONE) {
2432                         if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2433                                 slab_error(cachep, "constructor overwrote the"
2434                                            " end of an object");
2435                         if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2436                                 slab_error(cachep, "constructor overwrote the"
2437                                            " start of an object");
2438                 }
2439                 if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2440                             OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2441                         kernel_map_pages(virt_to_page(objp),
2442                                          cachep->buffer_size / PAGE_SIZE, 0);
2443 #else
2444                 if (cachep->ctor)
2445                         cachep->ctor(objp, cachep, ctor_flags);
2446 #endif
2447                 slab_bufctl(slabp)[i] = i + 1;
2448         }
2449         slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2450         slabp->free = 0;
2451 }
2452
2453 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2454 {
2455         if (flags & SLAB_DMA)
2456                 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2457         else
2458                 BUG_ON(cachep->gfpflags & GFP_DMA);
2459 }
2460
2461 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2462                                 int nodeid)
2463 {
2464         void *objp = index_to_obj(cachep, slabp, slabp->free);
2465         kmem_bufctl_t next;
2466
2467         slabp->inuse++;
2468         next = slab_bufctl(slabp)[slabp->free];
2469 #if DEBUG
2470         slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2471         WARN_ON(slabp->nodeid != nodeid);
2472 #endif
2473         slabp->free = next;
2474
2475         return objp;
2476 }
2477
2478 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2479                                 void *objp, int nodeid)
2480 {
2481         unsigned int objnr = obj_to_index(cachep, slabp, objp);
2482
2483 #if DEBUG
2484         /* Verify that the slab belongs to the intended node */
2485         WARN_ON(slabp->nodeid != nodeid);
2486
2487         if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2488                 printk(KERN_ERR "slab: double free detected in cache "
2489                                 "'%s', objp %p\n", cachep->name, objp);
2490                 BUG();
2491         }
2492 #endif
2493         slab_bufctl(slabp)[objnr] = slabp->free;
2494         slabp->free = objnr;
2495         slabp->inuse--;
2496 }
2497
2498 /*
2499  * Map pages beginning at addr to the given cache and slab. This is required
2500  * for the slab allocator to be able to lookup the cache and slab of a
2501  * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2502  */
2503 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2504                            void *addr)
2505 {
2506         int nr_pages;
2507         struct page *page;
2508
2509         page = virt_to_page(addr);
2510
2511         nr_pages = 1;
2512         if (likely(!PageCompound(page)))
2513                 nr_pages <<= cache->gfporder;
2514
2515         do {
2516                 page_set_cache(page, cache);
2517                 page_set_slab(page, slab);
2518                 page++;
2519         } while (--nr_pages);
2520 }
2521
2522 /*
2523  * Grow (by 1) the number of slabs within a cache.  This is called by
2524  * kmem_cache_alloc() when there are no active objs left in a cache.
2525  */
2526 static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2527 {
2528         struct slab *slabp;
2529         void *objp;
2530         size_t offset;
2531         gfp_t local_flags;
2532         unsigned long ctor_flags;
2533         struct kmem_list3 *l3;
2534
2535         /*
2536          * Be lazy and only check for valid flags here,  keeping it out of the
2537          * critical path in kmem_cache_alloc().
2538          */
2539         BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
2540         if (flags & SLAB_NO_GROW)
2541                 return 0;
2542
2543         ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2544         local_flags = (flags & SLAB_LEVEL_MASK);
2545         if (!(local_flags & __GFP_WAIT))
2546                 /*
2547                  * Not allowed to sleep.  Need to tell a constructor about
2548                  * this - it might need to know...
2549                  */
2550                 ctor_flags |= SLAB_CTOR_ATOMIC;
2551
2552         /* Take the l3 list lock to change the colour_next on this node */
2553         check_irq_off();
2554         l3 = cachep->nodelists[nodeid];
2555         spin_lock(&l3->list_lock);
2556
2557         /* Get colour for the slab, and cal the next value. */
2558         offset = l3->colour_next;
2559         l3->colour_next++;
2560         if (l3->colour_next >= cachep->colour)
2561                 l3->colour_next = 0;
2562         spin_unlock(&l3->list_lock);
2563
2564         offset *= cachep->colour_off;
2565
2566         if (local_flags & __GFP_WAIT)
2567                 local_irq_enable();
2568
2569         /*
2570          * The test for missing atomic flag is performed here, rather than
2571          * the more obvious place, simply to reduce the critical path length
2572          * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2573          * will eventually be caught here (where it matters).
2574          */
2575         kmem_flagcheck(cachep, flags);
2576
2577         /*
2578          * Get mem for the objs.  Attempt to allocate a physical page from
2579          * 'nodeid'.
2580          */
2581         objp = kmem_getpages(cachep, flags, nodeid);
2582         if (!objp)
2583                 goto failed;
2584
2585         /* Get slab management. */
2586         slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
2587         if (!slabp)
2588                 goto opps1;
2589
2590         slabp->nodeid = nodeid;
2591         slab_map_pages(cachep, slabp, objp);
2592
2593         cache_init_objs(cachep, slabp, ctor_flags);
2594
2595         if (local_flags & __GFP_WAIT)
2596                 local_irq_disable();
2597         check_irq_off();
2598         spin_lock(&l3->list_lock);
2599
2600         /* Make slab active. */
2601         list_add_tail(&slabp->list, &(l3->slabs_free));
2602         STATS_INC_GROWN(cachep);
2603         l3->free_objects += cachep->num;
2604         spin_unlock(&l3->list_lock);
2605         return 1;
2606 opps1:
2607         kmem_freepages(cachep, objp);
2608 failed:
2609         if (local_flags & __GFP_WAIT)
2610                 local_irq_disable();
2611         return 0;
2612 }
2613
2614 #if DEBUG
2615
2616 /*
2617  * Perform extra freeing checks:
2618  * - detect bad pointers.
2619  * - POISON/RED_ZONE checking
2620  * - destructor calls, for caches with POISON+dtor
2621  */
2622 static void kfree_debugcheck(const void *objp)
2623 {
2624         struct page *page;
2625
2626         if (!virt_addr_valid(objp)) {
2627                 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2628                        (unsigned long)objp);
2629                 BUG();
2630         }
2631         page = virt_to_page(objp);
2632         if (!PageSlab(page)) {
2633                 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2634                        (unsigned long)objp);
2635                 BUG();
2636         }
2637 }
2638
2639 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2640 {
2641         unsigned long redzone1, redzone2;
2642
2643         redzone1 = *dbg_redzone1(cache, obj);
2644         redzone2 = *dbg_redzone2(cache, obj);
2645
2646         /*
2647          * Redzone is ok.
2648          */
2649         if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2650                 return;
2651
2652         if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2653                 slab_error(cache, "double free detected");
2654         else
2655                 slab_error(cache, "memory outside object was overwritten");
2656
2657         printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2658                         obj, redzone1, redzone2);
2659 }
2660
2661 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2662                                    void *caller)
2663 {
2664         struct page *page;
2665         unsigned int objnr;
2666         struct slab *slabp;
2667
2668         objp -= obj_offset(cachep);
2669         kfree_debugcheck(objp);
2670         page = virt_to_page(objp);
2671
2672         slabp = page_get_slab(page);
2673
2674         if (cachep->flags & SLAB_RED_ZONE) {
2675                 verify_redzone_free(cachep, objp);
2676                 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2677                 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2678         }
2679         if (cachep->flags & SLAB_STORE_USER)
2680                 *dbg_userword(cachep, objp) = caller;
2681
2682         objnr = obj_to_index(cachep, slabp, objp);
2683
2684         BUG_ON(objnr >= cachep->num);
2685         BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2686
2687         if (cachep->flags & SLAB_DEBUG_INITIAL) {
2688                 /*
2689                  * Need to call the slab's constructor so the caller can
2690                  * perform a verify of its state (debugging).  Called without
2691                  * the cache-lock held.
2692                  */
2693                 cachep->ctor(objp + obj_offset(cachep),
2694                              cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2695         }
2696         if (cachep->flags & SLAB_POISON && cachep->dtor) {
2697                 /* we want to cache poison the object,
2698                  * call the destruction callback
2699                  */
2700                 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2701         }
2702 #ifdef CONFIG_DEBUG_SLAB_LEAK
2703         slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2704 #endif
2705         if (cachep->flags & SLAB_POISON) {
2706 #ifdef CONFIG_DEBUG_PAGEALLOC
2707                 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2708                         store_stackinfo(cachep, objp, (unsigned long)caller);
2709                         kernel_map_pages(virt_to_page(objp),
2710                                          cachep->buffer_size / PAGE_SIZE, 0);
2711                 } else {
2712                         poison_obj(cachep, objp, POISON_FREE);
2713                 }
2714 #else
2715                 poison_obj(cachep, objp, POISON_FREE);
2716 #endif
2717         }
2718         return objp;
2719 }
2720
2721 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2722 {
2723         kmem_bufctl_t i;
2724         int entries = 0;
2725
2726         /* Check slab's freelist to see if this obj is there. */
2727         for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2728                 entries++;
2729                 if (entries > cachep->num || i >= cachep->num)
2730                         goto bad;
2731         }
2732         if (entries != cachep->num - slabp->inuse) {
2733 bad:
2734                 printk(KERN_ERR "slab: Internal list corruption detected in "
2735                                 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2736                         cachep->name, cachep->num, slabp, slabp->inuse);
2737                 for (i = 0;
2738                      i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2739                      i++) {
2740                         if (i % 16 == 0)
2741                                 printk("\n%03x:", i);
2742                         printk(" %02x", ((unsigned char *)slabp)[i]);
2743                 }
2744                 printk("\n");
2745                 BUG();
2746         }
2747 }
2748 #else
/*
 * Stubs used when DEBUG is off: the two checkers expand to empty
 * statements and cache_free_debugcheck() yields its object argument.
 */
#define kfree_debugcheck(ptr)				do { } while (0)
#define cache_free_debugcheck(cachep, objp, caller)	(objp)
#define check_slabp(cachep, slabp)			do { } while (0)
2752 #endif
2753
2754 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2755 {
2756         int batchcount;
2757         struct kmem_list3 *l3;
2758         struct array_cache *ac;
2759
2760         check_irq_off();
2761         ac = cpu_cache_get(cachep);
2762 retry:
2763         batchcount = ac->batchcount;
2764         if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2765                 /*
2766                  * If there was little recent activity on this cache, then
2767                  * perform only a partial refill.  Otherwise we could generate
2768                  * refill bouncing.
2769                  */
2770                 batchcount = BATCHREFILL_LIMIT;
2771         }
2772         l3 = cachep->nodelists[numa_node_id()];
2773
2774         BUG_ON(ac->avail > 0 || !l3);
2775         spin_lock(&l3->list_lock);
2776
2777         /* See if we can refill from the shared array */
2778         if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2779                 goto alloc_done;
2780
2781         while (batchcount > 0) {
2782                 struct list_head *entry;
2783                 struct slab *slabp;
2784                 /* Get slab alloc is to come from. */
2785                 entry = l3->slabs_partial.next;
2786                 if (entry == &l3->slabs_partial) {
2787                         l3->free_touched = 1;
2788                         entry = l3->slabs_free.next;
2789                         if (entry == &l3->slabs_free)
2790                                 goto must_grow;
2791                 }
2792
2793                 slabp = list_entry(entry, struct slab, list);
2794                 check_slabp(cachep, slabp);
2795                 check_spinlock_acquired(cachep);
2796                 while (slabp->inuse < cachep->num && batchcount--) {
2797                         STATS_INC_ALLOCED(cachep);
2798                         STATS_INC_ACTIVE(cachep);
2799                         STATS_SET_HIGH(cachep);
2800
2801                         ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2802                                                             numa_node_id());
2803                 }
2804                 check_slabp(cachep, slabp);
2805
2806                 /* move slabp to correct slabp list: */
2807                 list_del(&slabp->list);
2808                 if (slabp->free == BUFCTL_END)
2809                         list_add(&slabp->list, &l3->slabs_full);
2810                 else
2811                         list_add(&slabp->list, &l3->slabs_partial);
2812         }
2813
2814 must_grow:
2815         l3->free_objects -= ac->avail;
2816 alloc_done:
2817         spin_unlock(&l3->list_lock);
2818
2819         if (unlikely(!ac->avail)) {
2820                 int x;
2821                 x = cache_grow(cachep, flags, numa_node_id());
2822
2823                 /* cache_grow can reenable interrupts, then ac could change. */
2824                 ac = cpu_cache_get(cachep);
2825                 if (!x && ac->avail == 0)       /* no objects in sight? abort */
2826                         return NULL;
2827
2828                 if (!ac->avail)         /* objects refilled by interrupt? */
2829                         goto retry;
2830         }
2831         ac->touched = 1;
2832         return ac->entry[--ac->avail];
2833 }
2834
2835 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2836                                                 gfp_t flags)
2837 {
2838         might_sleep_if(flags & __GFP_WAIT);
2839 #if DEBUG
2840         kmem_flagcheck(cachep, flags);
2841 #endif
2842 }
2843
#if DEBUG
/*
 * DEBUG-only checks run on a freshly allocated object.
 *
 * Depending on the cache flags this verifies and rewrites the poison
 * pattern (or maps the pages back in), stores @caller in the user word,
 * validates and activates the red zones, and marks the bufctl entry
 * active for leak tracking.  The returned pointer is @objp advanced by
 * obj_offset(cachep); for poisoned caches the constructor is run on it.
 * A NULL @objp is passed straight through.
 */
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
				gfp_t flags, void *objp, void *caller)
{
	if (objp == NULL)
		return NULL;

	if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
		if ((cachep->buffer_size % PAGE_SIZE) != 0 ||
		    !OFF_SLAB(cachep))
			check_poison_obj(cachep, objp);
		else
			kernel_map_pages(virt_to_page(objp),
					 cachep->buffer_size / PAGE_SIZE, 1);
#else
		check_poison_obj(cachep, objp);
#endif
		poison_obj(cachep, objp, POISON_INUSE);
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		/* A free object must carry inactive red zones on both sides. */
		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
		    *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
			slab_error(cachep,
				   "double free, or memory outside object was overwritten");
			printk(KERN_ERR
				"%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
				objp, *dbg_redzone1(cachep, objp),
				*dbg_redzone2(cachep, objp));
		}
		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
	}
#ifdef CONFIG_DEBUG_SLAB_LEAK
	{
		struct slab *owner = page_get_slab(virt_to_page(objp));
		unsigned idx;

		idx = (unsigned)(objp - owner->s_mem) / cachep->buffer_size;
		slab_bufctl(owner)[idx] = BUFCTL_ACTIVE;
	}
#endif
	objp += obj_offset(cachep);

	/* For poisoned caches the constructor is run here, per allocation. */
	if (cachep->ctor && (cachep->flags & SLAB_POISON)) {
		unsigned long ctor_flags;

		if (flags & __GFP_WAIT)
			ctor_flags = SLAB_CTOR_CONSTRUCTOR;
		else
			ctor_flags = SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_ATOMIC;

		cachep->ctor(objp, cachep, ctor_flags);
	}
	return objp;
}
#else
/* Without DEBUG the object is handed through untouched. */
#define cache_alloc_debugcheck_after(cachep, flags, objp, caller) (objp)
#endif
2902
2903 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2904 {
2905         void *objp;
2906         struct array_cache *ac;
2907
2908 #ifdef CONFIG_NUMA
2909         if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
2910                 objp = alternate_node_alloc(cachep, flags);
2911                 if (objp != NULL)
2912                         return objp;
2913         }
2914 #endif
2915
2916         check_irq_off();
2917         ac = cpu_cache_get(cachep);
2918         if (likely(ac->avail)) {
2919                 STATS_INC_ALLOCHIT(cachep);
2920                 ac->touched = 1;
2921                 objp = ac->entry[--ac->avail];
2922         } else {
2923                 STATS_INC_ALLOCMISS(cachep);
2924                 objp = cache_alloc_refill(cachep, flags);
2925         }
2926         return objp;
2927 }
2928
2929 static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2930                                                 gfp_t flags, void *caller)
2931 {
2932         unsigned long save_flags;
2933         void *objp;
2934
2935         cache_alloc_debugcheck_before(cachep, flags);
2936
2937         local_irq_save(save_flags);
2938         objp = ____cache_alloc(cachep, flags);
2939         local_irq_restore(save_flags);
2940         objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2941                                             caller);
2942         prefetchw(objp);
2943         return objp;
2944 }
2945
2946 #ifdef CONFIG_NUMA
2947 /*
2948  * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
2949  *
2950  * If we are in_interrupt, then process context, including cpusets and
2951  * mempolicy, may not apply and should not be used for allocation policy.
2952  */
2953 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
2954 {
2955         int nid_alloc, nid_here;
2956
2957         if (in_interrupt())
2958                 return NULL;
2959         nid_alloc = nid_here = numa_node_id();
2960         if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
2961                 nid_alloc = cpuset_mem_spread_node();
2962         else if (current->mempolicy)
2963                 nid_alloc = slab_node(current->mempolicy);
2964         if (nid_alloc != nid_here)
2965                 return __cache_alloc_node(cachep, flags, nid_alloc);
2966         return NULL;
2967 }
2968
2969 /*
2970  * A interface to enable slab creation on nodeid
2971  */
2972 static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
2973                                 int nodeid)
2974 {
2975         struct list_head *entry;
2976         struct slab *slabp;
2977         struct kmem_list3 *l3;
2978         void *obj;
2979         int x;
2980
2981         l3 = cachep->nodelists[nodeid];
2982         BUG_ON(!l3);
2983
2984 retry:
2985         check_irq_off();
2986         spin_lock(&l3->list_lock);
2987         entry = l3->slabs_partial.next;
2988         if (entry == &l3->slabs_partial) {
2989                 l3->free_touched = 1;
2990                 entry = l3->slabs_free.next;
2991                 if (entry == &l3->slabs_free)
2992                         goto must_grow;
2993         }
2994
2995         slabp = list_entry(entry, struct slab, list);
2996         check_spinlock_acquired_node(cachep, nodeid);
2997         check_slabp(cachep, slabp);
2998
2999         STATS_INC_NODEALLOCS(cachep);
3000         STATS_INC_ACTIVE(cachep);
3001         STATS_SET_HIGH(cachep);
3002
3003         BUG_ON(slabp->inuse == cachep->num);
3004
3005         obj = slab_get_obj(cachep, slabp, nodeid);
3006         check_slabp(cachep, slabp);
3007         l3->free_objects--;
3008         /* move slabp to correct slabp list: */
3009         list_del(&slabp->list);
3010
3011         if (slabp->free == BUFCTL_END)
3012                 list_add(&slabp->list, &l3->slabs_full);
3013         else
3014                 list_add(&slabp->list, &l3->slabs_partial);
3015
3016         spin_unlock(&l3->list_lock);
3017         goto done;
3018
3019 must_grow:
3020         spin_unlock(&l3->list_lock);
3021         x = cache_grow(cachep, flags, nodeid);
3022
3023         if (!x)
3024                 return NULL;
3025
3026         goto retry;
3027 done:
3028         return obj;
3029 }
3030 #endif
3031
3032 /*
3033  * Caller needs to acquire correct kmem_list's list_lock
3034  */
3035 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3036                        int node)
3037 {
3038         int i;
3039         struct kmem_list3 *l3;
3040
3041         for (i = 0; i < nr_objects; i++) {
3042                 void *objp = objpp[i];
3043                 struct slab *slabp;
3044
3045                 slabp = virt_to_slab(objp);
3046                 l3 = cachep->nodelists[node];
3047                 list_del(&slabp->list);
3048                 check_spinlock_acquired_node(cachep, node);
3049                 check_slabp(cachep, slabp);
3050                 slab_put_obj(cachep, slabp, objp, node);
3051                 STATS_DEC_ACTIVE(cachep);
3052                 l3->free_objects++;
3053                 check_slabp(cachep, slabp);
3054
3055                 /* fixup slab chains */
3056                 if (slabp->inuse == 0) {
3057                         if (l3->free_objects > l3->free_limit) {
3058                                 l3->free_objects -= cachep->num;
3059                                 slab_destroy(cachep, slabp);
3060                         } else {
3061                                 list_add(&slabp->list, &l3->slabs_free);
3062                         }
3063                 } else {
3064                         /* Unconditionally move a slab to the end of the
3065                          * partial list on free - maximum time for the
3066                          * other objects to be freed, too.
3067                          */
3068                         list_add_tail(&slabp->list, &l3->slabs_partial);
3069                 }
3070         }
3071 }
3072
3073 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3074 {
3075         int batchcount;
3076         struct kmem_list3 *l3;
3077         int node = numa_node_id();
3078
3079         batchcount = ac->batchcount;
3080 #if DEBUG
3081         BUG_ON(!batchcount || batchcount > ac->avail);
3082 #endif
3083         check_irq_off();
3084         l3 = cachep->nodelists[node];
3085         spin_lock(&l3->list_lock);
3086         if (l3->shared) {
3087                 struct array_cache *shared_array = l3->shared;
3088                 int max = shared_array->limit - shared_array->avail;
3089                 if (max) {
3090                         if (batchcount > max)
3091                                 batchcount = max;
3092                         memcpy(&(shared_array->entry[shared_array->avail]),
3093                                ac->entry, sizeof(void *) * batchcount);
3094                         shared_array->avail += batchcount;
3095                         goto free_done;
3096                 }
3097         }
3098
3099         free_block(cachep, ac->entry, batchcount, node);
3100 free_done:
3101 #if STATS
3102         {
3103                 int i = 0;
3104                 struct list_head *p;
3105
3106                 p = l3->slabs_free.next;
3107                 while (p != &(l3->slabs_free)) {
3108                         struct slab *slabp;
3109
3110                         slabp = list_entry(p, struct slab, list);
3111                         BUG_ON(slabp->inuse);
3112
3113                         i++;
3114                         p = p->next;
3115                 }
3116                 STATS_SET_FREEABLE(cachep, i);
3117         }
3118 #endif
3119         spin_unlock(&l3->list_lock);
3120         ac->avail -= batchcount;
3121         memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3122 }
3123
3124 /*
3125  * Release an obj back to its cache. If the obj has a constructed state, it must
3126  * be in this state _before_ it is released.  Called with disabled ints.
3127  */
3128 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3129 {
3130         struct array_cache *ac = cpu_cache_get(cachep);
3131
3132         check_irq_off();
3133         objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3134
3135         if (cache_free_alien(cachep, objp))
3136                 return;
3137
3138         if (likely(ac->avail < ac->limit)) {
3139                 STATS_INC_FREEHIT(cachep);
3140                 ac->entry[ac->avail++] = objp;
3141                 return;
3142         } else {
3143                 STATS_INC_FREEMISS(cachep);
3144                 cache_flusharray(cachep, ac);
3145                 ac->entry[ac->avail++] = objp;
3146         }
3147 }
3148
3149 /**
3150  * kmem_cache_alloc - Allocate an object
3151  * @cachep: The cache to allocate from.
3152  * @flags: See kmalloc().
3153  *
3154  * Allocate an object from this cache.  The flags are only relevant
3155  * if the cache has no available objects.
3156  */
3157 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3158 {
3159         return __cache_alloc(cachep, flags, __builtin_return_address(0));
3160 }
3161 EXPORT_SYMBOL(kmem_cache_alloc);
3162
3163 /**
3164  * kmem_cache_alloc - Allocate an object. The memory is set to zero.
3165  * @cache: The cache to allocate from.
3166  * @flags: See kmalloc().
3167  *
3168  * Allocate an object from this cache and set the allocated memory to zero.
3169  * The flags are only relevant if the cache has no available objects.
3170  */
3171 void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3172 {
3173         void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3174         if (ret)
3175                 memset(ret, 0, obj_size(cache));
3176         return ret;
3177 }
3178 EXPORT_SYMBOL(kmem_cache_zalloc);
3179
3180 /**
3181  * kmem_ptr_validate - check if an untrusted pointer might
3182  *      be a slab entry.
3183  * @cachep: the cache we're checking against
3184  * @ptr: pointer to validate
3185  *
3186  * This verifies that the untrusted pointer looks sane:
3187  * it is _not_ a guarantee that the pointer is actually
3188  * part of the slab cache in question, but it at least
3189  * validates that the pointer can be dereferenced and
3190  * looks half-way sane.
3191  *
3192  * Currently only used for dentry validation.
3193  */
3194 int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
3195 {
3196         unsigned long addr = (unsigned long)ptr;
3197         unsigned long min_addr = PAGE_OFFSET;
3198         unsigned long align_mask = BYTES_PER_WORD - 1;
3199         unsigned long size = cachep->buffer_size;
3200         struct page *page;
3201
3202         if (unlikely(addr < min_addr))
3203                 goto out;
3204         if (unlikely(addr > (unsigned long)high_memory - size))
3205                 goto out;
3206         if (unlikely(addr & align_mask))
3207                 goto out;
3208         if (unlikely(!kern_addr_valid(addr)))
3209                 goto out;
3210         if (unlikely(!kern_addr_valid(addr + size - 1)))
3211                 goto out;
3212         page = virt_to_page(ptr);
3213         if (unlikely(!PageSlab(page)))
3214                 goto out;
3215         if (unlikely(page_get_cache(page) != cachep))
3216                 goto out;
3217         return 1;
3218 out:
3219         return 0;
3220 }
3221
3222 #ifdef CONFIG_NUMA
3223 /**
3224  * kmem_cache_alloc_node - Allocate an object on the specified node
3225  * @cachep: The cache to allocate from.
3226  * @flags: See kmalloc().
3227  * @nodeid: node number of the target node.
3228  *
3229  * Identical to kmem_cache_alloc, except that this function is slow
3230  * and can sleep. And it will allocate memory on the given node, which
3231  * can improve the performance for cpu bound structures.
3232  * New and improved: it will now make sure that the object gets
3233  * put on the correct node list so that there is no false sharing.
3234  */
3235 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3236 {
3237         unsigned long save_flags;
3238         void *ptr;
3239
3240         cache_alloc_debugcheck_before(cachep, flags);
3241         local_irq_save(save_flags);
3242
3243         if (nodeid == -1 || nodeid == numa_node_id() ||
3244                         !cachep->nodelists[nodeid])
3245                 ptr = ____cache_alloc(cachep, flags);
3246         else
3247                 ptr = __cache_alloc_node(cachep, flags, nodeid);
3248         local_irq_restore(save_flags);
3249
3250         ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
3251                                            __builtin_return_address(0));
3252
3253         return ptr;
3254 }
3255 EXPORT_SYMBOL(kmem_cache_alloc_node);
3256
3257 void *kmalloc_node(size_t size, gfp_t flags, int node)
3258 {
3259         struct kmem_cache *cachep;
3260
3261         cachep = kmem_find_general_cachep(size, flags);
3262         if (unlikely(cachep == NULL))
3263                 return NULL;
3264         return kmem_cache_alloc_node(cachep, flags, node);
3265 }
3266 EXPORT_SYMBOL(kmalloc_node);
3267 #endif
3268
3269 /**
3270  * __do_kmalloc - allocate memory
3271  * @size: how many bytes of memory are required.
3272  * @flags: the type of memory to allocate (see kmalloc).
3273  * @caller: function caller for debug tracking of the caller
3274  */
3275 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3276                                           void *caller)
3277 {
3278         struct kmem_cache *cachep;
3279
3280         /* If you want to save a few bytes .text space: replace
3281          * __ with kmem_.
3282          * Then kmalloc uses the uninlined functions instead of the inline
3283          * functions.
3284          */
3285         cachep = __find_general_cachep(size, flags);
3286         if (unlikely(cachep == NULL))
3287                 return NULL;
3288         return __cache_alloc(cachep, flags, caller);
3289 }
3290
3291
3292 void *__kmalloc(size_t size, gfp_t flags)
3293 {
3294 #ifndef CONFIG_DEBUG_SLAB
3295         return __do_kmalloc(size, flags, NULL);
3296 #else
3297         return __do_kmalloc(size, flags, __builtin_return_address(0));
3298 #endif
3299 }
3300 EXPORT_SYMBOL(__kmalloc);
3301
3302 #ifdef CONFIG_DEBUG_SLAB
3303 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3304 {
3305         return __do_kmalloc(size, flags, caller);
3306 }
3307 EXPORT_SYMBOL(__kmalloc_track_caller);
3308 #endif
3309
3310 #ifdef CONFIG_SMP
3311 /**
3312  * __alloc_percpu - allocate one copy of the object for every present
3313  * cpu in the system, zeroing them.
3314  * Objects should be dereferenced using the per_cpu_ptr macro only.
3315  *
3316  * @size: how many bytes of memory are required.
3317  */
3318 void *__alloc_percpu(size_t size)
3319 {
3320         int i;
3321         struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3322
3323         if (!pdata)
3324                 return NULL;
3325
3326         /*
3327          * Cannot use for_each_online_cpu since a cpu may come online
3328          * and we have no way of figuring out how to fix the array
3329          * that we have allocated then....
3330          */
3331         for_each_possible_cpu(i) {
3332                 int node = cpu_to_node(i);
3333
3334                 if (node_online(node))
3335                         pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3336                 else
3337                         pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3338
3339                 if (!pdata->ptrs[i])
3340                         goto unwind_oom;
3341                 memset(pdata->ptrs[i], 0, size);
3342         }
3343
3344         /* Catch derefs w/o wrappers */
3345         return (void *)(~(unsigned long)pdata);
3346
3347 unwind_oom:
3348         while (--i >= 0) {
3349                 if (!cpu_possible(i))
3350                         continue;
3351                 kfree(pdata->ptrs[i]);
3352         }
3353         kfree(pdata);
3354         return NULL;
3355 }
3356 EXPORT_SYMBOL(__alloc_percpu);
3357 #endif
3358
/**
 * kmem_cache_free - Deallocate an object
 * @cachep: The cache the allocation was from.
 * @objp: The previously allocated object.
 *
 * Returns an object to the cache it was allocated from. Passing an
 * object that belongs to a different cache triggers BUG_ON().
 */
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
	unsigned long irq_state;

	/* The object must really come from @cachep. */
	BUG_ON(virt_to_cache(objp) != cachep);

	/* __cache_free() is called with local interrupts disabled. */
	local_irq_save(irq_state);
	__cache_free(cachep, objp);
	local_irq_restore(irq_state);
}
3377 EXPORT_SYMBOL(kmem_cache_free);
3378
3379 /**
3380  * kfree - free previously allocated memory
3381  * @objp: pointer returned by kmalloc.
3382  *
3383  * If @objp is NULL, no operation is performed.
3384  *
3385  * Don't free memory not originally allocated by kmalloc()
3386  * or you will run into trouble.
3387  */
3388 void kfree(const void *objp)
3389 {
3390         struct kmem_cache *c;
3391         unsigned long flags;
3392
3393         if (unlikely(!objp))
3394                 return;
3395         local_irq_save(flags);
3396         kfree_debugcheck(objp);
3397         c = virt_to_cache(objp);
3398         mutex_debug_check_no_locks_freed(objp, obj_size(c));
3399         __cache_free(c, (void *)objp);
3400         local_irq_restore(flags);
3401 }
3402 EXPORT_SYMBOL(kfree);
3403
3404 #ifdef CONFIG_SMP
3405 /**
3406  * free_percpu - free previously allocated percpu memory
3407  * @objp: pointer returned by alloc_percpu.
3408  *
3409  * Don't free memory not originally allocated by alloc_percpu()
3410  * The complemented objp is to check for that.
3411  */
3412 void free_percpu(const void *objp)
3413 {
3414         int i;
3415         struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3416
3417         /*
3418          * We allocate for all cpus so we cannot use for online cpu here.
3419          */
3420         for_each_possible_cpu(i)
3421             kfree(p->ptrs[i]);
3422         kfree(p);
3423 }
3424 EXPORT_SYMBOL(free_percpu);
3425 #endif
3426
/* kmem_cache_size - report the object size of @cachep as given by obj_size(). */
unsigned int kmem_cache_size(struct kmem_cache *cachep)
{
	unsigned int size = obj_size(cachep);

	return size;
}
3431 EXPORT_SYMBOL(kmem_cache_size);
3432
3433 const char *kmem_cache_name(struct kmem_cache *cachep)
3434 {
3435         return cachep->name;
3436 }
3437 EXPORT_SYMBOL_GPL(kmem_cache_name);
3438
3439 /*
3440  * This initializes kmem_list3 or resizes varioius caches for all nodes.
3441  */
3442 static int alloc_kmemlist(struct kmem_cache *cachep)
3443 {
3444         int node;
3445         struct kmem_list3 *l3;
3446         struct array_cache *new_shared;
3447         struct array_cache **new_alien;
3448
3449         for_each_online_node(node) {
3450
3451                 new_alien = alloc_alien_cache(node, cachep->limit);
3452                 if (!new_alien)
3453                         goto fail;
3454
3455                 new_shared = alloc_arraycache(node,
3456                                 cachep->shared*cachep->batchcount,
3457                                         0xbaadf00d);
3458                 if (!new_shared) {
3459                         free_alien_cache(new_alien);
3460                         goto fail;
3461                 }
3462
3463                 l3 = cachep->nodelists[node];
3464                 if (l3) {
3465                         struct array_cache *shared = l3->shared;
3466
3467                         spin_lock_irq(&l3->list_lock);
3468
3469                         if (shared)
3470                                 free_block(cachep, shared->entry,
3471                                                 shared->avail, node);
3472
3473                         l3->shared = new_shared;
3474                         if (!l3->alien) {
3475                                 l3->alien = new_alien;
3476                                 new_alien = NULL;
3477                         }
3478                         l3->free_limit = (1 + nr_cpus_node(node)) *
3479                                         cachep->batchcount + cachep->num;
3480                         spin_unlock_irq(&l3->list_lock);
3481                         kfree(shared);
3482                         free_alien_cache(new_alien);
3483                         continue;
3484                 }
3485                 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3486                 if (!l3) {
3487                         free_alien_cache(new_alien);
3488                         kfree(new_shared);
3489                         goto fail;
3490                 }
3491
3492                 kmem_list3_init(l3);
3493                 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3494                                 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3495                 l3->shared = new_shared;
3496                 l3->alien = new_alien;
3497                 l3->free_limit = (1 + nr_cpus_node(node)) *
3498                                         cachep->batchcount + cachep->num;
3499                 cachep->nodelists[node] = l3;
3500         }
3501         return 0;
3502
3503 fail:
3504         if (!cachep->next.next) {
3505                 /* Cache is not active yet. Roll back what we did */
3506                 node--;
3507                 while (node >= 0) {
3508                         if (cachep->nodelists[node]) {
3509                                 l3 = cachep->nodelists[node];
3510
3511                                 kfree(l3->shared);
3512                                 free_alien_cache(l3->alien);
3513                                 kfree(l3);
3514                                 cachep->nodelists[node] = NULL;
3515                         }
3516                         node--;
3517                 }
3518         }
3519         return -ENOMEM;
3520 }
3521
/*
 * Argument block passed (as void *) to do_ccupdate_local().
 */
struct ccupdate_struct {
	/* cache whose per-cpu array pointers are being exchanged */
	struct kmem_cache *cachep;
	/*
	 * Indexed by cpu id. Holds the replacement array for each cpu before
	 * do_ccupdate_local() ran there and the replaced array afterwards.
	 */
	struct array_cache *new[NR_CPUS];
};
3526
3527 static void do_ccupdate_local(void *info)
3528 {
3529         struct ccupdate_struct *new = info;
3530         struct array_cache *old;
3531
3532         check_irq_off();
3533         old = cpu_cache_get(new->cachep);
3534
3535         new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3536         new->new[smp_processor_id()] = old;
3537 }
3538
3539 /* Always called with the cache_chain_mutex held */
3540 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3541                                 int batchcount, int shared)
3542 {
3543         struct ccupdate_struct new;
3544         int i, err;
3545
3546         memset(&new.new, 0, sizeof(new.new));
3547         for_each_online_cpu(i) {
3548                 new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
3549                                                 batchcount);
3550                 if (!new.new[i]) {
3551                         for (i--; i >= 0; i--)
3552                                 kfree(new.new[i]);
3553                         return -ENOMEM;
3554                 }
3555         }
3556         new.cachep = cachep;
3557
3558         on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
3559
3560         check_irq_on();
3561         cachep->batchcount = batchcount;
3562         cachep->limit = limit;
3563         cachep->shared = shared;
3564
3565         for_each_online_cpu(i) {
3566                 struct array_cache *ccold = new.new[i];
3567                 if (!ccold)
3568                         continue;
3569                 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3570                 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3571                 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3572                 kfree(ccold);
3573         }
3574
3575         err = alloc_kmemlist(cachep);
3576         if (err) {
3577                 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3578                        cachep->name, -err);
3579                 BUG();
3580         }
3581         return 0;
3582 }
3583
3584 /* Called with cache_chain_mutex held always */
3585 static void enable_cpucache(struct kmem_cache *cachep)
3586 {
3587         int err;
3588         int limit, shared;
3589
3590         /*
3591          * The head array serves three purposes:
3592          * - create a LIFO ordering, i.e. return objects that are cache-warm
3593          * - reduce the number of spinlock operations.
3594          * - reduce the number of linked list operations on the slab and
3595          *   bufctl chains: array operations are cheaper.
3596          * The numbers are guessed, we should auto-tune as described by
3597          * Bonwick.
3598          */
3599         if (cachep->buffer_size > 131072)
3600                 limit = 1;
3601         else if (cachep->buffer_size > PAGE_SIZE)
3602                 limit = 8;
3603         else if (cachep->buffer_size > 1024)
3604                 limit = 24;
3605         else if (cachep->buffer_size > 256)
3606                 limit = 54;
3607         else
3608                 limit = 120;
3609
3610         /*
3611          * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3612          * allocation behaviour: Most allocs on one cpu, most free operations
3613          * on another cpu. For these cases, an efficient object passing between
3614          * cpus is necessary. This is provided by a shared array. The array
3615          * replaces Bonwick's magazine layer.
3616          * On uniprocessor, it's functionally equivalent (but less efficient)
3617          * to a larger limit. Thus disabled by default.
3618          */
3619         shared = 0;
3620 #ifdef CONFIG_SMP
3621         if (cachep->buffer_size <= PAGE_SIZE)
3622                 shared = 8;
3623 #endif
3624
3625 #if DEBUG
3626         /*
3627          * With debugging enabled, large batchcount lead to excessively long
3628          * periods with disabled local interrupts. Limit the batchcount
3629          */
3630         if (limit > 32)
3631                 limit = 32;
3632 #endif
3633         err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3634         if (err)
3635                 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3636                        cachep->name, -err);
3637 }
3638
3639 /*
3640  * Drain an array if it contains any elements taking the l3 lock only if
3641  * necessary. Note that the l3 listlock also protects the array_cache
3642  * if drain_array() is used on the shared array.
3643  */
3644 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3645                          struct array_cache *ac, int force, int node)
3646 {
3647         int tofree;
3648
3649         if (!ac || !ac->avail)
3650                 return;
3651         if (ac->touched && !force) {
3652                 ac->touched = 0;
3653         } else {
3654                 spin_lock_irq(&l3->list_lock);
3655                 if (ac->avail) {
3656                         tofree = force ? ac->avail : (ac->limit + 4) / 5;
3657                         if (tofree > ac->avail)
3658                                 tofree = (ac->avail + 1) / 2;
3659                         free_block(cachep, ac->entry, tofree, node);
3660                         ac->avail -= tofree;
3661                         memmove(ac->entry, &(ac->entry[tofree]),
3662                                 sizeof(void *) * ac->avail);
3663                 }
3664                 spin_unlock_irq(&l3->list_lock);
3665         }
3666 }
3667
3668 /**
3669  * cache_reap - Reclaim memory from caches.
3670  * @unused: unused parameter
3671  *
3672  * Called from workqueue/eventd every few seconds.
3673  * Purpose:
3674  * - clear the per-cpu caches for this CPU.
3675  * - return freeable pages to the main free memory pool.
3676  *
3677  * If we cannot acquire the cache chain mutex then just give up - we'll try
3678  * again on the next iteration.
3679  */
3680 static void cache_reap(void *unused)
3681 {
3682         struct kmem_cache *searchp;
3683         struct kmem_list3 *l3;
3684         int node = numa_node_id();
3685
3686         if (!mutex_trylock(&cache_chain_mutex)) {
3687                 /* Give up. Setup the next iteration. */
3688                 schedule_delayed_work(&__get_cpu_var(reap_work),
3689                                       REAPTIMEOUT_CPUC);
3690                 return;
3691         }
3692
3693         list_for_each_entry(searchp, &cache_chain, next) {
3694                 struct list_head *p;
3695                 int tofree;
3696                 struct slab *slabp;
3697
3698                 check_irq_on();
3699
3700                 /*
3701                  * We only take the l3 lock if absolutely necessary and we
3702                  * have established with reasonable certainty that
3703                  * we can do some work if the lock was obtained.
3704                  */
3705                 l3 = searchp->nodelists[node];
3706
3707                 reap_alien(searchp, l3);
3708
3709                 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
3710
3711                 /*
3712                  * These are racy checks but it does not matter
3713                  * if we skip one check or scan twice.
3714                  */
3715                 if (time_after(l3->next_reap, jiffies))
3716                         goto next;
3717
3718                 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3719
3720                 drain_array(searchp, l3, l3->shared, 0, node);
3721
3722                 if (l3->free_touched) {
3723                         l3->free_touched = 0;
3724                         goto next;
3725                 }
3726
3727                 tofree = (l3->free_limit + 5 * searchp->num - 1) /
3728                                 (5 * searchp->num);
3729                 do {
3730                         /*
3731                          * Do not lock if there are no free blocks.
3732                          */
3733                         if (list_empty(&l3->slabs_free))
3734                                 break;
3735
3736                         spin_lock_irq(&l3->list_lock);
3737                         p = l3->slabs_free.next;
3738                         if (p == &(l3->slabs_free)) {
3739                                 spin_unlock_irq(&l3->list_lock);
3740                                 break;
3741                         }
3742
3743                         slabp = list_entry(p, struct slab, list);
3744                         BUG_ON(slabp->inuse);
3745                         list_del(&slabp->list);
3746                         STATS_INC_REAPED(searchp);
3747
3748                         /*
3749                          * Safe to drop the lock. The slab is no longer linked
3750                          * to the cache. searchp cannot disappear, we hold
3751                          * cache_chain_lock
3752                          */
3753                         l3->free_objects -= searchp->num;
3754                         spin_unlock_irq(&l3->list_lock);
3755                         slab_destroy(searchp, slabp);
3756                 } while (--tofree > 0);
3757 next:
3758                 cond_resched();
3759         }
3760         check_irq_on();
3761         mutex_unlock(&cache_chain_mutex);
3762         next_reap_node();
3763         /* Set up the next iteration */
3764         schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3765 }
3766
3767 #ifdef CONFIG_PROC_FS
3768
/*
 * Emit the header lines of the slabinfo output: a version banner followed
 * by one line naming the columns. The statistics columns are only named
 * when STATS is enabled.
 */
static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
#if STATS
	const char *banner = "slabinfo - version: 2.1 (statistics)\n";
#else
	const char *banner = "slabinfo - version: 2.1\n";
#endif

	seq_puts(m, banner);

	/* Column names; the order has to match what s_show() prints. */
	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
		 "<objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
	seq_putc(m, '\n');
}
3791
3792 static void *s_start(struct seq_file *m, loff_t *pos)
3793 {
3794         loff_t n = *pos;
3795         struct list_head *p;
3796
3797         mutex_lock(&cache_chain_mutex);
3798         if (!n)
3799                 print_slabinfo_header(m);
3800         p = cache_chain.next;
3801         while (n--) {
3802                 p = p->next;
3803                 if (p == &cache_chain)
3804                         return NULL;
3805         }
3806         return list_entry(p, struct kmem_cache, next);
3807 }
3808
3809 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3810 {
3811         struct kmem_cache *cachep = p;
3812         ++*pos;
3813         return cachep->next.next == &cache_chain ?
3814                 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
3815 }
3816
3817 static void s_stop(struct seq_file *m, void *p)
3818 {
3819         mutex_unlock(&cache_chain_mutex);
3820 }
3821
3822 static int s_show(struct seq_file *m, void *p)
3823 {
3824         struct kmem_cache *cachep = p;
3825         struct slab *slabp;
3826         unsigned long active_objs;
3827         unsigned long num_objs;
3828         unsigned long active_slabs = 0;
3829         unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3830         const char *name;
3831         char *error = NULL;
3832         int node;
3833         struct kmem_list3 *l3;
3834
3835         active_objs = 0;
3836         num_slabs = 0;
3837         for_each_online_node(node) {
3838                 l3 = cachep->nodelists[node];
3839                 if (!l3)
3840                         continue;
3841
3842                 check_irq_on();
3843                 spin_lock_irq(&l3->list_lock);
3844
3845                 list_for_each_entry(slabp, &l3->slabs_full, list) {
3846                         if (slabp->inuse != cachep->num && !error)
3847                                 error = "slabs_full accounting error";
3848                         active_objs += cachep->num;
3849                         active_slabs++;
3850                 }
3851                 list_for_each_entry(slabp, &l3->slabs_partial, list) {
3852                         if (slabp->inuse == cachep->num && !error)
3853                                 error = "slabs_partial inuse accounting error";
3854                         if (!slabp->inuse && !error)
3855                                 error = "slabs_partial/inuse accounting error";
3856                         active_objs += slabp->inuse;
3857                         active_slabs++;
3858                 }
3859                 list_for_each_entry(slabp, &l3->slabs_free, list) {
3860                         if (slabp->inuse && !error)
3861                                 error = "slabs_free/inuse accounting error";
3862                         num_slabs++;
3863                 }
3864                 free_objects += l3->free_objects;
3865                 if (l3->shared)
3866                         shared_avail += l3->shared->avail;
3867
3868                 spin_unlock_irq(&l3->list_lock);
3869         }
3870         num_slabs += active_slabs;
3871         num_objs = num_slabs * cachep->num;
3872         if (num_objs - active_objs != free_objects && !error)
3873                 error = "free_objects accounting error";
3874
3875         name = cachep->name;
3876         if (error)
3877                 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3878
3879         seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3880                    name, active_objs, num_objs, cachep->buffer_size,
3881                    cachep->num, (1 << cachep->gfporder));
3882         seq_printf(m, " : tunables %4u %4u %4u",
3883                    cachep->limit, cachep->batchcount, cachep->shared);
3884         seq_printf(m, " : slabdata %6lu %6lu %6lu",
3885                    active_slabs, num_slabs, shared_avail);
3886 #if STATS
3887         {                       /* list3 stats */
3888                 unsigned long high = cachep->high_mark;
3889                 unsigned long allocs = cachep->num_allocations;
3890                 unsigned long grown = cachep->grown;
3891                 unsigned long reaped = cachep->reaped;
3892                 unsigned long errors = cachep->errors;
3893                 unsigned long max_freeable = cachep->max_freeable;
3894                 unsigned long node_allocs = cachep->node_allocs;
3895                 unsigned long node_frees = cachep->node_frees;
3896                 unsigned long overflows = cachep->node_overflow;
3897
3898                 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3899                                 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
3900                                 reaped, errors, max_freeable, node_allocs,
3901                                 node_frees, overflows);
3902         }
3903         /* cpu stats */
3904         {
3905                 unsigned long allochit = atomic_read(&cachep->allochit);
3906                 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
3907                 unsigned long freehit = atomic_read(&cachep->freehit);
3908                 unsigned long freemiss = atomic_read(&cachep->freemiss);
3909
3910                 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3911                            allochit, allocmiss, freehit, freemiss);
3912         }
3913 #endif
3914         seq_putc(m, '\n');
3915         return 0;
3916 }
3917
/*
 * slabinfo_op - iterator that generates /proc/slabinfo
 *
 * Output layout of one line, as printed by s_show():
 * cache-name
 * num-active-objs
 * total-objs
 * object size
 * objects-per-slab
 * num-pages-per-slab
 * tunables: limit, batchcount, shared
 * slabdata: num-active-slabs, total-slabs, shared-avail
 * + further values (globalstat, cpustat) with statistics enabled
 */

struct seq_operations slabinfo_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};
3938
3939 #define MAX_SLABINFO_WRITE 128
3940 /**
3941  * slabinfo_write - Tuning for the slab allocator
3942  * @file: unused
3943  * @buffer: user buffer
3944  * @count: data length
3945  * @ppos: unused
3946  */
3947 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3948                        size_t count, loff_t *ppos)
3949 {
3950         char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3951         int limit, batchcount, shared, res;
3952         struct kmem_cache *cachep;
3953
3954         if (count > MAX_SLABINFO_WRITE)
3955                 return -EINVAL;
3956         if (copy_from_user(&kbuf, buffer, count))
3957                 return -EFAULT;
3958         kbuf[MAX_SLABINFO_WRITE] = '\0';
3959
3960         tmp = strchr(kbuf, ' ');
3961         if (!tmp)
3962                 return -EINVAL;
3963         *tmp = '\0';
3964         tmp++;
3965         if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
3966                 return -EINVAL;
3967
3968         /* Find the cache in the chain of caches. */
3969         mutex_lock(&cache_chain_mutex);
3970         res = -EINVAL;
3971         list_for_each_entry(cachep, &cache_chain, next) {
3972                 if (!strcmp(cachep->name, kbuf)) {
3973                         if (limit < 1 || batchcount < 1 ||
3974                                         batchcount > limit || shared < 0) {
3975                                 res = 0;
3976                         } else {
3977                                 res = do_tune_cpucache(cachep, limit,
3978                                                        batchcount, shared);
3979                         }
3980                         break;
3981                 }
3982         }
3983         mutex_unlock(&cache_chain_mutex);
3984         if (res >= 0)
3985                 res = count;
3986         return res;
3987 }
3988
3989 #ifdef CONFIG_DEBUG_SLAB_LEAK
3990
3991 static void *leaks_start(struct seq_file *m, loff_t *pos)
3992 {
3993         loff_t n = *pos;
3994         struct list_head *p;
3995
3996         mutex_lock(&cache_chain_mutex);
3997         p = cache_chain.next;
3998         while (n--) {
3999                 p = p->next;
4000                 if (p == &cache_chain)
4001                         return NULL;
4002         }
4003         return list_entry(p, struct kmem_cache, next);
4004 }
4005
/*
 * Account one more hit for address @v in the table @n.
 *
 * Table layout (in unsigned longs): n[0] is the capacity, n[1] the number
 * of entries in use; from n[2] on follow (address, count) pairs kept
 * sorted by ascending address.
 *
 * A zero @v is ignored. An address that is already present gets its count
 * incremented, a new one is inserted at its sorted position with count 1.
 *
 * Returns 1 on success. Returns 0 without inserting when the incremented
 * entry count reaches the capacity; n[1] then equals n[0], which is how
 * callers recognize a full table.
 */
static inline int add_caller(unsigned long *n, unsigned long v)
{
	unsigned long *slot;
	int remaining;

	if (!v)
		return 1;

	/* Binary search; ends with slot at the insertion position. */
	slot = n + 2;
	for (remaining = n[1]; remaining; ) {
		int half = remaining / 2;
		unsigned long *probe = slot + 2 * half;

		if (probe[0] == v) {
			probe[1]++;
			return 1;
		}
		if (probe[0] > v) {
			remaining = half;
			continue;
		}
		slot = probe + 2;
		remaining -= half + 1;
	}

	n[1]++;
	if (n[1] == n[0])
		return 0;

	/* Shift everything from the insertion position up by one pair. */
	memmove(slot + 2, slot,
		n[1] * 2 * sizeof(unsigned long) -
		(slot - n) * sizeof(unsigned long));
	slot[0] = v;
	slot[1] = 1;
	return 1;
}
4035
4036 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4037 {
4038         void *p;
4039         int i;
4040         if (n[0] == n[1])
4041                 return;
4042         for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4043                 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4044                         continue;
4045                 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4046                         return;
4047         }
4048 }
4049
/*
 * Print @address to @m. With CONFIG_KALLSYMS the address is looked up and,
 * if a symbol is found, printed as "name+offset/size" followed by
 * " [module]" when a module name is reported. In all other cases the raw
 * pointer value is printed.
 */
static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	char symbuf[KSYM_NAME_LEN+1];
	unsigned long sym_offset, sym_size;
	const char *sym;
	char *module;

	sym = kallsyms_lookup(address, &sym_size, &sym_offset, &module, symbuf);
	if (sym) {
		seq_printf(m, "%s+%#lx/%#lx", sym, sym_offset, sym_size);
		if (module)
			seq_printf(m, " [%s]", module);
		return;
	}
#endif
	seq_printf(m, "%p", (void *)address);
}
4069
4070 static int leaks_show(struct seq_file *m, void *p)
4071 {
4072         struct kmem_cache *cachep = p;
4073         struct slab *slabp;
4074         struct kmem_list3 *l3;
4075         const char *name;
4076         unsigned long *n = m->private;
4077         int node;
4078         int i;
4079
4080         if (!(cachep->flags & SLAB_STORE_USER))
4081                 return 0;
4082         if (!(cachep->flags & SLAB_RED_ZONE))
4083                 return 0;
4084
4085         /* OK, we can do it */
4086
4087         n[1] = 0;
4088
4089         for_each_online_node(node) {
4090                 l3 = cachep->nodelists[node];
4091                 if (!l3)
4092                         continue;
4093
4094                 check_irq_on();
4095                 spin_lock_irq(&l3->list_lock);
4096
4097                 list_for_each_entry(slabp, &l3->slabs_full, list)
4098                         handle_slab(n, cachep, slabp);
4099                 list_for_each_entry(slabp, &l3->slabs_partial, list)
4100                         handle_slab(n, cachep, slabp);
4101                 spin_unlock_irq(&l3->list_lock);
4102         }
4103         name = cachep->name;
4104         if (n[0] == n[1]) {
4105                 /* Increase the buffer size */
4106                 mutex_unlock(&cache_chain_mutex);
4107                 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4108                 if (!m->private) {
4109                         /* Too bad, we are really out */
4110                         m->private = n;
4111                         mutex_lock(&cache_chain_mutex);
4112                         return -ENOMEM;
4113                 }
4114                 *(unsigned long *)m->private = n[0] * 2;
4115                 kfree(n);
4116                 mutex_lock(&cache_chain_mutex);
4117                 /* Now make sure this entry will be retried */
4118                 m->count = m->size;
4119                 return 0;
4120         }
4121         for (i = 0; i < n[1]; i++) {
4122                 seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4123                 show_symbol(m, n[2*i+2]);
4124                 seq_putc(m, '\n');
4125         }
4126         return 0;
4127 }
4128
/*
 * slabstats_op - seq_file iterator for the slab leak report. It shares
 * s_next() and s_stop() with slabinfo_op; only ->start and ->show differ.
 */
struct seq_operations slabstats_op = {
	.start = leaks_start,
	.next = s_next,
	.stop = s_stop,
	.show = leaks_show,
};
4135 #endif
4136 #endif
4137
/**
 * ksize - get the actual amount of memory allocated for a given object
 * @objp: Pointer to the object
 *
 * kmalloc may internally round up allocations and return more memory
 * than requested; ksize() reports the amount that was really allocated.
 * The caller may use the additional memory even though a smaller size
 * was given to kmalloc.
 *
 * @objp must point to a valid object previously allocated with either
 * kmalloc() or kmem_cache_alloc(), and the object must not be freed
 * while the call is in progress. For a NULL @objp 0 is returned.
 */
unsigned int ksize(const void *objp)
{
	struct kmem_cache *owner;

	if (unlikely(!objp))
		return 0;

	owner = virt_to_cache(objp);
	return obj_size(owner);
}