mm/slab.c

   1 /*
   2  * linux/mm/slab.c
   3  * Written by Mark Hemment, 1996/97.
   4  * (markhe@nextd.demon.co.uk)
   5  *
   6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7  *
   8  * Major cleanup, different bufctl logic, per-cpu arrays
   9  *      (c) 2000 Manfred Spraul
  10  *
  11  * Cleanup, make the head arrays unconditional, preparation for NUMA
  12  *      (c) 2002 Manfred Spraul
  13  *
  14  * An implementation of the Slab Allocator as described in outline in;
  15  *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16  *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17  * or with a little more detail in;
  18  *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19  *      Jeff Bonwick (Sun Microsystems).
  20  *      Presented at: USENIX Summer 1994 Technical Conference
  21  *
  22  * The memory is organized in caches, one cache for each object type.
  23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
  24  * Each cache consists of many slabs (they are small (usually one
  25  * page long) and always contiguous), and each slab contains multiple
  26  * initialized objects.
  27  *
  28  * This means, that your constructor is used only for newly allocated
  29  * slabs and you must pass objects with the same initializations to
  30  * kmem_cache_free.
  31  *
  32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
  33  * normal). If you need a special memory type, then you must create a new
  34  * cache for that memory type.
  35  *
  36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37  *   full slabs with 0 free objects
  38  *   partial slabs
  39  *   empty slabs with no allocated objects
  40  *
  41  * If partial slabs exist, then new allocations come from these slabs,
  42  * otherwise from empty slabs or new slabs are allocated.
  43  *
  44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46  *
  47  * Each cache has a short per-cpu head array, most allocs
  48  * and frees go into that array, and if that array overflows, then 1/2
  49  * of the entries in the array are given back into the global cache.
  50  * The head array is strictly LIFO and should improve the cache hit rates.
  51  * On SMP, it additionally reduces the spinlock operations.
  52  *
  53  * The c_cpuarray may not be read with enabled local interrupts -
  54  * it's changed with a smp_call_function().
  55  *
  56  * SMP synchronization:
  57  *  constructors and destructors are called without any locking.
  58  *  Several members in kmem_cache_t and struct slab never change, they
  59  *      are accessed without any locking.
  60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61  *      and local interrupts are disabled so slab code is preempt-safe.
  62  *  The non-constant members are protected with a per-cache irq spinlock.
  63  *
  64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65  * in 2000 - many ideas in the current implementation are derived from
  66  * his patch.
  67  *
  68  * Further notes from the original documentation:
  69  *
  70  * 11 April '97.  Started multi-threading - markhe
  71  *      The global cache-chain is protected by the semaphore 'cache_chain_sem'.
  72  *      The sem is only needed when accessing/extending the cache-chain, which
  73  *      can never happen inside an interrupt (kmem_cache_create(),
  74  *      kmem_cache_shrink() and kmem_cache_reap()).
  75  *
  76  *      At present, each engine can be growing a cache.  This should be blocked.
  77  *
  78  */
  79
  80 #include        <linux/config.h>
  81 #include        <linux/slab.h>
  82 #include        <linux/mm.h>
  83 #include        <linux/swap.h>
  84 #include        <linux/cache.h>
  85 #include        <linux/interrupt.h>
  86 #include        <linux/init.h>
  87 #include        <linux/compiler.h>
  88 #include        <linux/seq_file.h>
  89 #include        <linux/notifier.h>
  90 #include        <linux/kallsyms.h>
  91 #include        <linux/cpu.h>
  92 #include        <linux/sysctl.h>
  93 #include        <linux/module.h>
  94
  95 #include        <asm/uaccess.h>
  96 #include        <asm/cacheflush.h>
  97 #include        <asm/tlbflush.h>
  98
  99 /*
 100  * DEBUG        - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
 101  *                SLAB_RED_ZONE & SLAB_POISON.
 102  *                0 for faster, smaller code (especially in the critical paths).
 103  *
 104  * STATS        - 1 to collect stats for /proc/slabinfo.
 105  *                0 for faster, smaller code (especially in the critical paths).
 106  *
 107  * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 108  */
 109
 110 #ifdef CONFIG_DEBUG_SLAB
 111 #define DEBUG           1
 112 #define STATS           1
 113 #define FORCED_DEBUG    1
 114 #else
 115 #define DEBUG           0
 116 #define STATS           0
 117 #define FORCED_DEBUG    0
 118 #endif
 119
 120
 121 /* Shouldn't this be in a header file somewhere? */
 122 #define BYTES_PER_WORD          sizeof(void *)
 123
 124 /* Legal flag mask for kmem_cache_create(). */
 125 #if DEBUG
 126 # define CREATE_MASK    (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
 127                          SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 128                          SLAB_NO_REAP | SLAB_CACHE_DMA | \
 129                          SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 130                          SLAB_RECLAIM_ACCOUNT )
 131 #else
 132 # define CREATE_MASK    (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
 133                          SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 134                          SLAB_RECLAIM_ACCOUNT)
 135 #endif
 136
 137 /*
 138  * kmem_bufctl_t:
 139  *
 140  * Bufctls are used for linking objs within a slab
 141  * using linked offsets.
 142  *
 143  * This implementation relies on "struct page" for locating the cache &
 144  * slab an object belongs to.
 145  * This allows the bufctl structure to be small (one int), but limits
 146  * the number of objects a slab (not a cache) can contain when off-slab
 147  * bufctls are used. The limit is the size of the largest general cache
 148  * that does not use off-slab slabs.
 149  * For 32bit archs with 4 kB pages, this is 56.
 150  * This is not serious, as it is only for large objects, when it is unwise
 151  * to have too many per slab.
 152  * Note: This limit can be raised by introducing a general cache whose size
 153  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 154  */
 155
 156 #define BUFCTL_END 0xffffFFFF
 157 #define SLAB_LIMIT 0xffffFFFE
 158 typedef unsigned int kmem_bufctl_t;
 159
 160 /* Max number of objs-per-slab for caches which use off-slab slabs.
 161  * Needed to avoid a possible looping condition in cache_grow().
 162  */
 163 static unsigned long offslab_limit;
 164
 165 /*
 166  * struct slab
 167  *
 168  * Manages the objs in a slab. Placed either at the beginning of mem allocated
 169  * for a slab, or allocated from an general cache.
 170  * Slabs are chained into three list: fully used, partial, fully free slabs.
 171  */
 172 struct slab {
 173         struct list_head        list;
 174         unsigned long           colouroff;
 175         void                    *s_mem;         /* including colour offset */
 176         unsigned int            inuse;          /* num of objs active in slab */
 177         kmem_bufctl_t           free;
 178 };
 179
 180 /*
 181  * struct array_cache
 182  *
 183  * Per cpu structures
 184  * Purpose:
 185  * - LIFO ordering, to hand out cache-warm objects from _alloc
 186  * - reduce the number of linked list operations
 187  * - reduce spinlock operations
 188  *
 189  * The limit is stored in the per-cpu structure to reduce the data cache
 190  * footprint.
 191  *
 192  */
 193 struct array_cache {
 194         unsigned int avail;
 195         unsigned int limit;
 196         unsigned int batchcount;
 197         unsigned int touched;
 198 };
 199
 200 /* bootstrap: The caches do not work without cpuarrays anymore,
 201  * but the cpuarrays are allocated from the generic caches...
 202  */
 203 #define BOOT_CPUCACHE_ENTRIES   1
 204 struct arraycache_init {
 205         struct array_cache cache;
 206         void * entries[BOOT_CPUCACHE_ENTRIES];
 207 };
 208
 209 /*
 210  * The slab lists of all objects.
 211  * Hopefully reduce the internal fragmentation
 212  * NUMA: The spinlock could be moved from the kmem_cache_t
 213  * into this structure, too. Figure out what causes
 214  * fewer cross-node spinlock operations.
 215  */
 216 struct kmem_list3 {
 217         struct list_head        slabs_partial;  /* partial list first, better asm code */
 218         struct list_head        slabs_full;
 219         struct list_head        slabs_free;
 220         unsigned long   free_objects;
 221         int             free_touched;
 222         unsigned long   next_reap;
 223         struct array_cache      *shared;
 224 };
 225
 226 #define LIST3_INIT(parent) \
 227         { \
 228                 .slabs_full     = LIST_HEAD_INIT(parent.slabs_full), \
 229                 .slabs_partial  = LIST_HEAD_INIT(parent.slabs_partial), \
 230                 .slabs_free     = LIST_HEAD_INIT(parent.slabs_free) \
 231         }
 232 #define list3_data(cachep) \
 233         (&(cachep)->lists)
 234
 235 /* NUMA: per-node */
 236 #define list3_data_ptr(cachep, ptr) \
 237                 list3_data(cachep)
 238
 239 /*
 240  * kmem_cache_t
 241  *
 242  * manages a cache.
 243  */
 244
 245 struct kmem_cache_s {
 246 /* 1) per-cpu data, touched during every alloc/free */
 247         struct array_cache      *array[NR_CPUS];
 248         unsigned int            batchcount;
 249         unsigned int            limit;
 250 /* 2) touched by every alloc & free from the backend */
 251         struct kmem_list3       lists;
 252         /* NUMA: kmem_3list_t   *nodelists[NR_NODES] */
 253         unsigned int            objsize;
 254         unsigned int            flags;  /* constant flags */
 255         unsigned int            num;    /* # of objs per slab */
 256         unsigned int            free_limit; /* upper limit of objects in the lists */
 257         spinlock_t              spinlock;
 258
 259 /* 3) cache_grow/shrink */
 260         /* order of pgs per slab (2^n) */
 261         unsigned int            gfporder;
 262
 263         /* force GFP flags, e.g. GFP_DMA */
 264         unsigned int            gfpflags;
 265
 266         size_t                  colour;         /* cache colouring range */
 267         unsigned int            colour_off;     /* colour offset */
 268         unsigned int            colour_next;    /* cache colouring */
 269         kmem_cache_t            *slabp_cache;
 270         unsigned int            dflags;         /* dynamic flags */
 271
 272         /* constructor func */
 273         void (*ctor)(void *, kmem_cache_t *, unsigned long);
 274
 275         /* de-constructor func */
 276         void (*dtor)(void *, kmem_cache_t *, unsigned long);
 277
 278 /* 4) cache creation/removal */
 279         const char              *name;
 280         struct list_head        next;
 281
 282 /* 5) statistics */
 283 #if STATS
 284         unsigned long           num_active;
 285         unsigned long           num_allocations;
 286         unsigned long           high_mark;
 287         unsigned long           grown;
 288         unsigned long           reaped;
 289         unsigned long           errors;
 290         unsigned long           max_freeable;
 291         atomic_t                allochit;
 292         atomic_t                allocmiss;
 293         atomic_t                freehit;
 294         atomic_t                freemiss;
 295 #endif
 296 };
 297
 298 #define CFLGS_OFF_SLAB          (0x80000000UL)
 299 #define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 300
 301 #define BATCHREFILL_LIMIT       16
 302 /* Optimization question: fewer reaps means less
 303  * probability for unnecessary cpucache drain/refill cycles.
 304  *
 305  * OTOH the cpuarrays can contain lots of objects,
 306  * which could lock up otherwise freeable slabs.
 307  */
 308 #define REAPTIMEOUT_CPUC        (2*HZ)
 309 #define REAPTIMEOUT_LIST3       (4*HZ)
 310
 311 #if STATS
 312 #define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 313 #define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 314 #define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 315 #define STATS_INC_GROWN(x)      ((x)->grown++)
 316 #define STATS_INC_REAPED(x)     ((x)->reaped++)
 317 #define STATS_SET_HIGH(x)       do { if ((x)->num_active > (x)->high_mark) \
 318                                         (x)->high_mark = (x)->num_active; \
 319                                 } while (0)
 320 #define STATS_INC_ERR(x)        ((x)->errors++)
 321 #define STATS_SET_FREEABLE(x, i) \
 322                                 do { if ((x)->max_freeable < i) \
 323                                         (x)->max_freeable = i; \
 324                                 } while (0)
 325
 326 #define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 327 #define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 328 #define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 329 #define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 330 #else
 331 #define STATS_INC_ACTIVE(x)     do { } while (0)
 332 #define STATS_DEC_ACTIVE(x)     do { } while (0)
 333 #define STATS_INC_ALLOCED(x)    do { } while (0)
 334 #define STATS_INC_GROWN(x)      do { } while (0)
 335 #define STATS_INC_REAPED(x)     do { } while (0)
 336 #define STATS_SET_HIGH(x)       do { } while (0)
 337 #define STATS_INC_ERR(x)        do { } while (0)
 338 #define STATS_SET_FREEABLE(x, i) \
 339                                 do { } while (0)
 340
 341 #define STATS_INC_ALLOCHIT(x)   do { } while (0)
 342 #define STATS_INC_ALLOCMISS(x)  do { } while (0)
 343 #define STATS_INC_FREEHIT(x)    do { } while (0)
 344 #define STATS_INC_FREEMISS(x)   do { } while (0)
 345 #endif
 346
 347 #if DEBUG
 348 /* Magic nums for obj red zoning.
 349  * Placed in the first word before and the first word after an obj.
 350  */
 351 #define RED_INACTIVE    0x5A2CF071UL    /* when obj is inactive */
 352 #define RED_ACTIVE      0x170FC2A5UL    /* when obj is active */
 353
 354 /* ...and for poisoning */
 355 #define POISON_BEFORE   0x5a    /* for use-uninitialised poisoning */
 356 #define POISON_AFTER    0x6b    /* for use-after-free poisoning */
 357 #define POISON_END      0xa5    /* end-byte of poisoning */
 358
 359 static inline int obj_dbghead(kmem_cache_t *cachep)
 360 {
 361         if (cachep->flags & SLAB_RED_ZONE)
 362                 return BYTES_PER_WORD;
 363         return 0;
 364 }
 365
 366 static inline int obj_dbglen(kmem_cache_t *cachep)
 367 {
 368         int len = 0;
 369
 370         if (cachep->flags & SLAB_RED_ZONE) {
 371                 len += 2*BYTES_PER_WORD;
 372         }
 373         if (cachep->flags & SLAB_STORE_USER) {
 374                 len += BYTES_PER_WORD;
 375         }
 376         return len;
 377 }
 378 #else
 379 static inline int obj_dbghead(kmem_cache_t *cachep)
 380 {
 381         return 0;
 382 }
 383 static inline int obj_dbglen(kmem_cache_t *cachep)
 384 {
 385         return 0;
 386 }
 387 #endif
 388
 389 /*
 390  * Maximum size of an obj (in 2^order pages)
 391  * and absolute limit for the gfp order.
 392  */
 393 #if defined(CONFIG_LARGE_ALLOCS)
 394 #define MAX_OBJ_ORDER   13      /* up to 32Mb */
 395 #define MAX_GFP_ORDER   13      /* up to 32Mb */
 396 #elif defined(CONFIG_MMU)
 397 #define MAX_OBJ_ORDER   5       /* 32 pages */
 398 #define MAX_GFP_ORDER   5       /* 32 pages */
 399 #else
 400 #define MAX_OBJ_ORDER   8       /* up to 1Mb */
 401 #define MAX_GFP_ORDER   8       /* up to 1Mb */
 402 #endif
 403
 404 /*
 405  * Do not go above this order unless 0 objects fit into the slab.
 406  */
 407 #define BREAK_GFP_ORDER_HI      2
 408 #define BREAK_GFP_ORDER_LO      1
 409 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 410
 411 /* Macros for storing/retrieving the cachep and or slab from the
 412  * global 'mem_map'. These are used to find the slab an obj belongs to.
 413  * With kfree(), these are used to find the cache which an obj belongs to.
 414  */
 415 #define SET_PAGE_CACHE(pg,x)  ((pg)->list.next = (struct list_head *)(x))
 416 #define GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->list.next)
 417 #define SET_PAGE_SLAB(pg,x)   ((pg)->list.prev = (struct list_head *)(x))
 418 #define GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->list.prev)
 419
 420 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
 421 struct cache_sizes malloc_sizes[] = {
 422 #define CACHE(x) { .cs_size = (x) },
 423 #include <linux/kmalloc_sizes.h>
 424         { 0, }
 425 #undef CACHE
 426 };
 427
 428 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
 429 static struct cache_names {
 430         char *name;
 431         char *name_dma;
 432 } cache_names[] = {
 433 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
 434 #include <linux/kmalloc_sizes.h>
 435         { 0, }
 436 #undef CACHE
 437 };
 438
 439 struct arraycache_init initarray_cache __initdata = { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 440 struct arraycache_init initarray_generic __initdata = { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 441
 442 /* internal cache of cache description objs */
 443 static kmem_cache_t cache_cache = {
 444         .lists          = LIST3_INIT(cache_cache.lists),
 445         .batchcount     = 1,
 446         .limit          = BOOT_CPUCACHE_ENTRIES,
 447         .objsize        = sizeof(kmem_cache_t),
 448         .flags          = SLAB_NO_REAP,
 449         .spinlock       = SPIN_LOCK_UNLOCKED,
 450         .colour_off     = L1_CACHE_BYTES,
 451         .name           = "kmem_cache",
 452 };
 453
 454 /* Guard access to the cache-chain. */
 455 static struct semaphore cache_chain_sem;
 456
 457 struct list_head cache_chain;
 458
 459 /*
 460  * vm_enough_memory() looks at this to determine how many
 461  * slab-allocated pages are possibly freeable under pressure
 462  *
 463  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
 464  */
 465 atomic_t slab_reclaim_pages;
 466 EXPORT_SYMBOL(slab_reclaim_pages);
 467
 468 /*
 469  * chicken and egg problem: delay the per-cpu array allocation
 470  * until the general caches are up.
 471  */
 472 enum {
 473         NONE,
 474         PARTIAL,
 475         FULL
 476 } g_cpucache_up;
 477
 478 static DEFINE_PER_CPU(struct timer_list, reap_timers);
 479
 480 static void reap_timer_fnc(unsigned long data);
 481
 482 static void enable_cpucache (kmem_cache_t *cachep);
 483
 484 /* Cal the num objs, wastage, and bytes left over for a given slab size. */
 485 static void cache_estimate (unsigned long gfporder, size_t size,
 486                  int flags, size_t *left_over, unsigned int *num)
 487 {
 488         int i;
 489         size_t wastage = PAGE_SIZE<<gfporder;
 490         size_t extra = 0;
 491         size_t base = 0;
 492
 493         if (!(flags & CFLGS_OFF_SLAB)) {
 494                 base = sizeof(struct slab);
 495                 extra = sizeof(kmem_bufctl_t);
 496         }
 497         i = 0;
 498         while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
 499                 i++;
 500         if (i > 0)
 501                 i--;
 502
 503         if (i > SLAB_LIMIT)
 504                 i = SLAB_LIMIT;
 505
 506         *num = i;
 507         wastage -= i*size;
 508         wastage -= L1_CACHE_ALIGN(base+i*extra);
 509         *left_over = wastage;
 510 }
 511
 512 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
 513
 514 static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
 515 {
 516         printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 517                 function, cachep->name, msg);
 518         dump_stack();
 519 }
 520
 521 /*
 522  * Start the reap timer running on the target CPU.  We run at around 1 to 2Hz.
 523  * Add the CPU number into the expiry time to minimize the possibility of the
 524  * CPUs getting into lockstep and contending for the global cache chain lock.
 525  */
 526 static void start_cpu_timer(int cpu)
 527 {
 528         struct timer_list *rt = &per_cpu(reap_timers, cpu);
 529
 530         if (rt->function == NULL) {
 531                 init_timer(rt);
 532                 rt->expires = jiffies + HZ + 3*cpu;
 533                 rt->function = reap_timer_fnc;
 534                 add_timer_on(rt, cpu);
 535         }
 536 }
 537
 538 /*
 539  * Note: if someone calls kmem_cache_alloc() on the new
 540  * cpu before the cpuup callback had a chance to allocate
 541  * the head arrays, it will oops.
 542  * Is CPU_ONLINE early enough?
 543  */
 544 static int __devinit cpuup_callback(struct notifier_block *nfb,
 545                                   unsigned long action,
 546                                   void *hcpu)
 547 {
 548         long cpu = (long)hcpu;
 549         struct list_head *p;
 550
 551         switch (action) {
 552         case CPU_UP_PREPARE:
 553                 down(&cache_chain_sem);
 554                 list_for_each(p, &cache_chain) {
 555                         int memsize;
 556                         struct array_cache *nc;
 557
 558                         kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
 559                         memsize = sizeof(void*)*cachep->limit+sizeof(struct array_cache);
 560                         nc = kmalloc(memsize, GFP_KERNEL);
 561                         if (!nc)
 562                                 goto bad;
 563                         nc->avail = 0;
 564                         nc->limit = cachep->limit;
 565                         nc->batchcount = cachep->batchcount;
 566                         nc->touched = 0;
 567
 568                         spin_lock_irq(&cachep->spinlock);
 569                         cachep->array[cpu] = nc;
 570                         cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
 571                                                 + cachep->num;
 572                         spin_unlock_irq(&cachep->spinlock);
 573
 574                 }
 575                 up(&cache_chain_sem);
 576                 break;
 577         case CPU_ONLINE:
 578                 if (g_cpucache_up == FULL)
 579                         start_cpu_timer(cpu);
 580                 break;
 581         case CPU_UP_CANCELED:
 582                 down(&cache_chain_sem);
 583
 584                 list_for_each(p, &cache_chain) {
 585                         struct array_cache *nc;
 586                         kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
 587
 588                         nc = cachep->array[cpu];
 589                         cachep->array[cpu] = NULL;
 590                         kfree(nc);
 591                 }
 592                 up(&cache_chain_sem);
 593                 break;
 594         }
 595         return NOTIFY_OK;
 596 bad:
 597         up(&cache_chain_sem);
 598         return NOTIFY_BAD;
 599 }
 600
 601 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 602
 603 static inline void ** ac_entry(struct array_cache *ac)
 604 {
 605         return (void**)(ac+1);
 606 }
 607
 608 static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 609 {
 610         return cachep->array[smp_processor_id()];
 611 }
 612
 613 /* Initialisation.
 614  * Called after the gfp() functions have been enabled, and before smp_init().
 615  */
 616 void __init kmem_cache_init(void)
 617 {
 618         size_t left_over;
 619         struct cache_sizes *sizes;
 620         struct cache_names *names;
 621
 622         /*
 623          * Fragmentation resistance on low memory - only use bigger
 624          * page orders on machines with more than 32MB of memory.
 625          */
 626         if (num_physpages > (32 << 20) >> PAGE_SHIFT)
 627                 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
 628
 629
 630         /* Bootstrap is tricky, because several objects are allocated
 631          * from caches that do not exist yet:
 632          * 1) initialize the cache_cache cache: it contains the kmem_cache_t
 633          *    structures of all caches, except cache_cache itself: cache_cache
 634          *    is statically allocated.
 635          *    Initially an __init data area is used for the head array, it's
 636          *    replaced with a kmalloc allocated array at the end of the bootstrap.
 637          * 2) Create the first kmalloc cache.
 638          *    The kmem_cache_t for the new cache is allocated normally. An __init
 639          *    data area is used for the head array.
 640          * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
 641          * 4) Replace the __init data head arrays for cache_cache and the first
 642          *    kmalloc cache with kmalloc allocated arrays.
 643          * 5) Resize the head arrays of the kmalloc caches to their final sizes.
 644          */
 645
 646         /* 1) create the cache_cache */
 647         init_MUTEX(&cache_chain_sem);
 648         INIT_LIST_HEAD(&cache_chain);
 649         list_add(&cache_cache.next, &cache_chain);
 650         cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
 651
 652         cache_estimate(0, cache_cache.objsize, 0,
 653                         &left_over, &cache_cache.num);
 654         if (!cache_cache.num)
 655                 BUG();
 656
 657         cache_cache.colour = left_over/cache_cache.colour_off;
 658         cache_cache.colour_next = 0;
 659
 660
 661         /* 2+3) create the kmalloc caches */
 662         sizes = malloc_sizes;
 663         names = cache_names;
 664
 665         while (sizes->cs_size) {
 666                 /* For performance, all the general caches are L1 aligned.
 667                  * This should be particularly beneficial on SMP boxes, as it
 668                  * eliminates "false sharing".
 669                  * Note for systems short on memory removing the alignment will
 670                  * allow tighter packing of the smaller caches. */
 671                 sizes->cs_cachep = kmem_cache_create(
 672                         names->name, sizes->cs_size,
 673                         0, SLAB_HWCACHE_ALIGN, NULL, NULL);
 674                 if (!sizes->cs_cachep)
 675                         BUG();
 676
 677                 /* Inc off-slab bufctl limit until the ceiling is hit. */
 678                 if (!(OFF_SLAB(sizes->cs_cachep))) {
 679                         offslab_limit = sizes->cs_size-sizeof(struct slab);
 680                         offslab_limit /= sizeof(kmem_bufctl_t);
 681                 }
 682
 683                 sizes->cs_dmacachep = kmem_cache_create(
 684                         names->name_dma, sizes->cs_size,
 685                         0, SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL);
 686                 if (!sizes->cs_dmacachep)
 687                         BUG();
 688
 689                 sizes++;
 690                 names++;
 691         }
 692         /* 4) Replace the bootstrap head arrays */
 693         {
 694                 void * ptr;
 695
 696                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 697                 local_irq_disable();
 698                 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
 699                 memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
 700                 cache_cache.array[smp_processor_id()] = ptr;
 701                 local_irq_enable();
 702
 703                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 704                 local_irq_disable();
 705                 BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
 706                 memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
 707                                 sizeof(struct arraycache_init));
 708                 malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
 709                 local_irq_enable();
 710         }
 711
 712         /* 5) resize the head arrays to their final sizes */
 713         {
 714                 kmem_cache_t *cachep;
 715                 down(&cache_chain_sem);
 716                 list_for_each_entry(cachep, &cache_chain, next)
 717                         enable_cpucache(cachep);
 718                 up(&cache_chain_sem);
 719         }
 720
 721         /* Done! */
 722         g_cpucache_up = FULL;
 723
 724         /* Register a cpu startup notifier callback
 725          * that initializes ac_data for all new cpus
 726          */
 727         register_cpu_notifier(&cpucache_notifier);
 728
 729
 730         /* The reap timers are started later, with a module init call:
 731          * That part of the kernel is not yet operational.
 732          */
 733 }
 734
 735 int __init cpucache_init(void)
 736 {
 737         int cpu;
 738
 739         /*
 740          * Register the timers that return unneeded
 741          * pages to gfp.
 742          */
 743         for (cpu = 0; cpu < NR_CPUS; cpu++) {
 744                 if (cpu_online(cpu))
 745                         start_cpu_timer(cpu);
 746         }
 747
 748         return 0;
 749 }
 750
 751 __initcall(cpucache_init);
 752
 753 /* Interface to system's page allocator. No need to hold the cache-lock.
 754  */
 755 static inline void * kmem_getpages (kmem_cache_t *cachep, unsigned long flags)
 756 {
 757         void    *addr;
 758
 759         /*
 760          * If we requested dmaable memory, we will get it. Even if we
 761          * did not request dmaable memory, we might get it, but that
 762          * would be relatively rare and ignorable.
 763          */
 764         flags |= cachep->gfpflags;
 765         if ( cachep->flags & SLAB_RECLAIM_ACCOUNT)
 766                 atomic_add(1<<cachep->gfporder, &slab_reclaim_pages);
 767         addr = (void*) __get_free_pages(flags, cachep->gfporder);
 768         /* Assume that now we have the pages no one else can legally
 769          * messes with the 'struct page's.
 770          * However vm_scan() might try to test the structure to see if
 771          * it is a named-page or buffer-page.  The members it tests are
 772          * of no interest here.....
 773          */
 774         return addr;
 775 }
 776
 777 /* Interface to system's page release. */
 778 static inline void kmem_freepages (kmem_cache_t *cachep, void *addr)
 779 {
 780         unsigned long i = (1<<cachep->gfporder);
 781         struct page *page = virt_to_page(addr);
 782         const unsigned long nr_freed = i;
 783
 784         /* free_pages() does not clear the type bit - we do that.
 785          * The pages have been unlinked from their cache-slab,
 786          * but their 'struct page's might be accessed in
 787          * vm_scan(). Shouldn't be a worry.
 788          */
 789         while (i--) {
 790                 ClearPageSlab(page);
 791                 page++;
 792         }
 793         sub_page_state(nr_slab, nr_freed);
 794         if (current->reclaim_state)
 795                 current->reclaim_state->reclaimed_slab += nr_freed;
 796         free_pages((unsigned long)addr, cachep->gfporder);
 797         if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 798                 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
 799 }
 800
 801 #if DEBUG
 802
 803 #ifdef CONFIG_DEBUG_PAGEALLOC
 804 static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller)
 805 {       /* Write a magic word, @caller, the CPU id and stack words that look like text addresses into the object. */
 806         int size = cachep->objsize-obj_dbglen(cachep);  /* objsize minus obj_dbglen() */
 807
 808         addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];    /* advance by obj_dbghead() bytes */
 809
 810         if (size < 5*sizeof(unsigned long))     /* need at least five words, else store nothing */
 811                 return;
 812
 813         *addr++=0x12345678;     /* leading magic word */
 814         *addr++=caller;
 815         *addr++=smp_processor_id();
 816         size -= 3*sizeof(unsigned long);
 817         {
 818                 unsigned long *sptr = &caller;  /* scan upwards from our own argument slot */
 819                 unsigned long svalue;
 820
 821                 while (((long) sptr & (THREAD_SIZE-1)) != 0) {  /* stop at a THREAD_SIZE-aligned address */
 822                         svalue = *sptr++;
 823                         if (kernel_text_address(svalue)) {      /* only record values accepted by kernel_text_address() */
 824                                 *addr++=svalue;
 825                                 size -= sizeof(unsigned long);
 826                                 if (size <= sizeof(unsigned long))      /* keep one word for the trailing magic */
 827                                         break;
 828                         }
 829                 }
 830
 831         }
 832         *addr++=0x87654321;     /* trailing magic word */
 833 }
 834 #endif
 835
 836 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
 837 {
 838         unsigned char *payload = (unsigned char *)addr + obj_dbghead(cachep);
 839         int len = cachep->objsize - obj_dbglen(cachep);
 840
 841         memset(payload, val, len);              /* whole payload gets @val ... */
 842         payload[len - 1] = POISON_END;          /* ... except its final byte */
 843 }
 844
 845 static void *scan_poisoned_obj(unsigned char* addr, unsigned int size)
 846 {
 847         unsigned char *cur = addr;
 848         unsigned char *const last = addr + size - 1;
 849
 850         /* Every byte before the last must carry one of the two poison values. */
 851         while (cur < last) {
 852                 if (!(*cur == POISON_BEFORE || *cur == POISON_AFTER))
 853                         return cur;
 854                 cur++;
 855         }
 856         /* The byte we stopped on must be POISON_END; NULL means nothing was found. */
 857         return (*cur != POISON_END) ? cur : NULL;
 858 }
 859
 860 static void check_poison_obj(kmem_cache_t *cachep, void *addr)
 861 {
 862         void *end;
 863         int size = cachep->objsize-obj_dbglen(cachep);
 864         /* Verify the poison pattern of one object; on a mismatch dump it and call slab_error(). */
 865         addr = &((char*)addr)[obj_dbghead(cachep)];
 866         /* scan_poisoned_obj() returns the first unexpected byte, or NULL if there is none. */
 867         end = scan_poisoned_obj(addr, size);
 868         if (end) {
 869                 int s;
 870                 printk(KERN_ERR "Slab corruption: start=%p, expend=%p, "
 871                                 "problemat=%p\n", addr, addr+size-1, end);
 872                 if (cachep->flags & SLAB_STORE_USER) {
 873                         void *pc;       /* word stored behind the payload, printed as "Last user" */
 874
 875                         if (cachep->flags & SLAB_RED_ZONE)      /* with red-zoning it sits one word further */
 876                                 pc = *(void**)(addr+size+BYTES_PER_WORD);
 877                         else
 878                                 pc = *(void**)(addr+size);
 879                         printk(KERN_ERR "Last user: [<%p>]", pc);
 880                         print_symbol("(%s)", (unsigned long)pc);
 881                         printk("\n");
 882                 }
 883                 printk(KERN_ERR "Data: ");      /* payload dump: '.' and '*' stand for the two poison values */
 884                 for (s = 0; s < size; s++) {
 885                         if (((char*)addr)[s] == POISON_BEFORE)
 886                                 printk(".");
 887                         else if (((char*)addr)[s] == POISON_AFTER)
 888                                 printk("*");
 889                         else
 890                                 printk("%02X ", ((unsigned char*)addr)[s]);
 891                 }
 892                 printk("\n");
 893                 printk(KERN_ERR "Next: ");      /* same dump for the 32 bytes following the payload */
 894                 for (; s < size + 32; s++) {    /* NOTE(review): reads past the payload - confirm it cannot leave the slab */
 895                         if (((char*)addr)[s] == POISON_BEFORE)
 896                                 printk(".");
 897                         else if (((char*)addr)[s] == POISON_AFTER)
 898                                 printk("*");
 899                         else
 900                                 printk("%02X ", ((unsigned char*)addr)[s]);
 901                 }
 902                 printk("\n");
 903                 slab_error(cachep, "object was modified after freeing");
 904         }
 905 }
 906 #endif
 907
 908 /* Destroy all the objs in a slab, and release the mem back to the system.
 909  * Before calling the slab must have been unlinked from the cache.
 910  * The cache-lock is not held/needed.  With DEBUG, the poison pattern and
 911  * the red zones of every object are checked before its destructor runs. */
 912 static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 913 {
 914 #if DEBUG
 915         int i;
 916         for (i = 0; i < cachep->num; i++) {
 917                 void *objp = slabp->s_mem + cachep->objsize * i;
 918                 int objlen = cachep->objsize;
 919                 /* Poisoned caches: check the pattern, or just map the pages back in. */
 920                 if (cachep->flags & SLAB_POISON) {
 921 #ifdef CONFIG_DEBUG_PAGEALLOC
 922                         if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
 923                                 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
 924                         else
 925                                 check_poison_obj(cachep, objp);
 926 #else
 927                         check_poison_obj(cachep, objp);
 928 #endif
 929                 }
 930                 if (cachep->flags & SLAB_STORE_USER)
 931                         objlen -= BYTES_PER_WORD;       /* last word is not part of the red-zoned area */
 932
 933                 if (cachep->flags & SLAB_RED_ZONE) {    /* both guard words must still read RED_INACTIVE */
 934                         if (*((unsigned long*)(objp)) != RED_INACTIVE)
 935                                 slab_error(cachep, "start of a freed object "
 936                                                         "was overwritten");
 937                         if (*((unsigned long*)(objp + objlen - BYTES_PER_WORD))
 938                                         != RED_INACTIVE)
 939                                 slab_error(cachep, "end of a freed object "
 940                                                         "was overwritten");
 941                         objp += BYTES_PER_WORD;         /* step over the leading guard word */
 942                 }
 943                 if (cachep->dtor && !(cachep->flags & SLAB_POISON))     /* dtor is skipped for poisoned caches */
 944                         (cachep->dtor)(objp, cachep, 0);
 945         }
 946 #else
 947         if (cachep->dtor) {
 948                 int i;
 949                 for (i = 0; i < cachep->num; i++) {
 950                         void* objp = slabp->s_mem+cachep->objsize*i;
 951                         (cachep->dtor)(objp, cachep, 0);
 952                 }
 953         }
 954 #endif
 955         /* s_mem - colouroff is the address originally handed to alloc_slabmgmt() as objp. */
 956         kmem_freepages(cachep, slabp->s_mem-slabp->colouroff);
 957         if (OFF_SLAB(cachep))   /* an off-slab descriptor goes back to its own cache */
 958                 kmem_cache_free(cachep->slabp_cache, slabp);
 959 }
 960
 961 /**
 962  * kmem_cache_create - Create a cache.
 963  * @name: A string which is used in /proc/slabinfo to identify this cache.
 964  * @size: The size of objects to be created in this cache.
 965  * @offset: The offset to use within the page.
 966  * @flags: SLAB flags
 967  * @ctor: A constructor for the objects.
 968  * @dtor: A destructor for the objects.
 969  *
 970  * Returns a ptr to the cache on success, NULL on failure.
 971  * Cannot be called from interrupt context, but can be interrupted.
 972  * The @ctor is run when new pages are allocated by the cache
 973  * and the @dtor is run before the pages are handed back.
 974  *
 975  * @name must be valid until the cache is destroyed. This implies that
 976  * the module calling this has to destroy the cache before getting
 977  * unloaded.
 978  *
 979  * The flags are
 980  *
 981  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 982  * to catch references to uninitialised memory.
 983  *
 984  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 985  * for buffer overruns.
 986  *
 987  * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 988  * memory pressure.
 989  *
 990  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 991  * cacheline.  This can be beneficial if you're counting cycles as closely
 992  * as davem.
 993  */
 994 kmem_cache_t *
 995 kmem_cache_create (const char *name, size_t size, size_t offset,
 996         unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
 997         void (*dtor)(void*, kmem_cache_t *, unsigned long))
 998 {
 999         const char *func_nm = KERN_ERR "kmem_create: ";
1000         size_t left_over, align, slab_size;
1001         kmem_cache_t *cachep = NULL;
1002
1003         /*
1004          * Sanity checks... these are all serious usage bugs.
1005          */
1006         if ((!name) ||
1007                 in_interrupt() ||
1008                 (size < BYTES_PER_WORD) ||
1009                 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
1010                 (dtor && !ctor) ||
1011                 (offset < 0 || offset > size))  /* NOTE(review): offset is size_t, so "offset < 0" is always false */
1012                         BUG();
1013
1014 #if DEBUG
1015         if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1016                 /* No constructor, but initial state check requested */
1017                 printk("%sNo con, but init state check requested - %s\n", func_nm, name);
1018                 flags &= ~SLAB_DEBUG_INITIAL;
1019         }
1020
1021 #if FORCED_DEBUG
1022 #ifdef CONFIG_DEBUG_PAGEALLOC
1023         if (size < PAGE_SIZE-3*BYTES_PER_WORD && size > 128)
1024                 size = PAGE_SIZE-3*BYTES_PER_WORD;
1025 #endif
1026         /*
1027          * Enable redzoning and last user accounting, except
1028          * - for caches with forced alignment: redzoning would violate the
1029          *   alignment
1030          * - for caches with large objects, if the increased size would
1031          *   increase the object size above the next power of two: caches
1032          *   with object sizes just above a power of two have a significant
1033          *   amount of internal fragmentation
1034          */
1035         if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))
1036                         && !(flags & SLAB_MUST_HWCACHE_ALIGN)) {
1037                 flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
1038         }
1039         flags |= SLAB_POISON;
1040 #endif
1041 #endif
1042
1043         /*
1044          * Always checks flags, a caller might be expecting debug
1045          * support which isn't available.
1046          */
1047         if (flags & ~CREATE_MASK)
1048                 BUG();
1049
1050         /* Get cache's description obj. */
1051         cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1052         if (!cachep)
1053                 goto opps;
1054         memset(cachep, 0, sizeof(kmem_cache_t));
1055
1056         /* Check that size is in terms of words.  This is needed to avoid
1057          * unaligned accesses for some archs when redzoning is used, and makes
1058          * sure any on-slab bufctl's are also correctly aligned.
1059          */
1060         if (size & (BYTES_PER_WORD-1)) {
1061                 size += (BYTES_PER_WORD-1);
1062                 size &= ~(BYTES_PER_WORD-1);
1063                 printk("%sForcing size word alignment - %s\n", func_nm, name);
1064         }
1065
1066 #if DEBUG
1067         if (flags & SLAB_RED_ZONE) {
1068                 /*
1069                  * There is no point trying to honour cache alignment
1070                  * when redzoning.
1071                  */
1072                 flags &= ~SLAB_HWCACHE_ALIGN;
1073                 size += 2*BYTES_PER_WORD;       /* words for redzone */
1074         }
1075         if (flags & SLAB_STORE_USER) {
1076                 flags &= ~SLAB_HWCACHE_ALIGN;
1077                 size += BYTES_PER_WORD;         /* word for kfree caller address */
1078         }
1079 #endif
1080         align = BYTES_PER_WORD;
1081         if (flags & SLAB_HWCACHE_ALIGN)
1082                 align = L1_CACHE_BYTES;
1083
1084         /* Determine if the slab management is 'on' or 'off' slab. */
1085         if (size >= (PAGE_SIZE>>3))
1086                 /*
1087                  * Size is large, assume best to place the slab management obj
1088                  * off-slab (should allow better packing of objs).
1089                  */
1090                 flags |= CFLGS_OFF_SLAB;
1091
1092         if (flags & SLAB_HWCACHE_ALIGN) {
1093                 /* Need to adjust size so that objs are cache aligned. */
1094                 /* Small obj size, can get at least two per cache line. */
1095                 while (size <= align/2)
1096                         align /= 2;
1097                 size = (size+align-1)&(~(align-1));
1098         }
1099
1100         /* Calculate the size (in pages) of slabs, and the num of objs per slab.
1101          * This could be made much more intelligent.  For now, try to avoid
1102          * using high page-orders for slabs.  When the gfp() funcs are more
1103          * friendly towards high-order requests, this should be changed.
1104          */
1105         do {
1106                 unsigned int break_flag = 0;
1107 cal_wastage:
1108                 cache_estimate(cachep->gfporder, size, flags,
1109                                                 &left_over, &cachep->num);
1110                 if (break_flag)
1111                         break;
1112                 if (cachep->gfporder >= MAX_GFP_ORDER)
1113                         break;
1114                 if (!cachep->num)
1115                         goto next;
1116                 if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) {
1117                         /* Oops, this num of objs will cause problems. */
1118                         cachep->gfporder--;
1119                         break_flag++;
1120                         goto cal_wastage;
1121                 }
1122
1123                 /*
1124                  * Large num of objs is good, but v. large slabs are currently
1125                  * bad for the gfp()s.
1126                  */
1127                 if (cachep->gfporder >= slab_break_gfp_order)
1128                         break;
1129
1130                 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1131                         break;  /* Acceptable internal fragmentation. */
1132 next:
1133                 cachep->gfporder++;
1134         } while (1);
1135
1136         if (!cachep->num) {
1137                 printk("kmem_cache_create: couldn't create cache %s.\n", name);
1138                 kmem_cache_free(&cache_cache, cachep);
1139                 cachep = NULL;
1140                 goto opps;
1141         }
1142         slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab));
1143
1144         /*
1145          * If the slab has been placed off-slab, and we have enough space then
1146          * move it on-slab. This is at the expense of any extra colouring.
1147          */
1148         if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
1149                 flags &= ~CFLGS_OFF_SLAB;
1150                 left_over -= slab_size;
1151         }
1152
1153         /* Offset must be a multiple of the alignment. */
1154         offset += (align-1);
1155         offset &= ~(align-1);
1156         if (!offset)
1157                 offset = L1_CACHE_BYTES;
1158         cachep->colour_off = offset;
1159         cachep->colour = left_over/offset;
1160
1161         cachep->flags = flags;
1162         cachep->gfpflags = 0;
1163         if (flags & SLAB_CACHE_DMA)
1164                 cachep->gfpflags |= GFP_DMA;
1165         spin_lock_init(&cachep->spinlock);
1166         cachep->objsize = size;
1167         /* NUMA */
1168         INIT_LIST_HEAD(&cachep->lists.slabs_full);
1169         INIT_LIST_HEAD(&cachep->lists.slabs_partial);
1170         INIT_LIST_HEAD(&cachep->lists.slabs_free);
1171
1172         if (flags & CFLGS_OFF_SLAB)
1173                 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
1174         cachep->ctor = ctor;
1175         cachep->dtor = dtor;
1176         cachep->name = name;
1177
1178         if (g_cpucache_up == FULL) {
1179                 enable_cpucache(cachep);
1180         } else {
1181                 if (g_cpucache_up == NONE) {
1182                         /* Note: the first kmem_cache_create must create
1183                          * the cache that's used by kmalloc(24), otherwise
1184                          * the creation of further caches will BUG().
1185                          */
1186                         cachep->array[smp_processor_id()] = &initarray_generic.cache;
1187                         g_cpucache_up = PARTIAL;
1188                 } else {
1189                         cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
1190                 }
1191                 BUG_ON(!ac_data(cachep));
1192                 ac_data(cachep)->avail = 0;
1193                 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1194                 ac_data(cachep)->batchcount = 1;
1195                 ac_data(cachep)->touched = 0;
1196                 cachep->batchcount = 1;
1197                 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1198                 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
1199                                         + cachep->num;
1200         }
1201
1202         cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
1203                                         ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1204
1205         /* Need the semaphore to access the chain. */
1206         down(&cache_chain_sem);
1207         {
1208                 struct list_head *p;
1209                 mm_segment_t old_fs;
1210
1211                 old_fs = get_fs();
1212                 set_fs(KERNEL_DS);
1213                 list_for_each(p, &cache_chain) {
1214                         kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
1215                         char tmp;
1216                         /* This happens when the module gets unloaded and doesn't
1217                            destroy its slab cache and no one else reuses the vmalloc
1218                            area of the module. Print a warning. */
1219                         if (__get_user(tmp,pc->name)) {
1220                                 printk("SLAB: cache with size %d has lost its name\n",
1221                                         pc->objsize);
1222                                 continue;
1223                         }
1224                         if (!strcmp(pc->name,name)) {
1225                                 printk("kmem_cache_create: duplicate cache %s\n",name);
1226                                 up(&cache_chain_sem);
1227                                 BUG();
1228                         }
1229                 }
1230                 set_fs(old_fs);
1231         }
1232
1233         /* cache setup completed, link it into the list */
1234         list_add(&cachep->next, &cache_chain);
1235         up(&cache_chain_sem);
1236 opps:
1237         return cachep;
1238 }
1239
1240 static inline void check_irq_off(void)
1241 {       /* DEBUG builds only: BUG if local interrupts are not disabled. */
1242 #if DEBUG
1243         BUG_ON(!irqs_disabled());
1244 #endif
1245 }
1246
1247 static inline void check_irq_on(void)
1248 {       /* DEBUG builds only: BUG if local interrupts are disabled. */
1249 #if DEBUG
1250         BUG_ON(irqs_disabled());
1251 #endif
1252 }
1253
1254 static inline void check_spinlock_acquired(kmem_cache_t *cachep)
1255 {       /* CONFIG_SMP only: assert that the cache spinlock is already taken. */
1256 #ifdef CONFIG_SMP
1257         check_irq_off();
1258         BUG_ON(spin_trylock(&cachep->spinlock));        /* a successful trylock means it was not held */
1259 #endif
1260 }
1261
1262 /*
1263  * Waits until every CPU has executed func(arg); the local call runs with IRQs off.
1264  */
1265 static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1266 {
1267         int failed;
1268
1269         check_irq_on();
1270         preempt_disable();
1271         local_irq_disable();    /* local invocation first, interrupts masked */
1272         func(arg);
1273         local_irq_enable();
1274         failed = smp_call_function(func, arg, 1, 1);    /* then the other CPUs */
1275         if (failed)
1276                 BUG();
1277         preempt_enable();
1278 }
1279
1280 static void free_block (kmem_cache_t* cachep, void** objpp, int len);   /* forward declaration, used by do_drain() */
1281 static void drain_array_locked(kmem_cache_t* cachep,
1282                                 struct array_cache *ac, int force);     /* forward declaration, used by drain_cpu_caches() */
1283
1284 static void do_drain(void *arg)
1285 {
1286         kmem_cache_t *cache = arg;
1287         struct array_cache *cpu_ac;
1288
1289         check_irq_off();
1290         cpu_ac = ac_data(cache);
1291         spin_lock(&cache->spinlock);    /* free_block() is called under the cache lock */
1292         free_block(cache, ac_entry(cpu_ac), cpu_ac->avail);
1293         spin_unlock(&cache->spinlock);
1294         cpu_ac->avail = 0;              /* every entry was handed to free_block() */
1295 }
1296
1297 static void drain_cpu_caches(kmem_cache_t *cachep)
1298 {
1299         smp_call_function_all_cpus(do_drain, cachep);   /* do_drain() on every CPU first */
1300         check_irq_on();
1301         spin_lock_irq(&cachep->spinlock);
1302         if (cachep->lists.shared != NULL)               /* then the shared array, if there is one */
1303                 drain_array_locked(cachep, cachep->lists.shared, 1);
1304         spin_unlock_irq(&cachep->spinlock);
1305 }
1306
1307
1308 /* Destroy every slab on the free list (NUMA: shrink all list3s). */
1309 static int __cache_shrink(kmem_cache_t *cachep)
1310 {
1311         struct list_head *free_head = &cachep->lists.slabs_free;
1312         int busy;
1313
1314         drain_cpu_caches(cachep);
1315
1316         check_irq_on();
1317         spin_lock_irq(&cachep->spinlock);
1318
1319         /* Peel slabs off the tail of the free list until it is empty. */
1320         while (free_head->prev != free_head) {
1321                 struct slab *victim;
1322
1323                 victim = list_entry(free_head->prev, struct slab, list);
1324 #if DEBUG
1325                 if (victim->inuse)
1326                         BUG();
1327 #endif
1328                 list_del(&victim->list);
1329                 cachep->lists.free_objects -= cachep->num;
1330
1331                 /* slab_destroy() is called with the cache lock dropped. */
1332                 spin_unlock_irq(&cachep->spinlock);
1333                 slab_destroy(cachep, victim);
1334                 spin_lock_irq(&cachep->spinlock);
1335         }
1336
1337         /* Non-zero while full or partial slabs are still on their lists. */
1338         busy = !(list_empty(&cachep->lists.slabs_full) &&
1339                  list_empty(&cachep->lists.slabs_partial));
1340         spin_unlock_irq(&cachep->spinlock);
1341         return busy;
1342 }
1343
1344 /**
1345  * kmem_cache_shrink - Shrink a cache.
1346  * @cachep: The cache to shrink.
1347  *
1348  * Releases as many slabs as possible for a cache.  BUGs on a NULL cache
1349  * or when called from interrupt context.
1350  */
1351 int kmem_cache_shrink(kmem_cache_t *cachep)
1352 {
1353         BUG_ON(cachep == NULL || in_interrupt());
1354
1355         /* Zero means no full or partial slabs were left behind. */
1356         return __cache_shrink(cachep);
1357 }
1358
1359 /**
1360  * kmem_cache_destroy - delete a cache
1361  * @cachep: the cache to destroy
1362  *
1363  * Unlinks the cache from the cache chain, releases all of its slabs and
1364  * frees the per-CPU arrays, the shared array and the descriptor itself.
1365  * Returns 0 on success, 1 if objects are still allocated (the cache is
1366  * then put back on the chain).
1367  *
1368  * Modules are expected to call this on unload so that loading them again
1369  * does not create a duplicate cache.
1370  *
1371  * The cache must be empty before calling this function.
1372  *
1373  * The caller must guarantee that no one will allocate memory from the
1374  * cache during the kmem_cache_destroy().
1375  */
1376 int kmem_cache_destroy (kmem_cache_t * cachep)
1377 {
1378         int cpu = 0;
1379
1380         BUG_ON(cachep == NULL || in_interrupt());
1381
1382         /*
1383          * Unhook the cache from the chain of caches.  The chain is never
1384          * empty, because cache_cache is never destroyed.
1385          */
1386         down(&cache_chain_sem);
1387         list_del(&cachep->next);
1388         up(&cache_chain_sem);
1389
1390         if (__cache_shrink(cachep) != 0) {
1391                 /* Slabs are still in use: relink the cache and report failure. */
1392                 slab_error(cachep, "Can't free all objects");
1393                 down(&cache_chain_sem);
1394                 list_add(&cachep->next,&cache_chain);
1395                 up(&cache_chain_sem);
1396                 return 1;
1397         }
1398
1399         while (cpu < NR_CPUS)
1400                 kfree(cachep->array[cpu++]);
1401
1402         /* NUMA: free the list3 structures */
1403         kfree(cachep->lists.shared);
1404         cachep->lists.shared = NULL;
1405         kmem_cache_free(&cache_cache, cachep);
1406
1407         return 0;
1408 }
1409
1410 /* Obtain and initialise the management structure for a new slab. */
1411 static inline struct slab* alloc_slabmgmt (kmem_cache_t *cachep,
1412                         void *objp, int colour_off, int local_flags)
1413 {
1414         struct slab *mgmt;
1415
1416         if (!(OFF_SLAB(cachep))) {
1417                 /* On-slab: descriptor plus bufctl array sit before the objects. */
1418                 mgmt = objp + colour_off;
1419                 colour_off += L1_CACHE_ALIGN(cachep->num *
1420                                 sizeof(kmem_bufctl_t) + sizeof(struct slab));
1421         } else {
1422                 /* Off-slab: the descriptor comes from a separate cache. */
1423                 mgmt = kmem_cache_alloc(cachep->slabp_cache, local_flags);
1424                 if (mgmt == NULL)
1425                         return NULL;
1426         }
1427         mgmt->inuse = 0;
1428         mgmt->colouroff = colour_off;
1429         mgmt->s_mem = objp + colour_off;        /* address of the first object */
1430         return mgmt;
1431 }
1432
1433 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
1434 {
1435         return (kmem_bufctl_t *)&slabp[1];      /* the array starts right behind the descriptor */
1436 }
1437
1438 static void cache_init_objs (kmem_cache_t * cachep,
1439                         struct slab * slabp, unsigned long ctor_flags)
1440 {
1441         int i;
1442         /* Set up every object of the slab and link them all into its bufctl chain. */
1443         for (i = 0; i < cachep->num; i++) {
1444                 void* objp = slabp->s_mem+cachep->objsize*i;
1445 #if DEBUG
1446                 int objlen = cachep->objsize;
1447                 /* need to poison the objs? */
1448                 if (cachep->flags & SLAB_POISON)
1449                         poison_obj(cachep, objp, POISON_BEFORE);
1450                 if (cachep->flags & SLAB_STORE_USER) {
1451                         objlen -= BYTES_PER_WORD;
1452                         ((unsigned long*)(objp+objlen))[0] = 0; /* zero the object's last word */
1453                 }
1454
1455                 if (cachep->flags & SLAB_RED_ZONE) {    /* write both guard words; objp then points past the first */
1456                         *((unsigned long*)(objp)) = RED_INACTIVE;
1457                         objp += BYTES_PER_WORD;
1458                         objlen -= 2* BYTES_PER_WORD;
1459                         *((unsigned long*)(objp + objlen)) = RED_INACTIVE;
1460                 }
1461                 /*
1462                  * Constructors are not allowed to allocate memory from
1463                  * the same cache which they are a constructor for.
1464                  * Otherwise, deadlock. They must also be threaded.
1465                  */
1466                 if (cachep->ctor && !(cachep->flags & SLAB_POISON))     /* ctor is skipped for poisoned caches */
1467                         cachep->ctor(objp, cachep, ctor_flags);
1468
1469                 if (cachep->flags & SLAB_RED_ZONE) {    /* the ctor must have left both guard words alone */
1470                         if (*((unsigned long*)(objp + objlen)) != RED_INACTIVE)
1471                                 slab_error(cachep, "constructor overwrote the"
1472                                                         " end of an object");
1473                         objp -= BYTES_PER_WORD;
1474                         if (*((unsigned long*)(objp)) != RED_INACTIVE)
1475                                 slab_error(cachep, "constructor overwrote the"
1476                                                         " start of an object");
1477                 }
1478                 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
1479                         kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
1480 #else
1481                 if (cachep->ctor)
1482                         cachep->ctor(objp, cachep, ctor_flags);
1483 #endif
1484                 slab_bufctl(slabp)[i] = i+1;    /* each entry names the following object */
1485         }
1486         slab_bufctl(slabp)[i-1] = BUFCTL_END;   /* the last entry terminates the chain */
1487         slabp->free = 0;
1488 }
1489
1490 static void kmem_flagcheck(kmem_cache_t *cachep, int flags)
1491 {
1492         /* A SLAB_DMA request is only legal on a cache set up with GFP_DMA,
1493          * and a request without SLAB_DMA only on a cache without it. */
1494         const int want_dma = (flags & SLAB_DMA) != 0;
1495         const int have_dma = (cachep->gfpflags & GFP_DMA) != 0;
1496
1497         if (want_dma != have_dma)
1498                 BUG();
1499 }
1500
1501 /*
1502  * Grow (by 1) the number of slabs within a cache.  This is called by
1503  * kmem_cache_alloc() when there are no active objs left in a cache.
1504  * Entered and left with local IRQs off.  Returns 1 if a slab was added.
1505  */
1506 static int cache_grow (kmem_cache_t * cachep, int flags)
1507 {
1508         struct slab     *slabp;
1509         struct page     *page;
1510         void            *objp;
1511         size_t           offset;
1512         unsigned int     i, local_flags;
1513         unsigned long    ctor_flags;
1514
1515         /* Validate flags here, off kmem_cache_alloc()'s critical path. */
1516         if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
1517                 BUG();
1518         if (flags & SLAB_NO_GROW)
1519                 return 0;
1520
1521         ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1522         local_flags = (flags & SLAB_LEVEL_MASK);
1523         if (!(local_flags & __GFP_WAIT))
1524                 /* Not allowed to sleep - the constructor must be told. */
1525                 ctor_flags |= SLAB_CTOR_ATOMIC;
1526
1527         /* About to mess with non-constant members - lock. */
1528         check_irq_off();
1529         spin_lock(&cachep->spinlock);
1530
1531         /* Get colour for the slab, and calculate the next value. */
1532         offset = cachep->colour_next;
1533         cachep->colour_next++;
1534         if (cachep->colour_next >= cachep->colour)
1535                 cachep->colour_next = 0;
1536         offset *= cachep->colour_off;
1537
1538         spin_unlock(&cachep->spinlock);
1539
1540         if (local_flags & __GFP_WAIT)
1541                 local_irq_enable();
1542
1543         /*
1544          * The DMA flag check is done here rather than in kmem_cache_alloc()
1545          * to keep that path short; a misbehaving caller is still caught.
1546          */
1547         kmem_flagcheck(cachep, flags);
1548
1549         /* Get mem for the objs. */
1550         if (!(objp = kmem_getpages(cachep, flags)))
1551                 goto failed;
1552
1553         /* Get slab management. */
1554         if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
1555                 goto opps1;
1556
1557         /* Tag every page with its cache and slab, and count it in nr_slab. */
1558         i = 1 << cachep->gfporder;
1559         page = virt_to_page(objp);
1560         do {
1561                 SET_PAGE_CACHE(page, cachep);
1562                 SET_PAGE_SLAB(page, slabp);
1563                 SetPageSlab(page);
1564                 inc_page_state(nr_slab);
1565                 page++;
1566         } while (--i);
1567
1568         cache_init_objs(cachep, slabp, ctor_flags);
1569
1570         if (local_flags & __GFP_WAIT)
1571                 local_irq_disable();
1572         check_irq_off();
1573         spin_lock(&cachep->spinlock);
1574
1575         /* Make slab active. */
1576         list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
1577         STATS_INC_GROWN(cachep);
1578         list3_data(cachep)->free_objects += cachep->num;
1579         spin_unlock(&cachep->spinlock);
1580         return 1;
1581 opps1:
1582         /*
1583          * The pages were never marked PG_slab nor added to nr_slab, so
1584          * kmem_freepages() would skew the accounting.  Undo only what
1585          * kmem_getpages() did.
1586          */
1587         free_pages((unsigned long)objp, cachep->gfporder);
1588         if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1589                 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
1590 failed:
1591         if (local_flags & __GFP_WAIT)
1592                 local_irq_disable();
1593         return 0;
1594 }
1595
1596 /*
1597  * Perform extra freeing checks:
1598  * - detect bad pointers.
1599  * - POISON/RED_ZONE checking
1600  * - destructor calls, for caches with POISON+dtor
1601  */
1602 static inline void kfree_debugcheck(const void *objp)
1603 {
1604 #if DEBUG
1605         const unsigned long ptr = (unsigned long)objp;
1606
1607         /* The pointer has to pass virt_addr_valid() ... */
1608         if (!virt_addr_valid(objp)) {
1609                 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", ptr);
1610                 BUG();
1611         }
1612         /* ... and the page behind it has to carry the slab flag. */
1613         if (!PageSlab(virt_to_page(objp))) {
1614                 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", ptr);
1615                 BUG();
1616         }
1617 #endif
1618 }
1619
1620 static inline void *cache_free_debugcheck (kmem_cache_t * cachep, void * objp, void *caller)
1621 {
1622 #if DEBUG
1623         struct page *page;
1624         unsigned int objnr;
1625         int objlen = cachep->objsize;
1626         struct slab *slabp;
1627
1628         kfree_debugcheck(objp);
1629         page = virt_to_page(objp);
1630
1631         BUG_ON(GET_PAGE_CACHE(page) != cachep);
1632         slabp = GET_PAGE_SLAB(page);
1633
1634         if (cachep->flags & SLAB_STORE_USER) {
1635                 objlen -= BYTES_PER_WORD;
1636         }
1637         if (cachep->flags & SLAB_RED_ZONE) {
1638                 objp -= BYTES_PER_WORD;
1639                 if (xchg((unsigned long *)objp, RED_INACTIVE) != RED_ACTIVE)
1640                         slab_error(cachep, "double free, or memory before"
1641                                                 " object was overwritten");
1642                 if (xchg((unsigned long *)(objp+objlen-BYTES_PER_WORD), RED_INACTIVE) != RED_ACTIVE)
1643                         slab_error(cachep, "double free, or memory after "
1644                                                 " object was overwritten");
1645         }
1646         if (cachep->flags & SLAB_STORE_USER) {
1647                 *((void**)(objp+objlen)) = caller;
1648         }
1649
1650         objnr = (objp-slabp->s_mem)/cachep->objsize;
1651
1652         BUG_ON(objnr >= cachep->num);
1653         BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize);
1654
1655         if (cachep->flags & SLAB_DEBUG_INITIAL) {
1656                 /* Need to call the slab's constructor so the
1657                  * caller can perform a verify of its state (debugging).
1658                  * Called without the cache-lock held.
1659                  */
1660                 cachep->ctor(objp+obj_dbghead(cachep),
1661                                         cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
1662         }
1663         if (cachep->flags & SLAB_POISON && cachep->dtor) {
1664                 /* we want to cache poison the object,
1665                  * call the destruction callback
1666                  */
1667                 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
1668         }
1669         if (cachep->flags & SLAB_POISON) {
1670 #ifdef CONFIG_DEBUG_PAGEALLOC
1671                 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
1672                         store_stackinfo(cachep, objp, POISON_AFTER);
1673                         kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
1674                 } else {
1675                         poison_obj(cachep, objp, POISON_AFTER);
1676                 }
1677 #else
1678                 poison_obj(cachep, objp, POISON_AFTER);
1679 #endif
1680         }
1681 #endif
1682         return objp;
1683 }
1684
1685 static inline void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
1686 {
1687 #if DEBUG
1688         int i;
1689         int entries = 0;
1690
1691         check_spinlock_acquired(cachep);
1692         /* Check slab's freelist to see if this obj is there. */
1693         for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
1694                 entries++;
1695                 BUG_ON(entries > cachep->num);
1696                 BUG_ON(i < 0 || i >= cachep->num);
1697         }
1698         BUG_ON(entries != cachep->num - slabp->inuse);
1699 #endif
1700 }
1701
1702 static inline void * cache_alloc_one_tail (kmem_cache_t *cachep,
1703                                                 struct slab *slabp)
1704 {
1705         void *objp;
1706
1707         check_spinlock_acquired(cachep);
1708
1709         STATS_INC_ALLOCED(cachep);
1710         STATS_INC_ACTIVE(cachep);
1711         STATS_SET_HIGH(cachep);
1712
1713         /* get obj pointer */
1714         slabp->inuse++;
1715         objp = slabp->s_mem + slabp->free*cachep->objsize;
1716         slabp->free=slab_bufctl(slabp)[slabp->free];
1717
1718         return objp;
1719 }
1720
1721 static inline void cache_alloc_listfixup(struct kmem_list3 *l3, struct slab *slabp)
1722 {
1723         list_del(&slabp->list);
1724         if (slabp->free == BUFCTL_END) {
1725                 list_add(&slabp->list, &l3->slabs_full);
1726         } else {
1727                 list_add(&slabp->list, &l3->slabs_partial);
1728         }
1729 }
1730
1731 static void* cache_alloc_refill(kmem_cache_t* cachep, int flags)
1732 {
1733         int batchcount;
1734         struct kmem_list3 *l3;
1735         struct array_cache *ac;
1736
1737         check_irq_off();
1738         ac = ac_data(cachep);
1739 retry:
1740         batchcount = ac->batchcount;
1741         if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
1742                 /* if there was little recent activity on this
1743                  * cache, then perform only a partial refill.
1744                  * Otherwise we could generate refill bouncing.
1745                  */
1746                 batchcount = BATCHREFILL_LIMIT;
1747         }
1748         l3 = list3_data(cachep);
1749
1750         BUG_ON(ac->avail > 0);
1751         spin_lock(&cachep->spinlock);
1752         if (l3->shared) {
1753                 struct array_cache *shared_array = l3->shared;
1754                 if (shared_array->avail) {
1755                         if (batchcount > shared_array->avail)
1756                                 batchcount = shared_array->avail;
1757                         shared_array->avail -= batchcount;
1758                         ac->avail = batchcount;
1759                         memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
1760                                         sizeof(void*)*batchcount);
1761                         shared_array->touched = 1;
1762                         goto alloc_done;
1763                 }
1764         }
1765         while (batchcount > 0) {
1766                 struct list_head *entry;
1767                 struct slab *slabp;
1768                 /* Get slab alloc is to come from. */
1769                 entry = l3->slabs_partial.next;
1770                 if (entry == &l3->slabs_partial) {
1771                         l3->free_touched = 1;
1772                         entry = l3->slabs_free.next;
1773                         if (entry == &l3->slabs_free)
1774                                 goto must_grow;
1775                 }
1776
1777                 slabp = list_entry(entry, struct slab, list);
1778                 check_slabp(cachep, slabp);
1779                 while (slabp->inuse < cachep->num && batchcount--)
1780                         ac_entry(ac)[ac->avail++] =
1781                                 cache_alloc_one_tail(cachep, slabp);
1782                 check_slabp(cachep, slabp);
1783                 cache_alloc_listfixup(l3, slabp);
1784         }
1785
1786 must_grow:
1787         l3->free_objects -= ac->avail;
1788 alloc_done:
1789         spin_unlock(&cachep->spinlock);
1790
1791         if (unlikely(!ac->avail)) {
1792                 int x;
1793                 x = cache_grow(cachep, flags);
1794
1795                 // cache_grow can reenable interrupts, then ac could change.
1796                 ac = ac_data(cachep);
1797                 if (!x && ac->avail == 0)       // no objects in sight? abort
1798                         return NULL;
1799
1800                 if (!ac->avail)         // objects refilled by interrupt?
1801                         goto retry;
1802         }
1803         ac->touched = 1;
1804         return ac_entry(ac)[--ac->avail];
1805 }
1806
1807 static inline void
1808 cache_alloc_debugcheck_before(kmem_cache_t *cachep, int flags)
1809 {
1810         if (flags & __GFP_WAIT)
1811                 might_sleep();
1812 #if DEBUG
1813         kmem_flagcheck(cachep, flags);
1814 #endif
1815 }
1816
1817 static inline void *
1818 cache_alloc_debugcheck_after(kmem_cache_t *cachep,
1819                         unsigned long flags, void *objp, void *caller)
1820 {
1821 #if DEBUG
1822         int objlen = cachep->objsize;
1823
1824         if (!objp)
1825                 return objp;
1826         if (cachep->flags & SLAB_POISON) {
1827 #ifdef CONFIG_DEBUG_PAGEALLOC
1828                 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
1829                         kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
1830                 else
1831                         check_poison_obj(cachep, objp);
1832 #else
1833                 check_poison_obj(cachep, objp);
1834 #endif
1835                 poison_obj(cachep, objp, POISON_BEFORE);
1836         }
1837         if (cachep->flags & SLAB_STORE_USER) {
1838                 objlen -= BYTES_PER_WORD;
1839                 *((void **)(objp+objlen)) = caller;
1840         }
1841
1842         if (cachep->flags & SLAB_RED_ZONE) {
1843                 /* Set alloc red-zone, and check old one. */
1844                 if (xchg((unsigned long *)objp, RED_ACTIVE) != RED_INACTIVE) {
1845                         slab_error(cachep, "memory before object was "
1846                                                 "overwritten");
1847                 }
1848                 if (xchg((unsigned long *)(objp+objlen - BYTES_PER_WORD),
1849                                         RED_ACTIVE) != RED_INACTIVE) {
1850                         slab_error(cachep, "memory after object was "
1851                                                 "overwritten");
1852                 }
1853                 objp += BYTES_PER_WORD;
1854         }
1855         if (cachep->ctor && cachep->flags & SLAB_POISON) {
1856                 unsigned long   ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1857
1858                 if (!(flags & __GFP_WAIT))
1859                         ctor_flags |= SLAB_CTOR_ATOMIC;
1860
1861                 cachep->ctor(objp, cachep, ctor_flags);
1862         }
1863 #endif
1864         return objp;
1865 }
1866
1867
1868 static inline void * __cache_alloc (kmem_cache_t *cachep, int flags)
1869 {
1870         unsigned long save_flags;
1871         void* objp;
1872         struct array_cache *ac;
1873
1874         cache_alloc_debugcheck_before(cachep, flags);
1875
1876         local_irq_save(save_flags);
1877         ac = ac_data(cachep);
1878         if (likely(ac->avail)) {
1879                 STATS_INC_ALLOCHIT(cachep);
1880                 ac->touched = 1;
1881                 objp = ac_entry(ac)[--ac->avail];
1882         } else {
1883                 STATS_INC_ALLOCMISS(cachep);
1884                 objp = cache_alloc_refill(cachep, flags);
1885         }
1886         local_irq_restore(save_flags);
1887         objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0));
1888         return objp;
1889 }
1890
1891 /*
1892  * NUMA: different approach needed if the spinlock is moved into
1893  * the l3 structure
1894  */
1895
1896 static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
1897 {
1898         int i;
1899
1900         check_spinlock_acquired(cachep);
1901
1902         /* NUMA: move add into loop */
1903         cachep->lists.free_objects += nr_objects;
1904
1905         for (i = 0; i < nr_objects; i++) {
1906                 void *objp = objpp[i];
1907                 struct slab *slabp;
1908                 unsigned int objnr;
1909
1910                 slabp = GET_PAGE_SLAB(virt_to_page(objp));
1911                 list_del(&slabp->list);
1912                 objnr = (objp - slabp->s_mem) / cachep->objsize;
1913                 check_slabp(cachep, slabp);
1914                 slab_bufctl(slabp)[objnr] = slabp->free;
1915                 slabp->free = objnr;
1916                 STATS_DEC_ACTIVE(cachep);
1917                 slabp->inuse--;
1918                 check_slabp(cachep, slabp);
1919
1920                 /* fixup slab chains */
1921                 if (slabp->inuse == 0) {
1922                         if (cachep->lists.free_objects > cachep->free_limit) {
1923                                 cachep->lists.free_objects -= cachep->num;
1924                                 slab_destroy(cachep, slabp);
1925                         } else {
1926                                 list_add(&slabp->list,
1927                                 &list3_data_ptr(cachep, objp)->slabs_free);
1928                         }
1929                 } else {
1930                         /* Unconditionally move a slab to the end of the
1931                          * partial list on free - maximum time for the
1932                          * other objects to be freed, too.
1933                          */
1934                         list_add_tail(&slabp->list,
1935                                 &list3_data_ptr(cachep, objp)->slabs_partial);
1936                 }
1937         }
1938 }
1939
1940 static void cache_flusharray (kmem_cache_t* cachep, struct array_cache *ac)
1941 {
1942         int batchcount;
1943
1944         batchcount = ac->batchcount;
1945 #if DEBUG
1946         BUG_ON(!batchcount || batchcount > ac->avail);
1947 #endif
1948         check_irq_off();
1949         spin_lock(&cachep->spinlock);
1950         if (cachep->lists.shared) {
1951                 struct array_cache *shared_array = cachep->lists.shared;
1952                 int max = shared_array->limit-shared_array->avail;
1953                 if (max) {
1954                         if (batchcount > max)
1955                                 batchcount = max;
1956                         memcpy(&ac_entry(shared_array)[shared_array->avail],
1957                                         &ac_entry(ac)[0],
1958                                         sizeof(void*)*batchcount);
1959                         shared_array->avail += batchcount;
1960                         goto free_done;
1961                 }
1962         }
1963
1964         free_block(cachep, &ac_entry(ac)[0], batchcount);
1965 free_done:
1966 #if STATS
1967         {
1968                 int i = 0;
1969                 struct list_head *p;
1970
1971                 p = list3_data(cachep)->slabs_free.next;
1972                 while (p != &(list3_data(cachep)->slabs_free)) {
1973                         struct slab *slabp;
1974
1975                         slabp = list_entry(p, struct slab, list);
1976                         BUG_ON(slabp->inuse);
1977
1978                         i++;
1979                         p = p->next;
1980                 }
1981                 STATS_SET_FREEABLE(cachep, i);
1982         }
1983 #endif
1984         spin_unlock(&cachep->spinlock);
1985         ac->avail -= batchcount;
1986         memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
1987                         sizeof(void*)*ac->avail);
1988 }
1989
1990 /*
1991  * __cache_free
1992  * Release an obj back to its cache. If the obj has a constructed
1993  * state, it must be in this state _before_ it is released.
1994  *
1995  * Called with disabled ints.
1996  */
1997 static inline void __cache_free (kmem_cache_t *cachep, void* objp)
1998 {
1999         struct array_cache *ac = ac_data(cachep);
2000
2001         check_irq_off();
2002         objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2003
2004         if (likely(ac->avail < ac->limit)) {
2005                 STATS_INC_FREEHIT(cachep);
2006                 ac_entry(ac)[ac->avail++] = objp;
2007                 return;
2008         } else {
2009                 STATS_INC_FREEMISS(cachep);
2010                 cache_flusharray(cachep, ac);
2011                 ac_entry(ac)[ac->avail++] = objp;
2012         }
2013 }
2014
2015 /**
2016  * kmem_cache_alloc - Allocate an object
2017  * @cachep: The cache to allocate from.
2018  * @flags: See kmalloc().
2019  *
2020  * Allocate an object from this cache.  The flags are only relevant
2021  * if the cache has no available objects.
2022  */
2023 void * kmem_cache_alloc (kmem_cache_t *cachep, int flags)
2024 {
2025         return __cache_alloc(cachep, flags);
2026 }
2027
2028 /**
2029  * kmalloc - allocate memory
2030  * @size: how many bytes of memory are required.
2031  * @flags: the type of memory to allocate.
2032  *
2033  * kmalloc is the normal method of allocating memory
2034  * in the kernel.
2035  *
2036  * The @flags argument may be one of:
2037  *
2038  * %GFP_USER - Allocate memory on behalf of user.  May sleep.
2039  *
2040  * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
2041  *
2042  * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
2043  *
2044  * Additionally, the %GFP_DMA flag may be set to indicate the memory
2045  * must be suitable for DMA.  This can mean different things on different
2046  * platforms.  For example, on i386, it means that the memory must come
2047  * from the first 16MB.
2048  */
2049 void * __kmalloc (size_t size, int flags)
2050 {
2051         struct cache_sizes *csizep = malloc_sizes;
2052
2053         for (; csizep->cs_size; csizep++) {
2054                 if (size > csizep->cs_size)
2055                         continue;
2056 #if DEBUG
2057                 /* This happens if someone tries to call
2058                  * kmem_cache_create(), or kmalloc(), before
2059                  * the generic caches are initialized.
2060                  */
2061                 BUG_ON(csizep->cs_cachep == NULL);
2062 #endif
2063                 return __cache_alloc(flags & GFP_DMA ?
2064                          csizep->cs_dmacachep : csizep->cs_cachep, flags);
2065         }
2066         return NULL;
2067 }
2068
#ifdef CONFIG_SMP
/**
 * __alloc_percpu - allocate one zeroed copy of the object for every
 * possible cpu in the system.
 * Objects should be dereferenced using per_cpu_ptr/get_cpu_ptr
 * macros only.
 *
 * @size: how many bytes of memory are required.
 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
 *         (Not referenced by this implementation.)
 *
 * Returns the bitwise complement of the bookkeeping structure's address,
 * or NULL if any allocation fails; on failure everything allocated so far
 * is released again.
 */
void *__alloc_percpu(size_t size, size_t align)
{
	struct percpu_data *pdata;
	int cpu;

	pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
	if (pdata == NULL)
		return NULL;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu_possible(cpu)) {
			pdata->ptrs[cpu] = kmalloc(size, GFP_KERNEL);
			if (pdata->ptrs[cpu] == NULL)
				goto unwind_oom;
			memset(pdata->ptrs[cpu], 0, size);
		}
	}

	/* Catch derefs w/o wrappers */
	return (void *) (~(unsigned long) pdata);

unwind_oom:
	/* Release the copies of all cpus handled before the failing one. */
	for (cpu--; cpu >= 0; cpu--) {
		if (cpu_possible(cpu))
			kfree(pdata->ptrs[cpu]);
	}
	kfree(pdata);
	return NULL;
}
#endif
2109
2110 /**
2111  * kmem_cache_free - Deallocate an object
2112  * @cachep: The cache the allocation was from.
2113  * @objp: The previously allocated object.
2114  *
2115  * Free an object which was previously allocated from this
2116  * cache.
2117  */
2118 void kmem_cache_free (kmem_cache_t *cachep, void *objp)
2119 {
2120         unsigned long flags;
2121
2122         local_irq_save(flags);
2123         __cache_free(cachep, objp);
2124         local_irq_restore(flags);
2125 }
2126
2127 /**
2128  * kfree - free previously allocated memory
2129  * @objp: pointer returned by kmalloc.
2130  *
2131  * Don't free memory not originally allocated by kmalloc()
2132  * or you will run into trouble.
2133  */
2134 void kfree (const void *objp)
2135 {
2136         kmem_cache_t *c;
2137         unsigned long flags;
2138
2139         if (!objp)
2140                 return;
2141         local_irq_save(flags);
2142         kfree_debugcheck(objp);
2143         c = GET_PAGE_CACHE(virt_to_page(objp));
2144         __cache_free(c, (void*)objp);
2145         local_irq_restore(flags);
2146 }
2147
#ifdef CONFIG_SMP
/**
 * free_percpu - free previously allocated percpu memory
 * @objp: pointer returned by alloc_percpu.
 *
 * Releases the copy belonging to every possible cpu and then the
 * bookkeeping structure that holds the per-cpu pointers.
 *
 * Don't free memory not originally allocated by alloc_percpu()
 * The complemented objp is to check for that.
 */
void
free_percpu(const void *objp)
{
	int i;
	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);

	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_possible(i))
			continue;
		kfree(p->ptrs[i]);
	}
	/*
	 * The pointer table itself was kmalloc'ed by __alloc_percpu();
	 * without this it would be leaked on every free_percpu() call.
	 */
	kfree(p);
}
#endif
2169
2170 unsigned int kmem_cache_size(kmem_cache_t *cachep)
2171 {
2172         return cachep->objsize-obj_dbglen(cachep);
2173 }
2174
2175 kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
2176 {
2177         struct cache_sizes *csizep = malloc_sizes;
2178
2179         /* This function could be moved to the header file, and
2180          * made inline so consumers can quickly determine what
2181          * cache pointer they require.
2182          */
2183         for ( ; csizep->cs_size; csizep++) {
2184                 if (size > csizep->cs_size)
2185                         continue;
2186                 break;
2187         }
2188         return (gfpflags & GFP_DMA) ? csizep->cs_dmacachep : csizep->cs_cachep;
2189 }
2190
/*
 * Argument block passed from do_tune_cpucache() to do_ccupdate_local().
 * On entry new[] holds the replacement array cache for each cpu; each cpu
 * that runs do_ccupdate_local() swaps its slot, so afterwards that slot
 * holds the cpu's previous array cache.
 */
2191 struct ccupdate_struct {
        /* cache whose per-cpu array caches are being replaced */
2192         kmem_cache_t *cachep;
        /* per-cpu slots: replacement array in, previous array out */
2193         struct array_cache *new[NR_CPUS];
2194 };
2195
2196 static void do_ccupdate_local(void *info)
2197 {
2198         struct ccupdate_struct *new = (struct ccupdate_struct *)info;
2199         struct array_cache *old;
2200
2201         check_irq_off();
2202         old = ac_data(new->cachep);
2203
2204         new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
2205         new->new[smp_processor_id()] = old;
2206 }
2207
2208
2209 static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount, int shared)
2210 {
2211         struct ccupdate_struct new;
2212         struct array_cache *new_shared;
2213         int i;
2214
2215         memset(&new.new,0,sizeof(new.new));
2216         for (i = 0; i < NR_CPUS; i++) {
2217                 struct array_cache *ccnew;
2218
2219                 ccnew = kmalloc(sizeof(void*)*limit+
2220                                 sizeof(struct array_cache), GFP_KERNEL);
2221                 if (!ccnew) {
2222                         for (i--; i >= 0; i--) kfree(new.new[i]);
2223                         return -ENOMEM;
2224                 }
2225                 ccnew->avail = 0;
2226                 ccnew->limit = limit;
2227                 ccnew->batchcount = batchcount;
2228                 ccnew->touched = 0;
2229                 new.new[i] = ccnew;
2230         }
2231         new.cachep = cachep;
2232
2233         smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
2234
2235         check_irq_on();
2236         spin_lock_irq(&cachep->spinlock);
2237         cachep->batchcount = batchcount;
2238         cachep->limit = limit;
2239         cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
2240         spin_unlock_irq(&cachep->spinlock);
2241
2242         for (i = 0; i < NR_CPUS; i++) {
2243                 struct array_cache *ccold = new.new[i];
2244                 if (!ccold)
2245                         continue;
2246                 spin_lock_irq(&cachep->spinlock);
2247                 free_block(cachep, ac_entry(ccold), ccold->avail);
2248                 spin_unlock_irq(&cachep->spinlock);
2249                 kfree(ccold);
2250         }
2251         new_shared = kmalloc(sizeof(void*)*batchcount*shared+
2252                                 sizeof(struct array_cache), GFP_KERNEL);
2253         if (new_shared) {
2254                 struct array_cache *old;
2255                 new_shared->avail = 0;
2256                 new_shared->limit = batchcount*shared;
2257                 new_shared->batchcount = 0xbaadf00d;
2258                 new_shared->touched = 0;
2259
2260                 spin_lock_irq(&cachep->spinlock);
2261                 old = cachep->lists.shared;
2262                 cachep->lists.shared = new_shared;
2263                 if (old)
2264                         free_block(cachep, ac_entry(old), old->avail);
2265                 spin_unlock_irq(&cachep->spinlock);
2266                 kfree(old);
2267         }
2268
2269         return 0;
2270 }
2271
2272
2273 static void enable_cpucache (kmem_cache_t *cachep)
2274 {
2275         int err;
2276         int limit, shared;
2277
2278         /* The head array serves three purposes:
2279          * - create a LIFO ordering, i.e. return objects that are cache-warm
2280          * - reduce the number of spinlock operations.
2281          * - reduce the number of linked list operations on the slab and
2282          *   bufctl chains: array operations are cheaper.
2283          * The numbers are guessed, we should auto-tune as described by
2284          * Bonwick.
2285          */
2286         if (cachep->objsize > 131072)
2287                 limit = 1;
2288         else if (cachep->objsize > PAGE_SIZE)
2289                 limit = 8;
2290         else if (cachep->objsize > 1024)
2291                 limit = 24;
2292         else if (cachep->objsize > 256)
2293                 limit = 54;
2294         else
2295                 limit = 120;
2296
2297         /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
2298          * allocation behaviour: Most allocs on one cpu, most free operations
2299          * on another cpu. For these cases, an efficient object passing between
2300          * cpus is necessary. This is provided by a shared array. The array
2301          * replaces Bonwick's magazine layer.
2302          * On uniprocessor, it's functionally equivalent (but less efficient)
2303          * to a larger limit. Thus disabled by default.
2304          */
2305         shared = 0;
2306 #ifdef CONFIG_SMP
2307         if (cachep->objsize <= PAGE_SIZE)
2308                 shared = 8;
2309 #endif
2310
2311 #if DEBUG
2312         /* With debugging enabled, large batchcount lead to excessively
2313          * long periods with disabled local interrupts. Limit the
2314          * batchcount
2315          */
2316         if (limit > 32)
2317                 limit = 32;
2318 #endif
2319         err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
2320         if (err)
2321                 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
2322                                         cachep->name, -err);
2323 }
2324
2325 static void drain_array(kmem_cache_t *cachep, struct array_cache *ac)
2326 {
2327         int tofree;
2328
2329         check_irq_off();
2330         if (ac->touched) {
2331                 ac->touched = 0;
2332         } else if (ac->avail) {
2333                 tofree = (ac->limit+4)/5;
2334                 if (tofree > ac->avail) {
2335                         tofree = (ac->avail+1)/2;
2336                 }
2337                 spin_lock(&cachep->spinlock);
2338                 free_block(cachep, ac_entry(ac), tofree);
2339                 spin_unlock(&cachep->spinlock);
2340                 ac->avail -= tofree;
2341                 memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
2342                                         sizeof(void*)*ac->avail);
2343         }
2344 }
2345
2346 static void drain_array_locked(kmem_cache_t *cachep,
2347                                 struct array_cache *ac, int force)
2348 {
2349         int tofree;
2350
2351         check_spinlock_acquired(cachep);
2352         if (ac->touched) {
2353                 ac->touched = 0;
2354         } else if (ac->avail) {
2355                 tofree = force ? ac->avail : (ac->limit+4)/5;
2356                 if (tofree > ac->avail) {
2357                         tofree = (ac->avail+1)/2;
2358                 }
2359                 free_block(cachep, ac_entry(ac), tofree);
2360                 ac->avail -= tofree;
2361                 memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
2362                                         sizeof(void*)*ac->avail);
2363         }
2364 }
2365
2366 /**
2367  * cache_reap - Reclaim memory from caches.
2368  *
2369  * Called from a timer, every few seconds
2370  * Purpose:
2371  * - clear the per-cpu caches for this CPU.
2372  * - return freeable pages to the main free memory pool.
2373  *
2374  * If we cannot acquire the cache chain semaphore then just give up - we'll
2375  * try again next timer interrupt.
2376  */
2377 static inline void cache_reap (void)
2378 {
2379         struct list_head *walk;
2380
2381 #if DEBUG
2382         BUG_ON(!in_interrupt());
2383         BUG_ON(in_irq());
2384 #endif
2385         if (down_trylock(&cache_chain_sem))
2386                 return;
2387
2388         list_for_each(walk, &cache_chain) {
2389                 kmem_cache_t *searchp;
2390                 struct list_head* p;
2391                 int tofree;
2392                 struct slab *slabp;
2393
2394                 searchp = list_entry(walk, kmem_cache_t, next);
2395
2396                 if (searchp->flags & SLAB_NO_REAP)
2397                         goto next;
2398
2399                 check_irq_on();
2400                 local_irq_disable();
2401                 drain_array(searchp, ac_data(searchp));
2402
2403                 if(time_after(searchp->lists.next_reap, jiffies))
2404                         goto next_irqon;
2405
2406                 spin_lock(&searchp->spinlock);
2407                 if(time_after(searchp->lists.next_reap, jiffies)) {
2408                         goto next_unlock;
2409                 }
2410                 searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
2411
2412                 if (searchp->lists.shared)
2413                         drain_array_locked(searchp, searchp->lists.shared, 0);
2414
2415                 if (searchp->lists.free_touched) {
2416                         searchp->lists.free_touched = 0;
2417                         goto next_unlock;
2418                 }
2419
2420                 tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
2421                 do {
2422                         p = list3_data(searchp)->slabs_free.next;
2423                         if (p == &(list3_data(searchp)->slabs_free))
2424                                 break;
2425
2426                         slabp = list_entry(p, struct slab, list);
2427                         BUG_ON(slabp->inuse);
2428                         list_del(&slabp->list);
2429                         STATS_INC_REAPED(searchp);
2430
2431                         /* Safe to drop the lock. The slab is no longer
2432                          * linked to the cache.
2433                          * searchp cannot disappear, we hold
2434                          * cache_chain_lock
2435                          */
2436                         searchp->lists.free_objects -= searchp->num;
2437                         spin_unlock_irq(&searchp->spinlock);
2438                         slab_destroy(searchp, slabp);
2439                         spin_lock_irq(&searchp->spinlock);
2440                 } while(--tofree > 0);
2441 next_unlock:
2442                 spin_unlock(&searchp->spinlock);
2443 next_irqon:
2444                 local_irq_enable();
2445 next:
2446                 ;
2447         }
2448         check_irq_on();
2449         up(&cache_chain_sem);
2450 }
2451
2452 /*
2453  * This is a timer handler.  There is on per CPU.  It is called periodially
2454  * to shrink this CPU's caches.  Otherwise there could be memory tied up
2455  * for long periods (or for ever) due to load changes.
2456  */
2457 static void reap_timer_fnc(unsigned long data)
2458 {
2459         int cpu = smp_processor_id();
2460         struct timer_list *rt = &__get_cpu_var(reap_timers);
2461
2462         cache_reap();
2463         mod_timer(rt, jiffies + REAPTIMEOUT_CPUC + cpu);
2464 }
2465
2466 #ifdef CONFIG_PROC_FS
2467
2468 static void *s_start(struct seq_file *m, loff_t *pos)
2469 {
2470         loff_t n = *pos;
2471         struct list_head *p;
2472
2473         down(&cache_chain_sem);
2474         if (!n) {
2475                 /*
2476                  * Output format version, so at least we can change it
2477                  * without _too_ many complaints.
2478                  */
2479 #if STATS
2480                 seq_puts(m, "slabinfo - version: 2.0 (statistics)\n");
2481 #else
2482                 seq_puts(m, "slabinfo - version: 2.0\n");
2483 #endif
2484                 seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
2485                 seq_puts(m, " : tunables <batchcount> <limit <sharedfactor>");
2486                 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
2487 #if STATS
2488                 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <freelimit>");
2489                 seq_puts(m, " : cpustat <allochit <allocmiss <freehit <freemiss>");
2490 #endif
2491                 seq_putc(m, '\n');
2492         }
2493         p = cache_chain.next;
2494         while (n--) {
2495                 p = p->next;
2496                 if (p == &cache_chain)
2497                         return NULL;
2498         }
2499         return list_entry(p, kmem_cache_t, next);
2500 }
2501
2502 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2503 {
2504         kmem_cache_t *cachep = p;
2505         ++*pos;
2506         return cachep->next.next == &cache_chain ? NULL
2507                 : list_entry(cachep->next.next, kmem_cache_t, next);
2508 }
2509
2510 static void s_stop(struct seq_file *m, void *p)
2511 {
2512         up(&cache_chain_sem);
2513 }
2514
2515 static int s_show(struct seq_file *m, void *p)
2516 {
2517         kmem_cache_t *cachep = p;
2518         struct list_head *q;
2519         struct slab     *slabp;
2520         unsigned long   active_objs;
2521         unsigned long   num_objs;
2522         unsigned long   active_slabs = 0;
2523         unsigned long   num_slabs;
2524         const char *name;
2525         char *error = NULL;
2526         mm_segment_t old_fs;
2527         char tmp;
2528
2529         check_irq_on();
2530         spin_lock_irq(&cachep->spinlock);
2531         active_objs = 0;
2532         num_slabs = 0;
2533         list_for_each(q,&cachep->lists.slabs_full) {
2534                 slabp = list_entry(q, struct slab, list);
2535                 if (slabp->inuse != cachep->num && !error)
2536                         error = "slabs_full accounting error";
2537                 active_objs += cachep->num;
2538                 active_slabs++;
2539         }
2540         list_for_each(q,&cachep->lists.slabs_partial) {
2541                 slabp = list_entry(q, struct slab, list);
2542                 if (slabp->inuse == cachep->num && !error)
2543                         error = "slabs_partial inuse accounting error";
2544                 if (!slabp->inuse && !error)
2545                         error = "slabs_partial/inuse accounting error";
2546                 active_objs += slabp->inuse;
2547                 active_slabs++;
2548         }
2549         list_for_each(q,&cachep->lists.slabs_free) {
2550                 slabp = list_entry(q, struct slab, list);
2551                 if (slabp->inuse && !error)
2552                         error = "slabs_free/inuse accounting error";
2553                 num_slabs++;
2554         }
2555         num_slabs+=active_slabs;
2556         num_objs = num_slabs*cachep->num;
2557         if (num_objs - active_objs != cachep->lists.free_objects && !error)
2558                 error = "free_objects accounting error";
2559
2560         name = cachep->name;
2561
2562         /*
2563          * Check to see if `name' resides inside a module which has been
2564          * unloaded (someone forgot to destroy their cache)
2565          */
2566         old_fs = get_fs();
2567         set_fs(KERNEL_DS);
2568         if (__get_user(tmp, name))
2569                 name = "broken";
2570         set_fs(old_fs);
2571
2572         if (error)
2573                 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
2574
2575         seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
2576                 name, active_objs, num_objs, cachep->objsize,
2577                 cachep->num, (1<<cachep->gfporder));
2578         seq_printf(m, " : tunables %4u %4u %4u",
2579                         cachep->limit, cachep->batchcount,
2580                         cachep->lists.shared->limit/cachep->batchcount);
2581         seq_printf(m, " : slabdata %6lu %6lu %6u",
2582                         active_slabs, num_slabs, cachep->lists.shared->avail);
2583 #if STATS
2584         {       /* list3 stats */
2585                 unsigned long high = cachep->high_mark;
2586                 unsigned long allocs = cachep->num_allocations;
2587                 unsigned long grown = cachep->grown;
2588                 unsigned long reaped = cachep->reaped;
2589                 unsigned long errors = cachep->errors;
2590                 unsigned long max_freeable = cachep->max_freeable;
2591                 unsigned long free_limit = cachep->free_limit;
2592
2593                 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu",
2594                                 allocs, high, grown, reaped, errors,
2595                                 max_freeable, free_limit);
2596         }
2597         /* cpu stats */
2598         {
2599                 unsigned long allochit = atomic_read(&cachep->allochit);
2600                 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
2601                 unsigned long freehit = atomic_read(&cachep->freehit);
2602                 unsigned long freemiss = atomic_read(&cachep->freemiss);
2603
2604                 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
2605                         allochit, allocmiss, freehit, freemiss);
2606         }
2607 #endif
2608         seq_putc(m, '\n');
2609         spin_unlock_irq(&cachep->spinlock);
2610         return 0;
2611 }
2612
2613 /*
2614  * slabinfo_op - iterator that generates /proc/slabinfo
2615  *
2616  * Output layout:
2617  * cache-name
2618  * num-active-objs
2619  * total-objs
2620  * object size
2621  * num-active-slabs
2622  * total-slabs
2623  * num-pages-per-slab
2624  * + further values on SMP and with statistics enabled
2625  */
2626
2627 struct seq_operations slabinfo_op = {
2628         .start  = s_start,
2629         .next   = s_next,
2630         .stop   = s_stop,
2631         .show   = s_show,
2632 };
2633
2634 #define MAX_SLABINFO_WRITE 128
2635 /**
2636  * slabinfo_write - Tuning for the slab allocator
2637  * @file: unused
2638  * @buffer: user buffer
2639  * @count: data len
2640  * @data: unused
2641  */
2642 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
2643                                 size_t count, loff_t *ppos)
2644 {
2645         char kbuf[MAX_SLABINFO_WRITE+1], *tmp;
2646         int limit, batchcount, shared, res;
2647         struct list_head *p;
2648
2649         if (count > MAX_SLABINFO_WRITE)
2650                 return -EINVAL;
2651         if (copy_from_user(&kbuf, buffer, count))
2652                 return -EFAULT;
2653         kbuf[MAX_SLABINFO_WRITE] = '\0';
2654
2655         tmp = strchr(kbuf, ' ');
2656         if (!tmp)
2657                 return -EINVAL;
2658         *tmp = '\0';
2659         tmp++;
2660         if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
2661                 return -EINVAL;
2662
2663         /* Find the cache in the chain of caches. */
2664         down(&cache_chain_sem);
2665         res = -EINVAL;
2666         list_for_each(p,&cache_chain) {
2667                 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
2668
2669                 if (!strcmp(cachep->name, kbuf)) {
2670                         if (limit < 1 ||
2671                             batchcount < 1 ||
2672                             batchcount > limit ||
2673                             shared < 0) {
2674                                 res = -EINVAL;
2675                         } else {
2676                                 res = do_tune_cpucache(cachep, limit, batchcount, shared);
2677                         }
2678                         break;
2679                 }
2680         }
2681         up(&cache_chain_sem);
2682         if (res >= 0)
2683                 res = count;
2684         return res;
2685 }
2686 #endif
2687
2688 unsigned int ksize(const void *objp)
2689 {
2690         kmem_cache_t *c;
2691         unsigned long flags;
2692         unsigned int size = 0;
2693
2694         if (likely(objp != NULL)) {
2695                 local_irq_save(flags);
2696                 c = GET_PAGE_CACHE(virt_to_page(objp));
2697                 size = kmem_cache_size(c);
2698                 local_irq_restore(flags);
2699         }
2700
2701         return size;
2702 }
2703
2704 void ptrinfo(unsigned long addr)
2705 {
2706         struct page *page;
2707
2708         printk("Dumping data about address %p.\n", (void*)addr);
2709         if (!virt_addr_valid((void*)addr)) {
2710                 printk("virt addr invalid.\n");
2711                 return;
2712         }
2713         do {
2714                 pgd_t *pgd = pgd_offset_k(addr);
2715                 pmd_t *pmd;
2716                 if (pgd_none(*pgd)) {
2717                         printk("No pgd.\n");
2718                         break;
2719                 }
2720                 pmd = pmd_offset(pgd, addr);
2721                 if (pmd_none(*pmd)) {
2722                         printk("No pmd.\n");
2723                         break;
2724                 }
2725 #ifdef CONFIG_X86
2726                 if (pmd_large(*pmd)) {
2727                         printk("Large page.\n");
2728                         break;
2729                 }
2730 #endif
2731                 printk("normal page, pte_val 0x%llx\n",
2732                   (unsigned long long)pte_val(*pte_offset_kernel(pmd, addr)));
2733         } while(0);
2734
2735         page = virt_to_page((void*)addr);
2736         printk("struct page at %p, flags %lxh.\n", page, page->flags);
2737         if (PageSlab(page)) {
2738                 kmem_cache_t *c;
2739                 struct slab *s;
2740                 unsigned long flags;
2741                 int objnr;
2742                 void *objp;
2743
2744                 c = GET_PAGE_CACHE(page);
2745                 printk("belongs to cache %s.\n",c->name);
2746
2747                 spin_lock_irqsave(&c->spinlock, flags);
2748                 s = GET_PAGE_SLAB(page);
2749                 printk("slabp %p with %d inuse objects (from %d).\n",
2750                         s, s->inuse, c->num);
2751                 check_slabp(c,s);
2752
2753                 objnr = (addr-(unsigned long)s->s_mem)/c->objsize;
2754                 objp = s->s_mem+c->objsize*objnr;
2755                 printk("points into object no %d, starting at %p, len %d.\n",
2756                         objnr, objp, c->objsize);
2757                 if (objnr >= c->num) {
2758                         printk("Bad obj number.\n");
2759                 } else {
2760                         kernel_map_pages(virt_to_page(objp), c->objsize/PAGE_SIZE, 1);
2761
2762                         printk("redzone: %lxh/%lxh/%lxh.\n",
2763                                 ((unsigned long*)objp)[0],
2764                                 ((unsigned long*)(objp+c->objsize))[-2],
2765                                 ((unsigned long*)(objp+c->objsize))[-1]);
2766                 }
2767                 spin_unlock_irqrestore(&c->spinlock, flags);
2768
2769         }
2770 }