1 /*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
8 * 11 April '97. Started multi-threading - markhe
9 * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
10 * The sem is only needed when accessing/extending the cache-chain, which
11 * can never happen inside an interrupt (kmem_cache_create(),
12 * kmem_cache_shrink() and kmem_cache_reap()).
13 * This is a medium-term exclusion lock.
15 * Each cache has its own lock; 'c_spinlock'. This lock is needed only
16 * when accessing non-constant members of a cache-struct.
17 * Note: 'constant members' are assigned a value in kmem_cache_create() before
18 * the cache is linked into the cache-chain. The values never change, so not
19 * even a multi-reader lock is needed for these members.
20 * The c_spinlock is only ever held for a few cycles.
22 * To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which
23 * may be sleeping and therefore not holding the semaphore/lock), the
24 * c_growing field is used. This also prevents reaping from a cache.
26 * Note, caches may be shrunk or destroyed. When a sub-system (eg module) has
27 * finished with a cache it can shrink it (leaving the cache empty but enabled
28 * for re-use, eg. during a module re-load), or destroy it (see kmem_cache_destroy()).
30 * Notes:
31 * o Constructors/destructors are called while the cache-lock
32 * is _not_ held. Therefore they _must_ be threaded.
33 * o Constructors must not attempt to allocate memory from the
34 * same cache that they are a constructor for - infinite loop!
35 * (There is no easy way to trap this.)
36 * o The per-cache locks must be obtained with local-interrupts disabled.
37 * o When compiled with debug support, and an object-verify (upon release)
38 * is requested for a cache, the verify-function is called with the cache
39 * lock held. This helps debugging.
40 * o The functions called from try_to_free_page() must not attempt
41 * to allocate memory from a cache which is being grown.
42 * The buffer sub-system might try to allocate memory, via buffer_cachep.
43 * As this priority is passed to the SLAB, and then (if necessary) onto the
44 * gfp() funcs (which avoid calling try_to_free_page()), no deadlock
45 * should happen.
47 * The positioning of the per-cache lock is tricky. If the lock is
48 * placed on the same h/w cache line as commonly accessed members
49 * the number of L1 cache-line faults is reduced. However, this can
50 * lead to the cache-line ping-ponging between processors when the
51 * lock is in contention (and the common members are being accessed).
52 * Decided to keep it away from common members.
54 * More fine-graining is possible, with per-slab locks...but this might be
55 * taking fine-graining too far. It would have the advantage that
56 * during most allocs/frees no writes occur to the cache-struct.
57 * Therefore a multi-reader/one writer lock could be used (the writer
58 * needed when the slab chain is being link/unlinked).
59 * As we would not have an exclusion lock for the cache-structure, one
60 * would be needed per-slab (for updating s_free ptr, and/or the contents
61 * of s_index).
62 * The above locking would allow parallel operations to different slabs within
63 * the same cache with reduced spinning.
65 * Per-engine slab caches, backed by a global cache (as in Mach's Zone allocator),
66 * would allow most allocations from the same cache to execute in parallel.
68 * At present, each engine can be growing a cache. This should be blocked.
70 * It is not currently 100% safe to examine the page_struct outside of a kernel
71 * or global cli lock. The risk is v. small, and non-fatal.
73 * Calls to printk() are not 100% safe (the function is not threaded). However,
74 * printk() is only used under an error condition, and the risk is v. small (not
75 * sure if the console write functions 'enjoy' executing multiple contexts in
76 * parallel. I guess they don't...).
77 * Note, for most calls to printk() any held cache-lock is dropped. This is not
78 * always done, for text size reasons - having *_unlock() everywhere is bloat.
82 * An implementation of the Slab Allocator as described in outline in;
83 * UNIX Internals: The New Frontiers by Uresh Vahalia
84 * Pub: Prentice Hall ISBN 0-13-101908-2
85 * or with a little more detail in;
86 * The Slab Allocator: An Object-Caching Kernel Memory Allocator
87 * Jeff Bonwick (Sun Microsystems).
88 * Presented at: USENIX Summer 1994 Technical Conference
92 * This implementation deviates from Bonwick's paper as it
93 * does not use a hash-table for large objects, but rather a per slab
94 * index to hold the bufctls. This allows the bufctl structure to
95 * be small (one word), but limits the number of objects a slab (not
96 * a cache) can contain when off-slab bufctls are used. The limit is the
97 * size of the largest general cache that does not use off-slab bufctls,
98 * divided by the size of a bufctl. For 32bit archs, this is 256/4 = 64.
99 * This is not serious, as it is only for large objects, when it is unwise
100 * to have too many per slab.
101 * Note: This limit can be raised by introducing a general cache whose size
102 * is less than 512 (PAGE_SIZE>>3), but greater than 256.
105 #include <linux/config.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/init.h>
110 /* If there is a different PAGE_SIZE around, and it works with this allocator,
111 * then change the following.
113 #if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096 && PAGE_SIZE != 16384 && PAGE_SIZE != 32768)
114 #error Your page size is probably not correctly supported - please check
115 #endif
117 /* SLAB_MGMT_CHECKS - 1 to enable extra checks in kmem_cache_create().
118 * 0 if you wish to reduce memory usage.
120 * SLAB_DEBUG_SUPPORT - 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE,
121 * SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISON.
122 * 0 for faster, smaller, code (especially in the critical paths).
124 * SLAB_STATS - 1 to collect stats for /proc/slabinfo.
125 * 0 for faster, smaller, code (especially in the critical paths).
127 * SLAB_SELFTEST - 1 to perform a few tests, mainly for development.
129 #define SLAB_MGMT_CHECKS 1
130 #define SLAB_DEBUG_SUPPORT 1
131 #define SLAB_STATS 0
132 #define SLAB_SELFTEST 0
134 /* Shouldn't this be in a header file somewhere? */
135 #define BYTES_PER_WORD sizeof(void *)
137 /* Legal flag mask for kmem_cache_create(). */
138 #if SLAB_DEBUG_SUPPORT
139 #if 0
140 #define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
141 SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP| \
142 SLAB_HIGH_PACK)
143 #endif
144 #define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
145 SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
146 #else
147 #if 0
148 #define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP|SLAB_HIGH_PACK)
149 #endif
150 #define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
151 #endif /* SLAB_DEBUG_SUPPORT */
153 /* Slab management struct.
154 * Manages the objs in a slab. Placed either at the end of mem allocated
155 * for a slab, or from an internal obj cache (cache_slabp).
156 * Slabs are chained into a partially ordered list; fully used first, partial
157 * next, and then fully free slabs.
158 * The first 4 members are referenced during an alloc/free operation, and
159 * should always appear on the same cache line.
160 * Note: The offset between some members _must_ match offsets within
161 * the kmem_cache_t - see kmem_cache_init() for the checks. */
163 #define SLAB_OFFSET_BITS 16 /* could make this larger for 64bit archs */
165 typedef struct kmem_slab_s {
166 struct kmem_bufctl_s *s_freep; /* ptr to first inactive obj in slab */
167 struct kmem_bufctl_s *s_index;
168 unsigned long s_magic;
169 unsigned long s_inuse; /* num of objs active in slab */
171 struct kmem_slab_s *s_nextp;
172 struct kmem_slab_s *s_prevp;
173 void *s_mem; /* addr of first obj in slab */
174 unsigned long s_offset:SLAB_OFFSET_BITS,
175 s_dma:1;
176 } kmem_slab_t;
178 /* When the slab management is on-slab, this gives the size to use. */
179 #define slab_align_size (L1_CACHE_ALIGN(sizeof(kmem_slab_t)))
181 /* Test for end of slab chain. */
182 #define kmem_slab_end(x) ((kmem_slab_t*)&((x)->c_offset))
184 /* s_magic */
185 #define SLAB_MAGIC_ALLOC 0xA5C32F2BUL /* slab is alive */
186 #define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destroyed */
188 /* Bufctl's are used for linking objs within a slab, identifying what slab an obj
189 * is in, and holding the address of the associated obj (for sanity checking with off-slab
190 * bufctls). What a bufctl contains depends upon the state of the obj and
191 * the organisation of the cache.
193 typedef struct kmem_bufctl_s {
194 union {
195 struct kmem_bufctl_s *buf_nextp;
196 kmem_slab_t *buf_slabp; /* slab for obj */
197 void * buf_objp;
198 } u;
199 } kmem_bufctl_t;
201 /* ...shorthand... */
202 #define buf_nextp u.buf_nextp
203 #define buf_slabp u.buf_slabp
204 #define buf_objp u.buf_objp
206 #if SLAB_DEBUG_SUPPORT
207 /* Magic nums for obj red zoning.
208 * Placed in the first word before and the first word after an obj.
210 #define SLAB_RED_MAGIC1 0x5A2CF071UL /* when obj is active */
211 #define SLAB_RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */
213 /* ...and for poisoning */
214 #define SLAB_POISON_BYTE 0x5a /* byte value for poisoning */
215 #define SLAB_POISON_END 0xa5 /* end-byte of poisoning */
217 #endif /* SLAB_DEBUG_SUPPORT */
219 #define SLAB_CACHE_NAME_LEN 20 /* max name length for a slab cache */
221 /* Cache struct - manages a cache.
222 * First four members are commonly referenced during an alloc/free operation.
224 struct kmem_cache_s {
225 kmem_slab_t *c_freep; /* first slab w. free objs */
226 unsigned long c_flags; /* constant flags */
227 unsigned long c_offset;
228 unsigned long c_num; /* # of objs per slab */
230 unsigned long c_magic;
231 unsigned long c_inuse; /* kept at zero */
232 kmem_slab_t *c_firstp; /* first slab in chain */
233 kmem_slab_t *c_lastp; /* last slab in chain */
235 spinlock_t c_spinlock;
236 unsigned long c_growing;
237 unsigned long c_dflags; /* dynamic flags */
238 size_t c_org_size;
239 unsigned long c_gfporder; /* order of pgs per slab (2^n) */
240 void (*c_ctor)(void *, kmem_cache_t *, unsigned long); /* constructor func */
241 void (*c_dtor)(void *, kmem_cache_t *, unsigned long); /* destructor func */
242 unsigned long c_align; /* alignment of objs */
243 size_t c_colour; /* cache colouring range */
244 size_t c_colour_next;/* cache colouring */
245 unsigned long c_failures;
246 char c_name[SLAB_CACHE_NAME_LEN];
247 struct kmem_cache_s *c_nextp;
248 kmem_cache_t *c_index_cachep;
249 #if SLAB_STATS
250 unsigned long c_num_active;
251 unsigned long c_num_allocations;
252 unsigned long c_high_mark;
253 unsigned long c_grown;
254 unsigned long c_reaped;
255 atomic_t c_errors;
256 #endif /* SLAB_STATS */
259 /* internal c_flags */
260 #define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */
261 #define SLAB_CFLGS_BUFCTL 0x020000UL /* bufctls in own cache */
262 #define SLAB_CFLGS_GENERAL 0x080000UL /* a general cache */
264 /* c_dflags (dynamic flags). Need to hold the spinlock to access this member */
265 #define SLAB_CFLGS_GROWN 0x000002UL /* don't reap a recently grown */
267 #define SLAB_OFF_SLAB(x) ((x) & SLAB_CFLGS_OFF_SLAB)
268 #define SLAB_BUFCTL(x) ((x) & SLAB_CFLGS_BUFCTL)
269 #define SLAB_GROWN(x) ((x) & SLAB_CFLGS_GROWN)
271 #if SLAB_STATS
272 #define SLAB_STATS_INC_ACTIVE(x) ((x)->c_num_active++)
273 #define SLAB_STATS_DEC_ACTIVE(x) ((x)->c_num_active--)
274 #define SLAB_STATS_INC_ALLOCED(x) ((x)->c_num_allocations++)
275 #define SLAB_STATS_INC_GROWN(x) ((x)->c_grown++)
276 #define SLAB_STATS_INC_REAPED(x) ((x)->c_reaped++)
277 #define SLAB_STATS_SET_HIGH(x) do { if ((x)->c_num_active > (x)->c_high_mark) \
278 (x)->c_high_mark = (x)->c_num_active; \
279 } while (0)
280 #define SLAB_STATS_INC_ERR(x) (atomic_inc(&(x)->c_errors))
281 #else
282 #define SLAB_STATS_INC_ACTIVE(x)
283 #define SLAB_STATS_DEC_ACTIVE(x)
284 #define SLAB_STATS_INC_ALLOCED(x)
285 #define SLAB_STATS_INC_GROWN(x)
286 #define SLAB_STATS_INC_REAPED(x)
287 #define SLAB_STATS_SET_HIGH(x)
288 #define SLAB_STATS_INC_ERR(x)
289 #endif /* SLAB_STATS */
291 #if SLAB_SELFTEST
292 #if !SLAB_DEBUG_SUPPORT
293 #error Debug support needed for self-test
294 #endif
295 static void kmem_self_test(void);
296 #endif /* SLAB_SELFTEST */
298 /* c_magic - used to detect 'out of slabs' in __kmem_cache_alloc() */
299 #define SLAB_C_MAGIC 0x4F17A36DUL
301 /* maximum size of an obj (in 2^order pages) */
302 #define SLAB_OBJ_MAX_ORDER 5 /* 32 pages */
304 /* maximum num of pages for a slab (prevents large requests to the VM layer) */
305 #define SLAB_MAX_GFP_ORDER 5 /* 32 pages */
307 /* the 'preferred' minimum num of objs per slab - may be less for large objs */
308 #define SLAB_MIN_OBJS_PER_SLAB 4
310 /* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB,
311 * then the page order must be less than this before trying the next order.
313 #define SLAB_BREAK_GFP_ORDER_HI 2
314 #define SLAB_BREAK_GFP_ORDER_LO 1
315 static int slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_LO;
317 /* Macros for storing/retrieving the cachep and/or slab from the
318 * global 'mem_map'. With off-slab bufctls, these are used to find the
319 * slab an obj belongs to. With kmalloc(), and kfree(), these are used
320 * to find the cache which an obj belongs to.
322 #define SLAB_SET_PAGE_CACHE(pg,x) ((pg)->list.next = (struct list_head *)(x))
323 #define SLAB_GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->list.next)
324 #define SLAB_SET_PAGE_SLAB(pg,x) ((pg)->list.prev = (struct list_head *)(x))
325 #define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->list.prev)
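/* An illustrative sketch of how these macros are used to map an obj address
 * back to its owning cache/slab - see kfree() and __kmem_cache_free() below.
 * 'objp', 'pg', 'owner_cachep' and 'owner_slabp' are example names only.
 */
#if 0
struct page *pg = &mem_map[MAP_NR(objp)];
kmem_cache_t *owner_cachep = SLAB_GET_PAGE_CACHE(pg);
kmem_slab_t *owner_slabp = SLAB_GET_PAGE_SLAB(pg);
#endif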
327 /* Size description struct for general caches. */
328 typedef struct cache_sizes {
329 size_t cs_size;
330 kmem_cache_t *cs_cachep;
331 } cache_sizes_t;
333 static cache_sizes_t cache_sizes[] = {
334 #if PAGE_SIZE == 4096
335 { 32, NULL},
336 #endif
337 { 64, NULL},
338 { 128, NULL},
339 { 256, NULL},
340 { 512, NULL},
341 {1024, NULL},
342 {2048, NULL},
343 {4096, NULL},
344 {8192, NULL},
345 {16384, NULL},
346 {32768, NULL},
347 {65536, NULL},
348 {131072, NULL},
349 {0, NULL}
352 /* Names for the general caches. Not placed into the sizes struct for
353 * a good reason; the string ptr is not needed while searching in kmalloc(),
354 * and would 'get-in-the-way' in the h/w cache.
356 static char *cache_sizes_name[] = {
357 #if PAGE_SIZE == 4096
358 "size-32",
359 #endif
360 "size-64",
361 "size-128",
362 "size-256",
363 "size-512",
364 "size-1024",
365 "size-2048",
366 "size-4096",
367 "size-8192",
368 "size-16384",
369 "size-32768",
370 "size-65536",
371 "size-131072"
374 /* internal cache of cache description objs */
375 static kmem_cache_t cache_cache = {
376 /* freep, flags */ kmem_slab_end(&cache_cache), SLAB_NO_REAP,
377 /* offset, num */ sizeof(kmem_cache_t), 0,
378 /* c_magic, c_inuse */ SLAB_C_MAGIC, 0,
379 /* firstp, lastp */ kmem_slab_end(&cache_cache), kmem_slab_end(&cache_cache),
380 /* spinlock */ SPIN_LOCK_UNLOCKED,
381 /* growing */ 0,
382 /* dflags */ 0,
383 /* org_size, gfp */ 0, 0,
384 /* ctor, dtor, align */ NULL, NULL, L1_CACHE_BYTES,
385 /* colour, colour_next */ 0, 0,
386 /* failures */ 0,
387 /* name */ "kmem_cache",
388 /* nextp */ &cache_cache,
389 /* index */ NULL,
392 /* Guard access to the cache-chain. */
393 static struct semaphore cache_chain_sem;
395 /* Place marker (roving pointer) for reaping. */
396 static kmem_cache_t *clock_searchp = &cache_cache;
398 /* Internal slab management cache, for when slab management is off-slab. */
399 static kmem_cache_t *cache_slabp;
401 /* Max number of objs-per-slab for caches which use bufctl's.
402 * Needed to avoid a possible looping condition in kmem_cache_grow().
404 static unsigned long bufctl_limit;
406 /* Initialisation - setup the `cache' cache. */
407 void __init kmem_cache_init(void)
409 size_t size, i;
411 #define kmem_slab_offset(x) ((unsigned long)&((kmem_slab_t *)0)->x)
412 #define kmem_slab_diff(a,b) (kmem_slab_offset(a) - kmem_slab_offset(b))
413 #define kmem_cache_offset(x) ((unsigned long)&((kmem_cache_t *)0)->x)
414 #define kmem_cache_diff(a,b) (kmem_cache_offset(a) - kmem_cache_offset(b))
416 /* Sanity checks... */
417 if (kmem_cache_diff(c_firstp, c_magic) != kmem_slab_diff(s_nextp, s_magic) ||
418 kmem_cache_diff(c_firstp, c_inuse) != kmem_slab_diff(s_nextp, s_inuse) ||
419 ((kmem_cache_offset(c_lastp) -
420 ((unsigned long) kmem_slab_end((kmem_cache_t*)NULL))) !=
421 kmem_slab_offset(s_prevp)) ||
422 kmem_cache_diff(c_lastp, c_firstp) != kmem_slab_diff(s_prevp, s_nextp)) {
423 /* Offsets to the magic are incorrect, either the structures have
424 * been incorrectly changed, or adjustments are needed for your
425 * architecture.
427 panic("kmem_cache_init(): Offsets are wrong - I've been messed with!");
428 /* NOTREACHED */
430 #undef kmem_cache_offset
431 #undef kmem_cache_diff
432 #undef kmem_slab_offset
433 #undef kmem_slab_diff
435 init_MUTEX(&cache_chain_sem);
437 size = cache_cache.c_offset + sizeof(kmem_bufctl_t);
438 size += (L1_CACHE_BYTES-1);
439 size &= ~(L1_CACHE_BYTES-1);
440 cache_cache.c_offset = size-sizeof(kmem_bufctl_t);
442 i = (PAGE_SIZE<<cache_cache.c_gfporder)-slab_align_size;
443 cache_cache.c_num = i / size; /* num of objs per slab */
445 /* Cache colouring. */
446 cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES;
447 cache_cache.c_colour_next = cache_cache.c_colour;
450 * Fragmentation resistance on low memory - only use bigger
451 * page orders on machines with more than 32MB of memory.
453 if (num_physpages > (32 << 20) >> PAGE_SHIFT)
454 slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_HI;
457 /* Initialisation - setup remaining internal and general caches.
458 * Called after the gfp() functions have been enabled, and before smp_init().
460 void __init kmem_cache_sizes_init(void)
462 unsigned int found = 0;
464 cache_slabp = kmem_cache_create("slab_cache", sizeof(kmem_slab_t),
465 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
466 if (cache_slabp) {
467 char **names = cache_sizes_name;
468 cache_sizes_t *sizes = cache_sizes;
469 do {
470 /* For performance, all the general caches are L1 aligned.
471 * This should be particularly beneficial on SMP boxes, as it
472 * eliminates "false sharing".
473 * Note, for systems short on memory, removing the alignment will
474 * allow tighter packing of the smaller caches. */
475 if (!(sizes->cs_cachep =
476 kmem_cache_create(*names++, sizes->cs_size,
477 0, SLAB_HWCACHE_ALIGN, NULL, NULL)))
478 goto panic_time;
479 if (!found) {
480 /* Inc off-slab bufctl limit until the ceiling is hit. */
481 if (SLAB_BUFCTL(sizes->cs_cachep->c_flags))
482 found++;
483 else
484 bufctl_limit =
485 (sizes->cs_size/sizeof(kmem_bufctl_t));
487 sizes->cs_cachep->c_flags |= SLAB_CFLGS_GENERAL;
488 sizes++;
489 } while (sizes->cs_size);
490 #if SLAB_SELFTEST
491 kmem_self_test();
492 #endif /* SLAB_SELFTEST */
493 return;
495 panic_time:
496 panic("kmem_cache_sizes_init: Error creating caches");
497 /* NOTREACHED */
500 /* Interface to system's page allocator. *dma is set non-zero if the
501 * allocated memory is known to be DMAable. No need to hold the cache-lock.
503 static inline void *
504 kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma)
506 void *addr;
509 * If we requested dmaable memory, we will get it. Even if we
510 * did not request dmaable memory, we might get it, but that
511 * would be relatively rare and ignorable.
513 *dma = flags & SLAB_DMA;
514 addr = (void*) __get_free_pages(flags, cachep->c_gfporder);
515 /* Assume that now we have the pages no one else can legally
516 * mess with the 'struct page's.
517 * However vm_scan() might try to test the structure to see if
518 * it is a named-page or buffer-page. The members it tests are
519 * of no interest here.....
521 return addr;
524 /* Interface to system's page release. */
525 static inline void
526 kmem_freepages(kmem_cache_t *cachep, void *addr)
528 unsigned long i = (1<<cachep->c_gfporder);
529 struct page *page = &mem_map[MAP_NR(addr)];
531 /* free_pages() does not clear the type bit - we do that.
532 * The pages have been unlinked from their cache-slab,
533 * but their 'struct page's might be accessed in
534 * vm_scan(). Shouldn't be a worry.
536 while (i--) {
537 PageClearSlab(page);
538 page++;
540 free_pages((unsigned long)addr, cachep->c_gfporder);
543 #if SLAB_DEBUG_SUPPORT
544 static inline void
545 kmem_poison_obj(kmem_cache_t *cachep, void *addr)
547 memset(addr, SLAB_POISON_BYTE, cachep->c_org_size);
548 *(unsigned char *)(addr+cachep->c_org_size-1) = SLAB_POISON_END;
551 static inline int
552 kmem_check_poison_obj(kmem_cache_t *cachep, void *addr)
554 void *end;
555 end = memchr(addr, SLAB_POISON_END, cachep->c_org_size);
556 if (end != (addr+cachep->c_org_size-1))
557 return 1;
558 return 0;
560 #endif /* SLAB_DEBUG_SUPPORT */
562 /* Three slab chain funcs - all called with ints disabled and the appropriate
563 * cache-lock held.
565 static inline void
566 kmem_slab_unlink(kmem_slab_t *slabp)
568 kmem_slab_t *prevp = slabp->s_prevp;
569 kmem_slab_t *nextp = slabp->s_nextp;
570 prevp->s_nextp = nextp;
571 nextp->s_prevp = prevp;
574 static inline void
575 kmem_slab_link_end(kmem_cache_t *cachep, kmem_slab_t *slabp)
577 kmem_slab_t *lastp = cachep->c_lastp;
578 slabp->s_nextp = kmem_slab_end(cachep);
579 slabp->s_prevp = lastp;
580 cachep->c_lastp = slabp;
581 lastp->s_nextp = slabp;
584 static inline void
585 kmem_slab_link_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
587 kmem_slab_t *nextp = cachep->c_freep;
588 kmem_slab_t *prevp = nextp->s_prevp;
589 slabp->s_nextp = nextp;
590 slabp->s_prevp = prevp;
591 nextp->s_prevp = slabp;
592 slabp->s_prevp->s_nextp = slabp;
595 /* Destroy all the objs in a slab, and release the mem back to the system.
596 * Before calling, the slab must have been unlinked from the cache.
597 * The cache-lock is not held/needed.
599 static void
600 kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
602 if (cachep->c_dtor
603 #if SLAB_DEBUG_SUPPORT
604 || cachep->c_flags & (SLAB_POISON | SLAB_RED_ZONE)
605 #endif /*SLAB_DEBUG_SUPPORT*/
607 /* Doesn't use the bufctl ptrs to find objs. */
608 unsigned long num = cachep->c_num;
609 void *objp = slabp->s_mem;
610 do {
611 #if SLAB_DEBUG_SUPPORT
612 if (cachep->c_flags & SLAB_RED_ZONE) {
613 if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1)
614 printk(KERN_ERR "kmem_slab_destroy: "
615 "Bad front redzone - %s\n",
616 cachep->c_name);
617 objp += BYTES_PER_WORD;
618 if (*((unsigned long*)(objp+cachep->c_org_size)) !=
619 SLAB_RED_MAGIC1)
620 printk(KERN_ERR "kmem_slab_destroy: "
621 "Bad rear redzone - %s\n",
622 cachep->c_name);
624 if (cachep->c_dtor)
625 #endif /*SLAB_DEBUG_SUPPORT*/
626 (cachep->c_dtor)(objp, cachep, 0);
627 #if SLAB_DEBUG_SUPPORT
628 else if (cachep->c_flags & SLAB_POISON) {
629 if (kmem_check_poison_obj(cachep, objp))
630 printk(KERN_ERR "kmem_slab_destroy: "
631 "Bad poison - %s\n", cachep->c_name);
633 if (cachep->c_flags & SLAB_RED_ZONE)
634 objp -= BYTES_PER_WORD;
635 #endif /* SLAB_DEBUG_SUPPORT */
636 objp += cachep->c_offset;
637 if (!slabp->s_index)
638 objp += sizeof(kmem_bufctl_t);
639 } while (--num);
642 slabp->s_magic = SLAB_MAGIC_DESTROYED;
643 if (slabp->s_index)
644 kmem_cache_free(cachep->c_index_cachep, slabp->s_index);
645 kmem_freepages(cachep, slabp->s_mem-slabp->s_offset);
646 if (SLAB_OFF_SLAB(cachep->c_flags))
647 kmem_cache_free(cache_slabp, slabp);
650 /* Calculate the num objs, wastage, and bytes left over for a given slab size. */
651 static inline size_t
652 kmem_cache_cal_waste(unsigned long gfporder, size_t size, size_t extra,
653 unsigned long flags, size_t *left_over, unsigned long *num)
655 size_t wastage = PAGE_SIZE<<gfporder;
657 if (SLAB_OFF_SLAB(flags))
658 gfporder = 0;
659 else
660 gfporder = slab_align_size;
661 wastage -= gfporder;
662 *num = wastage / size;
663 wastage -= (*num * size);
664 *left_over = wastage;
666 return (wastage + gfporder + (extra * *num));
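/* A worked example, purely illustrative (assumes PAGE_SIZE 4096, gfporder 0,
 * on-slab management with slab_align_size 32, a 128 byte obj plus a 4 byte
 * on-slab bufctl, ie size=132 and extra=4):
 *   usable bytes = 4096 - 32       = 4064
 *   *num         = 4064 / 132      = 30 objs per slab
 *   *left_over   = 4064 - 30*132   = 104 (available for colouring)
 *   returned     = 104 + 32 + 4*30 = 256 bytes of management overhead/waste
 */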
670 * kmem_cache_create - Create a cache.
671 * @name: A string which is used in /proc/slabinfo to identify this cache.
672 * @size: The size of objects to be created in this cache.
673 * @offset: The offset to use within the page.
674 * @flags: SLAB flags
675 * @ctor: A constructor for the objects.
676 * @dtor: A destructor for the objects.
678 * Returns a ptr to the cache on success, NULL on failure.
679 * Cannot be called within an int, but can be interrupted.
680 * The @ctor is run when new pages are allocated by the cache
681 * and the @dtor is run before the pages are handed back.
682 * The flags are
684 * %SLAB_POISON - Poison the slab with a known test pattern (5a5a5a5a)
685 * to catch references to uninitialised memory.
687 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
688 * for buffer overruns.
690 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
691 * memory pressure.
693 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
694 * cacheline. This can be beneficial if you're counting cycles as closely
695 * as davem.
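/* An illustrative (hypothetical) use of kmem_cache_create(); 'foo_t',
 * 'foo_cachep' and 'foo_ctor' are example names, not part of this file.
 */
#if 0
static kmem_cache_t *foo_cachep;

void foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(foo_t), 0,
					SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
	if (!foo_cachep)
		panic("foo_cache_init: cache creation failed");
}
#endif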
697 kmem_cache_t *
698 kmem_cache_create(const char *name, size_t size, size_t offset,
699 unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
700 void (*dtor)(void*, kmem_cache_t *, unsigned long))
702 const char *func_nm= KERN_ERR "kmem_create: ";
703 kmem_cache_t *searchp;
704 kmem_cache_t *cachep=NULL;
705 size_t extra;
706 size_t left_over;
707 size_t align;
709 #if SLAB_DEBUG_SUPPORT
710 flags |= SLAB_POISON;
711 #endif
712 /* Sanity checks... */
713 #if SLAB_MGMT_CHECKS
714 if (!name) {
715 printk("%sNULL ptr\n", func_nm);
716 goto opps;
718 if (strlen(name) >= SLAB_CACHE_NAME_LEN) {
719 printk("%sname too long\n", func_nm);
720 goto opps;
722 if (in_interrupt()) {
723 printk("%sCalled during int - %s\n", func_nm, name);
724 goto opps;
727 if (size < BYTES_PER_WORD) {
728 printk("%sSize too small %d - %s\n", func_nm, (int) size, name);
729 size = BYTES_PER_WORD;
732 if (size > ((1<<SLAB_OBJ_MAX_ORDER)*PAGE_SIZE)) {
733 printk("%sSize too large %d - %s\n", func_nm, (int) size, name);
734 goto opps;
737 if (dtor && !ctor) {
738 /* Decon, but no con - doesn't make sense */
739 printk("%sDecon but no con - %s\n", func_nm, name);
740 goto opps;
743 if (offset < 0 || offset > size) {
744 printk("%sOffset weird %d - %s\n", func_nm, (int) offset, name);
745 offset = 0;
748 #if SLAB_DEBUG_SUPPORT
749 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
750 /* No constructor, but initial state check requested */
751 printk("%sNo con, but init state check requested - %s\n", func_nm, name);
752 flags &= ~SLAB_DEBUG_INITIAL;
755 if ((flags & SLAB_POISON) && ctor) {
756 /* request for poisoning, but we can't do that with a constructor */
757 printk("%sPoisoning requested, but con given - %s\n", func_nm, name);
758 flags &= ~SLAB_POISON;
760 #if 0
761 if ((flags & SLAB_HIGH_PACK) && ctor) {
762 printk("%sHigh pack requested, but con given - %s\n", func_nm, name);
763 flags &= ~SLAB_HIGH_PACK;
765 if ((flags & SLAB_HIGH_PACK) && (flags & (SLAB_POISON|SLAB_RED_ZONE))) {
766 printk("%sHigh pack requested, but with poisoning/red-zoning - %s\n",
767 func_nm, name);
768 flags &= ~SLAB_HIGH_PACK;
770 #endif
771 #endif /* SLAB_DEBUG_SUPPORT */
772 #endif /* SLAB_MGMT_CHECKS */
774 /* Always check flags; a caller might be expecting debug
775 * support which isn't available.
777 if (flags & ~SLAB_C_MASK) {
778 printk("%sIllgl flg %lX - %s\n", func_nm, flags, name);
779 flags &= SLAB_C_MASK;
782 /* Get cache's description obj. */
783 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
784 if (!cachep)
785 goto opps;
786 memset(cachep, 0, sizeof(kmem_cache_t));
788 /* Check that size is in terms of words. This is needed to avoid
789 * unaligned accesses for some archs when redzoning is used, and makes
790 * sure any on-slab bufctl's are also correctly aligned.
792 if (size & (BYTES_PER_WORD-1)) {
793 size += (BYTES_PER_WORD-1);
794 size &= ~(BYTES_PER_WORD-1);
795 printk("%sForcing size word alignment - %s\n", func_nm, name);
798 cachep->c_org_size = size;
799 #if SLAB_DEBUG_SUPPORT
800 if (flags & SLAB_RED_ZONE) {
801 /* There is no point trying to honour cache alignment when redzoning. */
802 flags &= ~SLAB_HWCACHE_ALIGN;
803 size += 2*BYTES_PER_WORD; /* words for redzone */
805 #endif /* SLAB_DEBUG_SUPPORT */
807 align = BYTES_PER_WORD;
808 if (flags & SLAB_HWCACHE_ALIGN)
809 align = L1_CACHE_BYTES;
811 /* Determine if the slab management and/or bufctls are 'on' or 'off' slab. */
812 extra = sizeof(kmem_bufctl_t);
813 if (size < (PAGE_SIZE>>3)) {
814 /* Size is small(ish). Use packing where bufctl size per
815 * obj is low, and slab management is on-slab.
817 #if 0
818 if ((flags & SLAB_HIGH_PACK)) {
819 /* Special high packing for small objects
820 * (mainly for vm_mapping structs, but
821 * others can use it).
823 if (size == (L1_CACHE_BYTES/4) || size == (L1_CACHE_BYTES/2) ||
824 size == L1_CACHE_BYTES) {
825 /* The bufctl is stored with the object. */
826 extra = 0;
827 } else
828 flags &= ~SLAB_HIGH_PACK;
830 #endif
831 } else {
832 /* Size is large, assume best to place the slab management obj
833 * off-slab (should allow better packing of objs).
835 flags |= SLAB_CFLGS_OFF_SLAB;
836 if (!(size & ~PAGE_MASK) || size == (PAGE_SIZE/2)
837 || size == (PAGE_SIZE/4) || size == (PAGE_SIZE/8)) {
838 /* To avoid waste the bufctls are off-slab... */
839 flags |= SLAB_CFLGS_BUFCTL;
840 extra = 0;
841 } /* else slab management is off-slab, but freelist pointers are on. */
843 size += extra;
845 if (flags & SLAB_HWCACHE_ALIGN) {
846 /* Need to adjust size so that objs are cache aligned. */
847 if (size > (L1_CACHE_BYTES/2)) {
848 size_t words = size % L1_CACHE_BYTES;
849 if (words)
850 size += (L1_CACHE_BYTES-words);
851 } else {
852 /* Small obj size, can get at least two per cache line. */
853 int num_per_line = L1_CACHE_BYTES/size;
854 left_over = L1_CACHE_BYTES - (num_per_line*size);
855 if (left_over) {
856 /* Need to adjust size so objs cache align. */
857 if (left_over%num_per_line) {
858 /* Odd num of objs per line - fixup. */
859 num_per_line--;
860 left_over += size;
862 size += (left_over/num_per_line);
865 } else if (!(size%L1_CACHE_BYTES)) {
866 /* Size happens to cache align... */
867 flags |= SLAB_HWCACHE_ALIGN;
868 align = L1_CACHE_BYTES;
871 /* Calculate the size (in pages) of slabs, and the num of objs per slab.
872 * This could be made much more intelligent. For now, try to avoid
873 * using high page-orders for slabs. When the gfp() funcs are more
874 * friendly towards high-order requests, this should be changed.
876 do {
877 size_t wastage;
878 unsigned int break_flag = 0;
879 cal_wastage:
880 wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, extra,
881 flags, &left_over, &cachep->c_num);
882 if (!cachep->c_num)
883 goto next;
884 if (break_flag)
885 break;
886 if (SLAB_BUFCTL(flags) && cachep->c_num > bufctl_limit) {
887 /* Oops, this num of objs will cause problems. */
888 cachep->c_gfporder--;
889 break_flag++;
890 goto cal_wastage;
892 if (cachep->c_gfporder == SLAB_MAX_GFP_ORDER)
893 break;
895 /* Large num of objs is good, but v. large slabs are currently
896 * bad for the gfp()s.
898 if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) {
899 if (cachep->c_gfporder < slab_break_gfp_order)
900 goto next;
903 /* Stop caches with small objs having a large num of pages. */
904 if (left_over <= slab_align_size)
905 break;
906 if ((wastage*8) <= (PAGE_SIZE<<cachep->c_gfporder))
907 break; /* Acceptable internal fragmentation. */
908 next:
909 cachep->c_gfporder++;
910 } while (1);
912 /* If the slab management has been placed off-slab, and we have enough space, then
913 * move it on-slab. This is at the expense of any extra colouring.
915 if ((flags & SLAB_CFLGS_OFF_SLAB) && !SLAB_BUFCTL(flags) &&
916 left_over >= slab_align_size) {
917 flags &= ~SLAB_CFLGS_OFF_SLAB;
918 left_over -= slab_align_size;
921 /* Offset must be a multiple of the alignment. */
922 offset += (align-1);
923 offset &= ~(align-1);
925 /* Mess around with the offset alignment. */
926 if (!left_over) {
927 offset = 0;
928 } else if (left_over < offset) {
929 offset = align;
930 if (flags & SLAB_HWCACHE_ALIGN) {
931 if (left_over < offset)
932 offset = 0;
933 } else {
934 /* Offset is BYTES_PER_WORD, and left_over is at
935 * least BYTES_PER_WORD.
937 if (left_over >= (BYTES_PER_WORD*2)) {
938 offset >>= 1;
939 if (left_over >= (BYTES_PER_WORD*4))
940 offset >>= 1;
943 } else if (!offset) {
944 /* No offset requested, but space enough - give one. */
945 offset = left_over/align;
946 if (flags & SLAB_HWCACHE_ALIGN) {
947 if (offset >= 8) {
948 /* A large number of colours - use a larger alignment. */
949 align <<= 1;
951 } else {
952 if (offset >= 10) {
953 align <<= 1;
954 if (offset >= 16)
955 align <<= 1;
958 offset = align;
961 #if 0
962 printk("%s: Left_over:%d Align:%d Size:%d\n", name, left_over, offset, size);
963 #endif
965 if ((cachep->c_align = (unsigned long) offset))
966 cachep->c_colour = (left_over/offset);
967 cachep->c_colour_next = cachep->c_colour;
969 /* If the bufctl's are on-slab, c_offset does not include the size of bufctl. */
970 if (!SLAB_BUFCTL(flags))
971 size -= sizeof(kmem_bufctl_t);
972 else
973 cachep->c_index_cachep =
974 kmem_find_general_cachep(cachep->c_num*sizeof(kmem_bufctl_t));
975 cachep->c_offset = (unsigned long) size;
976 cachep->c_freep = kmem_slab_end(cachep);
977 cachep->c_firstp = kmem_slab_end(cachep);
978 cachep->c_lastp = kmem_slab_end(cachep);
979 cachep->c_flags = flags;
980 cachep->c_ctor = ctor;
981 cachep->c_dtor = dtor;
982 cachep->c_magic = SLAB_C_MAGIC;
983 /* Copy name over so we don't have problems with unloaded modules */
984 strcpy(cachep->c_name, name);
985 spin_lock_init(&cachep->c_spinlock);
987 /* Need the semaphore to access the chain. */
988 down(&cache_chain_sem);
989 searchp = &cache_cache;
990 do {
991 /* The name field is constant - no lock needed. */
992 if (!strcmp(searchp->c_name, name)) {
993 printk("%sDup name - %s\n", func_nm, name);
994 break;
996 searchp = searchp->c_nextp;
997 } while (searchp != &cache_cache);
999 /* There is no reason to lock our new cache before we
1000 * link it in - no one knows about it yet...
1002 cachep->c_nextp = cache_cache.c_nextp;
1003 cache_cache.c_nextp = cachep;
1004 up(&cache_chain_sem);
1005 opps:
1006 return cachep;
1010 * This checks if the kmem_cache_t pointer is chained in the cache_cache
1011 * list. -arca
1013 static int is_chained_kmem_cache(kmem_cache_t * cachep)
1015 kmem_cache_t * searchp;
1016 int ret = 0;
1018 /* Find the cache in the chain of caches. */
1019 down(&cache_chain_sem);
1020 for (searchp = &cache_cache; searchp->c_nextp != &cache_cache;
1021 searchp = searchp->c_nextp) {
1022 if (searchp->c_nextp != cachep)
1023 continue;
1025 /* Accessing clock_searchp is safe - we hold the mutex. */
1026 if (cachep == clock_searchp)
1027 clock_searchp = cachep->c_nextp;
1028 ret = 1;
1029 break;
1031 up(&cache_chain_sem);
1033 return ret;
1036 /* returns 0 if every slab has been freed -arca */
1037 static int __kmem_cache_shrink(kmem_cache_t *cachep)
1039 kmem_slab_t *slabp;
1040 int ret;
1042 spin_lock_irq(&cachep->c_spinlock);
1044 /* If the cache is growing, stop shrinking. */
1045 while (!cachep->c_growing) {
1046 slabp = cachep->c_lastp;
1047 if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
1048 break;
1050 * If this slab is the first slab with free objects
1051 * (c_freep), and as we are walking the slab chain
1052 * backwards, it is also the last slab with free
1053 * objects. After unlinking it, there will be no
1054 * slabs with free objects, so point c_freep into the
1055 * cache structure.
1057 if (cachep->c_freep == slabp)
1058 cachep->c_freep = kmem_slab_end(cachep);
1059 kmem_slab_unlink(slabp);
1060 spin_unlock_irq(&cachep->c_spinlock);
1061 kmem_slab_destroy(cachep, slabp);
1062 spin_lock_irq(&cachep->c_spinlock);
1064 ret = 1;
1065 if (cachep->c_lastp == kmem_slab_end(cachep))
1066 ret = 0; /* Cache is empty. */
1067 spin_unlock_irq(&cachep->c_spinlock);
1068 return ret;
1072 * kmem_cache_shrink - Shrink a cache.
1073 * @cachep: The cache to shrink.
1075 * Releases as many slabs as possible for a cache.
1076 * To help debugging, a zero exit status indicates all slabs were released.
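/* An illustrative (hypothetical) caller: a module that keeps its cache across
 * loads can release unused slabs on unload; 'foo_cachep' is an example name.
 */
#if 0
if (kmem_cache_shrink(foo_cachep))
	printk(KERN_INFO "foo: cache still has active objects\n");
#endif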
1079 kmem_cache_shrink(kmem_cache_t *cachep)
1081 if (!cachep)
1082 BUG();
1083 if (in_interrupt())
1084 BUG();
1085 if (!is_chained_kmem_cache(cachep))
1086 BUG();
1088 return __kmem_cache_shrink(cachep);
1092 * kmem_cache_destroy - delete a cache
1093 * @cachep: the cache to destroy
1095 * Remove a kmem_cache_t object from the slab cache.
1096 * Returns 0 on success.
1098 * It is expected this function will be called by a module when it is
1099 * unloaded. This will remove the cache completely, and avoid a duplicate
1100 * cache being allocated each time a module is loaded and unloaded, if the
1101 * module doesn't have persistent in-kernel storage across loads and unloads.
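/* An illustrative (hypothetical) module-unload path; 'foo_cachep' is an
 * example name.
 */
#if 0
void foo_cache_exit(void)
{
	if (kmem_cache_destroy(foo_cachep))
		printk(KERN_ERR "foo: couldn't destroy cache (objects still in use?)\n");
}
#endif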
1104 int kmem_cache_destroy(kmem_cache_t * cachep)
1106 kmem_cache_t * prev;
1107 int ret;
1109 if (!cachep) {
1110 printk(KERN_ERR "kmem_destroy: NULL ptr\n");
1111 return 1;
1113 if (in_interrupt()) {
1114 printk(KERN_ERR "kmem_destroy: Called during int - %s\n",
1115 cachep->c_name);
1116 return 1;
1119 ret = 0;
1120 /* Find the cache in the chain of caches. */
1121 down(&cache_chain_sem);
1122 for (prev = &cache_cache; prev->c_nextp != &cache_cache;
1123 prev = prev->c_nextp) {
1124 if (prev->c_nextp != cachep)
1125 continue;
1127 /* Accessing clock_searchp is safe - we hold the mutex. */
1128 if (cachep == clock_searchp)
1129 clock_searchp = cachep->c_nextp;
1131 /* remove the cachep from the cache_cache list. -arca */
1132 prev->c_nextp = cachep->c_nextp;
1134 ret = 1;
1135 break;
1137 up(&cache_chain_sem);
1139 if (!ret) {
1140 printk(KERN_ERR "kmem_destroy: Invalid cache addr %p\n",
1141 cachep);
1142 return 1;
1145 if (__kmem_cache_shrink(cachep)) {
1146 printk(KERN_ERR "kmem_destroy: Can't free all objects %p\n",
1147 cachep);
1148 down(&cache_chain_sem);
1149 cachep->c_nextp = cache_cache.c_nextp;
1150 cache_cache.c_nextp = cachep;
1151 up(&cache_chain_sem);
1152 return 1;
1155 kmem_cache_free(&cache_cache, cachep);
1157 return 0;
1160 /* Get the memory for a slab management obj. */
1161 static inline kmem_slab_t *
1162 kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags)
1164 kmem_slab_t *slabp;
1166 if (SLAB_OFF_SLAB(cachep->c_flags)) {
1167 /* Slab management obj is off-slab. */
1168 slabp = kmem_cache_alloc(cache_slabp, local_flags);
1169 } else {
1170 /* Slab management at end of slab memory, placed so that
1171 * the position is 'coloured'.
1173 void *end;
1174 end = objp + (cachep->c_num * cachep->c_offset);
1175 if (!SLAB_BUFCTL(cachep->c_flags))
1176 end += (cachep->c_num * sizeof(kmem_bufctl_t));
1177 slabp = (kmem_slab_t *) L1_CACHE_ALIGN((unsigned long)end);
1180 if (slabp) {
1181 slabp->s_inuse = 0;
1182 slabp->s_dma = 0;
1183 slabp->s_index = NULL;
1186 return slabp;
1189 static inline void
1190 kmem_cache_init_objs(kmem_cache_t * cachep, kmem_slab_t * slabp, void *objp,
1191 unsigned long ctor_flags)
1193 kmem_bufctl_t **bufpp = &slabp->s_freep;
1194 unsigned long num = cachep->c_num-1;
1196 do {
1197 #if SLAB_DEBUG_SUPPORT
1198 if (cachep->c_flags & SLAB_RED_ZONE) {
1199 *((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
1200 objp += BYTES_PER_WORD;
1201 *((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1;
1203 #endif /* SLAB_DEBUG_SUPPORT */
1205 /* Constructors are not allowed to allocate memory from the same cache
1206 * which they are a constructor for. Otherwise, deadlock.
1207 * They must also be threaded.
1209 if (cachep->c_ctor)
1210 cachep->c_ctor(objp, cachep, ctor_flags);
1211 #if SLAB_DEBUG_SUPPORT
1212 else if (cachep->c_flags & SLAB_POISON) {
1213 /* need to poison the objs */
1214 kmem_poison_obj(cachep, objp);
1217 if (cachep->c_flags & SLAB_RED_ZONE) {
1218 if (*((unsigned long*)(objp+cachep->c_org_size)) !=
1219 SLAB_RED_MAGIC1) {
1220 *((unsigned long*)(objp+cachep->c_org_size)) =
1221 SLAB_RED_MAGIC1;
1222 printk(KERN_ERR "kmem_init_obj: Bad rear redzone "
1223 "after constructor - %s\n", cachep->c_name);
1225 objp -= BYTES_PER_WORD;
1226 if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) {
1227 *((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
1228 printk(KERN_ERR "kmem_init_obj: Bad front redzone "
1229 "after constructor - %s\n", cachep->c_name);
1232 #endif /* SLAB_DEBUG_SUPPORT */
1234 objp += cachep->c_offset;
1235 if (!slabp->s_index) {
1236 *bufpp = objp;
1237 objp += sizeof(kmem_bufctl_t);
1238 } else
1239 *bufpp = &slabp->s_index[num];
1240 bufpp = &(*bufpp)->buf_nextp;
1241 } while (num--);
1243 *bufpp = NULL;
1246 /* Grow (by 1) the number of slabs within a cache. This is called by
1247 * kmem_cache_alloc() when there are no free objs left in a cache.
1249 static int
1250 kmem_cache_grow(kmem_cache_t * cachep, int flags)
1252 kmem_slab_t *slabp;
1253 struct page *page;
1254 void *objp;
1255 size_t offset;
1256 unsigned int dma, local_flags;
1257 unsigned long ctor_flags;
1258 unsigned long save_flags;
1260 /* Be lazy and only check for valid flags here,
1261 * keeping it out of the critical path in kmem_cache_alloc().
1263 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) {
1264 printk(KERN_WARNING "kmem_grow: Illegal flgs %X (correcting) - %s\n",
1265 flags, cachep->c_name);
1266 flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW);
1269 if (flags & SLAB_NO_GROW)
1270 return 0;
1272 /* The test for missing atomic flag is performed here, rather than
1273 * the more obvious place, simply to reduce the critical path length
1274 * in kmem_cache_alloc(). If a caller is slightly mis-behaving they
1275 * will eventually be caught here (where it matters).
1277 if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC) {
1278 printk(KERN_ERR "kmem_grow: Called nonatomically from int - %s\n",
1279 cachep->c_name);
1280 flags &= ~SLAB_LEVEL_MASK;
1281 flags |= SLAB_ATOMIC;
1283 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1284 local_flags = (flags & SLAB_LEVEL_MASK);
1285 if (local_flags == SLAB_ATOMIC) {
1286 /* Not allowed to sleep. Need to tell a constructor about
1287 * this - it might need to know...
1289 ctor_flags |= SLAB_CTOR_ATOMIC;
1292 /* About to mess with non-constant members - lock. */
1293 spin_lock_irqsave(&cachep->c_spinlock, save_flags);
1295 /* Get colour for the slab, and calculate the next value. */
1296 if (!(offset = cachep->c_colour_next--))
1297 cachep->c_colour_next = cachep->c_colour;
1298 offset *= cachep->c_align;
1299 cachep->c_dflags = SLAB_CFLGS_GROWN;
1301 cachep->c_growing++;
1302 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1304 /* A series of memory allocations for a new slab.
1305 * Neither the cache-chain semaphore, nor the cache-lock, is
1306 * held, but the incremented c_growing prevents this
1307 * cache from being reaped or shrunk.
1308 * Note: The cache could be selected for reaping in
1309 * kmem_cache_reap(), but when the final test is made the
1310 * growing value will be seen.
1313 /* Get mem for the objs. */
1314 if (!(objp = kmem_getpages(cachep, flags, &dma)))
1315 goto failed;
1317 /* Get slab management. */
1318 if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags)))
1319 goto opps1;
1320 if (dma)
1321 slabp->s_dma = 1;
1322 if (SLAB_BUFCTL(cachep->c_flags)) {
1323 slabp->s_index = kmem_cache_alloc(cachep->c_index_cachep, local_flags);
1324 if (!slabp->s_index)
1325 goto opps2;
1328 /* Nasty!!!!!! I hope this is OK. */
1329 dma = 1 << cachep->c_gfporder;
1330 page = &mem_map[MAP_NR(objp)];
1331 do {
1332 SLAB_SET_PAGE_CACHE(page, cachep);
1333 SLAB_SET_PAGE_SLAB(page, slabp);
1334 PageSetSlab(page);
1335 page++;
1336 } while (--dma);
1338 slabp->s_offset = offset; /* It will fit... */
1339 objp += offset; /* Address of first object. */
1340 slabp->s_mem = objp;
1342 /* For on-slab bufctls, c_offset is the distance between the start of
1343 * an obj and its related bufctl. For off-slab bufctls, c_offset is
1344 * the distance between objs in the slab.
1346 kmem_cache_init_objs(cachep, slabp, objp, ctor_flags);
1348 spin_lock_irq(&cachep->c_spinlock);
1350 /* Make slab active. */
1351 slabp->s_magic = SLAB_MAGIC_ALLOC;
1352 kmem_slab_link_end(cachep, slabp);
1353 if (cachep->c_freep == kmem_slab_end(cachep))
1354 cachep->c_freep = slabp;
1355 SLAB_STATS_INC_GROWN(cachep);
1356 cachep->c_failures = 0;
1357 cachep->c_growing--;
1359 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1360 return 1;
1361 opps2:
1362 if (SLAB_OFF_SLAB(cachep->c_flags))
1363 kmem_cache_free(cache_slabp, slabp);
1364 opps1:
1365 kmem_freepages(cachep, objp);
1366 failed:
1367 spin_lock_irq(&cachep->c_spinlock);
1368 cachep->c_growing--;
1369 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1370 return 0;
1373 static void
1374 kmem_report_alloc_err(const char *str, kmem_cache_t * cachep)
1376 if (cachep)
1377 SLAB_STATS_INC_ERR(cachep); /* this is atomic */
1378 printk(KERN_ERR "kmem_alloc: %s (name=%s)\n",
1379 str, cachep ? cachep->c_name : "unknown");
1382 static void
1383 kmem_report_free_err(const char *str, const void *objp, kmem_cache_t * cachep)
1385 if (cachep)
1386 SLAB_STATS_INC_ERR(cachep);
1387 printk(KERN_ERR "kmem_free: %s (objp=%p, name=%s)\n",
1388 str, objp, cachep ? cachep->c_name : "unknown");
1391 /* Search for a slab whose objs are suitable for DMA.
1392 * Note: since testing the first free slab (in __kmem_cache_alloc()),
1393 * ints must not have been enabled, or the cache-lock released!
1395 static inline kmem_slab_t *
1396 kmem_cache_search_dma(kmem_cache_t * cachep)
1398 kmem_slab_t *slabp = cachep->c_freep->s_nextp;
1400 for (; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
1401 if (!(slabp->s_dma))
1402 continue;
1403 kmem_slab_unlink(slabp);
1404 kmem_slab_link_free(cachep, slabp);
1405 cachep->c_freep = slabp;
1406 break;
1408 return slabp;
1411 #if SLAB_DEBUG_SUPPORT
1412 /* Perform extra freeing checks. Currently, this check is only for caches
1413 * that use bufctl structures within the slab. Those which use bufctl's
1414 * from the internal cache have a reasonable check when the address is
1415 * searched for. Called with the cache-lock held.
1417 static void *
1418 kmem_extra_free_checks(kmem_cache_t * cachep, kmem_bufctl_t *search_bufp,
1419 kmem_bufctl_t *bufp, void * objp)
1421 if (SLAB_BUFCTL(cachep->c_flags))
1422 return objp;
1424 /* Check slab's freelist to see if this obj is there. */
1425 for (; search_bufp; search_bufp = search_bufp->buf_nextp) {
1426 if (search_bufp != bufp)
1427 continue;
1428 return NULL;
1430 return objp;
1432 #endif /* SLAB_DEBUG_SUPPORT */
1434 /* Called with cache lock held. */
1435 static inline void
1436 kmem_cache_full_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
1438 if (slabp->s_nextp->s_inuse) {
1439 /* Not at correct position. */
1440 if (cachep->c_freep == slabp)
1441 cachep->c_freep = slabp->s_nextp;
1442 kmem_slab_unlink(slabp);
1443 kmem_slab_link_end(cachep, slabp);
1447 /* Called with cache lock held. */
1448 static inline void
1449 kmem_cache_one_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
1451 if (slabp->s_nextp->s_inuse == cachep->c_num) {
1452 kmem_slab_unlink(slabp);
1453 kmem_slab_link_free(cachep, slabp);
1455 cachep->c_freep = slabp;
1458 /* Returns a ptr to an obj in the given cache. */
1459 static inline void *
1460 __kmem_cache_alloc(kmem_cache_t *cachep, int flags)
1462 kmem_slab_t *slabp;
1463 kmem_bufctl_t *bufp;
1464 void *objp;
1465 unsigned long save_flags;
1467 /* Sanity check. */
1468 if (!cachep)
1469 goto nul_ptr;
1470 spin_lock_irqsave(&cachep->c_spinlock, save_flags);
1471 try_again:
1472 /* Get slab alloc is to come from. */
1473 slabp = cachep->c_freep;
1475 /* Magic is a sanity check _and_ says if we need a new slab. */
1476 if (slabp->s_magic != SLAB_MAGIC_ALLOC)
1477 goto alloc_new_slab;
1478 /* DMA requests are 'rare' - keep out of the critical path. */
1479 if (flags & SLAB_DMA)
1480 goto search_dma;
1481 try_again_dma:
1482 SLAB_STATS_INC_ALLOCED(cachep);
1483 SLAB_STATS_INC_ACTIVE(cachep);
1484 SLAB_STATS_SET_HIGH(cachep);
1485 slabp->s_inuse++;
1486 bufp = slabp->s_freep;
1487 slabp->s_freep = bufp->buf_nextp;
1488 if (slabp->s_freep) {
1489 ret_obj:
1490 if (!slabp->s_index) {
1491 bufp->buf_slabp = slabp;
1492 objp = ((void*)bufp) - cachep->c_offset;
1493 finished:
1494 /* The lock is not needed by the red-zone or poison ops, and the
1495 * obj has been removed from the slab. Should be safe to drop
1496 * the lock here.
1498 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1499 #if SLAB_DEBUG_SUPPORT
1500 if (cachep->c_flags & SLAB_RED_ZONE)
1501 goto red_zone;
1502 ret_red:
1503 if ((cachep->c_flags & SLAB_POISON) && kmem_check_poison_obj(cachep, objp))
1504 kmem_report_alloc_err("Bad poison", cachep);
1505 #endif /* SLAB_DEBUG_SUPPORT */
1506 return objp;
1508 /* Update index ptr. */
1509 objp = ((bufp-slabp->s_index)*cachep->c_offset) + slabp->s_mem;
1510 bufp->buf_objp = objp;
1511 goto finished;
1513 cachep->c_freep = slabp->s_nextp;
1514 goto ret_obj;
1516 #if SLAB_DEBUG_SUPPORT
1517 red_zone:
1518 /* Set alloc red-zone, and check old one. */
1519 if (xchg((unsigned long *)objp, SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
1520 kmem_report_alloc_err("Bad front redzone", cachep);
1521 objp += BYTES_PER_WORD;
1522 if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
1523 kmem_report_alloc_err("Bad rear redzone", cachep);
1524 goto ret_red;
1525 #endif /* SLAB_DEBUG_SUPPORT */
1527 search_dma:
1528 if (slabp->s_dma || (slabp = kmem_cache_search_dma(cachep))!=kmem_slab_end(cachep))
1529 goto try_again_dma;
1530 alloc_new_slab:
1531 /* Either out of slabs, or magic number corruption. */
1532 if (slabp == kmem_slab_end(cachep)) {
1533 /* Need a new slab. Release the lock before calling kmem_cache_grow().
1534 * This allows objs to be released back into the cache while growing.
1536 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1537 if (kmem_cache_grow(cachep, flags)) {
1538 /* Someone may have stolen our objs. Doesn't matter, we'll
1539 * just come back here again.
1541 spin_lock_irq(&cachep->c_spinlock);
1542 goto try_again;
1544 /* Couldn't grow, but some objs may have been freed. */
1545 spin_lock_irq(&cachep->c_spinlock);
1546 if (cachep->c_freep != kmem_slab_end(cachep)) {
1547 if ((flags & SLAB_ATOMIC) == 0)
1548 goto try_again;
1550 } else {
1551 /* Very serious error - maybe panic() here? */
1552 kmem_report_alloc_err("Bad slab magic (corrupt)", cachep);
1554 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1555 err_exit:
1556 return NULL;
1557 nul_ptr:
1558 kmem_report_alloc_err("NULL ptr", NULL);
1559 goto err_exit;
1562 /* Release an obj back to its cache. If the obj has a constructed state,
1563 * it should be in this state _before_ it is released.
1565 static inline void
1566 __kmem_cache_free(kmem_cache_t *cachep, void *objp)
1568 kmem_slab_t *slabp;
1569 kmem_bufctl_t *bufp;
1570 unsigned long save_flags;
1572 /* Basic sanity checks. */
1573 if (!cachep || !objp)
1574 goto null_addr;
1576 #if SLAB_DEBUG_SUPPORT
1577 /* A verify func is called without the cache-lock held. */
1578 if (cachep->c_flags & SLAB_DEBUG_INITIAL)
1579 goto init_state_check;
1580 finished_initial:
1582 if (cachep->c_flags & SLAB_RED_ZONE)
1583 goto red_zone;
1584 return_red:
1585 #endif /* SLAB_DEBUG_SUPPORT */
1587 spin_lock_irqsave(&cachep->c_spinlock, save_flags);
1589 if (SLAB_BUFCTL(cachep->c_flags))
1590 goto bufctl;
1591 bufp = (kmem_bufctl_t *)(objp+cachep->c_offset);
1593 /* Get slab for the object. */
1594 #if 0
1595 /* _NASTY_IF/ELSE_, but avoids a 'distant' memory ref for some objects.
1596 * Is this worth while? XXX
1598 if (cachep->c_flags & SLAB_HIGH_PACK)
1599 slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(bufp)]);
1600 else
1601 #endif
1602 slabp = bufp->buf_slabp;
1604 check_magic:
1605 if (slabp->s_magic != SLAB_MAGIC_ALLOC) /* Sanity check. */
1606 goto bad_slab;
1608 #if SLAB_DEBUG_SUPPORT
1609 if (cachep->c_flags & SLAB_DEBUG_FREE)
1610 goto extra_checks;
1611 passed_extra:
1612 #endif /* SLAB_DEBUG_SUPPORT */
1614 if (slabp->s_inuse) { /* Sanity check. */
1615 SLAB_STATS_DEC_ACTIVE(cachep);
1616 slabp->s_inuse--;
1617 bufp->buf_nextp = slabp->s_freep;
1618 slabp->s_freep = bufp;
1619 if (bufp->buf_nextp) {
1620 if (slabp->s_inuse) {
1621 /* (hopefully) The most common case. */
1622 finished:
1623 #if SLAB_DEBUG_SUPPORT
1624 if (cachep->c_flags & SLAB_POISON) {
1625 if (cachep->c_flags & SLAB_RED_ZONE)
1626 objp += BYTES_PER_WORD;
1627 kmem_poison_obj(cachep, objp);
1629 #endif /* SLAB_DEBUG_SUPPORT */
1630 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1631 return;
1633 kmem_cache_full_free(cachep, slabp);
1634 goto finished;
1636 kmem_cache_one_free(cachep, slabp);
1637 goto finished;
1640 /* Don't add to freelist. */
1641 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1642 kmem_report_free_err("free with no active objs", objp, cachep);
1643 return;
1644 bufctl:
1645 /* No 'extra' checks are performed for objs stored this way, finding
1646 * the obj is check enough.
1648 slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(objp)]);
1649 bufp = &slabp->s_index[(objp - slabp->s_mem)/cachep->c_offset];
1650 if (bufp->buf_objp == objp)
1651 goto check_magic;
1652 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1653 kmem_report_free_err("Either bad obj addr or double free", objp, cachep);
1654 return;
1655 #if SLAB_DEBUG_SUPPORT
1656 init_state_check:
1657 /* Need to call the slab's constructor so the
1658 * caller can perform a verify of its state (debugging).
1660 cachep->c_ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
1661 goto finished_initial;
1662 extra_checks:
1663 if (!kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp)) {
1664 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1665 kmem_report_free_err("Double free detected during checks", objp, cachep);
1666 return;
1668 goto passed_extra;
1669 red_zone:
1670 /* We do not hold the cache-lock while checking the red-zone.
1672 objp -= BYTES_PER_WORD;
1673 if (xchg((unsigned long *)objp, SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
1674 /* Either write before start of obj, or a double free. */
1675 kmem_report_free_err("Bad front redzone", objp, cachep);
1677 if (xchg((unsigned long *)(objp+cachep->c_org_size+BYTES_PER_WORD), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
1678 /* Either write past end of obj, or a double free. */
1679 kmem_report_free_err("Bad rear redzone", objp, cachep);
1681 goto return_red;
1682 #endif /* SLAB_DEBUG_SUPPORT */
1684 bad_slab:
1685 /* Slab doesn't contain the correct magic num. */
1686 if (slabp->s_magic == SLAB_MAGIC_DESTROYED) {
1687 /* Magic num says this is a destroyed slab. */
1688 kmem_report_free_err("free from inactive slab", objp, cachep);
1689 } else
1690 kmem_report_free_err("Bad obj addr", objp, cachep);
1691 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1693 #if 1
1694 /* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
1695 BUG();
1696 #endif
1698 return;
1699 null_addr:
1700 kmem_report_free_err("NULL ptr", objp, cachep);
1701 return;
1705 * kmem_cache_alloc - Allocate an object
1706 * @cachep: The cache to allocate from.
1707 * @flags: See kmalloc().
1709 * Allocate an object from this cache. The flags are only relevant
1710 * if the cache has no available objects.
1712 void *
1713 kmem_cache_alloc(kmem_cache_t *cachep, int flags)
1715 return __kmem_cache_alloc(cachep, flags);
1719 * kmem_cache_free - Deallocate an object
1720 * @cachep: The cache the allocation was from.
1721 * @objp: The previously allocated object.
1723 * Free an object which was previously allocated from this
1724 * cache.
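/* An illustrative (hypothetical) alloc/free pairing; 'foo_cachep' is an
 * example name, and the obj is handed back in its constructed state.
 */
#if 0
static void foo_use_cache(void)
{
	void *obj = kmem_cache_alloc(foo_cachep, SLAB_KERNEL);
	if (!obj)
		return;
	/* ... use obj ... */
	kmem_cache_free(foo_cachep, obj);
}
#endif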
1726 void
1727 kmem_cache_free(kmem_cache_t *cachep, void *objp)
1729 __kmem_cache_free(cachep, objp);
1733 * kmalloc - allocate memory
1734 * @size: how many bytes of memory are required.
1735 * @flags: the type of memory to allocate.
1737 * kmalloc is the normal method of allocating memory
1738 * in the kernel. The @flags argument may be one of:
1740 * %GFP_BUFFER - XXX
1742 * %GFP_ATOMIC - allocation will not sleep. Use inside interrupt handlers.
1744 * %GFP_USER - allocate memory on behalf of user. May sleep.
1746 * %GFP_KERNEL - allocate normal kernel ram. May sleep.
1748 * %GFP_NFS - has a slightly lower probability of sleeping than %GFP_KERNEL.
1749 * Don't use unless you're in the NFS code.
1751 * %GFP_KSWAPD - Don't use unless you're modifying kswapd.
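/* An illustrative (hypothetical) kmalloc()/kfree() pairing; 'foo_alloc_buffer'
 * and 'nbytes' are example names.
 */
#if 0
static int foo_alloc_buffer(size_t nbytes)
{
	char *buf = kmalloc(nbytes, GFP_KERNEL);	/* may sleep */
	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	kfree(buf);
	return 0;
}
#endif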
1753 void *
1754 kmalloc(size_t size, int flags)
1756 cache_sizes_t *csizep = cache_sizes;
1758 for (; csizep->cs_size; csizep++) {
1759 if (size > csizep->cs_size)
1760 continue;
1761 return __kmem_cache_alloc(csizep->cs_cachep, flags);
1762 }
1763 printk(KERN_ERR "kmalloc: Size (%lu) too large\n", (unsigned long) size);
1764 return NULL;
1765 }
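/*
 * Editor's sketch (illustrative, not part of the original file): typical
 * kmalloc()/kfree() use with the flags documented above; GFP_ATOMIC would
 * replace GFP_KERNEL in contexts that must not sleep.
 */
#if 0
static int example(void)
{
	void *buf = kmalloc(128, GFP_KERNEL);	/* may sleep */

	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	kfree(buf);
	return 0;
}
#endif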
1767 /**
1768 * kfree - free previously allocated memory
1769 * @objp: pointer returned by kmalloc.
1771 * Don't free memory not originally allocated by kmalloc()
1772 * or you will run into trouble.
1773 */
1774 void
1775 kfree(const void *objp)
1776 {
1777 struct page *page;
1778 int nr;
1780 if (!objp)
1781 goto null_ptr;
1782 nr = MAP_NR(objp);
1783 if (nr >= max_mapnr)
1784 goto bad_ptr;
1786 /* Assume we own the page structure - hence no locking.
1787 * If someone is misbehaving (for example, calling us with a bad
1788 * address), then access to the page structure can race with the
1789 * kmem_slab_destroy() code. Need to add a spin_lock to each page
1790 * structure, which would be useful in threading the gfp() functions....
1791 */
1792 page = &mem_map[nr];
1793 if (PageSlab(page)) {
1794 kmem_cache_t *cachep;
1796 /* Here, we again assume the obj address is good.
1797 * If it isn't, and happens to map onto another
1798 * general cache page which has no active objs, then
1799 * we race.
1800 */
1801 cachep = SLAB_GET_PAGE_CACHE(page);
1802 if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
1803 __kmem_cache_free(cachep, (void *)objp);
1804 return;
1805 }
1806 }
1807 bad_ptr:
1808 printk(KERN_ERR "kfree: Bad obj %p\n", objp);
1810 #if 1
1811 /* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
1812 BUG();
1813 #endif
1815 null_ptr:
1816 return;
1817 }
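/*
 * Editor's sketch (illustrative, not part of the original file): the
 * pointer-to-cache lookup used by kfree() above, shown in isolation.
 * Only objects from general (SLAB_CFLGS_GENERAL) caches may be kfree()d;
 * private-cache objects must go back through kmem_cache_free().
 */
#if 0
struct page *page = &mem_map[MAP_NR(objp)];
kmem_cache_t *cachep = PageSlab(page) ? SLAB_GET_PAGE_CACHE(page) : NULL;
#endif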
1819 /**
1820 * kfree_s - free previously allocated memory
1821 * @objp: pointer returned by kmalloc.
1822 * @size: size of object which is being freed.
1824 * This function performs the same task as kfree() except
1825 * that it can use the extra information to speed up deallocation
1826 * or perform additional tests.
1827 * Don't free memory not originally allocated by kmalloc()
1828 * or allocated with a different size, or you will run into trouble.
1829 */
1830 void
1831 kfree_s(const void *objp, size_t size)
1832 {
1833 struct page *page;
1834 int nr;
1836 if (!objp)
1837 goto null_ptr;
1838 nr = MAP_NR(objp);
1839 if (nr >= max_mapnr)
1840 goto null_ptr;
1841 /* See comment in kfree() */
1842 page = &mem_map[nr];
1843 if (PageSlab(page)) {
1844 kmem_cache_t *cachep;
1845 /* See comment in kfree() */
1846 cachep = SLAB_GET_PAGE_CACHE(page);
1847 if (cachep && cachep->c_flags & SLAB_CFLGS_GENERAL) {
1848 if (size <= cachep->c_org_size) { /* XXX better check */
1849 __kmem_cache_free(cachep, (void *)objp);
1850 return;
1851 }
1852 }
1853 }
1854 null_ptr:
1855 printk(KERN_ERR "kfree_s: Bad obj %p\n", objp);
1856 return;
1857 }
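/*
 * Editor's sketch (illustrative, not part of the original file): kfree_s()
 * passes the allocation size back so the free can be checked and sped up.
 * 'struct foo' is hypothetical.
 */
#if 0
static void example(void)
{
	struct foo *f = kmalloc(sizeof(*f), GFP_KERNEL);

	if (f) {
		/* ... use f ... */
		kfree_s(f, sizeof(*f));
	}
}
#endif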
1859 kmem_cache_t *
1860 kmem_find_general_cachep(size_t size)
1861 {
1862 cache_sizes_t *csizep = cache_sizes;
1864 /* This function could be moved to the header file, and
1865 * made inline so consumers can quickly determine what
1866 * cache pointer they require.
1867 */
1868 for (; csizep->cs_size; csizep++) {
1869 if (size > csizep->cs_size)
1870 continue;
1871 break;
1872 }
1873 return csizep->cs_cachep;
1874 }
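/*
 * Editor's sketch (illustrative, not part of the original file): a caller
 * that makes many same-sized allocations can look up the matching general
 * cache once and then allocate from it directly.
 */
#if 0
static void *alloc_256(void)
{
	kmem_cache_t *cp = kmem_find_general_cachep(256);

	return cp ? kmem_cache_alloc(cp, SLAB_KERNEL) : NULL;
}
#endif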
1877 /**
1878 * kmem_cache_reap - Reclaim memory from caches.
1879 * @gfp_mask: the type of memory required.
1881 * Called from try_to_free_page().
1882 * This function _cannot_ be called from within an interrupt, but it
1883 * can be interrupted.
1884 */
1885 void
1886 kmem_cache_reap(int gfp_mask)
1887 {
1888 kmem_slab_t *slabp;
1889 kmem_cache_t *searchp;
1890 kmem_cache_t *best_cachep;
1891 unsigned int scan;
1892 unsigned int reap_level;
1894 if (in_interrupt()) {
1895 printk("kmem_cache_reap() called within int!\n");
1896 return;
1897 }
1899 /* We really need a test semaphore op so we can avoid sleeping when
1900 * !wait is true.
1901 */
1902 down(&cache_chain_sem);
1904 scan = 10;
1905 reap_level = 0;
1907 best_cachep = NULL;
1908 searchp = clock_searchp;
1909 do {
1910 unsigned int full_free;
1911 unsigned int dma_flag;
1913 /* It's safe to test this without holding the cache-lock. */
1914 if (searchp->c_flags & SLAB_NO_REAP)
1915 goto next;
1916 spin_lock_irq(&searchp->c_spinlock);
1917 if (searchp->c_growing)
1918 goto next_unlock;
1919 if (searchp->c_dflags & SLAB_CFLGS_GROWN) {
1920 searchp->c_dflags &= ~SLAB_CFLGS_GROWN;
1921 goto next_unlock;
1922 }
1923 /* Sanity check for corruption of static values. */
1924 if (searchp->c_inuse || searchp->c_magic != SLAB_C_MAGIC) {
1925 spin_unlock_irq(&searchp->c_spinlock);
1926 printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", searchp->c_name);
1927 goto next;
1928 }
1929 dma_flag = 0;
1930 full_free = 0;
1932 /* Count the fully free slabs. There should not be many,
1933 * since we are holding the cache lock.
1934 */
1935 slabp = searchp->c_lastp;
1936 while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) {
1937 slabp = slabp->s_prevp;
1938 full_free++;
1939 if (slabp->s_dma)
1940 dma_flag++;
1941 }
1942 spin_unlock_irq(&searchp->c_spinlock);
1944 if ((gfp_mask & GFP_DMA) && !dma_flag)
1945 goto next;
1947 if (full_free) {
1948 if (full_free >= 10) {
1949 best_cachep = searchp;
1950 break;
1951 }
1953 /* Try to avoid slabs with constructors and/or
1954 * more than one page per slab (as it can be difficult
1955 * to get high orders from gfp()).
1956 */
1957 if (full_free >= reap_level) {
1958 reap_level = full_free;
1959 best_cachep = searchp;
1960 }
1961 }
1962 goto next;
1963 next_unlock:
1964 spin_unlock_irq(&searchp->c_spinlock);
1965 next:
1966 searchp = searchp->c_nextp;
1967 } while (--scan && searchp != clock_searchp);
1969 clock_searchp = searchp;
1971 if (!best_cachep) {
1972 /* couldn't find anything to reap */
1973 goto out;
1974 }
1976 spin_lock_irq(&best_cachep->c_spinlock);
1977 while (!best_cachep->c_growing &&
1978 !(slabp = best_cachep->c_lastp)->s_inuse &&
1979 slabp != kmem_slab_end(best_cachep)) {
1980 if (gfp_mask & GFP_DMA) {
1981 do {
1982 if (slabp->s_dma)
1983 goto good_dma;
1984 slabp = slabp->s_prevp;
1985 } while (!slabp->s_inuse && slabp != kmem_slab_end(best_cachep));
1987 /* Didn't find a DMA slab (there was a free one -
1988 * it must have become active).
1989 */
1990 goto dma_fail;
1991 good_dma:
1992 }
1993 if (slabp == best_cachep->c_freep)
1994 best_cachep->c_freep = slabp->s_nextp;
1995 kmem_slab_unlink(slabp);
1996 SLAB_STATS_INC_REAPED(best_cachep);
1998 /* Safe to drop the lock. The slab is no longer linked to the
1999 * cache.
2000 */
2001 spin_unlock_irq(&best_cachep->c_spinlock);
2002 kmem_slab_destroy(best_cachep, slabp);
2003 spin_lock_irq(&best_cachep->c_spinlock);
2004 }
2005 dma_fail:
2006 spin_unlock_irq(&best_cachep->c_spinlock);
2007 out:
2008 up(&cache_chain_sem);
2009 return;
2010 }
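/*
 * Editor's sketch (illustrative, not part of the original file): how the
 * page-reclaim path is expected to invoke the reaper, passing its gfp mask
 * so that GFP_DMA requests only reap slabs backed by DMA-capable pages.
 */
#if 0
if (!in_interrupt())
	kmem_cache_reap(gfp_mask);
#endif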
2012 #if SLAB_SELFTEST
2013 /* A few v. simple tests */
2014 static void
2015 kmem_self_test(void)
2016 {
2017 kmem_cache_t *test_cachep;
2019 printk(KERN_INFO "kmem_test() - start\n");
2020 test_cachep = kmem_cache_create("test-cachep", 16, 0, SLAB_RED_ZONE|SLAB_POISON, NULL, NULL);
2021 if (test_cachep) {
2022 char *objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
2023 if (objp) {
2024 /* Write in front and past end, red-zone test. */
2025 *(objp-1) = 1;
2026 *(objp+16) = 1;
2027 kmem_cache_free(test_cachep, objp);
2029 /* Mess up poisoning. */
2030 *objp = 10;
2031 objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
2032 kmem_cache_free(test_cachep, objp);
2034 /* Mess up poisoning (again). */
2035 *objp = 10;
2036 kmem_cache_shrink(test_cachep);
2037 }
2038 }
2039 printk(KERN_INFO "kmem_test() - finished\n");
2040 }
2041 #endif /* SLAB_SELFTEST */
2043 #if defined(CONFIG_PROC_FS)
2044 /**
2045 * get_slabinfo - generates /proc/slabinfo
2046 * @buf: the buffer to write it into
2048 * The contents of the buffer are
2049 * cache-name
2050 * num-active-objs
2051 * total-objs
2052 * num-active-slabs
2053 * total-slabs
2054 * num-pages-per-slab
2055 */
2056 int
2057 get_slabinfo(char *buf)
2058 {
2059 kmem_cache_t *cachep;
2060 kmem_slab_t *slabp;
2061 unsigned long active_objs;
2062 unsigned long save_flags;
2063 unsigned long num_slabs;
2064 unsigned long num_objs;
2065 int len=0;
2066 #if SLAB_STATS
2067 unsigned long active_slabs;
2068 #endif /* SLAB_STATS */
2070 __save_flags(save_flags);
2072 /* Output format version, so at least we can change it without _too_
2073 * many complaints.
2074 */
2075 #if SLAB_STATS
2076 len = sprintf(buf, "slabinfo - version: 1.0 (statistics)\n");
2077 #else
2078 len = sprintf(buf, "slabinfo - version: 1.0\n");
2079 #endif /* SLAB_STATS */
2080 down(&cache_chain_sem);
2081 cachep = &cache_cache;
2082 do {
2083 #if SLAB_STATS
2084 active_slabs = 0;
2085 #endif /* SLAB_STATS */
2086 num_slabs = active_objs = 0;
2087 spin_lock_irq(&cachep->c_spinlock);
2088 for (slabp = cachep->c_firstp; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
2089 active_objs += slabp->s_inuse;
2090 num_slabs++;
2091 #if SLAB_STATS
2092 if (slabp->s_inuse)
2093 active_slabs++;
2094 #endif /* SLAB_STATS */
2095 }
2096 num_objs = cachep->c_num*num_slabs;
2097 #if SLAB_STATS
2098 {
2099 unsigned long errors;
2100 unsigned long high = cachep->c_high_mark;
2101 unsigned long grown = cachep->c_grown;
2102 unsigned long reaped = cachep->c_reaped;
2103 unsigned long allocs = cachep->c_num_allocations;
2104 errors = (unsigned long) atomic_read(&cachep->c_errors);
2105 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
2106 len += sprintf(buf+len, "%-16s %6lu %6lu %6lu %4lu %4lu %4lu %6lu %7lu %5lu %4lu %4lu\n",
2107 cachep->c_name, active_objs, num_objs, cachep->c_offset, active_slabs, num_slabs,
2108 (1<<cachep->c_gfporder)*num_slabs,
2109 high, allocs, grown, reaped, errors);
2110 }
2111 #else
2112 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
2113 len += sprintf(buf+len, "%-17s %6lu %6lu %6lu\n", cachep->c_name, active_objs, num_objs, cachep->c_offset);
2114 #endif /* SLAB_STATS */
2115 } while ((cachep = cachep->c_nextp) != &cache_cache);
2116 up(&cache_chain_sem);
2118 return len;
2119 }
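/*
 * Editor's note (illustrative, not part of the original file): with
 * SLAB_STATS disabled each line written above has the form
 *
 *	<cache-name> <active-objs> <total-objs> <c_offset>
 *
 * while the SLAB_STATS build also reports active/total slabs, pages,
 * the high-water mark and the alloc/grown/reaped/error counters.
 */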
2120 #endif /* CONFIG_PROC_FS */