mm/slab.c

   1 /*
   2  * linux/mm/slab.c
   3  * Written by Mark Hemment, 1996/97.
   4  * (markhe@nextd.demon.co.uk)
   5  *
   6  * 11 April '97.  Started multi-threading - markhe
   7  *      The global cache-chain is protected by the semaphore 'cache_chain_sem'.
   8  *      The sem is only needed when accessing/extending the cache-chain, which
   9  *      can never happen inside an interrupt (kmem_cache_create(),
  10  *      kmem_cache_shrink() and kmem_cache_reap()).
  11  *      This is a medium-term exclusion lock.
  12  *
  13  *      Each cache has its own lock; 'c_spinlock'.  This lock is needed only
  14  *      when accessing non-constant members of a cache-struct.
  15  *      Note: 'constant members' are assigned a value in kmem_cache_create() before
  16  *      the cache is linked into the cache-chain.  The values never change, so not
  17  *      even a multi-reader lock is needed for these members.
  18  *      The c_spinlock is only ever held for a few cycles.
  19  *
  20  *      To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which
  21  *      maybe be sleeping and therefore not holding the semaphore/lock), the
  22  *      c_growing field is used.  This also prevents reaping from a cache.
  23  *
  24  *      Note, caches can _never_ be destroyed.  When a sub-system (eg module) has
  25  *      finished with a cache, it can only be shrunk.  This leaves the cache empty,
  26  *      but already enabled for re-use, eg. during a module re-load.
  27  *
  28  *      Notes:
  29  *              o Constructors/deconstructors are called while the cache-lock
  30  *                is _not_ held.  Therefore they _must_ be threaded.
  31  *              o Constructors must not attempt to allocate memory from the
  32  *                same cache that they are a constructor for - infinite loop!
  33  *                (There is no easy way to trap this.)
  34  *              o The per-cache locks must be obtained with local-interrupts disabled.
  35  *              o When compiled with debug support, and an object-verify (upon release)
  36  *                is request for a cache, the verify-function is called with the cache
  37  *                lock held.  This helps debugging.
  38  *              o The functions called from try_to_free_page() must not attempt
  39  *                to allocate memory from a cache which is being grown.
  40  *                The buffer sub-system might try to allocate memory, via buffer_cachep.
  41  *                As this pri is passed to the SLAB, and then (if necessary) onto the
  42  *                gfp() funcs (which avoid calling try_to_free_page()), no deadlock
  43  *                should happen.
  44  *
  45  *      The positioning of the per-cache lock is tricky.  If the lock is
  46  *      placed on the same h/w cache line as commonly accessed members
  47  *      the number of L1 cache-line faults is reduced.  However, this can
  48  *      lead to the cache-line ping-ponging between processors when the
  49  *      lock is in contention (and the common members are being accessed).
  50  *      Decided to keep it away from common members.
  51  *
  52  *      More fine-graining is possible, with per-slab locks...but this might be
  53  *      taking fine graining too far, but would have the advantage;
  54  *              During most allocs/frees no writes occur to the cache-struct.
  55  *              Therefore a multi-reader/one writer lock could be used (the writer
  56  *              needed when the slab chain is being link/unlinked).
  57  *              As we would not have an exclusion lock for the cache-structure, one
  58  *              would be needed per-slab (for updating s_free ptr, and/or the contents
  59  *              of s_index).
  60  *      The above locking would allow parallel operations to different slabs within
  61  *      the same cache with reduced spinning.
  62  *
  63  *      Per-engine slab caches, backed by a global cache (as in Mach's Zone allocator),
  64  *      would allow most allocations from the same cache to execute in parallel.
  65  *
  66  *      At present, each engine can be growing a cache.  This should be blocked.
  67  *
  68  *      It is not currently 100% safe to examine the page_struct outside of a kernel
  69  *      or global cli lock.  The risk is v. small, and non-fatal.
  70  *
  71  *      Calls to printk() are not 100% safe (the function is not threaded).  However,
  72  *      printk() is only used under an error condition, and the risk is v. small (not
  73  *      sure if the console write functions 'enjoy' executing multiple contexts in
  74  *      parallel.  I guess they don't...).
  75  *      Note, for most calls to printk() any held cache-lock is dropped.  This is not
  76  *      always done for text size reasons - having *_unlock() everywhere is bloat.
  77  */
  78
  79 /*
  80  * An implementation of the Slab Allocator as described in outline in;
  81  *      UNIX Internals: The New Frontiers by Uresh Vahalia
  82  *      Pub: Prentice Hall      ISBN 0-13-101908-2
  83  * or with a little more detail in;
  84  *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  85  *      Jeff Bonwick (Sun Microsystems).
  86  *      Presented at: USENIX Summer 1994 Technical Conference
  87  */
  88
  89 /*
  90  * This implementation deviates from Bonwick's paper as it
  91  * does not use a hash-table for large objects, but rather a per slab
  92  * index to hold the bufctls.  This allows the bufctl structure to
  93  * be small (one word), but limits the number of objects a slab (not
  94  * a cache) can contain when off-slab bufctls are used.  The limit is the
  95  * size of the largest general cache that does not use off-slab bufctls,
  96  * divided by the size of a bufctl.  For 32bit archs, is this 256/4 = 64.
  97  * This is not serious, as it is only for large objects, when it is unwise
  98  * to have too many per slab.
  99  * Note: This limit can be raised by introducing a general cache whose size
 100  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 101  */
 102
 103 #include        <linux/mm.h>
 104 #include        <linux/slab.h>
 105 #include        <linux/interrupt.h>
 106 #include        <linux/config.h>
 107 #include        <linux/init.h>
 108 #include        <linux/smp.h>
 109
 110 #include        <asm/system.h>
 111 #include        <asm/atomic.h>
 112 #include        <asm/spinlock.h>
 113
 114 /* If there is a different PAGE_SIZE around, and it works with this allocator,
 115  * then change the following.
 116  */
 117 #if     (PAGE_SIZE != 8192 && PAGE_SIZE != 4096)
 118 #error  Your page size is probably not correctly supported - please check
 119 #endif
 120
 121 /* SLAB_MGMT_CHECKS     - 1 to enable extra checks in kmem_cache_create().
 122  *                        0 if you wish to reduce memory usage.
 123  *
 124  * SLAB_DEBUG_SUPPORT   - 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE,
 125  *                        SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISON.
 126  *                        0 for faster, smaller, code (especially in the critical paths).
 127  *
 128  * SLAB_STATS           - 1 to collect stats for /proc/slabinfo.
 129  *                        0 for faster, smaller, code (especially in the critical paths).
 130  *
 131  * SLAB_SELFTEST        - 1 to perform a few tests, mainly for development.
 132  */
 133 #define         SLAB_MGMT_CHECKS        1
 134 #define         SLAB_DEBUG_SUPPORT      0
 135 #define         SLAB_STATS              0
 136 #define         SLAB_SELFTEST           0
 137
 138 /* Shouldn't this be in a header file somewhere? */
 139 #define BYTES_PER_WORD          sizeof(void *)
 140
 141 /* Legal flag mask for kmem_cache_create(). */
 142 #if     SLAB_DEBUG_SUPPORT
 143 #if     0
 144 #define SLAB_C_MASK             (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
 145                                  SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP| \
 146                                  SLAB_HIGH_PACK)
 147 #endif
 148 #define SLAB_C_MASK             (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
 149                                  SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
 150 #else
 151 #if     0
 152 #define SLAB_C_MASK             (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP|SLAB_HIGH_PACK)
 153 #endif
 154 #define SLAB_C_MASK             (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
 155 #endif  /* SLAB_DEBUG_SUPPORT */
 156
 157 /* Slab management struct.
 158  * Manages the objs in a slab.  Placed either at the end of mem allocated
 159  * for a slab, or from an internal obj cache (cache_slabp).
 160  * Slabs are chained into a partially ordered list; fully used first, partial
 161  * next, and then fully free slabs.
 162  * The first 4 members are referenced during an alloc/free operation, and
 163  * should always appear on the same cache line.
 164  * Note: The offset between some members _must_ match offsets within
 165  * the kmem_cache_t - see kmem_cache_init() for the checks. */
 166
 167 #define SLAB_OFFSET_BITS        16      /* could make this larger for 64bit archs */
 168
 169 typedef struct kmem_slab_s {
 170         struct kmem_bufctl_s    *s_freep;  /* ptr to first inactive obj in slab */
 171         struct kmem_bufctl_s    *s_index;
 172         unsigned long            s_magic;
 173         unsigned long            s_inuse;  /* num of objs active in slab */
 174
 175         struct kmem_slab_s      *s_nextp;
 176         struct kmem_slab_s      *s_prevp;
 177         void                    *s_mem;    /* addr of first obj in slab */
 178         unsigned long            s_offset:SLAB_OFFSET_BITS,
 179                                  s_dma:1;
 180 } kmem_slab_t;
 181
 182 /* When the slab management is on-slab, this gives the size to use. */
 183 #define slab_align_size         (L1_CACHE_ALIGN(sizeof(kmem_slab_t)))
 184
 185 /* Test for end of slab chain. */
 186 #define kmem_slab_end(x)        ((kmem_slab_t*)&((x)->c_offset))
 187
 188 /* s_magic */
 189 #define SLAB_MAGIC_ALLOC        0xA5C32F2BUL    /* slab is alive */
 190 #define SLAB_MAGIC_DESTROYED    0xB2F23C5AUL    /* slab has been destroyed */
 191
 192 /* Bufctl's are used for linking objs within a slab, identifying what slab an obj
 193  * is in, and the address of the associated obj (for sanity checking with off-slab
 194  * bufctls).  What a bufctl contains depends upon the state of the obj and
 195  * the organisation of the cache.
 196  */
 197 typedef struct kmem_bufctl_s {
 198         union {
 199                 struct kmem_bufctl_s    *buf_nextp;
 200                 kmem_slab_t             *buf_slabp;     /* slab for obj */
 201                 void *                   buf_objp;
 202         } u;
 203 } kmem_bufctl_t;
 204
 205 /* ...shorthand... */
 206 #define buf_nextp       u.buf_nextp
 207 #define buf_slabp       u.buf_slabp
 208 #define buf_objp        u.buf_objp
 209
 210 #if     SLAB_DEBUG_SUPPORT
 211 /* Magic nums for obj red zoning.
 212  * Placed in the first word before and the first word after an obj.
 213  */
 214 #define SLAB_RED_MAGIC1         0x5A2CF071UL    /* when obj is active */
 215 #define SLAB_RED_MAGIC2         0x170FC2A5UL    /* when obj is inactive */
 216
 217 /* ...and for poisoning */
 218 #define SLAB_POISON_BYTE        0x5a            /* byte value for poisoning */
 219 #define SLAB_POISON_END 0xa5            /* end-byte of poisoning */
 220
 221 #endif  /* SLAB_DEBUG_SUPPORT */
 222
 223 /* Cache struct - manages a cache.
 224  * First four members are commonly referenced during an alloc/free operation.
 225  */
 226 struct kmem_cache_s {
 227         kmem_slab_t              *c_freep;      /* first slab w. free objs */
 228         unsigned long             c_flags;      /* constant flags */
 229         unsigned long             c_offset;
 230         unsigned long             c_num;        /* # of objs per slab */
 231
 232         unsigned long             c_magic;
 233         unsigned long             c_inuse;      /* kept at zero */
 234         kmem_slab_t              *c_firstp;     /* first slab in chain */
 235         kmem_slab_t              *c_lastp;      /* last slab in chain */
 236
 237         spinlock_t                c_spinlock;
 238         unsigned long             c_growing;
 239         unsigned long             c_dflags;     /* dynamic flags */
 240         size_t                    c_org_size;
 241         unsigned long             c_gfporder;   /* order of pgs per slab (2^n) */
 242         void (*c_ctor)(void *, kmem_cache_t *, unsigned long); /* constructor func */
 243         void (*c_dtor)(void *, kmem_cache_t *, unsigned long); /* de-constructor func */
 244         unsigned long             c_align;      /* alignment of objs */
 245         size_t                    c_colour;     /* cache colouring range */
 246         size_t                    c_colour_next;/* cache colouring */
 247         unsigned long             c_failures;
 248         const char               *c_name;
 249         struct kmem_cache_s      *c_nextp;
 250         kmem_cache_t             *c_index_cachep;
 251 #if     SLAB_STATS
 252         unsigned long             c_num_active;
 253         unsigned long             c_num_allocations;
 254         unsigned long             c_high_mark;
 255         unsigned long             c_grown;
 256         unsigned long             c_reaped;
 257         atomic_t                  c_errors;
 258 #endif  /* SLAB_STATS */
 259 };
 260
 261 /* internal c_flags */
 262 #define SLAB_CFLGS_OFF_SLAB     0x010000UL      /* slab management in own cache */
 263 #define SLAB_CFLGS_BUFCTL       0x020000UL      /* bufctls in own cache */
 264 #define SLAB_CFLGS_GENERAL      0x080000UL      /* a general cache */
 265
 266 /* c_dflags (dynamic flags).  Need to hold the spinlock to access this member */
 267 #define SLAB_CFLGS_GROWN        0x000002UL      /* don't reap a recently grown */
 268
 269 #define SLAB_OFF_SLAB(x)        ((x) & SLAB_CFLGS_OFF_SLAB)
 270 #define SLAB_BUFCTL(x)          ((x) & SLAB_CFLGS_BUFCTL)
 271 #define SLAB_GROWN(x)           ((x) & SLAB_CFLGS_GROWN)
 272
 273 #if     SLAB_STATS
 274 #define SLAB_STATS_INC_ACTIVE(x)        ((x)->c_num_active++)
 275 #define SLAB_STATS_DEC_ACTIVE(x)        ((x)->c_num_active--)
 276 #define SLAB_STATS_INC_ALLOCED(x)       ((x)->c_num_allocations++)
 277 #define SLAB_STATS_INC_GROWN(x)         ((x)->c_grown++)
 278 #define SLAB_STATS_INC_REAPED(x)        ((x)->c_reaped++)
 279 #define SLAB_STATS_SET_HIGH(x)          do { if ((x)->c_num_active > (x)->c_high_mark) \
 280                                                 (x)->c_high_mark = (x)->c_num_active; \
 281                                         } while (0)
 282 #define SLAB_STATS_INC_ERR(x)           (atomic_inc(&(x)->c_errors))
 283 #else
 284 #define SLAB_STATS_INC_ACTIVE(x)
 285 #define SLAB_STATS_DEC_ACTIVE(x)
 286 #define SLAB_STATS_INC_ALLOCED(x)
 287 #define SLAB_STATS_INC_GROWN(x)
 288 #define SLAB_STATS_INC_REAPED(x)
 289 #define SLAB_STATS_SET_HIGH(x)
 290 #define SLAB_STATS_INC_ERR(x)
 291 #endif  /* SLAB_STATS */
 292
 293 #if     SLAB_SELFTEST
 294 #if     !SLAB_DEBUG_SUPPORT
 295 #error  Debug support needed for self-test
 296 #endif
 297 static void kmem_self_test(void);
 298 #endif  /* SLAB_SELFTEST */
 299
 300 /* c_magic - used to detect 'out of slabs' in __kmem_cache_alloc() */
 301 #define SLAB_C_MAGIC            0x4F17A36DUL
 302
 303 /* maximum size of an obj (in 2^order pages) */
 304 #define SLAB_OBJ_MAX_ORDER      5       /* 32 pages */
 305
 306 /* maximum num of pages for a slab (prevents large requests to the VM layer) */
 307 #define SLAB_MAX_GFP_ORDER      5       /* 32 pages */
 308
 309 /* the 'preferred' minimum num of objs per slab - maybe less for large objs */
 310 #define SLAB_MIN_OBJS_PER_SLAB  4
 311
 312 /* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB,
 313  * then the page order must be less than this before trying the next order.
 314  */
 315 #define SLAB_BREAK_GFP_ORDER_HI 2
 316 #define SLAB_BREAK_GFP_ORDER_LO 1
 317 static int slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_LO;
 318
 319 /* Macros for storing/retrieving the cachep and or slab from the
 320  * global 'mem_map'.  With off-slab bufctls, these are used to find the
 321  * slab an obj belongs to.  With kmalloc(), and kfree(), these are used
 322  * to find the cache which an obj belongs to.
 323  */
 324 #define SLAB_SET_PAGE_CACHE(pg, x)      ((pg)->next = (struct page *)(x))
 325 #define SLAB_GET_PAGE_CACHE(pg)         ((kmem_cache_t *)(pg)->next)
 326 #define SLAB_SET_PAGE_SLAB(pg, x)       ((pg)->prev = (struct page *)(x))
 327 #define SLAB_GET_PAGE_SLAB(pg)          ((kmem_slab_t *)(pg)->prev)
 328
 329 /* Size description struct for general caches. */
 330 typedef struct cache_sizes {
 331         size_t           cs_size;
 332         kmem_cache_t    *cs_cachep;
 333 } cache_sizes_t;
 334
 335 static cache_sizes_t cache_sizes[] = {
 336 #if     PAGE_SIZE == 4096
 337         {  32,          NULL},
 338 #endif
 339         {  64,          NULL},
 340         { 128,          NULL},
 341         { 256,          NULL},
 342         { 512,          NULL},
 343         {1024,          NULL},
 344         {2048,          NULL},
 345         {4096,          NULL},
 346         {8192,          NULL},
 347         {16384,         NULL},
 348         {32768,         NULL},
 349         {65536,         NULL},
 350         {131072,        NULL},
 351         {0,             NULL}
 352 };
 353
 354 /* Names for the general caches.  Not placed into the sizes struct for
 355  * a good reason; the string ptr is not needed while searching in kmalloc(),
 356  * and would 'get-in-the-way' in the h/w cache.
 357  */
 358 static char *cache_sizes_name[] = {
 359 #if     PAGE_SIZE == 4096
 360         "size-32",
 361 #endif
 362         "size-64",
 363         "size-128",
 364         "size-256",
 365         "size-512",
 366         "size-1024",
 367         "size-2048",
 368         "size-4096",
 369         "size-8192",
 370         "size-16384",
 371         "size-32768",
 372         "size-65536",
 373         "size-131072"
 374 };
 375
 376 /* internal cache of cache description objs */
 377 static  kmem_cache_t    cache_cache = {
 378 /* freep, flags */              kmem_slab_end(&cache_cache), SLAB_NO_REAP,
 379 /* offset, num */               sizeof(kmem_cache_t),   0,
 380 /* c_magic, c_inuse */          SLAB_C_MAGIC, 0,
 381 /* firstp, lastp */             kmem_slab_end(&cache_cache), kmem_slab_end(&cache_cache),
 382 /* spinlock */                  SPIN_LOCK_UNLOCKED,
 383 /* growing */                   0,
 384 /* dflags */                    0,
 385 /* org_size, gfp */             0, 0,
 386 /* ctor, dtor, align */         NULL, NULL, L1_CACHE_BYTES,
 387 /* colour, colour_next */       0, 0,
 388 /* failures */                  0,
 389 /* name */                      "kmem_cache",
 390 /* nextp */                     &cache_cache,
 391 /* index */                     NULL,
 392 };
 393
 394 /* Guard access to the cache-chain. */
 395 static struct semaphore cache_chain_sem;
 396
 397 /* Place maintainer for reaping. */
 398 static  kmem_cache_t    *clock_searchp = &cache_cache;
 399
 400 /* Internal slab management cache, for when slab management is off-slab. */
 401 static kmem_cache_t     *cache_slabp = NULL;
 402
 403 /* Max number of objs-per-slab for caches which use bufctl's.
 404  * Needed to avoid a possible looping condition in kmem_cache_grow().
 405  */
 406 static unsigned long bufctl_limit = 0;
 407
 408 /* Initialisation - setup the `cache' cache. */
 409 long __init kmem_cache_init(long start, long end)
 410 {
 411         size_t size, i;
 412
 413 #define kmem_slab_offset(x)  ((unsigned long)&((kmem_slab_t *)0)->x)
 414 #define kmem_slab_diff(a,b)  (kmem_slab_offset(a) - kmem_slab_offset(b))
 415 #define kmem_cache_offset(x) ((unsigned long)&((kmem_cache_t *)0)->x)
 416 #define kmem_cache_diff(a,b) (kmem_cache_offset(a) - kmem_cache_offset(b))
 417
 418         /* Sanity checks... */
 419         if (kmem_cache_diff(c_firstp, c_magic) != kmem_slab_diff(s_nextp, s_magic) ||
 420             kmem_cache_diff(c_firstp, c_inuse) != kmem_slab_diff(s_nextp, s_inuse) ||
 421             ((kmem_cache_offset(c_lastp) -
 422               ((unsigned long) kmem_slab_end((kmem_cache_t*)NULL))) !=
 423              kmem_slab_offset(s_prevp)) ||
 424             kmem_cache_diff(c_lastp, c_firstp) != kmem_slab_diff(s_prevp, s_nextp)) {
 425                 /* Offsets to the magic are incorrect, either the structures have
 426                  * been incorrectly changed, or adjustments are needed for your
 427                  * architecture.
 428                  */
 429                 panic("kmem_cache_init(): Offsets are wrong - I've been messed with!");
 430                 /* NOTREACHED */
 431         }
 432 #undef  kmem_cache_offset
 433 #undef  kmem_cache_diff
 434 #undef  kmem_slab_offset
 435 #undef  kmem_slab_diff
 436
 437         cache_chain_sem = MUTEX;
 438
 439         size = cache_cache.c_offset + sizeof(kmem_bufctl_t);
 440         size += (L1_CACHE_BYTES-1);
 441         size &= ~(L1_CACHE_BYTES-1);
 442         cache_cache.c_offset = size-sizeof(kmem_bufctl_t);
 443
 444         i = (PAGE_SIZE<<cache_cache.c_gfporder)-slab_align_size;
 445         cache_cache.c_num = i / size;   /* num of objs per slab */
 446
 447         /* Cache colouring. */
 448         cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES;
 449         cache_cache.c_colour_next = cache_cache.c_colour;
 450
 451         /*
 452          * Fragmentation resistance on low memory - only use bigger
 453          * page orders on machines with more than 32MB of memory.
 454          */
 455         if (num_physpages > (32 << 20) >> PAGE_SHIFT)
 456                 slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_HI;
 457         return start;
 458 }
 459
 460 /* Initialisation - setup remaining internal and general caches.
 461  * Called after the gfp() functions have been enabled, and before smp_init().
 462  */
 463 void __init kmem_cache_sizes_init(void)
 464 {
 465         unsigned int    found = 0;
 466
 467         cache_slabp = kmem_cache_create("slab_cache", sizeof(kmem_slab_t),
 468                                         0, SLAB_HWCACHE_ALIGN, NULL, NULL);
 469         if (cache_slabp) {
 470                 char **names = cache_sizes_name;
 471                 cache_sizes_t *sizes = cache_sizes;
 472                 do {
 473                         /* For performance, all the general caches are L1 aligned.
 474                          * This should be particularly beneficial on SMP boxes, as it
 475                          * eliminates "false sharing".
 476                          * Note for systems short on memory removing the alignment will
 477                          * allow tighter packing of the smaller caches. */
 478                         if (!(sizes->cs_cachep =
 479                               kmem_cache_create(*names++, sizes->cs_size,
 480                                                 0, SLAB_HWCACHE_ALIGN, NULL, NULL)))
 481                                 goto panic_time;
 482                         if (!found) {
 483                                 /* Inc off-slab bufctl limit until the ceiling is hit. */
 484                                 if (SLAB_BUFCTL(sizes->cs_cachep->c_flags))
 485                                         found++;
 486                                 else
 487                                         bufctl_limit =
 488                                                 (sizes->cs_size/sizeof(kmem_bufctl_t));
 489                         }
 490                         sizes->cs_cachep->c_flags |= SLAB_CFLGS_GENERAL;
 491                         sizes++;
 492                 } while (sizes->cs_size);
 493 #if     SLAB_SELFTEST
 494                 kmem_self_test();
 495 #endif  /* SLAB_SELFTEST */
 496                 return;
 497         }
 498 panic_time:
 499         panic("kmem_cache_sizes_init: Error creating caches");
 500         /* NOTREACHED */
 501 }
 502
 503 /* Interface to system's page allocator.  Dma pts to non-zero if all
 504  * of memory is DMAable. No need to hold the cache-lock.
 505  */
 506 static inline void *
 507 kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma)
 508 {
 509         void    *addr;
 510
 511         *dma = flags & SLAB_DMA;
 512         addr = (void*) __get_free_pages(flags, cachep->c_gfporder);
 513         /* Assume that now we have the pages no one else can legally
 514          * messes with the 'struct page's.
 515          * However vm_scan() might try to test the structure to see if
 516          * it is a named-page or buffer-page.  The members it tests are
 517          * of no interest here.....
 518          */
 519         if (!*dma && addr) {
 520                 /* Need to check if can dma. */
 521                 struct page *page = mem_map + MAP_NR(addr);
 522                 *dma = 1<<cachep->c_gfporder;
 523                 while ((*dma)--) {
 524                         if (!PageDMA(page)) {
 525                                 *dma = 0;
 526                                 break;
 527                         }
 528                         page++;
 529                 }
 530         }
 531         return addr;
 532 }
 533
 534 /* Interface to system's page release. */
 535 static inline void
 536 kmem_freepages(kmem_cache_t *cachep, void *addr)
 537 {
 538         unsigned long i = (1<<cachep->c_gfporder);
 539         struct page *page = &mem_map[MAP_NR(addr)];
 540
 541         /* free_pages() does not clear the type bit - we do that.
 542          * The pages have been unlinked from their cache-slab,
 543          * but their 'struct page's might be accessed in
 544          * vm_scan(). Shouldn't be a worry.
 545          */
 546         while (i--) {
 547                 PageClearSlab(page);
 548                 page++;
 549         }
 550         free_pages((unsigned long)addr, cachep->c_gfporder);
 551 }
 552
 553 #if     SLAB_DEBUG_SUPPORT
 554 static inline void
 555 kmem_poison_obj(kmem_cache_t *cachep, void *addr)
 556 {
 557         memset(addr, SLAB_POISON_BYTE, cachep->c_org_size);
 558         *(unsigned char *)(addr+cachep->c_org_size-1) = SLAB_POISON_END;
 559 }
 560
 561 static inline int
 562 kmem_check_poison_obj(kmem_cache_t *cachep, void *addr)
 563 {
 564         void *end;
 565         end = memchr(addr, SLAB_POISON_END, cachep->c_org_size);
 566         if (end != (addr+cachep->c_org_size-1))
 567                 return 1;
 568         return 0;
 569 }
 570 #endif  /* SLAB_DEBUG_SUPPORT */
 571
 572 /* Three slab chain funcs - all called with ints disabled and the appropriate
 573  * cache-lock held.
 574  */
 575 static inline void
 576 kmem_slab_unlink(kmem_slab_t *slabp)
 577 {
 578         kmem_slab_t     *prevp = slabp->s_prevp;
 579         kmem_slab_t     *nextp = slabp->s_nextp;
 580         prevp->s_nextp = nextp;
 581         nextp->s_prevp = prevp;
 582 }
 583
 584 static inline void
 585 kmem_slab_link_end(kmem_cache_t *cachep, kmem_slab_t *slabp)
 586 {
 587         kmem_slab_t     *lastp = cachep->c_lastp;
 588         slabp->s_nextp = kmem_slab_end(cachep);
 589         slabp->s_prevp = lastp;
 590         cachep->c_lastp = slabp;
 591         lastp->s_nextp = slabp;
 592 }
 593
 594 static inline void
 595 kmem_slab_link_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
 596 {
 597         kmem_slab_t     *nextp = cachep->c_freep;
 598         kmem_slab_t     *prevp = nextp->s_prevp;
 599         slabp->s_nextp = nextp;
 600         slabp->s_prevp = prevp;
 601         nextp->s_prevp = slabp;
 602         slabp->s_prevp->s_nextp = slabp;
 603 }
 604
 605 /* Destroy all the objs in a slab, and release the mem back to the system.
 606  * Before calling the slab must have been unlinked from the cache.
 607  * The cache-lock is not held/needed.
 608  */
 609 static void
 610 kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
 611 {
 612         if (cachep->c_dtor
 613 #if     SLAB_DEBUG_SUPPORT
 614                 || cachep->c_flags & (SLAB_POISON | SLAB_RED_ZONE)
 615 #endif  /*SLAB_DEBUG_SUPPORT*/
 616         ) {
 617                 /* Doesn't use the bufctl ptrs to find objs. */
 618                 unsigned long num = cachep->c_num;
 619                 void *objp = slabp->s_mem;
 620                 do {
 621 #if     SLAB_DEBUG_SUPPORT
 622                         if (cachep->c_flags & SLAB_RED_ZONE) {
 623                                 if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1)
 624                                         printk(KERN_ERR "kmem_slab_destroy: "
 625                                                "Bad front redzone - %s\n",
 626                                                cachep->c_name);
 627                                 objp += BYTES_PER_WORD;
 628                                 if (*((unsigned long*)(objp+cachep->c_org_size)) !=
 629                                     SLAB_RED_MAGIC1)
 630                                         printk(KERN_ERR "kmem_slab_destroy: "
 631                                                "Bad rear redzone - %s\n",
 632                                                cachep->c_name);
 633                         }
 634                         if (cachep->c_dtor)
 635 #endif  /*SLAB_DEBUG_SUPPORT*/
 636                                 (cachep->c_dtor)(objp, cachep, 0);
 637 #if     SLAB_DEBUG_SUPPORT
 638                         else if (cachep->c_flags & SLAB_POISON) {
 639                                 if (kmem_check_poison_obj(cachep, objp))
 640                                         printk(KERN_ERR "kmem_slab_destroy: "
 641                                                "Bad poison - %s\n", cachep->c_name);
 642                         }
 643                         if (cachep->c_flags & SLAB_RED_ZONE)
 644                                 objp -= BYTES_PER_WORD;
 645 #endif  /* SLAB_DEBUG_SUPPORT */
 646                         objp += cachep->c_offset;
 647                         if (!slabp->s_index)
 648                                 objp += sizeof(kmem_bufctl_t);
 649                 } while (--num);
 650         }
 651
 652         slabp->s_magic = SLAB_MAGIC_DESTROYED;
 653         kmem_freepages(cachep, slabp->s_mem-slabp->s_offset);
 654         if (slabp->s_index)
 655                 kmem_cache_free(cachep->c_index_cachep, slabp->s_index);
 656         if (SLAB_OFF_SLAB(cachep->c_flags))
 657                 kmem_cache_free(cache_slabp, slabp);
 658 }
 659
 660 /* Cal the num objs, wastage, and bytes left over for a given slab size. */
 661 static inline size_t
 662 kmem_cache_cal_waste(unsigned long gfporder, size_t size, size_t extra,
 663                      unsigned long flags, size_t *left_over, unsigned long *num)
 664 {
 665         size_t wastage = PAGE_SIZE<<gfporder;
 666
 667         if (SLAB_OFF_SLAB(flags))
 668                 gfporder = 0;
 669         else
 670                 gfporder = slab_align_size;
 671         wastage -= gfporder;
 672         *num = wastage / size;
 673         wastage -= (*num * size);
 674         *left_over = wastage;
 675
 676         return (wastage + gfporder + (extra * *num));
 677 }
 678
 679 /* Create a cache:
 680  * Returns a ptr to the cache on success, NULL on failure.
 681  * Cannot be called within a int, but can be interrupted.
 682  * NOTE: The 'name' is assumed to be memory that is _not_  going to disappear.
 683  */
 684 kmem_cache_t *
 685 kmem_cache_create(const char *name, size_t size, size_t offset,
 686         unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
 687         void (*dtor)(void*, kmem_cache_t *, unsigned long))
 688 {
 689         const char *func_nm= KERN_ERR "kmem_create: ";
 690         kmem_cache_t    *searchp;
 691         kmem_cache_t    *cachep=NULL;
 692         size_t          extra;
 693         size_t          left_over;
 694         size_t          align;
 695
 696         /* Sanity checks... */
 697 #if     SLAB_MGMT_CHECKS
 698         if (!name) {
 699                 printk("%sNULL ptr\n", func_nm);
 700                 goto opps;
 701         }
 702         if (in_interrupt()) {
 703                 printk("%sCalled during int - %s\n", func_nm, name);
 704                 goto opps;
 705         }
 706
 707         if (size < BYTES_PER_WORD) {
 708                 printk("%sSize too small %d - %s\n", func_nm, (int) size, name);
 709                 size = BYTES_PER_WORD;
 710         }
 711
 712         if (size > ((1<<SLAB_OBJ_MAX_ORDER)*PAGE_SIZE)) {
 713                 printk("%sSize too large %d - %s\n", func_nm, (int) size, name);
 714                 goto opps;
 715         }
 716
 717         if (dtor && !ctor) {
 718                 /* Decon, but no con - doesn't make sense */
 719                 printk("%sDecon but no con - %s\n", func_nm, name);
 720                 goto opps;
 721         }
 722
 723         if (offset < 0 || offset > size) {
 724                 printk("%sOffset weird %d - %s\n", func_nm, (int) offset, name);
 725                 offset = 0;
 726         }
 727
 728 #if     SLAB_DEBUG_SUPPORT
 729         if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
 730                 /* No constructor, but inital state check requested */
 731                 printk("%sNo con, but init state check requested - %s\n", func_nm, name);
 732                 flags &= ~SLAB_DEBUG_INITIAL;
 733         }
 734
 735         if ((flags & SLAB_POISON) && ctor) {
 736                 /* request for poisoning, but we can't do that with a constructor */
 737                 printk("%sPoisoning requested, but con given - %s\n", func_nm, name);
 738                 flags &= ~SLAB_POISON;
 739         }
 740 #if     0
 741         if ((flags & SLAB_HIGH_PACK) && ctor) {
 742                 printk("%sHigh pack requested, but con given - %s\n", func_nm, name);
 743                 flags &= ~SLAB_HIGH_PACK;
 744         }
 745         if ((flags & SLAB_HIGH_PACK) && (flags & (SLAB_POISON|SLAB_RED_ZONE))) {
 746                 printk("%sHigh pack requested, but with poisoning/red-zoning - %s\n",
 747                        func_nm, name);
 748                 flags &= ~SLAB_HIGH_PACK;
 749         }
 750 #endif
 751 #endif  /* SLAB_DEBUG_SUPPORT */
 752 #endif  /* SLAB_MGMT_CHECKS */
 753
 754         /* Always checks flags, a caller might be expecting debug
 755          * support which isn't available.
 756          */
 757         if (flags & ~SLAB_C_MASK) {
 758                 printk("%sIllgl flg %lX - %s\n", func_nm, flags, name);
 759                 flags &= SLAB_C_MASK;
 760         }
 761
 762         /* Get cache's description obj. */
 763         cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
 764         if (!cachep)
 765                 goto opps;
 766         memset(cachep, 0, sizeof(kmem_cache_t));
 767
 768         /* Check that size is in terms of words.  This is needed to avoid
 769          * unaligned accesses for some archs when redzoning is used, and makes
 770          * sure any on-slab bufctl's are also correctly aligned.
 771          */
 772         if (size & (BYTES_PER_WORD-1)) {
 773                 size += (BYTES_PER_WORD-1);
 774                 size &= ~(BYTES_PER_WORD-1);
 775                 printk("%sForcing size word alignment - %s\n", func_nm, name);
 776         }
 777
 778         cachep->c_org_size = size;
 779 #if     SLAB_DEBUG_SUPPORT
 780         if (flags & SLAB_RED_ZONE) {
 781                 /* There is no point trying to honour cache alignment when redzoning. */
 782                 flags &= ~SLAB_HWCACHE_ALIGN;
 783                 size += 2*BYTES_PER_WORD;               /* words for redzone */
 784         }
 785 #endif  /* SLAB_DEBUG_SUPPORT */
 786
 787         align = BYTES_PER_WORD;
 788         if (flags & SLAB_HWCACHE_ALIGN)
 789                 align = L1_CACHE_BYTES;
 790
 791         /* Determine if the slab management and/or bufclts are 'on' or 'off' slab. */
 792         extra = sizeof(kmem_bufctl_t);
 793         if (size < (PAGE_SIZE>>3)) {
 794                 /* Size is small(ish).  Use packing where bufctl size per
 795                  * obj is low, and slab management is on-slab.
 796                  */
 797 #if     0
 798                 if ((flags & SLAB_HIGH_PACK)) {
 799                         /* Special high packing for small objects
 800                          * (mainly for vm_mapping structs, but
 801                          * others can use it).
 802                          */
 803                         if (size == (L1_CACHE_BYTES/4) || size == (L1_CACHE_BYTES/2) ||
 804                             size == L1_CACHE_BYTES) {
 805                                 /* The bufctl is stored with the object. */
 806                                 extra = 0;
 807                         } else
 808                                 flags &= ~SLAB_HIGH_PACK;
 809                 }
 810 #endif
 811         } else {
 812                 /* Size is large, assume best to place the slab management obj
 813                  * off-slab (should allow better packing of objs).
 814                  */
 815                 flags |= SLAB_CFLGS_OFF_SLAB;
 816                 if (!(size & ~PAGE_MASK) || size == (PAGE_SIZE/2)
 817                     || size == (PAGE_SIZE/4) || size == (PAGE_SIZE/8)) {
 818                         /* To avoid waste the bufctls are off-slab... */
 819                         flags |= SLAB_CFLGS_BUFCTL;
 820                         extra = 0;
 821                 } /* else slab management is off-slab, but freelist pointers are on. */
 822         }
 823         size += extra;
 824
 825         if (flags & SLAB_HWCACHE_ALIGN) {
 826                 /* Need to adjust size so that objs are cache aligned. */
 827                 if (size > (L1_CACHE_BYTES/2)) {
 828                         size_t words = size % L1_CACHE_BYTES;
 829                         if (words)
 830                                 size += (L1_CACHE_BYTES-words);
 831                 } else {
 832                         /* Small obj size, can get at least two per cache line. */
 833                         int num_per_line = L1_CACHE_BYTES/size;
 834                         left_over = L1_CACHE_BYTES - (num_per_line*size);
 835                         if (left_over) {
 836                                 /* Need to adjust size so objs cache align. */
 837                                 if (left_over%num_per_line) {
 838                                         /* Odd num of objs per line - fixup. */
 839                                         num_per_line--;
 840                                         left_over += size;
 841                                 }
 842                                 size += (left_over/num_per_line);
 843                         }
 844                 }
 845         } else if (!(size%L1_CACHE_BYTES)) {
 846                 /* Size happens to cache align... */
 847                 flags |= SLAB_HWCACHE_ALIGN;
 848                 align = L1_CACHE_BYTES;
 849         }
 850
 851         /* Cal size (in pages) of slabs, and the num of objs per slab.
 852          * This could be made much more intelligent.  For now, try to avoid
 853          * using high page-orders for slabs.  When the gfp() funcs are more
 854          * friendly towards high-order requests, this should be changed.
 855          */
 856         do {
 857                 size_t wastage;
 858                 unsigned int break_flag = 0;
 859 cal_wastage:
 860                 wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, extra,
 861                                                flags, &left_over, &cachep->c_num);
 862                 if (!cachep->c_num)
 863                         goto next;
 864                 if (break_flag)
 865                         break;
 866                 if (SLAB_BUFCTL(flags) && cachep->c_num > bufctl_limit) {
 867                         /* Oops, this num of objs will cause problems. */
 868                         cachep->c_gfporder--;
 869                         break_flag++;
 870                         goto cal_wastage;
 871                 }
 872                 if (cachep->c_gfporder == SLAB_MAX_GFP_ORDER)
 873                         break;
 874
 875                 /* Large num of objs is good, but v. large slabs are currently
 876                  * bad for the gfp()s.
 877                  */
 878                 if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) {
 879                         if (cachep->c_gfporder < slab_break_gfp_order)
 880                                 goto next;
 881                 }
 882
 883                 /* Stop caches with small objs having a large num of pages. */
 884                 if (left_over <= slab_align_size)
 885                         break;
 886                 if ((wastage*8) <= (PAGE_SIZE<<cachep->c_gfporder))
 887                         break;  /* Acceptable internal fragmentation. */
 888 next:
 889                 cachep->c_gfporder++;
 890         } while (1);
 891
 892         /* If the slab has been placed off-slab, and we have enough space then
 893          * move it on-slab.  This is at the expense of any extra colouring.
 894          */
 895         if ((flags & SLAB_CFLGS_OFF_SLAB) && !SLAB_BUFCTL(flags) &&
 896             left_over >= slab_align_size) {
 897                 flags &= ~SLAB_CFLGS_OFF_SLAB;
 898                 left_over -= slab_align_size;
 899         }
 900
 901         /* Offset must be a factor of the alignment. */
 902         offset += (align-1);
 903         offset &= ~(align-1);
 904
 905         /* Mess around with the offset alignment. */
 906         if (!left_over) {
 907                 offset = 0;
 908         } else if (left_over < offset) {
 909                 offset = align;
 910                 if (flags & SLAB_HWCACHE_ALIGN) {
 911                         if (left_over < offset)
 912                                 offset = 0;
 913                 } else {
 914                         /* Offset is BYTES_PER_WORD, and left_over is at
 915                          * least BYTES_PER_WORD.
 916                          */
 917                         if (left_over >= (BYTES_PER_WORD*2)) {
 918                                 offset >>= 1;
 919                                 if (left_over >= (BYTES_PER_WORD*4))
 920                                         offset >>= 1;
 921                         }
 922                 }
 923         } else if (!offset) {
 924                 /* No offset requested, but space enough - give one. */
 925                 offset = left_over/align;
 926                 if (flags & SLAB_HWCACHE_ALIGN) {
 927                         if (offset >= 8) {
 928                                 /* A large number of colours - use a larger alignment. */
 929                                 align <<= 1;
 930                         }
 931                 } else {
 932                         if (offset >= 10) {
 933                                 align <<= 1;
 934                                 if (offset >= 16)
 935                                         align <<= 1;
 936                         }
 937                 }
 938                 offset = align;
 939         }
 940
 941 #if     0
 942 printk("%s: Left_over:%d Align:%d Size:%d\n", name, left_over, offset, size);
 943 #endif
 944
 945         if ((cachep->c_align = (unsigned long) offset))
 946                 cachep->c_colour = (left_over/offset);
 947         cachep->c_colour_next = cachep->c_colour;
 948
 949         /* If the bufctl's are on-slab, c_offset does not include the size of bufctl. */
 950         if (!SLAB_BUFCTL(flags))
 951                 size -= sizeof(kmem_bufctl_t);
 952         else
 953                 cachep->c_index_cachep =
 954                         kmem_find_general_cachep(cachep->c_num*sizeof(kmem_bufctl_t));
 955         cachep->c_offset = (unsigned long) size;
 956         cachep->c_freep = kmem_slab_end(cachep);
 957         cachep->c_firstp = kmem_slab_end(cachep);
 958         cachep->c_lastp = kmem_slab_end(cachep);
 959         cachep->c_flags = flags;
 960         cachep->c_ctor = ctor;
 961         cachep->c_dtor = dtor;
 962         cachep->c_magic = SLAB_C_MAGIC;
 963         cachep->c_name = name;          /* Simply point to the name. */
 964         spin_lock_init(&cachep->c_spinlock);
 965
 966         /* Need the semaphore to access the chain. */
 967         down(&cache_chain_sem);
 968         searchp = &cache_cache;
 969         do {
 970                 /* The name field is constant - no lock needed. */
 971                 if (!strcmp(searchp->c_name, name)) {
 972                         printk("%sDup name - %s\n", func_nm, name);
 973                         break;
 974                 }
 975                 searchp = searchp->c_nextp;
 976         } while (searchp != &cache_cache);
 977
 978         /* There is no reason to lock our new cache before we
 979          * link it in - no one knows about it yet...
 980          */
 981         cachep->c_nextp = cache_cache.c_nextp;
 982         cache_cache.c_nextp = cachep;
 983         up(&cache_chain_sem);
 984 opps:
 985         return cachep;
 986 }
 987
 988 /* Shrink a cache.  Releases as many slabs as possible for a cache.
 989  * It is expected this function will be called by a module when it is
 990  * unloaded.  The cache is _not_ removed, this creates too many problems and
 991  * the cache-structure does not take up much room.  A module should keep its
 992  * cache pointer(s) in unloaded memory, so when reloaded it knows the cache
 993  * is available.  To help debugging, a zero exit status indicates all slabs
 994  * were released.
 995  */
 996 int
 997 kmem_cache_shrink(kmem_cache_t *cachep)
 998 {
 999         kmem_cache_t    *searchp;
1000         kmem_slab_t     *slabp;
1001         int     ret;
1002
1003         if (!cachep) {
1004                 printk(KERN_ERR "kmem_shrink: NULL ptr\n");
1005                 return 2;
1006         }
1007         if (in_interrupt()) {
1008                 printk(KERN_ERR "kmem_shrink: Called during int - %s\n", cachep->c_name);
1009                 return 2;
1010         }
1011
1012         /* Find the cache in the chain of caches. */
1013         down(&cache_chain_sem);         /* Semaphore is needed. */
1014         searchp = &cache_cache;
1015         for (;searchp->c_nextp != &cache_cache; searchp = searchp->c_nextp) {
1016                 if (searchp->c_nextp != cachep)
1017                         continue;
1018
1019                 /* Accessing clock_searchp is safe - we hold the mutex. */
1020                 if (cachep == clock_searchp)
1021                         clock_searchp = cachep->c_nextp;
1022                 goto found;
1023         }
1024         up(&cache_chain_sem);
1025         printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep);
1026         return 2;
1027 found:
1028         /* Release the semaphore before getting the cache-lock.  This could
1029          * mean multiple engines are shrinking the cache, but so what.
1030          */
1031         up(&cache_chain_sem);
1032         spin_lock_irq(&cachep->c_spinlock);
1033
1034         /* If the cache is growing, stop shrinking. */
1035         while (!cachep->c_growing) {
1036                 slabp = cachep->c_lastp;
1037                 if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
1038                         break;
1039                 kmem_slab_unlink(slabp);
1040                 spin_unlock_irq(&cachep->c_spinlock);
1041                 kmem_slab_destroy(cachep, slabp);
1042                 spin_lock_irq(&cachep->c_spinlock);
1043         }
1044         ret = 1;
1045         if (cachep->c_lastp == kmem_slab_end(cachep))
1046                 ret--;          /* Cache is empty. */
1047         spin_unlock_irq(&cachep->c_spinlock);
1048         return ret;
1049 }
1050
1051 /* Get the memory for a slab management obj. */
1052 static inline kmem_slab_t *
1053 kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags)
1054 {
1055         kmem_slab_t     *slabp;
1056
1057         if (SLAB_OFF_SLAB(cachep->c_flags)) {
1058                 /* Slab management obj is off-slab. */
1059                 slabp = kmem_cache_alloc(cache_slabp, local_flags);
1060         } else {
1061                 /* Slab management at end of slab memory, placed so that
1062                  * the position is 'coloured'.
1063                  */
1064                 void *end;
1065                 end = objp + (cachep->c_num * cachep->c_offset);
1066                 if (!SLAB_BUFCTL(cachep->c_flags))
1067                         end += (cachep->c_num * sizeof(kmem_bufctl_t));
1068                 slabp = (kmem_slab_t *) L1_CACHE_ALIGN((unsigned long)end);
1069         }
1070
1071         if (slabp) {
1072                 slabp->s_inuse = 0;
1073                 slabp->s_dma = 0;
1074                 slabp->s_index = NULL;
1075         }
1076
1077         return slabp;
1078 }
1079
1080 static inline void
1081 kmem_cache_init_objs(kmem_cache_t * cachep, kmem_slab_t * slabp, void *objp,
1082                                 unsigned long ctor_flags)
1083 {
1084         kmem_bufctl_t   **bufpp = &slabp->s_freep;
1085         unsigned long   num = cachep->c_num-1;
1086
1087         do {
1088 #if     SLAB_DEBUG_SUPPORT
1089                 if (cachep->c_flags & SLAB_RED_ZONE) {
1090                         *((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
1091                         objp += BYTES_PER_WORD;
1092                         *((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1;
1093                 }
1094 #endif  /* SLAB_DEBUG_SUPPORT */
1095
1096                 /* Constructors are not allowed to allocate memory from the same cache
1097                  * which they are a constructor for.  Otherwise, deadlock.
1098                  * They must also be threaded.
1099                  */
1100                 if (cachep->c_ctor)
1101                         cachep->c_ctor(objp, cachep, ctor_flags);
1102 #if     SLAB_DEBUG_SUPPORT
1103                 else if (cachep->c_flags & SLAB_POISON) {
1104                         /* need to poison the objs */
1105                         kmem_poison_obj(cachep, objp);
1106                 }
1107
1108                 if (cachep->c_flags & SLAB_RED_ZONE) {
1109                         if (*((unsigned long*)(objp+cachep->c_org_size)) !=
1110                             SLAB_RED_MAGIC1) {
1111                                 *((unsigned long*)(objp+cachep->c_org_size)) =
1112                                         SLAB_RED_MAGIC1;
1113                                 printk(KERN_ERR "kmem_init_obj: Bad rear redzone "
1114                                        "after constructor - %s\n", cachep->c_name);
1115                         }
1116                         objp -= BYTES_PER_WORD;
1117                         if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) {
1118                                 *((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
1119                                 printk(KERN_ERR "kmem_init_obj: Bad front redzone "
1120                                        "after constructor - %s\n", cachep->c_name);
1121                         }
1122                 }
1123 #endif  /* SLAB_DEBUG_SUPPORT */
1124
1125                 objp += cachep->c_offset;
1126                 if (!slabp->s_index) {
1127                         *bufpp = objp;
1128                         objp += sizeof(kmem_bufctl_t);
1129                 } else
1130                         *bufpp = &slabp->s_index[num];
1131                 bufpp = &(*bufpp)->buf_nextp;
1132         } while (num--);
1133
1134         *bufpp = NULL;
1135 }
1136
1137 /* Grow (by 1) the number of slabs within a cache.  This is called by
1138  * kmem_cache_alloc() when there are no active objs left in a cache.
1139  */
1140 static int
1141 kmem_cache_grow(kmem_cache_t * cachep, int flags)
1142 {
1143         kmem_slab_t     *slabp;
1144         struct page     *page;
1145         void            *objp;
1146         size_t           offset;
1147         unsigned int     dma, local_flags;
1148         unsigned long    ctor_flags;
1149         unsigned long    save_flags;
1150
1151         /* Be lazy and only check for valid flags here,
1152          * keeping it out of the critical path in kmem_cache_alloc().
1153          */
1154         if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) {
1155                 printk(KERN_WARNING "kmem_grow: Illegal flgs %X (correcting) - %s\n",
1156                        flags, cachep->c_name);
1157                 flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW);
1158         }
1159
1160         if (flags & SLAB_NO_GROW)
1161                 return 0;
1162
1163         /* The test for missing atomic flag is performed here, rather than
1164          * the more obvious place, simply to reduce the critical path length
1165          * in kmem_cache_alloc().  If a caller is slightly mis-behaving they
1166          * will eventually be caught here (where it matters).
1167          */
1168         if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC) {
1169                 printk(KERN_ERR "kmem_grow: Called nonatomically from int - %s\n",
1170                        cachep->c_name);
1171                 flags &= ~SLAB_LEVEL_MASK;
1172                 flags |= SLAB_ATOMIC;
1173         }
1174         ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1175         local_flags = (flags & SLAB_LEVEL_MASK);
1176         if (local_flags == SLAB_ATOMIC) {
1177                 /* Not allowed to sleep.  Need to tell a constructor about
1178                  * this - it might need to know...
1179                  */
1180                 ctor_flags |= SLAB_CTOR_ATOMIC;
1181         }
1182
1183         /* About to mess with non-constant members - lock. */
1184         spin_lock_irqsave(&cachep->c_spinlock, save_flags);
1185
1186         /* Get colour for the slab, and cal the next value. */
1187         if (!(offset = cachep->c_colour_next--))
1188                 cachep->c_colour_next = cachep->c_colour;
1189         offset *= cachep->c_align;
1190         cachep->c_dflags = SLAB_CFLGS_GROWN;
1191
1192         cachep->c_growing++;
1193 re_try:
1194         spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1195
1196         /* A series of memory allocations for a new slab.
1197          * Neither the cache-chain semaphore, or cache-lock, are
1198          * held, but the incrementing c_growing prevents this
1199          * this cache from being reaped or shrunk.
1200          * Note: The cache could be selected in for reaping in
1201          * kmem_cache_reap(), but when the final test is made the
1202          * growing value will be seen.
1203          */
1204
1205         /* Get mem for the objs. */
1206         if (!(objp = kmem_getpages(cachep, flags, &dma)))
1207                 goto failed;
1208
1209         /* Get slab management. */
1210         if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags)))
1211                 goto opps1;
1212         if (dma)
1213                 slabp->s_dma = 1;
1214         if (SLAB_BUFCTL(cachep->c_flags)) {
1215                 slabp->s_index = kmem_cache_alloc(cachep->c_index_cachep, local_flags);
1216                 if (!slabp->s_index)
1217                         goto opps2;
1218         }
1219
1220         /* Nasty!!!!!!  I hope this is OK. */
1221         dma = 1 << cachep->c_gfporder;
1222         page = &mem_map[MAP_NR(objp)];
1223         do {
1224                 SLAB_SET_PAGE_CACHE(page, cachep);
1225                 SLAB_SET_PAGE_SLAB(page, slabp);
1226                 PageSetSlab(page);
1227                 page++;
1228         } while (--dma);
1229
1230         slabp->s_offset = offset;       /* It will fit... */
1231         objp += offset;         /* Address of first object. */
1232         slabp->s_mem = objp;
1233
1234         /* For on-slab bufctls, c_offset is the distance between the start of
1235          * an obj and its related bufctl.  For off-slab bufctls, c_offset is
1236          * the distance between objs in the slab.
1237          */
1238         kmem_cache_init_objs(cachep, slabp, objp, ctor_flags);
1239
1240         spin_lock_irq(&cachep->c_spinlock);
1241
1242         /* Make slab active. */
1243         slabp->s_magic = SLAB_MAGIC_ALLOC;
1244         kmem_slab_link_end(cachep, slabp);
1245         if (cachep->c_freep == kmem_slab_end(cachep))
1246                 cachep->c_freep = slabp;
1247         SLAB_STATS_INC_GROWN(cachep);
1248         cachep->c_failures = 0;
1249         cachep->c_growing--;
1250
1251         spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1252         return 1;
1253 opps2:
1254         if (SLAB_OFF_SLAB(cachep->c_flags))
1255                 kmem_cache_free(cache_slabp, slabp);
1256 opps1:
1257         kmem_freepages(cachep, objp);
1258 failed:
1259         spin_lock_irq(&cachep->c_spinlock);
1260         if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) {
1261                 /* For large order (>0) slabs, we try again.
1262                  * Needed because the gfp() functions are not good at giving
1263                  * out contiguous pages unless pushed (but do not push too hard).
1264                  */
1265                 if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep))
1266                         goto re_try;
1267                 cachep->c_failures = 1; /* Memory is low, don't try as hard next time. */
1268         }
1269         cachep->c_growing--;
1270         spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1271         return 0;
1272 }
1273
1274 static void
1275 kmem_report_alloc_err(const char *str, kmem_cache_t * cachep)
1276 {
1277         if (cachep)
1278                 SLAB_STATS_INC_ERR(cachep);     /* this is atomic */
1279         printk(KERN_ERR "kmem_alloc: %s (name=%s)\n",
1280                str, cachep ? cachep->c_name : "unknown");
1281 }
1282
1283 static void
1284 kmem_report_free_err(const char *str, const void *objp, kmem_cache_t * cachep)
1285 {
1286         if (cachep)
1287                 SLAB_STATS_INC_ERR(cachep);
1288         printk(KERN_ERR "kmem_free: %s (objp=%p, name=%s)\n",
1289                str, objp, cachep ? cachep->c_name : "unknown");
1290 }
1291
1292 /* Search for a slab whose objs are suitable for DMA.
1293  * Note: since testing the first free slab (in __kmem_cache_alloc()),
1294  * ints must not have been enabled, or the cache-lock released!
1295  */
1296 static inline kmem_slab_t *
1297 kmem_cache_search_dma(kmem_cache_t * cachep)
1298 {
1299         kmem_slab_t     *slabp = cachep->c_freep->s_nextp;
1300
1301         for (; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
1302                 if (!(slabp->s_dma))
1303                         continue;
1304                 kmem_slab_unlink(slabp);
1305                 kmem_slab_link_free(cachep, slabp);
1306                 cachep->c_freep = slabp;
1307                 break;
1308         }
1309         return slabp;
1310 }
1311
1312 #if     SLAB_DEBUG_SUPPORT
1313 /* Perform extra freeing checks.  Currently, this check is only for caches
1314  * that use bufctl structures within the slab.  Those which use bufctl's
1315  * from the internal cache have a reasonable check when the address is
1316  * searched for.  Called with the cache-lock held.
1317  */
1318 static void *
1319 kmem_extra_free_checks(kmem_cache_t * cachep, kmem_bufctl_t *search_bufp,
1320                        kmem_bufctl_t *bufp, void * objp)
1321 {
1322         if (SLAB_BUFCTL(cachep->c_flags))
1323                 return objp;
1324
1325         /* Check slab's freelist to see if this obj is there. */
1326         for (; search_bufp; search_bufp = search_bufp->buf_nextp) {
1327                 if (search_bufp != bufp)
1328                         continue;
1329                 return NULL;
1330         }
1331         return objp;
1332 }
1333 #endif  /* SLAB_DEBUG_SUPPORT */
1334
1335 /* Called with cache lock held. */
1336 static inline void
1337 kmem_cache_full_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
1338 {
1339         if (slabp->s_nextp->s_inuse) {
1340                 /* Not at correct position. */
1341                 if (cachep->c_freep == slabp)
1342                         cachep->c_freep = slabp->s_nextp;
1343                 kmem_slab_unlink(slabp);
1344                 kmem_slab_link_end(cachep, slabp);
1345         }
1346 }
1347
1348 /* Called with cache lock held. */
1349 static inline void
1350 kmem_cache_one_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
1351 {
1352         if (slabp->s_nextp->s_inuse == cachep->c_num) {
1353                 kmem_slab_unlink(slabp);
1354                 kmem_slab_link_free(cachep, slabp);
1355         }
1356         cachep->c_freep = slabp;
1357 }
1358
1359 /* Returns a ptr to an obj in the given cache. */
1360 static inline void *
1361 __kmem_cache_alloc(kmem_cache_t *cachep, int flags)
1362 {
1363         kmem_slab_t     *slabp;
1364         kmem_bufctl_t   *bufp;
1365         void            *objp;
1366         unsigned long   save_flags;
1367
1368         /* Sanity check. */
1369         if (!cachep)
1370                 goto nul_ptr;
1371         spin_lock_irqsave(&cachep->c_spinlock, save_flags);
1372 try_again:
1373         /* Get slab alloc is to come from. */
1374         slabp = cachep->c_freep;
1375
1376         /* Magic is a sanity check _and_ says if we need a new slab. */
1377         if (slabp->s_magic != SLAB_MAGIC_ALLOC)
1378                 goto alloc_new_slab;
1379         /* DMA requests are 'rare' - keep out of the critical path. */
1380         if (flags & SLAB_DMA)
1381                 goto search_dma;
1382 try_again_dma:
1383         SLAB_STATS_INC_ALLOCED(cachep);
1384         SLAB_STATS_INC_ACTIVE(cachep);
1385         SLAB_STATS_SET_HIGH(cachep);
1386         slabp->s_inuse++;
1387         bufp = slabp->s_freep;
1388         slabp->s_freep = bufp->buf_nextp;
1389         if (slabp->s_freep) {
1390 ret_obj:
1391                 if (!slabp->s_index) {
1392                         bufp->buf_slabp = slabp;
1393                         objp = ((void*)bufp) - cachep->c_offset;
1394 finished:
1395                         /* The lock is not needed by the red-zone or poison ops, and the
1396                          * obj has been removed from the slab.  Should be safe to drop
1397                          * the lock here.
1398                          */
1399                         spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1400 #if     SLAB_DEBUG_SUPPORT
1401                         if (cachep->c_flags & SLAB_RED_ZONE)
1402                                 goto red_zone;
1403 ret_red:
1404                         if ((cachep->c_flags & SLAB_POISON) && kmem_check_poison_obj(cachep, objp))
1405                                 kmem_report_alloc_err("Bad poison", cachep);
1406 #endif  /* SLAB_DEBUG_SUPPORT */
1407                         return objp;
1408                 }
1409                 /* Update index ptr. */
1410                 objp = ((bufp-slabp->s_index)*cachep->c_offset) + slabp->s_mem;
1411                 bufp->buf_objp = objp;
1412                 goto finished;
1413         }
1414         cachep->c_freep = slabp->s_nextp;
1415         goto ret_obj;
1416
1417 #if     SLAB_DEBUG_SUPPORT
1418 red_zone:
1419         /* Set alloc red-zone, and check old one. */
1420         if (xchg((unsigned long *)objp, SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
1421                 kmem_report_alloc_err("Bad front redzone", cachep);
1422         objp += BYTES_PER_WORD;
1423         if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
1424                 kmem_report_alloc_err("Bad rear redzone", cachep);
1425         goto ret_red;
1426 #endif  /* SLAB_DEBUG_SUPPORT */
1427
1428 search_dma:
1429         if (slabp->s_dma || (slabp = kmem_cache_search_dma(cachep))!=kmem_slab_end(cachep))
1430                 goto try_again_dma;
1431 alloc_new_slab:
1432         /* Either out of slabs, or magic number corruption. */
1433         if (slabp == kmem_slab_end(cachep)) {
1434                 /* Need a new slab.  Release the lock before calling kmem_cache_grow().
1435                  * This allows objs to be released back into the cache while growing.
1436                  */
1437                 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1438                 if (kmem_cache_grow(cachep, flags)) {
1439                         /* Someone may have stolen our objs.  Doesn't matter, we'll
1440                          * just come back here again.
1441                          */
1442                         spin_lock_irq(&cachep->c_spinlock);
1443                         goto try_again;
1444                 }
1445                 /* Couldn't grow, but some objs may have been freed. */
1446                 spin_lock_irq(&cachep->c_spinlock);
1447                 if (cachep->c_freep != kmem_slab_end(cachep))
1448                         goto try_again;
1449         } else {
1450                 /* Very serious error - maybe panic() here? */
1451                 kmem_report_alloc_err("Bad slab magic (corrupt)", cachep);
1452         }
1453         spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1454 err_exit:
1455         return NULL;
1456 nul_ptr:
1457         kmem_report_alloc_err("NULL ptr", NULL);
1458         goto err_exit;
1459 }
1460
1461 /* Release an obj back to its cache.  If the obj has a constructed state,
1462  * it should be in this state _before_ it is released.
1463  */
1464 static inline void
1465 __kmem_cache_free(kmem_cache_t *cachep, const void *objp)
1466 {
1467         kmem_slab_t     *slabp;
1468         kmem_bufctl_t   *bufp;
1469         unsigned long   save_flags;
1470
1471         /* Basic sanity checks. */
1472         if (!cachep || !objp)
1473                 goto null_addr;
1474
1475 #if     SLAB_DEBUG_SUPPORT
1476         /* A verify func is called without the cache-lock held. */
1477         if (cachep->c_flags & SLAB_DEBUG_INITIAL)
1478                 goto init_state_check;
1479 finished_initial:
1480
1481         if (cachep->c_flags & SLAB_RED_ZONE)
1482                 goto red_zone;
1483 return_red:
1484 #endif  /* SLAB_DEBUG_SUPPORT */
1485
1486         spin_lock_irqsave(&cachep->c_spinlock, save_flags);
1487
1488         if (SLAB_BUFCTL(cachep->c_flags))
1489                 goto bufctl;
1490         bufp = (kmem_bufctl_t *)(objp+cachep->c_offset);
1491
1492         /* Get slab for the object. */
1493 #if     0
1494         /* _NASTY_IF/ELSE_, but avoids a 'distant' memory ref for some objects.
1495          * Is this worth while? XXX
1496          */
1497         if (cachep->c_flags & SLAB_HIGH_PACK)
1498                 slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(bufp)]);
1499         else
1500 #endif
1501                 slabp = bufp->buf_slabp;
1502
1503 check_magic:
1504         if (slabp->s_magic != SLAB_MAGIC_ALLOC)         /* Sanity check. */
1505                 goto bad_slab;
1506
1507 #if     SLAB_DEBUG_SUPPORT
1508         if (cachep->c_flags & SLAB_DEBUG_FREE)
1509                 goto extra_checks;
1510 passed_extra:
1511 #endif  /* SLAB_DEBUG_SUPPORT */
1512
1513         if (slabp->s_inuse) {           /* Sanity check. */
1514                 SLAB_STATS_DEC_ACTIVE(cachep);
1515                 slabp->s_inuse--;
1516                 bufp->buf_nextp = slabp->s_freep;
1517                 slabp->s_freep = bufp;
1518                 if (bufp->buf_nextp) {
1519                         if (slabp->s_inuse) {
1520                                 /* (hopefully) The most common case. */
1521 finished:
1522 #if     SLAB_DEBUG_SUPPORT
1523                                 if (cachep->c_flags & SLAB_POISON) {
1524                                         if (cachep->c_flags & SLAB_RED_ZONE)
1525                                                 objp += BYTES_PER_WORD;
1526                                         kmem_poison_obj(cachep, objp);
1527                                 }
1528 #endif  /* SLAB_DEBUG_SUPPORT */
1529                                 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1530                                 return;
1531                         }
1532                         kmem_cache_full_free(cachep, slabp);
1533                         goto finished;
1534                 }
1535                 kmem_cache_one_free(cachep, slabp);
1536                 goto finished;
1537         }
1538
1539         /* Don't add to freelist. */
1540         spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1541         kmem_report_free_err("free with no active objs", objp, cachep);
1542         return;
1543 bufctl:
1544         /* No 'extra' checks are performed for objs stored this way, finding
1545          * the obj is check enough.
1546          */
1547         slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(objp)]);
1548         bufp =  &slabp->s_index[(objp - slabp->s_mem)/cachep->c_offset];
1549         if (bufp->buf_objp == objp)
1550                 goto check_magic;
1551         spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1552         kmem_report_free_err("Either bad obj addr or double free", objp, cachep);
1553         return;
1554 #if     SLAB_DEBUG_SUPPORT
1555 init_state_check:
1556         /* Need to call the slab's constructor so the
1557          * caller can perform a verify of its state (debugging).
1558          */
1559         cachep->c_ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
1560         goto finished_initial;
1561 extra_checks:
1562         if (!kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp)) {
1563                 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1564                 kmem_report_free_err("Double free detected during checks", objp, cachep);
1565                 return;
1566         }
1567         goto passed_extra;
1568 red_zone:
1569         /* We do not hold the cache-lock while checking the red-zone.
1570          */
1571         objp -= BYTES_PER_WORD;
1572         if (xchg((unsigned long *)objp, SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
1573                 /* Either write before start of obj, or a double free. */
1574                 kmem_report_free_err("Bad front redzone", objp, cachep);
1575         }
1576         if (xchg((unsigned long *)(objp+cachep->c_org_size+BYTES_PER_WORD), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
1577                 /* Either write past end of obj, or a double free. */
1578                 kmem_report_free_err("Bad rear redzone", objp, cachep);
1579         }
1580         goto return_red;
1581 #endif  /* SLAB_DEBUG_SUPPORT */
1582
1583 bad_slab:
1584         /* Slab doesn't contain the correct magic num. */
1585         if (slabp->s_magic == SLAB_MAGIC_DESTROYED) {
1586                 /* Magic num says this is a destroyed slab. */
1587                 kmem_report_free_err("free from inactive slab", objp, cachep);
1588         } else
1589                 kmem_report_free_err("Bad obj addr", objp, cachep);
1590         spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1591
1592 #if 1
1593 /* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
1594 *(int *) 0 = 0;
1595 #endif
1596
1597         return;
1598 null_addr:
1599         kmem_report_free_err("NULL ptr", objp, cachep);
1600         return;
1601 }
1602
1603 void *
1604 kmem_cache_alloc(kmem_cache_t *cachep, int flags)
1605 {
1606         return __kmem_cache_alloc(cachep, flags);
1607 }
1608
1609 void
1610 kmem_cache_free(kmem_cache_t *cachep, void *objp)
1611 {
1612         __kmem_cache_free(cachep, objp);
1613 }
1614
1615 void *
1616 kmalloc(size_t size, int flags)
1617 {
1618         cache_sizes_t   *csizep = cache_sizes;
1619
1620         for (; csizep->cs_size; csizep++) {
1621                 if (size > csizep->cs_size)
1622                         continue;
1623                 return __kmem_cache_alloc(csizep->cs_cachep, flags);
1624         }
1625         printk(KERN_ERR "kmalloc: Size (%lu) too large\n", (unsigned long) size);
1626         return NULL;
1627 }
1628
1629 void
1630 kfree(const void *objp)
1631 {
1632         struct page *page;
1633         int     nr;
1634
1635         if (!objp)
1636                 goto null_ptr;
1637         nr = MAP_NR(objp);
1638         if (nr >= max_mapnr)
1639                 goto bad_ptr;
1640
1641         /* Assume we own the page structure - hence no locking.
1642          * If someone is misbehaving (for example, calling us with a bad
1643          * address), then access to the page structure can race with the
1644          * kmem_slab_destroy() code.  Need to add a spin_lock to each page
1645          * structure, which would be useful in threading the gfp() functions....
1646          */
1647         page = &mem_map[nr];
1648         if (PageSlab(page)) {
1649                 kmem_cache_t    *cachep;
1650
1651                 /* Here, we again assume the obj address is good.
1652                  * If it isn't, and happens to map onto another
1653                  * general cache page which has no active objs, then
1654                  * we race.
1655                  */
1656                 cachep = SLAB_GET_PAGE_CACHE(page);
1657                 if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
1658                         __kmem_cache_free(cachep, objp);
1659                         return;
1660                 }
1661         }
1662 bad_ptr:
1663         printk(KERN_ERR "kfree: Bad obj %p\n", objp);
1664
1665 #if 1
1666 /* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
1667 *(int *) 0 = 0;
1668 #endif
1669
1670 null_ptr:
1671         return;
1672 }
1673
1674 void
1675 kfree_s(const void *objp, size_t size)
1676 {
1677         struct page *page;
1678         int     nr;
1679
1680         if (!objp)
1681                 goto null_ptr;
1682         nr = MAP_NR(objp);
1683         if (nr >= max_mapnr)
1684                 goto null_ptr;
1685         /* See comment in kfree() */
1686         page = &mem_map[nr];
1687         if (PageSlab(page)) {
1688                 kmem_cache_t    *cachep;
1689                 /* See comment in kfree() */
1690                 cachep = SLAB_GET_PAGE_CACHE(page);
1691                 if (cachep && cachep->c_flags & SLAB_CFLGS_GENERAL) {
1692                         if (size <= cachep->c_org_size) {       /* XXX better check */
1693                                 __kmem_cache_free(cachep, objp);
1694                                 return;
1695                         }
1696                 }
1697         }
1698 null_ptr:
1699         printk(KERN_ERR "kfree_s: Bad obj %p\n", objp);
1700         return;
1701 }
1702
1703 kmem_cache_t *
1704 kmem_find_general_cachep(size_t size)
1705 {
1706         cache_sizes_t   *csizep = cache_sizes;
1707
1708         /* This function could be moved to the header file, and
1709          * made inline so consumers can quickly determine what
1710          * cache pointer they require.
1711          */
1712         for (; csizep->cs_size; csizep++) {
1713                 if (size > csizep->cs_size)
1714                         continue;
1715                 break;
1716         }
1717         return csizep->cs_cachep;
1718 }
1719
1720
1721 /* Called from try_to_free_page().
1722  * This function _cannot_ be called within a int, but it
1723  * can be interrupted.
1724  */
1725 void
1726 kmem_cache_reap(int gfp_mask)
1727 {
1728         kmem_slab_t     *slabp;
1729         kmem_cache_t    *searchp;
1730         kmem_cache_t    *best_cachep;
1731         unsigned int     scan;
1732         unsigned int     reap_level;
1733
1734         if (in_interrupt()) {
1735                 printk("kmem_cache_reap() called within int!\n");
1736                 return;
1737         }
1738
1739         /* We really need a test semaphore op so we can avoid sleeping when
1740          * !wait is true.
1741          */
1742         down(&cache_chain_sem);
1743
1744         scan = 10;
1745         reap_level = 0;
1746
1747         best_cachep = NULL;
1748         searchp = clock_searchp;
1749         do {
1750                 unsigned int    full_free;
1751                 unsigned int    dma_flag;
1752
1753                 /* It's safe to test this without holding the cache-lock. */
1754                 if (searchp->c_flags & SLAB_NO_REAP)
1755                         goto next;
1756                 spin_lock_irq(&searchp->c_spinlock);
1757                 if (searchp->c_growing)
1758                         goto next_unlock;
1759                 if (searchp->c_dflags & SLAB_CFLGS_GROWN) {
1760                         searchp->c_dflags &= ~SLAB_CFLGS_GROWN;
1761                         goto next_unlock;
1762                 }
1763                 /* Sanity check for corruption of static values. */
1764                 if (searchp->c_inuse || searchp->c_magic != SLAB_C_MAGIC) {
1765                         spin_unlock_irq(&searchp->c_spinlock);
1766                         printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", searchp->c_name);
1767                         goto next;
1768                 }
1769                 dma_flag = 0;
1770                 full_free = 0;
1771
1772                 /* Count the fully free slabs.  There should not be not many,
1773                  * since we are holding the cache lock.
1774                  */
1775                 slabp = searchp->c_lastp;
1776                 while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) {
1777                         slabp = slabp->s_prevp;
1778                         full_free++;
1779                         if (slabp->s_dma)
1780                                 dma_flag++;
1781                 }
1782                 spin_unlock_irq(&searchp->c_spinlock);
1783
1784                 if ((gfp_mask & GFP_DMA) && !dma_flag)
1785                         goto next;
1786
1787                 if (full_free) {
1788                         if (full_free >= 10) {
1789                                 best_cachep = searchp;
1790                                 break;
1791                         }
1792
1793                         /* Try to avoid slabs with constructors and/or
1794                          * more than one page per slab (as it can be difficult
1795                          * to get high orders from gfp()).
1796                          */
1797                         if (full_free >= reap_level) {
1798                                 reap_level = full_free;
1799                                 best_cachep = searchp;
1800                         }
1801                 }
1802                 goto next;
1803 next_unlock:
1804                 spin_unlock_irq(&searchp->c_spinlock);
1805 next:
1806                 searchp = searchp->c_nextp;
1807         } while (--scan && searchp != clock_searchp);
1808
1809         clock_searchp = searchp;
1810         up(&cache_chain_sem);
1811
1812         if (!best_cachep) {
1813                 /* couldn't find anything to reap */
1814                 return;
1815         }
1816
1817         spin_lock_irq(&best_cachep->c_spinlock);
1818         while (!best_cachep->c_growing &&
1819                !(slabp = best_cachep->c_lastp)->s_inuse &&
1820                slabp != kmem_slab_end(best_cachep)) {
1821                 if (gfp_mask & GFP_DMA) {
1822                         do {
1823                                 if (slabp->s_dma)
1824                                         goto good_dma;
1825                                 slabp = slabp->s_prevp;
1826                         } while (!slabp->s_inuse && slabp != kmem_slab_end(best_cachep));
1827
1828                         /* Didn't found a DMA slab (there was a free one -
1829                          * must have been become active).
1830                          */
1831                         goto dma_fail;
1832 good_dma:
1833                 }
1834                 if (slabp == best_cachep->c_freep)
1835                         best_cachep->c_freep = slabp->s_nextp;
1836                 kmem_slab_unlink(slabp);
1837                 SLAB_STATS_INC_REAPED(best_cachep);
1838
1839                 /* Safe to drop the lock.  The slab is no longer linked to the
1840                  * cache.
1841                  */
1842                 spin_unlock_irq(&best_cachep->c_spinlock);
1843                 kmem_slab_destroy(best_cachep, slabp);
1844                 spin_lock_irq(&best_cachep->c_spinlock);
1845         }
1846 dma_fail:
1847         spin_unlock_irq(&best_cachep->c_spinlock);
1848         return;
1849 }
1850
1851 #if     SLAB_SELFTEST
1852 /* A few v. simple tests */
1853 static void
1854 kmem_self_test(void)
1855 {
1856         kmem_cache_t    *test_cachep;
1857
1858         printk(KERN_INFO "kmem_test() - start\n");
1859         test_cachep = kmem_cache_create("test-cachep", 16, 0, SLAB_RED_ZONE|SLAB_POISON, NULL, NULL);
1860         if (test_cachep) {
1861                 char *objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
1862                 if (objp) {
1863                         /* Write in front and past end, red-zone test. */
1864                         *(objp-1) = 1;
1865                         *(objp+16) = 1;
1866                         kmem_cache_free(test_cachep, objp);
1867
1868                         /* Mess up poisoning. */
1869                         *objp = 10;
1870                         objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
1871                         kmem_cache_free(test_cachep, objp);
1872
1873                         /* Mess up poisoning (again). */
1874                         *objp = 10;
1875                         kmem_cache_shrink(test_cachep);
1876                 }
1877         }
1878         printk(KERN_INFO "kmem_test() - finished\n");
1879 }
1880 #endif  /* SLAB_SELFTEST */
1881
1882 #if     defined(CONFIG_PROC_FS)
1883 /* /proc/slabinfo
1884  * cache-name num-active-objs total-objs num-active-slabs total-slabs num-pages-per-slab
1885  */
1886 int
1887 get_slabinfo(char *buf)
1888 {
1889         kmem_cache_t    *cachep;
1890         kmem_slab_t     *slabp;
1891         unsigned long   active_objs;
1892         unsigned long   save_flags;
1893         unsigned long   num_slabs;
1894         unsigned long   num_objs;
1895         int             len=0;
1896 #if     SLAB_STATS
1897         unsigned long   active_slabs;
1898 #endif  /* SLAB_STATS */
1899
1900         __save_flags(save_flags);
1901
1902         /* Output format version, so at least we can change it without _too_
1903          * many complaints.
1904          */
1905 #if     SLAB_STATS
1906         len = sprintf(buf, "slabinfo - version: 1.0 (statistics)\n");
1907 #else
1908         len = sprintf(buf, "slabinfo - version: 1.0\n");
1909 #endif  /* SLAB_STATS */
1910         down(&cache_chain_sem);
1911         cachep = &cache_cache;
1912         do {
1913 #if     SLAB_STATS
1914                 active_slabs = 0;
1915 #endif  /* SLAB_STATS */
1916                 num_slabs = active_objs = 0;
1917                 spin_lock_irq(&cachep->c_spinlock);
1918                 for (slabp = cachep->c_firstp; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
1919                         active_objs += slabp->s_inuse;
1920                         num_slabs++;
1921 #if     SLAB_STATS
1922                         if (slabp->s_inuse)
1923                                 active_slabs++;
1924 #endif  /* SLAB_STATS */
1925                 }
1926                 num_objs = cachep->c_num*num_slabs;
1927 #if     SLAB_STATS
1928                 {
1929                 unsigned long errors;
1930                 unsigned long high = cachep->c_high_mark;
1931                 unsigned long grown = cachep->c_grown;
1932                 unsigned long reaped = cachep->c_reaped;
1933                 unsigned long allocs = cachep->c_num_allocations;
1934                 errors = (unsigned long) atomic_read(&cachep->c_errors);
1935                 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1936                 len += sprintf(buf+len, "%-16s %6lu %6lu %4lu %4lu %4lu %6lu %7lu %5lu %4lu %4lu\n",
1937                                 cachep->c_name, active_objs, num_objs, active_slabs, num_slabs,
1938                                 (1<<cachep->c_gfporder)*num_slabs,
1939                                 high, allocs, grown, reaped, errors);
1940                 }
1941 #else
1942                 spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
1943                 len += sprintf(buf+len, "%-17s %6lu %6lu\n", cachep->c_name, active_objs, num_objs);
1944 #endif  /* SLAB_STATS */
1945         } while ((cachep = cachep->c_nextp) != &cache_cache);
1946         up(&cache_chain_sem);
1947
1948         return len;
1949 }
1950 #endif  /* CONFIG_PROC_FS */