mm/slqb.c

   1 /*
   2  * SLQB: A slab allocator that focuses on per-CPU scaling, and good performance
   3  * with order-0 allocations. Fastpaths emphasis is placed on local allocaiton
   4  * and freeing, but with a secondary goal of good remote freeing (freeing on
   5  * another CPU from that which allocated).
   6  *
   7  * Using ideas and code from mm/slab.c, mm/slob.c, and mm/slub.c.
   8  */
   9
  10 #include <linux/mm.h>
  11 #include <linux/swap.h> /* struct reclaim_state */
  12 #include <linux/module.h>
  13 #include <linux/interrupt.h>
  14 #include <linux/slab.h>
  15 #include <linux/seq_file.h>
  16 #include <linux/cpu.h>
  17 #include <linux/cpuset.h>
  18 #include <linux/mempolicy.h>
  19 #include <linux/ctype.h>
  20 #include <linux/kallsyms.h>
  21 #include <linux/memory.h>
  22 #include <linux/fault-inject.h>
  23
  24 /*
  25  * TODO
  26  * - fix up releasing of offlined data structures. Not a big deal because
  27  *   they don't get cumulatively leaked with successive online/offline cycles
  28  * - allow OOM conditions to flush back per-CPU pages to common lists to be
  29  *   reused by other CPUs.
  30  * - investiage performance with memoryless nodes. Perhaps CPUs can be given
  31  *   a default closest home node via which it can use fastpath functions.
  32  *   Perhaps it is not a big problem.
  33  */
  34
  35 /*
  36  * slqb_page overloads struct page, and is used to manage some slob allocation
  37  * aspects, however to avoid the horrible mess in include/linux/mm_types.h,
  38  * we'll just define our own struct slqb_page type variant here.
  39  */
  40 struct slqb_page {
  41         union {
  42                 struct {
  43                         unsigned long   flags;          /* mandatory */
  44                         atomic_t        _count;         /* mandatory */
  45                         unsigned int    inuse;          /* Nr of objects */
  46                         struct kmem_cache_list *list;   /* Pointer to list */
  47                         void             **freelist;    /* LIFO freelist */
  48                         union {
  49                                 struct list_head lru;   /* misc. list */
  50                                 struct rcu_head rcu_head; /* for rcu freeing */
  51                         };
  52                 };
  53                 struct page page;
  54         };
  55 };
  56 static inline void struct_slqb_page_wrong_size(void)
  57 { BUILD_BUG_ON(sizeof(struct slqb_page) != sizeof(struct page)); }
  58
  59 #define PG_SLQB_BIT (1 << PG_slab)
  60
  61 /*
  62  * slqb_min_order: minimum allocation order for slabs
  63  */
  64 static int slqb_min_order;
  65
  66 /*
  67  * slqb_min_objects: minimum number of objects per slab. Increasing this
  68  * will increase the allocation order for slabs with larger objects
  69  */
  70 static int slqb_min_objects = 1;
  71
  72 #ifdef CONFIG_NUMA
  73 static inline int slab_numa(struct kmem_cache *s)
  74 {
  75         return s->flags & SLAB_NUMA;
  76 }
  77 #else
  78 static inline int slab_numa(struct kmem_cache *s)
  79 {
  80         return 0;
  81 }
  82 #endif
  83
  84 static inline int slab_hiwater(struct kmem_cache *s)
  85 {
  86         return s->hiwater;
  87 }
  88
  89 static inline int slab_freebatch(struct kmem_cache *s)
  90 {
  91         return s->freebatch;
  92 }
  93
  94 /*
  95  * Lock order:
  96  * kmem_cache_node->list_lock
  97  *   kmem_cache_remote_free->lock
  98  *
  99  * Data structures:
 100  * SLQB is primarily per-cpu. For each kmem_cache, each CPU has:
 101  *
 102  * - A LIFO list of node-local objects. Allocation and freeing of node local
 103  *   objects goes first to this list.
 104  *
 105  * - 2 Lists of slab pages, free and partial pages. If an allocation misses
 106  *   the object list, it tries from the partial list, then the free list.
 107  *   After freeing an object to the object list, if it is over a watermark,
 108  *   some objects are freed back to pages. If an allocation misses these lists,
 109  *   a new slab page is allocated from the page allocator. If the free list
 110  *   reaches a watermark, some of its pages are returned to the page allocator.
 111  *
 112  * - A remote free queue, where objects freed that did not come from the local
 113  *   node are queued to. When this reaches a watermark, the objects are
 114  *   flushed.
 115  *
 116  * - A remotely freed queue, where objects allocated from this CPU are flushed
 117  *   to from other CPUs' remote free queues. kmem_cache_remote_free->lock is
 118  *   used to protect access to this queue.
 119  *
 120  *   When the remotely freed queue reaches a watermark, a flag is set to tell
 121  *   the owner CPU to check it. The owner CPU will then check the queue on the
 122  *   next allocation that misses the object list. It will move all objects from
 123  *   this list onto the object list and then allocate one.
 124  *
 125  *   This system of remote queueing is intended to reduce lock and remote
 126  *   cacheline acquisitions, and give a cooling off period for remotely freed
 127  *   objects before they are re-allocated.
 128  *
 129  * node specific allocations from somewhere other than the local node are
 130  * handled by a per-node list which is the same as the above per-CPU data
 131  * structures except for the following differences:
 132  *
 133  * - kmem_cache_node->list_lock is used to protect access for multiple CPUs to
 134  *   allocate from a given node.
 135  *
 136  * - There is no remote free queue. Nodes don't free objects, CPUs do.
 137  */
 138
 139 static inline void slqb_stat_inc(struct kmem_cache_list *list,
 140                                 enum stat_item si)
 141 {
 142 #ifdef CONFIG_SLQB_STATS
 143         list->stats[si]++;
 144 #endif
 145 }
 146
 147 static inline void slqb_stat_add(struct kmem_cache_list *list,
 148                                 enum stat_item si, unsigned long nr)
 149 {
 150 #ifdef CONFIG_SLQB_STATS
 151         list->stats[si] += nr;
 152 #endif
 153 }
 154
 155 static inline int slqb_page_to_nid(struct slqb_page *page)
 156 {
 157         return page_to_nid(&page->page);
 158 }
 159
 160 static inline void *slqb_page_address(struct slqb_page *page)
 161 {
 162         return page_address(&page->page);
 163 }
 164
 165 static inline struct zone *slqb_page_zone(struct slqb_page *page)
 166 {
 167         return page_zone(&page->page);
 168 }
 169
 170 static inline int virt_to_nid(const void *addr)
 171 {
 172         return page_to_nid(virt_to_page(addr));
 173 }
 174
 175 static inline struct slqb_page *virt_to_head_slqb_page(const void *addr)
 176 {
 177         struct page *p;
 178
 179         p = virt_to_head_page(addr);
 180         return (struct slqb_page *)p;
 181 }
 182
 183 static inline void __free_slqb_pages(struct slqb_page *page, unsigned int order,
 184                                         int pages)
 185 {
 186         struct page *p = &page->page;
 187
 188         reset_page_mapcount(p);
 189         p->mapping = NULL;
 190         VM_BUG_ON(!(p->flags & PG_SLQB_BIT));
 191         p->flags &= ~PG_SLQB_BIT;
 192
 193         if (current->reclaim_state)
 194                 current->reclaim_state->reclaimed_slab += pages;
 195         __free_pages(p, order);
 196 }
 197
 198 #ifdef CONFIG_SLQB_DEBUG
 199 static inline int slab_debug(struct kmem_cache *s)
 200 {
 201         return s->flags &
 202                         (SLAB_DEBUG_FREE |
 203                          SLAB_RED_ZONE |
 204                          SLAB_POISON |
 205                          SLAB_STORE_USER |
 206                          SLAB_TRACE);
 207 }
 208 static inline int slab_poison(struct kmem_cache *s)
 209 {
 210         return s->flags & SLAB_POISON;
 211 }
 212 #else
 213 static inline int slab_debug(struct kmem_cache *s)
 214 {
 215         return 0;
 216 }
 217 static inline int slab_poison(struct kmem_cache *s)
 218 {
 219         return 0;
 220 }
 221 #endif
 222
 223 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 224                                 SLAB_POISON | SLAB_STORE_USER)
 225
 226 /* Internal SLQB flags */
 227 #define __OBJECT_POISON         0x80000000 /* Poison object */
 228
 229 /* Not all arches define cache_line_size */
 230 #ifndef cache_line_size
 231 #define cache_line_size()       L1_CACHE_BYTES
 232 #endif
 233
 234 #ifdef CONFIG_SMP
 235 static struct notifier_block slab_notifier;
 236 #endif
 237
 238 /*
 239  * slqb_lock protects slab_caches list and serialises hotplug operations.
 240  * hotplug operations take lock for write, other operations can hold off
 241  * hotplug by taking it for read (or write).
 242  */
 243 static DECLARE_RWSEM(slqb_lock);
 244
 245 /*
 246  * A list of all slab caches on the system
 247  */
 248 static LIST_HEAD(slab_caches);
 249
 250 /*
 251  * Tracking user of a slab.
 252  */
 253 struct track {
 254         unsigned long addr;     /* Called from address */
 255         int cpu;                /* Was running on cpu */
 256         int pid;                /* Pid context */
 257         unsigned long when;     /* When did the operation occur */
 258 };
 259
 260 enum track_item { TRACK_ALLOC, TRACK_FREE };
 261
 262 static struct kmem_cache kmem_cache_cache;
 263
 264 #ifdef CONFIG_SLQB_SYSFS
 265 static int sysfs_slab_add(struct kmem_cache *s);
 266 static void sysfs_slab_remove(struct kmem_cache *s);
 267 #else
 268 static inline int sysfs_slab_add(struct kmem_cache *s)
 269 {
 270         return 0;
 271 }
 272 static inline void sysfs_slab_remove(struct kmem_cache *s)
 273 {
 274         kmem_cache_free(&kmem_cache_cache, s);
 275 }
 276 #endif
 277
 278 /********************************************************************
 279  *                      Core slab cache functions
 280  *******************************************************************/
 281
 282 static int __slab_is_available __read_mostly;
 283 int slab_is_available(void)
 284 {
 285         return __slab_is_available;
 286 }
 287
 288 static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
 289 {
 290 #ifdef CONFIG_SMP
 291         VM_BUG_ON(!s->cpu_slab[cpu]);
 292         return s->cpu_slab[cpu];
 293 #else
 294         return &s->cpu_slab;
 295 #endif
 296 }
 297
 298 static inline int check_valid_pointer(struct kmem_cache *s,
 299                                 struct slqb_page *page, const void *object)
 300 {
 301         void *base;
 302
 303         base = slqb_page_address(page);
 304         if (object < base || object >= base + s->objects * s->size ||
 305                 (object - base) % s->size) {
 306                 return 0;
 307         }
 308
 309         return 1;
 310 }
 311
 312 static inline void *get_freepointer(struct kmem_cache *s, void *object)
 313 {
 314         return *(void **)(object + s->offset);
 315 }
 316
 317 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 318 {
 319         *(void **)(object + s->offset) = fp;
 320 }
 321
 322 /* Loop over all objects in a slab */
 323 #define for_each_object(__p, __s, __addr) \
 324         for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
 325                         __p += (__s)->size)
 326
 327 /* Scan freelist */
 328 #define for_each_free_object(__p, __s, __free) \
 329         for (__p = (__free); (__p) != NULL; __p = get_freepointer((__s),\
 330                 __p))
 331
 332 #ifdef CONFIG_SLQB_DEBUG
 333 /*
 334  * Debug settings:
 335  */
 336 #ifdef CONFIG_SLQB_DEBUG_ON
 337 static int slqb_debug __read_mostly = DEBUG_DEFAULT_FLAGS;
 338 #else
 339 static int slqb_debug __read_mostly;
 340 #endif
 341
 342 static char *slqb_debug_slabs;
 343
 344 /*
 345  * Object debugging
 346  */
 347 static void print_section(char *text, u8 *addr, unsigned int length)
 348 {
 349         int i, offset;
 350         int newline = 1;
 351         char ascii[17];
 352
 353         ascii[16] = 0;
 354
 355         for (i = 0; i < length; i++) {
 356                 if (newline) {
 357                         printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
 358                         newline = 0;
 359                 }
 360                 printk(KERN_CONT " %02x", addr[i]);
 361                 offset = i % 16;
 362                 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
 363                 if (offset == 15) {
 364                         printk(KERN_CONT " %s\n", ascii);
 365                         newline = 1;
 366                 }
 367         }
 368         if (!newline) {
 369                 i %= 16;
 370                 while (i < 16) {
 371                         printk(KERN_CONT "   ");
 372                         ascii[i] = ' ';
 373                         i++;
 374                 }
 375                 printk(KERN_CONT " %s\n", ascii);
 376         }
 377 }
 378
 379 static struct track *get_track(struct kmem_cache *s, void *object,
 380         enum track_item alloc)
 381 {
 382         struct track *p;
 383
 384         if (s->offset)
 385                 p = object + s->offset + sizeof(void *);
 386         else
 387                 p = object + s->inuse;
 388
 389         return p + alloc;
 390 }
 391
 392 static void set_track(struct kmem_cache *s, void *object,
 393                                 enum track_item alloc, unsigned long addr)
 394 {
 395         struct track *p;
 396
 397         if (s->offset)
 398                 p = object + s->offset + sizeof(void *);
 399         else
 400                 p = object + s->inuse;
 401
 402         p += alloc;
 403         if (addr) {
 404                 p->addr = addr;
 405                 p->cpu = raw_smp_processor_id();
 406                 p->pid = current ? current->pid : -1;
 407                 p->when = jiffies;
 408         } else
 409                 memset(p, 0, sizeof(struct track));
 410 }
 411
 412 static void init_tracking(struct kmem_cache *s, void *object)
 413 {
 414         if (!(s->flags & SLAB_STORE_USER))
 415                 return;
 416
 417         set_track(s, object, TRACK_FREE, 0UL);
 418         set_track(s, object, TRACK_ALLOC, 0UL);
 419 }
 420
 421 static void print_track(const char *s, struct track *t)
 422 {
 423         if (!t->addr)
 424                 return;
 425
 426         printk(KERN_ERR "INFO: %s in ", s);
 427         __print_symbol("%s", (unsigned long)t->addr);
 428         printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
 429 }
 430
 431 static void print_tracking(struct kmem_cache *s, void *object)
 432 {
 433         if (!(s->flags & SLAB_STORE_USER))
 434                 return;
 435
 436         print_track("Allocated", get_track(s, object, TRACK_ALLOC));
 437         print_track("Freed", get_track(s, object, TRACK_FREE));
 438 }
 439
 440 static void print_page_info(struct slqb_page *page)
 441 {
 442         printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
 443                 page, page->inuse, page->freelist, page->flags);
 444
 445 }
 446
 447 #define MAX_ERR_STR 100
 448 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 449 {
 450         va_list args;
 451         char buf[MAX_ERR_STR];
 452
 453         va_start(args, fmt);
 454         vsnprintf(buf, sizeof(buf), fmt, args);
 455         va_end(args);
 456         printk(KERN_ERR "========================================"
 457                         "=====================================\n");
 458         printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
 459         printk(KERN_ERR "----------------------------------------"
 460                         "-------------------------------------\n\n");
 461 }
 462
 463 static void slab_fix(struct kmem_cache *s, char *fmt, ...)
 464 {
 465         va_list args;
 466         char buf[100];
 467
 468         va_start(args, fmt);
 469         vsnprintf(buf, sizeof(buf), fmt, args);
 470         va_end(args);
 471         printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
 472 }
 473
 474 static void print_trailer(struct kmem_cache *s, struct slqb_page *page, u8 *p)
 475 {
 476         unsigned int off;       /* Offset of last byte */
 477         u8 *addr = slqb_page_address(page);
 478
 479         print_tracking(s, p);
 480
 481         print_page_info(page);
 482
 483         printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
 484                         p, p - addr, get_freepointer(s, p));
 485
 486         if (p > addr + 16)
 487                 print_section("Bytes b4", p - 16, 16);
 488
 489         print_section("Object", p, min(s->objsize, 128));
 490
 491         if (s->flags & SLAB_RED_ZONE)
 492                 print_section("Redzone", p + s->objsize, s->inuse - s->objsize);
 493
 494         if (s->offset)
 495                 off = s->offset + sizeof(void *);
 496         else
 497                 off = s->inuse;
 498
 499         if (s->flags & SLAB_STORE_USER)
 500                 off += 2 * sizeof(struct track);
 501
 502         if (off != s->size) {
 503                 /* Beginning of the filler is the free pointer */
 504                 print_section("Padding", p + off, s->size - off);
 505         }
 506
 507         dump_stack();
 508 }
 509
 510 static void object_err(struct kmem_cache *s, struct slqb_page *page,
 511                         u8 *object, char *reason)
 512 {
 513         slab_bug(s, reason);
 514         print_trailer(s, page, object);
 515 }
 516
 517 static void slab_err(struct kmem_cache *s, struct slqb_page *page,
 518                         char *fmt, ...)
 519 {
 520         slab_bug(s, fmt);
 521         print_page_info(page);
 522         dump_stack();
 523 }
 524
 525 static void init_object(struct kmem_cache *s, void *object, int active)
 526 {
 527         u8 *p = object;
 528
 529         if (s->flags & __OBJECT_POISON) {
 530                 memset(p, POISON_FREE, s->objsize - 1);
 531                 p[s->objsize - 1] = POISON_END;
 532         }
 533
 534         if (s->flags & SLAB_RED_ZONE) {
 535                 memset(p + s->objsize,
 536                         active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
 537                         s->inuse - s->objsize);
 538         }
 539 }
 540
 541 static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 542 {
 543         while (bytes) {
 544                 if (*start != (u8)value)
 545                         return start;
 546                 start++;
 547                 bytes--;
 548         }
 549         return NULL;
 550 }
 551
 552 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 553                                 void *from, void *to)
 554 {
 555         slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
 556         memset(from, data, to - from);
 557 }
 558
 559 static int check_bytes_and_report(struct kmem_cache *s, struct slqb_page *page,
 560                         u8 *object, char *what,
 561                         u8 *start, unsigned int value, unsigned int bytes)
 562 {
 563         u8 *fault;
 564         u8 *end;
 565
 566         fault = check_bytes(start, value, bytes);
 567         if (!fault)
 568                 return 1;
 569
 570         end = start + bytes;
 571         while (end > fault && end[-1] == value)
 572                 end--;
 573
 574         slab_bug(s, "%s overwritten", what);
 575         printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
 576                                         fault, end - 1, fault[0], value);
 577         print_trailer(s, page, object);
 578
 579         restore_bytes(s, what, value, fault, end);
 580         return 0;
 581 }
 582
 583 /*
 584  * Object layout:
 585  *
 586  * object address
 587  *      Bytes of the object to be managed.
 588  *      If the freepointer may overlay the object then the free
 589  *      pointer is the first word of the object.
 590  *
 591  *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 592  *      0xa5 (POISON_END)
 593  *
 594  * object + s->objsize
 595  *      Padding to reach word boundary. This is also used for Redzoning.
 596  *      Padding is extended by another word if Redzoning is enabled and
 597  *      objsize == inuse.
 598  *
 599  *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 600  *      0xcc (RED_ACTIVE) for objects in use.
 601  *
 602  * object + s->inuse
 603  *      Meta data starts here.
 604  *
 605  *      A. Free pointer (if we cannot overwrite object on free)
 606  *      B. Tracking data for SLAB_STORE_USER
 607  *      C. Padding to reach required alignment boundary or at mininum
 608  *              one word if debuggin is on to be able to detect writes
 609  *              before the word boundary.
 610  *
 611  *      Padding is done using 0x5a (POISON_INUSE)
 612  *
 613  * object + s->size
 614  *      Nothing is used beyond s->size.
 615  */
 616
 617 static int check_pad_bytes(struct kmem_cache *s, struct slqb_page *page, u8 *p)
 618 {
 619         unsigned long off = s->inuse;   /* The end of info */
 620
 621         if (s->offset) {
 622                 /* Freepointer is placed after the object. */
 623                 off += sizeof(void *);
 624         }
 625
 626         if (s->flags & SLAB_STORE_USER) {
 627                 /* We also have user information there */
 628                 off += 2 * sizeof(struct track);
 629         }
 630
 631         if (s->size == off)
 632                 return 1;
 633
 634         return check_bytes_and_report(s, page, p, "Object padding",
 635                                 p + off, POISON_INUSE, s->size - off);
 636 }
 637
 638 static int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
 639 {
 640         u8 *start;
 641         u8 *fault;
 642         u8 *end;
 643         int length;
 644         int remainder;
 645
 646         if (!(s->flags & SLAB_POISON))
 647                 return 1;
 648
 649         start = slqb_page_address(page);
 650         end = start + (PAGE_SIZE << s->order);
 651         length = s->objects * s->size;
 652         remainder = end - (start + length);
 653         if (!remainder)
 654                 return 1;
 655
 656         fault = check_bytes(start + length, POISON_INUSE, remainder);
 657         if (!fault)
 658                 return 1;
 659
 660         while (end > fault && end[-1] == POISON_INUSE)
 661                 end--;
 662
 663         slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
 664         print_section("Padding", start, length);
 665
 666         restore_bytes(s, "slab padding", POISON_INUSE, start, end);
 667         return 0;
 668 }
 669
 670 static int check_object(struct kmem_cache *s, struct slqb_page *page,
 671                                         void *object, int active)
 672 {
 673         u8 *p = object;
 674         u8 *endobject = object + s->objsize;
 675
 676         if (s->flags & SLAB_RED_ZONE) {
 677                 unsigned int red =
 678                         active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
 679
 680                 if (!check_bytes_and_report(s, page, object, "Redzone",
 681                         endobject, red, s->inuse - s->objsize))
 682                         return 0;
 683         } else {
 684                 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
 685                         check_bytes_and_report(s, page, p, "Alignment padding",
 686                                 endobject, POISON_INUSE, s->inuse - s->objsize);
 687                 }
 688         }
 689
 690         if (s->flags & SLAB_POISON) {
 691                 if (!active && (s->flags & __OBJECT_POISON)) {
 692                         if (!check_bytes_and_report(s, page, p, "Poison", p,
 693                                         POISON_FREE, s->objsize - 1))
 694                                 return 0;
 695
 696                         if (!check_bytes_and_report(s, page, p, "Poison",
 697                                         p + s->objsize - 1, POISON_END, 1))
 698                                 return 0;
 699                 }
 700
 701                 /*
 702                  * check_pad_bytes cleans up on its own.
 703                  */
 704                 check_pad_bytes(s, page, p);
 705         }
 706
 707         return 1;
 708 }
 709
 710 static int check_slab(struct kmem_cache *s, struct slqb_page *page)
 711 {
 712         if (!(page->flags & PG_SLQB_BIT)) {
 713                 slab_err(s, page, "Not a valid slab page");
 714                 return 0;
 715         }
 716         if (page->inuse == 0) {
 717                 slab_err(s, page, "inuse before free / after alloc", s->name);
 718                 return 0;
 719         }
 720         if (page->inuse > s->objects) {
 721                 slab_err(s, page, "inuse %u > max %u",
 722                         s->name, page->inuse, s->objects);
 723                 return 0;
 724         }
 725         /* Slab_pad_check fixes things up after itself */
 726         slab_pad_check(s, page);
 727         return 1;
 728 }
 729
 730 static void trace(struct kmem_cache *s, struct slqb_page *page,
 731                         void *object, int alloc)
 732 {
 733         if (s->flags & SLAB_TRACE) {
 734                 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 735                         s->name,
 736                         alloc ? "alloc" : "free",
 737                         object, page->inuse,
 738                         page->freelist);
 739
 740                 if (!alloc)
 741                         print_section("Object", (void *)object, s->objsize);
 742
 743                 dump_stack();
 744         }
 745 }
 746
 747 static void setup_object_debug(struct kmem_cache *s, struct slqb_page *page,
 748                                 void *object)
 749 {
 750         if (!slab_debug(s))
 751                 return;
 752
 753         if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
 754                 return;
 755
 756         init_object(s, object, 0);
 757         init_tracking(s, object);
 758 }
 759
 760 static int alloc_debug_processing(struct kmem_cache *s,
 761                                         void *object, unsigned long addr)
 762 {
 763         struct slqb_page *page;
 764         page = virt_to_head_slqb_page(object);
 765
 766         if (!check_slab(s, page))
 767                 goto bad;
 768
 769         if (!check_valid_pointer(s, page, object)) {
 770                 object_err(s, page, object, "Freelist Pointer check fails");
 771                 goto bad;
 772         }
 773
 774         if (object && !check_object(s, page, object, 0))
 775                 goto bad;
 776
 777         /* Success perform special debug activities for allocs */
 778         if (s->flags & SLAB_STORE_USER)
 779                 set_track(s, object, TRACK_ALLOC, addr);
 780         trace(s, page, object, 1);
 781         init_object(s, object, 1);
 782         return 1;
 783
 784 bad:
 785         return 0;
 786 }
 787
 788 static int free_debug_processing(struct kmem_cache *s,
 789                                         void *object, unsigned long addr)
 790 {
 791         struct slqb_page *page;
 792         page = virt_to_head_slqb_page(object);
 793
 794         if (!check_slab(s, page))
 795                 goto fail;
 796
 797         if (!check_valid_pointer(s, page, object)) {
 798                 slab_err(s, page, "Invalid object pointer 0x%p", object);
 799                 goto fail;
 800         }
 801
 802         if (!check_object(s, page, object, 1))
 803                 return 0;
 804
 805         /* Special debug activities for freeing objects */
 806         if (s->flags & SLAB_STORE_USER)
 807                 set_track(s, object, TRACK_FREE, addr);
 808         trace(s, page, object, 0);
 809         init_object(s, object, 0);
 810         return 1;
 811
 812 fail:
 813         slab_fix(s, "Object at 0x%p not freed", object);
 814         return 0;
 815 }
 816
 817 static int __init setup_slqb_debug(char *str)
 818 {
 819         slqb_debug = DEBUG_DEFAULT_FLAGS;
 820         if (*str++ != '=' || !*str) {
 821                 /*
 822                  * No options specified. Switch on full debugging.
 823                  */
 824                 goto out;
 825         }
 826
 827         if (*str == ',') {
 828                 /*
 829                  * No options but restriction on slabs. This means full
 830                  * debugging for slabs matching a pattern.
 831                  */
 832                 goto check_slabs;
 833         }
 834
 835         slqb_debug = 0;
 836         if (*str == '-') {
 837                 /*
 838                  * Switch off all debugging measures.
 839                  */
 840                 goto out;
 841         }
 842
 843         /*
 844          * Determine which debug features should be switched on
 845          */
 846         for (; *str && *str != ','; str++) {
 847                 switch (tolower(*str)) {
 848                 case 'f':
 849                         slqb_debug |= SLAB_DEBUG_FREE;
 850                         break;
 851                 case 'z':
 852                         slqb_debug |= SLAB_RED_ZONE;
 853                         break;
 854                 case 'p':
 855                         slqb_debug |= SLAB_POISON;
 856                         break;
 857                 case 'u':
 858                         slqb_debug |= SLAB_STORE_USER;
 859                         break;
 860                 case 't':
 861                         slqb_debug |= SLAB_TRACE;
 862                         break;
 863                 default:
 864                         printk(KERN_ERR "slqb_debug option '%c' "
 865                                 "unknown. skipped\n", *str);
 866                 }
 867         }
 868
 869 check_slabs:
 870         if (*str == ',')
 871                 slqb_debug_slabs = str + 1;
 872 out:
 873         return 1;
 874 }
 875 __setup("slqb_debug", setup_slqb_debug);
 876
 877 static int __init setup_slqb_min_order(char *str)
 878 {
 879         get_option(&str, &slqb_min_order);
 880         slqb_min_order = min(slqb_min_order, MAX_ORDER - 1);
 881
 882         return 1;
 883 }
 884 __setup("slqb_min_order=", setup_slqb_min_order);
 885
 886 static int __init setup_slqb_min_objects(char *str)
 887 {
 888         get_option(&str, &slqb_min_objects);
 889
 890         return 1;
 891 }
 892
 893 __setup("slqb_min_objects=", setup_slqb_min_objects);
 894
 895 static unsigned long kmem_cache_flags(unsigned long objsize,
 896                                 unsigned long flags, const char *name,
 897                                 void (*ctor)(void *))
 898 {
 899         /*
 900          * Enable debugging if selected on the kernel commandline.
 901          */
 902         if (slqb_debug && (!slqb_debug_slabs ||
 903             strncmp(slqb_debug_slabs, name,
 904                 strlen(slqb_debug_slabs)) == 0))
 905                         flags |= slqb_debug;
 906
 907         if (num_possible_nodes() > 1)
 908                 flags |= SLAB_NUMA;
 909
 910         return flags;
 911 }
 912 #else
 913 static inline void setup_object_debug(struct kmem_cache *s,
 914                         struct slqb_page *page, void *object)
 915 {
 916 }
 917
 918 static inline int alloc_debug_processing(struct kmem_cache *s,
 919                         void *object, unsigned long addr)
 920 {
 921         return 0;
 922 }
 923
 924 static inline int free_debug_processing(struct kmem_cache *s,
 925                         void *object, unsigned long addr)
 926 {
 927         return 0;
 928 }
 929
 930 static inline int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
 931 {
 932         return 1;
 933 }
 934
 935 static inline int check_object(struct kmem_cache *s, struct slqb_page *page,
 936                         void *object, int active)
 937 {
 938         return 1;
 939 }
 940
 941 static inline void add_full(struct kmem_cache_node *n, struct slqb_page *page)
 942 {
 943 }
 944
 945 static inline unsigned long kmem_cache_flags(unsigned long objsize,
 946         unsigned long flags, const char *name, void (*ctor)(void *))
 947 {
 948         if (num_possible_nodes() > 1)
 949                 flags |= SLAB_NUMA;
 950         return flags;
 951 }
 952
 953 static const int slqb_debug;
 954 #endif
 955
 956 /*
 957  * allocate a new slab (return its corresponding struct slqb_page)
 958  */
 959 static struct slqb_page *allocate_slab(struct kmem_cache *s,
 960                                         gfp_t flags, int node)
 961 {
 962         struct slqb_page *page;
 963         int pages = 1 << s->order;
 964
 965         flags |= s->allocflags;
 966
 967         page = (struct slqb_page *)alloc_pages_node(node, flags, s->order);
 968         if (!page)
 969                 return NULL;
 970
 971         mod_zone_page_state(slqb_page_zone(page),
 972                 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
 973                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 974                 pages);
 975
 976         return page;
 977 }
 978
 979 /*
 980  * Called once for each object on a new slab page
 981  */
 982 static void setup_object(struct kmem_cache *s,
 983                                 struct slqb_page *page, void *object)
 984 {
 985         setup_object_debug(s, page, object);
 986         if (unlikely(s->ctor))
 987                 s->ctor(object);
 988 }
 989
 990 /*
 991  * Allocate a new slab, set up its object list.
 992  */
 993 static struct slqb_page *new_slab_page(struct kmem_cache *s,
 994                                 gfp_t flags, int node, unsigned int colour)
 995 {
 996         struct slqb_page *page;
 997         void *start;
 998         void *last;
 999         void *p;
1000
1001         BUG_ON(flags & GFP_SLAB_BUG_MASK);
1002
1003         page = allocate_slab(s,
1004                 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1005         if (!page)
1006                 goto out;
1007
1008         page->flags |= PG_SLQB_BIT;
1009
1010         start = page_address(&page->page);
1011
1012         if (unlikely(slab_poison(s)))
1013                 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
1014
1015         start += colour;
1016
1017         last = start;
1018         for_each_object(p, s, start) {
1019                 setup_object(s, page, p);
1020                 set_freepointer(s, last, p);
1021                 last = p;
1022         }
1023         set_freepointer(s, last, NULL);
1024
1025         page->freelist = start;
1026         page->inuse = 0;
1027 out:
1028         return page;
1029 }
1030
1031 /*
1032  * Free a slab page back to the page allocator
1033  */
1034 static void __free_slab(struct kmem_cache *s, struct slqb_page *page)
1035 {
1036         int pages = 1 << s->order;
1037
1038         if (unlikely(slab_debug(s))) {
1039                 void *p;
1040
1041                 slab_pad_check(s, page);
1042                 for_each_free_object(p, s, page->freelist)
1043                         check_object(s, page, p, 0);
1044         }
1045
1046         mod_zone_page_state(slqb_page_zone(page),
1047                 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1048                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1049                 -pages);
1050
1051         __free_slqb_pages(page, s->order, pages);
1052 }
1053
1054 static void rcu_free_slab(struct rcu_head *h)
1055 {
1056         struct slqb_page *page;
1057
1058         page = container_of(h, struct slqb_page, rcu_head);
1059         __free_slab(page->list->cache, page);
1060 }
1061
1062 static void free_slab(struct kmem_cache *s, struct slqb_page *page)
1063 {
1064         VM_BUG_ON(page->inuse);
1065         if (unlikely(s->flags & SLAB_DESTROY_BY_RCU))
1066                 call_rcu(&page->rcu_head, rcu_free_slab);
1067         else
1068                 __free_slab(s, page);
1069 }
1070
1071 /*
1072  * Return an object to its slab.
1073  *
1074  * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1075  * list_lock in the case of per-node list.
1076  */
1077 static int free_object_to_page(struct kmem_cache *s,
1078                         struct kmem_cache_list *l, struct slqb_page *page,
1079                         void *object)
1080 {
1081         VM_BUG_ON(page->list != l);
1082
1083         set_freepointer(s, object, page->freelist);
1084         page->freelist = object;
1085         page->inuse--;
1086
1087         if (!page->inuse) {
1088                 if (likely(s->objects > 1)) {
1089                         l->nr_partial--;
1090                         list_del(&page->lru);
1091                 }
1092                 l->nr_slabs--;
1093                 free_slab(s, page);
1094                 slqb_stat_inc(l, FLUSH_SLAB_FREE);
1095                 return 1;
1096
1097         } else if (page->inuse + 1 == s->objects) {
1098                 l->nr_partial++;
1099                 list_add(&page->lru, &l->partial);
1100                 slqb_stat_inc(l, FLUSH_SLAB_PARTIAL);
1101                 return 0;
1102         }
1103         return 0;
1104 }
1105
1106 #ifdef CONFIG_SMP
1107 static void slab_free_to_remote(struct kmem_cache *s, struct slqb_page *page,
1108                                 void *object, struct kmem_cache_cpu *c);
1109 #endif
1110
1111 /*
1112  * Flush the LIFO list of objects on a list. They are sent back to their pages
1113  * in case the pages also belong to the list, or to our CPU's remote-free list
1114  * in the case they do not.
1115  *
1116  * Doesn't flush the entire list. flush_free_list_all does.
1117  *
1118  * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1119  * list_lock in the case of per-node list.
1120  */
1121 static void flush_free_list(struct kmem_cache *s, struct kmem_cache_list *l)
1122 {
1123         void **head;
1124         int nr;
1125         int locked = 0;
1126
1127         nr = l->freelist.nr;
1128         if (unlikely(!nr))
1129                 return;
1130
1131         nr = min(slab_freebatch(s), nr);
1132
1133         slqb_stat_inc(l, FLUSH_FREE_LIST);
1134         slqb_stat_add(l, FLUSH_FREE_LIST_OBJECTS, nr);
1135
1136         l->freelist.nr -= nr;
1137         head = l->freelist.head;
1138
1139         do {
1140                 struct slqb_page *page;
1141                 void **object;
1142
1143                 object = head;
1144                 VM_BUG_ON(!object);
1145                 head = get_freepointer(s, object);
1146                 page = virt_to_head_slqb_page(object);
1147
1148 #ifdef CONFIG_SMP
1149                 if (page->list != l) {
1150                         struct kmem_cache_cpu *c;
1151
1152                         if (locked) {
1153                                 spin_unlock(&l->page_lock);
1154                                 locked = 0;
1155                         }
1156
1157                         c = get_cpu_slab(s, smp_processor_id());
1158
1159                         slab_free_to_remote(s, page, object, c);
1160                         slqb_stat_inc(l, FLUSH_FREE_LIST_REMOTE);
1161                 } else
1162 #endif
1163                 {
1164                         if (!locked) {
1165                                 spin_lock(&l->page_lock);
1166                                 locked = 1;
1167                         }
1168                         free_object_to_page(s, l, page, object);
1169                 }
1170
1171                 nr--;
1172         } while (nr);
1173
1174         if (locked)
1175                 spin_unlock(&l->page_lock);
1176
1177         l->freelist.head = head;
1178         if (!l->freelist.nr)
1179                 l->freelist.tail = NULL;
1180 }
1181
1182 static void flush_free_list_all(struct kmem_cache *s, struct kmem_cache_list *l)
1183 {
1184         while (l->freelist.nr)
1185                 flush_free_list(s, l);
1186 }
1187
1188 #ifdef CONFIG_SMP
1189 /*
1190  * If enough objects have been remotely freed back to this list,
1191  * remote_free_check will be set. In which case, we'll eventually come here
1192  * to take those objects off our remote_free list and onto our LIFO freelist.
1193  *
1194  * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1195  * list_lock in the case of per-node list.
1196  */
1197 static void claim_remote_free_list(struct kmem_cache *s,
1198                                         struct kmem_cache_list *l)
1199 {
1200         void **head, **tail;
1201         int nr;
1202
1203         if (!l->remote_free.list.nr)
1204                 return;
1205
1206         spin_lock(&l->remote_free.lock);
1207
1208         l->remote_free_check = 0;
1209         head = l->remote_free.list.head;
1210         l->remote_free.list.head = NULL;
1211         tail = l->remote_free.list.tail;
1212         l->remote_free.list.tail = NULL;
1213         nr = l->remote_free.list.nr;
1214         l->remote_free.list.nr = 0;
1215
1216         spin_unlock(&l->remote_free.lock);
1217
1218         VM_BUG_ON(!nr);
1219
1220         if (!l->freelist.nr) {
1221                 /* Get head hot for likely subsequent allocation or flush */
1222                 prefetchw(head);
1223                 l->freelist.head = head;
1224         } else
1225                 set_freepointer(s, l->freelist.tail, head);
1226         l->freelist.tail = tail;
1227
1228         l->freelist.nr += nr;
1229
1230         slqb_stat_inc(l, CLAIM_REMOTE_LIST);
1231         slqb_stat_add(l, CLAIM_REMOTE_LIST_OBJECTS, nr);
1232 }
1233 #else
1234 static inline void claim_remote_free_list(struct kmem_cache *s,
1235                                         struct kmem_cache_list *l)
1236 {
1237 }
1238 #endif
1239
1240 /*
1241  * Allocation fastpath. Get an object from the list's LIFO freelist, or
1242  * return NULL if it is empty.
1243  *
1244  * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1245  * list_lock in the case of per-node list.
1246  */
1247 static __always_inline void *__cache_list_get_object(struct kmem_cache *s,
1248                                                 struct kmem_cache_list *l)
1249 {
1250         void *object;
1251
1252         object = l->freelist.head;
1253         if (likely(object)) {
1254                 void *next = get_freepointer(s, object);
1255
1256                 VM_BUG_ON(!l->freelist.nr);
1257                 l->freelist.nr--;
1258                 l->freelist.head = next;
1259
1260                 return object;
1261         }
1262         VM_BUG_ON(l->freelist.nr);
1263
1264 #ifdef CONFIG_SMP
1265         if (unlikely(l->remote_free_check)) {
1266                 claim_remote_free_list(s, l);
1267
1268                 if (l->freelist.nr > slab_hiwater(s))
1269                         flush_free_list(s, l);
1270
1271                 /* repetition here helps gcc :( */
1272                 object = l->freelist.head;
1273                 if (likely(object)) {
1274                         void *next = get_freepointer(s, object);
1275
1276                         VM_BUG_ON(!l->freelist.nr);
1277                         l->freelist.nr--;
1278                         l->freelist.head = next;
1279
1280                         return object;
1281                 }
1282                 VM_BUG_ON(l->freelist.nr);
1283         }
1284 #endif
1285
1286         return NULL;
1287 }
1288
1289 /*
1290  * Slow(er) path. Get a page from this list's existing pages. Will be a
1291  * new empty page in the case that __slab_alloc_page has just been called
1292  * (empty pages otherwise never get queued up on the lists), or a partial page
1293  * already on the list.
1294  *
1295  * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1296  * list_lock in the case of per-node list.
1297  */
1298 static noinline void *__cache_list_get_page(struct kmem_cache *s,
1299                                 struct kmem_cache_list *l)
1300 {
1301         struct slqb_page *page;
1302         void *object;
1303
1304         if (unlikely(!l->nr_partial))
1305                 return NULL;
1306
1307         page = list_first_entry(&l->partial, struct slqb_page, lru);
1308         VM_BUG_ON(page->inuse == s->objects);
1309         if (page->inuse + 1 == s->objects) {
1310                 l->nr_partial--;
1311                 list_del(&page->lru);
1312         }
1313
1314         VM_BUG_ON(!page->freelist);
1315
1316         page->inuse++;
1317
1318         object = page->freelist;
1319         page->freelist = get_freepointer(s, object);
1320         if (page->freelist)
1321                 prefetchw(page->freelist);
1322         VM_BUG_ON((page->inuse == s->objects) != (page->freelist == NULL));
1323         slqb_stat_inc(l, ALLOC_SLAB_FILL);
1324
1325         return object;
1326 }
1327
1328 static void *cache_list_get_page(struct kmem_cache *s,
1329                                 struct kmem_cache_list *l)
1330 {
1331         void *object;
1332
1333         if (unlikely(!l->nr_partial))
1334                 return NULL;
1335
1336         spin_lock(&l->page_lock);
1337         object = __cache_list_get_page(s, l);
1338         spin_unlock(&l->page_lock);
1339
1340         return object;
1341 }
1342
1343 /*
1344  * Allocation slowpath. Allocate a new slab page from the page allocator, and
1345  * put it on the list's partial list. Must be followed by an allocation so
1346  * that we don't have dangling empty pages on the partial list.
1347  *
1348  * Returns 0 on allocation failure.
1349  *
1350  * Must be called with interrupts disabled.
1351  */
1352 static noinline void *__slab_alloc_page(struct kmem_cache *s,
1353                                 gfp_t gfpflags, int node)
1354 {
1355         struct slqb_page *page;
1356         struct kmem_cache_list *l;
1357         struct kmem_cache_cpu *c;
1358         unsigned int colour;
1359         void *object;
1360
1361         c = get_cpu_slab(s, smp_processor_id());
1362         colour = c->colour_next;
1363         c->colour_next += s->colour_off;
1364         if (c->colour_next >= s->colour_range)
1365                 c->colour_next = 0;
1366
1367         /* Caller handles __GFP_ZERO */
1368         gfpflags &= ~__GFP_ZERO;
1369
1370         if (gfpflags & __GFP_WAIT)
1371                 local_irq_enable();
1372         page = new_slab_page(s, gfpflags, node, colour);
1373         if (gfpflags & __GFP_WAIT)
1374                 local_irq_disable();
1375         if (unlikely(!page))
1376                 return page;
1377
1378         if (!NUMA_BUILD || likely(slqb_page_to_nid(page) == numa_node_id())) {
1379                 struct kmem_cache_cpu *c;
1380                 int cpu = smp_processor_id();
1381
1382                 c = get_cpu_slab(s, cpu);
1383                 l = &c->list;
1384                 page->list = l;
1385
1386                 spin_lock(&l->page_lock);
1387                 l->nr_slabs++;
1388                 l->nr_partial++;
1389                 list_add(&page->lru, &l->partial);
1390                 slqb_stat_inc(l, ALLOC);
1391                 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1392                 object = __cache_list_get_page(s, l);
1393                 spin_unlock(&l->page_lock);
1394         } else {
1395 #ifdef CONFIG_NUMA
1396                 struct kmem_cache_node *n;
1397
1398                 n = s->node_slab[slqb_page_to_nid(page)];
1399                 l = &n->list;
1400                 page->list = l;
1401
1402                 spin_lock(&n->list_lock);
1403                 spin_lock(&l->page_lock);
1404                 l->nr_slabs++;
1405                 l->nr_partial++;
1406                 list_add(&page->lru, &l->partial);
1407                 slqb_stat_inc(l, ALLOC);
1408                 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1409                 object = __cache_list_get_page(s, l);
1410                 spin_unlock(&l->page_lock);
1411                 spin_unlock(&n->list_lock);
1412 #endif
1413         }
1414         VM_BUG_ON(!object);
1415         return object;
1416 }
1417
1418 #ifdef CONFIG_NUMA
1419 static noinline int alternate_nid(struct kmem_cache *s,
1420                                 gfp_t gfpflags, int node)
1421 {
1422         if (in_interrupt() || (gfpflags & __GFP_THISNODE))
1423                 return node;
1424         if (cpuset_do_slab_mem_spread() && (s->flags & SLAB_MEM_SPREAD))
1425                 return cpuset_mem_spread_node();
1426         else if (current->mempolicy)
1427                 return slab_node(current->mempolicy);
1428         return node;
1429 }
1430
1431 /*
1432  * Allocate an object from a remote node. Return NULL if none could be found
1433  * (in which case, caller should allocate a new slab)
1434  *
1435  * Must be called with interrupts disabled.
1436  */
1437 static void *__remote_slab_alloc_node(struct kmem_cache *s,
1438                                 gfp_t gfpflags, int node)
1439 {
1440         struct kmem_cache_node *n;
1441         struct kmem_cache_list *l;
1442         void *object;
1443
1444         n = s->node_slab[node];
1445         if (unlikely(!n)) /* node has no memory */
1446                 return NULL;
1447         l = &n->list;
1448
1449         spin_lock(&n->list_lock);
1450
1451         object = __cache_list_get_object(s, l);
1452         if (unlikely(!object)) {
1453                 object = cache_list_get_page(s, l);
1454                 if (unlikely(!object)) {
1455                         spin_unlock(&n->list_lock);
1456                         return __slab_alloc_page(s, gfpflags, node);
1457                 }
1458         }
1459         if (likely(object))
1460                 slqb_stat_inc(l, ALLOC);
1461         spin_unlock(&n->list_lock);
1462         return object;
1463 }
1464
1465 static noinline void *__remote_slab_alloc(struct kmem_cache *s,
1466                                 gfp_t gfpflags, int node)
1467 {
1468         void *object;
1469         struct zonelist *zonelist;
1470         struct zoneref *z;
1471         struct zone *zone;
1472         enum zone_type high_zoneidx = gfp_zone(gfpflags);
1473
1474         object = __remote_slab_alloc_node(s, gfpflags, node);
1475         if (likely(object || (gfpflags & __GFP_THISNODE)))
1476                 return object;
1477
1478         zonelist = node_zonelist(slab_node(current->mempolicy), gfpflags);
1479         for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1480                 if (!cpuset_zone_allowed_hardwall(zone, gfpflags))
1481                         continue;
1482
1483                 node = zone_to_nid(zone);
1484                 object = __remote_slab_alloc_node(s, gfpflags, node);
1485                 if (likely(object))
1486                         return object;
1487         }
1488         return NULL;
1489 }
1490 #endif
1491
1492 /*
1493  * Main allocation path. Return an object, or NULL on allocation failure.
1494  *
1495  * Must be called with interrupts disabled.
1496  */
1497 static __always_inline void *__slab_alloc(struct kmem_cache *s,
1498                                 gfp_t gfpflags, int node)
1499 {
1500         void *object;
1501         struct kmem_cache_cpu *c;
1502         struct kmem_cache_list *l;
1503
1504 #ifdef CONFIG_NUMA
1505         if (unlikely(node != -1) && unlikely(node != numa_node_id())) {
1506 try_remote:
1507                 return __remote_slab_alloc(s, gfpflags, node);
1508         }
1509 #endif
1510
1511         c = get_cpu_slab(s, smp_processor_id());
1512         VM_BUG_ON(!c);
1513         l = &c->list;
1514         object = __cache_list_get_object(s, l);
1515         if (unlikely(!object)) {
1516 #ifdef CONFIG_NUMA
1517                 int thisnode = numa_node_id();
1518
1519                 /*
1520                  * If the local node is memoryless, try remote alloc before
1521                  * trying the page allocator. Otherwise, what happens is
1522                  * objects are always freed to remote lists but the allocation
1523                  * side always allocates a new page with only one object
1524                  * used in each page
1525                  */
1526                 if (unlikely(!node_state(thisnode, N_HIGH_MEMORY)))
1527                         object = __remote_slab_alloc(s, gfpflags, thisnode);
1528 #endif
1529
1530                 if (!object) {
1531                         object = cache_list_get_page(s, l);
1532                         if (unlikely(!object)) {
1533                                 object = __slab_alloc_page(s, gfpflags, node);
1534 #ifdef CONFIG_NUMA
1535                                 if (unlikely(!object)) {
1536                                         node = numa_node_id();
1537                                         goto try_remote;
1538                                 }
1539 #endif
1540                                 return object;
1541                         }
1542                 }
1543         }
1544         if (likely(object))
1545                 slqb_stat_inc(l, ALLOC);
1546         return object;
1547 }
1548
1549 /*
1550  * Perform some interrupts-on processing around the main allocation path
1551  * (debug checking and memset()ing).
1552  */
1553 static __always_inline void *slab_alloc(struct kmem_cache *s,
1554                                 gfp_t gfpflags, int node, unsigned long addr)
1555 {
1556         void *object;
1557         unsigned long flags;
1558
1559         gfpflags &= gfp_allowed_mask;
1560
1561         lockdep_trace_alloc(gfpflags);
1562         might_sleep_if(gfpflags & __GFP_WAIT);
1563
1564         if (should_failslab(s->objsize, gfpflags))
1565                 return NULL;
1566
1567 again:
1568         local_irq_save(flags);
1569         object = __slab_alloc(s, gfpflags, node);
1570         local_irq_restore(flags);
1571
1572         if (unlikely(slab_debug(s)) && likely(object)) {
1573                 if (unlikely(!alloc_debug_processing(s, object, addr)))
1574                         goto again;
1575         }
1576
1577         if (unlikely(gfpflags & __GFP_ZERO) && likely(object))
1578                 memset(object, 0, s->objsize);
1579
1580         return object;
1581 }
1582
1583 static __always_inline void *__kmem_cache_alloc(struct kmem_cache *s,
1584                                 gfp_t gfpflags, unsigned long caller)
1585 {
1586         int node = -1;
1587
1588 #ifdef CONFIG_NUMA
1589         if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
1590                 node = alternate_nid(s, gfpflags, node);
1591 #endif
1592         return slab_alloc(s, gfpflags, node, caller);
1593 }
1594
1595 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1596 {
1597         return __kmem_cache_alloc(s, gfpflags, _RET_IP_);
1598 }
1599 EXPORT_SYMBOL(kmem_cache_alloc);
1600
1601 #ifdef CONFIG_NUMA
1602 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1603 {
1604         return slab_alloc(s, gfpflags, node, _RET_IP_);
1605 }
1606 EXPORT_SYMBOL(kmem_cache_alloc_node);
1607 #endif
1608
1609 #ifdef CONFIG_SMP
1610 /*
1611  * Flush this CPU's remote free list of objects back to the list from where
1612  * they originate. They end up on that list's remotely freed list, and
1613  * eventually we set it's remote_free_check if there are enough objects on it.
1614  *
1615  * This seems convoluted, but it keeps is from stomping on the target CPU's
1616  * fastpath cachelines.
1617  *
1618  * Must be called with interrupts disabled.
1619  */
1620 static void flush_remote_free_cache(struct kmem_cache *s,
1621                                 struct kmem_cache_cpu *c)
1622 {
1623         struct kmlist *src;
1624         struct kmem_cache_list *dst;
1625         unsigned int nr;
1626         int set;
1627
1628         src = &c->rlist;
1629         nr = src->nr;
1630         if (unlikely(!nr))
1631                 return;
1632
1633 #ifdef CONFIG_SLQB_STATS
1634         {
1635                 struct kmem_cache_list *l = &c->list;
1636
1637                 slqb_stat_inc(l, FLUSH_RFREE_LIST);
1638                 slqb_stat_add(l, FLUSH_RFREE_LIST_OBJECTS, nr);
1639         }
1640 #endif
1641
1642         dst = c->remote_cache_list;
1643
1644         /*
1645          * Less common case, dst is filling up so free synchronously.
1646          * No point in having remote CPU free thse as it will just
1647          * free them back to the page list anyway.
1648          */
1649         if (unlikely(dst->remote_free.list.nr > (slab_hiwater(s) >> 1))) {
1650                 void **head;
1651
1652                 head = src->head;
1653                 spin_lock(&dst->page_lock);
1654                 do {
1655                         struct slqb_page *page;
1656                         void **object;
1657
1658                         object = head;
1659                         VM_BUG_ON(!object);
1660                         head = get_freepointer(s, object);
1661                         page = virt_to_head_slqb_page(object);
1662
1663                         free_object_to_page(s, dst, page, object);
1664                         nr--;
1665                 } while (nr);
1666                 spin_unlock(&dst->page_lock);
1667
1668                 src->head = NULL;
1669                 src->tail = NULL;
1670                 src->nr = 0;
1671
1672                 return;
1673         }
1674
1675         spin_lock(&dst->remote_free.lock);
1676
1677         if (!dst->remote_free.list.head)
1678                 dst->remote_free.list.head = src->head;
1679         else
1680                 set_freepointer(s, dst->remote_free.list.tail, src->head);
1681         dst->remote_free.list.tail = src->tail;
1682
1683         src->head = NULL;
1684         src->tail = NULL;
1685         src->nr = 0;
1686
1687         if (dst->remote_free.list.nr < slab_freebatch(s))
1688                 set = 1;
1689         else
1690                 set = 0;
1691
1692         dst->remote_free.list.nr += nr;
1693
1694         if (unlikely(dst->remote_free.list.nr >= slab_freebatch(s) && set))
1695                 dst->remote_free_check = 1;
1696
1697         spin_unlock(&dst->remote_free.lock);
1698 }
1699
1700 /*
1701  * Free an object to this CPU's remote free list.
1702  *
1703  * Must be called with interrupts disabled.
1704  */
1705 static noinline void slab_free_to_remote(struct kmem_cache *s,
1706                                 struct slqb_page *page, void *object,
1707                                 struct kmem_cache_cpu *c)
1708 {
1709         struct kmlist *r;
1710
1711         /*
1712          * Our remote free list corresponds to a different list. Must
1713          * flush it and switch.
1714          */
1715         if (page->list != c->remote_cache_list) {
1716                 flush_remote_free_cache(s, c);
1717                 c->remote_cache_list = page->list;
1718         }
1719
1720         r = &c->rlist;
1721         if (!r->head)
1722                 r->head = object;
1723         else
1724                 set_freepointer(s, r->tail, object);
1725         set_freepointer(s, object, NULL);
1726         r->tail = object;
1727         r->nr++;
1728
1729         if (unlikely(r->nr >= slab_freebatch(s)))
1730                 flush_remote_free_cache(s, c);
1731 }
1732 #endif
1733
1734 /*
1735  * Main freeing path. Return an object, or NULL on allocation failure.
1736  *
1737  * Must be called with interrupts disabled.
1738  */
1739 static __always_inline void __slab_free(struct kmem_cache *s,
1740                                 struct slqb_page *page, void *object)
1741 {
1742         struct kmem_cache_cpu *c;
1743         struct kmem_cache_list *l;
1744         int thiscpu = smp_processor_id();
1745
1746         c = get_cpu_slab(s, thiscpu);
1747         l = &c->list;
1748
1749         slqb_stat_inc(l, FREE);
1750
1751         if (!NUMA_BUILD || !slab_numa(s) ||
1752                         likely(slqb_page_to_nid(page) == numa_node_id())) {
1753                 /*
1754                  * Freeing fastpath. Collects all local-node objects, not
1755                  * just those allocated from our per-CPU list. This allows
1756                  * fast transfer of objects from one CPU to another within
1757                  * a given node.
1758                  */
1759                 set_freepointer(s, object, l->freelist.head);
1760                 l->freelist.head = object;
1761                 if (!l->freelist.nr)
1762                         l->freelist.tail = object;
1763                 l->freelist.nr++;
1764
1765                 if (unlikely(l->freelist.nr > slab_hiwater(s)))
1766                         flush_free_list(s, l);
1767
1768         } else {
1769 #ifdef CONFIG_SMP
1770                 /*
1771                  * Freeing an object that was allocated on a remote node.
1772                  */
1773                 slab_free_to_remote(s, page, object, c);
1774                 slqb_stat_inc(l, FREE_REMOTE);
1775 #endif
1776         }
1777 }
1778
1779 /*
1780  * Perform some interrupts-on processing around the main freeing path
1781  * (debug checking).
1782  */
1783 static __always_inline void slab_free(struct kmem_cache *s,
1784                                 struct slqb_page *page, void *object)
1785 {
1786         unsigned long flags;
1787
1788         prefetchw(object);
1789
1790         debug_check_no_locks_freed(object, s->objsize);
1791         if (likely(object) && unlikely(slab_debug(s))) {
1792                 if (unlikely(!free_debug_processing(s, object, _RET_IP_)))
1793                         return;
1794         }
1795
1796         local_irq_save(flags);
1797         __slab_free(s, page, object);
1798         local_irq_restore(flags);
1799 }
1800
1801 void kmem_cache_free(struct kmem_cache *s, void *object)
1802 {
1803         struct slqb_page *page = NULL;
1804
1805         if (slab_numa(s))
1806                 page = virt_to_head_slqb_page(object);
1807         slab_free(s, page, object);
1808 }
1809 EXPORT_SYMBOL(kmem_cache_free);
1810
1811 /*
1812  * Calculate the order of allocation given an slab object size.
1813  *
1814  * Order 0 allocations are preferred since order 0 does not cause fragmentation
1815  * in the page allocator, and they have fastpaths in the page allocator. But
1816  * also minimise external fragmentation with large objects.
1817  */
1818 static int slab_order(int size, int max_order, int frac)
1819 {
1820         int order;
1821
1822         if (fls(size - 1) <= PAGE_SHIFT)
1823                 order = 0;
1824         else
1825                 order = fls(size - 1) - PAGE_SHIFT;
1826         if (order < slqb_min_order)
1827                 order = slqb_min_order;
1828
1829         while (order <= max_order) {
1830                 unsigned long slab_size = PAGE_SIZE << order;
1831                 unsigned long objects;
1832                 unsigned long waste;
1833
1834                 objects = slab_size / size;
1835                 if (!objects)
1836                         goto next;
1837
1838                 if (order < MAX_ORDER && objects < slqb_min_objects) {
1839                         /*
1840                          * if we don't have enough objects for min_objects,
1841                          * then try the next size up. Unless we have reached
1842                          * our maximum possible page size.
1843                          */
1844                         goto next;
1845                 }
1846
1847                 waste = slab_size - (objects * size);
1848
1849                 if (waste * frac <= slab_size)
1850                         break;
1851
1852 next:
1853                 order++;
1854         }
1855
1856         return order;
1857 }
1858
1859 static int calculate_order(int size)
1860 {
1861         int order;
1862
1863         /*
1864          * Attempt to find best configuration for a slab. This
1865          * works by first attempting to generate a layout with
1866          * the best configuration and backing off gradually.
1867          */
1868         order = slab_order(size, 1, 4);
1869         if (order <= 1)
1870                 return order;
1871
1872         /*
1873          * This size cannot fit in order-1. Allow bigger orders, but
1874          * forget about trying to save space.
1875          */
1876         order = slab_order(size, MAX_ORDER - 1, 0);
1877         if (order < MAX_ORDER)
1878                 return order;
1879
1880         return -ENOSYS;
1881 }
1882
1883 /*
1884  * Figure out what the alignment of the objects will be.
1885  */
1886 static unsigned long calculate_alignment(unsigned long flags,
1887                                 unsigned long align, unsigned long size)
1888 {
1889         /*
1890          * If the user wants hardware cache aligned objects then follow that
1891          * suggestion if the object is sufficiently large.
1892          *
1893          * The hardware cache alignment cannot override the specified
1894          * alignment though. If that is greater then use it.
1895          */
1896         if (flags & SLAB_HWCACHE_ALIGN) {
1897                 unsigned long ralign = cache_line_size();
1898
1899                 while (size <= ralign / 2)
1900                         ralign /= 2;
1901                 align = max(align, ralign);
1902         }
1903
1904         if (align < ARCH_SLAB_MINALIGN)
1905                 align = ARCH_SLAB_MINALIGN;
1906
1907         return ALIGN(align, sizeof(void *));
1908 }
1909
1910 static void init_kmem_cache_list(struct kmem_cache *s,
1911                                 struct kmem_cache_list *l)
1912 {
1913         l->cache                = s;
1914         l->freelist.nr          = 0;
1915         l->freelist.head        = NULL;
1916         l->freelist.tail        = NULL;
1917         l->nr_partial           = 0;
1918         l->nr_slabs             = 0;
1919         INIT_LIST_HEAD(&l->partial);
1920         spin_lock_init(&l->page_lock);
1921
1922 #ifdef CONFIG_SMP
1923         l->remote_free_check    = 0;
1924         spin_lock_init(&l->remote_free.lock);
1925         l->remote_free.list.nr  = 0;
1926         l->remote_free.list.head = NULL;
1927         l->remote_free.list.tail = NULL;
1928 #endif
1929
1930 #ifdef CONFIG_SLQB_STATS
1931         memset(l->stats, 0, sizeof(l->stats));
1932 #endif
1933 }
1934
1935 static void init_kmem_cache_cpu(struct kmem_cache *s,
1936                                 struct kmem_cache_cpu *c)
1937 {
1938         init_kmem_cache_list(s, &c->list);
1939
1940         c->colour_next          = 0;
1941 #ifdef CONFIG_SMP
1942         c->rlist.nr             = 0;
1943         c->rlist.head           = NULL;
1944         c->rlist.tail           = NULL;
1945         c->remote_cache_list    = NULL;
1946 #endif
1947 }
1948
1949 #ifdef CONFIG_NUMA
1950 static void init_kmem_cache_node(struct kmem_cache *s,
1951                                 struct kmem_cache_node *n)
1952 {
1953         spin_lock_init(&n->list_lock);
1954         init_kmem_cache_list(s, &n->list);
1955 }
1956 #endif
1957
1958 /* Initial slabs. */
1959 #ifdef CONFIG_SMP
1960 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus);
1961 #endif
1962 #ifdef CONFIG_NUMA
1963 /* XXX: really need a DEFINE_PER_NODE for per-node data because a static
1964  *      array is wasteful */
1965 static struct kmem_cache_node kmem_cache_nodes[MAX_NUMNODES];
1966 #endif
1967
1968 #ifdef CONFIG_SMP
1969 static struct kmem_cache kmem_cpu_cache;
1970 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cpu_cpus);
1971 #ifdef CONFIG_NUMA
1972 static struct kmem_cache_node kmem_cpu_nodes[MAX_NUMNODES]; /* XXX per-nid */
1973 #endif
1974 #endif
1975
1976 #ifdef CONFIG_NUMA
1977 static struct kmem_cache kmem_node_cache;
1978 #ifdef CONFIG_SMP
1979 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_node_cpus);
1980 #endif
1981 static struct kmem_cache_node kmem_node_nodes[MAX_NUMNODES]; /*XXX per-nid */
1982 #endif
1983
1984 #ifdef CONFIG_SMP
1985 static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1986                                 int cpu)
1987 {
1988         struct kmem_cache_cpu *c;
1989         int node;
1990
1991         node = cpu_to_node(cpu);
1992
1993         c = kmem_cache_alloc_node(&kmem_cpu_cache, GFP_KERNEL, node);
1994         if (!c)
1995                 return NULL;
1996
1997         init_kmem_cache_cpu(s, c);
1998         return c;
1999 }
2000
2001 static void free_kmem_cache_cpus(struct kmem_cache *s)
2002 {
2003         int cpu;
2004
2005         for_each_online_cpu(cpu) {
2006                 struct kmem_cache_cpu *c;
2007
2008                 c = s->cpu_slab[cpu];
2009                 if (c) {
2010                         kmem_cache_free(&kmem_cpu_cache, c);
2011                         s->cpu_slab[cpu] = NULL;
2012                 }
2013         }
2014 }
2015
2016 static int alloc_kmem_cache_cpus(struct kmem_cache *s)
2017 {
2018         int cpu;
2019
2020         for_each_online_cpu(cpu) {
2021                 struct kmem_cache_cpu *c;
2022
2023                 c = s->cpu_slab[cpu];
2024                 if (c)
2025                         continue;
2026
2027                 c = alloc_kmem_cache_cpu(s, cpu);
2028                 if (!c) {
2029                         free_kmem_cache_cpus(s);
2030                         return 0;
2031                 }
2032                 s->cpu_slab[cpu] = c;
2033         }
2034         return 1;
2035 }
2036
2037 #else
2038 static inline void free_kmem_cache_cpus(struct kmem_cache *s)
2039 {
2040 }
2041
2042 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2043 {
2044         init_kmem_cache_cpu(s, &s->cpu_slab);
2045         return 1;
2046 }
2047 #endif
2048
2049 #ifdef CONFIG_NUMA
2050 static void free_kmem_cache_nodes(struct kmem_cache *s)
2051 {
2052         int node;
2053
2054         for_each_node_state(node, N_NORMAL_MEMORY) {
2055                 struct kmem_cache_node *n;
2056
2057                 n = s->node_slab[node];
2058                 if (n) {
2059                         kmem_cache_free(&kmem_node_cache, n);
2060                         s->node_slab[node] = NULL;
2061                 }
2062         }
2063 }
2064
2065 static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2066 {
2067         int node;
2068
2069         for_each_node_state(node, N_NORMAL_MEMORY) {
2070                 struct kmem_cache_node *n;
2071
2072                 n = kmem_cache_alloc_node(&kmem_node_cache, GFP_KERNEL, node);
2073                 if (!n) {
2074                         free_kmem_cache_nodes(s);
2075                         return 0;
2076                 }
2077                 init_kmem_cache_node(s, n);
2078                 s->node_slab[node] = n;
2079         }
2080         return 1;
2081 }
2082 #else
2083 static void free_kmem_cache_nodes(struct kmem_cache *s)
2084 {
2085 }
2086
2087 static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2088 {
2089         return 1;
2090 }
2091 #endif
2092
2093 /*
2094  * calculate_sizes() determines the order and the distribution of data within
2095  * a slab object.
2096  */
2097 static int calculate_sizes(struct kmem_cache *s)
2098 {
2099         unsigned long flags = s->flags;
2100         unsigned long size = s->objsize;
2101         unsigned long align = s->align;
2102
2103         /*
2104          * Determine if we can poison the object itself. If the user of
2105          * the slab may touch the object after free or before allocation
2106          * then we should never poison the object itself.
2107          */
2108         if (slab_poison(s) && !(flags & SLAB_DESTROY_BY_RCU) && !s->ctor)
2109                 s->flags |= __OBJECT_POISON;
2110         else
2111                 s->flags &= ~__OBJECT_POISON;
2112
2113         /*
2114          * Round up object size to the next word boundary. We can only
2115          * place the free pointer at word boundaries and this determines
2116          * the possible location of the free pointer.
2117          */
2118         size = ALIGN(size, sizeof(void *));
2119
2120 #ifdef CONFIG_SLQB_DEBUG
2121         /*
2122          * If we are Redzoning then check if there is some space between the
2123          * end of the object and the free pointer. If not then add an
2124          * additional word to have some bytes to store Redzone information.
2125          */
2126         if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2127                 size += sizeof(void *);
2128 #endif
2129
2130         /*
2131          * With that we have determined the number of bytes in actual use
2132          * by the object. This is the potential offset to the free pointer.
2133          */
2134         s->inuse = size;
2135
2136         if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || s->ctor)) {
2137                 /*
2138                  * Relocate free pointer after the object if it is not
2139                  * permitted to overwrite the first word of the object on
2140                  * kmem_cache_free.
2141                  *
2142                  * This is the case if we do RCU, have a constructor or
2143                  * destructor or are poisoning the objects.
2144                  */
2145                 s->offset = size;
2146                 size += sizeof(void *);
2147         }
2148
2149 #ifdef CONFIG_SLQB_DEBUG
2150         if (flags & SLAB_STORE_USER) {
2151                 /*
2152                  * Need to store information about allocs and frees after
2153                  * the object.
2154                  */
2155                 size += 2 * sizeof(struct track);
2156         }
2157
2158         if (flags & SLAB_RED_ZONE) {
2159                 /*
2160                  * Add some empty padding so that we can catch
2161                  * overwrites from earlier objects rather than let
2162                  * tracking information or the free pointer be
2163                  * corrupted if an user writes before the start
2164                  * of the object.
2165                  */
2166                 size += sizeof(void *);
2167         }
2168 #endif
2169
2170         /*
2171          * Determine the alignment based on various parameters that the
2172          * user specified and the dynamic determination of cache line size
2173          * on bootup.
2174          */
2175         align = calculate_alignment(flags, align, s->objsize);
2176
2177         /*
2178          * SLQB stores one object immediately after another beginning from
2179          * offset 0. In order to align the objects we have to simply size
2180          * each object to conform to the alignment.
2181          */
2182         size = ALIGN(size, align);
2183         s->size = size;
2184         s->order = calculate_order(size);
2185
2186         if (s->order < 0)
2187                 return 0;
2188
2189         s->allocflags = 0;
2190         if (s->order)
2191                 s->allocflags |= __GFP_COMP;
2192
2193         if (s->flags & SLAB_CACHE_DMA)
2194                 s->allocflags |= SLQB_DMA;
2195
2196         if (s->flags & SLAB_RECLAIM_ACCOUNT)
2197                 s->allocflags |= __GFP_RECLAIMABLE;
2198
2199         /*
2200          * Determine the number of objects per slab
2201          */
2202         s->objects = (PAGE_SIZE << s->order) / size;
2203
2204         s->freebatch = max(4UL*PAGE_SIZE / size,
2205                                 min(256UL, 64*PAGE_SIZE / size));
2206         if (!s->freebatch)
2207                 s->freebatch = 1;
2208         s->hiwater = s->freebatch << 2;
2209
2210         return !!s->objects;
2211
2212 }
2213
2214 #ifdef CONFIG_SMP
2215 /*
2216  * Per-cpu allocator can't be used because it always uses slab allocator,
2217  * and it can't do per-node allocations.
2218  */
2219 static void *kmem_cache_dyn_array_alloc(int ids)
2220 {
2221         size_t size = sizeof(void *) * ids;
2222
2223         BUG_ON(!size);
2224
2225         if (unlikely(!slab_is_available())) {
2226                 static void *nextmem;
2227                 static size_t nextleft;
2228                 void *ret;
2229
2230                 /*
2231                  * Special case for setting up initial caches. These will
2232                  * never get freed by definition so we can do it rather
2233                  * simply.
2234                  */
2235                 if (size > nextleft) {
2236                         nextmem = alloc_pages_exact(size, GFP_KERNEL);
2237                         if (!nextmem)
2238                                 return NULL;
2239                         nextleft = roundup(size, PAGE_SIZE);
2240                 }
2241
2242                 ret = nextmem;
2243                 nextleft -= size;
2244                 nextmem += size;
2245                 memset(ret, 0, size);
2246                 return ret;
2247         } else {
2248                 return kzalloc(size, GFP_KERNEL);
2249         }
2250 }
2251
2252 static void kmem_cache_dyn_array_free(void *array)
2253 {
2254         if (unlikely(!slab_is_available()))
2255                 return; /* error case without crashing here (will panic soon) */
2256         kfree(array);
2257 }
2258 #endif
2259
2260 /*
2261  * Except in early boot, this should be called with slqb_lock held for write
2262  * to lock out hotplug, and protect list modifications.
2263  */
2264 static int kmem_cache_open(struct kmem_cache *s,
2265                         const char *name, size_t size, size_t align,
2266                         unsigned long flags, void (*ctor)(void *), int alloc)
2267 {
2268         unsigned int left_over;
2269
2270         memset(s, 0, sizeof(struct kmem_cache));
2271         s->name = name;
2272         s->ctor = ctor;
2273         s->objsize = size;
2274         s->align = align;
2275         s->flags = kmem_cache_flags(size, flags, name, ctor);
2276
2277         if (!calculate_sizes(s))
2278                 goto error;
2279
2280         if (!slab_debug(s)) {
2281                 left_over = (PAGE_SIZE << s->order) - (s->objects * s->size);
2282                 s->colour_off = max(cache_line_size(), s->align);
2283                 s->colour_range = left_over;
2284         } else {
2285                 s->colour_off = 0;
2286                 s->colour_range = 0;
2287         }
2288
2289 #ifdef CONFIG_SMP
2290         s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids);
2291         if (!s->cpu_slab)
2292                 goto error;
2293 # ifdef CONFIG_NUMA
2294         s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids);
2295         if (!s->node_slab)
2296                 goto error_cpu_array;
2297 # endif
2298 #endif
2299
2300         if (likely(alloc)) {
2301                 if (!alloc_kmem_cache_nodes(s))
2302                         goto error_node_array;
2303
2304                 if (!alloc_kmem_cache_cpus(s))
2305                         goto error_nodes;
2306         }
2307
2308         sysfs_slab_add(s);
2309         list_add(&s->list, &slab_caches);
2310
2311         return 1;
2312
2313 error_nodes:
2314         free_kmem_cache_nodes(s);
2315 error_node_array:
2316 #if defined(CONFIG_NUMA) && defined(CONFIG_SMP)
2317         kmem_cache_dyn_array_free(s->node_slab);
2318 error_cpu_array:
2319 #endif
2320 #ifdef CONFIG_SMP
2321         kmem_cache_dyn_array_free(s->cpu_slab);
2322 #endif
2323 error:
2324         if (flags & SLAB_PANIC)
2325                 panic("%s: failed to create slab `%s'\n", __func__, name);
2326         return 0;
2327 }
2328
2329 /**
2330  * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
2331  * @s: the cache we're checking against
2332  * @ptr: pointer to validate
2333  *
2334  * This verifies that the untrusted pointer looks sane;
2335  * it is _not_ a guarantee that the pointer is actually
2336  * part of the slab cache in question, but it at least
2337  * validates that the pointer can be dereferenced and
2338  * looks half-way sane.
2339  *
2340  * Currently only used for dentry validation.
2341  */
2342 int kmem_ptr_validate(struct kmem_cache *s, const void *ptr)
2343 {
2344         unsigned long addr = (unsigned long)ptr;
2345         struct slqb_page *page;
2346
2347         if (unlikely(addr < PAGE_OFFSET))
2348                 goto out;
2349         if (unlikely(addr > (unsigned long)high_memory - s->size))
2350                 goto out;
2351         if (unlikely(!IS_ALIGNED(addr, s->align)))
2352                 goto out;
2353         if (unlikely(!kern_addr_valid(addr)))
2354                 goto out;
2355         if (unlikely(!kern_addr_valid(addr + s->size - 1)))
2356                 goto out;
2357         if (unlikely(!pfn_valid(addr >> PAGE_SHIFT)))
2358                 goto out;
2359         page = virt_to_head_slqb_page(ptr);
2360         if (unlikely(!(page->flags & PG_SLQB_BIT)))
2361                 goto out;
2362         if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */
2363                 goto out;
2364         return 1;
2365 out:
2366         return 0;
2367 }
2368 EXPORT_SYMBOL(kmem_ptr_validate);
2369
2370 /*
2371  * Determine the size of a slab object
2372  */
2373 unsigned int kmem_cache_size(struct kmem_cache *s)
2374 {
2375         return s->objsize;
2376 }
2377 EXPORT_SYMBOL(kmem_cache_size);
2378
2379 const char *kmem_cache_name(struct kmem_cache *s)
2380 {
2381         return s->name;
2382 }
2383 EXPORT_SYMBOL(kmem_cache_name);
2384
2385 /*
2386  * Release all resources used by a slab cache. No more concurrency on the
2387  * slab, so we can touch remote kmem_cache_cpu structures.
2388  */
2389 void kmem_cache_destroy(struct kmem_cache *s)
2390 {
2391 #ifdef CONFIG_NUMA
2392         int node;
2393 #endif
2394         int cpu;
2395
2396         down_write(&slqb_lock);
2397         list_del(&s->list);
2398
2399         local_irq_disable();
2400 #ifdef CONFIG_SMP
2401         for_each_online_cpu(cpu) {
2402                 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2403                 struct kmem_cache_list *l = &c->list;
2404
2405                 flush_free_list_all(s, l);
2406                 flush_remote_free_cache(s, c);
2407         }
2408 #endif
2409
2410         for_each_online_cpu(cpu) {
2411                 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2412                 struct kmem_cache_list *l = &c->list;
2413
2414                 claim_remote_free_list(s, l);
2415                 flush_free_list_all(s, l);
2416
2417                 WARN_ON(l->freelist.nr);
2418                 WARN_ON(l->nr_slabs);
2419                 WARN_ON(l->nr_partial);
2420         }
2421
2422         free_kmem_cache_cpus(s);
2423
2424 #ifdef CONFIG_NUMA
2425         for_each_node_state(node, N_NORMAL_MEMORY) {
2426                 struct kmem_cache_node *n;
2427                 struct kmem_cache_list *l;
2428
2429                 n = s->node_slab[node];
2430                 if (!n)
2431                         continue;
2432                 l = &n->list;
2433
2434                 claim_remote_free_list(s, l);
2435                 flush_free_list_all(s, l);
2436
2437                 WARN_ON(l->freelist.nr);
2438                 WARN_ON(l->nr_slabs);
2439                 WARN_ON(l->nr_partial);
2440         }
2441
2442         free_kmem_cache_nodes(s);
2443 #endif
2444         local_irq_enable();
2445
2446         sysfs_slab_remove(s);
2447         up_write(&slqb_lock);
2448 }
2449 EXPORT_SYMBOL(kmem_cache_destroy);
2450
2451 /********************************************************************
2452  *              Kmalloc subsystem
2453  *******************************************************************/
2454
2455 struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2456 EXPORT_SYMBOL(kmalloc_caches);
2457
2458 #ifdef CONFIG_ZONE_DMA
2459 struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2460 EXPORT_SYMBOL(kmalloc_caches_dma);
2461 #endif
2462
2463 #ifndef ARCH_KMALLOC_FLAGS
2464 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
2465 #endif
2466
2467 static struct kmem_cache *open_kmalloc_cache(struct kmem_cache *s,
2468                                 const char *name, int size, gfp_t gfp_flags)
2469 {
2470         unsigned int flags = ARCH_KMALLOC_FLAGS | SLAB_PANIC;
2471
2472         if (gfp_flags & SLQB_DMA)
2473                 flags |= SLAB_CACHE_DMA;
2474
2475         kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL, 1);
2476
2477         return s;
2478 }
2479
2480 /*
2481  * Conversion table for small slabs sizes / 8 to the index in the
2482  * kmalloc array. This is necessary for slabs < 192 since we have non power
2483  * of two cache sizes there. The size of larger slabs can be determined using
2484  * fls.
2485  */
2486 static s8 size_index[24] __cacheline_aligned = {
2487         3,      /* 8 */
2488         4,      /* 16 */
2489         5,      /* 24 */
2490         5,      /* 32 */
2491         6,      /* 40 */
2492         6,      /* 48 */
2493         6,      /* 56 */
2494         6,      /* 64 */
2495 #if L1_CACHE_BYTES < 64
2496         1,      /* 72 */
2497         1,      /* 80 */
2498         1,      /* 88 */
2499         1,      /* 96 */
2500 #else
2501         7,
2502         7,
2503         7,
2504         7,
2505 #endif
2506         7,      /* 104 */
2507         7,      /* 112 */
2508         7,      /* 120 */
2509         7,      /* 128 */
2510 #if L1_CACHE_BYTES < 128
2511         2,      /* 136 */
2512         2,      /* 144 */
2513         2,      /* 152 */
2514         2,      /* 160 */
2515         2,      /* 168 */
2516         2,      /* 176 */
2517         2,      /* 184 */
2518         2       /* 192 */
2519 #else
2520         -1,
2521         -1,
2522         -1,
2523         -1,
2524         -1,
2525         -1,
2526         -1,
2527         -1
2528 #endif
2529 };
2530
2531 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2532 {
2533         int index;
2534
2535         if (unlikely(size <= KMALLOC_MIN_SIZE)) {
2536                 if (unlikely(!size))
2537                         return ZERO_SIZE_PTR;
2538
2539                 index = KMALLOC_SHIFT_LOW;
2540                 goto got_index;
2541         }
2542
2543 #if L1_CACHE_BYTES >= 128
2544         if (size <= 128) {
2545 #else
2546         if (size <= 192) {
2547 #endif
2548                 index = size_index[(size - 1) / 8];
2549         } else {
2550                 if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
2551                         return NULL;
2552
2553                 index = fls(size - 1);
2554         }
2555
2556 got_index:
2557         if (unlikely((flags & SLQB_DMA)))
2558                 return &kmalloc_caches_dma[index];
2559         else
2560                 return &kmalloc_caches[index];
2561 }
2562
2563 void *__kmalloc(size_t size, gfp_t flags)
2564 {
2565         struct kmem_cache *s;
2566
2567         s = get_slab(size, flags);
2568         if (unlikely(ZERO_OR_NULL_PTR(s)))
2569                 return s;
2570
2571         return __kmem_cache_alloc(s, flags, _RET_IP_);
2572 }
2573 EXPORT_SYMBOL(__kmalloc);
2574
2575 #ifdef CONFIG_NUMA
2576 void *__kmalloc_node(size_t size, gfp_t flags, int node)
2577 {
2578         struct kmem_cache *s;
2579
2580         s = get_slab(size, flags);
2581         if (unlikely(ZERO_OR_NULL_PTR(s)))
2582                 return s;
2583
2584         return kmem_cache_alloc_node(s, flags, node);
2585 }
2586 EXPORT_SYMBOL(__kmalloc_node);
2587 #endif
2588
2589 size_t ksize(const void *object)
2590 {
2591         struct slqb_page *page;
2592         struct kmem_cache *s;
2593
2594         BUG_ON(!object);
2595         if (unlikely(object == ZERO_SIZE_PTR))
2596                 return 0;
2597
2598         page = virt_to_head_slqb_page(object);
2599         BUG_ON(!(page->flags & PG_SLQB_BIT));
2600
2601         s = page->list->cache;
2602
2603         /*
2604          * Debugging requires use of the padding between object
2605          * and whatever may come after it.
2606          */
2607         if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2608                 return s->objsize;
2609
2610         /*
2611          * If we have the need to store the freelist pointer
2612          * back there or track user information then we can
2613          * only use the space before that information.
2614          */
2615         if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2616                 return s->inuse;
2617
2618         /*
2619          * Else we can use all the padding etc for the allocation
2620          */
2621         return s->size;
2622 }
2623 EXPORT_SYMBOL(ksize);
2624
2625 void kfree(const void *object)
2626 {
2627         struct kmem_cache *s;
2628         struct slqb_page *page;
2629
2630         if (unlikely(ZERO_OR_NULL_PTR(object)))
2631                 return;
2632
2633         page = virt_to_head_slqb_page(object);
2634         s = page->list->cache;
2635
2636         slab_free(s, page, (void *)object);
2637 }
2638 EXPORT_SYMBOL(kfree);
2639
2640 static void kmem_cache_trim_percpu(void *arg)
2641 {
2642         int cpu = smp_processor_id();
2643         struct kmem_cache *s = arg;
2644         struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2645         struct kmem_cache_list *l = &c->list;
2646
2647         claim_remote_free_list(s, l);
2648         flush_free_list(s, l);
2649 #ifdef CONFIG_SMP
2650         flush_remote_free_cache(s, c);
2651 #endif
2652 }
2653
2654 int kmem_cache_shrink(struct kmem_cache *s)
2655 {
2656 #ifdef CONFIG_NUMA
2657         int node;
2658 #endif
2659
2660         on_each_cpu(kmem_cache_trim_percpu, s, 1);
2661
2662 #ifdef CONFIG_NUMA
2663         for_each_node_state(node, N_NORMAL_MEMORY) {
2664                 struct kmem_cache_node *n;
2665                 struct kmem_cache_list *l;
2666
2667                 n = s->node_slab[node];
2668                 if (!n)
2669                         continue;
2670                 l = &n->list;
2671
2672                 spin_lock_irq(&n->list_lock);
2673                 claim_remote_free_list(s, l);
2674                 flush_free_list(s, l);
2675                 spin_unlock_irq(&n->list_lock);
2676         }
2677 #endif
2678
2679         return 0;
2680 }
2681 EXPORT_SYMBOL(kmem_cache_shrink);
2682
2683 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2684 static void kmem_cache_reap_percpu(void *arg)
2685 {
2686         int cpu = smp_processor_id();
2687         struct kmem_cache *s;
2688         long phase = (long)arg;
2689
2690         list_for_each_entry(s, &slab_caches, list) {
2691                 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2692                 struct kmem_cache_list *l = &c->list;
2693
2694                 if (phase == 0) {
2695                         flush_free_list_all(s, l);
2696                         flush_remote_free_cache(s, c);
2697                 }
2698
2699                 if (phase == 1) {
2700                         claim_remote_free_list(s, l);
2701                         flush_free_list_all(s, l);
2702                 }
2703         }
2704 }
2705
2706 static void kmem_cache_reap(void)
2707 {
2708         struct kmem_cache *s;
2709         int node;
2710
2711         down_read(&slqb_lock);
2712         on_each_cpu(kmem_cache_reap_percpu, (void *)0, 1);
2713         on_each_cpu(kmem_cache_reap_percpu, (void *)1, 1);
2714
2715         list_for_each_entry(s, &slab_caches, list) {
2716                 for_each_node_state(node, N_NORMAL_MEMORY) {
2717                         struct kmem_cache_node *n;
2718                         struct kmem_cache_list *l;
2719
2720                         n = s->node_slab[node];
2721                         if (!n)
2722                                 continue;
2723                         l = &n->list;
2724
2725                         spin_lock_irq(&n->list_lock);
2726                         claim_remote_free_list(s, l);
2727                         flush_free_list_all(s, l);
2728                         spin_unlock_irq(&n->list_lock);
2729                 }
2730         }
2731         up_read(&slqb_lock);
2732 }
2733 #endif
2734
2735 static void cache_trim_worker(struct work_struct *w)
2736 {
2737         struct delayed_work *work =
2738                 container_of(w, struct delayed_work, work);
2739         struct kmem_cache *s;
2740
2741         if (!down_read_trylock(&slqb_lock))
2742                 goto out;
2743
2744         list_for_each_entry(s, &slab_caches, list) {
2745 #ifdef CONFIG_NUMA
2746                 int node = numa_node_id();
2747                 struct kmem_cache_node *n = s->node_slab[node];
2748
2749                 if (n) {
2750                         struct kmem_cache_list *l = &n->list;
2751
2752                         spin_lock_irq(&n->list_lock);
2753                         claim_remote_free_list(s, l);
2754                         flush_free_list(s, l);
2755                         spin_unlock_irq(&n->list_lock);
2756                 }
2757 #endif
2758
2759                 local_irq_disable();
2760                 kmem_cache_trim_percpu(s);
2761                 local_irq_enable();
2762         }
2763
2764         up_read(&slqb_lock);
2765 out:
2766         schedule_delayed_work(work, round_jiffies_relative(3*HZ));
2767 }
2768
2769 static DEFINE_PER_CPU(struct delayed_work, cache_trim_work);
2770
2771 static void __cpuinit start_cpu_timer(int cpu)
2772 {
2773         struct delayed_work *cache_trim_work = &per_cpu(cache_trim_work, cpu);
2774
2775         /*
2776          * When this gets called from do_initcalls via cpucache_init(),
2777          * init_workqueues() has already run, so keventd will be setup
2778          * at that time.
2779          */
2780         if (keventd_up() && cache_trim_work->work.func == NULL) {
2781                 INIT_DELAYED_WORK(cache_trim_work, cache_trim_worker);
2782                 schedule_delayed_work_on(cpu, cache_trim_work,
2783                                         __round_jiffies_relative(HZ, cpu));
2784         }
2785 }
2786
2787 static int __init cpucache_init(void)
2788 {
2789         int cpu;
2790
2791         for_each_online_cpu(cpu)
2792                 start_cpu_timer(cpu);
2793
2794         return 0;
2795 }
2796 device_initcall(cpucache_init);
2797
2798 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2799 static void slab_mem_going_offline_callback(void *arg)
2800 {
2801         kmem_cache_reap();
2802 }
2803
2804 static void slab_mem_offline_callback(void *arg)
2805 {
2806         /* XXX: should release structures, see CPU offline comment */
2807 }
2808
2809 static int slab_mem_going_online_callback(void *arg)
2810 {
2811         struct kmem_cache *s;
2812         struct kmem_cache_node *n;
2813         struct memory_notify *marg = arg;
2814         int nid = marg->status_change_nid;
2815         int ret = 0;
2816
2817         /*
2818          * If the node's memory is already available, then kmem_cache_node is
2819          * already created. Nothing to do.
2820          */
2821         if (nid < 0)
2822                 return 0;
2823
2824         /*
2825          * We are bringing a node online. No memory is availabe yet. We must
2826          * allocate a kmem_cache_node structure in order to bring the node
2827          * online.
2828          */
2829         down_write(&slqb_lock);
2830         list_for_each_entry(s, &slab_caches, list) {
2831                 /*
2832                  * XXX: kmem_cache_alloc_node will fallback to other nodes
2833                  *      since memory is not yet available from the node that
2834                  *      is brought up.
2835                  */
2836                 if (s->node_slab[nid]) /* could be lefover from last online */
2837                         continue;
2838                 n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL);
2839                 if (!n) {
2840                         ret = -ENOMEM;
2841                         goto out;
2842                 }
2843                 init_kmem_cache_node(s, n);
2844                 s->node_slab[nid] = n;
2845         }
2846 out:
2847         up_write(&slqb_lock);
2848         return ret;
2849 }
2850
2851 static int slab_memory_callback(struct notifier_block *self,
2852                                 unsigned long action, void *arg)
2853 {
2854         int ret = 0;
2855
2856         switch (action) {
2857         case MEM_GOING_ONLINE:
2858                 ret = slab_mem_going_online_callback(arg);
2859                 break;
2860         case MEM_GOING_OFFLINE:
2861                 slab_mem_going_offline_callback(arg);
2862                 break;
2863         case MEM_OFFLINE:
2864         case MEM_CANCEL_ONLINE:
2865                 slab_mem_offline_callback(arg);
2866                 break;
2867         case MEM_ONLINE:
2868         case MEM_CANCEL_OFFLINE:
2869                 break;
2870         }
2871
2872         if (ret)
2873                 ret = notifier_from_errno(ret);
2874         else
2875                 ret = NOTIFY_OK;
2876         return ret;
2877 }
2878
2879 #endif /* CONFIG_MEMORY_HOTPLUG */
2880
2881 /********************************************************************
2882  *                      Basic setup of slabs
2883  *******************************************************************/
2884
2885 void __init kmem_cache_init(void)
2886 {
2887         int i;
2888         unsigned int flags = SLAB_HWCACHE_ALIGN|SLAB_PANIC;
2889
2890         /*
2891          * All the ifdefs are rather ugly here, but it's just the setup code,
2892          * so it doesn't have to be too readable :)
2893          */
2894
2895         /*
2896          * No need to take slqb_lock here: there should be no concurrency
2897          * anyway, and spin_unlock_irq in rwsem code could enable interrupts
2898          * too early.
2899          */
2900         kmem_cache_open(&kmem_cache_cache, "kmem_cache",
2901                         sizeof(struct kmem_cache), 0, flags, NULL, 0);
2902 #ifdef CONFIG_SMP
2903         kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu",
2904                         sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0);
2905 #endif
2906 #ifdef CONFIG_NUMA
2907         kmem_cache_open(&kmem_node_cache, "kmem_cache_node",
2908                         sizeof(struct kmem_cache_node), 0, flags, NULL, 0);
2909 #endif
2910
2911 #ifdef CONFIG_SMP
2912         for_each_possible_cpu(i) {
2913                 struct kmem_cache_cpu *c;
2914
2915                 c = &per_cpu(kmem_cache_cpus, i);
2916                 init_kmem_cache_cpu(&kmem_cache_cache, c);
2917                 kmem_cache_cache.cpu_slab[i] = c;
2918
2919                 c = &per_cpu(kmem_cpu_cpus, i);
2920                 init_kmem_cache_cpu(&kmem_cpu_cache, c);
2921                 kmem_cpu_cache.cpu_slab[i] = c;
2922
2923 #ifdef CONFIG_NUMA
2924                 c = &per_cpu(kmem_node_cpus, i);
2925                 init_kmem_cache_cpu(&kmem_node_cache, c);
2926                 kmem_node_cache.cpu_slab[i] = c;
2927 #endif
2928         }
2929 #else
2930         init_kmem_cache_cpu(&kmem_cache_cache, &kmem_cache_cache.cpu_slab);
2931 #endif
2932
2933 #ifdef CONFIG_NUMA
2934         for_each_node_state(i, N_NORMAL_MEMORY) {
2935                 struct kmem_cache_node *n;
2936
2937                 n = &kmem_cache_nodes[i];
2938                 init_kmem_cache_node(&kmem_cache_cache, n);
2939                 kmem_cache_cache.node_slab[i] = n;
2940 #ifdef CONFIG_SMP
2941                 n = &kmem_cpu_nodes[i];
2942                 init_kmem_cache_node(&kmem_cpu_cache, n);
2943                 kmem_cpu_cache.node_slab[i] = n;
2944 #endif
2945                 n = &kmem_node_nodes[i];
2946                 init_kmem_cache_node(&kmem_node_cache, n);
2947                 kmem_node_cache.node_slab[i] = n;
2948         }
2949 #endif
2950
2951         /* Caches that are not of the two-to-the-power-of size */
2952         if (L1_CACHE_BYTES < 64 && KMALLOC_MIN_SIZE <= 64) {
2953                 open_kmalloc_cache(&kmalloc_caches[1],
2954                                 "kmalloc-96", 96, GFP_KERNEL);
2955 #ifdef CONFIG_ZONE_DMA
2956                 open_kmalloc_cache(&kmalloc_caches_dma[1],
2957                                 "kmalloc_dma-96", 96, GFP_KERNEL|SLQB_DMA);
2958 #endif
2959         }
2960         if (L1_CACHE_BYTES < 128 && KMALLOC_MIN_SIZE <= 128) {
2961                 open_kmalloc_cache(&kmalloc_caches[2],
2962                                 "kmalloc-192", 192, GFP_KERNEL);
2963 #ifdef CONFIG_ZONE_DMA
2964                 open_kmalloc_cache(&kmalloc_caches_dma[2],
2965                                 "kmalloc_dma-192", 192, GFP_KERNEL|SLQB_DMA);
2966 #endif
2967         }
2968
2969         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
2970                 open_kmalloc_cache(&kmalloc_caches[i],
2971                                 "kmalloc", 1 << i, GFP_KERNEL);
2972 #ifdef CONFIG_ZONE_DMA
2973                 open_kmalloc_cache(&kmalloc_caches_dma[i],
2974                                 "kmalloc_dma", 1 << i, GFP_KERNEL|SLQB_DMA);
2975 #endif
2976         }
2977
2978         /*
2979          * Patch up the size_index table if we have strange large alignment
2980          * requirements for the kmalloc array. This is only the case for
2981          * mips it seems. The standard arches will not generate any code here.
2982          *
2983          * Largest permitted alignment is 256 bytes due to the way we
2984          * handle the index determination for the smaller caches.
2985          *
2986          * Make sure that nothing crazy happens if someone starts tinkering
2987          * around with ARCH_KMALLOC_MINALIGN
2988          */
2989         BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2990                 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2991
2992         for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2993                 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2994
2995         /* Provide the correct kmalloc names now that the caches are up */
2996         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
2997                 kmalloc_caches[i].name =
2998                         kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2999 #ifdef CONFIG_ZONE_DMA
3000                 kmalloc_caches_dma[i].name =
3001                         kasprintf(GFP_KERNEL, "kmalloc_dma-%d", 1 << i);
3002 #endif
3003         }
3004
3005 #ifdef CONFIG_SMP
3006         register_cpu_notifier(&slab_notifier);
3007 #endif
3008 #ifdef CONFIG_NUMA
3009         hotplug_memory_notifier(slab_memory_callback, 1);
3010 #endif
3011         /*
3012          * smp_init() has not yet been called, so no worries about memory
3013          * ordering with __slab_is_available.
3014          */
3015         __slab_is_available = 1;
3016 }
3017
3018 void __init kmem_cache_init_late(void)
3019 {
3020 }
3021
3022 /*
3023  * Some basic slab creation sanity checks
3024  */
3025 static int kmem_cache_create_ok(const char *name, size_t size,
3026                 size_t align, unsigned long flags)
3027 {
3028         struct kmem_cache *tmp;
3029
3030         /*
3031          * Sanity checks... these are all serious usage bugs.
3032          */
3033         if (!name || in_interrupt() || (size < sizeof(void *))) {
3034                 printk(KERN_ERR "kmem_cache_create(): early error in slab %s\n",
3035                                 name);
3036                 dump_stack();
3037
3038                 return 0;
3039         }
3040
3041         list_for_each_entry(tmp, &slab_caches, list) {
3042                 char x;
3043                 int res;
3044
3045                 /*
3046                  * This happens when the module gets unloaded and doesn't
3047                  * destroy its slab cache and no-one else reuses the vmalloc
3048                  * area of the module.  Print a warning.
3049                  */
3050                 res = probe_kernel_address(tmp->name, x);
3051                 if (res) {
3052                         printk(KERN_ERR
3053                                "SLAB: cache with size %d has lost its name\n",
3054                                tmp->size);
3055                         continue;
3056                 }
3057
3058                 if (!strcmp(tmp->name, name)) {
3059                         printk(KERN_ERR
3060                                "SLAB: duplicate cache %s\n", name);
3061                         dump_stack();
3062
3063                         return 0;
3064                 }
3065         }
3066
3067         WARN_ON(strchr(name, ' '));     /* It confuses parsers */
3068         if (flags & SLAB_DESTROY_BY_RCU)
3069                 WARN_ON(flags & SLAB_POISON);
3070
3071         return 1;
3072 }
3073
3074 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3075                 size_t align, unsigned long flags, void (*ctor)(void *))
3076 {
3077         struct kmem_cache *s;
3078
3079         down_write(&slqb_lock);
3080         if (!kmem_cache_create_ok(name, size, align, flags))
3081                 goto err;
3082
3083         s = kmem_cache_alloc(&kmem_cache_cache, GFP_KERNEL);
3084         if (!s)
3085                 goto err;
3086
3087         if (kmem_cache_open(s, name, size, align, flags, ctor, 1)) {
3088                 up_write(&slqb_lock);
3089                 return s;
3090         }
3091
3092         kmem_cache_free(&kmem_cache_cache, s);
3093
3094 err:
3095         up_write(&slqb_lock);
3096         if (flags & SLAB_PANIC)
3097                 panic("%s: failed to create slab `%s'\n", __func__, name);
3098
3099         return NULL;
3100 }
3101 EXPORT_SYMBOL(kmem_cache_create);
3102
3103 #ifdef CONFIG_SMP
3104 /*
3105  * Use the cpu notifier to insure that the cpu slabs are flushed when
3106  * necessary.
3107  */
3108 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3109                                 unsigned long action, void *hcpu)
3110 {
3111         long cpu = (long)hcpu;
3112         struct kmem_cache *s;
3113
3114         switch (action) {
3115         case CPU_UP_PREPARE:
3116         case CPU_UP_PREPARE_FROZEN:
3117                 down_write(&slqb_lock);
3118                 list_for_each_entry(s, &slab_caches, list) {
3119                         if (s->cpu_slab[cpu]) /* could be lefover last online */
3120                                 continue;
3121                         s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu);
3122                         if (!s->cpu_slab[cpu]) {
3123                                 up_read(&slqb_lock);
3124                                 return NOTIFY_BAD;
3125                         }
3126                 }
3127                 up_write(&slqb_lock);
3128                 break;
3129
3130         case CPU_ONLINE:
3131         case CPU_ONLINE_FROZEN:
3132         case CPU_DOWN_FAILED:
3133         case CPU_DOWN_FAILED_FROZEN:
3134                 start_cpu_timer(cpu);
3135                 break;
3136
3137         case CPU_DOWN_PREPARE:
3138         case CPU_DOWN_PREPARE_FROZEN:
3139                 cancel_rearming_delayed_work(&per_cpu(cache_trim_work, cpu));
3140                 per_cpu(cache_trim_work, cpu).work.func = NULL;
3141                 break;
3142
3143         case CPU_UP_CANCELED:
3144         case CPU_UP_CANCELED_FROZEN:
3145         case CPU_DEAD:
3146         case CPU_DEAD_FROZEN:
3147                 /*
3148                  * XXX: Freeing here doesn't work because objects can still be
3149                  * on this CPU's list. periodic timer needs to check if a CPU
3150                  * is offline and then try to cleanup from there. Same for node
3151                  * offline.
3152                  */
3153         default:
3154                 break;
3155         }
3156         return NOTIFY_OK;
3157 }
3158
3159 static struct notifier_block __cpuinitdata slab_notifier = {
3160         .notifier_call = slab_cpuup_callback
3161 };
3162
3163 #endif
3164
3165 #ifdef CONFIG_SLQB_DEBUG
3166 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3167 {
3168         struct kmem_cache *s;
3169         int node = -1;
3170
3171         s = get_slab(size, flags);
3172         if (unlikely(ZERO_OR_NULL_PTR(s)))
3173                 return s;
3174
3175 #ifdef CONFIG_NUMA
3176         if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3177                 node = alternate_nid(s, flags, node);
3178 #endif
3179         return slab_alloc(s, flags, node, caller);
3180 }
3181
3182 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
3183                                 unsigned long caller)
3184 {
3185         struct kmem_cache *s;
3186
3187         s = get_slab(size, flags);
3188         if (unlikely(ZERO_OR_NULL_PTR(s)))
3189                 return s;
3190
3191         return slab_alloc(s, flags, node, caller);
3192 }
3193 #endif
3194
3195 #if defined(CONFIG_SLQB_SYSFS) || defined(CONFIG_SLABINFO)
3196 struct stats_gather {
3197         struct kmem_cache *s;
3198         spinlock_t lock;
3199         unsigned long nr_slabs;
3200         unsigned long nr_partial;
3201         unsigned long nr_inuse;
3202         unsigned long nr_objects;
3203
3204 #ifdef CONFIG_SLQB_STATS
3205         unsigned long stats[NR_SLQB_STAT_ITEMS];
3206 #endif
3207 };
3208
3209 static void __gather_stats(void *arg)
3210 {
3211         unsigned long nr_slabs;
3212         unsigned long nr_partial;
3213         unsigned long nr_inuse;
3214         struct stats_gather *gather = arg;
3215         int cpu = smp_processor_id();
3216         struct kmem_cache *s = gather->s;
3217         struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3218         struct kmem_cache_list *l = &c->list;
3219         struct slqb_page *page;
3220 #ifdef CONFIG_SLQB_STATS
3221         int i;
3222 #endif
3223
3224         spin_lock(&l->page_lock);
3225         nr_slabs = l->nr_slabs;
3226         nr_partial = l->nr_partial;
3227         nr_inuse = (nr_slabs - nr_partial) * s->objects;
3228
3229         list_for_each_entry(page, &l->partial, lru) {
3230                 nr_inuse += page->inuse;
3231         }
3232         spin_unlock(&l->page_lock);
3233
3234         spin_lock(&gather->lock);
3235         gather->nr_slabs += nr_slabs;
3236         gather->nr_partial += nr_partial;
3237         gather->nr_inuse += nr_inuse;
3238 #ifdef CONFIG_SLQB_STATS
3239         for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3240                 gather->stats[i] += l->stats[i];
3241 #endif
3242         spin_unlock(&gather->lock);
3243 }
3244
3245 /* must be called with slqb_lock held */
3246 static void gather_stats_locked(struct kmem_cache *s,
3247                                 struct stats_gather *stats)
3248 {
3249 #ifdef CONFIG_NUMA
3250         int node;
3251 #endif
3252
3253         memset(stats, 0, sizeof(struct stats_gather));
3254         stats->s = s;
3255         spin_lock_init(&stats->lock);
3256
3257         on_each_cpu(__gather_stats, stats, 1);
3258
3259 #ifdef CONFIG_NUMA
3260         for_each_online_node(node) {
3261                 struct kmem_cache_node *n = s->node_slab[node];
3262                 struct kmem_cache_list *l = &n->list;
3263                 struct slqb_page *page;
3264                 unsigned long flags;
3265 #ifdef CONFIG_SLQB_STATS
3266                 int i;
3267 #endif
3268
3269                 spin_lock_irqsave(&n->list_lock, flags);
3270 #ifdef CONFIG_SLQB_STATS
3271                 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3272                         stats->stats[i] += l->stats[i];
3273 #endif
3274                 stats->nr_slabs += l->nr_slabs;
3275                 stats->nr_partial += l->nr_partial;
3276                 stats->nr_inuse += (l->nr_slabs - l->nr_partial) * s->objects;
3277
3278                 list_for_each_entry(page, &l->partial, lru) {
3279                         stats->nr_inuse += page->inuse;
3280                 }
3281                 spin_unlock_irqrestore(&n->list_lock, flags);
3282         }
3283 #endif
3284
3285         stats->nr_objects = stats->nr_slabs * s->objects;
3286 }
3287
3288 #ifdef CONFIG_SLQB_SYSFS
3289 static void gather_stats(struct kmem_cache *s, struct stats_gather *stats)
3290 {
3291         down_read(&slqb_lock); /* hold off hotplug */
3292         gather_stats_locked(s, stats);
3293         up_read(&slqb_lock);
3294 }
3295 #endif
3296 #endif
3297
3298 /*
3299  * The /proc/slabinfo ABI
3300  */
3301 #ifdef CONFIG_SLABINFO
3302 #include <linux/proc_fs.h>
3303 #include <linux/seq_file.h>
3304 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3305                        size_t count, loff_t *ppos)
3306 {
3307         return -EINVAL;
3308 }
3309
3310 static void print_slabinfo_header(struct seq_file *m)
3311 {
3312         seq_puts(m, "slabinfo - version: 2.1\n");
3313         seq_puts(m, "# name         <active_objs> <num_objs> <objsize> "
3314                  "<objperslab> <pagesperslab>");
3315         seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3316         seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3317         seq_putc(m, '\n');
3318 }
3319
3320 static void *s_start(struct seq_file *m, loff_t *pos)
3321 {
3322         loff_t n = *pos;
3323
3324         down_read(&slqb_lock);
3325         if (!n)
3326                 print_slabinfo_header(m);
3327
3328         return seq_list_start(&slab_caches, *pos);
3329 }
3330
3331 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3332 {
3333         return seq_list_next(p, &slab_caches, pos);
3334 }
3335
3336 static void s_stop(struct seq_file *m, void *p)
3337 {
3338         up_read(&slqb_lock);
3339 }
3340
3341 static int s_show(struct seq_file *m, void *p)
3342 {
3343         struct stats_gather stats;
3344         struct kmem_cache *s;
3345
3346         s = list_entry(p, struct kmem_cache, list);
3347
3348         gather_stats_locked(s, &stats);
3349
3350         seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, stats.nr_inuse,
3351                         stats.nr_objects, s->size, s->objects, (1 << s->order));
3352         seq_printf(m, " : tunables %4u %4u %4u", slab_hiwater(s),
3353                         slab_freebatch(s), 0);
3354         seq_printf(m, " : slabdata %6lu %6lu %6lu", stats.nr_slabs,
3355                         stats.nr_slabs, 0UL);
3356         seq_putc(m, '\n');
3357         return 0;
3358 }
3359
3360 static const struct seq_operations slabinfo_op = {
3361         .start = s_start,
3362         .next = s_next,
3363         .stop = s_stop,
3364         .show = s_show,
3365 };
3366
3367 static int slabinfo_open(struct inode *inode, struct file *file)
3368 {
3369         return seq_open(file, &slabinfo_op);
3370 }
3371
3372 static const struct file_operations proc_slabinfo_operations = {
3373         .open           = slabinfo_open,
3374         .read           = seq_read,
3375         .llseek         = seq_lseek,
3376         .release        = seq_release,
3377 };
3378
3379 static int __init slab_proc_init(void)
3380 {
3381         proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL,
3382                         &proc_slabinfo_operations);
3383         return 0;
3384 }
3385 module_init(slab_proc_init);
3386 #endif /* CONFIG_SLABINFO */
3387
3388 #ifdef CONFIG_SLQB_SYSFS
3389 /*
3390  * sysfs API
3391  */
3392 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3393 #define to_slab(n) container_of(n, struct kmem_cache, kobj);
3394
3395 struct slab_attribute {
3396         struct attribute attr;
3397         ssize_t (*show)(struct kmem_cache *s, char *buf);
3398         ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3399 };
3400
3401 #define SLAB_ATTR_RO(_name) \
3402         static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3403
3404 #define SLAB_ATTR(_name) \
3405         static struct slab_attribute _name##_attr =  \
3406         __ATTR(_name, 0644, _name##_show, _name##_store)
3407
3408 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3409 {
3410         return sprintf(buf, "%d\n", s->size);
3411 }
3412 SLAB_ATTR_RO(slab_size);
3413
3414 static ssize_t align_show(struct kmem_cache *s, char *buf)
3415 {
3416         return sprintf(buf, "%d\n", s->align);
3417 }
3418 SLAB_ATTR_RO(align);
3419
3420 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3421 {
3422         return sprintf(buf, "%d\n", s->objsize);
3423 }
3424 SLAB_ATTR_RO(object_size);
3425
3426 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3427 {
3428         return sprintf(buf, "%d\n", s->objects);
3429 }
3430 SLAB_ATTR_RO(objs_per_slab);
3431
3432 static ssize_t order_show(struct kmem_cache *s, char *buf)
3433 {
3434         return sprintf(buf, "%d\n", s->order);
3435 }
3436 SLAB_ATTR_RO(order);
3437
3438 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3439 {
3440         if (s->ctor) {
3441                 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3442
3443                 return n + sprintf(buf + n, "\n");
3444         }
3445         return 0;
3446 }
3447 SLAB_ATTR_RO(ctor);
3448
3449 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3450 {
3451         struct stats_gather stats;
3452
3453         gather_stats(s, &stats);
3454
3455         return sprintf(buf, "%lu\n", stats.nr_slabs);
3456 }
3457 SLAB_ATTR_RO(slabs);
3458
3459 static ssize_t objects_show(struct kmem_cache *s, char *buf)
3460 {
3461         struct stats_gather stats;
3462
3463         gather_stats(s, &stats);
3464
3465         return sprintf(buf, "%lu\n", stats.nr_inuse);
3466 }
3467 SLAB_ATTR_RO(objects);
3468
3469 static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
3470 {
3471         struct stats_gather stats;
3472
3473         gather_stats(s, &stats);
3474
3475         return sprintf(buf, "%lu\n", stats.nr_objects);
3476 }
3477 SLAB_ATTR_RO(total_objects);
3478
3479 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3480 {
3481         return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3482 }
3483 SLAB_ATTR_RO(reclaim_account);
3484
3485 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3486 {
3487         return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3488 }
3489 SLAB_ATTR_RO(hwcache_align);
3490
3491 #ifdef CONFIG_ZONE_DMA
3492 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3493 {
3494         return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3495 }
3496 SLAB_ATTR_RO(cache_dma);
3497 #endif
3498
3499 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3500 {
3501         return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3502 }
3503 SLAB_ATTR_RO(destroy_by_rcu);
3504
3505 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3506 {
3507         return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3508 }
3509 SLAB_ATTR_RO(red_zone);
3510
3511 static ssize_t poison_show(struct kmem_cache *s, char *buf)
3512 {
3513         return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3514 }
3515 SLAB_ATTR_RO(poison);
3516
3517 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3518 {
3519         return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3520 }
3521 SLAB_ATTR_RO(store_user);
3522
3523 static ssize_t hiwater_store(struct kmem_cache *s,
3524                                 const char *buf, size_t length)
3525 {
3526         long hiwater;
3527         int err;
3528
3529         err = strict_strtol(buf, 10, &hiwater);
3530         if (err)
3531                 return err;
3532
3533         if (hiwater < 0)
3534                 return -EINVAL;
3535
3536         s->hiwater = hiwater;
3537
3538         return length;
3539 }
3540
3541 static ssize_t hiwater_show(struct kmem_cache *s, char *buf)
3542 {
3543         return sprintf(buf, "%d\n", slab_hiwater(s));
3544 }
3545 SLAB_ATTR(hiwater);
3546
3547 static ssize_t freebatch_store(struct kmem_cache *s,
3548                                 const char *buf, size_t length)
3549 {
3550         long freebatch;
3551         int err;
3552
3553         err = strict_strtol(buf, 10, &freebatch);
3554         if (err)
3555                 return err;
3556
3557         if (freebatch <= 0 || freebatch - 1 > s->hiwater)
3558                 return -EINVAL;
3559
3560         s->freebatch = freebatch;
3561
3562         return length;
3563 }
3564
3565 static ssize_t freebatch_show(struct kmem_cache *s, char *buf)
3566 {
3567         return sprintf(buf, "%d\n", slab_freebatch(s));
3568 }
3569 SLAB_ATTR(freebatch);
3570
3571 #ifdef CONFIG_SLQB_STATS
3572 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
3573 {
3574         struct stats_gather stats;
3575         int len;
3576 #ifdef CONFIG_SMP
3577         int cpu;
3578 #endif
3579
3580         gather_stats(s, &stats);
3581
3582         len = sprintf(buf, "%lu", stats.stats[si]);
3583
3584 #ifdef CONFIG_SMP
3585         for_each_online_cpu(cpu) {
3586                 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3587                 struct kmem_cache_list *l = &c->list;
3588
3589                 if (len < PAGE_SIZE - 20)
3590                         len += sprintf(buf+len, " C%d=%lu", cpu, l->stats[si]);
3591         }
3592 #endif
3593         return len + sprintf(buf + len, "\n");
3594 }
3595
3596 #define STAT_ATTR(si, text)                                     \
3597 static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
3598 {                                                               \
3599         return show_stat(s, buf, si);                           \
3600 }                                                               \
3601 SLAB_ATTR_RO(text);                                             \
3602
3603 STAT_ATTR(ALLOC, alloc);
3604 STAT_ATTR(ALLOC_SLAB_FILL, alloc_slab_fill);
3605 STAT_ATTR(ALLOC_SLAB_NEW, alloc_slab_new);
3606 STAT_ATTR(FREE, free);
3607 STAT_ATTR(FREE_REMOTE, free_remote);
3608 STAT_ATTR(FLUSH_FREE_LIST, flush_free_list);
3609 STAT_ATTR(FLUSH_FREE_LIST_OBJECTS, flush_free_list_objects);
3610 STAT_ATTR(FLUSH_FREE_LIST_REMOTE, flush_free_list_remote);
3611 STAT_ATTR(FLUSH_SLAB_PARTIAL, flush_slab_partial);
3612 STAT_ATTR(FLUSH_SLAB_FREE, flush_slab_free);
3613 STAT_ATTR(FLUSH_RFREE_LIST, flush_rfree_list);
3614 STAT_ATTR(FLUSH_RFREE_LIST_OBJECTS, flush_rfree_list_objects);
3615 STAT_ATTR(CLAIM_REMOTE_LIST, claim_remote_list);
3616 STAT_ATTR(CLAIM_REMOTE_LIST_OBJECTS, claim_remote_list_objects);
3617 #endif
3618
3619 static struct attribute *slab_attrs[] = {
3620         &slab_size_attr.attr,
3621         &object_size_attr.attr,
3622         &objs_per_slab_attr.attr,
3623         &order_attr.attr,
3624         &objects_attr.attr,
3625         &total_objects_attr.attr,
3626         &slabs_attr.attr,
3627         &ctor_attr.attr,
3628         &align_attr.attr,
3629         &hwcache_align_attr.attr,
3630         &reclaim_account_attr.attr,
3631         &destroy_by_rcu_attr.attr,
3632         &red_zone_attr.attr,
3633         &poison_attr.attr,
3634         &store_user_attr.attr,
3635         &hiwater_attr.attr,
3636         &freebatch_attr.attr,
3637 #ifdef CONFIG_ZONE_DMA
3638         &cache_dma_attr.attr,
3639 #endif
3640 #ifdef CONFIG_SLQB_STATS
3641         &alloc_attr.attr,
3642         &alloc_slab_fill_attr.attr,
3643         &alloc_slab_new_attr.attr,
3644         &free_attr.attr,
3645         &free_remote_attr.attr,
3646         &flush_free_list_attr.attr,
3647         &flush_free_list_objects_attr.attr,
3648         &flush_free_list_remote_attr.attr,
3649         &flush_slab_partial_attr.attr,
3650         &flush_slab_free_attr.attr,
3651         &flush_rfree_list_attr.attr,
3652         &flush_rfree_list_objects_attr.attr,
3653         &claim_remote_list_attr.attr,
3654         &claim_remote_list_objects_attr.attr,
3655 #endif
3656         NULL
3657 };
3658
3659 static struct attribute_group slab_attr_group = {
3660         .attrs = slab_attrs,
3661 };
3662
3663 static ssize_t slab_attr_show(struct kobject *kobj,
3664                                 struct attribute *attr, char *buf)
3665 {
3666         struct slab_attribute *attribute;
3667         struct kmem_cache *s;
3668         int err;
3669
3670         attribute = to_slab_attr(attr);
3671         s = to_slab(kobj);
3672
3673         if (!attribute->show)
3674                 return -EIO;
3675
3676         err = attribute->show(s, buf);
3677
3678         return err;
3679 }
3680
3681 static ssize_t slab_attr_store(struct kobject *kobj,
3682                         struct attribute *attr, const char *buf, size_t len)
3683 {
3684         struct slab_attribute *attribute;
3685         struct kmem_cache *s;
3686         int err;
3687
3688         attribute = to_slab_attr(attr);
3689         s = to_slab(kobj);
3690
3691         if (!attribute->store)
3692                 return -EIO;
3693
3694         err = attribute->store(s, buf, len);
3695
3696         return err;
3697 }
3698
3699 static void kmem_cache_release(struct kobject *kobj)
3700 {
3701         struct kmem_cache *s = to_slab(kobj);
3702
3703         kmem_cache_free(&kmem_cache_cache, s);
3704 }
3705
3706 static struct sysfs_ops slab_sysfs_ops = {
3707         .show = slab_attr_show,
3708         .store = slab_attr_store,
3709 };
3710
3711 static struct kobj_type slab_ktype = {
3712         .sysfs_ops = &slab_sysfs_ops,
3713         .release = kmem_cache_release
3714 };
3715
3716 static int uevent_filter(struct kset *kset, struct kobject *kobj)
3717 {
3718         struct kobj_type *ktype = get_ktype(kobj);
3719
3720         if (ktype == &slab_ktype)
3721                 return 1;
3722         return 0;
3723 }
3724
3725 static struct kset_uevent_ops slab_uevent_ops = {
3726         .filter = uevent_filter,
3727 };
3728
3729 static struct kset *slab_kset;
3730
3731 static int sysfs_available __read_mostly;
3732
3733 static int sysfs_slab_add(struct kmem_cache *s)
3734 {
3735         int err;
3736
3737         if (!sysfs_available)
3738                 return 0;
3739
3740         s->kobj.kset = slab_kset;
3741         err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, s->name);
3742         if (err) {
3743                 kobject_put(&s->kobj);
3744                 return err;
3745         }
3746
3747         err = sysfs_create_group(&s->kobj, &slab_attr_group);
3748         if (err)
3749                 return err;
3750
3751         kobject_uevent(&s->kobj, KOBJ_ADD);
3752
3753         return 0;
3754 }
3755
3756 static void sysfs_slab_remove(struct kmem_cache *s)
3757 {
3758         kobject_uevent(&s->kobj, KOBJ_REMOVE);
3759         kobject_del(&s->kobj);
3760         kobject_put(&s->kobj);
3761 }
3762
3763 static int __init slab_sysfs_init(void)
3764 {
3765         struct kmem_cache *s;
3766         int err;
3767
3768         slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
3769         if (!slab_kset) {
3770                 printk(KERN_ERR "Cannot register slab subsystem.\n");
3771                 return -ENOSYS;
3772         }
3773
3774         down_write(&slqb_lock);
3775
3776         sysfs_available = 1;
3777
3778         list_for_each_entry(s, &slab_caches, list) {
3779                 err = sysfs_slab_add(s);
3780                 if (err)
3781                         printk(KERN_ERR "SLQB: Unable to add boot slab %s"
3782                                                 " to sysfs\n", s->name);
3783         }
3784
3785         up_write(&slqb_lock);
3786
3787         return 0;
3788 }
3789 device_initcall(slab_sysfs_init);
3790
3791 #endif