mm/page_alloc.c

   1 /*
   2  *  linux/mm/page_alloc.c
   3  *
   4  *  Manages the free list, the system allocates free pages here.
   5  *  Note that kmalloc() lives in slab.c
   6  *
   7  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8  *  Swap reorganised 29.12.95, Stephen Tweedie
   9  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  10  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  11  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  12  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  13  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
  14  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
  15  */
  16
  17 #include <linux/config.h>
  18 #include <linux/stddef.h>
  19 #include <linux/mm.h>
  20 #include <linux/swap.h>
  21 #include <linux/interrupt.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/bootmem.h>
  24 #include <linux/compiler.h>
  25 #include <linux/module.h>
  26 #include <linux/suspend.h>
  27 #include <linux/pagevec.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/slab.h>
  30 #include <linux/notifier.h>
  31 #include <linux/topology.h>
  32 #include <linux/sysctl.h>
  33 #include <linux/cpu.h>
  34
  35 #include <asm/tlbflush.h>
  36
  37 DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
  38 struct pglist_data *pgdat_list;
  39 unsigned long totalram_pages;
  40 unsigned long totalhigh_pages;
  41 long nr_swap_pages;
  42 int numnodes = 1;
  43 int sysctl_lower_zone_protection = 0;
  44
  45 EXPORT_SYMBOL(totalram_pages);
  46 EXPORT_SYMBOL(nr_swap_pages);
  47
  48 /*
  49  * Used by page_zone() to look up the address of the struct zone whose
  50  * id is encoded in the upper bits of page->flags
  51  */
  52 struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
  53 EXPORT_SYMBOL(zone_table);
  54
  55 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
  56 int min_free_kbytes = 1024;
  57
  58 unsigned long __initdata nr_kernel_pages;
  59 unsigned long __initdata nr_all_pages;
  60
  61 /*
  62  * Temporary debugging check for pages not lying within a given zone.
  63  */
  64 static int bad_range(struct zone *zone, struct page *page)
  65 {
  66         if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
  67                 return 1;
  68         if (page_to_pfn(page) < zone->zone_start_pfn)
  69                 return 1;
  70         if (zone != page_zone(page))
  71                 return 1;
  72         return 0;
  73 }
  74
  75 static void bad_page(const char *function, struct page *page)
  76 {
  77         printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
  78                 function, current->comm, page);
  79         printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
  80                 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
  81                 page->mapping, page_mapcount(page), page_count(page));
  82         printk(KERN_EMERG "Backtrace:\n");
  83         dump_stack();
  84         printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
  85         page->flags &= ~(1 << PG_private        |
  86                         1 << PG_locked  |
  87                         1 << PG_lru     |
  88                         1 << PG_active  |
  89                         1 << PG_dirty   |
  90                         1 << PG_swapcache |
  91                         1 << PG_writeback);
  92         set_page_count(page, 0);
  93         reset_page_mapcount(page);
  94         page->mapping = NULL;
  95 }
  96
  97 #ifndef CONFIG_HUGETLB_PAGE
  98 #define prep_compound_page(page, order) do { } while (0)
  99 #define destroy_compound_page(page, order) do { } while (0)
 100 #else
 101 /*
 102  * Higher-order pages are called "compound pages".  They are structured thusly:
 103  *
 104  * The first PAGE_SIZE page is called the "head page".
 105  *
 106  * The remaining PAGE_SIZE pages are called "tail pages".
 107  *
 108  * All pages have PG_compound set.  All pages have their ->private pointing at
 109  * the head page (even the head page has this).
 110  *
 111  * The first tail page's ->mapping, if non-zero, holds the address of the
 112  * compound page's put_page() function.
 113  *
 114  * The order of the allocation is stored in the first tail page's ->index
 115  * This is only for debug at present.  This usage means that zero-order pages
 116  * may not be compound.
 117  */
 118 static void prep_compound_page(struct page *page, unsigned long order)
 119 {
 120         int i;
 121         int nr_pages = 1 << order;
 122
 123         page[1].mapping = NULL;
 124         page[1].index = order;
 125         for (i = 0; i < nr_pages; i++) {
 126                 struct page *p = page + i;
 127
 128                 SetPageCompound(p);
 129                 p->private = (unsigned long)page;
 130         }
 131 }
 132
 133 static void destroy_compound_page(struct page *page, unsigned long order)
 134 {
 135         int i;
 136         int nr_pages = 1 << order;
 137
 138         if (!PageCompound(page))
 139                 return;
 140
 141         if (page[1].index != order)
 142                 bad_page(__FUNCTION__, page);
 143
 144         for (i = 0; i < nr_pages; i++) {
 145                 struct page *p = page + i;
 146
 147                 if (!PageCompound(p))
 148                         bad_page(__FUNCTION__, page);
 149                 if (p->private != (unsigned long)page)
 150                         bad_page(__FUNCTION__, page);
 151                 ClearPageCompound(p);
 152         }
 153 }
 154 #endif          /* CONFIG_HUGETLB_PAGE */
 155
 156 /*
 157  * Freeing function for a buddy system allocator.
 158  *
 159  * The concept of a buddy system is to maintain direct-mapped table
 160  * (containing bit values) for memory blocks of various "orders".
 161  * The bottom level table contains the map for the smallest allocatable
 162  * units of memory (here, pages), and each level above it describes
 163  * pairs of units from the levels below, hence, "buddies".
 164  * At a high level, all that happens here is marking the table entry
 165  * at the bottom level available, and propagating the changes upward
 166  * as necessary, plus some accounting needed to play nicely with other
 167  * parts of the VM system.
 168  * At each level, we keep one bit for each pair of blocks, which
 169  * is set to 1 iff only one of the pair is allocated.  So when we
 170  * are allocating or freeing one, we can derive the state of the
 171  * other.  That is, if we allocate a small block, and both were
 172  * free, the remainder of the region must be split into blocks.
 173  * If a block is freed, and its buddy is also free, then this
 174  * triggers coalescing into a block of larger size.
 175  *
 176  * -- wli
 177  */
 178
 179 static inline void __free_pages_bulk (struct page *page, struct page *base,
 180                 struct zone *zone, struct free_area *area, unsigned int order)
 181 {
 182         unsigned long page_idx, index, mask;
 183
 184         if (order)
 185                 destroy_compound_page(page, order);
 186         mask = (~0UL) << order;
 187         page_idx = page - base;
 188         if (page_idx & ~mask)
 189                 BUG();
 190         index = page_idx >> (1 + order);
 191
 192         zone->free_pages += 1 << order;
 193         while (order < MAX_ORDER-1) {
 194                 struct page *buddy1, *buddy2;
 195
 196                 BUG_ON(area >= zone->free_area + MAX_ORDER);
 197                 if (!__test_and_change_bit(index, area->map))
 198                         /*
 199                          * the buddy page is still allocated.
 200                          */
 201                         break;
 202
 203                 /* Move the buddy up one level. */
 204                 buddy1 = base + (page_idx ^ (1 << order));
 205                 buddy2 = base + page_idx;
 206                 BUG_ON(bad_range(zone, buddy1));
 207                 BUG_ON(bad_range(zone, buddy2));
 208                 list_del(&buddy1->lru);
 209                 mask <<= 1;
 210                 order++;
 211                 area++;
 212                 index >>= 1;
 213                 page_idx &= mask;
 214         }
 215         list_add(&(base + page_idx)->lru, &area->free_list);
 216 }
 217
 218 static inline void free_pages_check(const char *function, struct page *page)
 219 {
 220         if (    page_mapped(page) ||
 221                 page->mapping != NULL ||
 222                 page_count(page) != 0 ||
 223                 (page->flags & (
 224                         1 << PG_lru     |
 225                         1 << PG_private |
 226                         1 << PG_locked  |
 227                         1 << PG_active  |
 228                         1 << PG_reclaim |
 229                         1 << PG_slab    |
 230                         1 << PG_swapcache |
 231                         1 << PG_writeback )))
 232                 bad_page(function, page);
 233         if (PageDirty(page))
 234                 ClearPageDirty(page);
 235 }
 236
 237 /*
 238  * Frees a list of pages.
 239  * Assumes all pages on list are in same zone, and of same order.
 240  * count is the number of pages to free, or 0 for all on the list.
 241  *
 242  * If the zone was previously in an "all pages pinned" state then look to
 243  * see if this freeing clears that state.
 244  *
 245  * And clear the zone's pages_scanned counter, to hold off the "all pages are
 246  * pinned" detection logic.
 247  */
 248 static int
 249 free_pages_bulk(struct zone *zone, int count,
 250                 struct list_head *list, unsigned int order)
 251 {
 252         unsigned long flags;
 253         struct free_area *area;
 254         struct page *base, *page = NULL;
 255         int ret = 0;
 256
 257         base = zone->zone_mem_map;
 258         area = zone->free_area + order;
 259         spin_lock_irqsave(&zone->lock, flags);
 260         zone->all_unreclaimable = 0;
 261         zone->pages_scanned = 0;
 262         while (!list_empty(list) && count--) {
 263                 page = list_entry(list->prev, struct page, lru);
 264                 /* have to delete it as __free_pages_bulk list manipulates */
 265                 list_del(&page->lru);
 266                 __free_pages_bulk(page, base, zone, area, order);
 267                 ret++;
 268         }
 269         spin_unlock_irqrestore(&zone->lock, flags);
 270         return ret;
 271 }
 272
 273 void __free_pages_ok(struct page *page, unsigned int order)
 274 {
 275         LIST_HEAD(list);
 276         int i;
 277
 278         arch_free_page(page, order);
 279
 280         mod_page_state(pgfree, 1 << order);
 281         for (i = 0 ; i < (1 << order) ; ++i)
 282                 free_pages_check(__FUNCTION__, page + i);
 283         list_add(&page->lru, &list);
 284         kernel_map_pages(page, 1<<order, 0);
 285         free_pages_bulk(page_zone(page), 1, &list, order);
 286 }
 287
 288 #define MARK_USED(index, order, area) \
 289         __change_bit((index) >> (1+(order)), (area)->map)
 290
 291 /*
 292  * The order of subdivision here is critical for the IO subsystem.
 293  * Please do not alter this order without good reasons and regression
 294  * testing. Specifically, as large blocks of memory are subdivided,
 295  * the order in which smaller blocks are delivered depends on the order
 296  * they're subdivided in this function. This is the primary factor
 297  * influencing the order in which pages are delivered to the IO
 298  * subsystem according to empirical testing, and this is also justified
 299  * by considering the behavior of a buddy system containing a single
 300  * large block of memory acted on by a series of small allocations.
 301  * This behavior is a critical factor in sglist merging's success.
 302  *
 303  * -- wli
 304  */
 305 static inline struct page *
 306 expand(struct zone *zone, struct page *page,
 307          unsigned long index, int low, int high, struct free_area *area)
 308 {
 309         unsigned long size = 1 << high;
 310
 311         while (high > low) {
 312                 area--;
 313                 high--;
 314                 size >>= 1;
 315                 BUG_ON(bad_range(zone, &page[size]));
 316                 list_add(&page[size].lru, &area->free_list);
 317                 MARK_USED(index + size, high, area);
 318         }
 319         return page;
 320 }
 321
 322 static inline void set_page_refs(struct page *page, int order)
 323 {
 324 #ifdef CONFIG_MMU
 325         set_page_count(page, 1);
 326 #else
 327         int i;
 328
 329         /*
 330          * We need to reference all the pages for this order, otherwise if
 331          * anyone accesses one of the pages with (get/put) it will be freed.
 332          */
 333         for (i = 0; i < (1 << order); i++)
 334                 set_page_count(page+i, 1);
 335 #endif /* CONFIG_MMU */
 336 }
 337
 338 /*
 339  * This page is about to be returned from the page allocator
 340  */
 341 static void prep_new_page(struct page *page, int order)
 342 {
 343         if (page->mapping || page_mapped(page) ||
 344             (page->flags & (
 345                         1 << PG_private |
 346                         1 << PG_locked  |
 347                         1 << PG_lru     |
 348                         1 << PG_active  |
 349                         1 << PG_dirty   |
 350                         1 << PG_reclaim |
 351                         1 << PG_swapcache |
 352                         1 << PG_writeback )))
 353                 bad_page(__FUNCTION__, page);
 354
 355         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 356                         1 << PG_referenced | 1 << PG_arch_1 |
 357                         1 << PG_checked | 1 << PG_mappedtodisk);
 358         page->private = 0;
 359         set_page_refs(page, order);
 360 }
 361
 362 /*
 363  * Do the hard work of removing an element from the buddy allocator.
 364  * Call me with the zone->lock already held.
 365  */
 366 static struct page *__rmqueue(struct zone *zone, unsigned int order)
 367 {
 368         struct free_area * area;
 369         unsigned int current_order;
 370         struct page *page;
 371         unsigned int index;
 372
 373         for (current_order = order; current_order < MAX_ORDER; ++current_order) {
 374                 area = zone->free_area + current_order;
 375                 if (list_empty(&area->free_list))
 376                         continue;
 377
 378                 page = list_entry(area->free_list.next, struct page, lru);
 379                 list_del(&page->lru);
 380                 index = page - zone->zone_mem_map;
 381                 if (current_order != MAX_ORDER-1)
 382                         MARK_USED(index, current_order, area);
 383                 zone->free_pages -= 1UL << order;
 384                 return expand(zone, page, index, order, current_order, area);
 385         }
 386
 387         return NULL;
 388 }
 389
 390 /*
 391  * Obtain a specified number of elements from the buddy allocator, all under
 392  * a single hold of the lock, for efficiency.  Add them to the supplied list.
 393  * Returns the number of new pages which were placed at *list.
 394  */
 395 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 396                         unsigned long count, struct list_head *list)
 397 {
 398         unsigned long flags;
 399         int i;
 400         int allocated = 0;
 401         struct page *page;
 402
 403         spin_lock_irqsave(&zone->lock, flags);
 404         for (i = 0; i < count; ++i) {
 405                 page = __rmqueue(zone, order);
 406                 if (page == NULL)
 407                         break;
 408                 allocated++;
 409                 list_add_tail(&page->lru, list);
 410         }
 411         spin_unlock_irqrestore(&zone->lock, flags);
 412         return allocated;
 413 }
 414
 415 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 416 static void __drain_pages(unsigned int cpu)
 417 {
 418         struct zone *zone;
 419         int i;
 420
 421         for_each_zone(zone) {
 422                 struct per_cpu_pageset *pset;
 423
 424                 pset = &zone->pageset[cpu];
 425                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 426                         struct per_cpu_pages *pcp;
 427
 428                         pcp = &pset->pcp[i];
 429                         pcp->count -= free_pages_bulk(zone, pcp->count,
 430                                                 &pcp->list, 0);
 431                 }
 432         }
 433 }
 434 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
 435
 436 #ifdef CONFIG_PM
 437 int is_head_of_free_region(struct page *page)
 438 {
 439         struct zone *zone = page_zone(page);
 440         unsigned long flags;
 441         int order;
 442         struct list_head *curr;
 443
 444         /*
 445          * Should not matter as we need quiescent system for
 446          * suspend anyway, but...
 447          */
 448         spin_lock_irqsave(&zone->lock, flags);
 449         for (order = MAX_ORDER - 1; order >= 0; --order)
 450                 list_for_each(curr, &zone->free_area[order].free_list)
 451                         if (page == list_entry(curr, struct page, lru)) {
 452                                 spin_unlock_irqrestore(&zone->lock, flags);
 453                                 return 1 << order;
 454                         }
 455         spin_unlock_irqrestore(&zone->lock, flags);
 456         return 0;
 457 }
 458
 459 /*
 460  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 461  */
 462 void drain_local_pages(void)
 463 {
 464         unsigned long flags;
 465
 466         local_irq_save(flags);
 467         __drain_pages(smp_processor_id());
 468         local_irq_restore(flags);
 469 }
 470 #endif /* CONFIG_PM */
 471
 472 static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 473 {
 474 #ifdef CONFIG_NUMA
 475         unsigned long flags;
 476         int cpu;
 477         pg_data_t *pg = z->zone_pgdat;
 478         pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
 479         struct per_cpu_pageset *p;
 480
 481         local_irq_save(flags);
 482         cpu = smp_processor_id();
 483         p = &z->pageset[cpu];
 484         if (pg == orig) {
 485                 z->pageset[cpu].numa_hit++;
 486         } else {
 487                 p->numa_miss++;
 488                 zonelist->zones[0]->pageset[cpu].numa_foreign++;
 489         }
 490         if (pg == NODE_DATA(numa_node_id()))
 491                 p->local_node++;
 492         else
 493                 p->other_node++;
 494         local_irq_restore(flags);
 495 #endif
 496 }
 497
 498 /*
 499  * Free a 0-order page
 500  */
 501 static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
 502 static void fastcall free_hot_cold_page(struct page *page, int cold)
 503 {
 504         struct zone *zone = page_zone(page);
 505         struct per_cpu_pages *pcp;
 506         unsigned long flags;
 507
 508         arch_free_page(page, 0);
 509
 510         kernel_map_pages(page, 1, 0);
 511         inc_page_state(pgfree);
 512         if (PageAnon(page))
 513                 page->mapping = NULL;
 514         free_pages_check(__FUNCTION__, page);
 515         pcp = &zone->pageset[get_cpu()].pcp[cold];
 516         local_irq_save(flags);
 517         if (pcp->count >= pcp->high)
 518                 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 519         list_add(&page->lru, &pcp->list);
 520         pcp->count++;
 521         local_irq_restore(flags);
 522         put_cpu();
 523 }
 524
 525 void fastcall free_hot_page(struct page *page)
 526 {
 527         free_hot_cold_page(page, 0);
 528 }
 529
 530 void fastcall free_cold_page(struct page *page)
 531 {
 532         free_hot_cold_page(page, 1);
 533 }
 534
 535 /*
 536  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 537  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 538  * or two.
 539  */
 540
 541 static struct page *
 542 buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
 543 {
 544         unsigned long flags;
 545         struct page *page = NULL;
 546         int cold = !!(gfp_flags & __GFP_COLD);
 547
 548         if (order == 0) {
 549                 struct per_cpu_pages *pcp;
 550
 551                 pcp = &zone->pageset[get_cpu()].pcp[cold];
 552                 local_irq_save(flags);
 553                 if (pcp->count <= pcp->low)
 554                         pcp->count += rmqueue_bulk(zone, 0,
 555                                                 pcp->batch, &pcp->list);
 556                 if (pcp->count) {
 557                         page = list_entry(pcp->list.next, struct page, lru);
 558                         list_del(&page->lru);
 559                         pcp->count--;
 560                 }
 561                 local_irq_restore(flags);
 562                 put_cpu();
 563         }
 564
 565         if (page == NULL) {
 566                 spin_lock_irqsave(&zone->lock, flags);
 567                 page = __rmqueue(zone, order);
 568                 spin_unlock_irqrestore(&zone->lock, flags);
 569         }
 570
 571         if (page != NULL) {
 572                 BUG_ON(bad_range(zone, page));
 573                 mod_page_state_zone(zone, pgalloc, 1 << order);
 574                 prep_new_page(page, order);
 575                 if (order && (gfp_flags & __GFP_COMP))
 576                         prep_compound_page(page, order);
 577         }
 578         return page;
 579 }
 580
 581 /*
 582  * This is the 'heart' of the zoned buddy allocator.
 583  *
 584  * Herein lies the mysterious "incremental min".  That's the
 585  *
 586  *      local_low = z->pages_low;
 587  *      min += local_low;
 588  *
 589  * thing.  The intent here is to provide additional protection to low zones for
 590  * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
 591  * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
 592  * request.  This preserves additional space in those lower zones for requests
 593  * which really do need memory from those zones.  It means that on a decent
 594  * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
 595  * zone untouched.
 596  */
 597 struct page * fastcall
 598 __alloc_pages(unsigned int gfp_mask, unsigned int order,
 599                 struct zonelist *zonelist)
 600 {
 601         const int wait = gfp_mask & __GFP_WAIT;
 602         unsigned long min;
 603         struct zone **zones, *z;
 604         struct page *page;
 605         struct reclaim_state reclaim_state;
 606         struct task_struct *p = current;
 607         int i;
 608         int alloc_type;
 609         int do_retry;
 610         int can_try_harder;
 611
 612         might_sleep_if(wait);
 613
 614         /*
 615          * The caller may dip into page reserves a bit more if the caller
 616          * cannot run direct reclaim, or is the caller has realtime scheduling
 617          * policy
 618          */
 619         can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
 620
 621         zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 622
 623         if (unlikely(zones[0] == NULL)) {
 624                 /* Should this ever happen?? */
 625                 return NULL;
 626         }
 627
 628         alloc_type = zone_idx(zones[0]);
 629
 630         /* Go through the zonelist once, looking for a zone with enough free */
 631         for (i = 0; (z = zones[i]) != NULL; i++) {
 632                 min = z->pages_low + (1<<order) + z->protection[alloc_type];
 633
 634                 if (z->free_pages < min)
 635                         continue;
 636
 637                 page = buffered_rmqueue(z, order, gfp_mask);
 638                 if (page)
 639                         goto got_pg;
 640         }
 641
 642         for (i = 0; (z = zones[i]) != NULL; i++)
 643                 wakeup_kswapd(z);
 644
 645         /*
 646          * Go through the zonelist again. Let __GFP_HIGH and allocations
 647          * coming from realtime tasks to go deeper into reserves
 648          */
 649         for (i = 0; (z = zones[i]) != NULL; i++) {
 650                 min = z->pages_min;
 651                 if (gfp_mask & __GFP_HIGH)
 652                         min /= 2;
 653                 if (can_try_harder)
 654                         min -= min / 4;
 655                 min += (1<<order) + z->protection[alloc_type];
 656
 657                 if (z->free_pages < min)
 658                         continue;
 659
 660                 page = buffered_rmqueue(z, order, gfp_mask);
 661                 if (page)
 662                         goto got_pg;
 663         }
 664
 665         /* This allocation should allow future memory freeing. */
 666         if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
 667                 /* go through the zonelist yet again, ignoring mins */
 668                 for (i = 0; (z = zones[i]) != NULL; i++) {
 669                         page = buffered_rmqueue(z, order, gfp_mask);
 670                         if (page)
 671                                 goto got_pg;
 672                 }
 673                 goto nopage;
 674         }
 675
 676         /* Atomic allocations - we can't balance anything */
 677         if (!wait)
 678                 goto nopage;
 679
 680 rebalance:
 681         /* We now go into synchronous reclaim */
 682         p->flags |= PF_MEMALLOC;
 683         reclaim_state.reclaimed_slab = 0;
 684         p->reclaim_state = &reclaim_state;
 685
 686         try_to_free_pages(zones, gfp_mask, order);
 687
 688         p->reclaim_state = NULL;
 689         p->flags &= ~PF_MEMALLOC;
 690
 691         /* go through the zonelist yet one more time */
 692         for (i = 0; (z = zones[i]) != NULL; i++) {
 693                 min = z->pages_min;
 694                 if (gfp_mask & __GFP_HIGH)
 695                         min /= 2;
 696                 if (can_try_harder)
 697                         min -= min / 4;
 698                 min += (1<<order) + z->protection[alloc_type];
 699
 700                 if (z->free_pages < min)
 701                         continue;
 702
 703                 page = buffered_rmqueue(z, order, gfp_mask);
 704                 if (page)
 705                         goto got_pg;
 706         }
 707
 708         /*
 709          * Don't let big-order allocations loop unless the caller explicitly
 710          * requests that.  Wait for some write requests to complete then retry.
 711          *
 712          * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
 713          * <= 3, but that may not be true in other implementations.
 714          */
 715         do_retry = 0;
 716         if (!(gfp_mask & __GFP_NORETRY)) {
 717                 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
 718                         do_retry = 1;
 719                 if (gfp_mask & __GFP_NOFAIL)
 720                         do_retry = 1;
 721         }
 722         if (do_retry) {
 723                 blk_congestion_wait(WRITE, HZ/50);
 724                 goto rebalance;
 725         }
 726
 727 nopage:
 728         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 729                 printk(KERN_WARNING "%s: page allocation failure."
 730                         " order:%d, mode:0x%x\n",
 731                         p->comm, order, gfp_mask);
 732                 dump_stack();
 733         }
 734         return NULL;
 735 got_pg:
 736         zone_statistics(zonelist, z);
 737         kernel_map_pages(page, 1 << order, 1);
 738         return page;
 739 }
 740
 741 EXPORT_SYMBOL(__alloc_pages);
 742
 743 /*
 744  * Common helper functions.
 745  */
 746 fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
 747 {
 748         struct page * page;
 749         page = alloc_pages(gfp_mask, order);
 750         if (!page)
 751                 return 0;
 752         return (unsigned long) page_address(page);
 753 }
 754
 755 EXPORT_SYMBOL(__get_free_pages);
 756
 757 fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
 758 {
 759         struct page * page;
 760
 761         /*
 762          * get_zeroed_page() returns a 32-bit address, which cannot represent
 763          * a highmem page
 764          */
 765         BUG_ON(gfp_mask & __GFP_HIGHMEM);
 766
 767         page = alloc_pages(gfp_mask, 0);
 768         if (page) {
 769                 void *address = page_address(page);
 770                 clear_page(address);
 771                 return (unsigned long) address;
 772         }
 773         return 0;
 774 }
 775
 776 EXPORT_SYMBOL(get_zeroed_page);
 777
 778 void __pagevec_free(struct pagevec *pvec)
 779 {
 780         int i = pagevec_count(pvec);
 781
 782         while (--i >= 0)
 783                 free_hot_cold_page(pvec->pages[i], pvec->cold);
 784 }
 785
 786 fastcall void __free_pages(struct page *page, unsigned int order)
 787 {
 788         if (!PageReserved(page) && put_page_testzero(page)) {
 789                 if (order == 0)
 790                         free_hot_page(page);
 791                 else
 792                         __free_pages_ok(page, order);
 793         }
 794 }
 795
 796 EXPORT_SYMBOL(__free_pages);
 797
 798 fastcall void free_pages(unsigned long addr, unsigned int order)
 799 {
 800         if (addr != 0) {
 801                 BUG_ON(!virt_addr_valid((void *)addr));
 802                 __free_pages(virt_to_page((void *)addr), order);
 803         }
 804 }
 805
 806 EXPORT_SYMBOL(free_pages);
 807
 808 /*
 809  * Total amount of free (allocatable) RAM:
 810  */
 811 unsigned int nr_free_pages(void)
 812 {
 813         unsigned int sum = 0;
 814         struct zone *zone;
 815
 816         for_each_zone(zone)
 817                 sum += zone->free_pages;
 818
 819         return sum;
 820 }
 821
 822 EXPORT_SYMBOL(nr_free_pages);
 823
 824 #ifdef CONFIG_NUMA
 825 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 826 {
 827         unsigned int i, sum = 0;
 828
 829         for (i = 0; i < MAX_NR_ZONES; i++)
 830                 sum += pgdat->node_zones[i].free_pages;
 831
 832         return sum;
 833 }
 834 #endif
 835
 836 static unsigned int nr_free_zone_pages(int offset)
 837 {
 838         pg_data_t *pgdat;
 839         unsigned int sum = 0;
 840
 841         for_each_pgdat(pgdat) {
 842                 struct zonelist *zonelist = pgdat->node_zonelists + offset;
 843                 struct zone **zonep = zonelist->zones;
 844                 struct zone *zone;
 845
 846                 for (zone = *zonep++; zone; zone = *zonep++) {
 847                         unsigned long size = zone->present_pages;
 848                         unsigned long high = zone->pages_high;
 849                         if (size > high)
 850                                 sum += size - high;
 851                 }
 852         }
 853
 854         return sum;
 855 }
 856
 857 /*
 858  * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 859  */
 860 unsigned int nr_free_buffer_pages(void)
 861 {
 862         return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
 863 }
 864
 865 /*
 866  * Amount of free RAM allocatable within all zones
 867  */
 868 unsigned int nr_free_pagecache_pages(void)
 869 {
 870         return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
 871 }
 872
 873 #ifdef CONFIG_HIGHMEM
 874 unsigned int nr_free_highpages (void)
 875 {
 876         pg_data_t *pgdat;
 877         unsigned int pages = 0;
 878
 879         for_each_pgdat(pgdat)
 880                 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 881
 882         return pages;
 883 }
 884 #endif
 885
 886 #ifdef CONFIG_NUMA
 887 static void show_node(struct zone *zone)
 888 {
 889         printk("Node %d ", zone->zone_pgdat->node_id);
 890 }
 891 #else
 892 #define show_node(zone) do { } while (0)
 893 #endif
 894
 895 /*
 896  * Accumulate the page_state information across all CPUs.
 897  * The result is unavoidably approximate - it can change
 898  * during and after execution of this function.
 899  */
 900 DEFINE_PER_CPU(struct page_state, page_states) = {0};
 901 EXPORT_PER_CPU_SYMBOL(page_states);
 902
 903 atomic_t nr_pagecache = ATOMIC_INIT(0);
 904 EXPORT_SYMBOL(nr_pagecache);
 905 #ifdef CONFIG_SMP
 906 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 907 #endif
 908
 909 void __get_page_state(struct page_state *ret, int nr)
 910 {
 911         int cpu = 0;
 912
 913         memset(ret, 0, sizeof(*ret));
 914         while (cpu < NR_CPUS) {
 915                 unsigned long *in, *out, off;
 916
 917                 if (!cpu_possible(cpu)) {
 918                         cpu++;
 919                         continue;
 920                 }
 921
 922                 in = (unsigned long *)&per_cpu(page_states, cpu);
 923                 cpu++;
 924                 if (cpu < NR_CPUS && cpu_possible(cpu))
 925                         prefetch(&per_cpu(page_states, cpu));
 926                 out = (unsigned long *)ret;
 927                 for (off = 0; off < nr; off++)
 928                         *out++ += *in++;
 929         }
 930 }
 931
 932 void get_page_state(struct page_state *ret)
 933 {
 934         int nr;
 935
 936         nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
 937         nr /= sizeof(unsigned long);
 938
 939         __get_page_state(ret, nr + 1);
 940 }
 941
 942 void get_full_page_state(struct page_state *ret)
 943 {
 944         __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
 945 }
 946
 947 unsigned long __read_page_state(unsigned offset)
 948 {
 949         unsigned long ret = 0;
 950         int cpu;
 951
 952         for (cpu = 0; cpu < NR_CPUS; cpu++) {
 953                 unsigned long in;
 954
 955                 if (!cpu_possible(cpu))
 956                         continue;
 957
 958                 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
 959                 ret += *((unsigned long *)in);
 960         }
 961         return ret;
 962 }
 963
 964 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 965                         unsigned long *free, struct pglist_data *pgdat)
 966 {
 967         struct zone *zones = pgdat->node_zones;
 968         int i;
 969
 970         *active = 0;
 971         *inactive = 0;
 972         *free = 0;
 973         for (i = 0; i < MAX_NR_ZONES; i++) {
 974                 *active += zones[i].nr_active;
 975                 *inactive += zones[i].nr_inactive;
 976                 *free += zones[i].free_pages;
 977         }
 978 }
 979
 980 void get_zone_counts(unsigned long *active,
 981                 unsigned long *inactive, unsigned long *free)
 982 {
 983         struct pglist_data *pgdat;
 984
 985         *active = 0;
 986         *inactive = 0;
 987         *free = 0;
 988         for_each_pgdat(pgdat) {
 989                 unsigned long l, m, n;
 990                 __get_zone_counts(&l, &m, &n, pgdat);
 991                 *active += l;
 992                 *inactive += m;
 993                 *free += n;
 994         }
 995 }
 996
 997 void si_meminfo(struct sysinfo *val)
 998 {
 999         val->totalram = totalram_pages;
1000         val->sharedram = 0;
1001         val->freeram = nr_free_pages();
1002         val->bufferram = nr_blockdev_pages();
1003 #ifdef CONFIG_HIGHMEM
1004         val->totalhigh = totalhigh_pages;
1005         val->freehigh = nr_free_highpages();
1006 #else
1007         val->totalhigh = 0;
1008         val->freehigh = 0;
1009 #endif
1010         val->mem_unit = PAGE_SIZE;
1011 }
1012
1013 EXPORT_SYMBOL(si_meminfo);
1014
1015 #ifdef CONFIG_NUMA
1016 void si_meminfo_node(struct sysinfo *val, int nid)
1017 {
1018         pg_data_t *pgdat = NODE_DATA(nid);
1019
1020         val->totalram = pgdat->node_present_pages;
1021         val->freeram = nr_free_pages_pgdat(pgdat);
1022         val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1023         val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1024         val->mem_unit = PAGE_SIZE;
1025 }
1026 #endif
1027
1028 #define K(x) ((x) << (PAGE_SHIFT-10))
1029
1030 /*
1031  * Show free area list (used inside shift_scroll-lock stuff)
1032  * We also calculate the percentage fragmentation. We do this by counting the
1033  * memory on each free list with the exception of the first item on the list.
1034  */
1035 void show_free_areas(void)
1036 {
1037         struct page_state ps;
1038         int cpu, temperature;
1039         unsigned long active;
1040         unsigned long inactive;
1041         unsigned long free;
1042         struct zone *zone;
1043
1044         for_each_zone(zone) {
1045                 show_node(zone);
1046                 printk("%s per-cpu:", zone->name);
1047
1048                 if (!zone->present_pages) {
1049                         printk(" empty\n");
1050                         continue;
1051                 } else
1052                         printk("\n");
1053
1054                 for (cpu = 0; cpu < NR_CPUS; ++cpu) {
1055                         struct per_cpu_pageset *pageset;
1056
1057                         if (!cpu_possible(cpu))
1058                                 continue;
1059
1060                         pageset = zone->pageset + cpu;
1061
1062                         for (temperature = 0; temperature < 2; temperature++)
1063                                 printk("cpu %d %s: low %d, high %d, batch %d\n",
1064                                         cpu,
1065                                         temperature ? "cold" : "hot",
1066                                         pageset->pcp[temperature].low,
1067                                         pageset->pcp[temperature].high,
1068                                         pageset->pcp[temperature].batch);
1069                 }
1070         }
1071
1072         get_page_state(&ps);
1073         get_zone_counts(&active, &inactive, &free);
1074
1075         printk("\nFree pages: %11ukB (%ukB HighMem)\n",
1076                 K(nr_free_pages()),
1077                 K(nr_free_highpages()));
1078
1079         printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1080                 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1081                 active,
1082                 inactive,
1083                 ps.nr_dirty,
1084                 ps.nr_writeback,
1085                 ps.nr_unstable,
1086                 nr_free_pages(),
1087                 ps.nr_slab,
1088                 ps.nr_mapped,
1089                 ps.nr_page_table_pages);
1090
1091         for_each_zone(zone) {
1092                 int i;
1093
1094                 show_node(zone);
1095                 printk("%s"
1096                         " free:%lukB"
1097                         " min:%lukB"
1098                         " low:%lukB"
1099                         " high:%lukB"
1100                         " active:%lukB"
1101                         " inactive:%lukB"
1102                         " present:%lukB"
1103                         "\n",
1104                         zone->name,
1105                         K(zone->free_pages),
1106                         K(zone->pages_min),
1107                         K(zone->pages_low),
1108                         K(zone->pages_high),
1109                         K(zone->nr_active),
1110                         K(zone->nr_inactive),
1111                         K(zone->present_pages)
1112                         );
1113                 printk("protections[]:");
1114                 for (i = 0; i < MAX_NR_ZONES; i++)
1115                         printk(" %lu", zone->protection[i]);
1116                 printk("\n");
1117         }
1118
1119         for_each_zone(zone) {
1120                 struct list_head *elem;
1121                 unsigned long nr, flags, order, total = 0;
1122
1123                 show_node(zone);
1124                 printk("%s: ", zone->name);
1125                 if (!zone->present_pages) {
1126                         printk("empty\n");
1127                         continue;
1128                 }
1129
1130                 spin_lock_irqsave(&zone->lock, flags);
1131                 for (order = 0; order < MAX_ORDER; order++) {
1132                         nr = 0;
1133                         list_for_each(elem, &zone->free_area[order].free_list)
1134                                 ++nr;
1135                         total += nr << order;
1136                         printk("%lu*%lukB ", nr, K(1UL) << order);
1137                 }
1138                 spin_unlock_irqrestore(&zone->lock, flags);
1139                 printk("= %lukB\n", K(total));
1140         }
1141
1142         show_swap_cache_info();
1143 }
1144
1145 /*
1146  * Builds allocation fallback zone lists.
1147  */
1148 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
1149 {
1150         switch (k) {
1151                 struct zone *zone;
1152         default:
1153                 BUG();
1154         case ZONE_HIGHMEM:
1155                 zone = pgdat->node_zones + ZONE_HIGHMEM;
1156                 if (zone->present_pages) {
1157 #ifndef CONFIG_HIGHMEM
1158                         BUG();
1159 #endif
1160                         zonelist->zones[j++] = zone;
1161                 }
1162         case ZONE_NORMAL:
1163                 zone = pgdat->node_zones + ZONE_NORMAL;
1164                 if (zone->present_pages)
1165                         zonelist->zones[j++] = zone;
1166         case ZONE_DMA:
1167                 zone = pgdat->node_zones + ZONE_DMA;
1168                 if (zone->present_pages)
1169                         zonelist->zones[j++] = zone;
1170         }
1171
1172         return j;
1173 }
1174
1175 #ifdef CONFIG_NUMA
1176 #define MAX_NODE_LOAD (numnodes)
1177 static int __initdata node_load[MAX_NUMNODES];
1178 /**
1179  * find_next_best_node - find the next node that should appear in a given
1180  *    node's fallback list
1181  * @node: node whose fallback list we're appending
1182  * @used_node_mask: pointer to the bitmap of already used nodes
1183  *
1184  * We use a number of factors to determine which is the next node that should
1185  * appear on a given node's fallback list.  The node should not have appeared
1186  * already in @node's fallback list, and it should be the next closest node
1187  * according to the distance array (which contains arbitrary distance values
1188  * from each node to each node in the system), and should also prefer nodes
1189  * with no CPUs, since presumably they'll have very little allocation pressure
1190  * on them otherwise.
1191  * It returns -1 if no node is found.
1192  */
1193 static int __init find_next_best_node(int node, void *used_node_mask)
1194 {
1195         int i, n, val;
1196         int min_val = INT_MAX;
1197         int best_node = -1;
1198
1199         for (i = 0; i < numnodes; i++) {
1200                 cpumask_t tmp;
1201
1202                 /* Start from local node */
1203                 n = (node+i)%numnodes;
1204
1205                 /* Don't want a node to appear more than once */
1206                 if (test_bit(n, used_node_mask))
1207                         continue;
1208
1209                 /* Use the distance array to find the distance */
1210                 val = node_distance(node, n);
1211
1212                 /* Give preference to headless and unused nodes */
1213                 tmp = node_to_cpumask(n);
1214                 if (!cpus_empty(tmp))
1215                         val += PENALTY_FOR_NODE_WITH_CPUS;
1216
1217                 /* Slight preference for less loaded node */
1218                 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1219                 val += node_load[n];
1220
1221                 if (val < min_val) {
1222                         min_val = val;
1223                         best_node = n;
1224                 }
1225         }
1226
1227         if (best_node >= 0)
1228                 set_bit(best_node, used_node_mask);
1229
1230         return best_node;
1231 }
1232
1233 static void __init build_zonelists(pg_data_t *pgdat)
1234 {
1235         int i, j, k, node, local_node;
1236         int prev_node, load;
1237         struct zonelist *zonelist;
1238         DECLARE_BITMAP(used_mask, MAX_NUMNODES);
1239
1240         /* initialize zonelists */
1241         for (i = 0; i < GFP_ZONETYPES; i++) {
1242                 zonelist = pgdat->node_zonelists + i;
1243                 memset(zonelist, 0, sizeof(*zonelist));
1244                 zonelist->zones[0] = NULL;
1245         }
1246
1247         /* NUMA-aware ordering of nodes */
1248         local_node = pgdat->node_id;
1249         load = numnodes;
1250         prev_node = local_node;
1251         bitmap_zero(used_mask, MAX_NUMNODES);
1252         while ((node = find_next_best_node(local_node, used_mask)) >= 0) {
1253                 /*
1254                  * We don't want to pressure a particular node.
1255                  * So adding penalty to the first node in same
1256                  * distance group to make it round-robin.
1257                  */
1258                 if (node_distance(local_node, node) !=
1259                                 node_distance(local_node, prev_node))
1260                         node_load[node] += load;
1261                 prev_node = node;
1262                 load--;
1263                 for (i = 0; i < GFP_ZONETYPES; i++) {
1264                         zonelist = pgdat->node_zonelists + i;
1265                         for (j = 0; zonelist->zones[j] != NULL; j++);
1266
1267                         k = ZONE_NORMAL;
1268                         if (i & __GFP_HIGHMEM)
1269                                 k = ZONE_HIGHMEM;
1270                         if (i & __GFP_DMA)
1271                                 k = ZONE_DMA;
1272
1273                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1274                         zonelist->zones[j] = NULL;
1275                 }
1276         }
1277 }
1278
1279 #else   /* CONFIG_NUMA */
1280
1281 static void __init build_zonelists(pg_data_t *pgdat)
1282 {
1283         int i, j, k, node, local_node;
1284
1285         local_node = pgdat->node_id;
1286         for (i = 0; i < GFP_ZONETYPES; i++) {
1287                 struct zonelist *zonelist;
1288
1289                 zonelist = pgdat->node_zonelists + i;
1290                 memset(zonelist, 0, sizeof(*zonelist));
1291
1292                 j = 0;
1293                 k = ZONE_NORMAL;
1294                 if (i & __GFP_HIGHMEM)
1295                         k = ZONE_HIGHMEM;
1296                 if (i & __GFP_DMA)
1297                         k = ZONE_DMA;
1298
1299                 j = build_zonelists_node(pgdat, zonelist, j, k);
1300                 /*
1301                  * Now we build the zonelist so that it contains the zones
1302                  * of all the other nodes.
1303                  * We don't want to pressure a particular node, so when
1304                  * building the zones for node N, we make sure that the
1305                  * zones coming right after the local ones are those from
1306                  * node N+1 (modulo N)
1307                  */
1308                 for (node = local_node + 1; node < numnodes; node++)
1309                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1310                 for (node = 0; node < local_node; node++)
1311                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1312
1313                 zonelist->zones[j] = NULL;
1314         }
1315 }
1316
1317 #endif  /* CONFIG_NUMA */
1318
1319 void __init build_all_zonelists(void)
1320 {
1321         int i;
1322
1323         for(i = 0 ; i < numnodes ; i++)
1324                 build_zonelists(NODE_DATA(i));
1325         printk("Built %i zonelists\n", numnodes);
1326 }
1327
1328 /*
1329  * Helper functions to size the waitqueue hash table.
1330  * Essentially these want to choose hash table sizes sufficiently
1331  * large so that collisions trying to wait on pages are rare.
1332  * But in fact, the number of active page waitqueues on typical
1333  * systems is ridiculously low, less than 200. So this is even
1334  * conservative, even though it seems large.
1335  *
1336  * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1337  * waitqueues, i.e. the size of the waitq table given the number of pages.
1338  */
1339 #define PAGES_PER_WAITQUEUE     256
1340
1341 static inline unsigned long wait_table_size(unsigned long pages)
1342 {
1343         unsigned long size = 1;
1344
1345         pages /= PAGES_PER_WAITQUEUE;
1346
1347         while (size < pages)
1348                 size <<= 1;
1349
1350         /*
1351          * Once we have dozens or even hundreds of threads sleeping
1352          * on IO we've got bigger problems than wait queue collision.
1353          * Limit the size of the wait table to a reasonable size.
1354          */
1355         size = min(size, 4096UL);
1356
1357         return max(size, 4UL);
1358 }
1359
1360 /*
1361  * This is an integer logarithm so that shifts can be used later
1362  * to extract the more random high bits from the multiplicative
1363  * hash function before the remainder is taken.
1364  */
1365 static inline unsigned long wait_table_bits(unsigned long size)
1366 {
1367         return ffz(~size);
1368 }
1369
1370 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1371
1372 static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1373                 unsigned long *zones_size, unsigned long *zholes_size)
1374 {
1375         unsigned long realtotalpages, totalpages = 0;
1376         int i;
1377
1378         for (i = 0; i < MAX_NR_ZONES; i++)
1379                 totalpages += zones_size[i];
1380         pgdat->node_spanned_pages = totalpages;
1381
1382         realtotalpages = totalpages;
1383         if (zholes_size)
1384                 for (i = 0; i < MAX_NR_ZONES; i++)
1385                         realtotalpages -= zholes_size[i];
1386         pgdat->node_present_pages = realtotalpages;
1387         printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1388 }
1389
1390
1391 /*
1392  * Initially all pages are reserved - free ones are freed
1393  * up by free_all_bootmem() once the early boot process is
1394  * done. Non-atomic initialization, single-pass.
1395  */
1396 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1397                 unsigned long start_pfn)
1398 {
1399         struct page *start = pfn_to_page(start_pfn);
1400         struct page *page;
1401
1402         for (page = start; page < (start + size); page++) {
1403                 set_page_zone(page, NODEZONE(nid, zone));
1404                 set_page_count(page, 0);
1405                 reset_page_mapcount(page);
1406                 SetPageReserved(page);
1407                 INIT_LIST_HEAD(&page->lru);
1408 #ifdef WANT_PAGE_VIRTUAL
1409                 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1410                 if (!is_highmem_idx(zone))
1411                         set_page_address(page, __va(start_pfn << PAGE_SHIFT));
1412 #endif
1413                 start_pfn++;
1414         }
1415 }
1416
1417 /*
1418  * Page buddy system uses "index >> (i+1)", where "index" is
1419  * at most "size-1".
1420  *
1421  * The extra "+3" is to round down to byte size (8 bits per byte
1422  * assumption). Thus we get "(size-1) >> (i+4)" as the last byte
1423  * we can access.
1424  *
1425  * The "+1" is because we want to round the byte allocation up
1426  * rather than down. So we should have had a "+7" before we shifted
1427  * down by three. Also, we have to add one as we actually _use_ the
1428  * last bit (it's [0,n] inclusive, not [0,n[).
1429  *
1430  * So we actually had +7+1 before we shift down by 3. But
1431  * (n+8) >> 3 == (n >> 3) + 1 (modulo overflows, which we do not have).
1432  *
1433  * Finally, we LONG_ALIGN because all bitmap operations are on longs.
1434  */
1435 unsigned long pages_to_bitmap_size(unsigned long order, unsigned long nr_pages)
1436 {
1437         unsigned long bitmap_size;
1438
1439         bitmap_size = (nr_pages-1) >> (order+4);
1440         bitmap_size = LONG_ALIGN(bitmap_size+1);
1441
1442         return bitmap_size;
1443 }
1444
1445 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, unsigned long size)
1446 {
1447         int order;
1448         for (order = 0; ; order++) {
1449                 unsigned long bitmap_size;
1450
1451                 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1452                 if (order == MAX_ORDER-1) {
1453                         zone->free_area[order].map = NULL;
1454                         break;
1455                 }
1456
1457                 bitmap_size = pages_to_bitmap_size(order, size);
1458                 zone->free_area[order].map =
1459                   (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
1460         }
1461 }
1462
1463 #ifndef __HAVE_ARCH_MEMMAP_INIT
1464 #define memmap_init(size, nid, zone, start_pfn) \
1465         memmap_init_zone((size), (nid), (zone), (start_pfn))
1466 #endif
1467
1468 /*
1469  * Set up the zone data structures:
1470  *   - mark all pages reserved
1471  *   - mark all memory queues empty
1472  *   - clear the memory bitmaps
1473  */
1474 static void __init free_area_init_core(struct pglist_data *pgdat,
1475                 unsigned long *zones_size, unsigned long *zholes_size)
1476 {
1477         unsigned long i, j;
1478         const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
1479         int cpu, nid = pgdat->node_id;
1480         unsigned long zone_start_pfn = pgdat->node_start_pfn;
1481
1482         pgdat->nr_zones = 0;
1483         init_waitqueue_head(&pgdat->kswapd_wait);
1484
1485         for (j = 0; j < MAX_NR_ZONES; j++) {
1486                 struct zone *zone = pgdat->node_zones + j;
1487                 unsigned long size, realsize;
1488                 unsigned long batch;
1489
1490                 zone_table[NODEZONE(nid, j)] = zone;
1491                 realsize = size = zones_size[j];
1492                 if (zholes_size)
1493                         realsize -= zholes_size[j];
1494
1495                 if (j == ZONE_DMA || j == ZONE_NORMAL)
1496                         nr_kernel_pages += realsize;
1497                 nr_all_pages += realsize;
1498
1499                 zone->spanned_pages = size;
1500                 zone->present_pages = realsize;
1501                 zone->name = zone_names[j];
1502                 spin_lock_init(&zone->lock);
1503                 spin_lock_init(&zone->lru_lock);
1504                 zone->zone_pgdat = pgdat;
1505                 zone->free_pages = 0;
1506
1507                 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1508
1509                 /*
1510                  * The per-cpu-pages pools are set to around 1000th of the
1511                  * size of the zone.  But no more than 1/4 of a meg - there's
1512                  * no point in going beyond the size of L2 cache.
1513                  *
1514                  * OK, so we don't know how big the cache is.  So guess.
1515                  */
1516                 batch = zone->present_pages / 1024;
1517                 if (batch * PAGE_SIZE > 256 * 1024)
1518                         batch = (256 * 1024) / PAGE_SIZE;
1519                 batch /= 4;             /* We effectively *= 4 below */
1520                 if (batch < 1)
1521                         batch = 1;
1522
1523                 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1524                         struct per_cpu_pages *pcp;
1525
1526                         pcp = &zone->pageset[cpu].pcp[0];       /* hot */
1527                         pcp->count = 0;
1528                         pcp->low = 2 * batch;
1529                         pcp->high = 6 * batch;
1530                         pcp->batch = 1 * batch;
1531                         INIT_LIST_HEAD(&pcp->list);
1532
1533                         pcp = &zone->pageset[cpu].pcp[1];       /* cold */
1534                         pcp->count = 0;
1535                         pcp->low = 0;
1536                         pcp->high = 2 * batch;
1537                         pcp->batch = 1 * batch;
1538                         INIT_LIST_HEAD(&pcp->list);
1539                 }
1540                 printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
1541                                 zone_names[j], realsize, batch);
1542                 INIT_LIST_HEAD(&zone->active_list);
1543                 INIT_LIST_HEAD(&zone->inactive_list);
1544                 zone->nr_scan_active = 0;
1545                 zone->nr_scan_inactive = 0;
1546                 zone->nr_active = 0;
1547                 zone->nr_inactive = 0;
1548                 if (!size)
1549                         continue;
1550
1551                 /*
1552                  * The per-page waitqueue mechanism uses hashed waitqueues
1553                  * per zone.
1554                  */
1555                 zone->wait_table_size = wait_table_size(size);
1556                 zone->wait_table_bits =
1557                         wait_table_bits(zone->wait_table_size);
1558                 zone->wait_table = (wait_queue_head_t *)
1559                         alloc_bootmem_node(pgdat, zone->wait_table_size
1560                                                 * sizeof(wait_queue_head_t));
1561
1562                 for(i = 0; i < zone->wait_table_size; ++i)
1563                         init_waitqueue_head(zone->wait_table + i);
1564
1565                 pgdat->nr_zones = j+1;
1566
1567                 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1568                 zone->zone_start_pfn = zone_start_pfn;
1569
1570                 if ((zone_start_pfn) & (zone_required_alignment-1))
1571                         printk("BUG: wrong zone alignment, it will crash\n");
1572
1573                 memmap_init(size, nid, j, zone_start_pfn);
1574
1575                 zone_start_pfn += size;
1576
1577                 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1578         }
1579 }
1580
1581 void __init node_alloc_mem_map(struct pglist_data *pgdat)
1582 {
1583         unsigned long size;
1584
1585         size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
1586         pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
1587 #ifndef CONFIG_DISCONTIGMEM
1588         mem_map = contig_page_data.node_mem_map;
1589 #endif
1590 }
1591
1592 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
1593                 unsigned long *zones_size, unsigned long node_start_pfn,
1594                 unsigned long *zholes_size)
1595 {
1596         pgdat->node_id = nid;
1597         pgdat->node_start_pfn = node_start_pfn;
1598         calculate_zone_totalpages(pgdat, zones_size, zholes_size);
1599
1600         if (!pfn_to_page(node_start_pfn))
1601                 node_alloc_mem_map(pgdat);
1602
1603         free_area_init_core(pgdat, zones_size, zholes_size);
1604 }
1605
1606 #ifndef CONFIG_DISCONTIGMEM
1607 static bootmem_data_t contig_bootmem_data;
1608 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
1609
1610 EXPORT_SYMBOL(contig_page_data);
1611
1612 void __init free_area_init(unsigned long *zones_size)
1613 {
1614         free_area_init_node(0, &contig_page_data, zones_size,
1615                         __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
1616 }
1617 #endif
1618
1619 #ifdef CONFIG_PROC_FS
1620
1621 #include <linux/seq_file.h>
1622
1623 static void *frag_start(struct seq_file *m, loff_t *pos)
1624 {
1625         pg_data_t *pgdat;
1626         loff_t node = *pos;
1627
1628         for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
1629                 --node;
1630
1631         return pgdat;
1632 }
1633
1634 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1635 {
1636         pg_data_t *pgdat = (pg_data_t *)arg;
1637
1638         (*pos)++;
1639         return pgdat->pgdat_next;
1640 }
1641
1642 static void frag_stop(struct seq_file *m, void *arg)
1643 {
1644 }
1645
1646 /*
1647  * This walks the freelist for each zone. Whilst this is slow, I'd rather
1648  * be slow here than slow down the fast path by keeping stats - mjbligh
1649  */
1650 static int frag_show(struct seq_file *m, void *arg)
1651 {
1652         pg_data_t *pgdat = (pg_data_t *)arg;
1653         struct zone *zone;
1654         struct zone *node_zones = pgdat->node_zones;
1655         unsigned long flags;
1656         int order;
1657
1658         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1659                 if (!zone->present_pages)
1660                         continue;
1661
1662                 spin_lock_irqsave(&zone->lock, flags);
1663                 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1664                 for (order = 0; order < MAX_ORDER; ++order) {
1665                         unsigned long nr_bufs = 0;
1666                         struct list_head *elem;
1667
1668                         list_for_each(elem, &(zone->free_area[order].free_list))
1669                                 ++nr_bufs;
1670                         seq_printf(m, "%6lu ", nr_bufs);
1671                 }
1672                 spin_unlock_irqrestore(&zone->lock, flags);
1673                 seq_putc(m, '\n');
1674         }
1675         return 0;
1676 }
1677
1678 struct seq_operations fragmentation_op = {
1679         .start  = frag_start,
1680         .next   = frag_next,
1681         .stop   = frag_stop,
1682         .show   = frag_show,
1683 };
1684
1685 static char *vmstat_text[] = {
1686         "nr_dirty",
1687         "nr_writeback",
1688         "nr_unstable",
1689         "nr_page_table_pages",
1690         "nr_mapped",
1691         "nr_slab",
1692
1693         "pgpgin",
1694         "pgpgout",
1695         "pswpin",
1696         "pswpout",
1697         "pgalloc_high",
1698
1699         "pgalloc_normal",
1700         "pgalloc_dma",
1701         "pgfree",
1702         "pgactivate",
1703         "pgdeactivate",
1704
1705         "pgfault",
1706         "pgmajfault",
1707         "pgrefill_high",
1708         "pgrefill_normal",
1709         "pgrefill_dma",
1710
1711         "pgsteal_high",
1712         "pgsteal_normal",
1713         "pgsteal_dma",
1714         "pgscan_kswapd_high",
1715         "pgscan_kswapd_normal",
1716
1717         "pgscan_kswapd_dma",
1718         "pgscan_direct_high",
1719         "pgscan_direct_normal",
1720         "pgscan_direct_dma",
1721         "pginodesteal",
1722
1723         "slabs_scanned",
1724         "kswapd_steal",
1725         "kswapd_inodesteal",
1726         "pageoutrun",
1727         "allocstall",
1728
1729         "pgrotated",
1730 };
1731
1732 static void *vmstat_start(struct seq_file *m, loff_t *pos)
1733 {
1734         struct page_state *ps;
1735
1736         if (*pos >= ARRAY_SIZE(vmstat_text))
1737                 return NULL;
1738
1739         ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1740         m->private = ps;
1741         if (!ps)
1742                 return ERR_PTR(-ENOMEM);
1743         get_full_page_state(ps);
1744         ps->pgpgin /= 2;                /* sectors -> kbytes */
1745         ps->pgpgout /= 2;
1746         return (unsigned long *)ps + *pos;
1747 }
1748
1749 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1750 {
1751         (*pos)++;
1752         if (*pos >= ARRAY_SIZE(vmstat_text))
1753                 return NULL;
1754         return (unsigned long *)m->private + *pos;
1755 }
1756
1757 static int vmstat_show(struct seq_file *m, void *arg)
1758 {
1759         unsigned long *l = arg;
1760         unsigned long off = l - (unsigned long *)m->private;
1761
1762         seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
1763         return 0;
1764 }
1765
1766 static void vmstat_stop(struct seq_file *m, void *arg)
1767 {
1768         kfree(m->private);
1769         m->private = NULL;
1770 }
1771
1772 struct seq_operations vmstat_op = {
1773         .start  = vmstat_start,
1774         .next   = vmstat_next,
1775         .stop   = vmstat_stop,
1776         .show   = vmstat_show,
1777 };
1778
1779 #endif /* CONFIG_PROC_FS */
1780
1781 #ifdef CONFIG_HOTPLUG_CPU
1782 static int page_alloc_cpu_notify(struct notifier_block *self,
1783                                  unsigned long action, void *hcpu)
1784 {
1785         int cpu = (unsigned long)hcpu;
1786         long *count;
1787
1788         if (action == CPU_DEAD) {
1789                 /* Drain local pagecache count. */
1790                 count = &per_cpu(nr_pagecache_local, cpu);
1791                 atomic_add(*count, &nr_pagecache);
1792                 *count = 0;
1793                 local_irq_disable();
1794                 __drain_pages(cpu);
1795                 local_irq_enable();
1796         }
1797         return NOTIFY_OK;
1798 }
1799 #endif /* CONFIG_HOTPLUG_CPU */
1800
1801 void __init page_alloc_init(void)
1802 {
1803         hotcpu_notifier(page_alloc_cpu_notify, 0);
1804 }
1805
1806 static unsigned long higherzone_val(struct zone *z, int max_zone,
1807                                         int alloc_type)
1808 {
1809         int z_idx = zone_idx(z);
1810         struct zone *higherzone;
1811         unsigned long pages;
1812
1813         /* there is no higher zone to get a contribution from */
1814         if (z_idx == MAX_NR_ZONES-1)
1815                 return 0;
1816
1817         higherzone = &z->zone_pgdat->node_zones[z_idx+1];
1818
1819         /* We always start with the higher zone's protection value */
1820         pages = higherzone->protection[alloc_type];
1821
1822         /*
1823          * We get a lower-zone-protection contribution only if there are
1824          * pages in the higher zone and if we're not the highest zone
1825          * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
1826          * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
1827          * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
1828          */
1829         if (higherzone->present_pages && z_idx < alloc_type)
1830                 pages += higherzone->pages_low * sysctl_lower_zone_protection;
1831
1832         return pages;
1833 }
1834
1835 /*
1836  * setup_per_zone_protection - called whenver min_free_kbytes or
1837  *      sysctl_lower_zone_protection changes.  Ensures that each zone
1838  *      has a correct pages_protected value, so an adequate number of
1839  *      pages are left in the zone after a successful __alloc_pages().
1840  *
1841  *      This algorithm is way confusing.  I tries to keep the same behavior
1842  *      as we had with the incremental min iterative algorithm.
1843  */
1844 static void setup_per_zone_protection(void)
1845 {
1846         struct pglist_data *pgdat;
1847         struct zone *zones, *zone;
1848         int max_zone;
1849         int i, j;
1850
1851         for_each_pgdat(pgdat) {
1852                 zones = pgdat->node_zones;
1853
1854                 for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
1855                         if (zones[i].present_pages)
1856                                 max_zone = i;
1857
1858                 /*
1859                  * For each of the different allocation types:
1860                  * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
1861                  */
1862                 for (i = 0; i < GFP_ZONETYPES; i++) {
1863                         /*
1864                          * For each of the zones:
1865                          * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
1866                          */
1867                         for (j = MAX_NR_ZONES-1; j >= 0; j--) {
1868                                 zone = &zones[j];
1869
1870                                 /*
1871                                  * We never protect zones that don't have memory
1872                                  * in them (j>max_zone) or zones that aren't in
1873                                  * the zonelists for a certain type of
1874                                  * allocation (j>=i).  We have to assign these
1875                                  * to zero because the lower zones take
1876                                  * contributions from the higher zones.
1877                                  */
1878                                 if (j > max_zone || j >= i) {
1879                                         zone->protection[i] = 0;
1880                                         continue;
1881                                 }
1882                                 /*
1883                                  * The contribution of the next higher zone
1884                                  */
1885                                 zone->protection[i] = higherzone_val(zone,
1886                                                                 max_zone, i);
1887                         }
1888                 }
1889         }
1890 }
1891
1892 /*
1893  * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
1894  *      that the pages_{min,low,high} values for each zone are set correctly
1895  *      with respect to min_free_kbytes.
1896  */
1897 static void setup_per_zone_pages_min(void)
1898 {
1899         unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
1900         unsigned long lowmem_pages = 0;
1901         struct zone *zone;
1902         unsigned long flags;
1903
1904         /* Calculate total number of !ZONE_HIGHMEM pages */
1905         for_each_zone(zone) {
1906                 if (!is_highmem(zone))
1907                         lowmem_pages += zone->present_pages;
1908         }
1909
1910         for_each_zone(zone) {
1911                 spin_lock_irqsave(&zone->lru_lock, flags);
1912                 if (is_highmem(zone)) {
1913                         /*
1914                          * Often, highmem doesn't need to reserve any pages.
1915                          * But the pages_min/low/high values are also used for
1916                          * batching up page reclaim activity so we need a
1917                          * decent value here.
1918                          */
1919                         int min_pages;
1920
1921                         min_pages = zone->present_pages / 1024;
1922                         if (min_pages < SWAP_CLUSTER_MAX)
1923                                 min_pages = SWAP_CLUSTER_MAX;
1924                         if (min_pages > 128)
1925                                 min_pages = 128;
1926                         zone->pages_min = min_pages;
1927                 } else {
1928                         /* if it's a lowmem zone, reserve a number of pages
1929                          * proportionate to the zone's size.
1930                          */
1931                         zone->pages_min = (pages_min * zone->present_pages) /
1932                                            lowmem_pages;
1933                 }
1934
1935                 zone->pages_low = zone->pages_min * 2;
1936                 zone->pages_high = zone->pages_min * 3;
1937                 spin_unlock_irqrestore(&zone->lru_lock, flags);
1938         }
1939 }
1940
1941 /*
1942  * Initialise min_free_kbytes.
1943  *
1944  * For small machines we want it small (128k min).  For large machines
1945  * we want it large (16MB max).  But it is not linear, because network
1946  * bandwidth does not increase linearly with machine size.  We use
1947  *
1948  *      min_free_kbytes = sqrt(lowmem_kbytes)
1949  *
1950  * which yields
1951  *
1952  * 16MB:        128k
1953  * 32MB:        181k
1954  * 64MB:        256k
1955  * 128MB:       362k
1956  * 256MB:       512k
1957  * 512MB:       724k
1958  * 1024MB:      1024k
1959  * 2048MB:      1448k
1960  * 4096MB:      2048k
1961  * 8192MB:      2896k
1962  * 16384MB:     4096k
1963  */
1964 static int __init init_per_zone_pages_min(void)
1965 {
1966         unsigned long lowmem_kbytes;
1967
1968         lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
1969
1970         min_free_kbytes = int_sqrt(lowmem_kbytes);
1971         if (min_free_kbytes < 128)
1972                 min_free_kbytes = 128;
1973         if (min_free_kbytes > 16384)
1974                 min_free_kbytes = 16384;
1975         setup_per_zone_pages_min();
1976         setup_per_zone_protection();
1977         return 0;
1978 }
1979 module_init(init_per_zone_pages_min)
1980
1981 /*
1982  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
1983  *      that we can call two helper functions whenever min_free_kbytes
1984  *      changes.
1985  */
1986 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
1987                 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
1988 {
1989         proc_dointvec(table, write, file, buffer, length, ppos);
1990         setup_per_zone_pages_min();
1991         setup_per_zone_protection();
1992         return 0;
1993 }
1994
1995 /*
1996  * lower_zone_protection_sysctl_handler - just a wrapper around
1997  *      proc_dointvec() so that we can call setup_per_zone_protection()
1998  *      whenever sysctl_lower_zone_protection changes.
1999  */
2000 int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
2001                  struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2002 {
2003         proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2004         setup_per_zone_protection();
2005         return 0;
2006 }
2007
2008 /*
2009  * allocate a large system hash table from bootmem
2010  * - it is assumed that the hash table must contain an exact power-of-2
2011  *   quantity of entries
2012  */
2013 void *__init alloc_large_system_hash(const char *tablename,
2014                                      unsigned long bucketsize,
2015                                      unsigned long numentries,
2016                                      int scale,
2017                                      int consider_highmem,
2018                                      unsigned int *_hash_shift,
2019                                      unsigned int *_hash_mask)
2020 {
2021         unsigned long long max;
2022         unsigned long log2qty, size;
2023         void *table;
2024
2025         /* allow the kernel cmdline to have a say */
2026         if (!numentries) {
2027                 /* round applicable memory size up to nearest megabyte */
2028                 numentries = consider_highmem ? nr_all_pages : nr_kernel_pages;
2029                 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
2030                 numentries >>= 20 - PAGE_SHIFT;
2031                 numentries <<= 20 - PAGE_SHIFT;
2032
2033                 /* limit to 1 bucket per 2^scale bytes of low memory */
2034                 if (scale > PAGE_SHIFT)
2035                         numentries >>= (scale - PAGE_SHIFT);
2036                 else
2037                         numentries <<= (PAGE_SHIFT - scale);
2038         }
2039         /* rounded up to nearest power of 2 in size */
2040         numentries = 1UL << (long_log2(numentries) + 1);
2041
2042         /* limit allocation size to 1/16 total memory */
2043         max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
2044         do_div(max, bucketsize);
2045
2046         if (numentries > max)
2047                 numentries = max;
2048
2049         log2qty = long_log2(numentries);
2050
2051         do {
2052                 size = bucketsize << log2qty;
2053                 table = alloc_bootmem(size);
2054         } while (!table && size > PAGE_SIZE && --log2qty);
2055
2056         if (!table)
2057                 panic("Failed to allocate %s hash table\n", tablename);
2058
2059         printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
2060                tablename,
2061                (1U << log2qty),
2062                long_log2(size) - PAGE_SHIFT,
2063                size);
2064
2065         if (_hash_shift)
2066                 *_hash_shift = log2qty;
2067         if (_hash_mask)
2068                 *_hash_mask = (1 << log2qty) - 1;
2069
2070         return table;
2071 }