mm/page_alloc.c

   1 /*
   2  *  linux/mm/page_alloc.c
   3  *
   4  *  Manages the free list, the system allocates free pages here.
   5  *  Note that kmalloc() lives in slab.c
   6  *
   7  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8  *  Swap reorganised 29.12.95, Stephen Tweedie
   9  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  10  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  11  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  12  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  13  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
  14  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
  15  */
  16
  17 #include <linux/config.h>
  18 #include <linux/stddef.h>
  19 #include <linux/mm.h>
  20 #include <linux/swap.h>
  21 #include <linux/interrupt.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/bootmem.h>
  24 #include <linux/compiler.h>
  25 #include <linux/module.h>
  26 #include <linux/suspend.h>
  27 #include <linux/pagevec.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/slab.h>
  30 #include <linux/notifier.h>
  31 #include <linux/topology.h>
  32 #include <linux/sysctl.h>
  33 #include <linux/cpu.h>
  34
  35 #include <asm/tlbflush.h>
  36
  37 DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
  38 DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS);
  39 struct pglist_data *pgdat_list;
  40 unsigned long totalram_pages;
  41 unsigned long totalhigh_pages;
  42 int nr_swap_pages;
  43 int numnodes = 1;
  44 int sysctl_lower_zone_protection = 0;
  45
  46 EXPORT_SYMBOL(totalram_pages);
  47 EXPORT_SYMBOL(nr_swap_pages);
  48
  49 /*
  50  * Used by page_zone() to look up the address of the struct zone whose
  51  * id is encoded in the upper bits of page->flags
  52  */
  53 struct zone *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
  54 EXPORT_SYMBOL(zone_table);
  55
  56 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
  57 int min_free_kbytes = 1024;
  58
  59 /*
  60  * Temporary debugging check for pages not lying within a given zone.
  61  */
  62 static int bad_range(struct zone *zone, struct page *page)
  63 {
  64         if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
  65                 return 1;
  66         if (page_to_pfn(page) < zone->zone_start_pfn)
  67                 return 1;
  68         if (zone != page_zone(page))
  69                 return 1;
  70         return 0;
  71 }
  72
  73 static void bad_page(const char *function, struct page *page)
  74 {
  75         printk("Bad page state at %s\n", function);
  76         printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n",
  77                 page->flags, page->mapping,
  78                 page_mapped(page), page_count(page));
  79         printk("Backtrace:\n");
  80         dump_stack();
  81         printk("Trying to fix it up, but a reboot is needed\n");
  82         page->flags &= ~(1 << PG_private        |
  83                         1 << PG_locked  |
  84                         1 << PG_lru     |
  85                         1 << PG_active  |
  86                         1 << PG_dirty   |
  87                         1 << PG_writeback);
  88         set_page_count(page, 0);
  89         page->mapping = NULL;
  90 }
  91
  92 #ifndef CONFIG_HUGETLB_PAGE
  93 #define prep_compound_page(page, order) do { } while (0)
  94 #define destroy_compound_page(page, order) do { } while (0)
  95 #else
  96 /*
  97  * Higher-order pages are called "compound pages".  They are structured thusly:
  98  *
  99  * The first PAGE_SIZE page is called the "head page".
 100  *
 101  * The remaining PAGE_SIZE pages are called "tail pages".
 102  *
 103  * All pages have PG_compound set.  All pages have their lru.next pointing at
 104  * the head page (even the head page has this).
 105  *
 106  * The head page's lru.prev, if non-zero, holds the address of the compound
 107  * page's put_page() function.
 108  *
 109  * The order of the allocation is stored in the first tail page's lru.prev.
 110  * This is only for debug at present.  This usage means that zero-order pages
 111  * may not be compound.
 112  */
 113 static void prep_compound_page(struct page *page, unsigned long order)
 114 {
 115         int i;
 116         int nr_pages = 1 << order;
 117
 118         page->lru.prev = NULL;
 119         page[1].lru.prev = (void *)order;
 120         for (i = 0; i < nr_pages; i++) {
 121                 struct page *p = page + i;
 122
 123                 SetPageCompound(p);
 124                 p->lru.next = (void *)page;
 125         }
 126 }
 127
 128 static void destroy_compound_page(struct page *page, unsigned long order)
 129 {
 130         int i;
 131         int nr_pages = 1 << order;
 132
 133         if (page[1].lru.prev != (void *)order)
 134                 bad_page(__FUNCTION__, page);
 135
 136         for (i = 0; i < nr_pages; i++) {
 137                 struct page *p = page + i;
 138
 139                 if (!PageCompound(p))
 140                         bad_page(__FUNCTION__, page);
 141                 if (p->lru.next != (void *)page)
 142                         bad_page(__FUNCTION__, page);
 143                 ClearPageCompound(p);
 144         }
 145 }
 146 #endif          /* CONFIG_HUGETLB_PAGE */
 147
 148 /*
 149  * Freeing function for a buddy system allocator.
 150  *
 151  * The concept of a buddy system is to maintain direct-mapped table
 152  * (containing bit values) for memory blocks of various "orders".
 153  * The bottom level table contains the map for the smallest allocatable
 154  * units of memory (here, pages), and each level above it describes
 155  * pairs of units from the levels below, hence, "buddies".
 156  * At a high level, all that happens here is marking the table entry
 157  * at the bottom level available, and propagating the changes upward
 158  * as necessary, plus some accounting needed to play nicely with other
 159  * parts of the VM system.
 160  * At each level, we keep one bit for each pair of blocks, which
 161  * is set to 1 iff only one of the pair is allocated.  So when we
 162  * are allocating or freeing one, we can derive the state of the
 163  * other.  That is, if we allocate a small block, and both were
 164  * free, the remainder of the region must be split into blocks.
 165  * If a block is freed, and its buddy is also free, then this
 166  * triggers coalescing into a block of larger size.
 167  *
 168  * -- wli
 169  */
 170
 171 static inline void __free_pages_bulk (struct page *page, struct page *base,
 172                 struct zone *zone, struct free_area *area, unsigned long mask,
 173                 unsigned int order)
 174 {
 175         unsigned long page_idx, index;
 176
 177         if (order)
 178                 destroy_compound_page(page, order);
 179         page_idx = page - base;
 180         if (page_idx & ~mask)
 181                 BUG();
 182         index = page_idx >> (1 + order);
 183
 184         zone->free_pages -= mask;
 185         while (mask + (1 << (MAX_ORDER-1))) {
 186                 struct page *buddy1, *buddy2;
 187
 188                 BUG_ON(area >= zone->free_area + MAX_ORDER);
 189                 if (!__test_and_change_bit(index, area->map))
 190                         /*
 191                          * the buddy page is still allocated.
 192                          */
 193                         break;
 194                 /*
 195                  * Move the buddy up one level.
 196                  * This code is taking advantage of the identity:
 197                  *      -mask = 1+~mask
 198                  */
 199                 buddy1 = base + (page_idx ^ -mask);
 200                 buddy2 = base + page_idx;
 201                 BUG_ON(bad_range(zone, buddy1));
 202                 BUG_ON(bad_range(zone, buddy2));
 203                 list_del(&buddy1->list);
 204                 mask <<= 1;
 205                 area++;
 206                 index >>= 1;
 207                 page_idx &= mask;
 208         }
 209         list_add(&(base + page_idx)->list, &area->free_list);
 210 }
 211
 212 static inline void free_pages_check(const char *function, struct page *page)
 213 {
 214         if (    page_mapped(page) ||
 215                 page->mapping != NULL ||
 216                 page_count(page) != 0 ||
 217                 (page->flags & (
 218                         1 << PG_lru     |
 219                         1 << PG_private |
 220                         1 << PG_locked  |
 221                         1 << PG_active  |
 222                         1 << PG_reclaim |
 223                         1 << PG_writeback )))
 224                 bad_page(function, page);
 225         if (PageDirty(page))
 226                 ClearPageDirty(page);
 227 }
 228
 229 /*
 230  * Frees a list of pages.
 231  * Assumes all pages on list are in same zone, and of same order.
 232  * count is the number of pages to free, or 0 for all on the list.
 233  *
 234  * If the zone was previously in an "all pages pinned" state then look to
 235  * see if this freeing clears that state.
 236  *
 237  * And clear the zone's pages_scanned counter, to hold off the "all pages are
 238  * pinned" detection logic.
 239  */
 240 static int
 241 free_pages_bulk(struct zone *zone, int count,
 242                 struct list_head *list, unsigned int order)
 243 {
 244         unsigned long mask, flags;
 245         struct free_area *area;
 246         struct page *base, *page = NULL;
 247         int ret = 0;
 248
 249         mask = (~0UL) << order;
 250         base = zone->zone_mem_map;
 251         area = zone->free_area + order;
 252         spin_lock_irqsave(&zone->lock, flags);
 253         zone->all_unreclaimable = 0;
 254         zone->pages_scanned = 0;
 255         while (!list_empty(list) && count--) {
 256                 page = list_entry(list->prev, struct page, list);
 257                 /* have to delete it as __free_pages_bulk list manipulates */
 258                 list_del(&page->list);
 259                 __free_pages_bulk(page, base, zone, area, mask, order);
 260                 ret++;
 261         }
 262         spin_unlock_irqrestore(&zone->lock, flags);
 263         return ret;
 264 }
 265
 266 void __free_pages_ok(struct page *page, unsigned int order)
 267 {
 268         LIST_HEAD(list);
 269
 270         mod_page_state(pgfree, 1 << order);
 271         free_pages_check(__FUNCTION__, page);
 272         list_add(&page->list, &list);
 273         kernel_map_pages(page, 1<<order, 0);
 274         free_pages_bulk(page_zone(page), 1, &list, order);
 275 }
 276
 277 #define MARK_USED(index, order, area) \
 278         __change_bit((index) >> (1+(order)), (area)->map)
 279
 280 static inline struct page *
 281 expand(struct zone *zone, struct page *page,
 282          unsigned long index, int low, int high, struct free_area *area)
 283 {
 284         unsigned long size = 1 << high;
 285
 286         while (high > low) {
 287                 BUG_ON(bad_range(zone, page));
 288                 area--;
 289                 high--;
 290                 size >>= 1;
 291                 list_add(&page->list, &area->free_list);
 292                 MARK_USED(index, high, area);
 293                 index += size;
 294                 page += size;
 295         }
 296         return page;
 297 }
 298
 299 static inline void set_page_refs(struct page *page, int order)
 300 {
 301 #ifdef CONFIG_MMU
 302         set_page_count(page, 1);
 303 #else
 304         int i;
 305
 306         /*
 307          * We need to reference all the pages for this order, otherwise if
 308          * anyone accesses one of the pages with (get/put) it will be freed.
 309          */
 310         for (i = 0; i < (1 << order); i++)
 311                 set_page_count(page+i, 1);
 312 #endif /* CONFIG_MMU */
 313 }
 314
 315 /*
 316  * This page is about to be returned from the page allocator
 317  */
 318 static void prep_new_page(struct page *page, int order)
 319 {
 320         if (page->mapping || page_mapped(page) ||
 321             (page->flags & (
 322                         1 << PG_private |
 323                         1 << PG_locked  |
 324                         1 << PG_lru     |
 325                         1 << PG_active  |
 326                         1 << PG_dirty   |
 327                         1 << PG_reclaim |
 328                         1 << PG_writeback )))
 329                 bad_page(__FUNCTION__, page);
 330
 331         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 332                         1 << PG_referenced | 1 << PG_arch_1 |
 333                         1 << PG_checked | 1 << PG_mappedtodisk);
 334         set_page_refs(page, order);
 335 }
 336
 337 /*
 338  * Do the hard work of removing an element from the buddy allocator.
 339  * Call me with the zone->lock already held.
 340  */
 341 static struct page *__rmqueue(struct zone *zone, unsigned int order)
 342 {
 343         struct free_area * area;
 344         unsigned int current_order;
 345         struct page *page;
 346         unsigned int index;
 347
 348         for (current_order = order; current_order < MAX_ORDER; ++current_order) {
 349                 area = zone->free_area + current_order;
 350                 if (list_empty(&area->free_list))
 351                         continue;
 352
 353                 page = list_entry(area->free_list.next, struct page, list);
 354                 list_del(&page->list);
 355                 index = page - zone->zone_mem_map;
 356                 if (current_order != MAX_ORDER-1)
 357                         MARK_USED(index, current_order, area);
 358                 zone->free_pages -= 1UL << order;
 359                 return expand(zone, page, index, order, current_order, area);
 360         }
 361
 362         return NULL;
 363 }
 364
 365 /*
 366  * Obtain a specified number of elements from the buddy allocator, all under
 367  * a single hold of the lock, for efficiency.  Add them to the supplied list.
 368  * Returns the number of new pages which were placed at *list.
 369  */
 370 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 371                         unsigned long count, struct list_head *list)
 372 {
 373         unsigned long flags;
 374         int i;
 375         int allocated = 0;
 376         struct page *page;
 377
 378         spin_lock_irqsave(&zone->lock, flags);
 379         for (i = 0; i < count; ++i) {
 380                 page = __rmqueue(zone, order);
 381                 if (page == NULL)
 382                         break;
 383                 allocated++;
 384                 list_add_tail(&page->list, list);
 385         }
 386         spin_unlock_irqrestore(&zone->lock, flags);
 387         return allocated;
 388 }
 389
 390 #ifdef CONFIG_SOFTWARE_SUSPEND
 391 int is_head_of_free_region(struct page *page)
 392 {
 393         struct zone *zone = page_zone(page);
 394         unsigned long flags;
 395         int order;
 396         struct list_head *curr;
 397
 398         /*
 399          * Should not matter as we need quiescent system for
 400          * suspend anyway, but...
 401          */
 402         spin_lock_irqsave(&zone->lock, flags);
 403         for (order = MAX_ORDER - 1; order >= 0; --order)
 404                 list_for_each(curr, &zone->free_area[order].free_list)
 405                         if (page == list_entry(curr, struct page, list)) {
 406                                 spin_unlock_irqrestore(&zone->lock, flags);
 407                                 return 1 << order;
 408                         }
 409         spin_unlock_irqrestore(&zone->lock, flags);
 410         return 0;
 411 }
 412
 413 /*
 414  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 415  */
 416 void drain_local_pages(void)
 417 {
 418         unsigned long flags;
 419         struct zone *zone;
 420         int i;
 421
 422         local_irq_save(flags);
 423         for_each_zone(zone) {
 424                 struct per_cpu_pageset *pset;
 425
 426                 pset = &zone->pageset[smp_processor_id()];
 427                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 428                         struct per_cpu_pages *pcp;
 429
 430                         pcp = &pset->pcp[i];
 431                         pcp->count -= free_pages_bulk(zone, pcp->count,
 432                                                 &pcp->list, 0);
 433                 }
 434         }
 435         local_irq_restore(flags);
 436 }
 437 #endif /* CONFIG_SOFTWARE_SUSPEND */
 438
 439 /*
 440  * Free a 0-order page
 441  */
 442 static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
 443 static void free_hot_cold_page(struct page *page, int cold)
 444 {
 445         struct zone *zone = page_zone(page);
 446         struct per_cpu_pages *pcp;
 447         unsigned long flags;
 448
 449         kernel_map_pages(page, 1, 0);
 450         inc_page_state(pgfree);
 451         free_pages_check(__FUNCTION__, page);
 452         pcp = &zone->pageset[get_cpu()].pcp[cold];
 453         local_irq_save(flags);
 454         if (pcp->count >= pcp->high)
 455                 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 456         list_add(&page->list, &pcp->list);
 457         pcp->count++;
 458         local_irq_restore(flags);
 459         put_cpu();
 460 }
 461
 462 void free_hot_page(struct page *page)
 463 {
 464         free_hot_cold_page(page, 0);
 465 }
 466
 467 void free_cold_page(struct page *page)
 468 {
 469         free_hot_cold_page(page, 1);
 470 }
 471
 472 /*
 473  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 474  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 475  * or two.
 476  */
 477
 478 static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
 479 {
 480         unsigned long flags;
 481         struct page *page = NULL;
 482
 483         if (order == 0) {
 484                 struct per_cpu_pages *pcp;
 485
 486                 pcp = &zone->pageset[get_cpu()].pcp[cold];
 487                 local_irq_save(flags);
 488                 if (pcp->count <= pcp->low)
 489                         pcp->count += rmqueue_bulk(zone, 0,
 490                                                 pcp->batch, &pcp->list);
 491                 if (pcp->count) {
 492                         page = list_entry(pcp->list.next, struct page, list);
 493                         list_del(&page->list);
 494                         pcp->count--;
 495                 }
 496                 local_irq_restore(flags);
 497                 put_cpu();
 498         }
 499
 500         if (page == NULL) {
 501                 spin_lock_irqsave(&zone->lock, flags);
 502                 page = __rmqueue(zone, order);
 503                 spin_unlock_irqrestore(&zone->lock, flags);
 504                 if (order && page)
 505                         prep_compound_page(page, order);
 506         }
 507
 508         if (page != NULL) {
 509                 BUG_ON(bad_range(zone, page));
 510                 mod_page_state(pgalloc, 1 << order);
 511                 prep_new_page(page, order);
 512         }
 513         return page;
 514 }
 515
 516 /*
 517  * This is the 'heart' of the zoned buddy allocator.
 518  *
 519  * Herein lies the mysterious "incremental min".  That's the
 520  *
 521  *      min += z->pages_low;
 522  *
 523  * thing.  The intent here is to provide additional protection to low zones for
 524  * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
 525  * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
 526  * request.  This preserves additional space in those lower zones for requests
 527  * which really do need memory from those zones.  It means that on a decent
 528  * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
 529  * zone untouched.
 530  */
 531 struct page *
 532 __alloc_pages(unsigned int gfp_mask, unsigned int order,
 533                 struct zonelist *zonelist)
 534 {
 535         const int wait = gfp_mask & __GFP_WAIT;
 536         unsigned long min;
 537         struct zone **zones, *classzone;
 538         struct page *page;
 539         int i;
 540         int cold;
 541         int do_retry;
 542         struct reclaim_state reclaim_state;
 543
 544         if (wait)
 545                 might_sleep();
 546
 547         cold = 0;
 548         if (gfp_mask & __GFP_COLD)
 549                 cold = 1;
 550
 551         zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 552         classzone = zones[0];
 553         if (classzone == NULL)    /* no zones in the zonelist */
 554                 return NULL;
 555
 556         /* Go through the zonelist once, looking for a zone with enough free */
 557         min = 1UL << order;
 558         for (i = 0; zones[i] != NULL; i++) {
 559                 struct zone *z = zones[i];
 560
 561                 min += z->pages_low;
 562                 if (z->free_pages >= min ||
 563                                 (!wait && z->free_pages >= z->pages_high)) {
 564                         page = buffered_rmqueue(z, order, cold);
 565                         if (page)
 566                                 goto got_pg;
 567                 }
 568                 min += z->pages_low * sysctl_lower_zone_protection;
 569         }
 570
 571         /* we're somewhat low on memory, failed to find what we needed */
 572         for (i = 0; zones[i] != NULL; i++)
 573                 wakeup_kswapd(zones[i]);
 574
 575         /* Go through the zonelist again, taking __GFP_HIGH into account */
 576         min = 1UL << order;
 577         for (i = 0; zones[i] != NULL; i++) {
 578                 unsigned long local_min;
 579                 struct zone *z = zones[i];
 580
 581                 local_min = z->pages_min;
 582                 if (gfp_mask & __GFP_HIGH)
 583                         local_min >>= 2;
 584                 min += local_min;
 585                 if (z->free_pages >= min ||
 586                                 (!wait && z->free_pages >= z->pages_high)) {
 587                         page = buffered_rmqueue(z, order, cold);
 588                         if (page)
 589                                 goto got_pg;
 590                 }
 591                 min += local_min * sysctl_lower_zone_protection;
 592         }
 593
 594         /* here we're in the low on memory slow path */
 595
 596 rebalance:
 597         if ((current->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
 598                 /* go through the zonelist yet again, ignoring mins */
 599                 for (i = 0; zones[i] != NULL; i++) {
 600                         struct zone *z = zones[i];
 601
 602                         page = buffered_rmqueue(z, order, cold);
 603                         if (page)
 604                                 goto got_pg;
 605                 }
 606                 goto nopage;
 607         }
 608
 609         /* Atomic allocations - we can't balance anything */
 610         if (!wait)
 611                 goto nopage;
 612
 613         current->flags |= PF_MEMALLOC;
 614         reclaim_state.reclaimed_slab = 0;
 615         current->reclaim_state = &reclaim_state;
 616
 617         try_to_free_pages(classzone, gfp_mask, order);
 618
 619         current->reclaim_state = NULL;
 620         current->flags &= ~PF_MEMALLOC;
 621
 622         /* go through the zonelist yet one more time */
 623         min = 1UL << order;
 624         for (i = 0; zones[i] != NULL; i++) {
 625                 struct zone *z = zones[i];
 626
 627                 min += z->pages_min;
 628                 if (z->free_pages >= min ||
 629                                 (!wait && z->free_pages >= z->pages_high)) {
 630                         page = buffered_rmqueue(z, order, cold);
 631                         if (page)
 632                                 goto got_pg;
 633                 }
 634                 min += z->pages_low * sysctl_lower_zone_protection;
 635         }
 636
 637         /*
 638          * Don't let big-order allocations loop unless the caller explicitly
 639          * requests that.  Wait for some write requests to complete then retry.
 640          *
 641          * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
 642          * may not be true in other implementations.
 643          */
 644         do_retry = 0;
 645         if (!(gfp_mask & __GFP_NORETRY)) {
 646                 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
 647                         do_retry = 1;
 648                 if (gfp_mask & __GFP_NOFAIL)
 649                         do_retry = 1;
 650         }
 651         if (do_retry) {
 652                 blk_congestion_wait(WRITE, HZ/50);
 653                 goto rebalance;
 654         }
 655
 656 nopage:
 657         if (!(gfp_mask & __GFP_NOWARN)) {
 658                 printk("%s: page allocation failure."
 659                         " order:%d, mode:0x%x\n",
 660                         current->comm, order, gfp_mask);
 661         }
 662         return NULL;
 663 got_pg:
 664         kernel_map_pages(page, 1 << order, 1);
 665         return page;
 666 }
 667
 668 /*
 669  * Common helper functions.
 670  */
 671 unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
 672 {
 673         struct page * page;
 674
 675         page = alloc_pages(gfp_mask, order);
 676         if (!page)
 677                 return 0;
 678         return (unsigned long) page_address(page);
 679 }
 680
 681 unsigned long get_zeroed_page(unsigned int gfp_mask)
 682 {
 683         struct page * page;
 684
 685         /*
 686          * get_zeroed_page() returns a 32-bit address, which cannot represent
 687          * a highmem page
 688          */
 689         BUG_ON(gfp_mask & __GFP_HIGHMEM);
 690
 691         page = alloc_pages(gfp_mask, 0);
 692         if (page) {
 693                 void *address = page_address(page);
 694                 clear_page(address);
 695                 return (unsigned long) address;
 696         }
 697         return 0;
 698 }
 699
 700 void __pagevec_free(struct pagevec *pvec)
 701 {
 702         int i = pagevec_count(pvec);
 703
 704         while (--i >= 0)
 705                 free_hot_cold_page(pvec->pages[i], pvec->cold);
 706 }
 707
 708 void __free_pages(struct page *page, unsigned int order)
 709 {
 710         if (!PageReserved(page) && put_page_testzero(page)) {
 711                 if (order == 0)
 712                         free_hot_page(page);
 713                 else
 714                         __free_pages_ok(page, order);
 715         }
 716 }
 717
 718 void free_pages(unsigned long addr, unsigned int order)
 719 {
 720         if (addr != 0) {
 721                 BUG_ON(!virt_addr_valid(addr));
 722                 __free_pages(virt_to_page(addr), order);
 723         }
 724 }
 725
 726 /*
 727  * Total amount of free (allocatable) RAM:
 728  */
 729 unsigned int nr_free_pages(void)
 730 {
 731         unsigned int sum = 0;
 732         struct zone *zone;
 733
 734         for_each_zone(zone)
 735                 sum += zone->free_pages;
 736
 737         return sum;
 738 }
 739 EXPORT_SYMBOL(nr_free_pages);
 740
 741 unsigned int nr_used_zone_pages(void)
 742 {
 743         unsigned int pages = 0;
 744         struct zone *zone;
 745
 746         for_each_zone(zone)
 747                 pages += zone->nr_active + zone->nr_inactive;
 748
 749         return pages;
 750 }
 751
 752 #ifdef CONFIG_NUMA
 753 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 754 {
 755         unsigned int i, sum = 0;
 756
 757         for (i = 0; i < MAX_NR_ZONES; i++)
 758                 sum += pgdat->node_zones[i].free_pages;
 759
 760         return sum;
 761 }
 762 #endif
 763
 764 static unsigned int nr_free_zone_pages(int offset)
 765 {
 766         pg_data_t *pgdat;
 767         unsigned int sum = 0;
 768
 769         for_each_pgdat(pgdat) {
 770                 struct zonelist *zonelist = pgdat->node_zonelists + offset;
 771                 struct zone **zonep = zonelist->zones;
 772                 struct zone *zone;
 773
 774                 for (zone = *zonep++; zone; zone = *zonep++) {
 775                         unsigned long size = zone->present_pages;
 776                         unsigned long high = zone->pages_high;
 777                         if (size > high)
 778                                 sum += size - high;
 779                 }
 780         }
 781
 782         return sum;
 783 }
 784
 785 /*
 786  * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 787  */
 788 unsigned int nr_free_buffer_pages(void)
 789 {
 790         return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
 791 }
 792
 793 /*
 794  * Amount of free RAM allocatable within all zones
 795  */
 796 unsigned int nr_free_pagecache_pages(void)
 797 {
 798         return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
 799 }
 800
 801 #ifdef CONFIG_HIGHMEM
 802 unsigned int nr_free_highpages (void)
 803 {
 804         pg_data_t *pgdat;
 805         unsigned int pages = 0;
 806
 807         for_each_pgdat(pgdat)
 808                 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 809
 810         return pages;
 811 }
 812 #endif
 813
 814 #ifdef CONFIG_NUMA
 815 static void show_node(struct zone *zone)
 816 {
 817         printk("Node %d ", zone->zone_pgdat->node_id);
 818 }
 819 #else
 820 #define show_node(zone) do { } while (0)
 821 #endif
 822
 823 /*
 824  * Accumulate the page_state information across all CPUs.
 825  * The result is unavoidably approximate - it can change
 826  * during and after execution of this function.
 827  */
 828 DEFINE_PER_CPU(struct page_state, page_states) = {0};
 829 EXPORT_PER_CPU_SYMBOL(page_states);
 830
 831 atomic_t nr_pagecache = ATOMIC_INIT(0);
 832 EXPORT_SYMBOL(nr_pagecache);
 833 #ifdef CONFIG_SMP
 834 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 835 #endif
 836
 837 void __get_page_state(struct page_state *ret, int nr)
 838 {
 839         int cpu = 0;
 840
 841         memset(ret, 0, sizeof(*ret));
 842         while (cpu < NR_CPUS) {
 843                 unsigned long *in, *out, off;
 844
 845                 if (!cpu_online(cpu)) {
 846                         cpu++;
 847                         continue;
 848                 }
 849
 850                 in = (unsigned long *)&per_cpu(page_states, cpu);
 851                 cpu++;
 852                 if (cpu < NR_CPUS && cpu_online(cpu))
 853                         prefetch(&per_cpu(page_states, cpu));
 854                 out = (unsigned long *)ret;
 855                 for (off = 0; off < nr; off++)
 856                         *out++ += *in++;
 857         }
 858 }
 859
 860 void get_page_state(struct page_state *ret)
 861 {
 862         int nr;
 863
 864         nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
 865         nr /= sizeof(unsigned long);
 866
 867         __get_page_state(ret, nr + 1);
 868 }
 869
 870 void get_full_page_state(struct page_state *ret)
 871 {
 872         __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
 873 }
 874
 875 void get_zone_counts(unsigned long *active,
 876                 unsigned long *inactive, unsigned long *free)
 877 {
 878         struct zone *zone;
 879
 880         *active = 0;
 881         *inactive = 0;
 882         *free = 0;
 883         for_each_zone(zone) {
 884                 *active += zone->nr_active;
 885                 *inactive += zone->nr_inactive;
 886                 *free += zone->free_pages;
 887         }
 888 }
 889
 890 void si_meminfo(struct sysinfo *val)
 891 {
 892         val->totalram = totalram_pages;
 893         val->sharedram = 0;
 894         val->freeram = nr_free_pages();
 895         val->bufferram = nr_blockdev_pages();
 896 #ifdef CONFIG_HIGHMEM
 897         val->totalhigh = totalhigh_pages;
 898         val->freehigh = nr_free_highpages();
 899 #else
 900         val->totalhigh = 0;
 901         val->freehigh = 0;
 902 #endif
 903         val->mem_unit = PAGE_SIZE;
 904 }
 905
 906 #ifdef CONFIG_NUMA
 907 void si_meminfo_node(struct sysinfo *val, int nid)
 908 {
 909         pg_data_t *pgdat = NODE_DATA(nid);
 910
 911         val->totalram = pgdat->node_present_pages;
 912         val->freeram = nr_free_pages_pgdat(pgdat);
 913         val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
 914         val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 915         val->mem_unit = PAGE_SIZE;
 916 }
 917 #endif
 918
 919 #define K(x) ((x) << (PAGE_SHIFT-10))
 920
 921 /*
 922  * Show free area list (used inside shift_scroll-lock stuff)
 923  * We also calculate the percentage fragmentation. We do this by counting the
 924  * memory on each free list with the exception of the first item on the list.
 925  */
 926 void show_free_areas(void)
 927 {
 928         struct page_state ps;
 929         int cpu, temperature;
 930         unsigned long active;
 931         unsigned long inactive;
 932         unsigned long free;
 933         struct zone *zone;
 934
 935         for_each_zone(zone) {
 936                 show_node(zone);
 937                 printk("%s per-cpu:", zone->name);
 938
 939                 if (!zone->present_pages) {
 940                         printk(" empty\n");
 941                         continue;
 942                 } else
 943                         printk("\n");
 944
 945                 for (cpu = 0; cpu < NR_CPUS; ++cpu) {
 946                         struct per_cpu_pageset *pageset = zone->pageset + cpu;
 947                         for (temperature = 0; temperature < 2; temperature++)
 948                                 printk("cpu %d %s: low %d, high %d, batch %d\n",
 949                                         cpu,
 950                                         temperature ? "cold" : "hot",
 951                                         pageset->pcp[temperature].low,
 952                                         pageset->pcp[temperature].high,
 953                                         pageset->pcp[temperature].batch);
 954                 }
 955         }
 956
 957         get_page_state(&ps);
 958         get_zone_counts(&active, &inactive, &free);
 959
 960         printk("\nFree pages: %11ukB (%ukB HighMem)\n",
 961                 K(nr_free_pages()),
 962                 K(nr_free_highpages()));
 963
 964         printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
 965                 "unstable:%lu free:%u\n",
 966                 active,
 967                 inactive,
 968                 ps.nr_dirty,
 969                 ps.nr_writeback,
 970                 ps.nr_unstable,
 971                 nr_free_pages());
 972
 973         for_each_zone(zone) {
 974                 show_node(zone);
 975                 printk("%s"
 976                         " free:%lukB"
 977                         " min:%lukB"
 978                         " low:%lukB"
 979                         " high:%lukB"
 980                         " active:%lukB"
 981                         " inactive:%lukB"
 982                         "\n",
 983                         zone->name,
 984                         K(zone->free_pages),
 985                         K(zone->pages_min),
 986                         K(zone->pages_low),
 987                         K(zone->pages_high),
 988                         K(zone->nr_active),
 989                         K(zone->nr_inactive)
 990                         );
 991         }
 992
 993         for_each_zone(zone) {
 994                 struct list_head *elem;
 995                 unsigned long nr, flags, order, total = 0;
 996
 997                 show_node(zone);
 998                 printk("%s: ", zone->name);
 999                 if (!zone->present_pages) {
1000                         printk("empty\n");
1001                         continue;
1002                 }
1003
1004                 spin_lock_irqsave(&zone->lock, flags);
1005                 for (order = 0; order < MAX_ORDER; order++) {
1006                         nr = 0;
1007                         list_for_each(elem, &zone->free_area[order].free_list)
1008                                 ++nr;
1009                         total += nr << order;
1010                         printk("%lu*%lukB ", nr, K(1UL) << order);
1011                 }
1012                 spin_unlock_irqrestore(&zone->lock, flags);
1013                 printk("= %lukB\n", K(total));
1014         }
1015
1016         show_swap_cache_info();
1017 }
1018
1019 /*
1020  * Builds allocation fallback zone lists.
1021  */
1022 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
1023 {
1024         switch (k) {
1025                 struct zone *zone;
1026         default:
1027                 BUG();
1028         case ZONE_HIGHMEM:
1029                 zone = pgdat->node_zones + ZONE_HIGHMEM;
1030                 if (zone->present_pages) {
1031 #ifndef CONFIG_HIGHMEM
1032                         BUG();
1033 #endif
1034                         zonelist->zones[j++] = zone;
1035                 }
1036         case ZONE_NORMAL:
1037                 zone = pgdat->node_zones + ZONE_NORMAL;
1038                 if (zone->present_pages)
1039                         zonelist->zones[j++] = zone;
1040         case ZONE_DMA:
1041                 zone = pgdat->node_zones + ZONE_DMA;
1042                 if (zone->present_pages)
1043                         zonelist->zones[j++] = zone;
1044         }
1045
1046         return j;
1047 }
1048
1049 static void __init build_zonelists(pg_data_t *pgdat)
1050 {
1051         int i, j, k, node, local_node;
1052
1053         local_node = pgdat->node_id;
1054         printk("Building zonelist for node : %d\n", local_node);
1055         for (i = 0; i < MAX_NR_ZONES; i++) {
1056                 struct zonelist *zonelist;
1057
1058                 zonelist = pgdat->node_zonelists + i;
1059                 memset(zonelist, 0, sizeof(*zonelist));
1060
1061                 j = 0;
1062                 k = ZONE_NORMAL;
1063                 if (i & __GFP_HIGHMEM)
1064                         k = ZONE_HIGHMEM;
1065                 if (i & __GFP_DMA)
1066                         k = ZONE_DMA;
1067
1068                 j = build_zonelists_node(pgdat, zonelist, j, k);
1069                 /*
1070                  * Now we build the zonelist so that it contains the zones
1071                  * of all the other nodes.
1072                  * We don't want to pressure a particular node, so when
1073                  * building the zones for node N, we make sure that the
1074                  * zones coming right after the local ones are those from
1075                  * node N+1 (modulo N)
1076                  */
1077                 for (node = local_node + 1; node < numnodes; node++)
1078                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1079                 for (node = 0; node < local_node; node++)
1080                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1081
1082                 zonelist->zones[j++] = NULL;
1083         }
1084 }
1085
1086 void __init build_all_zonelists(void)
1087 {
1088         int i;
1089
1090         for(i = 0 ; i < numnodes ; i++)
1091                 build_zonelists(NODE_DATA(i));
1092 }
1093
1094 /*
1095  * Helper functions to size the waitqueue hash table.
1096  * Essentially these want to choose hash table sizes sufficiently
1097  * large so that collisions trying to wait on pages are rare.
1098  * But in fact, the number of active page waitqueues on typical
1099  * systems is ridiculously low, less than 200. So this is even
1100  * conservative, even though it seems large.
1101  *
1102  * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1103  * waitqueues, i.e. the size of the waitq table given the number of pages.
1104  */
1105 #define PAGES_PER_WAITQUEUE     256
1106
1107 static inline unsigned long wait_table_size(unsigned long pages)
1108 {
1109         unsigned long size = 1;
1110
1111         pages /= PAGES_PER_WAITQUEUE;
1112
1113         while (size < pages)
1114                 size <<= 1;
1115
1116         /*
1117          * Once we have dozens or even hundreds of threads sleeping
1118          * on IO we've got bigger problems than wait queue collision.
1119          * Limit the size of the wait table to a reasonable size.
1120          */
1121         size = min(size, 4096UL);
1122
1123         return max(size, 4UL);
1124 }
1125
1126 /*
1127  * This is an integer logarithm so that shifts can be used later
1128  * to extract the more random high bits from the multiplicative
1129  * hash function before the remainder is taken.
1130  */
1131 static inline unsigned long wait_table_bits(unsigned long size)
1132 {
1133         return ffz(~size);
1134 }
1135
1136 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1137
1138 static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1139                 unsigned long *zones_size, unsigned long *zholes_size)
1140 {
1141         unsigned long realtotalpages, totalpages = 0;
1142         int i;
1143
1144         for (i = 0; i < MAX_NR_ZONES; i++)
1145                 totalpages += zones_size[i];
1146         pgdat->node_spanned_pages = totalpages;
1147
1148         realtotalpages = totalpages;
1149         if (zholes_size)
1150                 for (i = 0; i < MAX_NR_ZONES; i++)
1151                         realtotalpages -= zholes_size[i];
1152         pgdat->node_present_pages = realtotalpages;
1153         printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1154 }
1155
1156 /*
1157  * Get space for the valid bitmap.
1158  */
1159 static void __init calculate_zone_bitmap(struct pglist_data *pgdat,
1160                 unsigned long *zones_size)
1161 {
1162         unsigned long size = 0;
1163         int i;
1164
1165         for (i = 0; i < MAX_NR_ZONES; i++)
1166                 size += zones_size[i];
1167         size = LONG_ALIGN((size + 7) >> 3);
1168         if (size) {
1169                 pgdat->valid_addr_bitmap =
1170                         (unsigned long *)alloc_bootmem_node(pgdat, size);
1171                 memset(pgdat->valid_addr_bitmap, 0, size);
1172         }
1173 }
1174
1175 /*
1176  * Initially all pages are reserved - free ones are freed
1177  * up by free_all_bootmem() once the early boot process is
1178  * done. Non-atomic initialization, single-pass.
1179  */
1180 void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
1181                 unsigned long zone, unsigned long start_pfn)
1182 {
1183         struct page *page;
1184
1185         for (page = start; page < (start + size); page++) {
1186                 set_page_zone(page, nid * MAX_NR_ZONES + zone);
1187                 set_page_count(page, 0);
1188                 SetPageReserved(page);
1189                 INIT_LIST_HEAD(&page->list);
1190 #ifdef WANT_PAGE_VIRTUAL
1191                 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1192                 if (zone != ZONE_HIGHMEM)
1193                         set_page_address(page, __va(start_pfn << PAGE_SHIFT));
1194 #endif
1195                 start_pfn++;
1196         }
1197 }
1198
1199 #ifndef __HAVE_ARCH_MEMMAP_INIT
1200 #define memmap_init(start, size, nid, zone, start_pfn) \
1201         memmap_init_zone((start), (size), (nid), (zone), (start_pfn))
1202 #endif
1203
1204 /*
1205  * Set up the zone data structures:
1206  *   - mark all pages reserved
1207  *   - mark all memory queues empty
1208  *   - clear the memory bitmaps
1209  */
1210 static void __init free_area_init_core(struct pglist_data *pgdat,
1211                 unsigned long *zones_size, unsigned long *zholes_size)
1212 {
1213         unsigned long i, j;
1214         const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
1215         int cpu, nid = pgdat->node_id;
1216         struct page *lmem_map = pgdat->node_mem_map;
1217         unsigned long zone_start_pfn = pgdat->node_start_pfn;
1218
1219         pgdat->nr_zones = 0;
1220         init_waitqueue_head(&pgdat->kswapd_wait);
1221
1222         for (j = 0; j < MAX_NR_ZONES; j++) {
1223                 struct zone *zone = pgdat->node_zones + j;
1224                 unsigned long size, realsize;
1225                 unsigned long batch;
1226
1227                 zone_table[nid * MAX_NR_ZONES + j] = zone;
1228                 realsize = size = zones_size[j];
1229                 if (zholes_size)
1230                         realsize -= zholes_size[j];
1231
1232                 zone->spanned_pages = size;
1233                 zone->present_pages = realsize;
1234                 zone->name = zone_names[j];
1235                 spin_lock_init(&zone->lock);
1236                 spin_lock_init(&zone->lru_lock);
1237                 zone->zone_pgdat = pgdat;
1238                 zone->free_pages = 0;
1239
1240                 /*
1241                  * The per-cpu-pages pools are set to around 1000th of the
1242                  * size of the zone.  But no more than 1/4 of a meg - there's
1243                  * no point in going beyond the size of L2 cache.
1244                  *
1245                  * OK, so we don't know how big the cache is.  So guess.
1246                  */
1247                 batch = zone->present_pages / 1024;
1248                 if (batch * PAGE_SIZE > 256 * 1024)
1249                         batch = (256 * 1024) / PAGE_SIZE;
1250                 batch /= 4;             /* We effectively *= 4 below */
1251                 if (batch < 1)
1252                         batch = 1;
1253
1254                 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1255                         struct per_cpu_pages *pcp;
1256
1257                         pcp = &zone->pageset[cpu].pcp[0];       /* hot */
1258                         pcp->count = 0;
1259                         pcp->low = 2 * batch;
1260                         pcp->high = 6 * batch;
1261                         pcp->batch = 1 * batch;
1262                         INIT_LIST_HEAD(&pcp->list);
1263
1264                         pcp = &zone->pageset[cpu].pcp[1];       /* cold */
1265                         pcp->count = 0;
1266                         pcp->low = 0;
1267                         pcp->high = 2 * batch;
1268                         pcp->batch = 1 * batch;
1269                         INIT_LIST_HEAD(&pcp->list);
1270                 }
1271                 printk("  %s zone: %lu pages, LIFO batch:%lu\n",
1272                                 zone_names[j], realsize, batch);
1273                 INIT_LIST_HEAD(&zone->active_list);
1274                 INIT_LIST_HEAD(&zone->inactive_list);
1275                 atomic_set(&zone->refill_counter, 0);
1276                 zone->nr_active = 0;
1277                 zone->nr_inactive = 0;
1278                 if (!size)
1279                         continue;
1280
1281                 /*
1282                  * The per-page waitqueue mechanism uses hashed waitqueues
1283                  * per zone.
1284                  */
1285                 zone->wait_table_size = wait_table_size(size);
1286                 zone->wait_table_bits =
1287                         wait_table_bits(zone->wait_table_size);
1288                 zone->wait_table = (wait_queue_head_t *)
1289                         alloc_bootmem_node(pgdat, zone->wait_table_size
1290                                                 * sizeof(wait_queue_head_t));
1291
1292                 for(i = 0; i < zone->wait_table_size; ++i)
1293                         init_waitqueue_head(zone->wait_table + i);
1294
1295                 pgdat->nr_zones = j+1;
1296
1297                 zone->zone_mem_map = lmem_map;
1298                 zone->zone_start_pfn = zone_start_pfn;
1299
1300                 if ((zone_start_pfn) & (zone_required_alignment-1))
1301                         printk("BUG: wrong zone alignment, it will crash\n");
1302
1303                 memmap_init(lmem_map, size, nid, j, zone_start_pfn);
1304
1305                 zone_start_pfn += size;
1306                 lmem_map += size;
1307
1308                 for (i = 0; ; i++) {
1309                         unsigned long bitmap_size;
1310
1311                         INIT_LIST_HEAD(&zone->free_area[i].free_list);
1312                         if (i == MAX_ORDER-1) {
1313                                 zone->free_area[i].map = NULL;
1314                                 break;
1315                         }
1316
1317                         /*
1318                          * Page buddy system uses "index >> (i+1)",
1319                          * where "index" is at most "size-1".
1320                          *
1321                          * The extra "+3" is to round down to byte
1322                          * size (8 bits per byte assumption). Thus
1323                          * we get "(size-1) >> (i+4)" as the last byte
1324                          * we can access.
1325                          *
1326                          * The "+1" is because we want to round the
1327                          * byte allocation up rather than down. So
1328                          * we should have had a "+7" before we shifted
1329                          * down by three. Also, we have to add one as
1330                          * we actually _use_ the last bit (it's [0,n]
1331                          * inclusive, not [0,n[).
1332                          *
1333                          * So we actually had +7+1 before we shift
1334                          * down by 3. But (n+8) >> 3 == (n >> 3) + 1
1335                          * (modulo overflows, which we do not have).
1336                          *
1337                          * Finally, we LONG_ALIGN because all bitmap
1338                          * operations are on longs.
1339                          */
1340                         bitmap_size = (size-1) >> (i+4);
1341                         bitmap_size = LONG_ALIGN(bitmap_size+1);
1342                         zone->free_area[i].map =
1343                           (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
1344                 }
1345         }
1346 }
1347
1348 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
1349                 struct page *node_mem_map, unsigned long *zones_size,
1350                 unsigned long node_start_pfn, unsigned long *zholes_size)
1351 {
1352         unsigned long size;
1353
1354         pgdat->node_id = nid;
1355         pgdat->node_start_pfn = node_start_pfn;
1356         calculate_zone_totalpages(pgdat, zones_size, zholes_size);
1357         if (!node_mem_map) {
1358                 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
1359                 node_mem_map = alloc_bootmem_node(pgdat, size);
1360         }
1361         pgdat->node_mem_map = node_mem_map;
1362
1363         free_area_init_core(pgdat, zones_size, zholes_size);
1364         memblk_set_online(node_to_memblk(nid));
1365
1366         calculate_zone_bitmap(pgdat, zones_size);
1367 }
1368
1369 #ifndef CONFIG_DISCONTIGMEM
1370 static bootmem_data_t contig_bootmem_data;
1371 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
1372
1373 void __init free_area_init(unsigned long *zones_size)
1374 {
1375         free_area_init_node(0, &contig_page_data, NULL, zones_size,
1376                         __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
1377         mem_map = contig_page_data.node_mem_map;
1378 }
1379 #endif
1380
1381 #ifdef CONFIG_PROC_FS
1382
1383 #include <linux/seq_file.h>
1384
1385 static void *frag_start(struct seq_file *m, loff_t *pos)
1386 {
1387         pg_data_t *pgdat;
1388         loff_t node = *pos;
1389
1390         for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
1391                 --node;
1392
1393         return pgdat;
1394 }
1395
1396 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1397 {
1398         pg_data_t *pgdat = (pg_data_t *)arg;
1399
1400         (*pos)++;
1401         return pgdat->pgdat_next;
1402 }
1403
1404 static void frag_stop(struct seq_file *m, void *arg)
1405 {
1406 }
1407
1408 /*
1409  * This walks the freelist for each zone. Whilst this is slow, I'd rather
1410  * be slow here than slow down the fast path by keeping stats - mjbligh
1411  */
1412 static int frag_show(struct seq_file *m, void *arg)
1413 {
1414         pg_data_t *pgdat = (pg_data_t *)arg;
1415         struct zone *zone;
1416         struct zone *node_zones = pgdat->node_zones;
1417         unsigned long flags;
1418         int order;
1419
1420         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1421                 if (!zone->present_pages)
1422                         continue;
1423
1424                 spin_lock_irqsave(&zone->lock, flags);
1425                 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1426                 for (order = 0; order < MAX_ORDER; ++order) {
1427                         unsigned long nr_bufs = 0;
1428                         struct list_head *elem;
1429
1430                         list_for_each(elem, &(zone->free_area[order].free_list))
1431                                 ++nr_bufs;
1432                         seq_printf(m, "%6lu ", nr_bufs);
1433                 }
1434                 spin_unlock_irqrestore(&zone->lock, flags);
1435                 seq_putc(m, '\n');
1436         }
1437         return 0;
1438 }
1439
1440 struct seq_operations fragmentation_op = {
1441         .start  = frag_start,
1442         .next   = frag_next,
1443         .stop   = frag_stop,
1444         .show   = frag_show,
1445 };
1446
1447 static char *vmstat_text[] = {
1448         "nr_dirty",
1449         "nr_writeback",
1450         "nr_unstable",
1451         "nr_page_table_pages",
1452         "nr_mapped",
1453         "nr_slab",
1454
1455         "pgpgin",
1456         "pgpgout",
1457         "pswpin",
1458         "pswpout",
1459         "pgalloc",
1460
1461         "pgfree",
1462         "pgactivate",
1463         "pgdeactivate",
1464         "pgfault",
1465         "pgmajfault",
1466
1467         "pgscan",
1468         "pgrefill",
1469         "pgsteal",
1470         "pginodesteal",
1471         "kswapd_steal",
1472
1473         "kswapd_inodesteal",
1474         "pageoutrun",
1475         "allocstall",
1476         "pgrotated",
1477 };
1478
1479 static void *vmstat_start(struct seq_file *m, loff_t *pos)
1480 {
1481         struct page_state *ps;
1482
1483         if (*pos >= ARRAY_SIZE(vmstat_text))
1484                 return NULL;
1485
1486         ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1487         m->private = ps;
1488         if (!ps)
1489                 return ERR_PTR(-ENOMEM);
1490         get_full_page_state(ps);
1491         ps->pgpgin /= 2;                /* sectors -> kbytes */
1492         ps->pgpgout /= 2;
1493         return (unsigned long *)ps + *pos;
1494 }
1495
1496 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1497 {
1498         (*pos)++;
1499         if (*pos >= ARRAY_SIZE(vmstat_text))
1500                 return NULL;
1501         return (unsigned long *)m->private + *pos;
1502 }
1503
1504 static int vmstat_show(struct seq_file *m, void *arg)
1505 {
1506         unsigned long *l = arg;
1507         unsigned long off = l - (unsigned long *)m->private;
1508
1509         seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
1510         return 0;
1511 }
1512
1513 static void vmstat_stop(struct seq_file *m, void *arg)
1514 {
1515         kfree(m->private);
1516         m->private = NULL;
1517 }
1518
1519 struct seq_operations vmstat_op = {
1520         .start  = vmstat_start,
1521         .next   = vmstat_next,
1522         .stop   = vmstat_stop,
1523         .show   = vmstat_show,
1524 };
1525
1526 #endif /* CONFIG_PROC_FS */
1527
1528 static void __devinit init_page_alloc_cpu(int cpu)
1529 {
1530         struct page_state *ps = &per_cpu(page_states, cpu);
1531         memset(ps, 0, sizeof(*ps));
1532 }
1533
1534 static int __devinit page_alloc_cpu_notify(struct notifier_block *self,
1535                                 unsigned long action, void *hcpu)
1536 {
1537         int cpu = (unsigned long)hcpu;
1538         switch(action) {
1539         case CPU_UP_PREPARE:
1540                 init_page_alloc_cpu(cpu);
1541                 break;
1542         default:
1543                 break;
1544         }
1545         return NOTIFY_OK;
1546 }
1547
1548 static struct notifier_block __devinitdata page_alloc_nb = {
1549         .notifier_call  = page_alloc_cpu_notify,
1550 };
1551
1552 void __init page_alloc_init(void)
1553 {
1554         init_page_alloc_cpu(smp_processor_id());
1555         register_cpu_notifier(&page_alloc_nb);
1556 }
1557
1558 /*
1559  * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
1560  *      that the pages_{min,low,high} values for each zone are set correctly
1561  *      with respect to min_free_kbytes.
1562  */
1563 void setup_per_zone_pages_min(void)
1564 {
1565         unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
1566         unsigned long lowmem_pages = 0;
1567         struct zone *zone;
1568         unsigned long flags;
1569
1570         /* Calculate total number of !ZONE_HIGHMEM pages */
1571         for_each_zone(zone)
1572                 if (!is_highmem(zone))
1573                         lowmem_pages += zone->present_pages;
1574
1575         for_each_zone(zone) {
1576                 spin_lock_irqsave(&zone->lru_lock, flags);
1577                 if (is_highmem(zone)) {
1578                         /*
1579                          * Often, highmem doesn't need to reserve any pages.
1580                          * But the pages_min/low/high values are also used for
1581                          * batching up page reclaim activity so we need a
1582                          * decent value here.
1583                          */
1584                         int min_pages;
1585
1586                         min_pages = zone->present_pages / 1024;
1587                         if (min_pages < SWAP_CLUSTER_MAX)
1588                                 min_pages = SWAP_CLUSTER_MAX;
1589                         if (min_pages > 128)
1590                                 min_pages = 128;
1591                         zone->pages_min = min_pages;
1592                 } else {
1593                         /* if it's a lowmem zone, reserve a number of pages
1594                          * proportionate to the zone's size.
1595                          */
1596                         zone->pages_min = (pages_min * zone->present_pages) /
1597                                            lowmem_pages;
1598                 }
1599
1600                 zone->pages_low = zone->pages_min * 2;
1601                 zone->pages_high = zone->pages_min * 3;
1602                 spin_unlock_irqrestore(&zone->lru_lock, flags);
1603         }
1604 }
1605
1606 /*
1607  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
1608  *      that we can call setup_per_zone_pages_min() whenever min_free_kbytes
1609  *      changes.
1610  */
1611 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
1612                 struct file *file, void *buffer, size_t *length)
1613 {
1614         proc_dointvec(table, write, file, buffer, length);
1615         setup_per_zone_pages_min();
1616         return 0;
1617 }