mm/page_alloc.c

   1 /*
   2  *  linux/mm/page_alloc.c
   3  *
   4  *  Manages the free list, the system allocates free pages here.
   5  *  Note that kmalloc() lives in slab.c
   6  *
   7  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8  *  Swap reorganised 29.12.95, Stephen Tweedie
   9  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  10  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  11  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  12  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  13  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
  14  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
  15  */
  16
  17 #include <linux/config.h>
  18 #include <linux/stddef.h>
  19 #include <linux/mm.h>
  20 #include <linux/swap.h>
  21 #include <linux/interrupt.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/bootmem.h>
  24 #include <linux/compiler.h>
  25 #include <linux/module.h>
  26 #include <linux/suspend.h>
  27 #include <linux/pagevec.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/slab.h>
  30 #include <linux/notifier.h>
  31 #include <linux/topology.h>
  32 #include <linux/sysctl.h>
  33 #include <linux/cpu.h>
  34
  35 #include <asm/tlbflush.h>
  36
  37 DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
  38 DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS);
  39 struct pglist_data *pgdat_list;
  40 unsigned long totalram_pages;
  41 unsigned long totalhigh_pages;
  42 int nr_swap_pages;
  43 int numnodes = 1;
  44 int sysctl_lower_zone_protection = 0;
  45
  46 EXPORT_SYMBOL(totalram_pages);
  47 EXPORT_SYMBOL(nr_swap_pages);
  48
  49 /*
  50  * Used by page_zone() to look up the address of the struct zone whose
  51  * id is encoded in the upper bits of page->flags
  52  */
  53 struct zone *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
  54 EXPORT_SYMBOL(zone_table);
  55
  56 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
  57 int min_free_kbytes = 1024;
  58
  59 /*
  60  * Temporary debugging check for pages not lying within a given zone.
  61  */
  62 static int bad_range(struct zone *zone, struct page *page)
  63 {
  64         if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
  65                 return 1;
  66         if (page_to_pfn(page) < zone->zone_start_pfn)
  67                 return 1;
  68         if (zone != page_zone(page))
  69                 return 1;
  70         return 0;
  71 }
  72
  73 static void bad_page(const char *function, struct page *page)
  74 {
  75         printk("Bad page state at %s\n", function);
  76         printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n",
  77                 page->flags, page->mapping,
  78                 page_mapped(page), page_count(page));
  79         printk("Backtrace:\n");
  80         dump_stack();
  81         printk("Trying to fix it up, but a reboot is needed\n");
  82         page->flags &= ~(1 << PG_private        |
  83                         1 << PG_locked  |
  84                         1 << PG_lru     |
  85                         1 << PG_active  |
  86                         1 << PG_dirty   |
  87                         1 << PG_writeback);
  88         set_page_count(page, 0);
  89         page->mapping = NULL;
  90 }
  91
  92 #ifndef CONFIG_HUGETLB_PAGE
  93 #define prep_compound_page(page, order) do { } while (0)
  94 #define destroy_compound_page(page, order) do { } while (0)
  95 #else
  96 /*
  97  * Higher-order pages are called "compound pages".  They are structured thusly:
  98  *
  99  * The first PAGE_SIZE page is called the "head page".
 100  *
 101  * The remaining PAGE_SIZE pages are called "tail pages".
 102  *
 103  * All pages have PG_compound set.  All pages have their lru.next pointing at
 104  * the head page (even the head page has this).
 105  *
 106  * The head page's lru.prev, if non-zero, holds the address of the compound
 107  * page's put_page() function.
 108  *
 109  * The order of the allocation is stored in the first tail page's lru.prev.
 110  * This is only for debug at present.  This usage means that zero-order pages
 111  * may not be compound.
 112  */
 113 static void prep_compound_page(struct page *page, unsigned long order)
 114 {
 115         int i;
 116         int nr_pages = 1 << order;
 117
 118         page->lru.prev = NULL;
 119         page[1].lru.prev = (void *)order;
 120         for (i = 0; i < nr_pages; i++) {
 121                 struct page *p = page + i;
 122
 123                 SetPageCompound(p);
 124                 p->lru.next = (void *)page;
 125         }
 126 }
 127
 128 static void destroy_compound_page(struct page *page, unsigned long order)
 129 {
 130         int i;
 131         int nr_pages = 1 << order;
 132
 133         if (page[1].lru.prev != (void *)order)
 134                 bad_page(__FUNCTION__, page);
 135
 136         for (i = 0; i < nr_pages; i++) {
 137                 struct page *p = page + i;
 138
 139                 if (!PageCompound(p))
 140                         bad_page(__FUNCTION__, page);
 141                 if (p->lru.next != (void *)page)
 142                         bad_page(__FUNCTION__, page);
 143                 ClearPageCompound(p);
 144         }
 145 }
 146 #endif          /* CONFIG_HUGETLB_PAGE */
 147
 148 /*
 149  * Freeing function for a buddy system allocator.
 150  *
 151  * The concept of a buddy system is to maintain direct-mapped table
 152  * (containing bit values) for memory blocks of various "orders".
 153  * The bottom level table contains the map for the smallest allocatable
 154  * units of memory (here, pages), and each level above it describes
 155  * pairs of units from the levels below, hence, "buddies".
 156  * At a high level, all that happens here is marking the table entry
 157  * at the bottom level available, and propagating the changes upward
 158  * as necessary, plus some accounting needed to play nicely with other
 159  * parts of the VM system.
 160  * At each level, we keep one bit for each pair of blocks, which
 161  * is set to 1 iff only one of the pair is allocated.  So when we
 162  * are allocating or freeing one, we can derive the state of the
 163  * other.  That is, if we allocate a small block, and both were
 164  * free, the remainder of the region must be split into blocks.
 165  * If a block is freed, and its buddy is also free, then this
 166  * triggers coalescing into a block of larger size.
 167  *
 168  * -- wli
 169  */
 170
 171 static inline void __free_pages_bulk (struct page *page, struct page *base,
 172                 struct zone *zone, struct free_area *area, unsigned long mask,
 173                 unsigned int order)
 174 {
 175         unsigned long page_idx, index;
 176
 177         if (order)
 178                 destroy_compound_page(page, order);
 179         page_idx = page - base;
 180         if (page_idx & ~mask)
 181                 BUG();
 182         index = page_idx >> (1 + order);
 183
 184         zone->free_pages -= mask;
 185         while (mask + (1 << (MAX_ORDER-1))) {
 186                 struct page *buddy1, *buddy2;
 187
 188                 BUG_ON(area >= zone->free_area + MAX_ORDER);
 189                 if (!__test_and_change_bit(index, area->map))
 190                         /*
 191                          * the buddy page is still allocated.
 192                          */
 193                         break;
 194                 /*
 195                  * Move the buddy up one level.
 196                  * This code is taking advantage of the identity:
 197                  *      -mask = 1+~mask
 198                  */
 199                 buddy1 = base + (page_idx ^ -mask);
 200                 buddy2 = base + page_idx;
 201                 BUG_ON(bad_range(zone, buddy1));
 202                 BUG_ON(bad_range(zone, buddy2));
 203                 list_del(&buddy1->list);
 204                 mask <<= 1;
 205                 area++;
 206                 index >>= 1;
 207                 page_idx &= mask;
 208         }
 209         list_add(&(base + page_idx)->list, &area->free_list);
 210 }
 211
 212 static inline void free_pages_check(const char *function, struct page *page)
 213 {
 214         if (    page_mapped(page) ||
 215                 page->mapping != NULL ||
 216                 page_count(page) != 0 ||
 217                 (page->flags & (
 218                         1 << PG_lru     |
 219                         1 << PG_private |
 220                         1 << PG_locked  |
 221                         1 << PG_active  |
 222                         1 << PG_reclaim |
 223                         1 << PG_slab    |
 224                         1 << PG_writeback )))
 225                 bad_page(function, page);
 226         if (PageDirty(page))
 227                 ClearPageDirty(page);
 228 }
 229
 230 /*
 231  * Frees a list of pages.
 232  * Assumes all pages on list are in same zone, and of same order.
 233  * count is the number of pages to free, or 0 for all on the list.
 234  *
 235  * If the zone was previously in an "all pages pinned" state then look to
 236  * see if this freeing clears that state.
 237  *
 238  * And clear the zone's pages_scanned counter, to hold off the "all pages are
 239  * pinned" detection logic.
 240  */
 241 static int
 242 free_pages_bulk(struct zone *zone, int count,
 243                 struct list_head *list, unsigned int order)
 244 {
 245         unsigned long mask, flags;
 246         struct free_area *area;
 247         struct page *base, *page = NULL;
 248         int ret = 0;
 249
 250         mask = (~0UL) << order;
 251         base = zone->zone_mem_map;
 252         area = zone->free_area + order;
 253         spin_lock_irqsave(&zone->lock, flags);
 254         zone->all_unreclaimable = 0;
 255         zone->pages_scanned = 0;
 256         while (!list_empty(list) && count--) {
 257                 page = list_entry(list->prev, struct page, list);
 258                 /* have to delete it as __free_pages_bulk list manipulates */
 259                 list_del(&page->list);
 260                 __free_pages_bulk(page, base, zone, area, mask, order);
 261                 ret++;
 262         }
 263         spin_unlock_irqrestore(&zone->lock, flags);
 264         return ret;
 265 }
 266
 267 void __free_pages_ok(struct page *page, unsigned int order)
 268 {
 269         LIST_HEAD(list);
 270
 271         mod_page_state(pgfree, 1 << order);
 272         free_pages_check(__FUNCTION__, page);
 273         list_add(&page->list, &list);
 274         kernel_map_pages(page, 1<<order, 0);
 275         free_pages_bulk(page_zone(page), 1, &list, order);
 276 }
 277
 278 #define MARK_USED(index, order, area) \
 279         __change_bit((index) >> (1+(order)), (area)->map)
 280
 281 static inline struct page *
 282 expand(struct zone *zone, struct page *page,
 283          unsigned long index, int low, int high, struct free_area *area)
 284 {
 285         unsigned long size = 1 << high;
 286
 287         while (high > low) {
 288                 BUG_ON(bad_range(zone, page));
 289                 area--;
 290                 high--;
 291                 size >>= 1;
 292                 list_add(&page->list, &area->free_list);
 293                 MARK_USED(index, high, area);
 294                 index += size;
 295                 page += size;
 296         }
 297         return page;
 298 }
 299
 300 static inline void set_page_refs(struct page *page, int order)
 301 {
 302 #ifdef CONFIG_MMU
 303         set_page_count(page, 1);
 304 #else
 305         int i;
 306
 307         /*
 308          * We need to reference all the pages for this order, otherwise if
 309          * anyone accesses one of the pages with (get/put) it will be freed.
 310          */
 311         for (i = 0; i < (1 << order); i++)
 312                 set_page_count(page+i, 1);
 313 #endif /* CONFIG_MMU */
 314 }
 315
 316 /*
 317  * This page is about to be returned from the page allocator
 318  */
 319 static void prep_new_page(struct page *page, int order)
 320 {
 321         if (page->mapping || page_mapped(page) ||
 322             (page->flags & (
 323                         1 << PG_private |
 324                         1 << PG_locked  |
 325                         1 << PG_lru     |
 326                         1 << PG_active  |
 327                         1 << PG_dirty   |
 328                         1 << PG_reclaim |
 329                         1 << PG_writeback )))
 330                 bad_page(__FUNCTION__, page);
 331
 332         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 333                         1 << PG_referenced | 1 << PG_arch_1 |
 334                         1 << PG_checked | 1 << PG_mappedtodisk);
 335         page->private = 0;
 336         set_page_refs(page, order);
 337 }
 338
 339 /*
 340  * Do the hard work of removing an element from the buddy allocator.
 341  * Call me with the zone->lock already held.
 342  */
 343 static struct page *__rmqueue(struct zone *zone, unsigned int order)
 344 {
 345         struct free_area * area;
 346         unsigned int current_order;
 347         struct page *page;
 348         unsigned int index;
 349
 350         for (current_order = order; current_order < MAX_ORDER; ++current_order) {
 351                 area = zone->free_area + current_order;
 352                 if (list_empty(&area->free_list))
 353                         continue;
 354
 355                 page = list_entry(area->free_list.next, struct page, list);
 356                 list_del(&page->list);
 357                 index = page - zone->zone_mem_map;
 358                 if (current_order != MAX_ORDER-1)
 359                         MARK_USED(index, current_order, area);
 360                 zone->free_pages -= 1UL << order;
 361                 return expand(zone, page, index, order, current_order, area);
 362         }
 363
 364         return NULL;
 365 }
 366
 367 /*
 368  * Obtain a specified number of elements from the buddy allocator, all under
 369  * a single hold of the lock, for efficiency.  Add them to the supplied list.
 370  * Returns the number of new pages which were placed at *list.
 371  */
 372 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 373                         unsigned long count, struct list_head *list)
 374 {
 375         unsigned long flags;
 376         int i;
 377         int allocated = 0;
 378         struct page *page;
 379
 380         spin_lock_irqsave(&zone->lock, flags);
 381         for (i = 0; i < count; ++i) {
 382                 page = __rmqueue(zone, order);
 383                 if (page == NULL)
 384                         break;
 385                 allocated++;
 386                 list_add_tail(&page->list, list);
 387         }
 388         spin_unlock_irqrestore(&zone->lock, flags);
 389         return allocated;
 390 }
 391
 392 #ifdef CONFIG_PM
 393 int is_head_of_free_region(struct page *page)
 394 {
 395         struct zone *zone = page_zone(page);
 396         unsigned long flags;
 397         int order;
 398         struct list_head *curr;
 399
 400         /*
 401          * Should not matter as we need quiescent system for
 402          * suspend anyway, but...
 403          */
 404         spin_lock_irqsave(&zone->lock, flags);
 405         for (order = MAX_ORDER - 1; order >= 0; --order)
 406                 list_for_each(curr, &zone->free_area[order].free_list)
 407                         if (page == list_entry(curr, struct page, list)) {
 408                                 spin_unlock_irqrestore(&zone->lock, flags);
 409                                 return 1 << order;
 410                         }
 411         spin_unlock_irqrestore(&zone->lock, flags);
 412         return 0;
 413 }
 414
 415 /*
 416  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 417  */
 418 void drain_local_pages(void)
 419 {
 420         unsigned long flags;
 421         struct zone *zone;
 422         int i;
 423
 424         local_irq_save(flags);
 425         for_each_zone(zone) {
 426                 struct per_cpu_pageset *pset;
 427
 428                 pset = &zone->pageset[smp_processor_id()];
 429                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 430                         struct per_cpu_pages *pcp;
 431
 432                         pcp = &pset->pcp[i];
 433                         pcp->count -= free_pages_bulk(zone, pcp->count,
 434                                                 &pcp->list, 0);
 435                 }
 436         }
 437         local_irq_restore(flags);
 438 }
 439 #endif /* CONFIG_PM */
 440
 441 /*
 442  * Free a 0-order page
 443  */
 444 static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
 445 static void free_hot_cold_page(struct page *page, int cold)
 446 {
 447         struct zone *zone = page_zone(page);
 448         struct per_cpu_pages *pcp;
 449         unsigned long flags;
 450
 451         kernel_map_pages(page, 1, 0);
 452         inc_page_state(pgfree);
 453         free_pages_check(__FUNCTION__, page);
 454         pcp = &zone->pageset[get_cpu()].pcp[cold];
 455         local_irq_save(flags);
 456         if (pcp->count >= pcp->high)
 457                 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 458         list_add(&page->list, &pcp->list);
 459         pcp->count++;
 460         local_irq_restore(flags);
 461         put_cpu();
 462 }
 463
 464 void free_hot_page(struct page *page)
 465 {
 466         free_hot_cold_page(page, 0);
 467 }
 468
 469 void free_cold_page(struct page *page)
 470 {
 471         free_hot_cold_page(page, 1);
 472 }
 473
 474 /*
 475  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 476  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 477  * or two.
 478  */
 479
 480 static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
 481 {
 482         unsigned long flags;
 483         struct page *page = NULL;
 484
 485         if (order == 0) {
 486                 struct per_cpu_pages *pcp;
 487
 488                 pcp = &zone->pageset[get_cpu()].pcp[cold];
 489                 local_irq_save(flags);
 490                 if (pcp->count <= pcp->low)
 491                         pcp->count += rmqueue_bulk(zone, 0,
 492                                                 pcp->batch, &pcp->list);
 493                 if (pcp->count) {
 494                         page = list_entry(pcp->list.next, struct page, list);
 495                         list_del(&page->list);
 496                         pcp->count--;
 497                 }
 498                 local_irq_restore(flags);
 499                 put_cpu();
 500         }
 501
 502         if (page == NULL) {
 503                 spin_lock_irqsave(&zone->lock, flags);
 504                 page = __rmqueue(zone, order);
 505                 spin_unlock_irqrestore(&zone->lock, flags);
 506                 if (order && page)
 507                         prep_compound_page(page, order);
 508         }
 509
 510         if (page != NULL) {
 511                 BUG_ON(bad_range(zone, page));
 512                 mod_page_state(pgalloc, 1 << order);
 513                 prep_new_page(page, order);
 514         }
 515         return page;
 516 }
 517
 518 /*
 519  * This is the 'heart' of the zoned buddy allocator.
 520  *
 521  * Herein lies the mysterious "incremental min".  That's the
 522  *
 523  *      local_low = z->pages_low;
 524  *      min += local_low;
 525  *
 526  * thing.  The intent here is to provide additional protection to low zones for
 527  * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
 528  * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
 529  * request.  This preserves additional space in those lower zones for requests
 530  * which really do need memory from those zones.  It means that on a decent
 531  * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
 532  * zone untouched.
 533  */
 534 struct page *
 535 __alloc_pages(unsigned int gfp_mask, unsigned int order,
 536                 struct zonelist *zonelist)
 537 {
 538         const int wait = gfp_mask & __GFP_WAIT;
 539         unsigned long min;
 540         struct zone **zones, *classzone;
 541         struct page *page;
 542         struct reclaim_state reclaim_state;
 543         struct task_struct *p = current;
 544         int i;
 545         int cold;
 546         int do_retry;
 547
 548         might_sleep_if(wait);
 549
 550         cold = 0;
 551         if (gfp_mask & __GFP_COLD)
 552                 cold = 1;
 553
 554         zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 555         classzone = zones[0];
 556         if (classzone == NULL)    /* no zones in the zonelist */
 557                 return NULL;
 558
 559         /* Go through the zonelist once, looking for a zone with enough free */
 560         min = 1UL << order;
 561         for (i = 0; zones[i] != NULL; i++) {
 562                 struct zone *z = zones[i];
 563                 unsigned long local_low;
 564
 565                 /*
 566                  * This is the fabled 'incremental min'. We let real-time tasks
 567                  * dip their real-time paws a little deeper into reserves.
 568                  */
 569                 local_low = z->pages_low;
 570                 if (rt_task(p))
 571                         local_low >>= 1;
 572                 min += local_low;
 573
 574                 if (z->free_pages >= min ||
 575                                 (!wait && z->free_pages >= z->pages_high)) {
 576                         page = buffered_rmqueue(z, order, cold);
 577                         if (page)
 578                                 goto got_pg;
 579                 }
 580                 min += z->pages_low * sysctl_lower_zone_protection;
 581         }
 582
 583         /* we're somewhat low on memory, failed to find what we needed */
 584         for (i = 0; zones[i] != NULL; i++)
 585                 wakeup_kswapd(zones[i]);
 586
 587         /* Go through the zonelist again, taking __GFP_HIGH into account */
 588         min = 1UL << order;
 589         for (i = 0; zones[i] != NULL; i++) {
 590                 unsigned long local_min;
 591                 struct zone *z = zones[i];
 592
 593                 local_min = z->pages_min;
 594                 if (gfp_mask & __GFP_HIGH)
 595                         local_min >>= 2;
 596                 if (rt_task(p))
 597                         local_min >>= 1;
 598                 min += local_min;
 599                 if (z->free_pages >= min ||
 600                                 (!wait && z->free_pages >= z->pages_high)) {
 601                         page = buffered_rmqueue(z, order, cold);
 602                         if (page)
 603                                 goto got_pg;
 604                 }
 605                 min += local_min * sysctl_lower_zone_protection;
 606         }
 607
 608         /* here we're in the low on memory slow path */
 609
 610 rebalance:
 611         if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
 612                 /* go through the zonelist yet again, ignoring mins */
 613                 for (i = 0; zones[i] != NULL; i++) {
 614                         struct zone *z = zones[i];
 615
 616                         page = buffered_rmqueue(z, order, cold);
 617                         if (page)
 618                                 goto got_pg;
 619                 }
 620                 goto nopage;
 621         }
 622
 623         /* Atomic allocations - we can't balance anything */
 624         if (!wait)
 625                 goto nopage;
 626
 627         p->flags |= PF_MEMALLOC;
 628         reclaim_state.reclaimed_slab = 0;
 629         p->reclaim_state = &reclaim_state;
 630
 631         try_to_free_pages(classzone, gfp_mask, order);
 632
 633         p->reclaim_state = NULL;
 634         p->flags &= ~PF_MEMALLOC;
 635
 636         /* go through the zonelist yet one more time */
 637         min = 1UL << order;
 638         for (i = 0; zones[i] != NULL; i++) {
 639                 struct zone *z = zones[i];
 640
 641                 min += z->pages_min;
 642                 if (z->free_pages >= min ||
 643                                 (!wait && z->free_pages >= z->pages_high)) {
 644                         page = buffered_rmqueue(z, order, cold);
 645                         if (page)
 646                                 goto got_pg;
 647                 }
 648                 min += z->pages_low * sysctl_lower_zone_protection;
 649         }
 650
 651         /*
 652          * Don't let big-order allocations loop unless the caller explicitly
 653          * requests that.  Wait for some write requests to complete then retry.
 654          *
 655          * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
 656          * may not be true in other implementations.
 657          */
 658         do_retry = 0;
 659         if (!(gfp_mask & __GFP_NORETRY)) {
 660                 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
 661                         do_retry = 1;
 662                 if (gfp_mask & __GFP_NOFAIL)
 663                         do_retry = 1;
 664         }
 665         if (do_retry) {
 666                 blk_congestion_wait(WRITE, HZ/50);
 667                 goto rebalance;
 668         }
 669
 670 nopage:
 671         if (!(gfp_mask & __GFP_NOWARN)) {
 672                 printk("%s: page allocation failure."
 673                         " order:%d, mode:0x%x\n",
 674                         p->comm, order, gfp_mask);
 675         }
 676         return NULL;
 677 got_pg:
 678         kernel_map_pages(page, 1 << order, 1);
 679         return page;
 680 }
 681
 682 /*
 683  * Common helper functions.
 684  */
 685 unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
 686 {
 687         struct page * page;
 688
 689         page = alloc_pages(gfp_mask, order);
 690         if (!page)
 691                 return 0;
 692         return (unsigned long) page_address(page);
 693 }
 694
 695 unsigned long get_zeroed_page(unsigned int gfp_mask)
 696 {
 697         struct page * page;
 698
 699         /*
 700          * get_zeroed_page() returns a 32-bit address, which cannot represent
 701          * a highmem page
 702          */
 703         BUG_ON(gfp_mask & __GFP_HIGHMEM);
 704
 705         page = alloc_pages(gfp_mask, 0);
 706         if (page) {
 707                 void *address = page_address(page);
 708                 clear_page(address);
 709                 return (unsigned long) address;
 710         }
 711         return 0;
 712 }
 713
 714 void __pagevec_free(struct pagevec *pvec)
 715 {
 716         int i = pagevec_count(pvec);
 717
 718         while (--i >= 0)
 719                 free_hot_cold_page(pvec->pages[i], pvec->cold);
 720 }
 721
 722 void __free_pages(struct page *page, unsigned int order)
 723 {
 724         if (!PageReserved(page) && put_page_testzero(page)) {
 725                 if (order == 0)
 726                         free_hot_page(page);
 727                 else
 728                         __free_pages_ok(page, order);
 729         }
 730 }
 731
 732 void free_pages(unsigned long addr, unsigned int order)
 733 {
 734         if (addr != 0) {
 735                 BUG_ON(!virt_addr_valid(addr));
 736                 __free_pages(virt_to_page(addr), order);
 737         }
 738 }
 739
 740 /*
 741  * Total amount of free (allocatable) RAM:
 742  */
 743 unsigned int nr_free_pages(void)
 744 {
 745         unsigned int sum = 0;
 746         struct zone *zone;
 747
 748         for_each_zone(zone)
 749                 sum += zone->free_pages;
 750
 751         return sum;
 752 }
 753 EXPORT_SYMBOL(nr_free_pages);
 754
 755 unsigned int nr_used_zone_pages(void)
 756 {
 757         unsigned int pages = 0;
 758         struct zone *zone;
 759
 760         for_each_zone(zone)
 761                 pages += zone->nr_active + zone->nr_inactive;
 762
 763         return pages;
 764 }
 765
 766 #ifdef CONFIG_NUMA
 767 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 768 {
 769         unsigned int i, sum = 0;
 770
 771         for (i = 0; i < MAX_NR_ZONES; i++)
 772                 sum += pgdat->node_zones[i].free_pages;
 773
 774         return sum;
 775 }
 776 #endif
 777
 778 static unsigned int nr_free_zone_pages(int offset)
 779 {
 780         pg_data_t *pgdat;
 781         unsigned int sum = 0;
 782
 783         for_each_pgdat(pgdat) {
 784                 struct zonelist *zonelist = pgdat->node_zonelists + offset;
 785                 struct zone **zonep = zonelist->zones;
 786                 struct zone *zone;
 787
 788                 for (zone = *zonep++; zone; zone = *zonep++) {
 789                         unsigned long size = zone->present_pages;
 790                         unsigned long high = zone->pages_high;
 791                         if (size > high)
 792                                 sum += size - high;
 793                 }
 794         }
 795
 796         return sum;
 797 }
 798
 799 /*
 800  * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 801  */
 802 unsigned int nr_free_buffer_pages(void)
 803 {
 804         return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
 805 }
 806
 807 /*
 808  * Amount of free RAM allocatable within all zones
 809  */
 810 unsigned int nr_free_pagecache_pages(void)
 811 {
 812         return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
 813 }
 814
 815 #ifdef CONFIG_HIGHMEM
 816 unsigned int nr_free_highpages (void)
 817 {
 818         pg_data_t *pgdat;
 819         unsigned int pages = 0;
 820
 821         for_each_pgdat(pgdat)
 822                 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 823
 824         return pages;
 825 }
 826 #endif
 827
 828 #ifdef CONFIG_NUMA
 829 static void show_node(struct zone *zone)
 830 {
 831         printk("Node %d ", zone->zone_pgdat->node_id);
 832 }
 833 #else
 834 #define show_node(zone) do { } while (0)
 835 #endif
 836
 837 /*
 838  * Accumulate the page_state information across all CPUs.
 839  * The result is unavoidably approximate - it can change
 840  * during and after execution of this function.
 841  */
 842 DEFINE_PER_CPU(struct page_state, page_states) = {0};
 843 EXPORT_PER_CPU_SYMBOL(page_states);
 844
 845 atomic_t nr_pagecache = ATOMIC_INIT(0);
 846 EXPORT_SYMBOL(nr_pagecache);
 847 #ifdef CONFIG_SMP
 848 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 849 #endif
 850
 851 void __get_page_state(struct page_state *ret, int nr)
 852 {
 853         int cpu = 0;
 854
 855         memset(ret, 0, sizeof(*ret));
 856         while (cpu < NR_CPUS) {
 857                 unsigned long *in, *out, off;
 858
 859                 if (!cpu_online(cpu)) {
 860                         cpu++;
 861                         continue;
 862                 }
 863
 864                 in = (unsigned long *)&per_cpu(page_states, cpu);
 865                 cpu++;
 866                 if (cpu < NR_CPUS && cpu_online(cpu))
 867                         prefetch(&per_cpu(page_states, cpu));
 868                 out = (unsigned long *)ret;
 869                 for (off = 0; off < nr; off++)
 870                         *out++ += *in++;
 871         }
 872 }
 873
 874 void get_page_state(struct page_state *ret)
 875 {
 876         int nr;
 877
 878         nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
 879         nr /= sizeof(unsigned long);
 880
 881         __get_page_state(ret, nr + 1);
 882 }
 883
 884 void get_full_page_state(struct page_state *ret)
 885 {
 886         __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
 887 }
 888
 889 void get_zone_counts(unsigned long *active,
 890                 unsigned long *inactive, unsigned long *free)
 891 {
 892         struct zone *zone;
 893
 894         *active = 0;
 895         *inactive = 0;
 896         *free = 0;
 897         for_each_zone(zone) {
 898                 *active += zone->nr_active;
 899                 *inactive += zone->nr_inactive;
 900                 *free += zone->free_pages;
 901         }
 902 }
 903
 904 void si_meminfo(struct sysinfo *val)
 905 {
 906         val->totalram = totalram_pages;
 907         val->sharedram = 0;
 908         val->freeram = nr_free_pages();
 909         val->bufferram = nr_blockdev_pages();
 910 #ifdef CONFIG_HIGHMEM
 911         val->totalhigh = totalhigh_pages;
 912         val->freehigh = nr_free_highpages();
 913 #else
 914         val->totalhigh = 0;
 915         val->freehigh = 0;
 916 #endif
 917         val->mem_unit = PAGE_SIZE;
 918 }
 919
 920 #ifdef CONFIG_NUMA
 921 void si_meminfo_node(struct sysinfo *val, int nid)
 922 {
 923         pg_data_t *pgdat = NODE_DATA(nid);
 924
 925         val->totalram = pgdat->node_present_pages;
 926         val->freeram = nr_free_pages_pgdat(pgdat);
 927         val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
 928         val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 929         val->mem_unit = PAGE_SIZE;
 930 }
 931 #endif
 932
 933 #define K(x) ((x) << (PAGE_SHIFT-10))
 934
 935 /*
 936  * Show free area list (used inside shift_scroll-lock stuff)
 937  * We also calculate the percentage fragmentation. We do this by counting the
 938  * memory on each free list with the exception of the first item on the list.
 939  */
 940 void show_free_areas(void)
 941 {
 942         struct page_state ps;
 943         int cpu, temperature;
 944         unsigned long active;
 945         unsigned long inactive;
 946         unsigned long free;
 947         struct zone *zone;
 948
 949         for_each_zone(zone) {
 950                 show_node(zone);
 951                 printk("%s per-cpu:", zone->name);
 952
 953                 if (!zone->present_pages) {
 954                         printk(" empty\n");
 955                         continue;
 956                 } else
 957                         printk("\n");
 958
 959                 for (cpu = 0; cpu < NR_CPUS; ++cpu) {
 960                         struct per_cpu_pageset *pageset = zone->pageset + cpu;
 961                         for (temperature = 0; temperature < 2; temperature++)
 962                                 printk("cpu %d %s: low %d, high %d, batch %d\n",
 963                                         cpu,
 964                                         temperature ? "cold" : "hot",
 965                                         pageset->pcp[temperature].low,
 966                                         pageset->pcp[temperature].high,
 967                                         pageset->pcp[temperature].batch);
 968                 }
 969         }
 970
 971         get_page_state(&ps);
 972         get_zone_counts(&active, &inactive, &free);
 973
 974         printk("\nFree pages: %11ukB (%ukB HighMem)\n",
 975                 K(nr_free_pages()),
 976                 K(nr_free_highpages()));
 977
 978         printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
 979                 "unstable:%lu free:%u\n",
 980                 active,
 981                 inactive,
 982                 ps.nr_dirty,
 983                 ps.nr_writeback,
 984                 ps.nr_unstable,
 985                 nr_free_pages());
 986
 987         for_each_zone(zone) {
 988                 show_node(zone);
 989                 printk("%s"
 990                         " free:%lukB"
 991                         " min:%lukB"
 992                         " low:%lukB"
 993                         " high:%lukB"
 994                         " active:%lukB"
 995                         " inactive:%lukB"
 996                         "\n",
 997                         zone->name,
 998                         K(zone->free_pages),
 999                         K(zone->pages_min),
1000                         K(zone->pages_low),
1001                         K(zone->pages_high),
1002                         K(zone->nr_active),
1003                         K(zone->nr_inactive)
1004                         );
1005         }
1006
1007         for_each_zone(zone) {
1008                 struct list_head *elem;
1009                 unsigned long nr, flags, order, total = 0;
1010
1011                 show_node(zone);
1012                 printk("%s: ", zone->name);
1013                 if (!zone->present_pages) {
1014                         printk("empty\n");
1015                         continue;
1016                 }
1017
1018                 spin_lock_irqsave(&zone->lock, flags);
1019                 for (order = 0; order < MAX_ORDER; order++) {
1020                         nr = 0;
1021                         list_for_each(elem, &zone->free_area[order].free_list)
1022                                 ++nr;
1023                         total += nr << order;
1024                         printk("%lu*%lukB ", nr, K(1UL) << order);
1025                 }
1026                 spin_unlock_irqrestore(&zone->lock, flags);
1027                 printk("= %lukB\n", K(total));
1028         }
1029
1030         show_swap_cache_info();
1031 }
1032
1033 /*
1034  * Builds allocation fallback zone lists.
1035  */
1036 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
1037 {
1038         switch (k) {
1039                 struct zone *zone;
1040         default:
1041                 BUG();
1042         case ZONE_HIGHMEM:
1043                 zone = pgdat->node_zones + ZONE_HIGHMEM;
1044                 if (zone->present_pages) {
1045 #ifndef CONFIG_HIGHMEM
1046                         BUG();
1047 #endif
1048                         zonelist->zones[j++] = zone;
1049                 }
1050         case ZONE_NORMAL:
1051                 zone = pgdat->node_zones + ZONE_NORMAL;
1052                 if (zone->present_pages)
1053                         zonelist->zones[j++] = zone;
1054         case ZONE_DMA:
1055                 zone = pgdat->node_zones + ZONE_DMA;
1056                 if (zone->present_pages)
1057                         zonelist->zones[j++] = zone;
1058         }
1059
1060         return j;
1061 }
1062
1063 static void __init build_zonelists(pg_data_t *pgdat)
1064 {
1065         int i, j, k, node, local_node;
1066
1067         local_node = pgdat->node_id;
1068         printk("Building zonelist for node : %d\n", local_node);
1069         for (i = 0; i < MAX_NR_ZONES; i++) {
1070                 struct zonelist *zonelist;
1071
1072                 zonelist = pgdat->node_zonelists + i;
1073                 memset(zonelist, 0, sizeof(*zonelist));
1074
1075                 j = 0;
1076                 k = ZONE_NORMAL;
1077                 if (i & __GFP_HIGHMEM)
1078                         k = ZONE_HIGHMEM;
1079                 if (i & __GFP_DMA)
1080                         k = ZONE_DMA;
1081
1082                 j = build_zonelists_node(pgdat, zonelist, j, k);
1083                 /*
1084                  * Now we build the zonelist so that it contains the zones
1085                  * of all the other nodes.
1086                  * We don't want to pressure a particular node, so when
1087                  * building the zones for node N, we make sure that the
1088                  * zones coming right after the local ones are those from
1089                  * node N+1 (modulo N)
1090                  */
1091                 for (node = local_node + 1; node < numnodes; node++)
1092                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1093                 for (node = 0; node < local_node; node++)
1094                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1095
1096                 zonelist->zones[j++] = NULL;
1097         }
1098 }
1099
1100 void __init build_all_zonelists(void)
1101 {
1102         int i;
1103
1104         for(i = 0 ; i < numnodes ; i++)
1105                 build_zonelists(NODE_DATA(i));
1106 }
1107
1108 /*
1109  * Helper functions to size the waitqueue hash table.
1110  * Essentially these want to choose hash table sizes sufficiently
1111  * large so that collisions trying to wait on pages are rare.
1112  * But in fact, the number of active page waitqueues on typical
1113  * systems is ridiculously low, less than 200. So this is even
1114  * conservative, even though it seems large.
1115  *
1116  * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1117  * waitqueues, i.e. the size of the waitq table given the number of pages.
1118  */
1119 #define PAGES_PER_WAITQUEUE     256
1120
1121 static inline unsigned long wait_table_size(unsigned long pages)
1122 {
1123         unsigned long size = 1;
1124
1125         pages /= PAGES_PER_WAITQUEUE;
1126
1127         while (size < pages)
1128                 size <<= 1;
1129
1130         /*
1131          * Once we have dozens or even hundreds of threads sleeping
1132          * on IO we've got bigger problems than wait queue collision.
1133          * Limit the size of the wait table to a reasonable size.
1134          */
1135         size = min(size, 4096UL);
1136
1137         return max(size, 4UL);
1138 }
1139
1140 /*
1141  * This is an integer logarithm so that shifts can be used later
1142  * to extract the more random high bits from the multiplicative
1143  * hash function before the remainder is taken.
1144  */
1145 static inline unsigned long wait_table_bits(unsigned long size)
1146 {
1147         return ffz(~size);
1148 }
1149
1150 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1151
1152 static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1153                 unsigned long *zones_size, unsigned long *zholes_size)
1154 {
1155         unsigned long realtotalpages, totalpages = 0;
1156         int i;
1157
1158         for (i = 0; i < MAX_NR_ZONES; i++)
1159                 totalpages += zones_size[i];
1160         pgdat->node_spanned_pages = totalpages;
1161
1162         realtotalpages = totalpages;
1163         if (zholes_size)
1164                 for (i = 0; i < MAX_NR_ZONES; i++)
1165                         realtotalpages -= zholes_size[i];
1166         pgdat->node_present_pages = realtotalpages;
1167         printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1168 }
1169
1170 /*
1171  * Get space for the valid bitmap.
1172  */
1173 static void __init calculate_zone_bitmap(struct pglist_data *pgdat,
1174                 unsigned long *zones_size)
1175 {
1176         unsigned long size = 0;
1177         int i;
1178
1179         for (i = 0; i < MAX_NR_ZONES; i++)
1180                 size += zones_size[i];
1181         size = LONG_ALIGN((size + 7) >> 3);
1182         if (size) {
1183                 pgdat->valid_addr_bitmap =
1184                         (unsigned long *)alloc_bootmem_node(pgdat, size);
1185                 memset(pgdat->valid_addr_bitmap, 0, size);
1186         }
1187 }
1188
1189 /*
1190  * Initially all pages are reserved - free ones are freed
1191  * up by free_all_bootmem() once the early boot process is
1192  * done. Non-atomic initialization, single-pass.
1193  */
1194 void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
1195                 unsigned long zone, unsigned long start_pfn)
1196 {
1197         struct page *page;
1198
1199         for (page = start; page < (start + size); page++) {
1200                 set_page_zone(page, nid * MAX_NR_ZONES + zone);
1201                 set_page_count(page, 0);
1202                 SetPageReserved(page);
1203                 INIT_LIST_HEAD(&page->list);
1204 #ifdef WANT_PAGE_VIRTUAL
1205                 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1206                 if (zone != ZONE_HIGHMEM)
1207                         set_page_address(page, __va(start_pfn << PAGE_SHIFT));
1208 #endif
1209                 start_pfn++;
1210         }
1211 }
1212
1213 #ifndef __HAVE_ARCH_MEMMAP_INIT
1214 #define memmap_init(start, size, nid, zone, start_pfn) \
1215         memmap_init_zone((start), (size), (nid), (zone), (start_pfn))
1216 #endif
1217
1218 /*
1219  * Set up the zone data structures:
1220  *   - mark all pages reserved
1221  *   - mark all memory queues empty
1222  *   - clear the memory bitmaps
1223  */
1224 static void __init free_area_init_core(struct pglist_data *pgdat,
1225                 unsigned long *zones_size, unsigned long *zholes_size)
1226 {
1227         unsigned long i, j;
1228         const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
1229         int cpu, nid = pgdat->node_id;
1230         struct page *lmem_map = pgdat->node_mem_map;
1231         unsigned long zone_start_pfn = pgdat->node_start_pfn;
1232
1233         pgdat->nr_zones = 0;
1234         init_waitqueue_head(&pgdat->kswapd_wait);
1235
1236         for (j = 0; j < MAX_NR_ZONES; j++) {
1237                 struct zone *zone = pgdat->node_zones + j;
1238                 unsigned long size, realsize;
1239                 unsigned long batch;
1240
1241                 zone_table[nid * MAX_NR_ZONES + j] = zone;
1242                 realsize = size = zones_size[j];
1243                 if (zholes_size)
1244                         realsize -= zholes_size[j];
1245
1246                 zone->spanned_pages = size;
1247                 zone->present_pages = realsize;
1248                 zone->name = zone_names[j];
1249                 spin_lock_init(&zone->lock);
1250                 spin_lock_init(&zone->lru_lock);
1251                 zone->zone_pgdat = pgdat;
1252                 zone->free_pages = 0;
1253
1254                 /*
1255                  * The per-cpu-pages pools are set to around 1000th of the
1256                  * size of the zone.  But no more than 1/4 of a meg - there's
1257                  * no point in going beyond the size of L2 cache.
1258                  *
1259                  * OK, so we don't know how big the cache is.  So guess.
1260                  */
1261                 batch = zone->present_pages / 1024;
1262                 if (batch * PAGE_SIZE > 256 * 1024)
1263                         batch = (256 * 1024) / PAGE_SIZE;
1264                 batch /= 4;             /* We effectively *= 4 below */
1265                 if (batch < 1)
1266                         batch = 1;
1267
1268                 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1269                         struct per_cpu_pages *pcp;
1270
1271                         pcp = &zone->pageset[cpu].pcp[0];       /* hot */
1272                         pcp->count = 0;
1273                         pcp->low = 2 * batch;
1274                         pcp->high = 6 * batch;
1275                         pcp->batch = 1 * batch;
1276                         INIT_LIST_HEAD(&pcp->list);
1277
1278                         pcp = &zone->pageset[cpu].pcp[1];       /* cold */
1279                         pcp->count = 0;
1280                         pcp->low = 0;
1281                         pcp->high = 2 * batch;
1282                         pcp->batch = 1 * batch;
1283                         INIT_LIST_HEAD(&pcp->list);
1284                 }
1285                 printk("  %s zone: %lu pages, LIFO batch:%lu\n",
1286                                 zone_names[j], realsize, batch);
1287                 INIT_LIST_HEAD(&zone->active_list);
1288                 INIT_LIST_HEAD(&zone->inactive_list);
1289                 atomic_set(&zone->refill_counter, 0);
1290                 zone->nr_active = 0;
1291                 zone->nr_inactive = 0;
1292                 if (!size)
1293                         continue;
1294
1295                 /*
1296                  * The per-page waitqueue mechanism uses hashed waitqueues
1297                  * per zone.
1298                  */
1299                 zone->wait_table_size = wait_table_size(size);
1300                 zone->wait_table_bits =
1301                         wait_table_bits(zone->wait_table_size);
1302                 zone->wait_table = (wait_queue_head_t *)
1303                         alloc_bootmem_node(pgdat, zone->wait_table_size
1304                                                 * sizeof(wait_queue_head_t));
1305
1306                 for(i = 0; i < zone->wait_table_size; ++i)
1307                         init_waitqueue_head(zone->wait_table + i);
1308
1309                 pgdat->nr_zones = j+1;
1310
1311                 zone->zone_mem_map = lmem_map;
1312                 zone->zone_start_pfn = zone_start_pfn;
1313
1314                 if ((zone_start_pfn) & (zone_required_alignment-1))
1315                         printk("BUG: wrong zone alignment, it will crash\n");
1316
1317                 memmap_init(lmem_map, size, nid, j, zone_start_pfn);
1318
1319                 zone_start_pfn += size;
1320                 lmem_map += size;
1321
1322                 for (i = 0; ; i++) {
1323                         unsigned long bitmap_size;
1324
1325                         INIT_LIST_HEAD(&zone->free_area[i].free_list);
1326                         if (i == MAX_ORDER-1) {
1327                                 zone->free_area[i].map = NULL;
1328                                 break;
1329                         }
1330
1331                         /*
1332                          * Page buddy system uses "index >> (i+1)",
1333                          * where "index" is at most "size-1".
1334                          *
1335                          * The extra "+3" is to round down to byte
1336                          * size (8 bits per byte assumption). Thus
1337                          * we get "(size-1) >> (i+4)" as the last byte
1338                          * we can access.
1339                          *
1340                          * The "+1" is because we want to round the
1341                          * byte allocation up rather than down. So
1342                          * we should have had a "+7" before we shifted
1343                          * down by three. Also, we have to add one as
1344                          * we actually _use_ the last bit (it's [0,n]
1345                          * inclusive, not [0,n[).
1346                          *
1347                          * So we actually had +7+1 before we shift
1348                          * down by 3. But (n+8) >> 3 == (n >> 3) + 1
1349                          * (modulo overflows, which we do not have).
1350                          *
1351                          * Finally, we LONG_ALIGN because all bitmap
1352                          * operations are on longs.
1353                          */
1354                         bitmap_size = (size-1) >> (i+4);
1355                         bitmap_size = LONG_ALIGN(bitmap_size+1);
1356                         zone->free_area[i].map =
1357                           (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
1358                 }
1359         }
1360 }
1361
1362 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
1363                 struct page *node_mem_map, unsigned long *zones_size,
1364                 unsigned long node_start_pfn, unsigned long *zholes_size)
1365 {
1366         unsigned long size;
1367
1368         pgdat->node_id = nid;
1369         pgdat->node_start_pfn = node_start_pfn;
1370         calculate_zone_totalpages(pgdat, zones_size, zholes_size);
1371         if (!node_mem_map) {
1372                 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
1373                 node_mem_map = alloc_bootmem_node(pgdat, size);
1374         }
1375         pgdat->node_mem_map = node_mem_map;
1376
1377         free_area_init_core(pgdat, zones_size, zholes_size);
1378         memblk_set_online(node_to_memblk(nid));
1379
1380         calculate_zone_bitmap(pgdat, zones_size);
1381 }
1382
1383 #ifndef CONFIG_DISCONTIGMEM
1384 static bootmem_data_t contig_bootmem_data;
1385 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
1386
1387 void __init free_area_init(unsigned long *zones_size)
1388 {
1389         free_area_init_node(0, &contig_page_data, NULL, zones_size,
1390                         __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
1391         mem_map = contig_page_data.node_mem_map;
1392 }
1393 #endif
1394
1395 #ifdef CONFIG_PROC_FS
1396
1397 #include <linux/seq_file.h>
1398
1399 static void *frag_start(struct seq_file *m, loff_t *pos)
1400 {
1401         pg_data_t *pgdat;
1402         loff_t node = *pos;
1403
1404         for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
1405                 --node;
1406
1407         return pgdat;
1408 }
1409
1410 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1411 {
1412         pg_data_t *pgdat = (pg_data_t *)arg;
1413
1414         (*pos)++;
1415         return pgdat->pgdat_next;
1416 }
1417
1418 static void frag_stop(struct seq_file *m, void *arg)
1419 {
1420 }
1421
1422 /*
1423  * This walks the freelist for each zone. Whilst this is slow, I'd rather
1424  * be slow here than slow down the fast path by keeping stats - mjbligh
1425  */
1426 static int frag_show(struct seq_file *m, void *arg)
1427 {
1428         pg_data_t *pgdat = (pg_data_t *)arg;
1429         struct zone *zone;
1430         struct zone *node_zones = pgdat->node_zones;
1431         unsigned long flags;
1432         int order;
1433
1434         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1435                 if (!zone->present_pages)
1436                         continue;
1437
1438                 spin_lock_irqsave(&zone->lock, flags);
1439                 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1440                 for (order = 0; order < MAX_ORDER; ++order) {
1441                         unsigned long nr_bufs = 0;
1442                         struct list_head *elem;
1443
1444                         list_for_each(elem, &(zone->free_area[order].free_list))
1445                                 ++nr_bufs;
1446                         seq_printf(m, "%6lu ", nr_bufs);
1447                 }
1448                 spin_unlock_irqrestore(&zone->lock, flags);
1449                 seq_putc(m, '\n');
1450         }
1451         return 0;
1452 }
1453
1454 struct seq_operations fragmentation_op = {
1455         .start  = frag_start,
1456         .next   = frag_next,
1457         .stop   = frag_stop,
1458         .show   = frag_show,
1459 };
1460
1461 static char *vmstat_text[] = {
1462         "nr_dirty",
1463         "nr_writeback",
1464         "nr_unstable",
1465         "nr_page_table_pages",
1466         "nr_mapped",
1467         "nr_slab",
1468
1469         "pgpgin",
1470         "pgpgout",
1471         "pswpin",
1472         "pswpout",
1473         "pgalloc",
1474
1475         "pgfree",
1476         "pgactivate",
1477         "pgdeactivate",
1478         "pgfault",
1479         "pgmajfault",
1480
1481         "pgscan",
1482         "pgrefill",
1483         "pgsteal",
1484         "pginodesteal",
1485         "kswapd_steal",
1486
1487         "kswapd_inodesteal",
1488         "pageoutrun",
1489         "allocstall",
1490         "pgrotated",
1491 };
1492
1493 static void *vmstat_start(struct seq_file *m, loff_t *pos)
1494 {
1495         struct page_state *ps;
1496
1497         if (*pos >= ARRAY_SIZE(vmstat_text))
1498                 return NULL;
1499
1500         ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1501         m->private = ps;
1502         if (!ps)
1503                 return ERR_PTR(-ENOMEM);
1504         get_full_page_state(ps);
1505         ps->pgpgin /= 2;                /* sectors -> kbytes */
1506         ps->pgpgout /= 2;
1507         return (unsigned long *)ps + *pos;
1508 }
1509
1510 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1511 {
1512         (*pos)++;
1513         if (*pos >= ARRAY_SIZE(vmstat_text))
1514                 return NULL;
1515         return (unsigned long *)m->private + *pos;
1516 }
1517
1518 static int vmstat_show(struct seq_file *m, void *arg)
1519 {
1520         unsigned long *l = arg;
1521         unsigned long off = l - (unsigned long *)m->private;
1522
1523         seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
1524         return 0;
1525 }
1526
1527 static void vmstat_stop(struct seq_file *m, void *arg)
1528 {
1529         kfree(m->private);
1530         m->private = NULL;
1531 }
1532
1533 struct seq_operations vmstat_op = {
1534         .start  = vmstat_start,
1535         .next   = vmstat_next,
1536         .stop   = vmstat_stop,
1537         .show   = vmstat_show,
1538 };
1539
1540 #endif /* CONFIG_PROC_FS */
1541
1542 static void __devinit init_page_alloc_cpu(int cpu)
1543 {
1544         struct page_state *ps = &per_cpu(page_states, cpu);
1545         memset(ps, 0, sizeof(*ps));
1546 }
1547
1548 static int __devinit page_alloc_cpu_notify(struct notifier_block *self,
1549                                 unsigned long action, void *hcpu)
1550 {
1551         int cpu = (unsigned long)hcpu;
1552         switch(action) {
1553         case CPU_UP_PREPARE:
1554                 init_page_alloc_cpu(cpu);
1555                 break;
1556         default:
1557                 break;
1558         }
1559         return NOTIFY_OK;
1560 }
1561
1562 static struct notifier_block __devinitdata page_alloc_nb = {
1563         .notifier_call  = page_alloc_cpu_notify,
1564 };
1565
1566 void __init page_alloc_init(void)
1567 {
1568         init_page_alloc_cpu(smp_processor_id());
1569         register_cpu_notifier(&page_alloc_nb);
1570 }
1571
1572 /*
1573  * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
1574  *      that the pages_{min,low,high} values for each zone are set correctly
1575  *      with respect to min_free_kbytes.
1576  */
1577 void setup_per_zone_pages_min(void)
1578 {
1579         unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
1580         unsigned long lowmem_pages = 0;
1581         struct zone *zone;
1582         unsigned long flags;
1583
1584         /* Calculate total number of !ZONE_HIGHMEM pages */
1585         for_each_zone(zone)
1586                 if (!is_highmem(zone))
1587                         lowmem_pages += zone->present_pages;
1588
1589         for_each_zone(zone) {
1590                 spin_lock_irqsave(&zone->lru_lock, flags);
1591                 if (is_highmem(zone)) {
1592                         /*
1593                          * Often, highmem doesn't need to reserve any pages.
1594                          * But the pages_min/low/high values are also used for
1595                          * batching up page reclaim activity so we need a
1596                          * decent value here.
1597                          */
1598                         int min_pages;
1599
1600                         min_pages = zone->present_pages / 1024;
1601                         if (min_pages < SWAP_CLUSTER_MAX)
1602                                 min_pages = SWAP_CLUSTER_MAX;
1603                         if (min_pages > 128)
1604                                 min_pages = 128;
1605                         zone->pages_min = min_pages;
1606                 } else {
1607                         /* if it's a lowmem zone, reserve a number of pages
1608                          * proportionate to the zone's size.
1609                          */
1610                         zone->pages_min = (pages_min * zone->present_pages) /
1611                                            lowmem_pages;
1612                 }
1613
1614                 zone->pages_low = zone->pages_min * 2;
1615                 zone->pages_high = zone->pages_min * 3;
1616                 spin_unlock_irqrestore(&zone->lru_lock, flags);
1617         }
1618 }
1619
1620 /*
1621  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
1622  *      that we can call setup_per_zone_pages_min() whenever min_free_kbytes
1623  *      changes.
1624  */
1625 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
1626                 struct file *file, void __user *buffer, size_t *length)
1627 {
1628         proc_dointvec(table, write, file, buffer, length);
1629         setup_per_zone_pages_min();
1630         return 0;
1631 }