mm/page_alloc.c

   1 /*
   2  *  linux/mm/page_alloc.c
   3  *
   4  *  Manages the free list, the system allocates free pages here.
   5  *  Note that kmalloc() lives in slab.c
   6  *
   7  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8  *  Swap reorganised 29.12.95, Stephen Tweedie
   9  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  10  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  11  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  12  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  13  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
  14  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
  15  */
  16
  17 #include <linux/stddef.h>
  18 #include <linux/mm.h>
  19 #include <linux/swap.h>
  20 #include <linux/interrupt.h>
  21 #include <linux/pagemap.h>
  22 #include <linux/bootmem.h>
  23 #include <linux/compiler.h>
  24 #include <linux/kernel.h>
  25 #include <linux/module.h>
  26 #include <linux/suspend.h>
  27 #include <linux/pagevec.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/slab.h>
  30 #include <linux/notifier.h>
  31 #include <linux/topology.h>
  32 #include <linux/sysctl.h>
  33 #include <linux/cpu.h>
  34 #include <linux/cpuset.h>
  35 #include <linux/memory_hotplug.h>
  36 #include <linux/nodemask.h>
  37 #include <linux/vmalloc.h>
  38 #include <linux/mempolicy.h>
  39 #include <linux/stop_machine.h>
  40 #include <linux/sort.h>
  41 #include <linux/pfn.h>
  42 #include <linux/backing-dev.h>
  43 #include <linux/fault-inject.h>
  44 #include <linux/page-isolation.h>
  45
  46 #include <asm/tlbflush.h>
  47 #include <asm/div64.h>
  48 #include "internal.h"
  49
  50 /*
  51  * Array of node states.
  52  */
  53 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
  54         [N_POSSIBLE] = NODE_MASK_ALL,
  55         [N_ONLINE] = { { [0] = 1UL } },
  56 #ifndef CONFIG_NUMA
  57         [N_NORMAL_MEMORY] = { { [0] = 1UL } },
  58 #ifdef CONFIG_HIGHMEM
  59         [N_HIGH_MEMORY] = { { [0] = 1UL } },
  60 #endif
  61         [N_CPU] = { { [0] = 1UL } },
  62 #endif  /* NUMA */
  63 };
  64 EXPORT_SYMBOL(node_states);
  65
  66 unsigned long totalram_pages __read_mostly;
  67 unsigned long totalreserve_pages __read_mostly;
  68 long nr_swap_pages;
  69 int percpu_pagelist_fraction;
  70
  71 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
  72 int pageblock_order __read_mostly;
  73 #endif
  74
  75 static void __free_pages_ok(struct page *page, unsigned int order);
  76
  77 /*
  78  * results with 256, 32 in the lowmem_reserve sysctl:
  79  *      1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
  80  *      1G machine -> (16M dma, 784M normal, 224M high)
  81  *      NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
  82  *      HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
   83  *      HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
  84  *
  85  * TBD: should special case ZONE_DMA32 machines here - in those we normally
  86  * don't need any ZONE_NORMAL reservation
  87  */
  88 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
  89 #ifdef CONFIG_ZONE_DMA
  90          256,
  91 #endif
  92 #ifdef CONFIG_ZONE_DMA32
  93          256,
  94 #endif
  95 #ifdef CONFIG_HIGHMEM
  96          32,
  97 #endif
  98          32,
  99 };
 100
 101 EXPORT_SYMBOL(totalram_pages);
 102
 103 static char * const zone_names[MAX_NR_ZONES] = {
 104 #ifdef CONFIG_ZONE_DMA
 105          "DMA",
 106 #endif
 107 #ifdef CONFIG_ZONE_DMA32
 108          "DMA32",
 109 #endif
 110          "Normal",
 111 #ifdef CONFIG_HIGHMEM
 112          "HighMem",
 113 #endif
 114          "Movable",
 115 };
 116
 117 int min_free_kbytes = 1024;
 118
 119 unsigned long __meminitdata nr_kernel_pages;
 120 unsigned long __meminitdata nr_all_pages;
 121 static unsigned long __meminitdata dma_reserve;
 122
 123 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
 124   /*
 125    * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct
 126    * ranges of memory (RAM) that may be registered with add_active_range().
 127    * Ranges passed to add_active_range() will be merged if possible
 128    * so the number of times add_active_range() can be called is
 129    * related to the number of nodes and the number of holes
 130    */
 131   #ifdef CONFIG_MAX_ACTIVE_REGIONS
 132     /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
 133     #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
 134   #else
 135     #if MAX_NUMNODES >= 32
 136       /* If there can be many nodes, allow up to 50 holes per node */
 137       #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
 138     #else
 139       /* By default, allow up to 256 distinct regions */
 140       #define MAX_ACTIVE_REGIONS 256
 141     #endif
 142   #endif
 143
 144   static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
 145   static int __meminitdata nr_nodemap_entries;
 146   static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
 147   static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 148 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
 149   static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
 150   static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
 151 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
 152   unsigned long __initdata required_kernelcore;
 153   static unsigned long __initdata required_movablecore;
 154   unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
 155
 156   /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 157   int movable_zone;
 158   EXPORT_SYMBOL(movable_zone);
 159 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 160
 161 #if MAX_NUMNODES > 1
 162 int nr_node_ids __read_mostly = MAX_NUMNODES;
 163 EXPORT_SYMBOL(nr_node_ids);
 164 #endif
 165
 166 int page_group_by_mobility_disabled __read_mostly;
 167
 168 static void set_pageblock_migratetype(struct page *page, int migratetype)
 169 {
 170         set_pageblock_flags_group(page, (unsigned long)migratetype,
 171                                         PB_migrate, PB_migrate_end);
 172 }
 173
 174 #ifdef CONFIG_DEBUG_VM
 175 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 176 {
 177         int ret = 0;
 178         unsigned seq;
 179         unsigned long pfn = page_to_pfn(page);
 180
 181         do {
 182                 seq = zone_span_seqbegin(zone);
 183                 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
 184                         ret = 1;
 185                 else if (pfn < zone->zone_start_pfn)
 186                         ret = 1;
 187         } while (zone_span_seqretry(zone, seq));
 188
 189         return ret;
 190 }
 191
 192 static int page_is_consistent(struct zone *zone, struct page *page)
 193 {
 194         if (!pfn_valid_within(page_to_pfn(page)))
 195                 return 0;
 196         if (zone != page_zone(page))
 197                 return 0;
 198
 199         return 1;
 200 }
 201 /*
 202  * Temporary debugging check for pages not lying within a given zone.
 203  */
 204 static int bad_range(struct zone *zone, struct page *page)
 205 {
 206         if (page_outside_zone_boundaries(zone, page))
 207                 return 1;
 208         if (!page_is_consistent(zone, page))
 209                 return 1;
 210
 211         return 0;
 212 }
 213 #else
 214 static inline int bad_range(struct zone *zone, struct page *page)
 215 {
 216         return 0;
 217 }
 218 #endif
 219
 220 static void bad_page(struct page *page)
 221 {
 222         printk(KERN_EMERG "Bad page state in process '%s'\n"
 223                 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
 224                 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
 225                 KERN_EMERG "Backtrace:\n",
 226                 current->comm, page, (int)(2*sizeof(unsigned long)),
 227                 (unsigned long)page->flags, page->mapping,
 228                 page_mapcount(page), page_count(page));
 229         dump_stack();
 230         page->flags &= ~(1 << PG_lru    |
 231                         1 << PG_private |
 232                         1 << PG_locked  |
 233                         1 << PG_active  |
 234                         1 << PG_dirty   |
 235                         1 << PG_reclaim |
 236                         1 << PG_slab    |
 237                         1 << PG_swapcache |
 238                         1 << PG_writeback |
 239                         1 << PG_buddy );
 240         set_page_count(page, 0);
 241         reset_page_mapcount(page);
 242         page->mapping = NULL;
 243         add_taint(TAINT_BAD_PAGE);
 244 }
 245
 246 /*
 247  * Higher-order pages are called "compound pages".  They are structured thusly:
 248  *
 249  * The first PAGE_SIZE page is called the "head page".
 250  *
 251  * The remaining PAGE_SIZE pages are called "tail pages".
 252  *
 253  * All pages have PG_compound set.  All pages have their ->private pointing at
 254  * the head page (even the head page has this).
 255  *
 256  * The first tail page's ->lru.next holds the address of the compound page's
 257  * put_page() function.  Its ->lru.prev holds the order of allocation.
 258  * This usage means that zero-order pages may not be compound.
 259  */
 260
 261 static void free_compound_page(struct page *page)
 262 {
 263         __free_pages_ok(page, compound_order(page));
 264 }
 265
 266 static void prep_compound_page(struct page *page, unsigned long order)
 267 {
 268         int i;
 269         int nr_pages = 1 << order;
 270
 271         set_compound_page_dtor(page, free_compound_page);
 272         set_compound_order(page, order);
 273         __SetPageHead(page);
 274         for (i = 1; i < nr_pages; i++) {
 275                 struct page *p = page + i;
 276
 277                 __SetPageTail(p);
 278                 p->first_page = page;
 279         }
 280 }
 281
 282 static void destroy_compound_page(struct page *page, unsigned long order)
 283 {
 284         int i;
 285         int nr_pages = 1 << order;
 286
 287         if (unlikely(compound_order(page) != order))
 288                 bad_page(page);
 289
 290         if (unlikely(!PageHead(page)))
 291                         bad_page(page);
 292         __ClearPageHead(page);
 293         for (i = 1; i < nr_pages; i++) {
 294                 struct page *p = page + i;
 295
 296                 if (unlikely(!PageTail(p) |
 297                                 (p->first_page != page)))
 298                         bad_page(page);
 299                 __ClearPageTail(p);
 300         }
 301 }
 302
 303 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 304 {
 305         int i;
 306
 307         VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
 308         /*
 309          * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
 310          * and __GFP_HIGHMEM from hard or soft interrupt context.
 311          */
 312         VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
 313         for (i = 0; i < (1 << order); i++)
 314                 clear_highpage(page + i);
 315 }
 316
 317 static inline void set_page_order(struct page *page, int order)
 318 {
 319         set_page_private(page, order);
 320         __SetPageBuddy(page);
 321 }
 322
 323 static inline void rmv_page_order(struct page *page)
 324 {
 325         __ClearPageBuddy(page);
 326         set_page_private(page, 0);
 327 }
 328
 329 /*
 330  * Locate the struct page for both the matching buddy in our
 331  * pair (buddy1) and the combined O(n+1) page they form (page).
 332  *
 333  * 1) Any buddy B1 will have an order O twin B2 which satisfies
 334  * the following equation:
 335  *     B2 = B1 ^ (1 << O)
 336  * For example, if the starting buddy (buddy2) is #8 its order
 337  * 1 buddy is #10:
 338  *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 339  *
 340  * 2) Any buddy B will have an order O+1 parent P which
 341  * satisfies the following equation:
 342  *     P = B & ~(1 << O)
 343  *
 344  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 345  */
 346 static inline struct page *
 347 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
 348 {
 349         unsigned long buddy_idx = page_idx ^ (1 << order);
 350
 351         return page + (buddy_idx - page_idx);
 352 }
 353
 354 static inline unsigned long
 355 __find_combined_index(unsigned long page_idx, unsigned int order)
 356 {
 357         return (page_idx & ~(1 << order));
 358 }
 359
 360 /*
  361  * This function checks whether a page is free and is the buddy.
  362  * We can coalesce a page and its buddy if
 363  * (a) the buddy is not in a hole &&
 364  * (b) the buddy is in the buddy system &&
 365  * (c) a page and its buddy have the same order &&
 366  * (d) a page and its buddy are in the same zone.
 367  *
 368  * For recording whether a page is in the buddy system, we use PG_buddy.
 369  * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
 370  *
 371  * For recording page's order, we use page_private(page).
 372  */
 373 static inline int page_is_buddy(struct page *page, struct page *buddy,
 374                                                                 int order)
 375 {
 376         if (!pfn_valid_within(page_to_pfn(buddy)))
 377                 return 0;
 378
 379         if (page_zone_id(page) != page_zone_id(buddy))
 380                 return 0;
 381
 382         if (PageBuddy(buddy) && page_order(buddy) == order) {
 383                 BUG_ON(page_count(buddy) != 0);
 384                 return 1;
 385         }
 386         return 0;
 387 }
 388
 389 /*
 390  * Freeing function for a buddy system allocator.
 391  *
 392  * The concept of a buddy system is to maintain direct-mapped table
 393  * (containing bit values) for memory blocks of various "orders".
 394  * The bottom level table contains the map for the smallest allocatable
 395  * units of memory (here, pages), and each level above it describes
 396  * pairs of units from the levels below, hence, "buddies".
 397  * At a high level, all that happens here is marking the table entry
 398  * at the bottom level available, and propagating the changes upward
 399  * as necessary, plus some accounting needed to play nicely with other
 400  * parts of the VM system.
 401  * At each level, we keep a list of pages, which are heads of continuous
 402  * free pages of length of (1 << order) and marked with PG_buddy. Page's
 403  * order is recorded in page_private(page) field.
 404  * So when we are allocating or freeing one, we can derive the state of the
 405  * other.  That is, if we allocate a small block, and both were
 406  * free, the remainder of the region must be split into blocks.
 407  * If a block is freed, and its buddy is also free, then this
 408  * triggers coalescing into a block of larger size.
 409  *
 410  * -- wli
 411  */
 412
 413 static inline void __free_one_page(struct page *page,
 414                 struct zone *zone, unsigned int order)
 415 {
 416         unsigned long page_idx;
 417         int order_size = 1 << order;
 418         int migratetype = get_pageblock_migratetype(page);
 419
 420         if (unlikely(PageCompound(page)))
 421                 destroy_compound_page(page, order);
 422
 423         page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 424
 425         VM_BUG_ON(page_idx & (order_size - 1));
 426         VM_BUG_ON(bad_range(zone, page));
 427
 428         __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
 429         while (order < MAX_ORDER-1) {
 430                 unsigned long combined_idx;
 431                 struct page *buddy;
 432
 433                 buddy = __page_find_buddy(page, page_idx, order);
 434                 if (!page_is_buddy(page, buddy, order))
 435                         break;          /* Move the buddy up one level. */
 436
 437                 list_del(&buddy->lru);
 438                 zone->free_area[order].nr_free--;
 439                 rmv_page_order(buddy);
 440                 combined_idx = __find_combined_index(page_idx, order);
 441                 page = page + (combined_idx - page_idx);
 442                 page_idx = combined_idx;
 443                 order++;
 444         }
 445         set_page_order(page, order);
 446         list_add(&page->lru,
 447                 &zone->free_area[order].free_list[migratetype]);
 448         zone->free_area[order].nr_free++;
 449 }
 450
 451 static inline int free_pages_check(struct page *page)
 452 {
 453         if (unlikely(page_mapcount(page) |
 454                 (page->mapping != NULL)  |
 455                 (page_count(page) != 0)  |
 456                 (page->flags & (
 457                         1 << PG_lru     |
 458                         1 << PG_private |
 459                         1 << PG_locked  |
 460                         1 << PG_active  |
 461                         1 << PG_slab    |
 462                         1 << PG_swapcache |
 463                         1 << PG_writeback |
 464                         1 << PG_reserved |
 465                         1 << PG_buddy ))))
 466                 bad_page(page);
 467         if (PageDirty(page))
 468                 __ClearPageDirty(page);
 469         /*
 470          * For now, we report if PG_reserved was found set, but do not
 471          * clear it, and do not free the page.  But we shall soon need
 472          * to do more, for when the ZERO_PAGE count wraps negative.
 473          */
 474         return PageReserved(page);
 475 }
 476
 477 /*
 478  * Frees a list of pages.
 479  * Assumes all pages on list are in same zone, and of same order.
 480  * count is the number of pages to free.
 481  *
 482  * If the zone was previously in an "all pages pinned" state then look to
 483  * see if this freeing clears that state.
 484  *
 485  * And clear the zone's pages_scanned counter, to hold off the "all pages are
 486  * pinned" detection logic.
 487  */
 488 static void free_pages_bulk(struct zone *zone, int count,
 489                                         struct list_head *list, int order)
 490 {
 491         spin_lock(&zone->lock);
 492         zone->all_unreclaimable = 0;
 493         zone->pages_scanned = 0;
 494         while (count--) {
 495                 struct page *page;
 496
 497                 VM_BUG_ON(list_empty(list));
 498                 page = list_entry(list->prev, struct page, lru);
 499                 /* have to delete it as __free_one_page list manipulates */
 500                 list_del(&page->lru);
 501                 __free_one_page(page, zone, order);
 502         }
 503         spin_unlock(&zone->lock);
 504 }
 505
 506 static void free_one_page(struct zone *zone, struct page *page, int order)
 507 {
 508         spin_lock(&zone->lock);
 509         zone->all_unreclaimable = 0;
 510         zone->pages_scanned = 0;
 511         __free_one_page(page, zone, order);
 512         spin_unlock(&zone->lock);
 513 }
 514
 515 static void __free_pages_ok(struct page *page, unsigned int order)
 516 {
 517         unsigned long flags;
 518         int i;
 519         int reserved = 0;
 520
 521         for (i = 0 ; i < (1 << order) ; ++i)
 522                 reserved += free_pages_check(page + i);
 523         if (reserved)
 524                 return;
 525
 526         if (!PageHighMem(page))
 527                 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
 528         arch_free_page(page, order);
 529         kernel_map_pages(page, 1 << order, 0);
 530
 531         local_irq_save(flags);
 532         __count_vm_events(PGFREE, 1 << order);
 533         free_one_page(page_zone(page), page, order);
 534         local_irq_restore(flags);
 535 }
 536
 537 /*
 538  * permit the bootmem allocator to evade page validation on high-order frees
 539  */
 540 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 541 {
 542         if (order == 0) {
 543                 __ClearPageReserved(page);
 544                 set_page_count(page, 0);
 545                 set_page_refcounted(page);
 546                 __free_page(page);
 547         } else {
 548                 int loop;
 549
 550                 prefetchw(page);
 551                 for (loop = 0; loop < BITS_PER_LONG; loop++) {
 552                         struct page *p = &page[loop];
 553
 554                         if (loop + 1 < BITS_PER_LONG)
 555                                 prefetchw(p + 1);
 556                         __ClearPageReserved(p);
 557                         set_page_count(p, 0);
 558                 }
 559
 560                 set_page_refcounted(page);
 561                 __free_pages(page, order);
 562         }
 563 }
 564
 565
 566 /*
 567  * The order of subdivision here is critical for the IO subsystem.
 568  * Please do not alter this order without good reasons and regression
 569  * testing. Specifically, as large blocks of memory are subdivided,
 570  * the order in which smaller blocks are delivered depends on the order
 571  * they're subdivided in this function. This is the primary factor
 572  * influencing the order in which pages are delivered to the IO
 573  * subsystem according to empirical testing, and this is also justified
 574  * by considering the behavior of a buddy system containing a single
 575  * large block of memory acted on by a series of small allocations.
 576  * This behavior is a critical factor in sglist merging's success.
 577  *
 578  * -- wli
 579  */
 580 static inline void expand(struct zone *zone, struct page *page,
 581         int low, int high, struct free_area *area,
 582         int migratetype)
 583 {
 584         unsigned long size = 1 << high;
 585
 586         while (high > low) {
 587                 area--;
 588                 high--;
 589                 size >>= 1;
 590                 VM_BUG_ON(bad_range(zone, &page[size]));
 591                 list_add(&page[size].lru, &area->free_list[migratetype]);
 592                 area->nr_free++;
 593                 set_page_order(&page[size], high);
 594         }
 595 }
 596
 597 /*
 598  * This page is about to be returned from the page allocator
 599  */
 600 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 601 {
 602         if (unlikely(page_mapcount(page) |
 603                 (page->mapping != NULL)  |
 604                 (page_count(page) != 0)  |
 605                 (page->flags & (
 606                         1 << PG_lru     |
 607                         1 << PG_private |
 608                         1 << PG_locked  |
 609                         1 << PG_active  |
 610                         1 << PG_dirty   |
 611                         1 << PG_slab    |
 612                         1 << PG_swapcache |
 613                         1 << PG_writeback |
 614                         1 << PG_reserved |
 615                         1 << PG_buddy ))))
 616                 bad_page(page);
 617
 618         /*
 619          * For now, we report if PG_reserved was found set, but do not
 620          * clear it, and do not allocate the page: as a safety net.
 621          */
 622         if (PageReserved(page))
 623                 return 1;
 624
 625         page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
 626                         1 << PG_referenced | 1 << PG_arch_1 |
 627                         1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
 628         set_page_private(page, 0);
 629         set_page_refcounted(page);
 630
 631         arch_alloc_page(page, order);
 632         kernel_map_pages(page, 1 << order, 1);
 633
 634         if (gfp_flags & __GFP_ZERO)
 635                 prep_zero_page(page, order, gfp_flags);
 636
 637         if (order && (gfp_flags & __GFP_COMP))
 638                 prep_compound_page(page, order);
 639
 640         return 0;
 641 }
 642
 643 /*
 644  * Go through the free lists for the given migratetype and remove
 645  * the smallest available page from the freelists
 646  */
 647 static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 648                                                 int migratetype)
 649 {
 650         unsigned int current_order;
 651         struct free_area * area;
 652         struct page *page;
 653
 654         /* Find a page of the appropriate size in the preferred list */
 655         for (current_order = order; current_order < MAX_ORDER; ++current_order) {
 656                 area = &(zone->free_area[current_order]);
 657                 if (list_empty(&area->free_list[migratetype]))
 658                         continue;
 659
 660                 page = list_entry(area->free_list[migratetype].next,
 661                                                         struct page, lru);
 662                 list_del(&page->lru);
 663                 rmv_page_order(page);
 664                 area->nr_free--;
 665                 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
 666                 expand(zone, page, order, current_order, area, migratetype);
 667                 return page;
 668         }
 669
 670         return NULL;
 671 }
 672
 673
 674 /*
 675  * This array describes the order lists are fallen back to when
 676  * the free lists for the desirable migrate type are depleted
 677  */
 678 static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
 679         [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
 680         [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
 681         [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
 682         [MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
 683 };
 684
 685 /*
 686  * Move the free pages in a range to the free lists of the requested type.
  687  * Note that start_page and end_page need not be aligned on a pageblock
 688  * boundary. If alignment is required, use move_freepages_block()
 689  */
 690 int move_freepages(struct zone *zone,
 691                         struct page *start_page, struct page *end_page,
 692                         int migratetype)
 693 {
 694         struct page *page;
 695         unsigned long order;
 696         int pages_moved = 0;
 697
 698 #ifndef CONFIG_HOLES_IN_ZONE
 699         /*
 700          * page_zone is not safe to call in this context when
 701          * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
 702          * anyway as we check zone boundaries in move_freepages_block().
 703          * Remove at a later date when no bug reports exist related to
 704          * grouping pages by mobility
 705          */
 706         BUG_ON(page_zone(start_page) != page_zone(end_page));
 707 #endif
 708
 709         for (page = start_page; page <= end_page;) {
 710                 if (!pfn_valid_within(page_to_pfn(page))) {
 711                         page++;
 712                         continue;
 713                 }
 714
 715                 if (!PageBuddy(page)) {
 716                         page++;
 717                         continue;
 718                 }
 719
 720                 order = page_order(page);
 721                 list_del(&page->lru);
 722                 list_add(&page->lru,
 723                         &zone->free_area[order].free_list[migratetype]);
 724                 page += 1 << order;
 725                 pages_moved += 1 << order;
 726         }
 727
 728         return pages_moved;
 729 }
 730
 731 int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
 732 {
 733         unsigned long start_pfn, end_pfn;
 734         struct page *start_page, *end_page;
 735
 736         start_pfn = page_to_pfn(page);
 737         start_pfn = start_pfn & ~(pageblock_nr_pages-1);
 738         start_page = pfn_to_page(start_pfn);
 739         end_page = start_page + pageblock_nr_pages - 1;
 740         end_pfn = start_pfn + pageblock_nr_pages - 1;
 741
 742         /* Do not cross zone boundaries */
 743         if (start_pfn < zone->zone_start_pfn)
 744                 start_page = page;
 745         if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
 746                 return 0;
 747
 748         return move_freepages(zone, start_page, end_page, migratetype);
 749 }
 750
 751 /* Return the page with the lowest PFN in the list */
 752 static struct page *min_page(struct list_head *list)
 753 {
 754         unsigned long min_pfn = -1UL;
 755         struct page *min_page = NULL, *page;;
 756
 757         list_for_each_entry(page, list, lru) {
 758                 unsigned long pfn = page_to_pfn(page);
 759                 if (pfn < min_pfn) {
 760                         min_pfn = pfn;
 761                         min_page = page;
 762                 }
 763         }
 764
 765         return min_page;
 766 }
 767
 768 /* Remove an element from the buddy allocator from the fallback list */
 769 static struct page *__rmqueue_fallback(struct zone *zone, int order,
 770                                                 int start_migratetype)
 771 {
 772         struct free_area * area;
 773         int current_order;
 774         struct page *page;
 775         int migratetype, i;
 776
 777         /* Find the largest possible block of pages in the other list */
 778         for (current_order = MAX_ORDER-1; current_order >= order;
 779                                                 --current_order) {
 780                 for (i = 0; i < MIGRATE_TYPES - 1; i++) {
 781                         migratetype = fallbacks[start_migratetype][i];
 782
 783                         /* MIGRATE_RESERVE handled later if necessary */
 784                         if (migratetype == MIGRATE_RESERVE)
 785                                 continue;
 786
 787                         area = &(zone->free_area[current_order]);
 788                         if (list_empty(&area->free_list[migratetype]))
 789                                 continue;
 790
 791                         /* Bias kernel allocations towards low pfns */
 792                         page = list_entry(area->free_list[migratetype].next,
 793                                         struct page, lru);
 794                         if (unlikely(start_migratetype != MIGRATE_MOVABLE))
 795                                 page = min_page(&area->free_list[migratetype]);
 796                         area->nr_free--;
 797
 798                         /*
 799                          * If breaking a large block of pages, move all free
 800                          * pages to the preferred allocation list. If falling
 801                          * back for a reclaimable kernel allocation, be more
 802                          * agressive about taking ownership of free pages
 803                          */
 804                         if (unlikely(current_order >= (pageblock_order >> 1)) ||
 805                                         start_migratetype == MIGRATE_RECLAIMABLE) {
 806                                 unsigned long pages;
 807                                 pages = move_freepages_block(zone, page,
 808                                                                 start_migratetype);
 809
 810                                 /* Claim the whole block if over half of it is free */
 811                                 if (pages >= (1 << (pageblock_order-1)))
 812                                         set_pageblock_migratetype(page,
 813                                                                 start_migratetype);
 814
 815                                 migratetype = start_migratetype;
 816                         }
 817
 818                         /* Remove the page from the freelists */
 819                         list_del(&page->lru);
 820                         rmv_page_order(page);
 821                         __mod_zone_page_state(zone, NR_FREE_PAGES,
 822                                                         -(1UL << order));
 823
 824                         if (current_order == pageblock_order)
 825                                 set_pageblock_migratetype(page,
 826                                                         start_migratetype);
 827
 828                         expand(zone, page, order, current_order, area, migratetype);
 829                         return page;
 830                 }
 831         }
 832
 833         /* Use MIGRATE_RESERVE rather than fail an allocation */
 834         return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
 835 }
 836
 837 /*
 838  * Do the hard work of removing an element from the buddy allocator.
 839  * Call me with the zone->lock already held.
      *
      * __rmqueue_smallest() is tried first with the caller's migratetype.
      * Only if that returns no page is __rmqueue_fallback() consulted.
      * Returns whichever page was obtained; the result of
      * __rmqueue_fallback() is passed back unchanged.
 840  */
 841 static struct page *__rmqueue(struct zone *zone, unsigned int order,
 842                                                 int migratetype)
 843 {
 844         struct page *page;
 845
 846         page = __rmqueue_smallest(zone, order, migratetype);
 847
             /* Nothing available for this migratetype: try the fallback path */
 848         if (unlikely(!page))
 849                 page = __rmqueue_fallback(zone, order, migratetype);
 850
 851         return page;
 852 }
 853
 854 /*
 855  * Obtain a specified number of elements from the buddy allocator, all under
 856  * a single hold of the lock, for efficiency.  Add them to the supplied list.
 857  * Returns the number of new pages which were placed at *list.
      *
      * zone->lock is taken with plain spin_lock(); interrupt state is not
      * changed here.  The loop stops as soon as __rmqueue() returns NULL, so
      * the return value may be smaller than count.
 858  */
 859 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 860                         unsigned long count, struct list_head *list,
 861                         int migratetype)
 862 {
 863         int i;
 864
 865         spin_lock(&zone->lock);
 866         for (i = 0; i < count; ++i) {
 867                 struct page *page = __rmqueue(zone, order, migratetype);
 868                 if (unlikely(page == NULL))
 869                         break;
 870                 list_add(&page->lru, list);
                     /* Record the requested migratetype in the page's private field */
 871                 set_page_private(page, migratetype);
 872         }
 873         spin_unlock(&zone->lock);
 874         return i;
 875 }
 876
 877 #ifdef CONFIG_NUMA
 878 /*
 879  * Called from the vmstat counter updater to drain pagesets of this
 880  * currently executing processor on remote nodes after they have
 881  * expired.
 882  *
 883  * Note that this function must be called with the thread pinned to
 884  * a single processor.
      *
      * At most pcp->batch pages are handed to free_pages_bulk() per call;
      * if fewer than that are queued, all of them are handed over.  The
      * work is done between local_irq_save() and local_irq_restore().
 885  */
 886 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 887 {
 888         unsigned long flags;
 889         int to_drain;
 890
 891         local_irq_save(flags);
             /* to_drain = min(pcp->count, pcp->batch) */
 892         if (pcp->count >= pcp->batch)
 893                 to_drain = pcp->batch;
 894         else
 895                 to_drain = pcp->count;
 896         free_pages_bulk(zone, to_drain, &pcp->list, 0);
 897         pcp->count -= to_drain;
 898         local_irq_restore(flags);
 899 }
 900 #endif
 901
     /*
      * Empty every per-cpu page list that 'cpu' holds, in each populated
      * zone.  Each list is passed whole (pcp->count pages) to
      * free_pages_bulk() and its count is reset to zero.  Each list is
      * handled between local_irq_save() and local_irq_restore().
      */
 902 static void __drain_pages(unsigned int cpu)
 903 {
 904         unsigned long flags;
 905         struct zone *zone;
 906         int i;
 907
 908         for_each_zone(zone) {
 909                 struct per_cpu_pageset *pset;
 910
 911                 if (!populated_zone(zone))
 912                         continue;
 913
 914                 pset = zone_pcp(zone, cpu);
                     /* Walk every entry of the pageset's pcp[] array */
 915                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 916                         struct per_cpu_pages *pcp;
 917
 918                         pcp = &pset->pcp[i];
 919                         local_irq_save(flags);
 920                         free_pages_bulk(zone, pcp->count, &pcp->list, 0);
 921                         pcp->count = 0;
 922                         local_irq_restore(flags);
 923                 }
 924         }
 925 }
 926
 927 #ifdef CONFIG_HIBERNATION
 928
     /*
      * Refresh the swsusp "free" marking of the pages in 'zone'.
      *
      * Does nothing for a zone with no spanned pages.  Otherwise, with
      * zone->lock held and interrupts saved:
      *  1) for every valid pfn in the zone's span, clear the free mark
      *     unless the page is flagged as forbidden;
      *  2) for every block on the zone's buddy free lists (all orders and
      *     migrate types), set the free mark on each of its 1 << order pages.
      */
 929 void mark_free_pages(struct zone *zone)
 930 {
 931         unsigned long pfn, max_zone_pfn;
 932         unsigned long flags;
 933         int order, t;
 934         struct list_head *curr;
 935
 936         if (!zone->spanned_pages)
 937                 return;
 938
 939         spin_lock_irqsave(&zone->lock, flags);
 940
             /* Pass 1: clear the free mark across the whole span */
 941         max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 942         for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 943                 if (pfn_valid(pfn)) {
 944                         struct page *page = pfn_to_page(pfn);
 945
 946                         if (!swsusp_page_is_forbidden(page))
 947                                 swsusp_unset_page_free(page);
 948                 }
 949
             /* Pass 2: mark every page that sits on a buddy free list */
 950         for_each_migratetype_order(order, t) {
 951                 list_for_each(curr, &zone->free_area[order].free_list[t]) {
 952                         unsigned long i;
 953
 954                         pfn = page_to_pfn(list_entry(curr, struct page, lru));
 955                         for (i = 0; i < (1UL << order); i++)
 956                                 swsusp_set_page_free(pfn_to_page(pfn + i));
 957                 }
 958         }
 959         spin_unlock_irqrestore(&zone->lock, flags);
 960 }
 961 #endif /* CONFIG_HIBERNATION */
 962
 963 /*
 964  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
      *
      * Calls __drain_pages() for smp_processor_id() between local_irq_save()
      * and local_irq_restore().
 965  */
 966 void drain_local_pages(void)
 967 {
 968         unsigned long flags;
 969
 970         local_irq_save(flags);
 971         __drain_pages(smp_processor_id());
 972         local_irq_restore(flags);
 973 }
 974
     /*
      * Wrapper with a void * argument so that drain_local_pages() can be
      * handed to smp_call_function(); 'arg' is ignored.
      */
 975 void smp_drain_local_pages(void *arg)
 976 {
 977         drain_local_pages();
 978 }
 979
 980 /*
 981  * Spill all the per-cpu pages from all CPUs back into the buddy allocator
      *
      * The calling CPU's lists are drained directly via __drain_pages();
      * smp_drain_local_pages() is then passed to smp_call_function() to
      * cover the other CPUs.
 982  */
 983 void drain_all_local_pages(void)
 984 {
 985         unsigned long flags;
 986
             /* Drain this CPU first, with local interrupts disabled */
 987         local_irq_save(flags);
 988         __drain_pages(smp_processor_id());
 989         local_irq_restore(flags);
 990
 991         smp_call_function(smp_drain_local_pages, NULL, 0, 1);
 992 }
 993
 994 /*
 995  * Free a 0-order page
      *
      * 'cold' selects which of the zone's per-cpu lists (pcp[cold]) receives
      * the page.  Nothing is queued if free_pages_check() returns non-zero.
      * Once the list's count reaches pcp->high, pcp->batch pages are passed
      * to free_pages_bulk().
 996  */
 997 static void fastcall free_hot_cold_page(struct page *page, int cold)
 998 {
 999         struct zone *zone = page_zone(page);
1000         struct per_cpu_pages *pcp;
1001         unsigned long flags;
1002
1003         if (PageAnon(page))
1004                 page->mapping = NULL;
1005         if (free_pages_check(page))
1006                 return;
1007
1008         if (!PageHighMem(page))
1009                 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
1010         arch_free_page(page, 0);
1011         kernel_map_pages(page, 1, 0);
1012
             /* get_cpu() here is paired with put_cpu() at the end */
1013         pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
1014         local_irq_save(flags);
1015         __count_vm_event(PGFREE);
1016         list_add(&page->lru, &pcp->list);
             /* Cache the pageblock's migratetype in the page's private field */
1017         set_page_private(page, get_pageblock_migratetype(page));
1018         pcp->count++;
1019         if (pcp->count >= pcp->high) {
1020                 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
1021                 pcp->count -= pcp->batch;
1022         }
1023         local_irq_restore(flags);
1024         put_cpu();
1025 }
1026
     /* Free a 0-order page through free_hot_cold_page() with cold == 0. */
1027 void fastcall free_hot_page(struct page *page)
1028 {
1029         free_hot_cold_page(page, 0);
1030 }
1031
     /* Free a 0-order page through free_hot_cold_page() with cold == 1. */
1032 void fastcall free_cold_page(struct page *page)
1033 {
1034         free_hot_cold_page(page, 1);
1035 }
1036
1037 /*
1038  * split_page takes a non-compound higher-order page, and splits it into
1039  * n (1<<order) sub-pages: page[0..n-1]
1040  * Each sub-page must be freed individually.
1041  *
1042  * Note: this is probably too low level an operation for use in drivers.
1043  * Please consult with lkml before using this in your driver.
      *
      * The first sub-page must already hold a reference (checked with
      * VM_BUG_ON); set_page_refcounted() is applied to sub-pages 1..n-1.
1044  */
1045 void split_page(struct page *page, unsigned int order)
1046 {
1047         int i;
1048
1049         VM_BUG_ON(PageCompound(page));
1050         VM_BUG_ON(!page_count(page));
             /* Index 0 is skipped: only the remaining sub-pages are touched */
1051         for (i = 1; i < (1 << order); i++)
1052                 set_page_refcounted(page + i);
1053 }
1054
1055 /*
1056  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
1057  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
1058  * or two.
      *
      * Take one block of 2^order pages from 'zone'.
      *
      * order == 0: served from the current CPU's per-cpu list pcp[cold],
      * where cold reflects __GFP_COLD.  The list is refilled through
      * rmqueue_bulk() when empty.
      * order  > 0: served directly by __rmqueue() under zone->lock.
      *
      * Returns the page, or NULL if nothing could be obtained.  If
      * prep_new_page() returns non-zero for the page, the whole attempt is
      * repeated from the top.
1059  */
1060 static struct page *buffered_rmqueue(struct zonelist *zonelist,
1061                         struct zone *zone, int order, gfp_t gfp_flags)
1062 {
1063         unsigned long flags;
1064         struct page *page;
1065         int cold = !!(gfp_flags & __GFP_COLD);
1066         int cpu;
1067         int migratetype = allocflags_to_migratetype(gfp_flags);
1068
1069 again:
             /* Every exit below pairs this get_cpu() with a put_cpu() */
1070         cpu  = get_cpu();
1071         if (likely(order == 0)) {
1072                 struct per_cpu_pages *pcp;
1073
1074                 pcp = &zone_pcp(zone, cpu)->pcp[cold];
1075                 local_irq_save(flags);
1076                 if (!pcp->count) {
1077                         pcp->count = rmqueue_bulk(zone, 0,
1078                                         pcp->batch, &pcp->list, migratetype);
1079                         if (unlikely(!pcp->count))
1080                                 goto failed;
1081                 }
1082
1083                 /* Find a page of the appropriate migrate type */
1084                 list_for_each_entry(page, &pcp->list, lru)
1085                         if (page_private(page) == migratetype)
1086                                 break;
1087
1088                 /* Allocate more to the pcp list if necessary */
                     /*
                      * The loop above ran off the end of the list: no queued
                      * page matched.  Refill, then take the list head.
                      */
1089                 if (unlikely(&page->lru == &pcp->list)) {
1090                         pcp->count += rmqueue_bulk(zone, 0,
1091                                         pcp->batch, &pcp->list, migratetype);
1092                         page = list_entry(pcp->list.next, struct page, lru);
1093                 }
1094
1095                 list_del(&page->lru);
1096                 pcp->count--;
1097         } else {
                     /*
                      * The lock is dropped with plain spin_unlock(): interrupts
                      * stay disabled until the local_irq_restore() further down
                      * (or at 'failed').
                      */
1098                 spin_lock_irqsave(&zone->lock, flags);
1099                 page = __rmqueue(zone, order, migratetype);
1100                 spin_unlock(&zone->lock);
1101                 if (!page)
1102                         goto failed;
1103         }
1104
1105         __count_zone_vm_events(PGALLOC, zone, 1 << order);
1106         zone_statistics(zonelist, zone);
1107         local_irq_restore(flags);
1108         put_cpu();
1109
1110         VM_BUG_ON(bad_range(zone, page));
1111         if (prep_new_page(page, order, gfp_flags))
1112                 goto again;
1113         return page;
1114
1115 failed:
1116         local_irq_restore(flags);
1117         put_cpu();
1118         return NULL;
1119 }
1120
     /*
      * Bit flags for the 'alloc_flags' argument of get_page_from_freelist()
      * and zone_watermark_ok().
      */
1121 #define ALLOC_NO_WATERMARKS     0x01 /* don't check watermarks at all */
1122 #define ALLOC_WMARK_MIN         0x02 /* use pages_min watermark */
1123 #define ALLOC_WMARK_LOW         0x04 /* use pages_low watermark */
1124 #define ALLOC_WMARK_HIGH        0x08 /* use pages_high watermark */
1125 #define ALLOC_HARDER            0x10 /* try to alloc harder */
1126 #define ALLOC_HIGH              0x20 /* __GFP_HIGH set */
1127 #define ALLOC_CPUSET            0x40 /* check for correct cpuset */
1128
1129 #ifdef CONFIG_FAIL_PAGE_ALLOC
1130
     /*
      * State for page-allocation fault injection.  The three u32 knobs are
      * consulted by should_fail_alloc_page(); the dentry pointers hold the
      * debugfs files that expose them.
      */
1131 static struct fail_page_alloc_attr {
1132         struct fault_attr attr;
1133
1134         u32 ignore_gfp_highmem; /* non-zero: never fail __GFP_HIGHMEM requests */
1135         u32 ignore_gfp_wait;    /* non-zero: never fail __GFP_WAIT requests */
1136         u32 min_order;          /* requests below this order never fail */
1137
1138 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1139
1140         struct dentry *ignore_gfp_highmem_file;
1141         struct dentry *ignore_gfp_wait_file;
1142         struct dentry *min_order_file;
1143
1144 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1145
1146 } fail_page_alloc = {
1147         .attr = FAULT_ATTR_INITIALIZER,
1148         .ignore_gfp_wait = 1,
1149         .ignore_gfp_highmem = 1,
1150         .min_order = 1,
1151 };
1152
     /*
      * Handler for the "fail_page_alloc=" boot parameter: passes the option
      * string to setup_fault_attr() for fail_page_alloc.attr.
      */
1153 static int __init setup_fail_page_alloc(char *str)
1154 {
1155         return setup_fault_attr(&fail_page_alloc.attr, str);
1156 }
1157 __setup("fail_page_alloc=", setup_fail_page_alloc);
1158
     /*
      * Decide whether this allocation should be failed artificially.
      *
      * Returns 0 (do not fail) when the order is below min_order, when
      * __GFP_NOFAIL is set, or when the request is __GFP_HIGHMEM or
      * __GFP_WAIT and the matching ignore_* knob is non-zero.  Otherwise
      * returns the verdict of should_fail() for 1 << order pages.
      */
1159 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1160 {
1161         if (order < fail_page_alloc.min_order)
1162                 return 0;
1163         if (gfp_mask & __GFP_NOFAIL)
1164                 return 0;
1165         if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1166                 return 0;
1167         if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1168                 return 0;
1169
1170         return should_fail(&fail_page_alloc.attr, 1 << order);
1171 }
1172
1173 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1174
     /*
      * Create the debugfs files "ignore-gfp-wait", "ignore-gfp-highmem" and
      * "min-order" in the fault attribute's debugfs directory.
      *
      * Returns the error from init_fault_attr_dentries() if that fails.
      * Returns -ENOMEM if any of the three files could not be created; in
      * that case all three are removed again and the attribute's dentries
      * are cleaned up.  Returns 0 otherwise.
      */
1175 static int __init fail_page_alloc_debugfs(void)
1176 {
1177         mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1178         struct dentry *dir;
1179         int err;
1180
1181         err = init_fault_attr_dentries(&fail_page_alloc.attr,
1182                                        "fail_page_alloc");
1183         if (err)
1184                 return err;
1185         dir = fail_page_alloc.attr.dentries.dir;
1186
1187         fail_page_alloc.ignore_gfp_wait_file =
1188                 debugfs_create_bool("ignore-gfp-wait", mode, dir,
1189                                       &fail_page_alloc.ignore_gfp_wait);
1190
1191         fail_page_alloc.ignore_gfp_highmem_file =
1192                 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1193                                       &fail_page_alloc.ignore_gfp_highmem);
1194         fail_page_alloc.min_order_file =
1195                 debugfs_create_u32("min-order", mode, dir,
1196                                    &fail_page_alloc.min_order);
1197
             /* All-or-nothing: undo everything if any one file is missing */
1198         if (!fail_page_alloc.ignore_gfp_wait_file ||
1199             !fail_page_alloc.ignore_gfp_highmem_file ||
1200             !fail_page_alloc.min_order_file) {
1201                 err = -ENOMEM;
1202                 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1203                 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1204                 debugfs_remove(fail_page_alloc.min_order_file);
1205                 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1206         }
1207
1208         return err;
1209 }
1210
1211 late_initcall(fail_page_alloc_debugfs);
1212
1213 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1214
1215 #else /* CONFIG_FAIL_PAGE_ALLOC */
1216
     /* Without CONFIG_FAIL_PAGE_ALLOC no allocation is ever failed on purpose. */
1217 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1218 {
1219         return 0;
1220 }
1221
1222 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1223
1224 /*
1225  * Return 1 if free pages are above 'mark'. This takes into account the order
1226  * of the allocation.
      *
      * The threshold starts at 'mark'.  ALLOC_HIGH lowers it by half, and
      * ALLOC_HARDER lowers the result by a further quarter.  The zone's
      * lowmem_reserve[classzone_idx] is added for the first comparison.
      * For order > 0, the pages held in each lower-order free area are
      * subtracted in turn and the threshold is halved at every step.
      * Returns 0 as soon as any comparison fails.
1227  */
1228 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1229                       int classzone_idx, int alloc_flags)
1230 {
1231         /* free_pages may go negative - that's OK */
1232         long min = mark;
1233         long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
1234         int o;
1235
1236         if (alloc_flags & ALLOC_HIGH)
1237                 min -= min / 2;
1238         if (alloc_flags & ALLOC_HARDER)
1239                 min -= min / 4;
1240
1241         if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1242                 return 0;
1243         for (o = 0; o < order; o++) {
1244                 /* At the next order, this order's pages become unavailable */
1245                 free_pages -= z->free_area[o].nr_free << o;
1246
1247                 /* Require fewer higher order pages to be free */
1248                 min >>= 1;
1249
1250                 if (free_pages <= min)
1251                         return 0;
1252         }
1253         return 1;
1254 }
1255
1256 #ifdef CONFIG_NUMA
1257 /*
1258  * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
1259  * skip over zones that are not allowed by the cpuset, or that have
1260  * been recently (in last second) found to be nearly full.  See further
1261  * comments in mmzone.h.  Reduces cache footprint of zonelist scans
1262  * that have to skip over a lot of full or unallowed zones.
1263  *
1264  * If the zonelist cache is present in the passed in zonelist, then
1265  * returns a pointer to the allowed node mask (either the current
1266  * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1267  *
1268  * If the zonelist cache is not available for this zonelist, does
1269  * nothing and returns NULL.
1270  *
1271  * If the fullzones BITMAP in the zonelist cache is stale (more than
1272  * a second since last zap'd) then we zap it out (clear its bits.)
1273  *
1274  * We hold off even calling zlc_setup, until after we've checked the
1275  * first zone in the zonelist, on the theory that most allocations will
1276  * be satisfied from that first zone, so best to examine that zone as
1277  * quickly as we can.
1278  */
1279 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1280 {
1281         struct zonelist_cache *zlc;     /* cached zonelist speedup info */
1282         nodemask_t *allowednodes;       /* zonelist_cache approximation */
1283
1284         zlc = zonelist->zlcache_ptr;
1285         if (!zlc)
1286                 return NULL;
1287
             /* More than HZ jiffies since the last zap: clear the bitmap */
1288         if (jiffies - zlc->last_full_zap > 1 * HZ) {
1289                 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1290                 zlc->last_full_zap = jiffies;
1291         }
1292
             /*
              * The task's cpuset mask is used only outside interrupt context
              * and only when ALLOC_CPUSET is requested.
              */
1293         allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1294                                         &cpuset_current_mems_allowed :
1295                                         &node_states[N_HIGH_MEMORY];
1296         return allowednodes;
1297 }
1298
1299 /*
1300  * Given 'z' scanning a zonelist, run a couple of quick checks to see
1301  * if it is worth looking at further for free memory:
1302  *  1) Check that the zone isn't thought to be full (doesn't have its
1303  *     bit set in the zonelist_cache fullzones BITMAP).
1304  *  2) Check that the zones node (obtained from the zonelist_cache
1305  *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1306  * Return true (non-zero) if zone is worth looking at further, or
1307  * else return false (zero) if it is not.
1308  *
1309  * This check -ignores- the distinction between various watermarks,
1310  * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
1311  * found to be full for any variation of these watermarks, it will
1312  * be considered full for up to one second by all requests, unless
1313  * we are so low on memory on all allowed nodes that we are forced
1314  * into the second scan of the zonelist.
1315  *
1316  * In the second scan we ignore this zonelist cache and exactly
1317  * apply the watermarks to all zones, even if it is slower to do so.
1318  * We are low on memory in the second scan, and should leave no stone
1319  * unturned looking for a free page.
      *
      * A zonelist without a zonelist cache always yields 1.
1320  */
1321 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1322                                                 nodemask_t *allowednodes)
1323 {
1324         struct zonelist_cache *zlc;     /* cached zonelist speedup info */
1325         int i;                          /* index of *z in zonelist zones */
1326         int n;                          /* node that zone *z is on */
1327
1328         zlc = zonelist->zlcache_ptr;
1329         if (!zlc)
1330                 return 1;
1331
1332         i = z - zonelist->zones;
1333         n = zlc->z_to_n[i];
1334
1335         /* This zone is worth trying if it is allowed but not full */
1336         return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1337 }
1338
1339 /*
1340  * Given 'z' scanning a zonelist, set the corresponding bit in
1341  * zlc->fullzones, so that subsequent attempts to allocate a page
1342  * from that zone don't waste time re-examining it.
      *
      * Does nothing when the zonelist has no zonelist cache.
1343  */
1344 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1345 {
1346         struct zonelist_cache *zlc;     /* cached zonelist speedup info */
1347         int i;                          /* index of *z in zonelist zones */
1348
1349         zlc = zonelist->zlcache_ptr;
1350         if (!zlc)
1351                 return;
1352
1353         i = z - zonelist->zones;
1354
1355         set_bit(i, zlc->fullzones);
1356 }
1357
1358 #else   /* CONFIG_NUMA */
1359
     /* !CONFIG_NUMA stub: there is no zonelist cache, so always return NULL. */
1360 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1361 {
1362         return NULL;
1363 }
1364
     /* !CONFIG_NUMA stub: every zone is reported as worth trying. */
1365 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1366                                 nodemask_t *allowednodes)
1367 {
1368         return 1;
1369 }
1370
     /* !CONFIG_NUMA stub: nothing to record. */
1371 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1372 {
1373 }
1374 #endif  /* CONFIG_NUMA */
1375
1376 /*
1377  * get_page_from_freelist goes through the zonelist trying to allocate
1378  * a page.
      *
      * For each zone the cpuset check (if ALLOC_CPUSET) and the watermark
      * check (unless ALLOC_NO_WATERMARKS) are applied before
      * buffered_rmqueue() is called.  The watermark used is pages_min,
      * pages_low or pages_high according to the ALLOC_WMARK_* flag.
      *
      * On NUMA builds the zonelist cache is set up after the first zone has
      * been tried.  If the scan finds nothing while the cache is active,
      * the zonelist is scanned once more with the cache disabled.
      *
      * Returns the allocated page, or NULL if no zone could supply one.
1379  */
1380 static struct page *
1381 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
1382                 struct zonelist *zonelist, int alloc_flags)
1383 {
1384         struct zone **z;
1385         struct page *page = NULL;
1386         int classzone_idx = zone_idx(zonelist->zones[0]);
1387         struct zone *zone;
1388         nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1389         int zlc_active = 0;             /* set if using zonelist_cache */
1390         int did_zlc_setup = 0;          /* just call zlc_setup() one time */
1391         enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
1392
1393 zonelist_scan:
1394         /*
1395          * Scan zonelist, looking for a zone with enough free.
1396          * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1397          */
1398         z = zonelist->zones;
1399
1400         do {
1401                 /*
1402                  * In NUMA, this could be a policy zonelist which contains
1403                  * zones that may not be allowed by the current gfp_mask.
1404                  * Check the zone is allowed by the current flags
1405                  */
1406                 if (unlikely(alloc_should_filter_zonelist(zonelist))) {
                             /* Computed lazily, on the first filtered zone only */
1407                         if (highest_zoneidx == -1)
1408                                 highest_zoneidx = gfp_zone(gfp_mask);
1409                         if (zone_idx(*z) > highest_zoneidx)
1410                                 continue;
1411                 }
1412
1413                 if (NUMA_BUILD && zlc_active &&
1414                         !zlc_zone_worth_trying(zonelist, z, allowednodes))
1415                                 continue;
1416                 zone = *z;
1417                 if ((alloc_flags & ALLOC_CPUSET) &&
1418                         !cpuset_zone_allowed_softwall(zone, gfp_mask))
1419                                 goto try_next_zone;
1420
1421                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1422                         unsigned long mark;
1423                         if (alloc_flags & ALLOC_WMARK_MIN)
1424                                 mark = zone->pages_min;
1425                         else if (alloc_flags & ALLOC_WMARK_LOW)
1426                                 mark = zone->pages_low;
1427                         else
1428                                 mark = zone->pages_high;
                             /*
                              * Below the watermark: the zone is still used if
                              * zone reclaim is enabled and zone_reclaim()
                              * returns non-zero; otherwise it is marked full.
                              */
1429                         if (!zone_watermark_ok(zone, order, mark,
1430                                     classzone_idx, alloc_flags)) {
1431                                 if (!zone_reclaim_mode ||
1432                                     !zone_reclaim(zone, gfp_mask, order))
1433                                         goto this_zone_full;
1434                         }
1435                 }
1436
1437                 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
1438                 if (page)
1439                         break;
1440 this_zone_full:
1441                 if (NUMA_BUILD)
1442                         zlc_mark_zone_full(zonelist, z);
1443 try_next_zone:
1444                 if (NUMA_BUILD && !did_zlc_setup) {
1445                         /* we do zlc_setup after the first zone is tried */
1446                         allowednodes = zlc_setup(zonelist, alloc_flags);
1447                         zlc_active = 1;
1448                         did_zlc_setup = 1;
1449                 }
1450         } while (*(++z) != NULL);
1451
1452         if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1453                 /* Disable zlc cache for second zonelist scan */
1454                 zlc_active = 0;
1455                 goto zonelist_scan;
1456         }
1457         return page;
1458 }
1459
1460 /*
1461  * This is the 'heart' of the zoned buddy allocator.
      *
      * Allocate a block of 2^order pages from the zones in 'zonelist'.
      * Returns the page, or NULL on failure.  The attempts are made in this
      * order:
      *  1) pages_low watermark, cpuset enforced with __GFP_HARDWALL;
      *  2) after waking kswapd on every zone: pages_min watermark, with
      *     ALLOC_HARDER / ALLOC_HIGH / ALLOC_CPUSET chosen from the caller's
      *     context and gfp_mask;
      *  3) for PF_MEMALLOC or TIF_MEMDIE tasks outside interrupt context:
      *     no watermarks at all (unless __GFP_NOMEMALLOC);
      *  4) for callers that may wait: direct reclaim through
      *     try_to_free_pages(), then another attempt;
      *  5) if reclaim made no progress and __GFP_FS is set without
      *     __GFP_NORETRY: one pages_high attempt, then out_of_memory() and a
      *     restart (not for order > PAGE_ALLOC_COSTLY_ORDER);
      *  6) otherwise wait for congestion and go back to 3), subject to
      *     __GFP_NORETRY, __GFP_REPEAT, __GFP_NOFAIL and the order.
      * On failure a rate-limited warning is printed unless __GFP_NOWARN is
      * set.
1462  */
1463 struct page * fastcall
1464 __alloc_pages(gfp_t gfp_mask, unsigned int order,
1465                 struct zonelist *zonelist)
1466 {
1467         const gfp_t wait = gfp_mask & __GFP_WAIT;
1468         struct zone **z;
1469         struct page *page;
1470         struct reclaim_state reclaim_state;
1471         struct task_struct *p = current;
1472         int do_retry;
1473         int alloc_flags;
1474         int did_some_progress;
1475
1476         might_sleep_if(wait);
1477
             /* Fault injection hook; always 0 without CONFIG_FAIL_PAGE_ALLOC */
1478         if (should_fail_alloc_page(gfp_mask, order))
1479                 return NULL;
1480
1481 restart:
1482         z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
1483
1484         if (unlikely(*z == NULL)) {
1485                 /*
1486                  * Happens if we have an empty zonelist as a result of
1487                  * GFP_THISNODE being used on a memoryless node
1488                  */
1489                 return NULL;
1490         }
1491
1492         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1493                                 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1494         if (page)
1495                 goto got_pg;
1496
1497         /*
1498          * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1499          * __GFP_NOWARN set) should not cause reclaim since the subsystem
1500          * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
1501          * using a larger set of nodes after it has established that the
1502          * allowed per node queues are empty and that nodes are
1503          * over allocated.
1504          */
1505         if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1506                 goto nopage;
1507
1508         for (z = zonelist->zones; *z; z++)
1509                 wakeup_kswapd(*z, order);
1510
1511         /*
1512          * OK, we're below the kswapd watermark and have kicked background
1513          * reclaim. Now things get more complex, so set up alloc_flags according
1514          * to how we want to proceed.
1515          *
1516          * The caller may dip into page reserves a bit more if the caller
1517          * cannot run direct reclaim, or if the caller has realtime scheduling
1518          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
1519          * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1520          */
1521         alloc_flags = ALLOC_WMARK_MIN;
1522         if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1523                 alloc_flags |= ALLOC_HARDER;
1524         if (gfp_mask & __GFP_HIGH)
1525                 alloc_flags |= ALLOC_HIGH;
1526         if (wait)
1527                 alloc_flags |= ALLOC_CPUSET;
1528
1529         /*
1530          * Go through the zonelist again. Let __GFP_HIGH and allocations
1531          * coming from realtime tasks go deeper into reserves.
1532          *
1533          * This is the last chance, in general, before the goto nopage.
1534          * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1535          * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1536          */
1537         page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
1538         if (page)
1539                 goto got_pg;
1540
1541         /* This allocation should allow future memory freeing. */
1542
1543 rebalance:
1544         if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1545                         && !in_interrupt()) {
1546                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1547 nofail_alloc:
1548                         /* go through the zonelist yet again, ignoring mins */
1549                         page = get_page_from_freelist(gfp_mask, order,
1550                                 zonelist, ALLOC_NO_WATERMARKS);
1551                         if (page)
1552                                 goto got_pg;
                             /* __GFP_NOFAIL: wait and retry until a page is found */
1553                         if (gfp_mask & __GFP_NOFAIL) {
1554                                 congestion_wait(WRITE, HZ/50);
1555                                 goto nofail_alloc;
1556                         }
1557                 }
1558                 goto nopage;
1559         }
1560
1561         /* Atomic allocations - we can't balance anything */
1562         if (!wait)
1563                 goto nopage;
1564
1565         cond_resched();
1566
1567         /* We now go into synchronous reclaim */
1568         cpuset_memory_pressure_bump();
             /* PF_MEMALLOC is set only for the duration of try_to_free_pages() */
1569         p->flags |= PF_MEMALLOC;
1570         reclaim_state.reclaimed_slab = 0;
1571         p->reclaim_state = &reclaim_state;
1572
1573         did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
1574
1575         p->reclaim_state = NULL;
1576         p->flags &= ~PF_MEMALLOC;
1577
1578         cond_resched();
1579
             /* Higher-order request: flush the per-cpu lists back to the buddy lists */
1580         if (order != 0)
1581                 drain_all_local_pages();
1582
1583         if (likely(did_some_progress)) {
1584                 page = get_page_from_freelist(gfp_mask, order,
1585                                                 zonelist, alloc_flags);
1586                 if (page)
1587                         goto got_pg;
1588         } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1589                 /*
1590                  * Go through the zonelist yet one more time, keep
1591                  * very high watermark here, this is only to catch
1592                  * a parallel oom killing, we must fail if we're still
1593                  * under heavy pressure.
1594                  */
1595                 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1596                                 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1597                 if (page)
1598                         goto got_pg;
1599
1600                 /* The OOM killer will not help higher order allocs so fail */
1601                 if (order > PAGE_ALLOC_COSTLY_ORDER)
1602                         goto nopage;
1603
1604                 out_of_memory(zonelist, gfp_mask, order);
1605                 goto restart;
1606         }
1607
1608         /*
1609          * Don't let big-order allocations loop unless the caller explicitly
1610          * requests that.  Wait for some write requests to complete then retry.
1611          *
1612          * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
1613          * <= 3, but that may not be true in other implementations.
1614          */
1615         do_retry = 0;
1616         if (!(gfp_mask & __GFP_NORETRY)) {
1617                 if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
1618                                                 (gfp_mask & __GFP_REPEAT))
1619                         do_retry = 1;
1620                 if (gfp_mask & __GFP_NOFAIL)
1621                         do_retry = 1;
1622         }
1623         if (do_retry) {
1624                 congestion_wait(WRITE, HZ/50);
1625                 goto rebalance;
1626         }
1627
1628 nopage:
             /* page is NULL on every path that reaches this label */
1629         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1630                 printk(KERN_WARNING "%s: page allocation failure."
1631                         " order:%d, mode:0x%x\n",
1632                         p->comm, order, gfp_mask);
1633                 dump_stack();
1634                 show_mem();
1635         }
1636 got_pg:
1637         return page;
1638 }
1639
1640 EXPORT_SYMBOL(__alloc_pages);
1641
1642 /*
1643  * Common helper functions.
1644  */
1645 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1646 {
1647         struct page * page;
1648         page = alloc_pages(gfp_mask, order);
1649         if (!page)
1650                 return 0;
1651         return (unsigned long) page_address(page);
1652 }
1653
1654 EXPORT_SYMBOL(__get_free_pages);
1655
1656 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1657 {
1658         struct page * page;
1659
1660         /*
1661          * get_zeroed_page() returns a 32-bit address, which cannot represent
1662          * a highmem page
1663          */
1664         VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1665
1666         page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1667         if (page)
1668                 return (unsigned long) page_address(page);
1669         return 0;
1670 }
1671
1672 EXPORT_SYMBOL(get_zeroed_page);
1673
1674 void __pagevec_free(struct pagevec *pvec)
1675 {
1676         int i = pagevec_count(pvec);
1677
1678         while (--i >= 0)
1679                 free_hot_cold_page(pvec->pages[i], pvec->cold);
1680 }
1681
1682 fastcall void __free_pages(struct page *page, unsigned int order)
1683 {
1684         if (put_page_testzero(page)) {
1685                 if (order == 0)
1686                         free_hot_page(page);
1687                 else
1688                         __free_pages_ok(page, order);
1689         }
1690 }
1691
1692 EXPORT_SYMBOL(__free_pages);
1693
1694 fastcall void free_pages(unsigned long addr, unsigned int order)
1695 {
1696         if (addr != 0) {
1697                 VM_BUG_ON(!virt_addr_valid((void *)addr));
1698                 __free_pages(virt_to_page((void *)addr), order);
1699         }
1700 }
1701
1702 EXPORT_SYMBOL(free_pages);
1703
1704 static unsigned int nr_free_zone_pages(int offset)
1705 {
1706         /* Just pick one node, since fallback list is circular */
1707         pg_data_t *pgdat = NODE_DATA(numa_node_id());
1708         unsigned int sum = 0;
1709
1710         struct zonelist *zonelist = pgdat->node_zonelists + offset;
1711         struct zone **zonep = zonelist->zones;
1712         struct zone *zone;
1713
1714         for (zone = *zonep++; zone; zone = *zonep++) {
1715                 unsigned long size = zone->present_pages;
1716                 unsigned long high = zone->pages_high;
1717                 if (size > high)
1718                         sum += size - high;
1719         }
1720
1721         return sum;
1722 }
1723
1724 /*
1725  * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1726  */
1727 unsigned int nr_free_buffer_pages(void)
1728 {
1729         return nr_free_zone_pages(gfp_zone(GFP_USER));
1730 }
1731 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
1732
1733 /*
1734  * Amount of free RAM allocatable within all zones
1735  */
1736 unsigned int nr_free_pagecache_pages(void)
1737 {
1738         return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
1739 }
1740
1741 static inline void show_node(struct zone *zone)
1742 {
1743         if (NUMA_BUILD)
1744                 printk("Node %d ", zone_to_nid(zone));
1745 }
1746
1747 void si_meminfo(struct sysinfo *val)
1748 {
1749         val->totalram = totalram_pages;
1750         val->sharedram = 0;
1751         val->freeram = global_page_state(NR_FREE_PAGES);
1752         val->bufferram = nr_blockdev_pages();
1753         val->totalhigh = totalhigh_pages;
1754         val->freehigh = nr_free_highpages();
1755         val->mem_unit = PAGE_SIZE;
1756 }
1757
1758 EXPORT_SYMBOL(si_meminfo);
1759
1760 #ifdef CONFIG_NUMA
1761 void si_meminfo_node(struct sysinfo *val, int nid)
1762 {
1763         pg_data_t *pgdat = NODE_DATA(nid);
1764
1765         val->totalram = pgdat->node_present_pages;
1766         val->freeram = node_page_state(nid, NR_FREE_PAGES);
1767 #ifdef CONFIG_HIGHMEM
1768         val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1769         val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
1770                         NR_FREE_PAGES);
1771 #else
1772         val->totalhigh = 0;
1773         val->freehigh = 0;
1774 #endif
1775         val->mem_unit = PAGE_SIZE;
1776 }
1777 #endif
1778
1779 #define K(x) ((x) << (PAGE_SHIFT-10))
1780
1781 /*
1782  * Show free area list (used inside shift_scroll-lock stuff)
1783  * We also calculate the percentage fragmentation. We do this by counting the
1784  * memory on each free list with the exception of the first item on the list.
1785  */
1786 void show_free_areas(void)
1787 {
1788         int cpu;
1789         struct zone *zone;
1790
1791         for_each_zone(zone) {
1792                 if (!populated_zone(zone))
1793                         continue;
1794
1795                 show_node(zone);
1796                 printk("%s per-cpu:\n", zone->name);
1797
1798                 for_each_online_cpu(cpu) {
1799                         struct per_cpu_pageset *pageset;
1800
1801                         pageset = zone_pcp(zone, cpu);
1802
1803                         printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d   "
1804                                "Cold: hi:%5d, btch:%4d usd:%4d\n",
1805                                cpu, pageset->pcp[0].high,
1806                                pageset->pcp[0].batch, pageset->pcp[0].count,
1807                                pageset->pcp[1].high, pageset->pcp[1].batch,
1808                                pageset->pcp[1].count);
1809                 }
1810         }
1811
1812         printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
1813                 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1814                 global_page_state(NR_ACTIVE),
1815                 global_page_state(NR_INACTIVE),
1816                 global_page_state(NR_FILE_DIRTY),
1817                 global_page_state(NR_WRITEBACK),
1818                 global_page_state(NR_UNSTABLE_NFS),
1819                 global_page_state(NR_FREE_PAGES),
1820                 global_page_state(NR_SLAB_RECLAIMABLE) +
1821                         global_page_state(NR_SLAB_UNRECLAIMABLE),
1822                 global_page_state(NR_FILE_MAPPED),
1823                 global_page_state(NR_PAGETABLE),
1824                 global_page_state(NR_BOUNCE));
1825
1826         for_each_zone(zone) {
1827                 int i;
1828
1829                 if (!populated_zone(zone))
1830                         continue;
1831
1832                 show_node(zone);
1833                 printk("%s"
1834                         " free:%lukB"
1835                         " min:%lukB"
1836                         " low:%lukB"
1837                         " high:%lukB"
1838                         " active:%lukB"
1839                         " inactive:%lukB"
1840                         " present:%lukB"
1841                         " pages_scanned:%lu"
1842                         " all_unreclaimable? %s"
1843                         "\n",
1844                         zone->name,
1845                         K(zone_page_state(zone, NR_FREE_PAGES)),
1846                         K(zone->pages_min),
1847                         K(zone->pages_low),
1848                         K(zone->pages_high),
1849                         K(zone_page_state(zone, NR_ACTIVE)),
1850                         K(zone_page_state(zone, NR_INACTIVE)),
1851                         K(zone->present_pages),
1852                         zone->pages_scanned,
1853                         (zone->all_unreclaimable ? "yes" : "no")
1854                         );
1855                 printk("lowmem_reserve[]:");
1856                 for (i = 0; i < MAX_NR_ZONES; i++)
1857                         printk(" %lu", zone->lowmem_reserve[i]);
1858                 printk("\n");
1859         }
1860
1861         for_each_zone(zone) {
1862                 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1863
1864                 if (!populated_zone(zone))
1865                         continue;
1866
1867                 show_node(zone);
1868                 printk("%s: ", zone->name);
1869
1870                 spin_lock_irqsave(&zone->lock, flags);
1871                 for (order = 0; order < MAX_ORDER; order++) {
1872                         nr[order] = zone->free_area[order].nr_free;
1873                         total += nr[order] << order;
1874                 }
1875                 spin_unlock_irqrestore(&zone->lock, flags);
1876                 for (order = 0; order < MAX_ORDER; order++)
1877                         printk("%lu*%lukB ", nr[order], K(1UL) << order);
1878                 printk("= %lukB\n", K(total));
1879         }
1880
1881         show_swap_cache_info();
1882 }
1883
1884 /*
1885  * Builds allocation fallback zone lists.
1886  *
1887  * Add all populated zones of a node to the zonelist.
1888  */
1889 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
1890                                 int nr_zones, enum zone_type zone_type)
1891 {
1892         struct zone *zone;
1893
1894         BUG_ON(zone_type >= MAX_NR_ZONES);
1895         zone_type++;
1896
1897         do {
1898                 zone_type--;
1899                 zone = pgdat->node_zones + zone_type;
1900                 if (populated_zone(zone)) {
1901                         zonelist->zones[nr_zones++] = zone;
1902                         check_highest_zone(zone_type);
1903                 }
1904
1905         } while (zone_type);
1906         return nr_zones;
1907 }
1908
1909
1910 /*
1911  *  zonelist_order:
1912  *  0 = automatic detection of better ordering.
1913  *  1 = order by ([node] distance, -zonetype)
1914  *  2 = order by (-zonetype, [node] distance)
1915  *
1916  *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
1917  *  the same zonelist. So only NUMA can configure this param.
1918  */
1919 #define ZONELIST_ORDER_DEFAULT  0
1920 #define ZONELIST_ORDER_NODE     1
1921 #define ZONELIST_ORDER_ZONE     2
1922
1923 /* zonelist order in the kernel.
1924  * set_zonelist_order() will set this to NODE or ZONE.
1925  */
1926 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
1927 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
1928
1929
1930 #ifdef CONFIG_NUMA
1931 /* The value user specified ....changed by config */
1932 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1933 /* string for sysctl */
1934 #define NUMA_ZONELIST_ORDER_LEN 16
1935 char numa_zonelist_order[16] = "default";
1936
/*
 * Interface for configuring zonelist ordering.
 * Command line option "numa_zonelist_order":
 *      = "[dD]efault"  - default, automatic configuration.
 *      = "[nN]ode"     - order by node locality, then by zone within node
 *      = "[zZ]one"     - order by zone, then by locality within zone
 */
1944
1945 static int __parse_numa_zonelist_order(char *s)
1946 {
1947         if (*s == 'd' || *s == 'D') {
1948                 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1949         } else if (*s == 'n' || *s == 'N') {
1950                 user_zonelist_order = ZONELIST_ORDER_NODE;
1951         } else if (*s == 'z' || *s == 'Z') {
1952                 user_zonelist_order = ZONELIST_ORDER_ZONE;
1953         } else {
1954                 printk(KERN_WARNING
1955                         "Ignoring invalid numa_zonelist_order value:  "
1956                         "%s\n", s);
1957                 return -EINVAL;
1958         }
1959         return 0;
1960 }
1961
1962 static __init int setup_numa_zonelist_order(char *s)
1963 {
1964         if (s)
1965                 return __parse_numa_zonelist_order(s);
1966         return 0;
1967 }
1968 early_param("numa_zonelist_order", setup_numa_zonelist_order);
1969
1970 /*
1971  * sysctl handler for numa_zonelist_order
1972  */
1973 int numa_zonelist_order_handler(ctl_table *table, int write,
1974                 struct file *file, void __user *buffer, size_t *length,
1975                 loff_t *ppos)
1976 {
1977         char saved_string[NUMA_ZONELIST_ORDER_LEN];
1978         int ret;
1979
1980         if (write)
1981                 strncpy(saved_string, (char*)table->data,
1982                         NUMA_ZONELIST_ORDER_LEN);
1983         ret = proc_dostring(table, write, file, buffer, length, ppos);
1984         if (ret)
1985                 return ret;
1986         if (write) {
1987                 int oldval = user_zonelist_order;
1988                 if (__parse_numa_zonelist_order((char*)table->data)) {
1989                         /*
1990                          * bogus value.  restore saved string
1991                          */
1992                         strncpy((char*)table->data, saved_string,
1993                                 NUMA_ZONELIST_ORDER_LEN);
1994                         user_zonelist_order = oldval;
1995                 } else if (oldval != user_zonelist_order)
1996                         build_all_zonelists();
1997         }
1998         return 0;
1999 }
2000
2001
2002 #define MAX_NODE_LOAD (num_online_nodes())
2003 static int node_load[MAX_NUMNODES];
2004
2005 /**
2006  * find_next_best_node - find the next node that should appear in a given node's fallback list
2007  * @node: node whose fallback list we're appending
2008  * @used_node_mask: nodemask_t of already used nodes
2009  *
2010  * We use a number of factors to determine which is the next node that should
2011  * appear on a given node's fallback list.  The node should not have appeared
2012  * already in @node's fallback list, and it should be the next closest node
2013  * according to the distance array (which contains arbitrary distance values
2014  * from each node to each node in the system), and should also prefer nodes
2015  * with no CPUs, since presumably they'll have very little allocation pressure
2016  * on them otherwise.
2017  * It returns -1 if no node is found.
2018  */
2019 static int find_next_best_node(int node, nodemask_t *used_node_mask)
2020 {
2021         int n, val;
2022         int min_val = INT_MAX;
2023         int best_node = -1;
2024
2025         /* Use the local node if we haven't already */
2026         if (!node_isset(node, *used_node_mask)) {
2027                 node_set(node, *used_node_mask);
2028                 return node;
2029         }
2030
2031         for_each_node_state(n, N_HIGH_MEMORY) {
2032                 cpumask_t tmp;
2033
2034                 /* Don't want a node to appear more than once */
2035                 if (node_isset(n, *used_node_mask))
2036                         continue;
2037
2038                 /* Use the distance array to find the distance */
2039                 val = node_distance(node, n);
2040
2041                 /* Penalize nodes under us ("prefer the next node") */
2042                 val += (n < node);
2043
2044                 /* Give preference to headless and unused nodes */
2045                 tmp = node_to_cpumask(n);
2046                 if (!cpus_empty(tmp))
2047                         val += PENALTY_FOR_NODE_WITH_CPUS;
2048
2049                 /* Slight preference for less loaded node */
2050                 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
2051                 val += node_load[n];
2052
2053                 if (val < min_val) {
2054                         min_val = val;
2055                         best_node = n;
2056                 }
2057         }
2058
2059         if (best_node >= 0)
2060                 node_set(best_node, *used_node_mask);
2061
2062         return best_node;
2063 }
2064
2065
2066 /*
2067  * Build zonelists ordered by node and zones within node.
2068  * This results in maximum locality--normal zone overflows into local
2069  * DMA zone, if any--but risks exhausting DMA zone.
2070  */
2071 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2072 {
2073         enum zone_type i;
2074         int j;
2075         struct zonelist *zonelist;
2076
2077         for (i = 0; i < MAX_NR_ZONES; i++) {
2078                 zonelist = pgdat->node_zonelists + i;
2079                 for (j = 0; zonelist->zones[j] != NULL; j++)
2080                         ;
2081                 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
2082                 zonelist->zones[j] = NULL;
2083         }
2084 }
2085
2086 /*
2087  * Build gfp_thisnode zonelists
2088  */
2089 static void build_thisnode_zonelists(pg_data_t *pgdat)
2090 {
2091         enum zone_type i;
2092         int j;
2093         struct zonelist *zonelist;
2094
2095         for (i = 0; i < MAX_NR_ZONES; i++) {
2096                 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
2097                 j = build_zonelists_node(pgdat, zonelist, 0, i);
2098                 zonelist->zones[j] = NULL;
2099         }
2100 }
2101
2102 /*
2103  * Build zonelists ordered by zone and nodes within zones.
2104  * This results in conserving DMA zone[s] until all Normal memory is
2105  * exhausted, but results in overflowing to remote node while memory
2106  * may still exist in local DMA zone.
2107  */
2108 static int node_order[MAX_NUMNODES];
2109
2110 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
2111 {
2112         enum zone_type i;
2113         int pos, j, node;
2114         int zone_type;          /* needs to be signed */
2115         struct zone *z;
2116         struct zonelist *zonelist;
2117
2118         for (i = 0; i < MAX_NR_ZONES; i++) {
2119                 zonelist = pgdat->node_zonelists + i;
2120                 pos = 0;
2121                 for (zone_type = i; zone_type >= 0; zone_type--) {
2122                         for (j = 0; j < nr_nodes; j++) {
2123                                 node = node_order[j];
2124                                 z = &NODE_DATA(node)->node_zones[zone_type];
2125                                 if (populated_zone(z)) {
2126                                         zonelist->zones[pos++] = z;
2127                                         check_highest_zone(zone_type);
2128                                 }
2129                         }
2130                 }
2131                 zonelist->zones[pos] = NULL;
2132         }
2133 }
2134
2135 static int default_zonelist_order(void)
2136 {
2137         int nid, zone_type;
2138         unsigned long low_kmem_size,total_size;
2139         struct zone *z;
2140         int average_size;
2141         /*
2142          * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
2143          * If they are really small and used heavily, the system can fall
2144          * into OOM very easily.
2145          * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
2146          */
2147         /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
2148         low_kmem_size = 0;
2149         total_size = 0;
2150         for_each_online_node(nid) {
2151                 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2152                         z = &NODE_DATA(nid)->node_zones[zone_type];
2153                         if (populated_zone(z)) {
2154                                 if (zone_type < ZONE_NORMAL)
2155                                         low_kmem_size += z->present_pages;
2156                                 total_size += z->present_pages;
2157                         }
2158                 }
2159         }
2160         if (!low_kmem_size ||  /* there are no DMA area. */
2161             low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
2162                 return ZONELIST_ORDER_NODE;
2163         /*
2164          * look into each node's config.
2165          * If there is a node whose DMA/DMA32 memory is very big area on
2166          * local memory, NODE_ORDER may be suitable.
2167          */
2168         average_size = total_size /
2169                                 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
2170         for_each_online_node(nid) {
2171                 low_kmem_size = 0;
2172                 total_size = 0;
2173                 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2174                         z = &NODE_DATA(nid)->node_zones[zone_type];
2175                         if (populated_zone(z)) {
2176                                 if (zone_type < ZONE_NORMAL)
2177                                         low_kmem_size += z->present_pages;
2178                                 total_size += z->present_pages;
2179                         }
2180                 }
2181                 if (low_kmem_size &&
2182                     total_size > average_size && /* ignore small node */
2183                     low_kmem_size > total_size * 70/100)
2184                         return ZONELIST_ORDER_NODE;
2185         }
2186         return ZONELIST_ORDER_ZONE;
2187 }
2188
2189 static void set_zonelist_order(void)
2190 {
2191         if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
2192                 current_zonelist_order = default_zonelist_order();
2193         else
2194                 current_zonelist_order = user_zonelist_order;
2195 }
2196
2197 static void build_zonelists(pg_data_t *pgdat)
2198 {
2199         int j, node, load;
2200         enum zone_type i;
2201         nodemask_t used_mask;
2202         int local_node, prev_node;
2203         struct zonelist *zonelist;
2204         int order = current_zonelist_order;
2205
2206         /* initialize zonelists */
2207         for (i = 0; i < MAX_ZONELISTS; i++) {
2208                 zonelist = pgdat->node_zonelists + i;
2209                 zonelist->zones[0] = NULL;
2210         }
2211
2212         /* NUMA-aware ordering of nodes */
2213         local_node = pgdat->node_id;
2214         load = num_online_nodes();
2215         prev_node = local_node;
2216         nodes_clear(used_mask);
2217
2218         memset(node_load, 0, sizeof(node_load));
2219         memset(node_order, 0, sizeof(node_order));
2220         j = 0;
2221
2222         while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
2223                 int distance = node_distance(local_node, node);
2224
2225                 /*
2226                  * If another node is sufficiently far away then it is better
2227                  * to reclaim pages in a zone before going off node.
2228                  */
2229                 if (distance > RECLAIM_DISTANCE)
2230                         zone_reclaim_mode = 1;
2231
2232                 /*
2233                  * We don't want to pressure a particular node.
2234                  * So adding penalty to the first node in same
2235                  * distance group to make it round-robin.
2236                  */
2237                 if (distance != node_distance(local_node, prev_node))
2238                         node_load[node] = load;
2239
2240                 prev_node = node;
2241                 load--;
2242                 if (order == ZONELIST_ORDER_NODE)
2243                         build_zonelists_in_node_order(pgdat, node);
2244                 else
2245                         node_order[j++] = node; /* remember order */
2246         }
2247
2248         if (order == ZONELIST_ORDER_ZONE) {
2249                 /* calculate node order -- i.e., DMA last! */
2250                 build_zonelists_in_zone_order(pgdat, j);
2251         }
2252
2253         build_thisnode_zonelists(pgdat);
2254 }
2255
2256 /* Construct the zonelist performance cache - see further mmzone.h */
2257 static void build_zonelist_cache(pg_data_t *pgdat)
2258 {
2259         int i;
2260
2261         for (i = 0; i < MAX_NR_ZONES; i++) {
2262                 struct zonelist *zonelist;
2263                 struct zonelist_cache *zlc;
2264                 struct zone **z;
2265
2266                 zonelist = pgdat->node_zonelists + i;
2267                 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
2268                 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2269                 for (z = zonelist->zones; *z; z++)
2270                         zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
2271         }
2272 }
2273
2274
2275 #else   /* CONFIG_NUMA */
2276
2277 static void set_zonelist_order(void)
2278 {
2279         current_zonelist_order = ZONELIST_ORDER_ZONE;
2280 }
2281
2282 static void build_zonelists(pg_data_t *pgdat)
2283 {
2284         int node, local_node;
2285         enum zone_type i,j;
2286
2287         local_node = pgdat->node_id;
2288         for (i = 0; i < MAX_NR_ZONES; i++) {
2289                 struct zonelist *zonelist;
2290
2291                 zonelist = pgdat->node_zonelists + i;
2292
2293                 j = build_zonelists_node(pgdat, zonelist, 0, i);
2294                 /*
2295                  * Now we build the zonelist so that it contains the zones
2296                  * of all the other nodes.
2297                  * We don't want to pressure a particular node, so when
2298                  * building the zones for node N, we make sure that the
2299                  * zones coming right after the local ones are those from
2300                  * node N+1 (modulo N)
2301                  */
2302                 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
2303                         if (!node_online(node))
2304                                 continue;
2305                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
2306                 }
2307                 for (node = 0; node < local_node; node++) {
2308                         if (!node_online(node))
2309                                 continue;
2310                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
2311                 }
2312
2313                 zonelist->zones[j] = NULL;
2314         }
2315 }
2316
2317 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
2318 static void build_zonelist_cache(pg_data_t *pgdat)
2319 {
2320         int i;
2321
2322         for (i = 0; i < MAX_NR_ZONES; i++)
2323                 pgdat->node_zonelists[i].zlcache_ptr = NULL;
2324 }
2325
2326 #endif  /* CONFIG_NUMA */
2327
2328 /* return values int ....just for stop_machine_run() */
2329 static int __build_all_zonelists(void *dummy)
2330 {
2331         int nid;
2332
2333         for_each_online_node(nid) {
2334                 pg_data_t *pgdat = NODE_DATA(nid);
2335
2336                 build_zonelists(pgdat);
2337                 build_zonelist_cache(pgdat);
2338         }
2339         return 0;
2340 }
2341
2342 void build_all_zonelists(void)
2343 {
2344         set_zonelist_order();
2345
2346         if (system_state == SYSTEM_BOOTING) {
2347                 __build_all_zonelists(NULL);
2348                 cpuset_init_current_mems_allowed();
2349         } else {
2350                 /* we have to stop all cpus to guaranntee there is no user
2351                    of zonelist */
2352                 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
2353                 /* cpuset refresh routine should be here */
2354         }
2355         vm_total_pages = nr_free_pagecache_pages();
2356         /*
2357          * Disable grouping by mobility if the number of pages in the
2358          * system is too low to allow the mechanism to work. It would be
2359          * more accurate, but expensive to check per-zone. This check is
2360          * made on memory-hotadd so a system can start with mobility
2361          * disabled and enable it later
2362          */
2363         if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
2364                 page_group_by_mobility_disabled = 1;
2365         else
2366                 page_group_by_mobility_disabled = 0;
2367
2368         printk("Built %i zonelists in %s order, mobility grouping %s.  "
2369                 "Total pages: %ld\n",
2370                         num_online_nodes(),
2371                         zonelist_order_name[current_zonelist_order],
2372                         page_group_by_mobility_disabled ? "off" : "on",
2373                         vm_total_pages);
2374 #ifdef CONFIG_NUMA
2375         printk("Policy zone: %s\n", zone_names[policy_zone]);
2376 #endif
2377 }
2378
2379 /*
2380  * Helper functions to size the waitqueue hash table.
2381  * Essentially these want to choose hash table sizes sufficiently
2382  * large so that collisions trying to wait on pages are rare.
2383  * But in fact, the number of active page waitqueues on typical
2384  * systems is ridiculously low, less than 200. So this is even
2385  * conservative, even though it seems large.
2386  *
2387  * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
2388  * waitqueues, i.e. the size of the waitq table given the number of pages.
2389  */
2390 #define PAGES_PER_WAITQUEUE     256
2391
2392 #ifndef CONFIG_MEMORY_HOTPLUG
2393 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
2394 {
2395         unsigned long size = 1;
2396
2397         pages /= PAGES_PER_WAITQUEUE;
2398
2399         while (size < pages)
2400                 size <<= 1;
2401
2402         /*
2403          * Once we have dozens or even hundreds of threads sleeping
2404          * on IO we've got bigger problems than wait queue collision.
2405          * Limit the size of the wait table to a reasonable size.
2406          */
2407         size = min(size, 4096UL);
2408
2409         return max(size, 4UL);
2410 }
2411 #else
2412 /*
2413  * A zone's size might be changed by hot-add, so it is not possible to determine
2414  * a suitable size for its wait_table.  So we use the maximum size now.
2415  *
2416  * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
2417  *
2418  *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
2419  *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
2420  *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
2421  *
2422  * The maximum entries are prepared when a zone's memory is (512K + 256) pages
2423  * or more by the traditional way. (See above).  It equals:
2424  *
2425  *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
2426  *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
2427  *    powerpc (64K page size)             : =  (32G +16M)byte.
2428  */
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
	/* @pages is ignored in this variant: the maximum is always used */
	const unsigned long max_entries = 4096UL;

	return max_entries;
}
2433 #endif
2434
2435 /*
2436  * This is an integer logarithm so that shifts can be used later
2437  * to extract the more random high bits from the multiplicative
2438  * hash function before the remainder is taken.
2439  */
static inline unsigned long wait_table_bits(unsigned long size)
{
	/* The first zero bit of ~size sits where size has its first set bit */
	unsigned long inverted = ~size;

	return ffz(inverted);
}
2444
2445 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
2446
2447 /*
2448  * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2449  * of blocks reserved is based on zone->pages_min. The memory within the
2450  * reserve will tend to store contiguous free pages. Setting min_free_kbytes
2451  * higher will lead to a bigger reserve which will get freed as contiguous
2452  * blocks as reclaim kicks in
2453  */
2454 static void setup_zone_migrate_reserve(struct zone *zone)
2455 {
2456         unsigned long start_pfn, pfn, end_pfn;
2457         struct page *page;
2458         unsigned long reserve, block_migratetype;
2459
2460         /* Get the start pfn, end pfn and the number of blocks to reserve */
2461         start_pfn = zone->zone_start_pfn;
2462         end_pfn = start_pfn + zone->spanned_pages;
2463         reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
2464                                                         pageblock_order;
2465
2466         for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2467                 if (!pfn_valid(pfn))
2468                         continue;
2469                 page = pfn_to_page(pfn);
2470
2471                 /* Blocks with reserved pages will never free, skip them. */
2472                 if (PageReserved(page))
2473                         continue;
2474
2475                 block_migratetype = get_pageblock_migratetype(page);
2476
2477                 /* If this block is reserved, account for it */
2478                 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
2479                         reserve--;
2480                         continue;
2481                 }
2482
2483                 /* Suitable for reserving if this block is movable */
2484                 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
2485                         set_pageblock_migratetype(page, MIGRATE_RESERVE);
2486                         move_freepages_block(zone, page, MIGRATE_RESERVE);
2487                         reserve--;
2488                         continue;
2489                 }
2490
2491                 /*
2492                  * If the reserve is met and this is a previous reserved block,
2493                  * take it back
2494                  */
2495                 if (block_migratetype == MIGRATE_RESERVE) {
2496                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2497                         move_freepages_block(zone, page, MIGRATE_MOVABLE);
2498                 }
2499         }
2500 }
2501
2502 /*
2503  * Initially all pages are reserved - free ones are freed
2504  * up by free_all_bootmem() once the early boot process is
2505  * done. Non-atomic initialization, single-pass.
2506  */
2507 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2508                 unsigned long start_pfn, enum memmap_context context)
2509 {
2510         struct page *page;
2511         unsigned long end_pfn = start_pfn + size;
2512         unsigned long pfn;
2513
2514         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2515                 /*
2516                  * There can be holes in boot-time mem_map[]s
2517                  * handed to this function.  They do not
2518                  * exist on hotplugged memory.
2519                  */
2520                 if (context == MEMMAP_EARLY) {
2521                         if (!early_pfn_valid(pfn))
2522                                 continue;
2523                         if (!early_pfn_in_nid(pfn, nid))
2524                                 continue;
2525                 }
2526                 page = pfn_to_page(pfn);
2527                 set_page_links(page, zone, nid, pfn);
2528                 init_page_count(page);
2529                 reset_page_mapcount(page);
2530                 SetPageReserved(page);
2531
2532                 /*
2533                  * Mark the block movable so that blocks are reserved for
2534                  * movable at startup. This will force kernel allocations
2535                  * to reserve their blocks rather than leaking throughout
2536                  * the address space during boot when many long-lived
2537                  * kernel allocations are made. Later some blocks near
2538                  * the start are marked MIGRATE_RESERVE by
2539                  * setup_zone_migrate_reserve()
2540                  */
2541                 if ((pfn & (pageblock_nr_pages-1)))
2542                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2543
2544                 INIT_LIST_HEAD(&page->lru);
2545 #ifdef WANT_PAGE_VIRTUAL
2546                 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
2547                 if (!is_highmem_idx(zone))
2548                         set_page_address(page, __va(pfn << PAGE_SHIFT));
2549 #endif
2550         }
2551 }
2552
2553 static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
2554                                 struct zone *zone, unsigned long size)
2555 {
2556         int order, t;
2557         for_each_migratetype_order(order, t) {
2558                 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
2559                 zone->free_area[order].nr_free = 0;
2560         }
2561 }
2562
#ifndef __HAVE_ARCH_MEMMAP_INIT
/*
 * Default memmap_init(), used unless __HAVE_ARCH_MEMMAP_INIT is defined:
 * forwards to memmap_init_zone() with the MEMMAP_EARLY context.
 */
#define memmap_init(size, nid, zone, start_pfn)                         \
        memmap_init_zone((size), (nid), (zone), (start_pfn),            \
                         MEMMAP_EARLY)
#endif
2567
2568 static int __devinit zone_batchsize(struct zone *zone)
2569 {
2570         int batch;
2571
2572         /*
2573          * The per-cpu-pages pools are set to around 1000th of the
2574          * size of the zone.  But no more than 1/2 of a meg.
2575          *
2576          * OK, so we don't know how big the cache is.  So guess.
2577          */
2578         batch = zone->present_pages / 1024;
2579         if (batch * PAGE_SIZE > 512 * 1024)
2580                 batch = (512 * 1024) / PAGE_SIZE;
2581         batch /= 4;             /* We effectively *= 4 below */
2582         if (batch < 1)
2583                 batch = 1;
2584
2585         /*
2586          * Clamp the batch to a 2^n - 1 value. Having a power
2587          * of 2 value was found to be more likely to have
2588          * suboptimal cache aliasing properties in some cases.
2589          *
2590          * For example if 2 tasks are alternately allocating
2591          * batches of pages, one task can end up with a lot
2592          * of pages of one half of the possible page colors
2593          * and the other with pages of the other colors.
2594          */
2595         batch = (1 << (fls(batch + batch/2)-1)) - 1;
2596
2597         return batch;
2598 }
2599
2600 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2601 {
2602         struct per_cpu_pages *pcp;
2603
2604         memset(p, 0, sizeof(*p));
2605
2606         pcp = &p->pcp[0];               /* hot */
2607         pcp->count = 0;
2608         pcp->high = 6 * batch;
2609         pcp->batch = max(1UL, 1 * batch);
2610         INIT_LIST_HEAD(&pcp->list);
2611
2612         pcp = &p->pcp[1];               /* cold*/
2613         pcp->count = 0;
2614         pcp->high = 2 * batch;
2615         pcp->batch = max(1UL, batch/2);
2616         INIT_LIST_HEAD(&pcp->list);
2617 }
2618
2619 /*
2620  * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
2621  * to the value high for the pageset p.
2622  */
2623
2624 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2625                                 unsigned long high)
2626 {
2627         struct per_cpu_pages *pcp;
2628
2629         pcp = &p->pcp[0]; /* hot list */
2630         pcp->high = high;
2631         pcp->batch = max(1UL, high/4);
2632         if ((high/4) > (PAGE_SHIFT * 8))
2633                 pcp->batch = PAGE_SHIFT * 8;
2634 }
2635
2636
2637 #ifdef CONFIG_NUMA
/*
 * Boot pageset table. One entry per cpu (NR_CPUS in total), which is
 * going to be used for all zones and all nodes. The parameters will be
 * set in such a way that an item put on a list will immediately be
 * handed over to the buddy list. This is safe since pageset
 * manipulation is done with interrupts disabled.
 *
 * Some NUMA counter updates may also be caught by the boot pagesets.
 *
 * The boot_pagesets must be kept even after bootup is complete for
 * unused processors and/or zones. They do play a role for bootstrapping
 * hotplugged processors.
 *
 * zoneinfo_show() and maybe other functions do not check if the
 * processor is online before following the pageset pointer.
 * Other parts of the kernel may not check if the zone is available.
 */
static struct per_cpu_pageset boot_pageset[NR_CPUS];
2656
2657 /*
2658  * Dynamically allocate memory for the
2659  * per cpu pageset array in struct zone.
2660  */
2661 static int __cpuinit process_zones(int cpu)
2662 {
2663         struct zone *zone, *dzone;
2664         int node = cpu_to_node(cpu);
2665
2666         node_set_state(node, N_CPU);    /* this node has a cpu */
2667
2668         for_each_zone(zone) {
2669
2670                 if (!populated_zone(zone))
2671                         continue;
2672
2673                 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2674                                          GFP_KERNEL, node);
2675                 if (!zone_pcp(zone, cpu))
2676                         goto bad;
2677
2678                 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
2679
2680                 if (percpu_pagelist_fraction)
2681                         setup_pagelist_highmark(zone_pcp(zone, cpu),
2682                                 (zone->present_pages / percpu_pagelist_fraction));
2683         }
2684
2685         return 0;
2686 bad:
2687         for_each_zone(dzone) {
2688                 if (!populated_zone(dzone))
2689                         continue;
2690                 if (dzone == zone)
2691                         break;
2692                 kfree(zone_pcp(dzone, cpu));
2693                 zone_pcp(dzone, cpu) = NULL;
2694         }
2695         return -ENOMEM;
2696 }
2697
2698 static inline void free_zone_pagesets(int cpu)
2699 {
2700         struct zone *zone;
2701
2702         for_each_zone(zone) {
2703                 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2704
2705                 /* Free per_cpu_pageset if it is slab allocated */
2706                 if (pset != &boot_pageset[cpu])
2707                         kfree(pset);
2708                 zone_pcp(zone, cpu) = NULL;
2709         }
2710 }
2711
2712 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2713                 unsigned long action,
2714                 void *hcpu)
2715 {
2716         int cpu = (long)hcpu;
2717         int ret = NOTIFY_OK;
2718
2719         switch (action) {
2720         case CPU_UP_PREPARE:
2721         case CPU_UP_PREPARE_FROZEN:
2722                 if (process_zones(cpu))
2723                         ret = NOTIFY_BAD;
2724                 break;
2725         case CPU_UP_CANCELED:
2726         case CPU_UP_CANCELED_FROZEN:
2727         case CPU_DEAD:
2728         case CPU_DEAD_FROZEN:
2729                 free_zone_pagesets(cpu);
2730                 break;
2731         default:
2732                 break;
2733         }
2734         return ret;
2735 }
2736
2737 static struct notifier_block __cpuinitdata pageset_notifier =
2738         { &pageset_cpuup_callback, NULL, 0 };
2739
2740 void __init setup_per_cpu_pageset(void)
2741 {
2742         int err;
2743
2744         /* Initialize per_cpu_pageset for cpu 0.
2745          * A cpuup callback will do this for every cpu
2746          * as it comes online
2747          */
2748         err = process_zones(smp_processor_id());
2749         BUG_ON(err);
2750         register_cpu_notifier(&pageset_notifier);
2751 }
2752
2753 #endif
2754
2755 static noinline __init_refok
2756 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2757 {
2758         int i;
2759         struct pglist_data *pgdat = zone->zone_pgdat;
2760         size_t alloc_size;
2761
2762         /*
2763          * The per-page waitqueue mechanism uses hashed waitqueues
2764          * per zone.
2765          */
2766         zone->wait_table_hash_nr_entries =
2767                  wait_table_hash_nr_entries(zone_size_pages);
2768         zone->wait_table_bits =
2769                 wait_table_bits(zone->wait_table_hash_nr_entries);
2770         alloc_size = zone->wait_table_hash_nr_entries
2771                                         * sizeof(wait_queue_head_t);
2772
2773         if (system_state == SYSTEM_BOOTING) {
2774                 zone->wait_table = (wait_queue_head_t *)
2775                         alloc_bootmem_node(pgdat, alloc_size);
2776         } else {
2777                 /*
2778                  * This case means that a zone whose size was 0 gets new memory
2779                  * via memory hot-add.
2780                  * But it may be the case that a new node was hot-added.  In
2781                  * this case vmalloc() will not be able to use this new node's
2782                  * memory - this wait_table must be initialized to use this new
2783                  * node itself as well.
2784                  * To use this new node's memory, further consideration will be
2785                  * necessary.
2786                  */
2787                 zone->wait_table = vmalloc(alloc_size);
2788         }
2789         if (!zone->wait_table)
2790                 return -ENOMEM;
2791
2792         for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2793                 init_waitqueue_head(zone->wait_table + i);
2794
2795         return 0;
2796 }
2797
2798 static __meminit void zone_pcp_init(struct zone *zone)
2799 {
2800         int cpu;
2801         unsigned long batch = zone_batchsize(zone);
2802
2803         for (cpu = 0; cpu < NR_CPUS; cpu++) {
2804 #ifdef CONFIG_NUMA
2805                 /* Early boot. Slab allocator not functional yet */
2806                 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2807                 setup_pageset(&boot_pageset[cpu],0);
2808 #else
2809                 setup_pageset(zone_pcp(zone,cpu), batch);
2810 #endif
2811         }
2812         if (zone->present_pages)
2813                 printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
2814                         zone->name, zone->present_pages, batch);
2815 }
2816
2817 __meminit int init_currently_empty_zone(struct zone *zone,
2818                                         unsigned long zone_start_pfn,
2819                                         unsigned long size,
2820                                         enum memmap_context context)
2821 {
2822         struct pglist_data *pgdat = zone->zone_pgdat;
2823         int ret;
2824         ret = zone_wait_table_init(zone, size);
2825         if (ret)
2826                 return ret;
2827         pgdat->nr_zones = zone_idx(zone) + 1;
2828
2829         zone->zone_start_pfn = zone_start_pfn;
2830
2831         memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2832
2833         zone_init_free_lists(pgdat, zone, zone->spanned_pages);
2834
2835         return 0;
2836 }
2837
2838 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2839 /*
2840  * Basic iterator support. Return the first range of PFNs for a node
2841  * Note: nid == MAX_NUMNODES returns first region regardless of node
2842  */
2843 static int __meminit first_active_region_index_in_nid(int nid)
2844 {
2845         int i;
2846
2847         for (i = 0; i < nr_nodemap_entries; i++)
2848                 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2849                         return i;
2850
2851         return -1;
2852 }
2853
2854 /*
2855  * Basic iterator support. Return the next active range of PFNs for a node
2856  * Note: nid == MAX_NUMNODES returns next region regardles of node
2857  */
2858 static int __meminit next_active_region_index_in_nid(int index, int nid)
2859 {
2860         for (index = index + 1; index < nr_nodemap_entries; index++)
2861                 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2862                         return index;
2863
2864         return -1;
2865 }
2866
2867 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2868 /*
2869  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2870  * Architectures may implement their own version but if add_active_range()
2871  * was used and there are no special requirements, this is a convenient
2872  * alternative
2873  */
2874 int __meminit early_pfn_to_nid(unsigned long pfn)
2875 {
2876         int i;
2877
2878         for (i = 0; i < nr_nodemap_entries; i++) {
2879                 unsigned long start_pfn = early_node_map[i].start_pfn;
2880                 unsigned long end_pfn = early_node_map[i].end_pfn;
2881
2882                 if (start_pfn <= pfn && pfn < end_pfn)
2883                         return early_node_map[i].nid;
2884         }
2885
2886         return 0;
2887 }
2888 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2889
/*
 * Basic iterator support to walk early_node_map[]: i takes the index of
 * each entry matching nid in turn; the loop ends when i becomes -1.
 */
#define for_each_active_range_index_in_nid(i, nid)                      \
        for (i = first_active_region_index_in_nid(nid);                 \
             i != -1;                                                   \
             i = next_active_region_index_in_nid(i, nid))
2894
2895 /**
2896  * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2897  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2898  * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2899  *
2900  * If an architecture guarantees that all ranges registered with
2901  * add_active_ranges() contain no holes and may be freed, this
2902  * this function may be used instead of calling free_bootmem() manually.
2903  */
2904 void __init free_bootmem_with_active_regions(int nid,
2905                                                 unsigned long max_low_pfn)
2906 {
2907         int i;
2908
2909         for_each_active_range_index_in_nid(i, nid) {
2910                 unsigned long size_pages = 0;
2911                 unsigned long end_pfn = early_node_map[i].end_pfn;
2912
2913                 if (early_node_map[i].start_pfn >= max_low_pfn)
2914                         continue;
2915
2916                 if (end_pfn > max_low_pfn)
2917                         end_pfn = max_low_pfn;
2918
2919                 size_pages = end_pfn - early_node_map[i].start_pfn;
2920                 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2921                                 PFN_PHYS(early_node_map[i].start_pfn),
2922                                 size_pages << PAGE_SHIFT);
2923         }
2924 }
2925
2926 /**
2927  * sparse_memory_present_with_active_regions - Call memory_present for each active range
2928  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2929  *
2930  * If an architecture guarantees that all ranges registered with
2931  * add_active_ranges() contain no holes and may be freed, this
2932  * function may be used instead of calling memory_present() manually.
2933  */
2934 void __init sparse_memory_present_with_active_regions(int nid)
2935 {
2936         int i;
2937
2938         for_each_active_range_index_in_nid(i, nid)
2939                 memory_present(early_node_map[i].nid,
2940                                 early_node_map[i].start_pfn,
2941                                 early_node_map[i].end_pfn);
2942 }
2943
2944 /**
2945  * push_node_boundaries - Push node boundaries to at least the requested boundary
2946  * @nid: The nid of the node to push the boundary for
2947  * @start_pfn: The start pfn of the node
2948  * @end_pfn: The end pfn of the node
2949  *
2950  * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
2951  * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2952  * be hotplugged even though no physical memory exists. This function allows
2953  * an arch to push out the node boundaries so mem_map is allocated that can
2954  * be used later.
2955  */
2956 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2957 void __init push_node_boundaries(unsigned int nid,
2958                 unsigned long start_pfn, unsigned long end_pfn)
2959 {
2960         printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2961                         nid, start_pfn, end_pfn);
2962
2963         /* Initialise the boundary for this node if necessary */
2964         if (node_boundary_end_pfn[nid] == 0)
2965                 node_boundary_start_pfn[nid] = -1UL;
2966
2967         /* Update the boundaries */
2968         if (node_boundary_start_pfn[nid] > start_pfn)
2969                 node_boundary_start_pfn[nid] = start_pfn;
2970         if (node_boundary_end_pfn[nid] < end_pfn)
2971                 node_boundary_end_pfn[nid] = end_pfn;
2972 }
2973
2974 /* If necessary, push the node boundary out for reserve hotadd */
2975 static void __meminit account_node_boundary(unsigned int nid,
2976                 unsigned long *start_pfn, unsigned long *end_pfn)
2977 {
2978         printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2979                         nid, *start_pfn, *end_pfn);
2980
2981         /* Return if boundary information has not been provided */
2982         if (node_boundary_end_pfn[nid] == 0)
2983                 return;
2984
2985         /* Check the boundaries and update if necessary */
2986         if (node_boundary_start_pfn[nid] < *start_pfn)
2987                 *start_pfn = node_boundary_start_pfn[nid];
2988         if (node_boundary_end_pfn[nid] > *end_pfn)
2989                 *end_pfn = node_boundary_end_pfn[nid];
2990 }
2991 #else
2992 void __init push_node_boundaries(unsigned int nid,
2993                 unsigned long start_pfn, unsigned long end_pfn) {}
2994
2995 static void __meminit account_node_boundary(unsigned int nid,
2996                 unsigned long *start_pfn, unsigned long *end_pfn) {}
2997 #endif
2998
2999
3000 /**
3001  * get_pfn_range_for_nid - Return the start and end page frames for a node
3002  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3003  * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
3004  * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
3005  *
3006  * It returns the start and end page frame of a node based on information
3007  * provided by an arch calling add_active_range(). If called for a node
3008  * with no available memory, a warning is printed and the start and end
3009  * PFNs will be 0.
3010  */
3011 void __meminit get_pfn_range_for_nid(unsigned int nid,
3012                         unsigned long *start_pfn, unsigned long *end_pfn)
3013 {
3014         int i;
3015         *start_pfn = -1UL;
3016         *end_pfn = 0;
3017
3018         for_each_active_range_index_in_nid(i, nid) {
3019                 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
3020                 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
3021         }
3022
3023         if (*start_pfn == -1UL)
3024                 *start_pfn = 0;
3025
3026         /* Push the node boundaries out if requested */
3027         account_node_boundary(nid, start_pfn, end_pfn);
3028 }
3029
3030 /*
3031  * This finds a zone that can be used for ZONE_MOVABLE pages. The
3032  * assumption is made that zones within a node are ordered in monotonic
3033  * increasing memory addresses so that the "highest" populated zone is used
3034  */
3035 void __init find_usable_zone_for_movable(void)
3036 {
3037         int zone_index;
3038         for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
3039                 if (zone_index == ZONE_MOVABLE)
3040                         continue;
3041
3042                 if (arch_zone_highest_possible_pfn[zone_index] >
3043                                 arch_zone_lowest_possible_pfn[zone_index])
3044                         break;
3045         }
3046
3047         VM_BUG_ON(zone_index == -1);
3048         movable_zone = zone_index;
3049 }
3050
3051 /*
3052  * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3053  * because it is sized independant of architecture. Unlike the other zones,
3054  * the starting point for ZONE_MOVABLE is not fixed. It may be different
3055  * in each node depending on the size of each node and how evenly kernelcore
3056  * is distributed. This helper function adjusts the zone ranges
3057  * provided by the architecture for a given node by using the end of the
3058  * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3059  * zones within a node are in order of monotonic increases memory addresses
3060  */
3061 void __meminit adjust_zone_range_for_zone_movable(int nid,
3062                                         unsigned long zone_type,
3063                                         unsigned long node_start_pfn,
3064                                         unsigned long node_end_pfn,
3065                                         unsigned long *zone_start_pfn,
3066                                         unsigned long *zone_end_pfn)
3067 {
3068         /* Only adjust if ZONE_MOVABLE is on this node */
3069         if (zone_movable_pfn[nid]) {
3070                 /* Size ZONE_MOVABLE */
3071                 if (zone_type == ZONE_MOVABLE) {
3072                         *zone_start_pfn = zone_movable_pfn[nid];
3073                         *zone_end_pfn = min(node_end_pfn,
3074                                 arch_zone_highest_possible_pfn[movable_zone]);
3075
3076                 /* Adjust for ZONE_MOVABLE starting within this range */
3077                 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
3078                                 *zone_end_pfn > zone_movable_pfn[nid]) {
3079                         *zone_end_pfn = zone_movable_pfn[nid];
3080
3081                 /* Check if this whole range is within ZONE_MOVABLE */
3082                 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
3083                         *zone_start_pfn = *zone_end_pfn;
3084         }
3085 }
3086
3087 /*
3088  * Return the number of pages a zone spans in a node, including holes
3089  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
3090  */
3091 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3092                                         unsigned long zone_type,
3093                                         unsigned long *ignored)
3094 {
3095         unsigned long node_start_pfn, node_end_pfn;
3096         unsigned long zone_start_pfn, zone_end_pfn;
3097
3098         /* Get the start and end of the node and zone */
3099         get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
3100         zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
3101         zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
3102         adjust_zone_range_for_zone_movable(nid, zone_type,
3103                                 node_start_pfn, node_end_pfn,
3104                                 &zone_start_pfn, &zone_end_pfn);
3105
3106         /* Check that this node has pages within the zone's required range */
3107         if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
3108                 return 0;
3109
3110         /* Move the zone boundaries inside the node if necessary */
3111         zone_end_pfn = min(zone_end_pfn, node_end_pfn);
3112         zone_start_pfn = max(zone_start_pfn, node_start_pfn);
3113
3114         /* Return the spanned pages */
3115         return zone_end_pfn - zone_start_pfn;
3116 }
3117
3118 /*
3119  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3120  * then all holes in the requested range will be accounted for.
3121  */
3122 unsigned long __meminit __absent_pages_in_range(int nid,
3123                                 unsigned long range_start_pfn,
3124                                 unsigned long range_end_pfn)
3125 {
3126         int i = 0;
3127         unsigned long prev_end_pfn = 0, hole_pages = 0;
3128         unsigned long start_pfn;
3129
3130         /* Find the end_pfn of the first active range of pfns in the node */
3131         i = first_active_region_index_in_nid(nid);
3132         if (i == -1)
3133                 return 0;
3134
3135         prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
3136
3137         /* Account for ranges before physical memory on this node */
3138         if (early_node_map[i].start_pfn > range_start_pfn)
3139                 hole_pages = prev_end_pfn - range_start_pfn;
3140
3141         /* Find all holes for the zone within the node */
3142         for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
3143
3144                 /* No need to continue if prev_end_pfn is outside the zone */
3145                 if (prev_end_pfn >= range_end_pfn)
3146                         break;
3147
3148                 /* Make sure the end of the zone is not within the hole */
3149                 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
3150                 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
3151
3152                 /* Update the hole size cound and move on */
3153                 if (start_pfn > range_start_pfn) {
3154                         BUG_ON(prev_end_pfn > start_pfn);
3155                         hole_pages += start_pfn - prev_end_pfn;
3156                 }
3157                 prev_end_pfn = early_node_map[i].end_pfn;
3158         }
3159
3160         /* Account for ranges past physical memory on this node */
3161         if (range_end_pfn > prev_end_pfn)
3162                 hole_pages += range_end_pfn -
3163                                 max(range_start_pfn, prev_end_pfn);
3164
3165         return hole_pages;
3166 }
3167
3168 /**
3169  * absent_pages_in_range - Return number of page frames in holes within a range
3170  * @start_pfn: The start PFN to start searching for holes
3171  * @end_pfn: The end PFN to stop searching for holes
3172  *
3173  * It returns the number of pages frames in memory holes within a range.
3174  */
3175 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
3176                                                         unsigned long end_pfn)
3177 {
3178         return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
3179 }
3180
3181 /* Return the number of page frames in holes in a zone on a node */
3182 static unsigned long __meminit zone_absent_pages_in_node(int nid,
3183                                         unsigned long zone_type,
3184                                         unsigned long *ignored)
3185 {
3186         unsigned long node_start_pfn, node_end_pfn;
3187         unsigned long zone_start_pfn, zone_end_pfn;
3188
3189         get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
3190         zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
3191                                                         node_start_pfn);
3192         zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
3193                                                         node_end_pfn);
3194
3195         adjust_zone_range_for_zone_movable(nid, zone_type,
3196                         node_start_pfn, node_end_pfn,
3197                         &zone_start_pfn, &zone_end_pfn);
3198         return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
3199 }
3200
3201 #else
3202 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
3203                                         unsigned long zone_type,
3204                                         unsigned long *zones_size)
3205 {
3206         return zones_size[zone_type];
3207 }
3208
3209 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
3210                                                 unsigned long zone_type,
3211                                                 unsigned long *zholes_size)
3212 {
3213         if (!zholes_size)
3214                 return 0;
3215
3216         return zholes_size[zone_type];
3217 }
3218
3219 #endif
3220
3221 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
3222                 unsigned long *zones_size, unsigned long *zholes_size)
3223 {
3224         unsigned long realtotalpages, totalpages = 0;
3225         enum zone_type i;
3226
3227         for (i = 0; i < MAX_NR_ZONES; i++)
3228                 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
3229                                                                 zones_size);
3230         pgdat->node_spanned_pages = totalpages;
3231
3232         realtotalpages = totalpages;
3233         for (i = 0; i < MAX_NR_ZONES; i++)
3234                 realtotalpages -=
3235                         zone_absent_pages_in_node(pgdat->node_id, i,
3236                                                                 zholes_size);
3237         pgdat->node_present_pages = realtotalpages;
3238         printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
3239                                                         realtotalpages);
3240 }
3241
3242 #ifndef CONFIG_SPARSEMEM
3243 /*
3244  * Calculate the size of the zone->blockflags rounded to an unsigned long
3245  * Start by making sure zonesize is a multiple of pageblock_order by rounding
3246  * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
3247  * round what is now in bits to nearest long in bits, then return it in
3248  * bytes.
3249  */
3250 static unsigned long __init usemap_size(unsigned long zonesize)
3251 {
3252         unsigned long usemapsize;
3253
3254         usemapsize = roundup(zonesize, pageblock_nr_pages);
3255         usemapsize = usemapsize >> pageblock_order;
3256         usemapsize *= NR_PAGEBLOCK_BITS;
3257         usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
3258
3259         return usemapsize / 8;
3260 }
3261
3262 static void __init setup_usemap(struct pglist_data *pgdat,
3263                                 struct zone *zone, unsigned long zonesize)
3264 {
3265         unsigned long usemapsize = usemap_size(zonesize);
3266         zone->pageblock_flags = NULL;
3267         if (usemapsize) {
3268                 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3269                 memset(zone->pageblock_flags, 0, usemapsize);
3270         }
3271 }
3272 #else
/*
 * CONFIG_SPARSEMEM variant: no pageblock flags bitmap is allocated for the
 * zone here, so this is a no-op that ignores all of its arguments.
 */
static inline void setup_usemap(struct pglist_data *pgdat,
                                struct zone *zone, unsigned long zonesize) {}
3275 #endif /* CONFIG_SPARSEMEM */
3276
3277 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
3278 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
3279 static inline void __init set_pageblock_order(unsigned int order)
3280 {
3281         /* Check that pageblock_nr_pages has not already been setup */
3282         if (pageblock_order)
3283                 return;
3284
3285         /*
3286          * Assume the largest contiguous order of interest is a huge page.
3287          * This value may be variable depending on boot parameters on IA64
3288          */
3289         pageblock_order = order;
3290 }
3291 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3292
/*
 * Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER:
 * the macro discards its argument, so the expression passed in is never
 * evaluated.
 */
#define set_pageblock_order(x)  do {} while (0)
3295
3296 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3297
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * @pgdat:       node being initialised; node_id and node_start_pfn are read
 * @zones_size:  per-zone spanned sizes, handed to zone_spanned_pages_in_node()
 * @zholes_size: per-zone hole sizes, handed to zone_absent_pages_in_node()
 *
 * Every zone index gets its bookkeeping fields, locks and lists initialised.
 * Zones that span at least one page additionally get a pageblock usemap and
 * are passed to init_currently_empty_zone().  Zones are laid out back to
 * back starting at pgdat->node_start_pfn.
 */
static void __meminit free_area_init_core(struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long *zholes_size)
{
        enum zone_type j;
        int nid = pgdat->node_id;
        /* Start pfn of the zone currently being set up; advanced per zone */
        unsigned long zone_start_pfn = pgdat->node_start_pfn;
        int ret;

        /* Node-wide state that does not depend on any particular zone */
        pgdat_resize_init(pgdat);
        pgdat->nr_zones = 0;
        init_waitqueue_head(&pgdat->kswapd_wait);
        pgdat->kswapd_max_order = 0;

        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, memmap_pages;

                /* size includes holes, realsize starts out as size minus holes */
                size = zone_spanned_pages_in_node(nid, j, zones_size);
                realsize = size - zone_absent_pages_in_node(nid, j,
                                                                zholes_size);

                /*
                 * Adjust realsize so that it accounts for how much memory
                 * is used by this zone for memmap. This affects the watermark
                 * and per-cpu initialisations
                 */
                memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
                if (realsize >= memmap_pages) {
                        realsize -= memmap_pages;
                        printk(KERN_DEBUG
                                "  %s zone: %lu pages used for memmap\n",
                                zone_names[j], memmap_pages);
                } else
                        /* memmap would not fit: warn and leave realsize as is */
                        printk(KERN_WARNING
                                "  %s zone: %lu pages exceeds realsize %lu\n",
                                zone_names[j], memmap_pages, realsize);

                /*
                 * Account for reserved pages: dma_reserve is only taken out
                 * of the first zone, and only if the zone is larger than it
                 */
                if (j == 0 && realsize > dma_reserve) {
                        realsize -= dma_reserve;
                        printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
                                        zone_names[0], dma_reserve);
                }

                /* Highmem zones count towards nr_all_pages only */
                if (!is_highmem_idx(j))
                        nr_kernel_pages += realsize;
                nr_all_pages += realsize;

                zone->spanned_pages = size;
                zone->present_pages = realsize;
#ifdef CONFIG_NUMA
                zone->node = nid;
                /* Both thresholds are percentages of the adjusted realsize */
                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
                                                / 100;
                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
                zone->name = zone_names[j];
                spin_lock_init(&zone->lock);
                spin_lock_init(&zone->lru_lock);
                zone_seqlock_init(zone);
                zone->zone_pgdat = pgdat;

                zone->prev_priority = DEF_PRIORITY;

                zone_pcp_init(zone);
                INIT_LIST_HEAD(&zone->active_list);
                INIT_LIST_HEAD(&zone->inactive_list);
                zone->nr_scan_active = 0;
                zone->nr_scan_inactive = 0;
                zap_zone_vm_stats(zone);
                atomic_set(&zone->reclaim_in_progress, 0);
                /* An empty zone needs no usemap and no further zone setup */
                if (!size)
                        continue;

                set_pageblock_order(HUGETLB_PAGE_ORDER);
                setup_usemap(pgdat, zone, size);
                ret = init_currently_empty_zone(zone, zone_start_pfn,
                                                size, MEMMAP_EARLY);
                /* Any non-zero return from the zone setup is treated as fatal */
                BUG_ON(ret);
                /* The next zone begins where this one ends */
                zone_start_pfn += size;
        }
}
3386
3387 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3388 {
3389         /* Skip empty nodes */
3390         if (!pgdat->node_spanned_pages)
3391                 return;
3392
3393 #ifdef CONFIG_FLAT_NODE_MEM_MAP
3394         /* ia64 gets its own node_mem_map, before this, without bootmem */
3395         if (!pgdat->node_mem_map) {
3396                 unsigned long size, start, end;
3397                 struct page *map;
3398
3399                 /*
3400                  * The zone's endpoints aren't required to be MAX_ORDER
3401                  * aligned but the node_mem_map endpoints must be in order
3402                  * for the buddy allocator to function correctly.
3403                  */
3404                 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
3405                 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
3406                 end = ALIGN(end, MAX_ORDER_NR_PAGES);
3407                 size =  (end - start) * sizeof(struct page);
3408                 map = alloc_remap(pgdat->node_id, size);
3409                 if (!map)
3410                         map = alloc_bootmem_node(pgdat, size);
3411                 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
3412         }
3413 #ifndef CONFIG_NEED_MULTIPLE_NODES
3414         /*
3415          * With no DISCONTIG, the global mem_map is just set as node 0's
3416          */
3417         if (pgdat == NODE_DATA(0)) {
3418                 mem_map = NODE_DATA(0)->node_mem_map;
3419 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3420                 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
3421                         mem_map -= pgdat->node_start_pfn;
3422 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
3423         }
3424 #endif
3425 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
3426 }
3427
/*
 * Initialise one node: record its id and first pfn, size it, allocate its
 * memory map and then set up each of its zones.
 *
 * @nid:            node id stored in pgdat->node_id
 * @pgdat:          node descriptor to fill in
 * @zones_size:     per-zone spanned sizes, passed through to the sizing helpers
 * @node_start_pfn: first pfn of the node
 * @zholes_size:    per-zone hole sizes, passed through to the sizing helpers
 */
void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long node_start_pfn,
                unsigned long *zholes_size)
{
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
        /* Sets node_spanned_pages, which alloc_node_mem_map() reads next */
        calculate_node_totalpages(pgdat, zones_size, zholes_size);

        alloc_node_mem_map(pgdat);

        free_area_init_core(pgdat, zones_size, zholes_size);
}
3440
3441 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3442
3443 #if MAX_NUMNODES > 1
3444 /*
3445  * Figure out the number of possible node ids.
3446  */
3447 static void __init setup_nr_node_ids(void)
3448 {
3449         unsigned int node;
3450         unsigned int highest = 0;
3451
3452         for_each_node_mask(node, node_possible_map)
3453                 highest = node;
3454         nr_node_ids = highest + 1;
3455 }
3456 #else
/* Variant used when MAX_NUMNODES is not greater than 1: nothing to compute */
static inline void setup_nr_node_ids(void)
{
}
3460 #endif
3461
3462 /**
3463  * add_active_range - Register a range of PFNs backed by physical memory
3464  * @nid: The node ID the range resides on
3465  * @start_pfn: The start PFN of the available physical memory
3466  * @end_pfn: The end PFN of the available physical memory
3467  *
3468  * These ranges are stored in an early_node_map[] and later used by
3469  * free_area_init_nodes() to calculate zone sizes and holes. If the
3470  * range spans a memory hole, it is up to the architecture to ensure
3471  * the memory is not freed by the bootmem allocator. If possible
3472  * the range being registered will be merged with existing ranges.
3473  */
3474 void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3475                                                 unsigned long end_pfn)
3476 {
3477         int i;
3478
3479         printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
3480                           "%d entries of %d used\n",
3481                           nid, start_pfn, end_pfn,
3482                           nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3483
3484         /* Merge with existing active regions if possible */
3485         for (i = 0; i < nr_nodemap_entries; i++) {
3486                 if (early_node_map[i].nid != nid)
3487                         continue;
3488
3489                 /* Skip if an existing region covers this new one */
3490                 if (start_pfn >= early_node_map[i].start_pfn &&
3491                                 end_pfn <= early_node_map[i].end_pfn)
3492                         return;
3493
3494                 /* Merge forward if suitable */
3495                 if (start_pfn <= early_node_map[i].end_pfn &&
3496                                 end_pfn > early_node_map[i].end_pfn) {
3497                         early_node_map[i].end_pfn = end_pfn;
3498                         return;
3499                 }
3500
3501                 /* Merge backward if suitable */
3502                 if (start_pfn < early_node_map[i].end_pfn &&
3503                                 end_pfn >= early_node_map[i].start_pfn) {
3504                         early_node_map[i].start_pfn = start_pfn;
3505                         return;
3506                 }
3507         }
3508
3509         /* Check that early_node_map is large enough */
3510         if (i >= MAX_ACTIVE_REGIONS) {
3511                 printk(KERN_CRIT "More than %d memory regions, truncating\n",
3512                                                         MAX_ACTIVE_REGIONS);
3513                 return;
3514         }
3515
3516         early_node_map[i].nid = nid;
3517         early_node_map[i].start_pfn = start_pfn;
3518         early_node_map[i].end_pfn = end_pfn;
3519         nr_nodemap_entries = i + 1;
3520 }
3521
3522 /**
3523  * shrink_active_range - Shrink an existing registered range of PFNs
3524  * @nid: The node id the range is on that should be shrunk
3525  * @old_end_pfn: The old end PFN of the range
3526  * @new_end_pfn: The new PFN of the range
3527  *
3528  * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
3529  * The map is kept at the end physical page range that has already been
3530  * registered with add_active_range(). This function allows an arch to shrink
3531  * an existing registered range.
3532  */
3533 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
3534                                                 unsigned long new_end_pfn)
3535 {
3536         int i;
3537
3538         /* Find the old active region end and shrink */
3539         for_each_active_range_index_in_nid(i, nid)
3540                 if (early_node_map[i].end_pfn == old_end_pfn) {
3541                         early_node_map[i].end_pfn = new_end_pfn;
3542                         break;
3543                 }
3544 }
3545
3546 /**
3547  * remove_all_active_ranges - Remove all currently registered regions
3548  *
3549  * During discovery, it may be found that a table like SRAT is invalid
3550  * and an alternative discovery method must be used. This function removes
3551  * all currently registered regions.
3552  */
3553 void __init remove_all_active_ranges(void)
3554 {
3555         memset(early_node_map, 0, sizeof(early_node_map));
3556         nr_nodemap_entries = 0;
3557 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3558         memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
3559         memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
3560 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
3561 }
3562
3563 /* Compare two active node_active_regions */
3564 static int __init cmp_node_active_region(const void *a, const void *b)
3565 {
3566         struct node_active_region *arange = (struct node_active_region *)a;
3567         struct node_active_region *brange = (struct node_active_region *)b;
3568
3569         /* Done this way to avoid overflows */
3570         if (arange->start_pfn > brange->start_pfn)
3571                 return 1;
3572         if (arange->start_pfn < brange->start_pfn)
3573                 return -1;
3574
3575         return 0;
3576 }
3577
/*
 * Sort the nr_nodemap_entries used entries of early_node_map[] by
 * start_pfn, using cmp_node_active_region() as the comparison callback.
 */
static void __init sort_node_map(void)
{
        sort(early_node_map, (size_t)nr_nodemap_entries,
                        sizeof(struct node_active_region),
                        cmp_node_active_region, NULL);
}
3585
3586 /* Find the lowest pfn for a node */
3587 unsigned long __init find_min_pfn_for_node(unsigned long nid)
3588 {
3589         int i;
3590         unsigned long min_pfn = ULONG_MAX;
3591
3592         /* Assuming a sorted map, the first range found has the starting pfn */
3593         for_each_active_range_index_in_nid(i, nid)
3594                 min_pfn = min(min_pfn, early_node_map[i].start_pfn);
3595
3596         if (min_pfn == ULONG_MAX) {
3597                 printk(KERN_WARNING
3598                         "Could not find start_pfn for node %lu\n", nid);
3599                 return 0;
3600         }
3601
3602         return min_pfn;
3603 }
3604
/**
 * find_min_pfn_with_active_regions - Find the minimum PFN registered
 *
 * It returns the minimum PFN based on information provided via
 * add_active_range().
 *
 * Implemented by asking find_min_pfn_for_node() about node MAX_NUMNODES;
 * presumably the range iterator treats that id as "any node" - confirm
 * against for_each_active_range_index_in_nid(). As with that helper, 0 is
 * returned when nothing is found.
 */
unsigned long __init find_min_pfn_with_active_regions(void)
{
        return find_min_pfn_for_node(MAX_NUMNODES);
}
3615
3616 /**
3617  * find_max_pfn_with_active_regions - Find the maximum PFN registered
3618  *
3619  * It returns the maximum PFN based on information provided via
3620  * add_active_range().
3621  */
3622 unsigned long __init find_max_pfn_with_active_regions(void)
3623 {
3624         int i;
3625         unsigned long max_pfn = 0;
3626
3627         for (i = 0; i < nr_nodemap_entries; i++)
3628                 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3629
3630         return max_pfn;
3631 }
3632
3633 /*
3634  * early_calculate_totalpages()
3635  * Sum pages in active regions for movable zone.
3636  * Populate N_HIGH_MEMORY for calculating usable_nodes.
3637  */
3638 static unsigned long __init early_calculate_totalpages(void)
3639 {
3640         int i;
3641         unsigned long totalpages = 0;
3642
3643         for (i = 0; i < nr_nodemap_entries; i++) {
3644                 unsigned long pages = early_node_map[i].end_pfn -
3645                                                 early_node_map[i].start_pfn;
3646                 totalpages += pages;
3647                 if (pages)
3648                         node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
3649         }
3650         return totalpages;
3651 }
3652
3653 /*
3654  * Find the PFN the Movable zone begins in each node. Kernel memory
3655  * is spread evenly between nodes as long as the nodes have enough
3656  * memory. When they don't, some nodes will have more kernelcore than
3657  * others
3658  */
3659 void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3660 {
3661         int i, nid;
3662         unsigned long usable_startpfn;
3663         unsigned long kernelcore_node, kernelcore_remaining;
3664         unsigned long totalpages = early_calculate_totalpages();
3665         int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3666
3667         /*
3668          * If movablecore was specified, calculate what size of
3669          * kernelcore that corresponds so that memory usable for
3670          * any allocation type is evenly spread. If both kernelcore
3671          * and movablecore are specified, then the value of kernelcore
3672          * will be used for required_kernelcore if it's greater than
3673          * what movablecore would have allowed.
3674          */
3675         if (required_movablecore) {
3676                 unsigned long corepages;
3677
3678                 /*
3679                  * Round-up so that ZONE_MOVABLE is at least as large as what
3680                  * was requested by the user
3681                  */
3682                 required_movablecore =
3683                         roundup(required_movablecore, MAX_ORDER_NR_PAGES);
3684                 corepages = totalpages - required_movablecore;
3685
3686                 required_kernelcore = max(required_kernelcore, corepages);
3687         }
3688
3689         /* If kernelcore was not specified, there is no ZONE_MOVABLE */
3690         if (!required_kernelcore)
3691                 return;
3692
3693         /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
3694         find_usable_zone_for_movable();
3695         usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
3696
3697 restart:
3698         /* Spread kernelcore memory as evenly as possible throughout nodes */
3699         kernelcore_node = required_kernelcore / usable_nodes;
3700         for_each_node_state(nid, N_HIGH_MEMORY) {
3701                 /*
3702                  * Recalculate kernelcore_node if the division per node
3703                  * now exceeds what is necessary to satisfy the requested
3704                  * amount of memory for the kernel
3705                  */
3706                 if (required_kernelcore < kernelcore_node)
3707                         kernelcore_node = required_kernelcore / usable_nodes;
3708
3709                 /*
3710                  * As the map is walked, we track how much memory is usable
3711                  * by the kernel using kernelcore_remaining. When it is
3712                  * 0, the rest of the node is usable by ZONE_MOVABLE
3713                  */
3714                 kernelcore_remaining = kernelcore_node;
3715
3716                 /* Go through each range of PFNs within this node */
3717                 for_each_active_range_index_in_nid(i, nid) {
3718                         unsigned long start_pfn, end_pfn;
3719                         unsigned long size_pages;
3720
3721                         start_pfn = max(early_node_map[i].start_pfn,
3722                                                 zone_movable_pfn[nid]);
3723                         end_pfn = early_node_map[i].end_pfn;
3724                         if (start_pfn >= end_pfn)
3725                                 continue;
3726
3727                         /* Account for what is only usable for kernelcore */
3728                         if (start_pfn < usable_startpfn) {
3729                                 unsigned long kernel_pages;
3730                                 kernel_pages = min(end_pfn, usable_startpfn)
3731                                                                 - start_pfn;
3732
3733                                 kernelcore_remaining -= min(kernel_pages,
3734                                                         kernelcore_remaining);
3735                                 required_kernelcore -= min(kernel_pages,
3736                                                         required_kernelcore);
3737
3738                                 /* Continue if range is now fully accounted */
3739                                 if (end_pfn <= usable_startpfn) {
3740
3741                                         /*
3742                                          * Push zone_movable_pfn to the end so
3743                                          * that if we have to rebalance
3744                                          * kernelcore across nodes, we will
3745                                          * not double account here
3746                                          */
3747                                         zone_movable_pfn[nid] = end_pfn;
3748                                         continue;
3749                                 }
3750                                 start_pfn = usable_startpfn;
3751                         }
3752
3753                         /*
3754                          * The usable PFN range for ZONE_MOVABLE is from
3755                          * start_pfn->end_pfn. Calculate size_pages as the
3756                          * number of pages used as kernelcore
3757                          */
3758                         size_pages = end_pfn - start_pfn;
3759                         if (size_pages > kernelcore_remaining)
3760                                 size_pages = kernelcore_remaining;
3761                         zone_movable_pfn[nid] = start_pfn + size_pages;
3762
3763                         /*
3764                          * Some kernelcore has been met, update counts and
3765                          * break if the kernelcore for this node has been
3766                          * satisified
3767                          */
3768                         required_kernelcore -= min(required_kernelcore,
3769                                                                 size_pages);
3770                         kernelcore_remaining -= size_pages;
3771                         if (!kernelcore_remaining)
3772                                 break;
3773                 }
3774         }
3775
3776         /*
3777          * If there is still required_kernelcore, we do another pass with one
3778          * less node in the count. This will push zone_movable_pfn[nid] further
3779          * along on the nodes that still have memory until kernelcore is
3780          * satisified
3781          */
3782         usable_nodes--;
3783         if (usable_nodes && required_kernelcore > usable_nodes)
3784                 goto restart;
3785
3786         /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
3787         for (nid = 0; nid < MAX_NUMNODES; nid++)
3788                 zone_movable_pfn[nid] =
3789                         roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
3790 }
3791
3792 /* Any regular memory on that node ? */
3793 static void check_for_regular_memory(pg_data_t *pgdat)
3794 {
3795 #ifdef CONFIG_HIGHMEM
3796         enum zone_type zone_type;
3797
3798         for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
3799                 struct zone *zone = &pgdat->node_zones[zone_type];
3800                 if (zone->present_pages)
3801                         node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
3802         }
3803 #endif
3804 }
3805
3806 /**
3807  * free_area_init_nodes - Initialise all pg_data_t and zone data
3808  * @max_zone_pfn: an array of max PFNs for each zone
3809  *
3810  * This will call free_area_init_node() for each active node in the system.
3811  * Using the page ranges provided by add_active_range(), the size of each
3812  * zone in each node and their holes is calculated. If the maximum PFN
3813  * between two adjacent zones match, it is assumed that the zone is empty.
3814  * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
3815  * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
3816  * starts where the previous one ended. For example, ZONE_DMA32 starts
3817  * at arch_max_dma_pfn.
3818  */
3819 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3820 {
3821         unsigned long nid;
3822         enum zone_type i;
3823
3824         /* Sort early_node_map as initialisation assumes it is sorted */
3825         sort_node_map();
3826
3827         /* Record where the zone boundaries are */
3828         memset(arch_zone_lowest_possible_pfn, 0,
3829                                 sizeof(arch_zone_lowest_possible_pfn));
3830         memset(arch_zone_highest_possible_pfn, 0,
3831                                 sizeof(arch_zone_highest_possible_pfn));
3832         arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
3833         arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
3834         for (i = 1; i < MAX_NR_ZONES; i++) {
3835                 if (i == ZONE_MOVABLE)
3836                         continue;
3837                 arch_zone_lowest_possible_pfn[i] =
3838                         arch_zone_highest_possible_pfn[i-1];
3839                 arch_zone_highest_possible_pfn[i] =
3840                         max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
3841         }
3842         arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
3843         arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
3844
3845         /* Find the PFNs that ZONE_MOVABLE begins at in each node */
3846         memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
3847         find_zone_movable_pfns_for_nodes(zone_movable_pfn);
3848
3849         /* Print out the zone ranges */
3850         printk("Zone PFN ranges:\n");
3851         for (i = 0; i < MAX_NR_ZONES; i++) {
3852                 if (i == ZONE_MOVABLE)
3853                         continue;
3854                 printk("  %-8s %8lu -> %8lu\n",
3855                                 zone_names[i],
3856                                 arch_zone_lowest_possible_pfn[i],
3857                                 arch_zone_highest_possible_pfn[i]);
3858         }
3859
3860         /* Print out the PFNs ZONE_MOVABLE begins at in each node */
3861         printk("Movable zone start PFN for each node\n");
3862         for (i = 0; i < MAX_NUMNODES; i++) {
3863                 if (zone_movable_pfn[i])
3864                         printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
3865         }
3866
3867         /* Print out the early_node_map[] */
3868         printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
3869         for (i = 0; i < nr_nodemap_entries; i++)
3870                 printk("  %3d: %8lu -> %8lu\n", early_node_map[i].nid,
3871                                                 early_node_map[i].start_pfn,
3872                                                 early_node_map[i].end_pfn);
3873
3874         /* Initialise every node */
3875         setup_nr_node_ids();
3876         for_each_online_node(nid) {
3877                 pg_data_t *pgdat = NODE_DATA(nid);
3878                 free_area_init_node(nid, pgdat, NULL,
3879                                 find_min_pfn_for_node(nid), NULL);
3880
3881                 /* Any memory on that node */
3882                 if (pgdat->node_present_pages)
3883                         node_set_state(nid, N_HIGH_MEMORY);
3884                 check_for_regular_memory(pgdat);
3885         }
3886 }
3887
3888 static int __init cmdline_parse_core(char *p, unsigned long *core)
3889 {
3890         unsigned long long coremem;
3891         if (!p)
3892                 return -EINVAL;
3893
3894         coremem = memparse(p, &p);
3895         *core = coremem >> PAGE_SHIFT;
3896
3897         /* Paranoid check that UL is enough for the coremem value */
3898         WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
3899
3900         return 0;
3901 }
3902
/*
 * kernelcore=size sets the amount of memory for use for allocations that
 * cannot be reclaimed or migrated.
 *
 * The size is parsed by cmdline_parse_core() and stored, in pages, in
 * required_kernelcore. Returns that helper's result (-EINVAL for a NULL
 * argument, otherwise 0).
 */
static int __init cmdline_parse_kernelcore(char *p)
{
        return cmdline_parse_core(p, &required_kernelcore);
}
3911
/*
 * movablecore=size sets the amount of memory for use for allocations that
 * can be reclaimed or migrated.
 *
 * The size is parsed by cmdline_parse_core() and stored, in pages, in
 * required_movablecore. Returns that helper's result (-EINVAL for a NULL
 * argument, otherwise 0).
 */
static int __init cmdline_parse_movablecore(char *p)
{
        return cmdline_parse_core(p, &required_movablecore);
}
3920
/* Hook the "kernelcore" and "movablecore" boot options up to their parsers */
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
3923
3924 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
3925
/**
 * set_dma_reserve - set the specified number of pages reserved in the first zone
 * @new_dma_reserve: The number of pages to mark reserved
 *
 * The per-cpu batchsize and zone watermarks are determined by present_pages.
 * In the DMA zone, a significant percentage may be consumed by kernel image
 * and other unfreeable allocations which can skew the watermarks badly. This
 * function may optionally be used to account for unfreeable pages in the
 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
 * smaller per-cpu batchsize.
 *
 * The value is only stored here, in dma_reserve. free_area_init_core()
 * reads it while sizing zone 0, so it needs to be set before the nodes are
 * initialised to have any effect.
 */
void __init set_dma_reserve(unsigned long new_dma_reserve)
{
        dma_reserve = new_dma_reserve;
}
3941
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * Without CONFIG_NEED_MULTIPLE_NODES a single, statically allocated
 * pglist_data is defined here. Its bdata member points at the bootmem
 * data defined alongside it, and the symbol is exported.
 */
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };

EXPORT_SYMBOL(contig_page_data);
#endif
3948
/*
 * Initialise node 0 from a table of per-zone sizes.
 *
 * @zones_size: per-zone page counts, passed to free_area_init_node()
 *
 * The node's first pfn is the page frame of the physical address that
 * PAGE_OFFSET maps to, and no hole table is supplied (NULL).
 */
void __init free_area_init(unsigned long *zones_size)
{
        free_area_init_node(0, NODE_DATA(0), zones_size,
                        __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
3954
3955 static int page_alloc_cpu_notify(struct notifier_block *self,
3956                                  unsigned long action, void *hcpu)
3957 {
3958         int cpu = (unsigned long)hcpu;
3959
3960         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3961                 local_irq_disable();
3962                 __drain_pages(cpu);
3963                 vm_events_fold_cpu(cpu);
3964                 local_irq_enable();
3965                 refresh_cpu_vm_stats(cpu);
3966         }
3967         return NOTIFY_OK;
3968 }
3969
/*
 * Register page_alloc_cpu_notify() through hotcpu_notifier() so that the
 * page allocator is told when a cpu goes away.
 */
void __init page_alloc_init(void)
{
        hotcpu_notifier(page_alloc_cpu_notify, 0);
}
3974
3975 /*
3976  * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
3977  *      or min_free_kbytes changes.
3978  */
3979 static void calculate_totalreserve_pages(void)
3980 {
3981         struct pglist_data *pgdat;
3982         unsigned long reserve_pages = 0;
3983         enum zone_type i, j;
3984
3985         for_each_online_pgdat(pgdat) {
3986                 for (i = 0; i < MAX_NR_ZONES; i++) {
3987                         struct zone *zone = pgdat->node_zones + i;
3988                         unsigned long max = 0;
3989
3990                         /* Find valid and maximum lowmem_reserve in the zone */
3991                         for (j = i; j < MAX_NR_ZONES; j++) {
3992                                 if (zone->lowmem_reserve[j] > max)
3993                                         max = zone->lowmem_reserve[j];
3994                         }
3995
3996                         /* we treat pages_high as reserved pages. */
3997                         max += zone->pages_high;
3998
3999                         if (max > zone->present_pages)
4000                                 max = zone->present_pages;
4001                         reserve_pages += max;
4002                 }
4003         }
4004         totalreserve_pages = reserve_pages;
4005 }
4006
4007 /*
4008  * setup_per_zone_lowmem_reserve - called whenever
4009  *      sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
4010  *      has a correct pages reserved value, so an adequate number of
4011  *      pages are left in the zone after a successful __alloc_pages().
4012  */
4013 static void setup_per_zone_lowmem_reserve(void)
4014 {
4015         struct pglist_data *pgdat;
4016         enum zone_type j, idx;
4017
4018         for_each_online_pgdat(pgdat) {
4019                 for (j = 0; j < MAX_NR_ZONES; j++) {
4020                         struct zone *zone = pgdat->node_zones + j;
4021                         unsigned long present_pages = zone->present_pages;
4022
4023                         zone->lowmem_reserve[j] = 0;
4024
4025                         idx = j;
4026                         while (idx) {
4027                                 struct zone *lower_zone;
4028
4029                                 idx--;
4030
4031                                 if (sysctl_lowmem_reserve_ratio[idx] < 1)
4032                                         sysctl_lowmem_reserve_ratio[idx] = 1;
4033
4034                                 lower_zone = pgdat->node_zones + idx;
4035                                 lower_zone->lowmem_reserve[j] = present_pages /
4036                                         sysctl_lowmem_reserve_ratio[idx];
4037                                 present_pages += lower_zone->present_pages;
4038                         }
4039                 }
4040         }
4041
4042         /* update totalreserve_pages */
4043         calculate_totalreserve_pages();
4044 }
4045
4046 /**
4047  * setup_per_zone_pages_min - called when min_free_kbytes changes.
4048  *
4049  * Ensures that the pages_{min,low,high} values for each zone are set correctly
4050  * with respect to min_free_kbytes.
4051  */
4052 void setup_per_zone_pages_min(void)
4053 {
4054         unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4055         unsigned long lowmem_pages = 0;
4056         struct zone *zone;
4057         unsigned long flags;
4058
4059         /* Calculate total number of !ZONE_HIGHMEM pages */
4060         for_each_zone(zone) {
4061                 if (!is_highmem(zone))
4062                         lowmem_pages += zone->present_pages;
4063         }
4064
4065         for_each_zone(zone) {
4066                 u64 tmp;
4067
4068                 spin_lock_irqsave(&zone->lru_lock, flags);
4069                 tmp = (u64)pages_min * zone->present_pages;
4070                 do_div(tmp, lowmem_pages);
4071                 if (is_highmem(zone)) {
4072                         /*
4073                          * __GFP_HIGH and PF_MEMALLOC allocations usually don't
4074                          * need highmem pages, so cap pages_min to a small
4075                          * value here.
4076                          *
4077                          * The (pages_high-pages_low) and (pages_low-pages_min)
4078                          * deltas controls asynch page reclaim, and so should
4079                          * not be capped for highmem.
4080                          */
4081                         int min_pages;
4082
4083                         min_pages = zone->present_pages / 1024;
4084                         if (min_pages < SWAP_CLUSTER_MAX)
4085                                 min_pages = SWAP_CLUSTER_MAX;
4086                         if (min_pages > 128)
4087                                 min_pages = 128;
4088                         zone->pages_min = min_pages;
4089                 } else {
4090                         /*
4091                          * If it's a lowmem zone, reserve a number of pages
4092                          * proportionate to the zone's size.
4093                          */
4094                         zone->pages_min = tmp;
4095                 }
4096
4097                 zone->pages_low   = zone->pages_min + (tmp >> 2);
4098                 zone->pages_high  = zone->pages_min + (tmp >> 1);
4099                 setup_zone_migrate_reserve(zone);
4100                 spin_unlock_irqrestore(&zone->lru_lock, flags);
4101         }
4102
4103         /* update totalreserve_pages */
4104         calculate_totalreserve_pages();
4105 }
4106
4107 /*
4108  * Initialise min_free_kbytes.
4109  *
4110  * For small machines we want it small (128k min).  For large machines
4111  * we want it large (64MB max).  But it is not linear, because network
4112  * bandwidth does not increase linearly with machine size.  We use
4113  *
4114  *      min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
4115  *      min_free_kbytes = sqrt(lowmem_kbytes * 16)
4116  *
4117  * which yields
4118  *
4119  * 16MB:        512k
4120  * 32MB:        724k
4121  * 64MB:        1024k
4122  * 128MB:       1448k
4123  * 256MB:       2048k
4124  * 512MB:       2896k
4125  * 1024MB:      4096k
4126  * 2048MB:      5792k
4127  * 4096MB:      8192k
4128  * 8192MB:      11584k
4129  * 16384MB:     16384k
4130  */
4131 static int __init init_per_zone_pages_min(void)
4132 {
4133         unsigned long lowmem_kbytes;
4134
4135         lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
4136
4137         min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
4138         if (min_free_kbytes < 128)
4139                 min_free_kbytes = 128;
4140         if (min_free_kbytes > 65536)
4141                 min_free_kbytes = 65536;
4142         setup_per_zone_pages_min();
4143         setup_per_zone_lowmem_reserve();
4144         return 0;
4145 }
4146 module_init(init_per_zone_pages_min)
4147
4148 /*
4149  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
4150  *      that we can call two helper functions whenever min_free_kbytes
4151  *      changes.
4152  */
4153 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4154         struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4155 {
4156         proc_dointvec(table, write, file, buffer, length, ppos);
4157         if (write)
4158                 setup_per_zone_pages_min();
4159         return 0;
4160 }
4161
4162 #ifdef CONFIG_NUMA
4163 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4164         struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4165 {
4166         struct zone *zone;
4167         int rc;
4168
4169         rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4170         if (rc)
4171                 return rc;
4172
4173         for_each_zone(zone)
4174                 zone->min_unmapped_pages = (zone->present_pages *
4175                                 sysctl_min_unmapped_ratio) / 100;
4176         return 0;
4177 }
4178
4179 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4180         struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4181 {
4182         struct zone *zone;
4183         int rc;
4184
4185         rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4186         if (rc)
4187                 return rc;
4188
4189         for_each_zone(zone)
4190                 zone->min_slab_pages = (zone->present_pages *
4191                                 sysctl_min_slab_ratio) / 100;
4192         return 0;
4193 }
4194 #endif
4195
4196 /*
4197  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
4198  *      proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
4199  *      whenever sysctl_lowmem_reserve_ratio changes.
4200  *
4201  * The reserve ratio obviously has absolutely no relation with the
4202  * pages_min watermarks. The lowmem reserve ratio can only make sense
4203  * if in function of the boot time zone sizes.
4204  */
4205 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4206         struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4207 {
4208         proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4209         setup_per_zone_lowmem_reserve();
4210         return 0;
4211 }
4212
4213 /*
4214  * percpu_pagelist_fraction - changes the pcp->high for each zone on each
4215  * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
4216  * can have before it gets flushed back to buddy allocator.
4217  */
4218
4219 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4220         struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4221 {
4222         struct zone *zone;
4223         unsigned int cpu;
4224         int ret;
4225
4226         ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4227         if (!write || (ret == -EINVAL))
4228                 return ret;
4229         for_each_zone(zone) {
4230                 for_each_online_cpu(cpu) {
4231                         unsigned long  high;
4232                         high = zone->present_pages / percpu_pagelist_fraction;
4233                         setup_pagelist_highmark(zone_pcp(zone, cpu), high);
4234                 }
4235         }
4236         return 0;
4237 }
4238
4239 int hashdist = HASHDIST_DEFAULT;
4240
4241 #ifdef CONFIG_NUMA
4242 static int __init set_hashdist(char *str)
4243 {
4244         if (!str)
4245                 return 0;
4246         hashdist = simple_strtoul(str, &str, 0);
4247         return 1;
4248 }
4249 __setup("hashdist=", set_hashdist);
4250 #endif
4251
4252 /*
4253  * allocate a large system hash table from bootmem
4254  * - it is assumed that the hash table must contain an exact power-of-2
4255  *   quantity of entries
4256  * - limit is the number of hash buckets, not the total allocation size
4257  */
4258 void *__init alloc_large_system_hash(const char *tablename,
4259                                      unsigned long bucketsize,
4260                                      unsigned long numentries,
4261                                      int scale,
4262                                      int flags,
4263                                      unsigned int *_hash_shift,
4264                                      unsigned int *_hash_mask,
4265                                      unsigned long limit)
4266 {
4267         unsigned long long max = limit;
4268         unsigned long log2qty, size;
4269         void *table = NULL;
4270
4271         /* allow the kernel cmdline to have a say */
4272         if (!numentries) {
4273                 /* round applicable memory size up to nearest megabyte */
4274                 numentries = nr_kernel_pages;
4275                 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
4276                 numentries >>= 20 - PAGE_SHIFT;
4277                 numentries <<= 20 - PAGE_SHIFT;
4278
4279                 /* limit to 1 bucket per 2^scale bytes of low memory */
4280                 if (scale > PAGE_SHIFT)
4281                         numentries >>= (scale - PAGE_SHIFT);
4282                 else
4283                         numentries <<= (PAGE_SHIFT - scale);
4284
4285                 /* Make sure we've got at least a 0-order allocation.. */
4286                 if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4287                         numentries = PAGE_SIZE / bucketsize;
4288         }
4289         numentries = roundup_pow_of_two(numentries);
4290
4291         /* limit allocation size to 1/16 total memory by default */
4292         if (max == 0) {
4293                 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
4294                 do_div(max, bucketsize);
4295         }
4296
4297         if (numentries > max)
4298                 numentries = max;
4299
4300         log2qty = ilog2(numentries);
4301
4302         do {
4303                 size = bucketsize << log2qty;
4304                 if (flags & HASH_EARLY)
4305                         table = alloc_bootmem(size);
4306                 else if (hashdist)
4307                         table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4308                 else {
4309                         unsigned long order;
4310                         for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
4311                                 ;
4312                         table = (void*) __get_free_pages(GFP_ATOMIC, order);
4313                         /*
4314                          * If bucketsize is not a power-of-two, we may free
4315                          * some pages at the end of hash table.
4316                          */
4317                         if (table) {
4318                                 unsigned long alloc_end = (unsigned long)table +
4319                                                 (PAGE_SIZE << order);
4320                                 unsigned long used = (unsigned long)table +
4321                                                 PAGE_ALIGN(size);
4322                                 split_page(virt_to_page(table), order);
4323                                 while (used < alloc_end) {
4324                                         free_page(used);
4325                                         used += PAGE_SIZE;
4326                                 }
4327                         }
4328                 }
4329         } while (!table && size > PAGE_SIZE && --log2qty);
4330
4331         if (!table)
4332                 panic("Failed to allocate %s hash table\n", tablename);
4333
4334         printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
4335                tablename,
4336                (1U << log2qty),
4337                ilog2(size) - PAGE_SHIFT,
4338                size);
4339
4340         if (_hash_shift)
4341                 *_hash_shift = log2qty;
4342         if (_hash_mask)
4343                 *_hash_mask = (1 << log2qty) - 1;
4344
4345         return table;
4346 }
4347
#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
/* Out-of-line, exported wrapper around __pfn_to_page(). */
struct page *pfn_to_page(unsigned long pfn)
{
        return __pfn_to_page(pfn);
}
EXPORT_SYMBOL(pfn_to_page);

/* Out-of-line, exported wrapper around __page_to_pfn(). */
unsigned long page_to_pfn(struct page *page)
{
        return __page_to_pfn(page);
}
EXPORT_SYMBOL(page_to_pfn);
#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
4360
4361 /* Return a pointer to the bitmap storing bits affecting a block of pages */
4362 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4363                                                         unsigned long pfn)
4364 {
4365 #ifdef CONFIG_SPARSEMEM
4366         return __pfn_to_section(pfn)->pageblock_flags;
4367 #else
4368         return zone->pageblock_flags;
4369 #endif /* CONFIG_SPARSEMEM */
4370 }
4371
4372 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
4373 {
4374 #ifdef CONFIG_SPARSEMEM
4375         pfn &= (PAGES_PER_SECTION-1);
4376         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4377 #else
4378         pfn = pfn - zone->zone_start_pfn;
4379         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4380 #endif /* CONFIG_SPARSEMEM */
4381 }
4382
/**
 * get_pageblock_flags_group - Read a group of flags of the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @start_bitidx: The first bit of interest to retrieve
 * @end_bitidx: The last bit of interest
 *
 * Returns the requested bits packed into an unsigned long, with bit
 * @start_bitidx of the block in bit 0 of the result.
 */
unsigned long get_pageblock_flags_group(struct page *page,
                                        int start_bitidx, int end_bitidx)
{
        struct zone *zone = page_zone(page);
        unsigned long pfn = page_to_pfn(page);
        unsigned long *bitmap = get_pageblock_bitmap(zone, pfn);
        unsigned long base = pfn_to_bitidx(zone, pfn);
        unsigned long result = 0;
        unsigned long mask = 1;
        int bit;

        for (bit = start_bitidx; bit <= end_bitidx; bit++) {
                if (test_bit(base + bit, bitmap))
                        result |= mask;
                mask <<= 1;
        }

        return result;
}
4410
/**
 * set_pageblock_flags_group - Write a group of flags of a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set; bit 0 is stored at @start_bitidx
 * @start_bitidx: The first bit of interest
 * @end_bitidx: The last bit of interest
 *
 * Every bit in the range is written: set when the matching bit of @flags
 * is set, cleared otherwise.
 */
void set_pageblock_flags_group(struct page *page, unsigned long flags,
                                        int start_bitidx, int end_bitidx)
{
        struct zone *zone = page_zone(page);
        unsigned long pfn = page_to_pfn(page);
        unsigned long *bitmap = get_pageblock_bitmap(zone, pfn);
        unsigned long base = pfn_to_bitidx(zone, pfn);
        unsigned long mask = 1;
        int bit;

        for (bit = start_bitidx; bit <= end_bitidx; bit++, mask <<= 1) {
                if (flags & mask)
                        __set_bit(base + bit, bitmap);
                else
                        __clear_bit(base + bit, bitmap);
        }
}
4437
4438 /*
4439  * This is designed as sub function...plz see page_isolation.c also.
4440  * set/clear page block's type to be ISOLATE.
4441  * page allocater never alloc memory from ISOLATE block.
4442  */
4443
4444 int set_migratetype_isolate(struct page *page)
4445 {
4446         struct zone *zone;
4447         unsigned long flags;
4448         int ret = -EBUSY;
4449
4450         zone = page_zone(page);
4451         spin_lock_irqsave(&zone->lock, flags);
4452         /*
4453          * In future, more migrate types will be able to be isolation target.
4454          */
4455         if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
4456                 goto out;
4457         set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4458         move_freepages_block(zone, page, MIGRATE_ISOLATE);
4459         ret = 0;
4460 out:
4461         spin_unlock_irqrestore(&zone->lock, flags);
4462         if (!ret)
4463                 drain_all_local_pages();
4464         return ret;
4465 }
4466
4467 void unset_migratetype_isolate(struct page *page)
4468 {
4469         struct zone *zone;
4470         unsigned long flags;
4471         zone = page_zone(page);
4472         spin_lock_irqsave(&zone->lock, flags);
4473         if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
4474                 goto out;
4475         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4476         move_freepages_block(zone, page, MIGRATE_MOVABLE);
4477 out:
4478         spin_unlock_irqrestore(&zone->lock, flags);
4479 }
4480
4481 #ifdef CONFIG_MEMORY_HOTREMOVE
4482 /*
4483  * All pages in the range must be isolated before calling this.
4484  */
4485 void
4486 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
4487 {
4488         struct page *page;
4489         struct zone *zone;
4490         int order, i;
4491         unsigned long pfn;
4492         unsigned long flags;
4493         /* find the first valid pfn */
4494         for (pfn = start_pfn; pfn < end_pfn; pfn++)
4495                 if (pfn_valid(pfn))
4496                         break;
4497         if (pfn == end_pfn)
4498                 return;
4499         zone = page_zone(pfn_to_page(pfn));
4500         spin_lock_irqsave(&zone->lock, flags);
4501         pfn = start_pfn;
4502         while (pfn < end_pfn) {
4503                 if (!pfn_valid(pfn)) {
4504                         pfn++;
4505                         continue;
4506                 }
4507                 page = pfn_to_page(pfn);
4508                 BUG_ON(page_count(page));
4509                 BUG_ON(!PageBuddy(page));
4510                 order = page_order(page);
4511 #ifdef CONFIG_DEBUG_VM
4512                 printk(KERN_INFO "remove from free list %lx %d %lx\n",
4513                        pfn, 1 << order, end_pfn);
4514 #endif
4515                 list_del(&page->lru);
4516                 rmv_page_order(page);
4517                 zone->free_area[order].nr_free--;
4518                 __mod_zone_page_state(zone, NR_FREE_PAGES,
4519                                       - (1UL << order));
4520                 for (i = 0; i < (1 << order); i++)
4521                         SetPageReserved((page+i));
4522                 pfn += (1 << order);
4523         }
4524         spin_unlock_irqrestore(&zone->lock, flags);
4525 }
4526 #endif