/*
 * linux/mm/page_alloc.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
/* Use NUMNODES instead of numnodes for better code inside kernel APIs */
#ifndef CONFIG_DISCONTIGMEM
#define NUMNODES 1
#else
#define NUMNODES numnodes
#endif
int nr_swap_pages = 0;
int nr_lru_pages = 0;
pg_data_t *pgdat_list = (pg_data_t *)0;
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10, 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255, 255, 255, };
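
/*
 * Illustrative example of how these tables are used (hypothetical zone):
 * free_area_init_core() below computes mask = realsize / zone_balance_ratio[],
 * clamped to [zone_balance_min, zone_balance_max].  For a 16384-page
 * Normal zone with the default ratio of 128 that gives pages_min = 128,
 * pages_low = 256 and pages_high = 384.  The "memfrac=" boot option at
 * the end of this file overrides zone_balance_ratio[].
 */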
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */
#define memlist_init(x) INIT_LIST_HEAD(x)
#define memlist_add_head list_add
#define memlist_add_tail list_add_tail
#define memlist_del list_del
#define memlist_entry list_entry
#define memlist_next(x) ((x)->next)
#define memlist_prev(x) ((x)->prev)
/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
#if 0

static inline unsigned long classfree(zone_t *zone)
{
	unsigned long free = 0;
	zone_t *z = zone->zone_pgdat->node_zones;

	while (z != zone) {
		free += z->free_pages;
		z++;
	}
	free += zone->free_pages;
	return(free);
}

#endif
/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
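
/*
 * Worked example of the bit tricks below (illustrative, 32-bit word):
 * freeing an order-2 block gives
 *
 *	mask = (~0UL) << 2 = 0xfffffffc,  so  -mask = 1 + ~mask = 4
 *	zone->free_pages -= mask  adds 4 in two's-complement arithmetic
 *	buddy index = page_idx ^ -mask  toggles bit 2 (e.g. 8 <-> 12)
 *	index = page_idx >> (1 + order)  selects the one map bit that a
 *	buddy pair shares at this order.
 */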
void __free_pages_ok (struct page *page, unsigned long order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;
	/*
	 * Subtle. We do not want to test this in the inlined part of
	 * __free_page() - it's a rare condition and just increases
	 * cache footprint unnecessarily. So we do an 'incorrect'
	 * decrement on page->count for reserved pages, but this part
	 * makes it safe.
	 */
	if (PageReserved(page))
		return;
	if (page->buffers)
		BUG();
	if (page->mapping)
		BUG();
	if (page-mem_map >= max_mapnr)
		BUG();
	if (PageSwapCache(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageDecrAfter(page))
		BUG();
	zone = page->zone;

	mask = (~0UL) << order;
	base = mem_map + zone->offset;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;
	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		memlist_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	memlist_add_head(&(base + page_idx)->list, &area->free_list);
	spin_unlock_irqrestore(&zone->lock, flags);

	if (zone->free_pages > zone->pages_high) {
		zone->zone_wake_kswapd = 0;
		zone->low_on_memory = 0;
	}
}
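
/*
 * Illustrative trace of the coalescing loop above (hypothetical state):
 * freeing order-0 page_idx 5 while its buddy, page_idx 4, already sits
 * on free_area[0].  test_and_change_bit() sees the pair bit set, so the
 * pages coalesce: page 4 is unlinked from free_area[0], mask shifts up,
 * page_idx &= mask leaves 4, and the loop retries at order 1.  As soon
 * as the bit test returns 0 (that buddy still allocated) the merged
 * block is queued on the current order's free_list.
 */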
#define MARK_USED(index, order, area) \
	change_bit((index) >> (1+(order)), (area)->map)
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		memlist_add_head(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
static struct page * rmqueue(zone_t *zone, unsigned long order)
{
	free_area_t * area = zone->free_area + order;
	unsigned long curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = memlist_next(head);

		if (curr != head) {
			unsigned int index;

			page = memlist_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			memlist_del(curr);
			index = (page - mem_map) - zone->offset;
			MARK_USED(index, curr_order, area);
			zone->free_pages -= 1 << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}
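
/*
 * Illustrative example (hypothetical state): an order-0 request finds
 * free_area[0] and free_area[1] empty but an order-2 block at page_idx 8
 * on free_area[2].  rmqueue() unlinks that block and expand() splits it:
 * the order-1 half (pages 8-9) goes back on free_area[1], page 10 goes
 * on free_area[0], and page 11 is returned to the caller with its count
 * set to 1.
 */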
static int zone_balance_memory(zonelist_t *zonelist)
{
	int tried = 0, freed = 0;
	zone_t **zone;
	int gfp_mask = zonelist->gfp_mask;
	extern wait_queue_head_t kswapd_wait;

	zone = zonelist->zones;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;
		if (z->free_pages > z->pages_low)
			continue;

		z->zone_wake_kswapd = 1;
		wake_up_interruptible(&kswapd_wait);

		/* Are we reaching the critical stage? */
		if (!z->low_on_memory) {
			/* Not yet critical, so let kswapd handle it.. */
			if (z->free_pages > z->pages_min)
				continue;
			z->low_on_memory = 1;
		}
		/*
		 * In the atomic allocation case we only 'kick' the
		 * state machine, but do not try to free pages
		 * ourselves.
		 */
		tried = 1;
		freed |= try_to_free_pages(gfp_mask, z);
	}
	if (tried && !freed) {
		if (!(gfp_mask & __GFP_HIGH))
			return 0;
	}
	return 1;
}
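
/*
 * Note on the failure path above: when zones were scanned (tried) but
 * try_to_free_pages() freed nothing, only callers whose gfp_mask has
 * __GFP_HIGH set (high-priority / atomic-style allocations) are still
 * told to go ahead; everyone else gets 0 and __alloc_pages() returns
 * NULL instead of retrying its allocate_ok loop.
 */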
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
	zone_t **zone = zonelist->zones;
	int gfp_mask = zonelist->gfp_mask;
	static int low_on_memory;
	/*
	 * If this is a recursive call, we'd better
	 * do our best to just allocate things without
	 * further thought.
	 */
	if (current->flags & PF_MEMALLOC)
		goto allocate_ok;
	/* If we're a memory hog, unmap some pages */
	if (current->hog && low_on_memory && (gfp_mask & __GFP_WAIT)) {
//		swap_out(6, gfp_mask);
//		shm_swap(6, gfp_mask, (zone_t *)(zone));
		try_to_free_pages(gfp_mask, (zone_t *)(zone));
	}
	/*
	 * (If anyone calls gfp from interrupts nonatomically then it
	 * will sooner or later be tripped up by a schedule().)
	 *
	 * We are falling back to lower-level zones if allocation
	 * in a higher zone fails.
	 */
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;
		if (!z->size)
			BUG();

		/* Are we supposed to free memory? Don't make it worse.. */
		if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) {
			struct page *page = rmqueue(z, order);
			low_on_memory = 0;
			if (page)
				return page;
		}
	}
	low_on_memory = 1;
	/*
	 * Ok, no obvious zones were available, start
	 * balancing things a bit..
	 */
	if (zone_balance_memory(zonelist)) {
		zone = zonelist->zones;
allocate_ok:
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;
			if (z->free_pages) {
				struct page *page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
	}
	return NULL;
}
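
/*
 * A minimal sketch of how a caller reaches __alloc_pages().  The real
 * wrappers (alloc_pages(), __get_free_pages() and friends) live outside
 * this file, so the helper below is illustrative only and kept compiled
 * out; the zonelist indexing and the page_address() use are assumptions
 * about those headers, not part of this file.
 */
#if 0
static unsigned long example_get_free_page(int gfp_mask)
{
	/* Pick node 0's fallback list matching the gfp mask... */
	zonelist_t *zonelist = NODE_DATA(0)->node_zonelists + gfp_mask;
	/* ...and ask the buddy allocator for a single (order-0) page. */
	struct page *page = __alloc_pages(zonelist, 0);

	if (!page)
		return 0;
	return page_address(page);
}
#endif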
/*
 * The main chunk of the balancing code is in this offline branch:
 */

/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum;
	zone_t *zone;
	int i;

	sum = 0;
	for (i = 0; i < NUMNODES; i++)
		for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->free_pages;
	return sum;
}
/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
	unsigned int sum;
	zone_t *zone;
	int i;

	sum = nr_lru_pages;
	for (i = 0; i < NUMNODES; i++)
		for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
			sum += zone->free_pages;
	return sum;
}
#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
	int i;
	unsigned int pages = 0;

	for (i = 0; i < NUMNODES; i++)
		pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages;
	return pages;
}
#endif
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(int nid)
{
	unsigned long order;
	unsigned type;

	printk("Free pages: %6dkB (%6dkB HighMem)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		nr_free_highpages() << (PAGE_SHIFT-10));

	printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
		nr_free_pages(),
		nr_lru_pages,
		freepages.min,
		freepages.low,
		freepages.high);

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = NODE_DATA(nid)->node_zones + type;
		unsigned long nr, total, flags;

		printk(" %s: ", zone->name);

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					curr = memlist_next(curr);
					if (curr == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr,
						(PAGE_SIZE>>10) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		printk("= %lukB)\n", total * (PAGE_SIZE>>10));
	}

#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}
void show_free_areas(void)
{
	show_free_areas_core(0);
}
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i < NR_GFPINDEX; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		zonelist->gfp_mask = i;
		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}
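
/*
 * Example of the fallback lists this produces (all three zones present
 * and CONFIG_HIGHMEM assumed): because the switch falls through, a
 * __GFP_HIGHMEM request gets { HighMem, Normal, DMA, NULL }, a plain
 * request gets { Normal, DMA, NULL }, and a __GFP_DMA request gets
 * { DMA, NULL }.
 */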
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size)
{
	struct page *p, *lmem_map;
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	unsigned int cumulative = 0;

	pgdat->node_next = pgdat_list;
	pgdat_list = pgdat;

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);
	/*
	 * Select nr of pages we try to keep free for important stuff
	 * with a minimum of 10 pages and a maximum of 256 pages, so
	 * that we don't waste too much memory on large systems.
	 * This is fairly arbitrary, but based on some behaviour
	 * analysis.
	 */
	i = realtotalpages >> 7;
	if (i < 10)
		i = 10;
	if (i > 256)
		i = 256;
	freepages.min += i;
	freepages.low += i * 2;
	freepages.high += i * 3;
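	/*
	 * Worked example with illustrative numbers: a 128MB node has
	 * 32768 4kB pages, so i = 32768 >> 7 = 256 (right at the upper
	 * clamp) and this node contributes 256/512/768 pages to
	 * freepages.min/low/high.  A 16MB node (4096 pages) gives
	 * i = 32, i.e. contributions of 32/64/96 pages.
	 */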
	memlist_init(&lru_cache);

	/*
	 * Some architectures (with lots of mem and discontiguous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET, we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
	lmem_map = (struct page *)(PAGE_OFFSET +
		MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	/*
	 * Initially all pages are reserved - free ones are freed
	 * up by free_all_bootmem() once the early boot process is
	 * done.
	 */
	for (p = lmem_map; p < lmem_map + totalpages; p++) {
		set_page_count(p, 0);
		SetPageReserved(p);
		init_waitqueue_head(&p->wait);
		memlist_init(&p->list);
	}
	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;

		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		if (!size)
			continue;

		zone->offset = offset;
		cumulative += size;
		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;
		zone->low_on_memory = 0;
		zone->zone_wake_kswapd = 0;
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			page->zone = zone;
			if (j != ZONE_HIGHMEM) {
				page->virtual = (unsigned long)(__va(zone_start_paddr));
				zone_start_paddr += PAGE_SIZE;
			}
		}

		offset += size;
		mask = -1;
		for (i = 0; i < MAX_ORDER; i++) {
			unsigned long bitmap_size;

			memlist_init(&zone->free_area[i].free_list);
			mask += mask;
			size = (size + ~mask) & mask;
			bitmap_size = size >> i;
			bitmap_size = (bitmap_size + 7) >> 3;
			bitmap_size = LONG_ALIGN(bitmap_size);
			zone->free_area[i].map =
			  (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}
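
/*
 * Worked example of the free_area bitmap sizing above (illustrative
 * 16384-page zone): at order i the zone size is first rounded up to a
 * multiple of 2^(i+1) pages, then size >> i bits are allocated and
 * rounded up to whole, long-aligned bytes.  Order 0 therefore gets
 * 16384 bits (2048 bytes), order 1 gets 8192 bits (1024 bytes), and the
 * bitmap halves at each higher order.
 */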
void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0);
}
static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);
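
/*
 * Usage example (illustrative values): booting with "memfrac=16,32,128"
 * sets zone_balance_ratio[] to { 16, 32, 128 }, so free_area_init_core()
 * above aims to keep roughly 1/16th of the DMA zone, 1/32nd of the
 * Normal zone and 1/128th of the HighMem zone free (pages_min), subject
 * to the zone_balance_min/zone_balance_max clamps.
 */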