/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
/* Use NUMNODES instead of numnodes for better code inside kernel APIs */
#ifndef CONFIG_DISCONTIGMEM
#define NUMNODES 1
#else
#define NUMNODES numnodes
#endif
int nr_swap_pages;
int nr_lru_pages;
pg_data_t *pgdat_list;

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10, 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255, 255, 255, };
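
/*
 * zone_balance_ratio determines the per-zone free-page watermarks set up
 * in free_area_init_core(): pages_min = realsize / zone_balance_ratio,
 * clamped to [zone_balance_min, zone_balance_max], with pages_low and
 * pages_high at 2x and 3x pages_min.  Illustrative arithmetic (not from
 * the source): a 16384-page Normal zone with ratio 128 gets pages_min =
 * 128, pages_low = 256, pages_high = 384.  The ratios can be overridden
 * at boot with the "memfrac=" parameter (see setup_mem_frac() below).
 */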
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */
#define memlist_init(x) INIT_LIST_HEAD(x)
#define memlist_add_head list_add
#define memlist_add_tail list_add_tail
#define memlist_del list_del
#define memlist_entry list_entry
#define memlist_next(x) ((x)->next)
#define memlist_prev(x) ((x)->prev)
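
/*
 * The memlist_* macros are thin wrappers around the generic list
 * primitives from <linux/list.h>; the free lists below are ordinary
 * doubly-linked lists of struct page.
 */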
/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
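
/*
 * BAD_RANGE(zone, x) is true when page x does not belong to zone: either
 * its ->zone pointer disagrees, or its mem_map index falls outside
 * [zone->offset, zone->offset + zone->size).
 */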
/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
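
/*
 * __free_pages_ok() returns a block of 2^order pages to its zone and
 * merges it with its buddy for as long as the buddy is free too.  Each
 * bit in area->map covers one pair of buddies and is toggled whenever
 * either of them is allocated or freed, so test_and_change_bit()
 * returning zero here means the buddy is still allocated and coalescing
 * stops at this order.
 */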
static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
static void __free_pages_ok (struct page *page, unsigned long order)
{
        unsigned long index, page_idx, mask, flags;
        free_area_t *area;
        struct page *base;
        zone_t *zone;

        /*
         * Subtle. We do not want to test this in the inlined part of
         * __free_page() - it's a rare condition and just increases
         * cache footprint unnecessarily. So we do an 'incorrect'
         * decrement on page->count for reserved pages, but this part
         * makes it safe.
         */
        if (PageReserved(page))
                return;

        if (page->buffers)
                BUG();
        if (page->mapping)
                BUG();
        if (!VALID_PAGE(page))
                BUG();
        if (PageSwapCache(page))
                BUG();
        if (PageLocked(page))
                BUG();
        if (PageDecrAfter(page))
                BUG();
        if (PageDirty(page))
                BUG();

        zone = page->zone;

        mask = (~0UL) << order;
        base = mem_map + zone->offset;
        page_idx = page - base;
        if (page_idx & ~mask)
                BUG();
        index = page_idx >> (1 + order);

        area = zone->free_area + order;

        spin_lock_irqsave(&zone->lock, flags);

        zone->free_pages -= mask;

        while (mask + (1 << (MAX_ORDER-1))) {
                struct page *buddy1, *buddy2;

                if (area >= zone->free_area + MAX_ORDER)
                        BUG();
                if (!test_and_change_bit(index, area->map))
                        /*
                         * the buddy page is still allocated.
                         */
                        break;
                /*
                 * Move the buddy up one level.
                 */
                buddy1 = base + (page_idx ^ -mask);
                buddy2 = base + page_idx;
                if (BAD_RANGE(zone,buddy1))
                        BUG();
                if (BAD_RANGE(zone,buddy2))
                        BUG();

                memlist_del(&buddy1->list);
                mask <<= 1;
                area++;
                index >>= 1;
                page_idx &= mask;
        }
        memlist_add_head(&(base + page_idx)->list, &area->free_list);

        spin_unlock_irqrestore(&zone->lock, flags);

        if (zone->free_pages > zone->pages_high) {
                zone->zone_wake_kswapd = 0;
                zone->low_on_memory = 0;
        }
}
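
/*
 * MARK_USED() toggles the buddy-pair bit for a block of the given order,
 * keeping the bitmap consistent with allocations and frees.
 *
 * expand() undoes coalescing on allocation: when rmqueue() finds only a
 * block of order 'high' for a request of order 'low', it repeatedly
 * halves the block, returning the lower half to the free list at each
 * intermediate order and descending into the upper half, until a block
 * of order 'low' remains to hand back to the caller.
 */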
#define MARK_USED(index, order, area) \
        change_bit((index) >> (1+(order)), (area)->map)

static inline struct page * expand (zone_t *zone, struct page *page,
         unsigned long index, int low, int high, free_area_t * area)
{
        unsigned long size = 1 << high;

        while (high > low) {
                if (BAD_RANGE(zone,page))
                        BUG();
                area--;
                high--;
                size >>= 1;
                memlist_add_head(&(page)->list, &(area)->free_list);
                MARK_USED(index, high, area);
                index += size;
                page += size;
        }
        if (BAD_RANGE(zone,page))
                BUG();
        return page;
}
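
/*
 * rmqueue() takes the zone lock, scans the free lists from the requested
 * order upwards, removes the first block it finds, updates the bitmap
 * and free-page count, and lets expand() split the block down to the
 * requested order.  Returns NULL if no block of sufficient order exists.
 */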
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
static struct page * rmqueue(zone_t *zone, unsigned long order)
{
        free_area_t * area = zone->free_area + order;
        unsigned long curr_order = order;
        struct list_head *head, *curr;
        unsigned long flags;
        struct page *page;

        spin_lock_irqsave(&zone->lock, flags);
        do {
                head = &area->free_list;
                curr = memlist_next(head);

                if (curr != head) {
                        unsigned int index;

                        page = memlist_entry(curr, struct page, list);
                        if (BAD_RANGE(zone,page))
                                BUG();
                        memlist_del(curr);
                        index = (page - mem_map) - zone->offset;
                        MARK_USED(index, curr_order, area);
                        zone->free_pages -= 1 << order;

                        page = expand(zone, page, index, order, curr_order, area);
                        spin_unlock_irqrestore(&zone->lock, flags);

                        set_page_count(page, 1);
                        if (BAD_RANGE(zone,page))
                                BUG();
                        return page;
                }
                curr_order++;
                area++;
        } while (curr_order < MAX_ORDER);
        spin_unlock_irqrestore(&zone->lock, flags);

        return NULL;
}
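
/*
 * Allocation policy, in order of increasing desperation:
 *  1. try zones that are not already marked for kswapd balancing;
 *  2. wake kswapd and try zones that are not yet low_on_memory;
 *  3. do synchronous reclaim via try_to_free_pages() (skipped when the
 *     caller itself runs with PF_MEMALLOC); failure is fatal unless
 *     __GFP_HIGH is set;
 *  4. as a last resort, take whatever any zone can still give.
 */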
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
        zone_t **zone;
        extern wait_queue_head_t kswapd_wait;

        /*
         * (If anyone calls gfp from interrupts nonatomically then it
         * will sooner or later be tripped up by a schedule().)
         *
         * We are falling back to lower-level zones if allocation
         * in a higher zone fails.
         */
        zone = zonelist->zones;
        for (;;) {
                zone_t *z = *(zone++);
                if (!z)
                        break;
                if (!z->size)
                        BUG();

                /* Are we supposed to free memory? Don't make it worse.. */
                if (!z->zone_wake_kswapd) {
                        struct page *page = rmqueue(z, order);
                        if (z->free_pages < z->pages_low) {
                                z->zone_wake_kswapd = 1;
                                if (waitqueue_active(&kswapd_wait))
                                        wake_up_interruptible(&kswapd_wait);
                        }
                        if (page)
                                return page;
                }
        }

        /* Three possibilities to get here:
         * - A previous alloc_pages() set zone_wake_kswapd on the last zone
         *   and started kswapd, but kswapd has not been able to release
         *   enough pages to clear zone_wake_kswapd on any zone.
         * - A different zonelist: the previous one did not have all zones
         *   with zone_wake_kswapd set but this one has... should kswapd be
         *   woken up? It will run once.
         * - SMP race: kswapd went back to sleep slightly after the
         *   'if (waitqueue_active(...))' check above saw it running.
         * + anyway the test is very cheap to do...
         */
        if (waitqueue_active(&kswapd_wait))
                wake_up_interruptible(&kswapd_wait);

        /*
         * Ok, we don't have any zones that don't need some
         * balancing.. See if we have any that aren't critical..
         */
        zone = zonelist->zones;
        for (;;) {
                zone_t *z = *(zone++);
                if (!z)
                        break;
                if (!z->low_on_memory) {
                        struct page *page = rmqueue(z, order);
                        if (z->free_pages < z->pages_min)
                                z->low_on_memory = 1;
                        if (page)
                                return page;
                }
        }

        /*
         * Uhhuh. All the zones have been critical, which means that
         * we'd better do some synchronous swap-out. kswapd has not
         * been able to cope..
         */
        if (!(current->flags & PF_MEMALLOC)) {
                int gfp_mask = zonelist->gfp_mask;
                if (!try_to_free_pages(gfp_mask)) {
                        if (!(gfp_mask & __GFP_HIGH))
                                goto fail;
                }
        }

        /*
         * Final phase: allocate anything we can!
         */
        zone = zonelist->zones;
        for (;;) {
                struct page *page;

                zone_t *z = *(zone++);
                if (!z)
                        break;
                page = rmqueue(z, order);
                if (page)
                        return page;
        }

fail:
        /* No luck.. */
        return NULL;
}
/*
 * Common helper functions.
 */
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
        struct page * page;

        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        return (unsigned long) page_address(page);
}
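
/*
 * Illustrative caller (not part of this file): a driver that needs a
 * physically contiguous, page-aligned buffer of four pages might do
 *
 *      unsigned long buf = __get_free_pages(GFP_KERNEL, 2);
 *      if (!buf)
 *              return -ENOMEM;
 *      ...
 *      free_pages(buf, 2);
 *
 * __get_free_pages() hands back a kernel virtual address, so it cannot
 * be used with __GFP_HIGHMEM; use alloc_pages() and kmap() for highmem
 * pages.
 */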
unsigned long get_zeroed_page(int gfp_mask)
{
        struct page * page;

        page = alloc_pages(gfp_mask, 0);
        if (page) {
                void *address = page_address(page);
                clear_page(address);
                return (unsigned long) address;
        }
        return 0;
}
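
/*
 * __free_pages() works on a struct page from alloc_pages(); free_pages()
 * below takes the kernel virtual address returned by __get_free_pages().
 * Both drop one reference and only return the block to the buddy
 * allocator once the page count reaches zero.
 */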
void __free_pages(struct page *page, unsigned long order)
{
        if (put_page_testzero(page))
                __free_pages_ok(page, order);
}
void free_pages(unsigned long addr, unsigned long order)
{
        struct page *fpage;

#ifdef CONFIG_DISCONTIGMEM
        if (addr == 0) return;
#endif
        fpage = virt_to_page(addr);
        if (VALID_PAGE(fpage))
                __free_pages(fpage, order);
}
/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
        unsigned int sum;
        zone_t *zone;
        int i;

        sum = 0;
        for (i = 0; i < NUMNODES; i++)
                for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
                        sum += zone->free_pages;
        return sum;
}
/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
        unsigned int sum;
        zone_t *zone;
        int i;

        sum = nr_lru_pages / 3;
        for (i = 0; i < NUMNODES; i++)
                for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
                        sum += zone->free_pages;
        return sum;
}
#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
        int i;
        unsigned int pages = 0;

        for (i = 0; i < NUMNODES; i++)
                pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages;
        return pages;
}
#endif
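
/*
 * The per-zone line printed by show_free_areas_core() starts with a
 * status character - ' ' above pages_high, 'H' above pages_low, 'M'
 * above pages_min, 'L' at or below pages_min - followed by the
 * zone_wake_kswapd and low_on_memory flags and the free-list counts
 * for each order.
 */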
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(int nid)
{
        unsigned long order;
        unsigned type;

        printk("Free pages: %6dkB (%6dkB HighMem)\n",
                nr_free_pages() << (PAGE_SHIFT-10),
                nr_free_highpages() << (PAGE_SHIFT-10));

        printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
                nr_free_pages(),
                nr_lru_pages,
                freepages.min,
                freepages.low,
                freepages.high);

        for (type = 0; type < MAX_NR_ZONES; type++) {
                struct list_head *head, *curr;
                zone_t *zone = NODE_DATA(nid)->node_zones + type;
                unsigned long nr, total, flags;

                printk("  %c%d%d %s: ",
                        (zone->free_pages > zone->pages_low
                                ? (zone->free_pages > zone->pages_high
                                        ? ' '
                                        : 'H')
                                : (zone->free_pages > zone->pages_min
                                        ? 'M'
                                        : 'L')),
                        zone->zone_wake_kswapd, zone->low_on_memory,
                        zone->name);

                total = 0;
                if (zone->size) {
                        spin_lock_irqsave(&zone->lock, flags);
                        for (order = 0; order < MAX_ORDER; order++) {
                                head = &(zone->free_area + order)->free_list;
                                curr = head;
                                nr = 0;
                                for (;;) {
                                        curr = memlist_next(curr);
                                        if (curr == head)
                                                break;
                                        nr++;
                                }
                                total += nr * (1 << order);
                                printk("%lu*%lukB ", nr,
                                                (PAGE_SIZE>>10) << order);
                        }
                        spin_unlock_irqrestore(&zone->lock, flags);
                }
                printk("= %lukB)\n", total * (PAGE_SIZE>>10));
        }

#ifdef SWAP_CACHE_INFO
        show_swap_cache_info();
#endif
}
void show_free_areas(void)
{
        show_free_areas_core(0);
}
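
/*
 * Each node keeps one zonelist per possible gfp_mask value.  The
 * zonelist orders zones from most to least preferred, falling through
 * from HIGHMEM to NORMAL to DMA: a __GFP_HIGHMEM allocation may be
 * satisfied from any zone, a normal allocation from NORMAL or DMA, and
 * a __GFP_DMA allocation from the DMA zone only.
 */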
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
        int i, j, k;

        for (i = 0; i < NR_GFPINDEX; i++) {
                zonelist_t *zonelist;
                zone_t *zone;

                zonelist = pgdat->node_zonelists + i;
                memset(zonelist, 0, sizeof(*zonelist));

                zonelist->gfp_mask = i;
                j = 0;
                k = ZONE_NORMAL;
                if (i & __GFP_HIGHMEM)
                        k = ZONE_HIGHMEM;
                if (i & __GFP_DMA)
                        k = ZONE_DMA;

                switch (k) {
                        default:
                                BUG();
                        /*
                         * fallthrough:
                         */
                        case ZONE_HIGHMEM:
                                zone = pgdat->node_zones + ZONE_HIGHMEM;
                                if (zone->size) {
#ifndef CONFIG_HIGHMEM
                                        BUG();
#endif
                                        zonelist->zones[j++] = zone;
                                }
                        case ZONE_NORMAL:
                                zone = pgdat->node_zones + ZONE_NORMAL;
                                if (zone->size)
                                        zonelist->zones[j++] = zone;
                        case ZONE_DMA:
                                zone = pgdat->node_zones + ZONE_DMA;
                                if (zone->size)
                                        zonelist->zones[j++] = zone;
                }
                zonelist->zones[j++] = NULL;
        }
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
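/*
 * nid/pgdat identify the node being set up, gmap receives the node's
 * mem_map, zones_size[] gives the spanned pages per zone,
 * zone_start_paddr the physical start of the node, and the optional
 * zholes_size[] the pages per zone that are holes and must not be
 * counted towards the watermarks.
 */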
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
        unsigned long *zones_size, unsigned long zone_start_paddr,
        unsigned long *zholes_size)
{
        struct page *p, *lmem_map;
        unsigned long i, j;
        unsigned long map_size;
        unsigned long totalpages, offset, realtotalpages;
        unsigned int cumulative = 0;

        pgdat->node_next = pgdat_list;
        pgdat_list = pgdat;

        totalpages = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long size = zones_size[i];
                totalpages += size;
        }
        realtotalpages = totalpages;
        if (zholes_size)
                for (i = 0; i < MAX_NR_ZONES; i++)
                        realtotalpages -= zholes_size[i];

        printk("On node %d totalpages: %lu\n", nid, realtotalpages);

        /*
         * Select nr of pages we try to keep free for important stuff
         * with a minimum of 10 pages and a maximum of 256 pages, so
         * that we don't waste too much memory on large systems.
         * This is fairly arbitrary, but based on some behaviour
         * analysis.
         */
        i = realtotalpages >> 7;
        if (i < 10)
                i = 10;
        if (i > 256)
                i = 256;
        freepages.min += i;
        freepages.low += i * 2;
        freepages.high += i * 3;
        memlist_init(&lru_cache);

        /*
         * Some architectures (with lots of mem and discontinuous memory
         * maps) have to search for a good mem_map area:
         * For discontigmem, the conceptual mem map array starts from
         * PAGE_OFFSET, we need to align the actual array onto a mem map
         * boundary, so that MAP_NR works.
         */
        map_size = (totalpages + 1)*sizeof(struct page);
        lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
        lmem_map = (struct page *)(PAGE_OFFSET +
                        MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
        *gmap = pgdat->node_mem_map = lmem_map;
        pgdat->node_size = totalpages;
        pgdat->node_start_paddr = zone_start_paddr;
        pgdat->node_start_mapnr = (lmem_map - mem_map);

        /*
         * Initially all pages are reserved - free ones are freed
         * up by free_all_bootmem() once the early boot process is
         * done.
         */
        for (p = lmem_map; p < lmem_map + totalpages; p++) {
                set_page_count(p, 0);
                SetPageReserved(p);
                init_waitqueue_head(&p->wait);
                memlist_init(&p->list);
        }

        offset = lmem_map - mem_map;
        for (j = 0; j < MAX_NR_ZONES; j++) {
                zone_t *zone = pgdat->node_zones + j;
                unsigned long mask;
                unsigned long size, realsize;

                realsize = size = zones_size[j];
                if (zholes_size)
                        realsize -= zholes_size[j];

                printk("zone(%lu): %lu pages.\n", j, size);
                zone->size = size;
                zone->name = zone_names[j];
                zone->lock = SPIN_LOCK_UNLOCKED;
                zone->zone_pgdat = pgdat;
                zone->free_pages = 0;
                if (!size)
                        continue;

                zone->offset = offset;
                cumulative += size;
                mask = (realsize / zone_balance_ratio[j]);
                if (mask < zone_balance_min[j])
                        mask = zone_balance_min[j];
                else if (mask > zone_balance_max[j])
                        mask = zone_balance_max[j];
                zone->pages_min = mask;
                zone->pages_low = mask*2;
                zone->pages_high = mask*3;
                zone->low_on_memory = 0;
                zone->zone_wake_kswapd = 0;
                zone->zone_mem_map = mem_map + offset;
                zone->zone_start_mapnr = offset;
                zone->zone_start_paddr = zone_start_paddr;

                for (i = 0; i < size; i++) {
                        struct page *page = mem_map + offset + i;
                        page->zone = zone;
                        if (j != ZONE_HIGHMEM) {
                                page->virtual = __va(zone_start_paddr);
                                zone_start_paddr += PAGE_SIZE;
                        }
                }

                offset += size;
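                /*
                 * Allocate the buddy bitmaps, one per order.  Each bit
                 * covers a pair of buddies (see MARK_USED above); the byte
                 * count is rounded up and long-aligned before handing it
                 * to alloc_bootmem_node(), which returns zeroed memory.
                 */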
                mask = -1;
                for (i = 0; i < MAX_ORDER; i++) {
                        unsigned long bitmap_size;

                        memlist_init(&zone->free_area[i].free_list);
                        mask += mask;
                        size = (size + ~mask) & mask;
                        bitmap_size = size >> i;
                        bitmap_size = (bitmap_size + 7) >> 3;
                        bitmap_size = LONG_ALIGN(bitmap_size);
                        zone->free_area[i].map =
                          (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
                }
        }
        build_zonelists(pgdat);
}
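
/*
 * free_area_init() is the entry point for the common, single-node case:
 * it initialises node 0 with the global mem_map and no memory holes.
 */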
void __init free_area_init(unsigned long *zones_size)
{
        free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0);
}
static int __init setup_mem_frac(char *str)
{
        int j = 0;

        while (get_option(&str, &zone_balance_ratio[j++]) == 2);
        printk("setup_mem_frac: ");
        for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
        printk("\n");
        return 1;
}

__setup("memfrac=", setup_mem_frac);
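
/*
 * Illustrative boot usage (not from this file): "memfrac=16,64,64" lowers
 * zone_balance_ratio for every zone, which raises the pages_min/low/high
 * watermarks computed in free_area_init_core() (subject to the
 * zone_balance_min/max clamps), so kswapd keeps more memory free.
 */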