/*
 * linux/mm/page_alloc.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
/* Use NUMNODES instead of numnodes for better code inside kernel APIs */
#ifndef CONFIG_DISCONTIGMEM
#define NUMNODES 1
#else
#define NUMNODES numnodes
#endif
int nr_swap_pages = 0;
int nr_lru_pages = 0;
pg_data_t *pgdat_list = (pg_data_t *)0;
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10, 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255, 255, 255, };
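
/*
 * Illustrative example of how these tables are used (hypothetical zone):
 * free_area_init_core() below computes mask = realsize / zone_balance_ratio[],
 * clamped to [zone_balance_min, zone_balance_max].  For a 16384-page
 * Normal zone with the default ratio of 128 that gives pages_min = 128,
 * pages_low = 256 and pages_high = 384.  The "memfrac=" boot option at
 * the end of this file overrides zone_balance_ratio[].
 */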
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */
#define memlist_init(x) INIT_LIST_HEAD(x)
#define memlist_add_head list_add
#define memlist_add_tail list_add_tail
#define memlist_del list_del
#define memlist_entry list_entry
#define memlist_next(x) ((x)->next)
#define memlist_prev(x) ((x)->prev)
/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
#if 0

static inline unsigned long classfree(zone_t *zone)
{
	unsigned long free = 0;
	zone_t *z = zone->zone_pgdat->node_zones;

	while (z != zone) {
		free += z->free_pages;
		z++;
	}
	free += zone->free_pages;
	return(free);
}

#endif
/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
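
/*
 * Worked example of the bit tricks below (illustrative, 32-bit word):
 * freeing an order-2 block gives
 *
 *	mask = (~0UL) << 2 = 0xfffffffc,  so  -mask = 1 + ~mask = 4
 *	zone->free_pages -= mask  adds 4 in two's-complement arithmetic
 *	buddy index = page_idx ^ -mask  toggles bit 2 (e.g. 8 <-> 12)
 *	index = page_idx >> (1 + order)  selects the one map bit that a
 *	buddy pair shares at this order.
 */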
void __free_pages_ok (struct page *page, unsigned long order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;
	/*
	 * Subtle. We do not want to test this in the inlined part of
	 * __free_page() - it's a rare condition and just increases
	 * cache footprint unnecessarily. So we do an 'incorrect'
	 * decrement on page->count for reserved pages, but this part
	 * makes it safe.
	 */
	if (PageReserved(page))
		return;
	if (page->buffers)
		BUG();
	if (page->mapping)
		BUG();
	if (page-mem_map >= max_mapnr)
		BUG();
	if (PageSwapCache(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageDecrAfter(page))
		BUG();
	zone = page->zone;

	mask = (~0UL) << order;
	base = mem_map + zone->offset;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;
	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		memlist_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	memlist_add_head(&(base + page_idx)->list, &area->free_list);
	spin_unlock_irqrestore(&zone->lock, flags);

	if (zone->free_pages > zone->pages_high) {
		zone->zone_wake_kswapd = 0;
		zone->low_on_memory = 0;
	}
}
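
/*
 * Illustrative trace of the coalescing loop above (hypothetical state):
 * freeing order-0 page_idx 5 while its buddy, page_idx 4, already sits
 * on free_area[0].  test_and_change_bit() sees the pair bit set, so the
 * pages coalesce: page 4 is unlinked from free_area[0], mask shifts up,
 * page_idx &= mask leaves 4, and the loop retries at order 1.  As soon
 * as the bit test returns 0 (that buddy still allocated) the merged
 * block is queued on the current order's free_list.
 */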
#define MARK_USED(index, order, area) \
	change_bit((index) >> (1+(order)), (area)->map)
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		memlist_add_head(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
static struct page * rmqueue(zone_t *zone, unsigned long order)
{
	free_area_t * area = zone->free_area + order;
	unsigned long curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = memlist_next(head);

		if (curr != head) {
			unsigned int index;

			page = memlist_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			memlist_del(curr);
			index = (page - mem_map) - zone->offset;
			MARK_USED(index, curr_order, area);
			zone->free_pages -= 1 << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}
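
/*
 * Illustrative example (hypothetical state): an order-0 request finds
 * free_area[0] and free_area[1] empty but an order-2 block at page_idx 8
 * on free_area[2].  rmqueue() unlinks that block and expand() splits it:
 * the order-1 half (pages 8-9) goes back on free_area[1], page 10 goes
 * on free_area[0], and page 11 is returned to the caller with its count
 * set to 1.
 */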
static int zone_balance_memory(zonelist_t *zonelist)
{
	int tried = 0, freed = 0;
	zone_t **zone;
	int gfp_mask = zonelist->gfp_mask;
	extern wait_queue_head_t kswapd_wait;

	zone = zonelist->zones;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;
		if (z->free_pages > z->pages_low)
			continue;

		z->zone_wake_kswapd = 1;
		wake_up_interruptible(&kswapd_wait);

		/* Are we reaching the critical stage? */
		if (!z->low_on_memory) {
			/* Not yet critical, so let kswapd handle it.. */
			if (z->free_pages > z->pages_min)
				continue;
			z->low_on_memory = 1;
		}
		/*
		 * In the atomic allocation case we only 'kick' the
		 * state machine, but do not try to free pages
		 * ourselves.
		 */
		tried = 1;
		freed |= try_to_free_pages(gfp_mask, z);
	}
	if (tried && !freed) {
		if (!(gfp_mask & __GFP_HIGH))
			return 0;
	}
	return 1;
}
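
/*
 * Note on the failure path above: when zones were scanned (tried) but
 * try_to_free_pages() freed nothing, only callers whose gfp_mask has
 * __GFP_HIGH set (high-priority / atomic-style allocations) are still
 * told to go ahead; everyone else gets 0 and __alloc_pages() returns
 * NULL instead of retrying its allocate_ok loop.
 */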
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
	zone_t **zone = zonelist->zones;
	int gfp_mask = zonelist->gfp_mask;
	static int low_on_memory;
	/*
	 * If this is a recursive call, we'd better
	 * do our best to just allocate things without
	 * further thought.
	 */
	if (current->flags & PF_MEMALLOC)
		goto allocate_ok;
	/* If we're a memory hog, unmap some pages */
	if (current->hog && low_on_memory && (gfp_mask & __GFP_WAIT)) {
//		swap_out(6, gfp_mask);
//		shm_swap(6, gfp_mask, (zone_t *)(zone));
		try_to_free_pages(gfp_mask, (zone_t *)(zone));
	}
	/*
	 * (If anyone calls gfp from interrupts nonatomically then it
	 * will sooner or later be tripped up by a schedule().)
	 *
	 * We are falling back to lower-level zones if allocation
	 * in a higher zone fails.
	 */
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;
		if (!z->size)
			BUG();

		/* Are we supposed to free memory? Don't make it worse.. */
		if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) {
			struct page *page = rmqueue(z, order);
			low_on_memory = 0;
			if (page)
				return page;
		}
	}
	low_on_memory = 1;
	/*
	 * Ok, no obvious zones were available, start
	 * balancing things a bit..
	 */
	if (zone_balance_memory(zonelist)) {
		zone = zonelist->zones;
allocate_ok:
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;
			if (z->free_pages) {
				struct page *page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
	}
	return NULL;
}
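
/*
 * A minimal sketch of how a caller reaches __alloc_pages().  The real
 * wrappers (alloc_pages(), __get_free_pages() and friends) live outside
 * this file, so the helper below is illustrative only and kept compiled
 * out; the zonelist indexing and the page_address() use are assumptions
 * about those headers, not part of this file.
 */
#if 0
static unsigned long example_get_free_page(int gfp_mask)
{
	/* Pick node 0's fallback list matching the gfp mask... */
	zonelist_t *zonelist = NODE_DATA(0)->node_zonelists + gfp_mask;
	/* ...and ask the buddy allocator for a single (order-0) page. */
	struct page *page = __alloc_pages(zonelist, 0);

	if (!page)
		return 0;
	return page_address(page);
}
#endif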
/*
 * The main chunk of the balancing code is in this offline branch:
 */

/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum;
	zone_t *zone;
	int i;

	sum = 0;
	for (i = 0; i < NUMNODES; i++)
		for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->free_pages;
	return sum;
}
/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
	unsigned int sum;
	zone_t *zone;
	int i;

	sum = nr_lru_pages;
	for (i = 0; i < NUMNODES; i++)
		for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
			sum += zone->free_pages;
	return sum;
}
#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
	int i;
	unsigned int pages = 0;

	for (i = 0; i < NUMNODES; i++)
		pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages;
	return pages;
}
#endif
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(int nid)
{
	unsigned long order;
	unsigned type;

	printk("Free pages: %6dkB (%6dkB HighMem)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		nr_free_highpages() << (PAGE_SHIFT-10));

	printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
		nr_free_pages(),
		nr_lru_pages,
		freepages.min,
		freepages.low,
		freepages.high);

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = NODE_DATA(nid)->node_zones + type;
		unsigned long nr, total, flags;

		printk(" %s: ", zone->name);

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					curr = memlist_next(curr);
					if (curr == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr,
						(PAGE_SIZE>>10) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		printk("= %lukB)\n", total * (PAGE_SIZE>>10));
	}

#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}
void show_free_areas(void)
{
	show_free_areas_core(0);
}
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i < NR_GFPINDEX; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		zonelist->gfp_mask = i;
		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}
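
/*
 * Example of the fallback lists this produces (all three zones present
 * and CONFIG_HIGHMEM assumed): because the switch falls through, a
 * __GFP_HIGHMEM request gets { HighMem, Normal, DMA, NULL }, a plain
 * request gets { Normal, DMA, NULL }, and a __GFP_DMA request gets
 * { DMA, NULL }.
 */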
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size)
{
	struct page *p, *lmem_map;
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	unsigned int cumulative = 0;

	pgdat->node_next = pgdat_list;
	pgdat_list = pgdat;

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);
	/*
	 * Select nr of pages we try to keep free for important stuff
	 * with a minimum of 10 pages and a maximum of 256 pages, so
	 * that we don't waste too much memory on large systems.
	 * This is fairly arbitrary, but based on some behaviour
	 * analysis.
	 */
	i = realtotalpages >> 7;
	if (i < 10)
		i = 10;
	if (i > 256)
		i = 256;
	freepages.min += i;
	freepages.low += i * 2;
	freepages.high += i * 3;
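	/*
	 * Worked example with illustrative numbers: a 128MB node has
	 * 32768 4kB pages, so i = 32768 >> 7 = 256 (right at the upper
	 * clamp) and this node contributes 256/512/768 pages to
	 * freepages.min/low/high.  A 16MB node (4096 pages) gives
	 * i = 32, i.e. contributions of 32/64/96 pages.
	 */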
	memlist_init(&lru_cache);

	/*
	 * Some architectures (with lots of mem and discontiguous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET, we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
	lmem_map = (struct page *)(PAGE_OFFSET +
		MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	/*
	 * Initially all pages are reserved - free ones are freed
	 * up by free_all_bootmem() once the early boot process is
	 * done.
	 */
	for (p = lmem_map; p < lmem_map + totalpages; p++) {
		set_page_count(p, 0);
		SetPageReserved(p);
		init_waitqueue_head(&p->wait);
		memlist_init(&p->list);
	}
	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;

		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		if (!size)
			continue;

		zone->offset = offset;
		cumulative += size;
		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;
		zone->low_on_memory = 0;
		zone->zone_wake_kswapd = 0;
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			page->zone = zone;
			if (j != ZONE_HIGHMEM) {
				page->virtual = (unsigned long)(__va(zone_start_paddr));
				zone_start_paddr += PAGE_SIZE;
			}
		}

		offset += size;
		mask = -1;
		for (i = 0; i < MAX_ORDER; i++) {
			unsigned long bitmap_size;

			memlist_init(&zone->free_area[i].free_list);
			mask += mask;
			size = (size + ~mask) & mask;
			bitmap_size = size >> i;
			bitmap_size = (bitmap_size + 7) >> 3;
			bitmap_size = LONG_ALIGN(bitmap_size);
			zone->free_area[i].map =
			  (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}
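
/*
 * Worked example of the free_area bitmap sizing above (illustrative
 * 16384-page zone): at order i the zone size is first rounded up to a
 * multiple of 2^(i+1) pages, then size >> i bits are allocated and
 * rounded up to whole, long-aligned bytes.  Order 0 therefore gets
 * 16384 bits (2048 bytes), order 1 gets 8192 bits (1024 bytes), and the
 * bitmap halves at each higher order.
 */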
void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0);
}
static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);
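
/*
 * Usage example (illustrative values): booting with "memfrac=16,32,128"
 * sets zone_balance_ratio[] to { 16, 32, 128 }, so free_area_init_core()
 * above aims to keep roughly 1/16th of the DMA zone, 1/32nd of the
 * Normal zone and 1/128th of the HighMem zone free (pages_min), subject
 * to the zone_balance_min/zone_balance_max clamps.
 */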