2 * linux/mm/page_alloc.c
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
8 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
9 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
12 #include <linux/config.h>
14 #include <linux/swap.h>
15 #include <linux/swapctl.h>
16 #include <linux/interrupt.h>
17 #include <linux/pagemap.h>
18 #include <linux/bootmem.h>
/*
 * NUMNODES: node-iteration bound used by the nr_free_* statistics below.
 * On !CONFIG_DISCONTIGMEM kernels it maps to the global `numnodes`.
 * NOTE(review): the matching #else/#endif of this conditional is not
 * visible in this fragment of the file.
 */
20 /* Use NUMNODES instead of numnodes for better code inside kernel APIs */
21 #ifndef CONFIG_DISCONTIGMEM
24 #define NUMNODES numnodes
/* Head of the linked list of per-node pg_data_t descriptors; new nodes
 * are prepended in free_area_init_core() via pgdat->node_next. */
29 pg_data_t
*pgdat_list
;
/*
 * Per-zone tuning tables, indexed by zone type (DMA / Normal / HighMem).
 * zone_balance_ratio: divisor applied to a zone's real size to derive its
 * pages_min watermark (see free_area_init_core), clamped between
 * zone_balance_min and zone_balance_max pages.  zone_balance_ratio is
 * also runtime-tunable via the "memfrac=" boot option (setup_mem_frac).
 */
31 static char *zone_names
[MAX_NR_ZONES
] = { "DMA", "Normal", "HighMem" };
32 static int zone_balance_ratio
[MAX_NR_ZONES
] = { 32, 128, 128, };
33 static int zone_balance_min
[MAX_NR_ZONES
] = { 10 , 10, 10, };
34 static int zone_balance_max
[MAX_NR_ZONES
] = { 255 , 255, 255, };
37 * Free_page() adds the page to the free lists. This is optimized for
38 * fast normal cases (no error jumps taken normally).
40 * The way to optimize jumps for gcc-2.2.2 is to:
41 * - select the "normal" case and put it inside the if () { XXX }
42 * - no else-statements if you can avoid them
44 * With the above two rules, you get a straight-line execution path
45 * for the normal case, giving better asm-code.
/*
 * memlist_*: thin aliases over the generic list_head API, used for the
 * buddy free lists and the per-page list linkage throughout this file.
 */
48 #define memlist_init(x) INIT_LIST_HEAD(x)
49 #define memlist_add_head list_add
50 #define memlist_add_tail list_add_tail
51 #define memlist_del list_del
52 #define memlist_entry list_entry
53 #define memlist_next(x) ((x)->next)
54 #define memlist_prev(x) ((x)->prev)
57 * Temporary debugging check.
/* True when page x does not belong to `zone`: wrong zone pointer, or its
 * mem_map index falls outside [zone->offset, zone->offset + zone->size). */
59 #define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
62 * Buddy system. Hairy. You really aren't expected to understand this
64 * Hint: -mask = 1+~mask
/*
 * __free_pages_ok() - slow path of page freeing.
 * Returns a 2^order block of pages to its zone's buddy free lists,
 * coalescing the block with its free buddy at each order while possible
 * (the test_and_change_bit on the order's bitmap tells whether the buddy
 * is free).  Takes zone->lock with IRQs disabled for the list/bitmap
 * manipulation.
 * NOTE(review): this view of the file is fragmented - several original
 * lines (braces, BUG/error paths, declarations of `zone`, `base`, `area`)
 * are missing; comments below cover only what is visible here.
 */
67 static void FASTCALL(__free_pages_ok (struct page
*page
, unsigned long order
));
68 static void __free_pages_ok (struct page
*page
, unsigned long order
)
70 unsigned long index
, page_idx
, mask
, flags
;
76 * Subtle. We do not want to test this in the inlined part of
77 * __free_page() - it's a rare condition and just increases
78 * cache footprint unnecesserily. So we do an 'incorrect'
79 * decrement on page->count for reserved pages, but this part
/* Sanity screens: reserved, invalid, swap-cache or decr-after pages must
 * not reach the buddy lists (the taken branches are not visible here). */
82 if (PageReserved(page
))
89 if (!VALID_PAGE(page
))
91 if (PageSwapCache(page
))
95 if (PageDecrAfter(page
))
/* mask = ~0UL << order: low `order` bits clear; used both to align
 * page_idx and (negated) to walk up the coalescing orders. */
102 mask
= (~0UL) << order
;
103 base
= mem_map
+ zone
->offset
;
104 page_idx
= page
- base
;
/* A 2^order block must start on a 2^order boundary within the zone. */
105 if (page_idx
& ~mask
)
107 index
= page_idx
>> (1 + order
);
109 area
= zone
->free_area
+ order
;
111 spin_lock_irqsave(&zone
->lock
, flags
);
/* -mask == 2^order, so subtracting mask adds the freed page count. */
113 zone
->free_pages
-= mask
;
/* Coalesce upward until mask reaches -(1 << (MAX_ORDER-1)), i.e. the
 * top order; see the "-mask = 1+~mask" hint above. */
115 while (mask
+ (1 << (MAX_ORDER
-1))) {
116 struct page
*buddy1
, *buddy2
;
118 if (area
>= zone
->free_area
+ MAX_ORDER
)
/* Bitmap bit toggled: if it is now set, the buddy is still allocated
 * and coalescing stops (loop-exit lines not visible in this view). */
120 if (!test_and_change_bit(index
, area
->map
))
122 * the buddy page is still allocated.
126 * Move the buddy up one level.
/* buddy1 = the buddy block of page_idx at this order (page_idx ^ 2^order). */
128 buddy1
= base
+ (page_idx
^ -mask
);
129 buddy2
= base
+ page_idx
;
130 if (BAD_RANGE(zone
,buddy1
))
132 if (BAD_RANGE(zone
,buddy2
))
/* Unlink the free buddy so the merged, larger block can be re-filed. */
135 memlist_del(&buddy1
->list
);
/* Done coalescing: file the final block on its order's free list. */
141 memlist_add_head(&(base
+ page_idx
)->list
, &area
->free_list
);
143 spin_unlock_irqrestore(&zone
->lock
, flags
);
/* Plenty of memory again: clear the zone's pressure flags. */
145 if (zone
->free_pages
> zone
->pages_high
) {
146 zone
->zone_wake_kswapd
= 0;
147 zone
->low_on_memory
= 0;
/* Toggle the buddy-bitmap bit covering `index` at the given order; the
 * bit's parity tells __free_pages_ok whether a block's buddy is free. */
151 #define MARK_USED(index, order, area) \
152 change_bit((index) >> (1+(order)), (area)->map)
/*
 * expand() - split a free block of order `high` down to order `low`,
 * giving back the unused halves: at each intermediate order one half is
 * put on that order's free list (with its bitmap bit toggled) and the
 * other half is split further.  Returns the final 2^low block.
 * Caller (rmqueue) holds zone->lock.
 * NOTE(review): the loop structure and several lines (size >>= 1,
 * index/page advance, return) are missing from this fragmented view.
 */
154 static inline struct page
* expand (zone_t
*zone
, struct page
*page
,
155 unsigned long index
, int low
, int high
, free_area_t
* area
)
157 unsigned long size
= 1 << high
;
160 if (BAD_RANGE(zone
,page
))
/* Give the first half back to the current order's free list... */
165 memlist_add_head(&(page
)->list
, &(area
)->free_list
);
166 MARK_USED(index
, high
, area
);
/* ...and keep splitting the other half (advance lines not visible). */
170 if (BAD_RANGE(zone
,page
))
/*
 * rmqueue() - take a 2^order block from the zone's buddy free lists.
 * Scans free_area lists from `order` upward; the first non-empty list
 * supplies a block, which expand() splits down to the requested order.
 * On success the page is returned with count set to 1; zone->free_pages
 * is debited.  Runs under zone->lock with IRQs disabled.
 * NOTE(review): fragmented view - the do/while body braces, the empty-
 * list test, the declarations of `page`, `index`, `flags` and the
 * failure return are not visible here.
 */
175 static FASTCALL(struct page
* rmqueue(zone_t
*zone
, unsigned long order
));
176 static struct page
* rmqueue(zone_t
*zone
, unsigned long order
)
178 free_area_t
* area
= zone
->free_area
+ order
;
179 unsigned long curr_order
= order
;
180 struct list_head
*head
, *curr
;
184 spin_lock_irqsave(&zone
->lock
, flags
);
/* First candidate: the free list of the requested order. */
186 head
= &area
->free_list
;
187 curr
= memlist_next(head
);
192 page
= memlist_entry(curr
, struct page
, list
);
193 if (BAD_RANGE(zone
,page
))
/* Index of the page within this zone's slice of mem_map. */
196 index
= (page
- mem_map
) - zone
->offset
;
197 MARK_USED(index
, curr_order
, area
);
198 zone
->free_pages
-= 1 << order
;
/* Split the curr_order block down to the requested order. */
200 page
= expand(zone
, page
, index
, order
, curr_order
, area
);
201 spin_unlock_irqrestore(&zone
->lock
, flags
);
203 set_page_count(page
, 1);
204 if (BAD_RANGE(zone
,page
))
/* This order was empty: retry with the next larger order. */
210 } while (curr_order
< MAX_ORDER
);
211 spin_unlock_irqrestore(&zone
->lock
, flags
);
217 * This is the 'heart' of the zoned buddy allocator:
/*
 * __alloc_pages() - allocate 2^order pages, walking the zonelist's
 * fallback zones in three passes of decreasing pickiness:
 *   1. zones not already flagged for kswapd (waking kswapd when a zone
 *      drops below pages_low),
 *   2. zones not yet low_on_memory (flagging them below pages_min),
 *   3. after synchronous reclaim via try_to_free_pages(), anything left.
 * NOTE(review): fragmented view - the per-pass loop braces, the
 * `if (page) return page;` success exits, the NULL-terminator checks on
 * the zone array, and the final failure return are not visible here.
 */
219 struct page
* __alloc_pages(zonelist_t
*zonelist
, unsigned long order
)
222 extern wait_queue_head_t kswapd_wait
;
225 * (If anyone calls gfp from interrupts nonatomically then it
226 * will sooner or later tripped up by a schedule().)
228 * We are falling back to lower-level zones if allocation
229 * in a higher zone fails.
/* Pass 1: zones that kswapd is not already working on. */
232 zone
= zonelist
->zones
;
234 zone_t
*z
= *(zone
++);
240 /* Are we supposed to free memory? Don't make it worse.. */
241 if (!z
->zone_wake_kswapd
) {
242 struct page
*page
= rmqueue(z
, order
);
243 if (z
->free_pages
< z
->pages_low
) {
244 z
->zone_wake_kswapd
= 1;
245 if (waitqueue_active(&kswapd_wait
))
246 wake_up_interruptible(&kswapd_wait
);
253 /* Three possibilities to get here
254 * - Previous alloc_pages resulted in last zone set to have
255 * zone_wake_kswapd and start it. kswapd has not been able
256 * to release enough pages so that one zone does not have
257 * zone_wake_kswapd set.
258 * - Different sets of zones (zonelist)
259 * previous did not have all zones with zone_wake_kswapd but
260 * this one has... should kswapd be woken up? it will run once.
261 * - SMP race, kswapd went to sleep slightly after it as running
262 * in 'if (waitqueue_active(...))' above.
263 * + anyway the test is very cheap to do...
265 if (waitqueue_active(&kswapd_wait
))
266 wake_up_interruptible(&kswapd_wait
);
269 * Ok, we don't have any zones that don't need some
270 * balancing.. See if we have any that aren't critical..
/* Pass 2: zones not yet critically low on memory. */
272 zone
= zonelist
->zones
;
274 zone_t
*z
= *(zone
++);
277 if (!z
->low_on_memory
) {
278 struct page
*page
= rmqueue(z
, order
);
279 if (z
->free_pages
< z
->pages_min
)
280 z
->low_on_memory
= 1;
287 * Uhhuh. All the zones have been critical, which means that
288 * we'd better do some synchronous swap-out. kswapd has not
289 * been able to cope..
/* Synchronous reclaim - but never recurse for PF_MEMALLOC callers
 * (the allocator working on behalf of reclaim itself). */
291 if (!(current
->flags
& PF_MEMALLOC
)) {
292 int gfp_mask
= zonelist
->gfp_mask
;
293 if (!try_to_free_pages(gfp_mask
)) {
294 if (!(gfp_mask
& __GFP_HIGH
))
300 * Final phase: allocate anything we can!
302 zone
= zonelist
->zones
;
306 zone_t
*z
= *(zone
++);
309 page
= rmqueue(z
, order
);
320 * Common helper functions.
/*
 * __get_free_pages() - allocate 2^order pages and return the kernel
 * virtual address of the first page (via page_address), rather than the
 * struct page pointer.  NOTE(review): the allocation-failure branch is
 * not visible in this fragment.
 */
322 unsigned long __get_free_pages(int gfp_mask
, unsigned long order
)
326 page
= alloc_pages(gfp_mask
, order
);
329 return (unsigned long) page_address(page
);
/*
 * get_zeroed_page() - allocate a single page and return its kernel
 * virtual address.  NOTE(review): the failure branch and the clear_page
 * call implied by the name are not visible in this fragmented view.
 */
332 unsigned long get_zeroed_page(int gfp_mask
)
336 page
= alloc_pages(gfp_mask
, 0);
338 void *address
= page_address(page
);
340 return (unsigned long) address
;
/*
 * __free_pages() - drop a reference on a 2^order block; only when the
 * count reaches zero is the block actually returned to the buddy lists
 * via the slow path __free_pages_ok().
 */
345 void __free_pages(struct page
*page
, unsigned long order
)
347 if (put_page_testzero(page
))
348 __free_pages_ok(page
, order
);
/*
 * free_pages() - free by kernel virtual address instead of struct page.
 * Translates addr with virt_to_page() and forwards to __free_pages();
 * invalid pages are silently ignored.  NOTE(review): only the
 * CONFIG_DISCONTIGMEM branch is visible here; the non-DISCONTIGMEM path
 * and the declaration of `fpage` are missing from this fragment.
 */
351 void free_pages(unsigned long addr
, unsigned long order
)
355 #ifdef CONFIG_DISCONTIGMEM
/* addr == 0 is the conventional "nothing to free" no-op. */
356 if (addr
== 0) return;
358 fpage
= virt_to_page(addr
);
359 if (VALID_PAGE(fpage
))
360 __free_pages(fpage
, order
);
364 * Total amount of free (allocatable) RAM:
/*
 * Sum free_pages over every zone of every node.  NOTE(review): the
 * declarations of `sum`, `i`, `zone` and the return are not visible in
 * this fragment.
 */
366 unsigned int nr_free_pages (void)
373 for (i
= 0; i
< NUMNODES
; i
++)
374 for (zone
= NODE_DATA(i
)->node_zones
; zone
< NODE_DATA(i
)->node_zones
+ MAX_NR_ZONES
; zone
++)
375 sum
+= zone
->free_pages
;
380 * Amount of free RAM allocatable as buffer memory:
/*
 * Like nr_free_pages() but counts only zones usable for buffer memory
 * (up to and including ZONE_NORMAL - note the `<=` bound), plus a third
 * of the LRU pages as an estimate of easily reclaimable memory.
 */
382 unsigned int nr_free_buffer_pages (void)
388 sum
= nr_lru_pages
/ 3;
389 for (i
= 0; i
< NUMNODES
; i
++)
390 for (zone
= NODE_DATA(i
)->node_zones
; zone
<= NODE_DATA(i
)->node_zones
+ZONE_NORMAL
; zone
++)
391 sum
+= zone
->free_pages
;
/*
 * Total free pages in every node's ZONE_HIGHMEM.  NOTE(review): the
 * surrounding CONFIG_HIGHMEM conditional and the return statement are
 * not visible in this fragment.
 */
396 unsigned int nr_free_highpages (void)
399 unsigned int pages
= 0;
401 for (i
= 0; i
< NUMNODES
; i
++)
402 pages
+= NODE_DATA(i
)->node_zones
[ZONE_HIGHMEM
].free_pages
;
408 * Show free area list (used inside shift_scroll-lock stuff)
409 * We also calculate the percentage fragmentation. We do this by counting the
410 * memory on each free list with the exception of the first item on the list.
/*
 * show_free_areas_core() - printk a per-zone dump for node `nid`:
 * overall free/highmem totals, then for each zone its watermark state
 * and a per-order breakdown of free blocks (count * block size in kB).
 * NOTE(review): fragmented view - the printk argument lists, the inner
 * list-walk loop, and several declarations (`type`, `order`) are partly
 * missing; comments describe only the visible structure.
 */
412 void show_free_areas_core(int nid
)
417 printk("Free pages: %6dkB (%6dkB HighMem)\n",
418 nr_free_pages() << (PAGE_SHIFT
-10),
419 nr_free_highpages() << (PAGE_SHIFT
-10));
421 printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
428 for (type
= 0; type
< MAX_NR_ZONES
; type
++) {
429 struct list_head
*head
, *curr
;
430 zone_t
*zone
= NODE_DATA(nid
)->node_zones
+ type
;
431 unsigned long nr
, total
, flags
;
/* Watermark state rendered from free_pages vs pages_min/low/high. */
433 printk(" %c%d%d %s: ",
434 (zone
->free_pages
> zone
->pages_low
435 ? (zone
->free_pages
> zone
->pages_high
438 : (zone
->free_pages
> zone
->pages_min
441 zone
->zone_wake_kswapd
, zone
->low_on_memory
,
/* Walk each order's free list under the zone lock to count blocks. */
446 spin_lock_irqsave(&zone
->lock
, flags
);
447 for (order
= 0; order
< MAX_ORDER
; order
++) {
448 head
= &(zone
->free_area
+ order
)->free_list
;
452 curr
= memlist_next(curr
);
457 total
+= nr
* (1 << order
);
458 printk("%lu*%lukB ", nr
,
459 (PAGE_SIZE
>>10) << order
);
461 spin_unlock_irqrestore(&zone
->lock
, flags
);
463 printk("= %lukB)\n", total
* (PAGE_SIZE
>>10));
466 #ifdef SWAP_CACHE_INFO
467 show_swap_cache_info();
/* Convenience wrapper: dump the free areas of node 0. */
471 void show_free_areas(void)
473 show_free_areas_core(0);
477 * Builds allocation fallback zone lists.
/*
 * build_zonelists() - for each gfp index, construct the ordered,
 * NULL-terminated fallback list of zones: HighMem (when requested and
 * configured) falls back to Normal, which falls back to DMA.
 * NOTE(review): fragmented view - the declarations of `i`, `j`, `zone`,
 * the zone-size guards around each `zones[j++] = zone`, and the
 * #else/#endif of the CONFIG_HIGHMEM conditional are not visible here.
 */
479 static inline void build_zonelists(pg_data_t
*pgdat
)
483 for (i
= 0; i
< NR_GFPINDEX
; i
++) {
484 zonelist_t
*zonelist
;
487 zonelist
= pgdat
->node_zonelists
+ i
;
488 memset(zonelist
, 0, sizeof(*zonelist
));
/* The gfp index doubles as the stored gfp_mask for this list. */
490 zonelist
->gfp_mask
= i
;
493 if (i
& __GFP_HIGHMEM
)
505 zone
= pgdat
->node_zones
+ ZONE_HIGHMEM
;
507 #ifndef CONFIG_HIGHMEM
510 zonelist
->zones
[j
++] = zone
;
513 zone
= pgdat
->node_zones
+ ZONE_NORMAL
;
515 zonelist
->zones
[j
++] = zone
;
517 zone
= pgdat
->node_zones
+ ZONE_DMA
;
519 zonelist
->zones
[j
++] = zone
;
/* Terminate the fallback list; __alloc_pages stops at NULL. */
521 zonelist
->zones
[j
++] = NULL
;
/* Round x up to the next multiple of sizeof(long) - used to size the
 * per-order buddy bitmaps below. */
525 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
528 * Set up the zone data structures:
529 * - mark all pages reserved
530 * - mark all memory queues empty
531 * - clear the memory bitmaps
/*
 * free_area_init_core() - boot-time initialisation of one node's memory:
 * allocates and aligns the node's mem_map from bootmem, initialises every
 * struct page, sets each zone's watermarks and buddy free-area bitmaps,
 * and finally builds the node's fallback zonelists.
 * NOTE(review): fragmented view - many original lines (declarations of
 * `i`, `j`, `mask`, the totalpages accumulation, PageReserved marking,
 * freepages.min adjustment, loop braces) are missing; comments describe
 * only the visible statements.
 */
533 void __init
free_area_init_core(int nid
, pg_data_t
*pgdat
, struct page
**gmap
,
534 unsigned long *zones_size
, unsigned long zone_start_paddr
,
535 unsigned long *zholes_size
)
537 struct page
*p
, *lmem_map
;
539 unsigned long map_size
;
540 unsigned long totalpages
, offset
, realtotalpages
;
541 unsigned int cumulative
= 0;
/* Prepend this node onto the global pgdat list. */
543 pgdat
->node_next
= pgdat_list
;
547 for (i
= 0; i
< MAX_NR_ZONES
; i
++) {
548 unsigned long size
= zones_size
[i
];
/* realtotalpages = total pages minus holes, per zone. */
551 realtotalpages
= totalpages
;
553 for (i
= 0; i
< MAX_NR_ZONES
; i
++)
554 realtotalpages
-= zholes_size
[i
];
556 printk("On node %d totalpages: %lu\n", nid
, realtotalpages
);
559 * Select nr of pages we try to keep free for important stuff
560 * with a minimum of 10 pages and a maximum of 256 pages, so
561 * that we don't waste too much memory on large systems.
562 * This is fairly arbitrary, but based on some behaviour
565 i
= realtotalpages
>> 7;
571 freepages
.low
+= i
* 2;
572 freepages
.high
+= i
* 3;
573 memlist_init(&lru_cache
);
576 * Some architectures (with lots of mem and discontinous memory
577 * maps) have to search for a good mem_map area:
578 * For discontigmem, the conceptual mem map array starts from
579 * PAGE_OFFSET, we need to align the actual array onto a mem map
580 * boundary, so that MAP_NR works.
582 map_size
= (totalpages
+ 1)*sizeof(struct page
);
583 lmem_map
= (struct page
*) alloc_bootmem_node(nid
, map_size
);
/* Re-align the bootmem allocation so MAP_NR() arithmetic works. */
584 lmem_map
= (struct page
*)(PAGE_OFFSET
+
585 MAP_ALIGN((unsigned long)lmem_map
- PAGE_OFFSET
));
586 *gmap
= pgdat
->node_mem_map
= lmem_map
;
587 pgdat
->node_size
= totalpages
;
588 pgdat
->node_start_paddr
= zone_start_paddr
;
589 pgdat
->node_start_mapnr
= (lmem_map
- mem_map
);
592 * Initially all pages are reserved - free ones are freed
593 * up by free_all_bootmem() once the early boot process is
596 for (p
= lmem_map
; p
< lmem_map
+ totalpages
; p
++) {
597 set_page_count(p
, 0);
599 init_waitqueue_head(&p
->wait
);
600 memlist_init(&p
->list
);
/* Offset of this node's mem_map within the global mem_map. */
603 offset
= lmem_map
- mem_map
;
604 for (j
= 0; j
< MAX_NR_ZONES
; j
++) {
605 zone_t
*zone
= pgdat
->node_zones
+ j
;
607 unsigned long size
, realsize
;
609 realsize
= size
= zones_size
[j
];
611 realsize
-= zholes_size
[j
];
613 printk("zone(%lu): %lu pages.\n", j
, size
);
615 zone
->name
= zone_names
[j
];
616 zone
->lock
= SPIN_LOCK_UNLOCKED
;
617 zone
->zone_pgdat
= pgdat
;
618 zone
->free_pages
= 0;
622 zone
->offset
= offset
;
/* Watermarks: pages_min derived from zone size / balance ratio,
 * clamped to [zone_balance_min, zone_balance_max]; low/high are
 * fixed 2x/3x multiples of min. */
624 mask
= (realsize
/ zone_balance_ratio
[j
]);
625 if (mask
< zone_balance_min
[j
])
626 mask
= zone_balance_min
[j
];
627 else if (mask
> zone_balance_max
[j
])
628 mask
= zone_balance_max
[j
];
629 zone
->pages_min
= mask
;
630 zone
->pages_low
= mask
*2;
631 zone
->pages_high
= mask
*3;
632 zone
->low_on_memory
= 0;
633 zone
->zone_wake_kswapd
= 0;
634 zone
->zone_mem_map
= mem_map
+ offset
;
635 zone
->zone_start_mapnr
= offset
;
636 zone
->zone_start_paddr
= zone_start_paddr
;
638 for (i
= 0; i
< size
; i
++) {
639 struct page
*page
= mem_map
+ offset
+ i
;
/* Lowmem pages get a permanent kernel virtual address now;
 * highmem pages are mapped on demand later. */
641 if (j
!= ZONE_HIGHMEM
) {
642 page
->virtual = __va(zone_start_paddr
);
643 zone_start_paddr
+= PAGE_SIZE
;
/* Per-order free lists plus their buddy bitmaps (one bit per pair
 * of buddies, hence size >> i pages then /8 for bytes). */
649 for (i
= 0; i
< MAX_ORDER
; i
++) {
650 unsigned long bitmap_size
;
652 memlist_init(&zone
->free_area
[i
].free_list
);
654 size
= (size
+ ~mask
) & mask
;
655 bitmap_size
= size
>> i
;
656 bitmap_size
= (bitmap_size
+ 7) >> 3;
657 bitmap_size
= LONG_ALIGN(bitmap_size
);
658 zone
->free_area
[i
].map
=
659 (unsigned int *) alloc_bootmem_node(nid
, bitmap_size
);
662 build_zonelists(pgdat
);
/* Single-node (UMA) entry point: initialise node 0 with no holes and a
 * zero starting physical address. */
665 void __init
free_area_init(unsigned long *zones_size
)
667 free_area_init_core(0, NODE_DATA(0), &mem_map
, zones_size
, 0, 0);
/*
 * setup_mem_frac() - parse the "memfrac=" boot option: a comma-separated
 * list of per-zone balance ratios written into zone_balance_ratio[], then
 * echoed via printk.  NOTE(review): the declaration of `j`, the
 * trailing newline printk and the return value are not visible in this
 * fragment.
 */
670 static int __init
setup_mem_frac(char *str
)
674 while (get_option(&str
, &zone_balance_ratio
[j
++]) == 2);
675 printk("setup_mem_frac: ");
676 for (j
= 0; j
< MAX_NR_ZONES
; j
++) printk("%d ", zone_balance_ratio
[j
]);
/* Register the boot-command-line handler. */
681 __setup("memfrac=", setup_mem_frac
);