/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
/* Use NUMNODES instead of numnodes for better code inside kernel APIs */
#ifndef CONFIG_DISCONTIGMEM
#define NUMNODES 1
#else
#define NUMNODES numnodes
#endif
pg_data_t *pgdat_list;

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10, 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255, 255, 255, };
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */
#define memlist_init(x) INIT_LIST_HEAD(x)
#define memlist_add_head list_add
#define memlist_add_tail list_add_tail
#define memlist_del list_del
#define memlist_entry list_entry
#define memlist_next(x) ((x)->next)
#define memlist_prev(x) ((x)->prev)
/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
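/*
 * Reading the check above: a page is "bad" for a zone if its page->zone
 * pointer disagrees with the zone it is being handled under, or if its
 * mem_map index lies outside the zone's [offset, offset+size) window.
 * It is only a sanity check and does not affect the allocator's logic.
 */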
/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
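/*
 * Illustration of the hint (two's complement arithmetic): below,
 * mask = (~0UL) << order, so for order = 2 mask is ...11111100 and
 * -mask = 1 + ~mask = 4 = 1 << order.  Hence "free_pages -= mask"
 * adds 2^order pages to the free count, and "page_idx ^ -mask" flips
 * bit 'order' of the index, which is exactly the index of the buddy
 * block at that order.
 */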
static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
static void __free_pages_ok (struct page *page, unsigned long order)
{
        unsigned long index, page_idx, mask, flags;
        free_area_t *area;
        struct page *base;
        zone_t *zone;
        /*
         * Subtle. We do not want to test this in the inlined part of
         * __free_page() - it's a rare condition and just increases
         * cache footprint unnecessarily. So we do an 'incorrect'
         * decrement on page->count for reserved pages, but this part
         * makes sure that reserved pages are never actually freed.
         */
        if (PageReserved(page))
                return;
        if (page-mem_map >= max_mapnr)

        if (PageSwapCache(page))

        if (PageDecrAfter(page))
        zone = page->zone;

        mask = (~0UL) << order;
        base = mem_map + zone->offset;
        page_idx = page - base;
        if (page_idx & ~mask)
                BUG();
        index = page_idx >> (1 + order);

        area = zone->free_area + order;

        spin_lock_irqsave(&zone->lock, flags);

        zone->free_pages -= mask;
        while (mask + (1 << (MAX_ORDER-1))) {
                struct page *buddy1, *buddy2;

                if (area >= zone->free_area + MAX_ORDER)
                        BUG();
                if (!test_and_change_bit(index, area->map))
                        /*
                         * the buddy page is still allocated.
                         */
                        break;
                /*
                 * Move the buddy up one level.
                 */
                buddy1 = base + (page_idx ^ -mask);
                buddy2 = base + page_idx;
                if (BAD_RANGE(zone,buddy1))
                        BUG();
                if (BAD_RANGE(zone,buddy2))
                        BUG();

                memlist_del(&buddy1->list);
                mask <<= 1;
                area++;
                index >>= 1;
                page_idx &= mask;
        }
        memlist_add_head(&(base + page_idx)->list, &area->free_list);

        spin_unlock_irqrestore(&zone->lock, flags);

        if (zone->free_pages > zone->pages_high) {
                zone->zone_wake_kswapd = 0;
                zone->low_on_memory = 0;
        }
}
#define MARK_USED(index, order, area) \
        change_bit((index) >> (1+(order)), (area)->map)
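/*
 * A note on the free_area bitmaps used by MARK_USED and by
 * test_and_change_bit() above: bit (index >> (1+order)) covers one
 * *pair* of order-'order' buddies.  The bit is toggled whenever either
 * buddy changes state, so it reads 1 exactly when one of the two
 * buddies is free - which is what __free_pages_ok tests to decide
 * whether coalescing can continue.
 */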
static inline struct page * expand (zone_t *zone, struct page *page,
         unsigned long index, int low, int high, free_area_t * area)
{
        unsigned long size = 1 << high;

        while (high > low) {
                if (BAD_RANGE(zone,page))
                        BUG();
                area--;
                high--;
                size >>= 1;
                memlist_add_head(&(page)->list, &(area)->free_list);
                MARK_USED(index, high, area);
                index += size;
                page += size;
        }
        if (BAD_RANGE(zone,page))
                BUG();
        return page;
}
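/*
 * expand() takes a block of order 'high' (curr_order at the caller) and
 * repeatedly halves it until it is down to the requested order 'low':
 * the front half of each split goes back on the next-lower order's free
 * list (with its bitmap bit toggled via MARK_USED), and the allocation
 * proceeds with the back half, which is finally returned to rmqueue().
 */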
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
static struct page * rmqueue(zone_t *zone, unsigned long order)
{
        free_area_t * area = zone->free_area + order;
        unsigned long curr_order = order;
        struct list_head *head, *curr;
        unsigned long flags;
        struct page *page;

        spin_lock_irqsave(&zone->lock, flags);
        do {
                head = &area->free_list;
                curr = memlist_next(head);

                if (curr != head) {
                        unsigned int index;

                        page = memlist_entry(curr, struct page, list);
                        if (BAD_RANGE(zone,page))
                                BUG();
                        memlist_del(curr);
                        index = (page - mem_map) - zone->offset;
                        MARK_USED(index, curr_order, area);
                        zone->free_pages -= 1 << order;

                        page = expand(zone, page, index, order, curr_order, area);
                        spin_unlock_irqrestore(&zone->lock, flags);

                        set_page_count(page, 1);
                        if (BAD_RANGE(zone,page))
                                BUG();
                        return page;
                }
                curr_order++;
                area++;
        } while (curr_order < MAX_ORDER);
        spin_unlock_irqrestore(&zone->lock, flags);

        return NULL;
}
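/*
 * rmqueue() walks the free lists from the requested order upwards,
 * grabs the first block it finds under the zone spinlock, and hands
 * anything beyond the requested size back via expand().  It returns
 * NULL only when no block of sufficient order is free in this zone.
 */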
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
        zone_t **zone;
        extern wait_queue_head_t kswapd_wait;

        /*
         * (If anyone calls gfp from interrupts nonatomically then it
         * will sooner or later be tripped up by a schedule().)
         *
         * We are falling back to lower-level zones if allocation
         * in a higher zone fails.
         */
        zone = zonelist->zones;
        for (;;) {
                zone_t *z = *(zone++);
                if (!z)
                        break;

                /* Are we supposed to free memory? Don't make it worse.. */
                if (!z->zone_wake_kswapd) {
                        struct page *page = rmqueue(z, order);
                        if (z->free_pages < z->pages_low) {
                                z->zone_wake_kswapd = 1;
                                if (waitqueue_active(&kswapd_wait))
                                        wake_up_interruptible(&kswapd_wait);
                        }
                        if (page)
                                return page;
                }
        }

        /* Three possibilities to get here
         * - Previous alloc_pages resulted in last zone set to have
         *   zone_wake_kswapd and start it. kswapd has not been able
         *   to release enough pages so that one zone does not have
         *   zone_wake_kswapd set.
         * - Different sets of zones (zonelist): the previous one did not
         *   have all zones with zone_wake_kswapd, but this one has...
         *   should kswapd be woken up? it will run once.
         * - SMP race, kswapd went to sleep slightly after it was running
         *   in 'if (waitqueue_active(...))' above.
         * + anyway the test is very cheap to do...
         */
        if (waitqueue_active(&kswapd_wait))
                wake_up_interruptible(&kswapd_wait);

        /*
         * Ok, we don't have any zones that don't need some
         * balancing.. See if we have any that aren't critical..
         */
        zone = zonelist->zones;
        for (;;) {
                zone_t *z = *(zone++);
                if (!z)
                        break;

                if (!z->low_on_memory) {
                        struct page *page = rmqueue(z, order);
                        if (z->free_pages < z->pages_min)
                                z->low_on_memory = 1;
                        if (page)
                                return page;
                }
        }

        /*
         * Uhhuh. All the zones have been critical, which means that
         * we'd better do some synchronous swap-out. kswapd has not
         * been able to cope..
         */
        if (!(current->flags & PF_MEMALLOC)) {
                int gfp_mask = zonelist->gfp_mask;
                if (!try_to_free_pages(gfp_mask)) {
                        if (!(gfp_mask & __GFP_HIGH))
                                return NULL;
                }
        }

        /*
         * Final phase: allocate anything we can!
         */
        zone = zonelist->zones;
        for (;;) {
                zone_t *z = *(zone++);
                struct page * page;

                if (!z)
                        break;
                page = rmqueue(z, order);
                if (page)
                        return page;
        }
        return NULL;
}
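/*
 * Summary of the fallback strategy above: the zonelist is scanned up to
 * three times.  Pass 1 only uses zones that are not already asking
 * kswapd for help, and sets zone_wake_kswapd (and wakes kswapd) once a
 * zone drops below pages_low.  Pass 2 accepts zones that need balancing
 * but are not yet critical, marking low_on_memory once a zone drops
 * below pages_min.  If everything is critical, the caller synchronously
 * frees memory with try_to_free_pages() (unless it is itself the memory
 * freer, PF_MEMALLOC), and a final pass then takes whatever it can get.
 */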
/*
 * Common helper functions.
 */
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
        struct page * page;

        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        return page_address(page);
}
unsigned long get_zeroed_page(int gfp_mask)
{
        struct page * page;

        page = alloc_pages(gfp_mask, 0);
        if (page) {
                unsigned long address = page_address(page);
                clear_page((void *)address);
                return address;
        }
        return 0;
}
void __free_pages(struct page *page, unsigned long order)
{
        if (put_page_testzero(page))
                __free_pages_ok(page, order);
}
void free_pages(unsigned long addr, unsigned long order)
{
        unsigned long map_nr;

#ifdef CONFIG_DISCONTIGMEM
        if (addr == 0) return;
#endif
        map_nr = MAP_NR(addr);
        if (map_nr < max_mapnr)
                __free_pages(mem_map + map_nr, order);
}
/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
        unsigned int sum;
        zone_t *zone;
        int i;

        sum = 0;
        for (i = 0; i < NUMNODES; i++)
                for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
                        sum += zone->free_pages;
        return sum;
}
/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
        unsigned int sum;
        zone_t *zone;
        int i;

        sum = 0;
        for (i = 0; i < NUMNODES; i++)
                for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
                        sum += zone->free_pages;
        return sum;
}
unsigned int nr_free_highpages (void)
{
        int i;
        unsigned int pages = 0;

        for (i = 0; i < NUMNODES; i++)
                pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages;
        return pages;
}
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(int nid)
{
        unsigned long order;
        unsigned type;

        printk("Free pages: %6dkB (%6dkB HighMem)\n",
                nr_free_pages() << (PAGE_SHIFT-10),
                nr_free_highpages() << (PAGE_SHIFT-10));
        printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
                nr_free_pages(),
                nr_lru_pages,
                freepages.min,
                freepages.low,
                freepages.high);
        for (type = 0; type < MAX_NR_ZONES; type++) {
                struct list_head *head, *curr;
                zone_t *zone = NODE_DATA(nid)->node_zones + type;
                unsigned long nr, total, flags;

                printk(" %s: ", zone->name);

                total = 0;
                spin_lock_irqsave(&zone->lock, flags);
                for (order = 0; order < MAX_ORDER; order++) {
                        head = &(zone->free_area + order)->free_list;
                        curr = head;
                        nr = 0;
                        for (;;) {
                                curr = memlist_next(curr);
                                if (curr == head)
                                        break;
                                nr++;
                        }
                        total += nr * (1 << order);
                        printk("%lu*%lukB ", nr,
                                (PAGE_SIZE>>10) << order);
                }
                spin_unlock_irqrestore(&zone->lock, flags);

                printk("= %lukB)\n", total * (PAGE_SIZE>>10));
        }

#ifdef SWAP_CACHE_INFO
        show_swap_cache_info();
#endif
}
void show_free_areas(void)
{
        show_free_areas_core(0);
}
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
        int i, j, k;

        for (i = 0; i < NR_GFPINDEX; i++) {
                zonelist_t *zonelist;
                zone_t *zone;

                zonelist = pgdat->node_zonelists + i;
                memset(zonelist, 0, sizeof(*zonelist));

                zonelist->gfp_mask = i;
                j = 0;
                k = ZONE_NORMAL;
                if (i & __GFP_HIGHMEM)
                        k = ZONE_HIGHMEM;
                if (i & __GFP_DMA)
                        k = ZONE_DMA;

                switch (k) {
                        default:
                                BUG();
                        /*
                         * fallthrough:
                         */
                        case ZONE_HIGHMEM:
                                zone = pgdat->node_zones + ZONE_HIGHMEM;
                                if (zone->size) {
#ifndef CONFIG_HIGHMEM
                                        BUG();
#endif
                                        zonelist->zones[j++] = zone;
                                }
                        case ZONE_NORMAL:
                                zone = pgdat->node_zones + ZONE_NORMAL;
                                if (zone->size)
                                        zonelist->zones[j++] = zone;
                        case ZONE_DMA:
                                zone = pgdat->node_zones + ZONE_DMA;
                                if (zone->size)
                                        zonelist->zones[j++] = zone;
                }
                zonelist->zones[j++] = NULL;
        }
}
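/*
 * The zonelists built above give, for every gfp_mask value, the zones
 * to try in order of preference: falling through from the highest zone
 * the request allows (HIGHMEM, NORMAL or DMA) down to ZONE_DMA, with a
 * NULL terminator.  __alloc_pages() simply walks this array, which is
 * how the "fall back to lower-level zones" behaviour is implemented.
 */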
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
        unsigned long *zones_size, unsigned long zone_start_paddr,
        unsigned long *zholes_size)
{
        struct page *p, *lmem_map;
        unsigned long i, j;
        unsigned long map_size;
        unsigned long totalpages, offset, realtotalpages;
        unsigned int cumulative = 0;

        pgdat->node_next = pgdat_list;
        pgdat_list = pgdat;

        totalpages = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long size = zones_size[i];
                totalpages += size;
        }
        realtotalpages = totalpages;
        if (zholes_size)
                for (i = 0; i < MAX_NR_ZONES; i++)
                        realtotalpages -= zholes_size[i];

        printk("On node %d totalpages: %lu\n", nid, realtotalpages);
        /*
         * Select nr of pages we try to keep free for important stuff
         * with a minimum of 10 pages and a maximum of 256 pages, so
         * that we don't waste too much memory on large systems.
         * This is fairly arbitrary, but based on some behaviour
         * analysis.
         */
        i = realtotalpages >> 7;
        if (i < 10)
                i = 10;
        if (i > 256)
                i = 256;
        freepages.min += i;
        freepages.low += i * 2;
        freepages.high += i * 3;
        memlist_init(&lru_cache);
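
        /*
         * Worked example of the calculation above (illustrative only):
         * on a node with realtotalpages = 32768 (128MB of 4K pages),
         * i = 32768 >> 7 = 256, which is not clamped, so freepages.min
         * grows by 256 pages (1MB), freepages.low by 512 pages (2MB)
         * and freepages.high by 768 pages (3MB).
         */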
        /*
         * Some architectures (with lots of mem and discontiguous memory
         * maps) have to search for a good mem_map area:
         * For discontigmem, the conceptual mem map array starts from
         * PAGE_OFFSET, we need to align the actual array onto a mem map
         * boundary, so that MAP_NR works.
         */
        map_size = (totalpages + 1)*sizeof(struct page);
        lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
        lmem_map = (struct page *)(PAGE_OFFSET +
                MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
        *gmap = pgdat->node_mem_map = lmem_map;
        pgdat->node_size = totalpages;
        pgdat->node_start_paddr = zone_start_paddr;
        pgdat->node_start_mapnr = (lmem_map - mem_map);
        /*
         * Initially all pages are reserved - free ones are freed
         * up by free_all_bootmem() once the early boot process is
         * done.
         */
        for (p = lmem_map; p < lmem_map + totalpages; p++) {
                set_page_count(p, 0);
                SetPageReserved(p);
                init_waitqueue_head(&p->wait);
                memlist_init(&p->list);
        }
        offset = lmem_map - mem_map;
        for (j = 0; j < MAX_NR_ZONES; j++) {
                zone_t *zone = pgdat->node_zones + j;
                unsigned long mask;
                unsigned long size, realsize;

                realsize = size = zones_size[j];
                if (zholes_size)
                        realsize -= zholes_size[j];

                printk("zone(%lu): %lu pages.\n", j, size);
                zone->size = size;
                zone->name = zone_names[j];
                zone->lock = SPIN_LOCK_UNLOCKED;
                zone->zone_pgdat = pgdat;
                zone->free_pages = 0;
                if (!size)
                        continue;

                zone->offset = offset;
                cumulative += size;
                mask = (realsize / zone_balance_ratio[j]);
                if (mask < zone_balance_min[j])
                        mask = zone_balance_min[j];
                else if (mask > zone_balance_max[j])
                        mask = zone_balance_max[j];
                zone->pages_min = mask;
                zone->pages_low = mask*2;
                zone->pages_high = mask*3;
                zone->low_on_memory = 0;
                zone->zone_wake_kswapd = 0;
                zone->zone_mem_map = mem_map + offset;
                zone->zone_start_mapnr = offset;
                zone->zone_start_paddr = zone_start_paddr;
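
                /*
                 * Worked example of the per-zone watermarks set above
                 * (illustrative only): a zone with realsize = 16384
                 * pages (64MB) and zone_balance_ratio = 128 gets
                 * mask = 128, i.e. pages_min = 128, pages_low = 256,
                 * pages_high = 384.  The clamps keep mask within
                 * [zone_balance_min, zone_balance_max], i.e. [10, 255]
                 * by default.
                 */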
                for (i = 0; i < size; i++) {
                        struct page *page = mem_map + offset + i;
                        page->zone = zone;
                        if (j != ZONE_HIGHMEM) {
                                page->virtual = (unsigned long)(__va(zone_start_paddr));
                                zone_start_paddr += PAGE_SIZE;
                        }
                }
                offset += size;
                mask = -1;
                for (i = 0; i < MAX_ORDER; i++) {
                        unsigned long bitmap_size;

                        memlist_init(&zone->free_area[i].free_list);
                        mask += mask;
                        size = (size + ~mask) & mask;
                        bitmap_size = size >> i;
                        bitmap_size = (bitmap_size + 7) >> 3;
                        bitmap_size = LONG_ALIGN(bitmap_size);
                        zone->free_area[i].map =
                          (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
                }
        }
        build_zonelists(pgdat);
}
void __init free_area_init(unsigned long *zones_size)
{
        free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0);
}
static int __init setup_mem_frac(char *str)
{
        int j = 0;

        while (get_option(&str, &zone_balance_ratio[j++]) == 2);
        printk("setup_mem_frac: ");
        for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
        printk("\n");
        return 1;
}

__setup("memfrac=", setup_mem_frac);