/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
/* Use NUMNODES instead of numnodes for better code inside kernel APIs */
#ifndef CONFIG_DISCONTIGMEM
#define NUMNODES 1
#else
#define NUMNODES numnodes
#endif
int nr_swap_pages = 0;
int nr_lru_pages;
pg_data_t *pgdat_list = (pg_data_t *)0;
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10, 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255, 255, 255, };
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */
#define memlist_init(x) INIT_LIST_HEAD(x)
#define memlist_add_head list_add
#define memlist_add_tail list_add_tail
#define memlist_del list_del
#define memlist_entry list_entry
#define memlist_next(x) ((x)->next)
#define memlist_prev(x) ((x)->prev)
/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
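/*
 * A page is "in range" for a zone when it points back at that zone and its
 * mem_map index lies inside [zone->offset, zone->offset + zone->size);
 * BAD_RANGE() is the negation of that, used purely as a sanity check below.
 */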
static inline unsigned long classfree(zone_t *zone)
{
	unsigned long free = 0;
	zone_t *z = zone->zone_pgdat->node_zones;

	while (z != zone) {
		free += z->free_pages;
		z++;
	}
	free += zone->free_pages;
	return free;
}
/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
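/*
 * With mask = (~0UL) << order, the hint above means -mask == 1UL << order,
 * the size of an order-'order' block.  So 'page_idx ^ -mask' flips exactly
 * the bit that distinguishes a block from its buddy, yielding the buddy's
 * index, and the coalescing loop below terminates once
 * mask == -(1UL << (MAX_ORDER-1)), i.e. when the largest order is reached.
 */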
void __free_pages_ok (struct page *page, unsigned long order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;

	/*
	 * Subtle. We do not want to test this in the inlined part of
	 * __free_page() - it's a rare condition and just increases
	 * cache footprint unnecessarily. So we do an 'incorrect'
	 * decrement on page->count for reserved pages, but this part
	 * makes sure it's nevertheless handled correctly.
	 */
	if (PageReserved(page))
		return;

	if (page - mem_map >= max_mapnr)
		BUG();
	if (PageSwapCache(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageDecrAfter(page))
		BUG();

	zone = page->zone;

	mask = (~0UL) << order;
	base = mem_map + zone->offset;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		memlist_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	memlist_add_head(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);

	if (zone->free_pages > zone->pages_high) {
		zone->zone_wake_kswapd = 0;
		zone->low_on_memory = 0;
	}
}
#define MARK_USED(index, order, area) \
	change_bit((index) >> (1+(order)), (area)->map)
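/*
 * Each free_area bitmap keeps one bit per *pair* of buddies, hence the
 * '>> (1+(order))' indexing.  The bit is toggled on every allocation and
 * every free at that order, so a set bit means exactly one buddy of the
 * pair is free.  That is why a zero result from test_and_change_bit() in
 * __free_pages_ok() above means the buddy is still allocated and
 * coalescing has to stop.
 */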
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		memlist_add_head(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}
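/*
 * rmqueue() takes the first block off the lowest-order non-empty free list
 * at or above the requested order; if the block is larger than asked for,
 * expand() above splits it, handing the unneeded halves back to the
 * lower-order free lists and returning a single block of the right size.
 */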
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
static struct page * rmqueue(zone_t *zone, unsigned long order)
{
	free_area_t * area = zone->free_area + order;
	unsigned long curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = memlist_next(head);

		if (curr != head) {
			unsigned int index;

			page = memlist_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			memlist_del(curr);
			index = (page - mem_map) - zone->offset;
			MARK_USED(index, curr_order, area);
			zone->free_pages -= 1 << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}
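/*
 * Walk a zonelist and try to get its zones back above their watermarks:
 * every zone found at or below pages_low has kswapd woken up for it, and
 * once a zone has also dropped below pages_min we call try_to_free_pages()
 * on it directly.  The return value tells __alloc_pages() whether retrying
 * the allocation is worthwhile.
 */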
static int zone_balance_memory(zonelist_t *zonelist)
{
	int tried = 0, freed = 0;
	zone_t **zone;
	int gfp_mask = zonelist->gfp_mask;
	extern wait_queue_head_t kswapd_wait;

	zone = zonelist->zones;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		if (z->free_pages > z->pages_low)
			continue;

		z->zone_wake_kswapd = 1;
		wake_up_interruptible(&kswapd_wait);

		/* Are we reaching the critical stage? */
		if (!z->low_on_memory) {
			/* Not yet critical, so let kswapd handle it.. */
			if (z->free_pages > z->pages_min)
				continue;
			z->low_on_memory = 1;
		}
		/*
		 * In the atomic allocation case we only 'kick' the
		 * state machine, but do not try to free pages
		 * ourselves.
		 */
		tried = 1;
		freed |= try_to_free_pages(gfp_mask, z);
	}
	if (tried && !freed) {
		if (!(gfp_mask & __GFP_HIGH))
			return 0;
	}
	return 1;
}
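/*
 * Allocation below is done in two passes: a first pass that only takes
 * pages from zones that are comfortably above pages_low and not already
 * waiting on kswapd, and, if that finds nothing, a second pass after
 * zone_balance_memory() that accepts whatever a zone can still provide.
 * Recursive PF_MEMALLOC callers skip straight to the liberal second pass.
 */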
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
	zone_t **zone = zonelist->zones;
	int gfp_mask = zonelist->gfp_mask;
	static int low_on_memory;

	/*
	 * If this is a recursive call, we'd better
	 * do our best to just allocate things without
	 * further thought.
	 */
	if (current->flags & PF_MEMALLOC)
		goto allocate_ok;

	/* If we're a memory hog, unmap some pages */
	if (current->hog && low_on_memory && (gfp_mask & __GFP_WAIT)) {
		// swap_out(6, gfp_mask);
		// shm_swap(6, gfp_mask, (zone_t *)(zone));
		try_to_free_pages(gfp_mask, (zone_t *)(zone));
	}

	/*
	 * (If anyone calls gfp from interrupts nonatomically then it
	 * will sooner or later be tripped up by a schedule().)
	 *
	 * We are falling back to lower-level zones if allocation
	 * in a higher zone fails.
	 */
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		/* Are we supposed to free memory? Don't make it worse.. */
		if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) {
			struct page *page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/*
	 * Ok, no obvious zones were available, start
	 * balancing things a bit..
	 */
	if (zone_balance_memory(zonelist)) {
		zone = zonelist->zones;
allocate_ok:
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;
			if (z->free_pages) {
				struct page *page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
	}

	/*
	 * The main chunk of the balancing code is in this offline branch:
	 */
	return NULL;
}
/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum;
	zone_t *zone;
	int i;

	sum = 0;
	for (i = 0; i < NUMNODES; i++)
		for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->free_pages;
	return sum;
}
/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
	unsigned int sum;
	zone_t *zone;
	int i;

	sum = nr_lru_pages;
	for (i = 0; i < NUMNODES; i++)
		for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
			sum += zone->free_pages;
	return sum;
}
unsigned int nr_free_highpages (void)
{
	int i;
	unsigned int pages = 0;

	for (i = 0; i < NUMNODES; i++)
		pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages;
	return pages;
}
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(int nid)
{
	unsigned long order;
	unsigned type;

	printk("Free pages: %6dkB (%6dkB HighMem)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		nr_free_highpages() << (PAGE_SHIFT-10));

	printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
		nr_free_pages(),
		nr_lru_pages,
		freepages.min,
		freepages.low,
		freepages.high);

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = NODE_DATA(nid)->node_zones + type;
		unsigned long nr, total, flags;

		printk(" %s: ", zone->name);

		total = 0;
		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			head = &(zone->free_area + order)->free_list;
			curr = head;
			nr = 0;
			for (;;) {
				curr = memlist_next(curr);
				if (curr == head)
					break;
				nr++;
			}
			total += nr * (1 << order);
			printk("%lu*%lukB ", nr,
				(PAGE_SIZE>>10) << order);
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		printk("= %lukB)\n", total * (PAGE_SIZE>>10));
	}

#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}
void show_free_areas(void)
{
	show_free_areas_core(0);
}
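/*
 * There is one zonelist per possible gfp_mask value.  Each list names the
 * zones to try in strictly falling order: a __GFP_HIGHMEM request falls
 * back HighMem -> Normal -> DMA, a normal request Normal -> DMA, and a
 * __GFP_DMA request is confined to the DMA zone.  The array is terminated
 * by a NULL entry, which is what the '*(zone++)' walkers above stop on.
 */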
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i < NR_GFPINDEX; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		zonelist->gfp_mask = i;
		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size)
{
	struct page *p, *lmem_map;
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	unsigned int cumulative = 0;
	pgdat->node_next = pgdat_list;
	pgdat_list = pgdat;

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);
	/*
	 * Select nr of pages we try to keep free for important stuff
	 * with a minimum of 10 pages and a maximum of 256 pages, so
	 * that we don't waste too much memory on large systems.
	 * This is fairly arbitrary, but based on some behaviour
	 * analysis.
	 */
	i = realtotalpages >> 7;
	if (i < 10)
		i = 10;
	if (i > 256)
		i = 256;
	freepages.min += i;
	freepages.low += i * 2;
	freepages.high += i * 3;
	memlist_init(&lru_cache);
	/*
	 * Some architectures (with lots of mem and discontiguous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET, we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
	lmem_map = (struct page *)(PAGE_OFFSET +
		MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	/*
	 * Initially all pages are reserved - free ones are freed
	 * up by free_all_bootmem() once the early boot process is
	 * done.
	 */
	for (p = lmem_map; p < lmem_map + totalpages; p++) {
		set_page_count(p, 0);
		SetPageReserved(p);
		init_waitqueue_head(&p->wait);
		memlist_init(&p->list);
	}
	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;

		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		if (!size)
			continue;

		zone->offset = offset;
		cumulative += size;
		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;
		zone->low_on_memory = 0;
		zone->zone_wake_kswapd = 0;
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;
		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			page->zone = zone;
			if (j != ZONE_HIGHMEM) {
				page->virtual = (unsigned long)(__va(zone_start_paddr));
				zone_start_paddr += PAGE_SIZE;
			}
		}
		offset += size;
		mask = -1;
		for (i = 0; i < MAX_ORDER; i++) {
			unsigned long bitmap_size;

			memlist_init(&zone->free_area[i].free_list);
			mask += mask;
			size = (size + ~mask) & mask;
			bitmap_size = size >> i;
			bitmap_size = (bitmap_size + 7) >> 3;
			bitmap_size = LONG_ALIGN(bitmap_size);
			zone->free_area[i].map =
			  (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}
void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0);
}
static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);
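/*
 * Illustrative example: booting with "memfrac=30,32,128" would set
 * zone_balance_ratio[] to { 30, 32, 128 }, making pages_min roughly 1/30th
 * of the DMA zone, 1/32nd of the normal zone and 1/128th of highmem
 * (still clamped by zone_balance_min/zone_balance_max above).
 */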