/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>

#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/pgtable.h>
int nr_swap_pages = 0;
int nr_free_pages = 0;
/*
 * Free area management
 *
 * The free_area_list arrays point to the queue heads of the free areas
 */
#if CONFIG_AP1000
/* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram
   for the ring buffers */
#define NR_MEM_LISTS 12
#else
#define NR_MEM_LISTS 10
#endif
/* The start of this MUST match the start of "struct page" */
struct free_area_struct {
	struct page *next;
	struct page *prev;
	unsigned int * map;
};

#define memory_head(x) ((struct page *)(x))

static struct free_area_struct free_area[NR_MEM_LISTS];
static inline void init_mem_queue(struct free_area_struct * head)
{
	head->next = memory_head(head);
	head->prev = memory_head(head);
}
static inline void add_mem_queue(struct free_area_struct * head, struct page * entry)
{
	struct page * next = head->next;

	entry->prev = memory_head(head);
	entry->next = next;
	next->prev = entry;
	head->next = entry;
}
static inline void remove_mem_queue(struct page * entry)
{
	struct page * next = entry->next;
	struct page * prev = entry->prev;
	next->prev = prev;
	prev->next = next;
}
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */

/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
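
/*
 * Worked example, added here for clarity (not part of the original
 * source): freeing an order-2 block starting at map_nr 8.  Then
 * mask = ~0UL << 2 = ...11111100, so -mask = 1 + ~mask = 4, the block
 * size in pages.  The buddy of the block is map_nr ^ -mask = 12, and
 * index = map_nr >> (1 + order) = 1 is the single bitmap bit shared by
 * the 8..11 / 12..15 pair.  If test_and_change_bit() reports that the
 * buddy is free as well, the two merge into an order-3 block at
 * map_nr 8 and the walk repeats one level up.
 */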
static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
{
	struct free_area_struct *area = free_area + order;
	unsigned long index = map_nr >> (1 + order);
	unsigned long mask = (~0UL) << order;
	unsigned long flags;

	spin_lock_irqsave(&page_alloc_lock, flags);

#define list(x) (mem_map+(x))

	map_nr &= mask;
	nr_free_pages -= mask;
	while (mask + (1 << (NR_MEM_LISTS-1))) {
		if (!test_and_change_bit(index, area->map))
			break;
		remove_mem_queue(list(map_nr ^ -mask));
		mask <<= 1;
		area++;
		index >>= 1;
		map_nr &= mask;
	}
	add_mem_queue(area, list(map_nr));

#undef list

	spin_unlock_irqrestore(&page_alloc_lock, flags);
}
void __free_page(struct page *page)
{
	if (!PageReserved(page) && put_page_testzero(page)) {
		if (PageSwapCache(page))
			PAGE_BUG(page);
		page->flags &= ~(1 << PG_referenced);
		free_pages_ok(page - mem_map, 0);
	}
}
void free_pages(unsigned long addr, unsigned long order)
{
	unsigned long map_nr = MAP_NR(addr);

	if (map_nr < max_mapnr) {
		mem_map_t * map = mem_map + map_nr;
		if (PageReserved(map))
			return;
		if (put_page_testzero(map)) {
			if (PageSwapCache(map))
				PAGE_BUG(map);
			map->flags &= ~(1 << PG_referenced);
			free_pages_ok(map_nr, order);
		}
	}
}
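
/*
 * Note added for clarity (not in the original source): on i386 in this
 * era MAP_NR(addr) is simply __pa(addr) >> PAGE_SHIFT, i.e. the index
 * of the page frame in mem_map[].  So with PAGE_OFFSET = 0xC0000000
 * and 4kB pages, addr = 0xC0005000 gives map_nr = 5 and the matching
 * descriptor is mem_map + 5.  The exact arithmetic is defined per
 * architecture in <asm/page.h>.
 */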
/*
 * Some ugly macros to speed up __get_free_pages()..
 */
#define MARK_USED(index, order, area) \
	change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
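
/*
 * Illustration added for clarity (not in the original source):
 * MARK_USED(map_nr, order, area) toggles the one bitmap bit that an
 * order-'order' block shares with its buddy, e.g. MARK_USED(12, 2, area)
 * flips bit 12 >> 3 = 1, the bit covering the 8..11 / 12..15 pair.
 * ADDRESS() is the inverse of MAP_NR(): ADDRESS(5) is the kernel
 * virtual address PAGE_OFFSET + 5 * PAGE_SIZE.
 */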
#define RMQUEUE(order, gfp_mask) \
do { struct free_area_struct * area = free_area+order; \
     unsigned long new_order = order; \
	do { struct page *prev = memory_head(area), *ret = prev->next; \
		while (memory_head(area) != ret) { \
			if (!(gfp_mask & __GFP_DMA) || CAN_DMA(ret)) { \
				unsigned long map_nr; \
				(prev->next = ret->next)->prev = prev; \
				map_nr = ret - mem_map; \
				MARK_USED(map_nr, new_order, area); \
				nr_free_pages -= 1 << order; \
				EXPAND(ret, map_nr, order, new_order, area); \
				spin_unlock_irqrestore(&page_alloc_lock,flags);\
				return ADDRESS(map_nr); \
			} \
			prev = ret; \
			ret = ret->next; \
		} \
		new_order++; area++; \
	} while (new_order < NR_MEM_LISTS); \
} while (0)
#define EXPAND(map,index,low,high,area) \
do { unsigned long size = 1 << high; \
	while (high > low) { \
		area--; high--; size >>= 1; \
		add_mem_queue(area, map); \
		MARK_USED(index, high, area); \
		index += size; \
		map += size; \
	} \
	set_page_count(map, 1); \
} while (0)
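
/*
 * Worked example added for clarity (not part of the original source):
 * suppose an order-0 request (low = 0) is satisfied from the order-2
 * free list (high = 2) with a block at pages 8..11.  EXPAND first puts
 * pages 8..9 back on the order-1 list, then page 10 back on the
 * order-0 list, and finally hands page 11 to the caller with its use
 * count set to 1.
 */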
int low_on_memory = 0;
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
	unsigned long flags;

	if (order >= NR_MEM_LISTS)
		goto nopage;

#ifdef ATOMIC_MEMORY_DEBUGGING
	if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
		static int count = 0;
		if (++count < 5) {
			printk("gfp called nonatomically from interrupt %p\n",
				__builtin_return_address(0));
		}
		goto nopage;
	}
#endif

	/*
	 * If this is a recursive call, we'd better
	 * do our best to just allocate things without
	 * further thought.
	 */
	if (!(current->flags & PF_MEMALLOC)) {
		int freed;

		if (nr_free_pages > freepages.min) {
			if (!low_on_memory)
				goto ok_to_allocate;
			if (nr_free_pages >= freepages.high) {
				low_on_memory = 0;
				goto ok_to_allocate;
			}
		}

		low_on_memory = 1;
		current->flags |= PF_MEMALLOC;
		freed = try_to_free_pages(gfp_mask);
		current->flags &= ~PF_MEMALLOC;

		if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
			goto nopage;
	}
ok_to_allocate:
	spin_lock_irqsave(&page_alloc_lock, flags);
	RMQUEUE(order, gfp_mask);
	spin_unlock_irqrestore(&page_alloc_lock, flags);

	/*
	 * If we can schedule, do so, and make sure to yield.
	 * We may be a real-time process, and if kswapd is
	 * waiting for us we need to allow it to run a bit.
	 */
	if (gfp_mask & __GFP_WAIT) {
		current->policy |= SCHED_YIELD;
		schedule();
	}

nopage:
	return 0;
}
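
/*
 * Typical call site, sketched here for illustration only (the
 * get_free_page()/__get_free_page() wrappers live in <linux/mm.h>):
 *
 *	unsigned long addr = __get_free_pages(GFP_KERNEL, 1);
 *	if (addr) {
 *		... use the two contiguous pages at addr ...
 *		free_pages(addr, 1);
 *	}
 */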
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
	unsigned long order, flags;
	unsigned long total = 0;

	printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
	printk("Free: %d (%d %d %d)\n",
		nr_free_pages,
		freepages.min,
		freepages.low,
		freepages.high);
	spin_lock_irqsave(&page_alloc_lock, flags);
	for (order = 0 ; order < NR_MEM_LISTS ; order++) {
		struct page * tmp;
		unsigned long nr = 0;
		for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
			nr++;
		}
		total += nr * ((PAGE_SIZE>>10) << order);
		printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
	}
	spin_unlock_irqrestore(&page_alloc_lock, flags);
	printk("= %lukB)\n", total);
#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * set up the free-area data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_mem)
{
	mem_map_t * p;
	unsigned long mask = PAGE_MASK;
	unsigned long i;

	/*
	 * Select nr of pages we try to keep free for important stuff
	 * with a minimum of 10 pages and a maximum of 256 pages, so
	 * that we don't waste too much memory on large systems.
	 * This is fairly arbitrary, but based on some behaviour
	 * analysis.
	 */
	i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
	if (i < 10)
		i = 10;
	if (i > 256)
		i = 256;
	freepages.min = i;
	freepages.low = i * 2;
	freepages.high = i * 3;
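
	/*
	 * Worked example added for clarity (not in the original source):
	 * with 64MB of physical memory there are 16384 4kB pages, so
	 * i = 16384 >> 7 = 128, giving freepages.min = 128, .low = 256
	 * and .high = 384, i.e. roughly 0.5MB/1MB/1.5MB kept free.
	 */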
	mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
	p = mem_map + MAP_NR(end_mem);
	start_mem = LONG_ALIGN((unsigned long) p);
	memset(mem_map, 0, start_mem - (unsigned long) mem_map);
	do {
		--p;
		set_page_count(p, 0);
		p->flags = (1 << PG_DMA) | (1 << PG_reserved);
		init_waitqueue_head(&p->wait);
	} while (p > mem_map);

	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
		unsigned long bitmap_size;
		init_mem_queue(free_area+i);
		mask += mask;
		end_mem = (end_mem + ~mask) & mask;
		bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
		bitmap_size = (bitmap_size + 7) >> 3;
		bitmap_size = LONG_ALIGN(bitmap_size);
		free_area[i].map = (unsigned int *) start_mem;
		memset((void *) start_mem, 0, bitmap_size);
		start_mem += bitmap_size;
	}
	return start_mem;
}
/*
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 */
void swapin_readahead(unsigned long entry)
{
	int i;
	struct page *new_page;
	unsigned long offset = SWP_OFFSET(entry);
	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;

	offset = (offset >> page_cluster) << page_cluster;

	i = 1 << page_cluster;
	do {
		/* Don't read-ahead past the end of the swap area */
		if (offset >= swapdev->max)
			break;
		/* Don't block on I/O for read-ahead */
		if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
			break;
		/* Don't read in bad or busy pages */
		if (!swapdev->swap_map[offset])
			break;
		if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
			break;
		if (test_bit(offset, swapdev->swap_lockmap))
			break;

		/* Ok, do the async read-ahead now */
		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
		if (new_page != NULL)
			__free_page(new_page);
		offset++;
	} while (--i);
	return;
}
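
/*
 * Worked example added for clarity (not in the original source): with
 * page_cluster = 4, a fault on swap offset 53 is rounded down to
 * offset = (53 >> 4) << 4 = 48, and up to i = 16 entries (offsets
 * 48..63) are queued for asynchronous read-in, stopping early at the
 * end of the swap area or at the first bad, unused or locked entry.
 */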
/*
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting.
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access.
 */
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
	pte_t * page_table, unsigned long entry, int write_access)
{
	unsigned long page;
	struct page *page_map = lookup_swap_cache(entry);

	if (!page_map) {
		swapin_readahead(entry);
		page_map = read_swap_cache(entry);
	}
	if (pte_val(*page_table) != entry) {
		if (page_map)
			free_page_and_swap_cache(page_address(page_map));
		return;
	}
	if (!page_map) {
		set_pte(page_table, BAD_PAGE);
		swap_free(entry);
		oom(tsk);
		return;
	}

	page = page_address(page_map);
	vma->vm_mm->rss++;
	tsk->min_flt++;
	swap_free(entry);

	if (!write_access || is_page_shared(page_map)) {
		set_pte(page_table, mk_pte(page, vma->vm_page_prot));
		return;
	}

	/*
	 * The page is unshared and we're going to dirty it - so tear
	 * down the swap cache and give exclusive access to the page to
	 * this process.
	 */
	delete_from_swap_cache(page_map);
	set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
}