/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>

#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/pgtable.h>
int nr_swap_pages = 0;
int nr_free_pages = 0;
/*
 * Free area management
 *
 * The free_area_list arrays point to the queue heads of the free areas
 */
#if CONFIG_AP1000
/* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram
   for the ring buffers */
#define NR_MEM_LISTS 12
#else
#define NR_MEM_LISTS 10
#endif
/* The start of this MUST match the start of "struct page" */
struct free_area_struct {
        struct page *next;
        struct page *prev;
        unsigned int * map;
};

#define memory_head(x) ((struct page *)(x))

static struct free_area_struct free_area[NR_MEM_LISTS];
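
/*
 * Each free_area[order] entry heads a doubly linked list of free
 * blocks of 2^order pages; free_area[order].map is a bitmap with one
 * bit per pair of buddies at that order (see MARK_USED() below).
 */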
static inline void init_mem_queue(struct free_area_struct * head)
{
        head->next = memory_head(head);
        head->prev = memory_head(head);
}
static inline void add_mem_queue(struct free_area_struct * head, struct page * entry)
{
        struct page * next = head->next;

        entry->prev = memory_head(head);
        entry->next = next;
        next->prev = entry;
        head->next = entry;
}
static inline void remove_mem_queue(struct page * entry)
{
        struct page * next = entry->next;
        struct page * prev = entry->prev;
        next->prev = prev;
        prev->next = next;
}
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */

/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
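
/*
 * Illustrative sketch (not part of the original allocator): since
 * mask = (~0UL) << order, the hint above means -mask == 1UL << order,
 * so "map_nr ^ -mask" in free_pages_ok() below simply flips bit
 * 'order' of map_nr to find the buddy block.  For example, with
 * order == 2 the buddy of the block starting at map_nr 8 starts at
 * map_nr 12, and vice versa.
 */
static inline unsigned long buddy_map_nr_example(unsigned long map_nr,
                                                 unsigned long order)
{
        unsigned long mask = (~0UL) << order;
        return map_nr ^ -mask;  /* same result as map_nr ^ (1UL << order) */
}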
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
{
        struct free_area_struct *area = free_area + order;
        unsigned long index = map_nr >> (1 + order);
        unsigned long mask = (~0UL) << order;
        unsigned long flags;

        spin_lock_irqsave(&page_alloc_lock, flags);

#define list(x) (mem_map+(x))

        map_nr &= mask;
        nr_free_pages -= mask;
        while (mask + (1 << (NR_MEM_LISTS-1))) {
                if (!test_and_change_bit(index, area->map))
                        break;
                remove_mem_queue(list(map_nr ^ -mask));
                mask <<= 1;
                area++;
                index >>= 1;
                map_nr &= mask;
        }
        add_mem_queue(area, list(map_nr));

#undef list

        spin_unlock_irqrestore(&page_alloc_lock, flags);
}
void __free_page(struct page *page)
{
        if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
                if (PageSwapCache(page))
                        panic ("Freeing swap cache page");
                page->flags &= ~(1 << PG_referenced);
                free_pages_ok(page - mem_map, 0);
        }
}
void free_pages(unsigned long addr, unsigned long order)
{
        unsigned long map_nr = MAP_NR(addr);

        if (map_nr < max_mapnr) {
                mem_map_t * map = mem_map + map_nr;
                if (PageReserved(map))
                        return;
                if (atomic_dec_and_test(&map->count)) {
                        if (PageSwapCache(map))
                                panic ("Freeing swap cache pages");
                        map->flags &= ~(1 << PG_referenced);
                        free_pages_ok(map_nr, order);
                }
        }
}
/*
 * Some ugly macros to speed up __get_free_pages()..
 */
#define MARK_USED(index, order, area) \
        change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))

#define RMQUEUE(order, gfp_mask) \
do { struct free_area_struct * area = free_area+order; \
     unsigned long new_order = order; \
        do { struct page *prev = memory_head(area), *ret = prev->next; \
                while (memory_head(area) != ret) { \
                        if (!(gfp_mask & __GFP_DMA) || CAN_DMA(ret)) { \
                                unsigned long map_nr; \
                                (prev->next = ret->next)->prev = prev; \
                                map_nr = ret - mem_map; \
                                MARK_USED(map_nr, new_order, area); \
                                nr_free_pages -= 1 << order; \
                                EXPAND(ret, map_nr, order, new_order, area); \
                                spin_unlock_irqrestore(&page_alloc_lock, flags); \
                                return ADDRESS(map_nr); \
                        } \
                        prev = ret; \
                        ret = ret->next; \
                } \
                new_order++; area++; \
        } while (new_order < NR_MEM_LISTS); \
} while (0)
#define EXPAND(map,index,low,high,area) \
do { unsigned long size = 1 << high; \
        while (high > low) { \
                area--; high--; size >>= 1; \
                add_mem_queue(area, map); \
                MARK_USED(index, high, area); \
                index += size; \
                map += size; \
        } \
        atomic_set(&map->count, 1); \
} while (0)
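
/*
 * Worked example (illustration only): suppose an order-0 request finds
 * its first free block on the order-2 list, starting at map_nr 8.
 * RMQUEUE() unlinks the block (pages 8-11) from that list, toggles its
 * bit in the order-2 bitmap, and EXPAND(ret, 8, 0, 2, area) splits it:
 *
 *      1st pass (high 2 -> 1): pages 8-9 go back on the order-1 list
 *         and MARK_USED(8, 1, ...) is toggled; map/index advance to 10.
 *      2nd pass (high 1 -> 0): page 10 goes back on the order-0 list
 *         and MARK_USED(10, 0, ...) is toggled; map/index advance to 11.
 *      Loop ends: page 11 gets count == 1, and since EXPAND's "index"
 *         argument is the caller's map_nr, RMQUEUE returns ADDRESS(11).
 */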
int low_on_memory = 0;
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
        unsigned long flags;

        if (order >= NR_MEM_LISTS)
                goto nopage;

#ifdef ATOMIC_MEMORY_DEBUGGING
        if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
                static int count = 0;
                if (++count < 5) {
                        printk("gfp called nonatomically from interrupt %p\n",
                                __builtin_return_address(0));
                }
                goto nopage;
        }
#endif

        /*
         * If this is a recursive call, we'd better
         * do our best to just allocate things without
         * further thought.
         */
        if (!(current->flags & PF_MEMALLOC)) {
                int freed;

                if (nr_free_pages > freepages.min) {
                        if (!low_on_memory)
                                goto ok_to_allocate;
                        if (nr_free_pages >= freepages.high) {
                                low_on_memory = 0;
                                goto ok_to_allocate;
                        }
                }

                low_on_memory = 1;
                current->flags |= PF_MEMALLOC;
                freed = try_to_free_pages(gfp_mask);
                current->flags &= ~PF_MEMALLOC;

                if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
                        goto nopage;
        }
ok_to_allocate:
        spin_lock_irqsave(&page_alloc_lock, flags);
        RMQUEUE(order, gfp_mask);
        spin_unlock_irqrestore(&page_alloc_lock, flags);

        /*
         * If we can schedule, do so, and make sure to yield.
         * We may be a real-time process, and if kswapd is
         * waiting for us we need to allow it to run a bit.
         */
        if (gfp_mask & __GFP_WAIT) {
                current->policy |= SCHED_YIELD;
                schedule();
        }
nopage:
        return 0;
}
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
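/*
 * With 4kB pages the output has roughly this shape (the numbers below
 * are made up purely for illustration):
 *
 *      Free pages:   1840kB
 *       ( Free: 460 (128 256 384)
 *      100*4kB 50*8kB 25*16kB 10*32kB 5*64kB 0*128kB ... = 1840kB)
 */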
void show_free_areas(void)
{
        unsigned long order, flags;
        unsigned long total = 0;

        printk("Free pages: %6dkB\n ( ", nr_free_pages<<(PAGE_SHIFT-10));
        printk("Free: %d (%d %d %d)\n",
                nr_free_pages,
                freepages.min,
                freepages.low,
                freepages.high);
        spin_lock_irqsave(&page_alloc_lock, flags);
        for (order=0 ; order < NR_MEM_LISTS ; order++) {
                struct page * tmp;
                unsigned long nr = 0;
                for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
                        nr++;
                }
                total += nr * ((PAGE_SIZE>>10) << order);
                printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
        }
        spin_unlock_irqrestore(&page_alloc_lock, flags);
        printk("= %lukB)\n", total);
#ifdef SWAP_CACHE_INFO
        show_swap_cache_info();
#endif
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * set up the free-area data structures:
 *  - mark all pages reserved
 *  - mark all memory queues empty
 *  - clear the memory bitmaps
 */
unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_mem)
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        unsigned long i;

        /*
         * Select nr of pages we try to keep free for important stuff
         * with a minimum of 10 pages and a maximum of 256 pages, so
         * that we don't waste too much memory on large systems.
         * This is fairly arbitrary, but based on some behaviour
         * analysis.
         */
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
        if (i < 10)
                i = 10;
        if (i > 256)
                i = 256;
        freepages.min = i;
        freepages.low = i * 2;
        freepages.high = i * 3;
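        /*
         * Worked example (hypothetical numbers): with PAGE_SHIFT == 12
         * and 64MB of memory, (end_mem - PAGE_OFFSET) >> 19 gives
         * i = 128, i.e. one page kept free per 128 pages of memory, so
         * freepages.min/low/high end up as 128/256/384 pages
         * (512kB/1MB/1.5MB).
         */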
        mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
        p = mem_map + MAP_NR(end_mem);
        start_mem = LONG_ALIGN((unsigned long) p);
        memset(mem_map, 0, start_mem - (unsigned long) mem_map);
        do {
                --p;
                atomic_set(&p->count, 0);
                p->flags = (1 << PG_DMA) | (1 << PG_reserved);
        } while (p > mem_map);

        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                init_mem_queue(free_area+i);
                mask += mask;
                end_mem = (end_mem + ~mask) & mask;
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;
                bitmap_size = LONG_ALIGN(bitmap_size);
                free_area[i].map = (unsigned int *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}
/*
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 */
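/*
 * Illustration (hypothetical values): with page_cluster == 4 the
 * cluster size is 16 entries, so a fault on swap offset 23 rounds the
 * start down to offset 16 and the loop below tries to read offsets
 * 16-31, stopping early at the end of the swap area, at free, bad or
 * locked slots, or when too much async I/O is already in flight.
 */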
void swapin_readahead(unsigned long entry)
{
        int i;
        struct page *new_page;
        unsigned long offset = SWP_OFFSET(entry);
        struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;

        offset = (offset >> page_cluster) << page_cluster;

        i = 1 << page_cluster;
        do {
                /* Don't read-ahead past the end of the swap area */
                if (offset >= swapdev->max)
                        break;
                /* Don't block on I/O for read-ahead */
                if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
                        break;
                /* Don't read in bad or busy pages */
                if (!swapdev->swap_map[offset])
                        break;
                if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
                        break;
                if (test_bit(offset, swapdev->swap_lockmap))
                        break;

                /* Ok, do the async read-ahead now */
                new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
                if (new_page != NULL)
                        __free_page(new_page);
                offset++;
        } while (--i);
}
/*
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting.
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access.
 */
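/*
 * In short: if the pte changed while we slept, the newly read page is
 * dropped again; if no page could be read, BAD_PAGE is mapped and the
 * task is treated as out of memory; otherwise the page is mapped with
 * the vma's normal protections, and only when it is unshared and the
 * fault was a write is it removed from the swap cache and mapped
 * writable and dirty.
 */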
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
        pte_t * page_table, unsigned long entry, int write_access)
{
        unsigned long page;
        struct page *page_map = lookup_swap_cache(entry);

        if (!page_map) {
                swapin_readahead(entry);
                page_map = read_swap_cache(entry);
        }
        if (pte_val(*page_table) != entry) {
                if (page_map)
                        free_page_and_swap_cache(page_address(page_map));
                return;
        }
        if (!page_map) {
                set_pte(page_table, BAD_PAGE);
                swap_free(entry);
                oom(tsk);
                return;
        }

        page = page_address(page_map);
        vma->vm_mm->rss++;
        tsk->min_flt++;
        swap_free(entry);

        if (!write_access || is_page_shared(page_map)) {
                set_pte(page_table, mk_pte(page, vma->vm_page_prot));
                return;
        }

        /* The page is unshared, and we want write access. In this
           case, it is safe to tear down the swap cache and give the
           page over entirely to this process. */

        if (PageSwapCache(page_map))
                delete_from_swap_cache(page_map);
        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
}