There's a pre-3 patch on ftp.kernel.org in the kernel/testing directory.
[davej-history.git] mm/page_alloc.c
/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>

#include <asm/dma.h>
#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/pgtable.h>

int nr_swap_pages = 0;
int nr_free_pages = 0;

/*
 * Free area management
 *
 * The free_area_list arrays point to the queue heads of the free areas
 * of different sizes
 */

#if CONFIG_AP1000
/* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram
   for the ring buffers */
#define NR_MEM_LISTS 12
#else
#define NR_MEM_LISTS 10
#endif
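
/*
 * Each of the NR_MEM_LISTS free lists holds blocks of 2^i pages, so with
 * 10 lists and 4kB pages the largest block the buddy allocator will hand
 * out is 2^9 pages = 2MB (the AP1000 build extends this to 2^11 pages =
 * 8MB for its ring buffers).
 */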

/* The start of this MUST match the start of "struct page" */
struct free_area_struct {
        struct page *next;
        struct page *prev;
        unsigned int * map;
};

#define memory_head(x) ((struct page *)(x))

static struct free_area_struct free_area[NR_MEM_LISTS];

static inline void init_mem_queue(struct free_area_struct * head)
{
        head->next = memory_head(head);
        head->prev = memory_head(head);
}

static inline void add_mem_queue(struct free_area_struct * head, struct page * entry)
{
        struct page * next = head->next;

        entry->prev = memory_head(head);
        entry->next = next;
        next->prev = entry;
        head->next = entry;
}

static inline void remove_mem_queue(struct page * entry)
{
        struct page * next = entry->next;
        struct page * prev = entry->prev;
        next->prev = prev;
        prev->next = next;
}

/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */

/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
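/*
 * Worked example of the hint: for order 2, mask = ~0UL << 2 ends in
 * ...11111100, so -mask = 1 + ~mask = 0x4 = 1 << order.  That means
 * "nr_free_pages -= mask" below really adds 1 << order pages, and
 * "map_nr ^ -mask" flips bit 'order' of the page number, which is
 * exactly the index of this block's buddy.
 */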
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;

static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
{
        struct free_area_struct *area = free_area + order;
        unsigned long index = map_nr >> (1 + order);
        unsigned long mask = (~0UL) << order;
        unsigned long flags;

        spin_lock_irqsave(&page_alloc_lock, flags);

#define list(x) (mem_map+(x))

        map_nr &= mask;
        nr_free_pages -= mask;
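
        /*
         * Coalescing loop: area->map keeps one bit per buddy pair at each
         * order, toggled whenever one of the pair is allocated or freed.
         * If test_and_change_bit() returns 1, the buddy was already free,
         * so we pull it off its free list and retry one order higher; the
         * loop condition reaches zero once mask corresponds to the largest
         * order (NR_MEM_LISTS-1), where no further merging is possible.
         */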
        while (mask + (1 << (NR_MEM_LISTS-1))) {
                if (!test_and_change_bit(index, area->map))
                        break;
                remove_mem_queue(list(map_nr ^ -mask));
                mask <<= 1;
                area++;
                index >>= 1;
                map_nr &= mask;
        }
        add_mem_queue(area, list(map_nr));

#undef list

        spin_unlock_irqrestore(&page_alloc_lock, flags);
}

void __free_page(struct page *page)
{
        if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
                if (PageSwapCache(page))
                        panic ("Freeing swap cache page");
                page->flags &= ~(1 << PG_referenced);
                free_pages_ok(page - mem_map, 0);
                return;
        }
}

void free_pages(unsigned long addr, unsigned long order)
{
        unsigned long map_nr = MAP_NR(addr);

        if (map_nr < max_mapnr) {
                mem_map_t * map = mem_map + map_nr;
                if (PageReserved(map))
                        return;
                if (atomic_dec_and_test(&map->count)) {
                        if (PageSwapCache(map))
                                panic ("Freeing swap cache pages");
                        map->flags &= ~(1 << PG_referenced);
                        free_pages_ok(map_nr, order);
                        return;
                }
        }
}

/*
 * Some ugly macros to speed up __get_free_pages()..
 */
#define MARK_USED(index, order, area) \
        change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
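
/*
 * A short note on the macro below: RMQUEUE expands textually inside
 * __get_free_pages() and walks the free lists from the requested order
 * upwards.  It skips non-DMA pages when __GFP_DMA is set, unlinks the
 * first suitable block, toggles its pair bit with MARK_USED, hands any
 * excess to EXPAND, and then drops the lock and returns the block's
 * virtual address directly from the enclosing function.
 */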
#define RMQUEUE(order, gfp_mask) \
do { struct free_area_struct * area = free_area+order; \
     unsigned long new_order = order; \
        do { struct page *prev = memory_head(area), *ret = prev->next; \
                while (memory_head(area) != ret) { \
                        if (!(gfp_mask & __GFP_DMA) || CAN_DMA(ret)) { \
                                unsigned long map_nr; \
                                (prev->next = ret->next)->prev = prev; \
                                map_nr = ret - mem_map; \
                                MARK_USED(map_nr, new_order, area); \
                                nr_free_pages -= 1 << order; \
                                EXPAND(ret, map_nr, order, new_order, area); \
                                spin_unlock_irqrestore(&page_alloc_lock, flags); \
                                return ADDRESS(map_nr); \
                        } \
                        prev = ret; \
                        ret = ret->next; \
                } \
                new_order++; area++; \
        } while (new_order < NR_MEM_LISTS); \
} while (0)
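
/*
 * EXPAND halves an order-'high' block until it reaches order 'low':
 * each pass puts the lower half back on the next-smaller free list and
 * advances map/index past it, so the caller is left with the upper
 * 2^low pages of the original block, whose first page gets its usage
 * count set to 1.
 */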
#define EXPAND(map,index,low,high,area) \
do { unsigned long size = 1 << high; \
        while (high > low) { \
                area--; high--; size >>= 1; \
                add_mem_queue(area, map); \
                MARK_USED(index, high, area); \
                index += size; \
                map += size; \
        } \
        atomic_set(&map->count, 1); \
} while (0)

int low_on_memory = 0;

unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
        unsigned long flags;

        if (order >= NR_MEM_LISTS)
                goto nopage;

#ifdef ATOMIC_MEMORY_DEBUGGING
        if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
                static int count = 0;
                if (++count < 5) {
                        printk("gfp called nonatomically from interrupt %p\n",
                                __builtin_return_address(0));
                }
                goto nopage;
        }
#endif

        /*
         * If this is a recursive call, we'd better
         * do our best to just allocate things without
         * further thought.
         */
        if (!(current->flags & PF_MEMALLOC)) {
                int freed;

                if (nr_free_pages > freepages.min) {
                        if (!low_on_memory)
                                goto ok_to_allocate;
                        if (nr_free_pages >= freepages.high) {
                                low_on_memory = 0;
                                goto ok_to_allocate;
                        }
                }

                low_on_memory = 1;
                current->flags |= PF_MEMALLOC;
                freed = try_to_free_pages(gfp_mask);
                current->flags &= ~PF_MEMALLOC;

                if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
                        goto nopage;
        }
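
        /*
         * The checks above implement a simple hysteresis: low_on_memory is
         * set once free memory drops to freepages.min and is only cleared
         * again at freepages.high, so while we sit between the two
         * watermarks every non-PF_MEMALLOC allocation keeps calling
         * try_to_free_pages().  Only __GFP_MED/__GFP_HIGH requests are
         * allowed to continue when nothing could be freed.
         */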
ok_to_allocate:
        spin_lock_irqsave(&page_alloc_lock, flags);
        RMQUEUE(order, gfp_mask);
        spin_unlock_irqrestore(&page_alloc_lock, flags);

        /*
         * If we can schedule, do so, and make sure to yield.
         * We may be a real-time process, and if kswapd is
         * waiting for us we need to allow it to run a bit.
         */
        if (gfp_mask & __GFP_WAIT) {
                current->policy |= SCHED_YIELD;
                schedule();
        }
nopage:
        return 0;
}
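
/*
 * Rough usage sketch (a hypothetical caller, not part of this file):
 * a driver needing four physically contiguous pages might do
 *
 *      unsigned long buf = __get_free_pages(GFP_KERNEL, 2);
 *      if (!buf)
 *              return -ENOMEM;
 *      ...
 *      free_pages(buf, 2);
 *
 * GFP_KERNEL includes __GFP_WAIT, so such a call may sleep in
 * try_to_free_pages() and must not be made from interrupt context.
 */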

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
        unsigned long order, flags;
        unsigned long total = 0;

        printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
        printk("Free: %d (%d %d %d)\n",
                nr_free_pages,
                freepages.min,
                freepages.low,
                freepages.high);
        spin_lock_irqsave(&page_alloc_lock, flags);
        for (order=0 ; order < NR_MEM_LISTS; order++) {
                struct page * tmp;
                unsigned long nr = 0;
                for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
                        nr ++;
                }
                total += nr * ((PAGE_SIZE>>10) << order);
                printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
        }
        spin_unlock_irqrestore(&page_alloc_lock, flags);
        printk("= %lukB)\n", total);
#ifdef SWAP_CACHE_INFO
        show_swap_cache_info();
#endif
}

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
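/* e.g. with 4-byte longs, LONG_ALIGN(13) == 16: round up to the next multiple of sizeof(long) */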

/*
 * set up the free-area data structures:
 *  - mark all pages reserved
 *  - mark all memory queues empty
 *  - clear the memory bitmaps
 */
unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_mem)
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        unsigned long i;

        /*
         * Select nr of pages we try to keep free for important stuff
         * with a minimum of 10 pages and a maximum of 256 pages, so
         * that we don't waste too much memory on large systems.
         * This is fairly arbitrary, but based on some behaviour
         * analysis.
         */
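        /*
         * Example of the arithmetic: on a 64MB machine with 4kB pages,
         * (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + 7) = 16384 >> 7 = 128,
         * i.e. roughly 1/128th of memory, giving freepages.min = 128 pages
         * (512kB), low = 256 and high = 384.
         */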
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
        if (i < 10)
                i = 10;
        if (i > 256)
                i = 256;
        freepages.min = i;
        freepages.low = i * 2;
        freepages.high = i * 3;
        mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
        p = mem_map + MAP_NR(end_mem);
        start_mem = LONG_ALIGN((unsigned long) p);
        memset(mem_map, 0, start_mem - (unsigned long) mem_map);
        do {
                --p;
                atomic_set(&p->count, 0);
                p->flags = (1 << PG_DMA) | (1 << PG_reserved);
        } while (p > mem_map);
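
        /*
         * For order i the bitmap below gets one bit per 2^i pages (rounded
         * up to whole, long-aligned bytes), which is twice what the
         * one-bit-per-buddy-pair indexing in free_pages_ok()/MARK_USED
         * strictly needs, so the maps are comfortably large enough.
         */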
        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                init_mem_queue(free_area+i);
                mask += mask;
                end_mem = (end_mem + ~mask) & mask;
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;
                bitmap_size = LONG_ALIGN(bitmap_size);
                free_area[i].map = (unsigned int *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}

/*
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 */
void swapin_readahead(unsigned long entry)
{
        int i;
        struct page *new_page;
        unsigned long offset = SWP_OFFSET(entry);
        struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;

        offset = (offset >> page_cluster) << page_cluster;
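
        /*
         * The shift pair above just rounds offset down to a multiple of
         * 1 << page_cluster, so with page_cluster = 4, for example, the
         * loop below tries to start reads for an aligned group of 16 swap
         * entries that includes the one actually faulted on.
         */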
        i = 1 << page_cluster;
        do {
                /* Don't read-ahead past the end of the swap area */
                if (offset >= swapdev->max)
                        break;
                /* Don't block on I/O for read-ahead */
                if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
                        break;
                /* Don't read in bad or busy pages */
                if (!swapdev->swap_map[offset])
                        break;
                if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
                        break;
                if (test_bit(offset, swapdev->swap_lockmap))
                        break;

                /* Ok, do the async read-ahead now */
                new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
                if (new_page != NULL)
                        __free_page(new_page);
                offset++;
        } while (--i);
        return;
}

/*
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting.
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access.
 */
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
        pte_t * page_table, unsigned long entry, int write_access)
{
        unsigned long page;
        struct page *page_map = lookup_swap_cache(entry);

        if (!page_map) {
                swapin_readahead(entry);
                page_map = read_swap_cache(entry);
        }
        if (pte_val(*page_table) != entry) {
                if (page_map)
                        free_page_and_swap_cache(page_address(page_map));
                return;
        }
        if (!page_map) {
                set_pte(page_table, BAD_PAGE);
                swap_free(entry);
                oom(tsk);
                return;
        }

        page = page_address(page_map);
        vma->vm_mm->rss++;
        tsk->min_flt++;
        swap_free(entry);

        if (!write_access || is_page_shared(page_map)) {
                set_pte(page_table, mk_pte(page, vma->vm_page_prot));
                return;
        }

        /* The page is unshared, and we want write access. In this
           case, it is safe to tear down the swap cache and give the
           page over entirely to this process. */

        if (PageSwapCache(page_map))
                delete_from_swap_cache(page_map);
        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
        return;
}