/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>

#include <asm/dma.h>
#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/pgtable.h>

int nr_swap_pages = 0;
int nr_free_pages = 0;

/*
 * Free area management
 *
 * The free_area_list arrays point to the queue heads of the free areas
 * of different sizes
 */

#if CONFIG_AP1000
/* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram
   for the ring buffers */
#define NR_MEM_LISTS 12
#else
#define NR_MEM_LISTS 10
#endif

/* The start of this MUST match the start of "struct page" */
struct free_area_struct {
        struct page *next;
        struct page *prev;
        unsigned int * map;
};

#define memory_head(x) ((struct page *)(x))

static struct free_area_struct free_area[NR_MEM_LISTS];
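/*
 * memory_head() lets a free_area_struct double as the head of its own
 * circular list: because its first two members line up with the
 * next/prev pointers at the start of "struct page", the list head can
 * be linked and unlinked exactly like a real page descriptor.
 */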
static inline void init_mem_queue(struct free_area_struct * head)
{
        head->next = memory_head(head);
        head->prev = memory_head(head);
}

static inline void add_mem_queue(struct free_area_struct * head, struct page * entry)
{
        struct page * next = head->next;

        entry->prev = memory_head(head);
        entry->next = next;
        next->prev = entry;
        head->next = entry;
}

static inline void remove_mem_queue(struct page * entry)
{
        struct page * next = entry->next;
        struct page * prev = entry->prev;
        next->prev = prev;
        prev->next = next;
}

/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */

/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
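/*
 * Worked example of the mask trick in free_pages_ok() below: for an
 * order-0 free, mask = ~0UL << 0 = all ones, so -mask = 1 and
 * "nr_free_pages -= mask" adds one page to the free count.  In general
 * -mask == 1 << order, so "map_nr ^ -mask" flips exactly the bit that
 * distinguishes a block from its buddy, and each pass of the coalescing
 * loop shifts mask left by one to move up to the next order.
 */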
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;

static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
{
        struct free_area_struct *area = free_area + order;
        unsigned long index = map_nr >> (1 + order);
        unsigned long mask = (~0UL) << order;
        unsigned long flags;

        spin_lock_irqsave(&page_alloc_lock, flags);

#define list(x) (mem_map+(x))

        map_nr &= mask;
        nr_free_pages -= mask;
        while (mask + (1 << (NR_MEM_LISTS-1))) {
                if (!test_and_change_bit(index, area->map))
                        break;
                remove_mem_queue(list(map_nr ^ -mask));
                mask <<= 1;
                area++;
                index >>= 1;
                map_nr &= mask;
        }
        add_mem_queue(area, list(map_nr));

#undef list

        spin_unlock_irqrestore(&page_alloc_lock, flags);
}

void __free_page(struct page *page)
{
        if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
                if (PageSwapCache(page))
                        panic ("Freeing swap cache page");
                page->flags &= ~(1 << PG_referenced);
                free_pages_ok(page - mem_map, 0);
                return;
        }
}

void free_pages(unsigned long addr, unsigned long order)
{
        unsigned long map_nr = MAP_NR(addr);

        if (map_nr < max_mapnr) {
                mem_map_t * map = mem_map + map_nr;
                if (PageReserved(map))
                        return;
                if (atomic_dec_and_test(&map->count)) {
                        if (PageSwapCache(map))
                                panic ("Freeing swap cache pages");
                        map->flags &= ~(1 << PG_referenced);
                        free_pages_ok(map_nr, order);
                        return;
                }
        }
}

/*
 * Some ugly macros to speed up __get_free_pages()..
 */
#define MARK_USED(index, order, area) \
        change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
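/*
 * free_area[order].map has one bit per pair of buddies at that order;
 * MARK_USED toggles the bit covering page "index".  Roughly, a set bit
 * means exactly one of the two buddies is free at this order, which is
 * the state test_and_change_bit() looks for in free_pages_ok() when it
 * decides whether the neighbouring block can be merged.
 */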
#define RMQUEUE(order, gfp_mask) \
do { struct free_area_struct * area = free_area+order; \
     unsigned long new_order = order; \
        do { struct page *prev = memory_head(area), *ret = prev->next; \
                while (memory_head(area) != ret) { \
                        if (!(gfp_mask & __GFP_DMA) || CAN_DMA(ret)) { \
                                unsigned long map_nr; \
                                (prev->next = ret->next)->prev = prev; \
                                map_nr = ret - mem_map; \
                                MARK_USED(map_nr, new_order, area); \
                                nr_free_pages -= 1 << order; \
                                EXPAND(ret, map_nr, order, new_order, area); \
                                spin_unlock_irqrestore(&page_alloc_lock, flags); \
                                return ADDRESS(map_nr); \
                        } \
                        prev = ret; \
                        ret = ret->next; \
                } \
                new_order++; area++; \
        } while (new_order < NR_MEM_LISTS); \
} while (0)

#define EXPAND(map,index,low,high,area) \
do { unsigned long size = 1 << high; \
        while (high > low) { \
                area--; high--; size >>= 1; \
                add_mem_queue(area, map); \
                MARK_USED(index, high, area); \
                index += size; \
                map += size; \
        } \
        atomic_set(&map->count, 1); \
} while (0)
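/*
 * RMQUEUE walks the free list of the requested order and, if it finds
 * nothing usable there (or nothing DMA-capable when __GFP_DMA is set),
 * retries at the next order up.  When a larger block is taken, EXPAND
 * splits it back down: e.g. an order-0 request satisfied from an
 * order-2 block returns one order-1 block and one order-0 block to
 * their free lists and hands out the remaining single page.
 */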
int low_on_memory = 0;
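/*
 * low_on_memory gives the allocator some hysteresis: once the free
 * page count has dropped to freepages.min, every ordinary allocation
 * keeps calling try_to_free_pages() until the count climbs back above
 * freepages.high, instead of oscillating around the minimum.
 */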
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
        unsigned long flags;

        if (order >= NR_MEM_LISTS)
                goto nopage;

#ifdef ATOMIC_MEMORY_DEBUGGING
        if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
                static int count = 0;
                if (++count < 5) {
                        printk("gfp called nonatomically from interrupt %p\n",
                                __builtin_return_address(0));
                }
                goto nopage;
        }
#endif

        /*
         * If this is a recursive call, we'd better
         * do our best to just allocate things without
         * further thought.
         */
        if (!(current->flags & PF_MEMALLOC)) {
                int freed;

                if (nr_free_pages > freepages.min) {
                        if (!low_on_memory)
                                goto ok_to_allocate;
                        if (nr_free_pages >= freepages.high) {
                                low_on_memory = 0;
                                goto ok_to_allocate;
                        }
                }

                low_on_memory = 1;
                current->flags |= PF_MEMALLOC;
                freed = try_to_free_pages(gfp_mask);
                current->flags &= ~PF_MEMALLOC;

                if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
                        goto nopage;
        }
ok_to_allocate:
        spin_lock_irqsave(&page_alloc_lock, flags);
        RMQUEUE(order, gfp_mask);
        spin_unlock_irqrestore(&page_alloc_lock, flags);

        /*
         * If we can schedule, do so, and make sure to yield.
         * We may be a real-time process, and if kswapd is
         * waiting for us we need to allow it to run a bit.
         */
        if (gfp_mask & __GFP_WAIT) {
                current->policy |= SCHED_YIELD;
                schedule();
        }

nopage:
        return 0;
}

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
        unsigned long order, flags;
        unsigned long total = 0;

        printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
        printk("Free: %d (%d %d %d)\n",
                nr_free_pages,
                freepages.min,
                freepages.low,
                freepages.high);
        spin_lock_irqsave(&page_alloc_lock, flags);
        for (order=0 ; order < NR_MEM_LISTS; order++) {
                struct page * tmp;
                unsigned long nr = 0;
                for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
                        nr ++;
                }
                total += nr * ((PAGE_SIZE>>10) << order);
                printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
        }
        spin_unlock_irqrestore(&page_alloc_lock, flags);
        printk("= %lukB)\n", total);
#ifdef SWAP_CACHE_INFO
        show_swap_cache_info();
#endif
}

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * set up the free-area data structures:
 *  - mark all pages reserved
 *  - mark all memory queues empty
 *  - clear the memory bitmaps
 */
unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_mem)
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        unsigned long i;

        /*
         * Select nr of pages we try to keep free for important stuff
         * with a minimum of 10 pages and a maximum of 256 pages, so
         * that we don't waste too much memory on large systems.
         * This is fairly arbitrary, but based on some behaviour
         * analysis.
         */
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
        if (i < 10)
                i = 10;
        if (i > 256)
                i = 256;
        freepages.min = i;
        freepages.low = i * 2;
        freepages.high = i * 3;
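        /*
         * Example: with 4kB pages and 64MB of memory the shift above
         * gives 64MB / (4kB * 128) = 128 pages, i.e. freepages.min =
         * 128, freepages.low = 256 and freepages.high = 384.
         */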
        mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
        p = mem_map + MAP_NR(end_mem);
        start_mem = LONG_ALIGN((unsigned long) p);
        memset(mem_map, 0, start_mem - (unsigned long) mem_map);
        do {
                --p;
                atomic_set(&p->count, 0);
                p->flags = (1 << PG_DMA) | (1 << PG_reserved);
        } while (p > mem_map);

        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                init_mem_queue(free_area+i);
                mask += mask;
                end_mem = (end_mem + ~mask) & mask;
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;
                bitmap_size = LONG_ALIGN(bitmap_size);
                free_area[i].map = (unsigned int *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}

/*
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 */
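/*
 * For example, with page_cluster set to 4 the faulting offset is
 * rounded down to a multiple of 16 and up to 16 swap entries
 * (including the original one) are queued for asynchronous read-in,
 * stopping early at the end of the swap area, at a free, bad or
 * locked entry, or when too many async pages are already in flight.
 */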
void swapin_readahead(unsigned long entry)
{
        int i;
        struct page *new_page;
        unsigned long offset = SWP_OFFSET(entry);
        struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;

        offset = (offset >> page_cluster) << page_cluster;

        i = 1 << page_cluster;
        do {
                /* Don't read-ahead past the end of the swap area */
                if (offset >= swapdev->max)
                        break;
                /* Don't block on I/O for read-ahead */
                if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
                        break;
                /* Don't read in bad or busy pages */
                if (!swapdev->swap_map[offset])
                        break;
                if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
                        break;
                if (test_bit(offset, swapdev->swap_lockmap))
                        break;

                /* Ok, do the async read-ahead now */
                new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
                if (new_page != NULL)
                        __free_page(new_page);
                offset++;
        } while (--i);
        return;
}

/*
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting.
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access.
 */
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
        pte_t * page_table, unsigned long entry, int write_access)
{
        unsigned long page;
        struct page *page_map = lookup_swap_cache(entry);

        if (!page_map) {
                swapin_readahead(entry);
                page_map = read_swap_cache(entry);
        }
        if (pte_val(*page_table) != entry) {
                if (page_map)
                        free_page_and_swap_cache(page_address(page_map));
                return;
        }
        if (!page_map) {
                set_pte(page_table, BAD_PAGE);
                swap_free(entry);
                oom(tsk);
                return;
        }

        page = page_address(page_map);
        vma->vm_mm->rss++;
        tsk->min_flt++;
        swap_free(entry);

        if (!write_access || is_page_shared(page_map)) {
                set_pte(page_table, mk_pte(page, vma->vm_page_prot));
                return;
        }

        /*
         * The page is unshared and we're going to dirty it - so tear
         * down the swap cache and give exclusive access to the page to
         * this process.
         */
        delete_from_swap_cache(page_map);
        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
        return;
}