/*
 * linux/mm/vmscan.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96 sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/bitops.h>
#include <asm/pgtable.h>
/*
 * When are we next due for a page scan?
 */
static unsigned long next_swap_jiffies = 0;

/*
 * How often do we do a pageout scan during normal conditions?
 * Default is four times a second.
 */
int swapout_interval = HZ / 4;

/*
 * The wait queue for waking up the pageout daemon:
 */
struct wait_queue * kswapd_wait = NULL;

static void init_swap_timer(void);
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
	unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page;
	struct page * page_map;

	pte = *page_table;
	if (!pte_present(pte))
		return 0;
	page = pte_page(pte);
	if (MAP_NR(page) >= max_mapnr)
		return 0;

	page_map = mem_map + MAP_NR(page);
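	/* Leave the page alone if it is reserved, currently locked for
	 * I/O, or unsuitable for a DMA-constrained allocation. */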
	if (PageReserved(page_map)
	    || PageLocked(page_map)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
		return 0;

	/*
	 * Deal with page aging. There are several special cases to
	 * consider:
	 *
	 * Page has been accessed, but is swap cached. If the page is
	 * getting sufficiently "interesting" --- its age is getting
	 * high --- then, if we are sufficiently short of free swap
	 * pages, we delete the swap cache. We can only do this if
	 * the swap page's reference count is one: ie. there are no
	 * other references to it beyond the swap cache (as there must
	 * still be PTEs pointing to it if count > 1).
	 *
	 * If the page has NOT been touched, and its age reaches zero,
	 * then we are swapping it out:
	 *
	 * If there is already a swap cache page for this page, then
	 * another process has already allocated swap space, so just
	 * dereference the physical page and copy in the swap entry
	 * from the swap cache.
	 *
	 * Note, we rely on all pages read in from swap either having
	 * the swap cache flag set, OR being marked writable in the pte,
	 * but NEVER BOTH. (It IS legal to be neither cached nor dirty,
	 * however.)
	 *
	 * -- Stephen Tweedie 1998
	 */
	if (PageSwapCache(page_map)) {
		if (pte_write(pte)) {
			struct page *found;
			printk ("VM: Found a writable swap-cached page!\n");
			/* Try to diagnose the problem ... */
			found = find_page(&swapper_inode, page_map->offset);
			if (found) {
				printk("page=%p@%08lx, found=%p, count=%d\n",
					page_map, page_map->offset,
					found, atomic_read(&found->count));
				__free_page(found);
			} else
				printk ("Spurious, page not in cache\n");
			return 0;
		}
	}

	if (pte_young(pte)) {
		set_pte(page_table, pte_mkold(pte));
		touch_page(page_map);
		/*
		 * We should test here to see if we want to recover any
		 * swap cache page here. We do this if the page is seeing
		 * enough activity, AND we are sufficiently low on swap.
		 *
		 * We need to track both the number of available swap
		 * pages and the total number present before we can do
		 * this...
		 */
		return 0;
	}

	age_page(page_map);
	if (page_map->age)
		return 0;
	if (pte_dirty(pte)) {
		if (vma->vm_ops && vma->vm_ops->swapout) {
			pid_t pid = tsk->pid;
			vma->vm_mm->rss--;
			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
				kill_proc(pid, SIGBUS, 1);
		} else {
			/*
			 * This is a dirty, swappable page. First of all,
			 * get a suitable swap entry for it, and make sure
			 * we have the swap cache set up to associate the
			 * page with that swap entry.
			 */
			entry = in_swap_cache(page_map);
			if (!entry) {
				entry = get_swap_page();
				if (!entry)
					return 0; /* No swap space left */
			}

			vma->vm_mm->rss--;
			tsk->nswap++;
			flush_cache_page(vma, address);
			set_pte(page_table, __pte(entry));
			flush_tlb_page(vma, address);
			swap_duplicate(entry);

			/* Now to write back the page. We have two
			 * cases: if the page is already part of the
			 * swap cache, then it is already on disk. Just
			 * free the page and return (we release the swap
			 * cache on the last accessor too).
			 *
			 * If we have made a new swap entry, then we
			 * start the write out to disk. If the page is
			 * shared, however, we still need to keep the
			 * copy in memory, so we add it to the swap
			 * cache. */
			if (PageSwapCache(page_map)) {
				free_page_and_swap_cache(page);
				return (atomic_read(&page_map->count) == 0);
			}
			add_to_swap_cache(page_map, entry);
			/* We checked we were unlocked way up above, and we
			   have been careful not to stall until here */
			set_bit(PG_locked, &page_map->flags);
			/* OK, do a physical write to swap. */
			rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
		}
		/* Now we can free the current physical page. We also
		 * free up the swap cache if this is the last use of the
		 * page. Note that there is a race here: the page may
		 * still be shared COW by another process, but that
		 * process may exit while we are writing out the page
		 * asynchronously. That's no problem, shrink_mmap() can
		 * correctly clean up the occasional unshared page
		 * which gets left behind in the swap cache. */
		free_page_and_swap_cache(page);
		return 1;	/* we slept: the process may not exist any more */
	}
	/* The page was _not_ dirty, but still has a zero age. It must
	 * already be uptodate on disk. If it is in the swap cache,
	 * then we can just unlink the page now. Remove the swap cache
	 * too if this is the last user. */
	if ((entry = in_swap_cache(page_map))) {
		vma->vm_mm->rss--;
		flush_cache_page(vma, address);
		set_pte(page_table, __pte(entry));
		flush_tlb_page(vma, address);
		swap_duplicate(entry);
		free_page_and_swap_cache(page);
		return (atomic_read(&page_map->count) == 0);
	}
	/*
	 * A clean page to be discarded? Must be mmap()ed from
	 * somewhere. Unlink the pte, and tell the filemap code to
	 * discard any cached backing page if this is the last user.
	 */
	if (PageSwapCache(page_map)) {
		printk ("VM: How can this page _still_ be cached?\n");
		return 0;
	}
	vma->vm_mm->rss--;
	flush_cache_page(vma, address);
	pte_clear(page_table);
	flush_tlb_page(vma, address);
	entry = page_unuse(page_map);
	__free_page(page_map);
	return entry;
}
/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults the process recently had, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint of how much CPU to waste on the
 *       swap block search, not a hint of how many blocks to swap with
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
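		/* Record where the next scan should resume before we
		 * possibly sleep inside try_to_swap_out(). */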
		tsk->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *pgdir, unsigned long start, int gfp_mask)
{
	unsigned long end;

	/* Don't swap out areas like shared memory which have their
	   own separate swapping mechanism or areas which are locked down */
	if (vma->vm_flags & (VM_SHM | VM_LOCKED))
		return 0;

	end = vma->vm_end;
	while (start < end) {
		int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask);
		if (result)
			return result;
		start = (start + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}
static int swap_out_process(struct task_struct * p, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = p->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(p->mm, address);
	if (!vma) {
		p->swap_address = 0;
		return 0;
	}
	if (address < vma->vm_start)
		address = vma->vm_start;

	for (;;) {
		int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask);
		if (result)
			return result;
		vma = vma->vm_next;
		if (!vma)
			break;
		address = vma->vm_start;
	}
	p->swap_address = 0;
	return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p, * pbest;
	int counter, assign, max_cnt;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal swap_cnt.
	 *   Pass 2: assign new swap_cnt values, then select as above.
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 */
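	/* The scan budget scales with the number of tasks and shrinks
	 * exponentially as the priority value rises (6 = least urgent,
	 * 0 = most urgent). */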
	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
	for (; counter >= 0; counter--) {
		assign = 0;
		max_cnt = 0;
		pbest = NULL;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			if (!p->swappable)
				continue;
			if (p->mm->rss <= 0)
				continue;
			if (assign) {
				/*
				 * If we didn't select a task on pass 1,
				 * assign each task a new swap_cnt.
				 * Normalise the number of pages swapped
				 * by multiplying by (RSS / 1MB)
				 */
				p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
			}
			if (p->swap_cnt > max_cnt) {
				max_cnt = p->swap_cnt;
				pbest = p;
			}
		}
		read_unlock(&tasklist_lock);
		if (!pbest) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			goto out;
		}
		pbest->swap_cnt--;

		switch (swap_out_process(pbest, gfp_mask)) {
		case 0:
			/*
			 * Clear swap_cnt so we don't look at this task
			 * again until we've tried all of the others.
			 * (We didn't block, so the task is still here.)
			 */
			pbest->swap_cnt = 0;
			break;
		case 1:
			return 1;
		default:
			break;
		}
	}
out:
	return 0;
}
/*
 * We are much more aggressive about trying to swap out than we used
 * to be. This works out OK, because we now do proper aging on page
 * contents.
 */
static int do_try_to_free_page(int gfp_mask)
{
	static int state = 0;
	int i=6;
	int stop;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/* We try harder if we are waiting .. */
	stop = 3;
	if (gfp_mask & __GFP_WAIT)
		stop = 0;

	if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
	    || (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
		shrink_mmap(i, gfp_mask);
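	/* The case labels below jump into the middle of the do-while
	 * loop: the static 'state' remembers which pool we tried last,
	 * so successive calls rotate between the page cache, SysV shared
	 * memory, process pages and the dentry cache instead of always
	 * starting from the same place. */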
	switch (state) {
		do {
		case 0:
			if (shrink_mmap(i, gfp_mask))
				return 1;
			state = 1;
		case 1:
			if (shm_swap(i, gfp_mask))
				return 1;
			state = 2;
		case 2:
			if (swap_out(i, gfp_mask))
				return 1;
			state = 3;
		case 3:
			shrink_dcache_memory(i, gfp_mask);
			state = 0;
			i--;
		} while ((i - stop) >= 0);
	}
	return 0;
}
/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message). It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
	int i;
	char *revision="$Revision: 1.5 $", *s, *e;

	if ((s = strchr(revision, ':')) &&
	    (e = strchr(s, '$')))
		s++, i = e - s;
	else
		s = revision, i = -1;

	printk ("Starting kswapd v%.*s\n", i, s);
}
/*
 * The background pageout daemon.
 * Started as a kernel thread from the init process.
 */
int kswapd(void *unused)
{
	struct wait_queue wait = { current, NULL };

	current->session = 1;
	current->pgrp = 1;
	sprintf(current->comm, "kswapd");
	sigfillset(&current->blocked);

	/*
	 * As a kernel thread we want to tamper with system buffers
	 * and other internals and thus be subject to the SMP locking
	 * rules. (On a uniprocessor box this does nothing).
	 */
	lock_kernel();

	/* Give kswapd a realtime priority. */
	current->policy = SCHED_FIFO;
	current->rt_priority = 32;	/* Fixme --- we need to standardise our
					   namings for POSIX.4 realtime scheduling
					   priorities. */

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "try_to_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	current->flags |= PF_MEMALLOC;

	init_swap_timer();
	add_wait_queue(&kswapd_wait, &wait);
	while (1) {
		int tries;

		current->state = TASK_INTERRUPTIBLE;
		flush_signals(current);
		run_task_queue(&tq_disk);
		schedule();
		swapstats.wakeups++;
		/*
		 * Do the background pageout: be
		 * more aggressive if we're really
		 * low on free memory.
		 *
		 * We try pager_daemon.tries_base times, divided by
		 * an 'urgency factor'. In practice this will mean
		 * a value of pager_daemon.tries_base / 8 or 4 = 64
		 * or 128 pages at a time.
		 * This gives us 64 (or 128) * 4k * 4 (times/sec) =
		 * 1 (or 2) MB/s swapping bandwidth in low-priority
		 * background paging. This number rises to 8 MB/s
		 * when the priority is highest (but then we'll be
		 * woken up more often and the rate will be even
		 * higher).
		 */
		tries = pager_daemon.tries_base;
		tries >>= 4*free_memory_available();
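		/* Each level of free-memory headroom reported by
		 * free_memory_available() divides the number of attempts
		 * by 16 (shift right by 4 per level). */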
		do {
			do_try_to_free_page(0);
			/*
			 * Syncing large chunks is faster than swapping
			 * synchronously (less head movement). -- Rik.
			 */
			if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
				run_task_queue(&tq_disk);
			if (free_memory_available() > 1)
				break;
		} while (--tries > 0);
	}
	/* As if we could ever get here - maybe we want to make this killable */
	remove_wait_queue(&kswapd_wait, &wait);
	unlock_kernel();
	return 0;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * The "PF_MEMALLOC" flag protects us against recursion:
 * if we need more memory as part of a swap-out effort we
 * will just silently return "success" to tell the page
 * allocator to accept the allocation.
 */
int try_to_free_pages(unsigned int gfp_mask, int count)
{
	int retval = 1;

	lock_kernel();
	if (!(current->flags & PF_MEMALLOC)) {
		current->flags |= PF_MEMALLOC;
		do {
			retval = do_try_to_free_page(gfp_mask);
			if (!retval)
				break;
			count--;
		} while (count > 0);
		current->flags &= ~PF_MEMALLOC;
	}
	unlock_kernel();
	return retval;
}
/*
 * The swap_tick function gets called on every clock tick.
 */
void swap_tick(void)
{
	unsigned long now, want;
	int want_wakeup = 0;

	want = next_swap_jiffies;
	now = jiffies;

	/*
	 * Examine the memory queues. Mark memory low
	 * if there is nothing available in the three
	 * highest queues.
	 *
	 * Schedule for wakeup if there isn't lots
	 * of free memory.
	 */
	switch (free_memory_available()) {
	case 0:
		want = now;
		/* Fall through */
	case 1:
		want_wakeup = 1;
	default:
		break;
	}
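	/* Signed difference, so the comparison stays correct even when
	 * the jiffies counter wraps around. */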
	if ((long) (now - want) >= 0) {
		if (want_wakeup || (num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100
			|| (num_physpages * page_cache.max_percent < page_cache_size * 100)) {
			/* Set the next wake-up time */
			next_swap_jiffies = now + swapout_interval;
			wake_up(&kswapd_wait);
		}
	}
	timer_active |= (1<<SWAP_TIMER);
}
/*
 * Initialise the swap timer
 */
void init_swap_timer(void)
{
	timer_table[SWAP_TIMER].expires = 0;
	timer_table[SWAP_TIMER].fn = swap_tick;
	timer_active |= (1<<SWAP_TIMER);
}