/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/pgtable.h>
/*
 * The wait queue for waking up the pageout daemon:
 */
static struct task_struct * kswapd_task = NULL;

static void init_swap_timer(void);
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page.  It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
        unsigned long address, pte_t * page_table, int gfp_mask)
{
        pte_t pte;
        unsigned long entry;
        unsigned long page;
        struct page * page_map;

        pte = *page_table;
        if (!pte_present(pte))
                return 0;
        page = pte_page(pte);
        if (MAP_NR(page) >= max_mapnr)
                return 0;

        page_map = mem_map + MAP_NR(page);
        if (PageReserved(page_map)
            || PageLocked(page_map)
            || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
                return 0;
        /*
         * Deal with page aging.  There are several special cases to
         * consider:
         *
         * Page has been accessed, but is swap cached.  If the page is
         * getting sufficiently "interesting" --- its age is getting
         * high --- then if we are sufficiently short of free swap
         * pages, then delete the swap cache.  We can only do this if
         * the swap page's reference count is one: ie. there are no
         * other references to it beyond the swap cache (as there must
         * still be PTEs pointing to it if count > 1).
         *
         * If the page has NOT been touched, and its age reaches zero,
         * then we are swapping it out:
         *
         *   If there is already a swap cache page for this page, then
         *   another process has already allocated swap space, so just
         *   dereference the physical page and copy in the swap entry
         *   from the swap cache.
         *
         * Note, we rely on all pages read in from swap either having
         * the swap cache flag set, OR being marked writable in the pte,
         * but NEVER BOTH.  (It IS legal to be neither cached nor dirty,
         * however.)
         *
         * -- Stephen Tweedie 1998
         */
        if (PageSwapCache(page_map)) {
                if (pte_write(pte)) {
                        struct page *found;
                        printk ("VM: Found a writable swap-cached page!\n");
                        /* Try to diagnose the problem ... */
                        found = find_page(&swapper_inode, page_map->offset);
                        if (found) {
                                printk("page=%p@%08lx, found=%p, count=%d\n",
                                        page_map, page_map->offset,
                                        found, atomic_read(&found->count));
                                __free_page(found);
                        } else
                                printk ("Spurious, page not in cache\n");
                        return 0;
                }
        }

        if (pte_young(pte)) {
                /*
                 * Transfer the "accessed" bit from the page
                 * tables to the global page map.
                 */
                set_pte(page_table, pte_mkold(pte));
                set_bit(PG_referenced, &page_map->flags);

                /*
                 * We should test here to see if we want to recover any
                 * swap cache page here.  We do this if the page is seeing
                 * enough activity, AND we are sufficiently low on swap.
                 *
                 * We need to track both the number of available swap
                 * pages and the total number present before we can do
                 * this...
                 */
                return 0;
        }
        if (pte_dirty(pte)) {
                if (vma->vm_ops && vma->vm_ops->swapout) {
                        pid_t pid = tsk->pid;
                        vma->vm_mm->rss--;
                        if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
                                kill_proc(pid, SIGBUS, 1);
                } else {
                        /*
                         * This is a dirty, swappable page.  First of all,
                         * get a suitable swap entry for it, and make sure
                         * we have the swap cache set up to associate the
                         * page with that swap entry.
                         */
                        entry = in_swap_cache(page_map);
                        if (!entry) {
                                entry = get_swap_page();
                                if (!entry)
                                        return 0; /* No swap space left */
                        }

                        vma->vm_mm->rss--;
                        tsk->nswap++;
                        flush_cache_page(vma, address);
                        set_pte(page_table, __pte(entry));
                        flush_tlb_page(vma, address);
                        swap_duplicate(entry);

                        /* Now to write back the page.  We have two
                         * cases: if the page is already part of the
                         * swap cache, then it is already on disk.  Just
                         * free the page and return (we release the swap
                         * cache on the last accessor too).
                         *
                         * If we have made a new swap entry, then we
                         * start the write out to disk.  If the page is
                         * shared, however, we still need to keep the
                         * copy in memory, so we add it to the swap
                         * cache. */
                        if (PageSwapCache(page_map)) {
                                free_page(page);
                                return (atomic_read(&page_map->count) == 0);
                        }
                        add_to_swap_cache(page_map, entry);
                        /* We checked we were unlocked way up above, and we
                           have been careful not to stall until here */
                        set_bit(PG_locked, &page_map->flags);
                        /* OK, do a physical write to swap. */
                        rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
                }
                /* Now we can free the current physical page.  We also
                 * free up the swap cache if this is the last use of the
                 * page.  Note that there is a race here: the page may
                 * still be shared COW by another process, but that
                 * process may exit while we are writing out the page
                 * asynchronously.  That's no problem, shrink_mmap() can
                 * correctly clean up the occasional unshared page
                 * which gets left behind in the swap cache. */
                free_page(page);
                return 1;       /* we slept: the process may not exist any more */
        }
        /* The page was _not_ dirty, but still has a zero age.  It must
         * already be uptodate on disk.  If it is in the swap cache,
         * then we can just unlink the page now.  Remove the swap cache
         * too if this is the last user. */
        if ((entry = in_swap_cache(page_map))) {
                vma->vm_mm->rss--;
                flush_cache_page(vma, address);
                set_pte(page_table, __pte(entry));
                flush_tlb_page(vma, address);
                swap_duplicate(entry);
                free_page(page);
                return (atomic_read(&page_map->count) == 0);
        }
        /*
         * A clean page to be discarded?  Must be mmap()ed from
         * somewhere.  Unlink the pte, and tell the filemap code to
         * discard any cached backing page if this is the last user.
         */
        if (PageSwapCache(page_map)) {
                printk ("VM: How can this page _still_ be cached?");
                return 0;
        }
        vma->vm_mm->rss--;
        flush_cache_page(vma, address);
        pte_clear(page_table);
        flush_tlb_page(vma, address);
        entry = (atomic_read(&page_map->count) == 1);
        __free_page(page_map);
        return entry;
}

/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 * swap block search, not a hint of how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
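/*
 * Scan the ptes mapped by one pmd entry and hand each present page to
 * try_to_swap_out().  The next address to look at is remembered in
 * tsk->swap_address so that a later scan can resume where this one
 * stopped.
 */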
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
        pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
                pmd_clear(dir);
                return 0;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                int result;
                tsk->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
                if (result)
                        return result;
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}

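/*
 * Scan the pmds covered by one pgd entry, delegating the actual work
 * to swap_out_pmd().
 */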
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
        pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (end > pgd_end)
                end = pgd_end;

        do {
                int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
                if (result)
                        return result;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}

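/*
 * Scan a single vma, one page directory entry at a time, starting at
 * 'address'.
 */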
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, int gfp_mask)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas like shared memory which have their
           own separate swapping mechanism or areas which are locked down */
        if (vma->vm_flags & (VM_SHM | VM_LOCKED))
                return 0;

        pgdir = pgd_offset(tsk->mm, address);

        end = vma->vm_end;
        while (address < end) {
                int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
                if (result)
                        return result;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        }
        return 0;
}

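/*
 * Scan one process for a page to swap out, resuming from the address
 * saved in p->swap_address.  Any nonzero result from the lower-level
 * routines is passed straight back; if the whole address space is
 * scanned without success, swap_cnt and swap_address are reset so the
 * next pass starts from the beginning.
 */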
static int swap_out_process(struct task_struct * p, int gfp_mask)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = p->swap_address;

        /*
         * Find the proper vm-area
         */
        vma = find_vma(p->mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        int result = swap_out_vma(p, vma, address, gfp_mask);
                        if (result)
                                return result;
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        address = vma->vm_start;
                }
        }

        /* We didn't find anything for the process */
        p->swap_cnt = 0;
        p->swap_address = 0;
        return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
        struct task_struct * p, * pbest;
        int counter, assign, max_cnt;

        /*
         * We make one or two passes through the task list, indexed by
         * assign = {0, 1}:
         *   Pass 1: select the swappable task with maximal swap_cnt.
         *   Pass 2: assign new swap_cnt values, then select as above.
         * With this approach, there's no need to remember the last task
         * swapped out.  If the swap-out fails, we clear swap_cnt so the
         * task won't be selected again until all others have been tried.
         */
        counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
        for (; counter >= 0; counter--) {
                assign = 0;
                max_cnt = 0;
                pbest = NULL;
        select:
                read_lock(&tasklist_lock);
                p = init_task.next_task;
                for (; p != &init_task; p = p->next_task) {
                        if (!p->swappable)
                                continue;
                        if (p->mm->rss <= 0)
                                continue;
                        if (assign) {
                                /*
                                 * If we didn't select a task on pass 1,
                                 * assign each task a new swap_cnt.
                                 * Normalise the number of pages swapped
                                 * by multiplying by (RSS / 1MB)
                                 */
                                p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
                        }
                        if (p->swap_cnt > max_cnt) {
                                max_cnt = p->swap_cnt;
                                pbest = p;
                        }
                }
                read_unlock(&tasklist_lock);
                if (!pbest) {
                        if (!assign) {
                                assign = 1;
                                goto select;
                        }
                        goto out;
                }
                pbest->swap_cnt--;

                /*
                 * Nonzero means we cleared out something, but only "1" means
                 * that we actually free'd up a page as a result.
                 */
                if (swap_out_process(pbest, gfp_mask) == 1)
                        return 1;
        }
out:
        return 0;
}

/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message).  It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
        int i;
        char *revision = "$Revision: 1.5 $", *s, *e;

        swap_setup();

        if ((s = strchr(revision, ':')) &&
            (e = strchr(s, '$')))
                s++, i = e - s;
        else
                s = revision, i = -1;
        printk ("Starting kswapd v%.*s\n", i, s);
}

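/*
 * Helper for the priority loops below: call "fn" repeatedly until it
 * reports no more progress (returns 0), or until the caller's local
 * "count" budget of freed pages is used up, in which case we jump to
 * the caller's local "done:" label.  Both "count" and "done:" must
 * exist at the expansion site.
 */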
#define free_memory(fn) \
        count++; do { if (!--count) goto done; } while (fn)

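/*
 * Free pages until nr_free_pages is comfortably above freepages.high,
 * or until roughly one hundredth of a second has gone by.  The switch
 * interleaved with the do-while lets a pass start at the stage recorded
 * in kswapd_state (page cache, SHM segments, or process pages) instead
 * of always starting from the beginning.
 */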
static int kswapd_free_pages(int kswapd_state)
{
        unsigned long end_time;

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(0);

        /* max one hundredth of a second */
        end_time = jiffies + (HZ-1)/100;
        do {
                int priority = 5;
                int count = pager_daemon.swap_cluster;

                switch (kswapd_state) {
                        do {
                        default:
                                free_memory(shrink_mmap(priority, 0));
                                kswapd_state++;
                        case 1:
                                free_memory(shm_swap(priority, 0));
                                kswapd_state++;
                        case 2:
                                free_memory(swap_out(priority, 0));
                                shrink_dcache_memory(priority, 0);
                                kswapd_state = 0;
                        } while (--priority >= 0);
                        return kswapd_state;
                }
done:
                if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
                        break;
        } while (time_before_eq(jiffies, end_time));
        return kswapd_state;
}

/*
 * The background pageout daemon.
 * Started as a kernel thread from the init process.
 */
int kswapd(void *unused)
{
        current->session = 1;
        current->pgrp = 1;
        strcpy(current->comm, "kswapd");
        sigfillset(&current->blocked);

        /*
         * As a kernel thread we want to tamper with system buffers
         * and other internals and thus be subject to the SMP locking
         * rules. (On a uniprocessor box this does nothing).
         */
        lock_kernel();

        /*
         * Set the base priority to something smaller than a
         * regular process.  We will scale up the priority
         * dynamically depending on how much memory we need.
         */
        current->priority = (DEF_PRIORITY * 2) / 3;

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "try_to_free_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        current->flags |= PF_MEMALLOC;

        init_swap_timer();
        kswapd_task = current;
        while (1) {
                int state = 0;

                current->state = TASK_INTERRUPTIBLE;
                flush_signals(current);
                run_task_queue(&tq_disk);
                schedule();
                swapstats.wakeups++;
                state = kswapd_free_pages(state);
        }
        /* As if we could ever get here - maybe we want to make this killable */
        kswapd_task = NULL;
        unlock_kernel();
        return 0;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * The "PF_MEMALLOC" flag protects us against recursion:
 * if we need more memory as part of a swap-out effort we
 * will just silently return "success" to tell the page
 * allocator to accept the allocation.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour.  See
 * the "free_memory()" macro for details.
 */
int try_to_free_pages(unsigned int gfp_mask, int count)
{
        int retval;

        lock_kernel();

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(gfp_mask);

        retval = 1;
        if (!(current->flags & PF_MEMALLOC)) {
                int priority;

                current->flags |= PF_MEMALLOC;

                priority = 5;
                do {
                        free_memory(shrink_mmap(priority, gfp_mask));
                        free_memory(shm_swap(priority, gfp_mask));
                        free_memory(swap_out(priority, gfp_mask));
                        shrink_dcache_memory(priority, gfp_mask);
                } while (--priority >= 0);
                retval = 0;
done:
                current->flags &= ~PF_MEMALLOC;
        }
        unlock_kernel();

        return retval;
}

/*
 * Wake up kswapd according to the priority
 *      0 - no wakeup
 *      1 - wake up as a low-priority process
 *      2 - wake up as a normal process
 *      3 - wake up as an almost real-time process
 *
 * This plays mind-games with the "goodness()"
 * function in kernel/sched.c.
 */
static inline void kswapd_wakeup(struct task_struct *p, int priority)
{
        if (priority) {
                p->counter = p->priority << priority;
                wake_up_process(p);
        }
}

/*
 * The swap_tick function gets called on every clock tick.
 */
void swap_tick(void)
{
        struct task_struct *p = kswapd_task;

        /*
         * Only bother to try to wake kswapd up
         * if the task exists and can be woken.
         */
        if (p && (p->state & TASK_INTERRUPTIBLE)) {
                unsigned int pages;
                int want_wakeup;

                /*
                 * Schedule for wakeup if there isn't lots
                 * of free memory or if there is too much
                 * of it used for buffers or pgcache.
                 *
                 * "want_wakeup" is our priority: 0 means
                 * not to wake anything up, while 3 means
                 * that we'd better give kswapd a realtime
                 * priority.
                 */
                want_wakeup = 0;
                pages = nr_free_pages;
                if (pages < freepages.high)
                        want_wakeup = 1;
                if (pages < freepages.low)
                        want_wakeup = 2;
                if (pages < freepages.min)
                        want_wakeup = 3;

                kswapd_wakeup(p, want_wakeup);
        }

        timer_active |= (1<<SWAP_TIMER);
}

/*
 * Initialise the swap timer
 */
void init_swap_timer(void)
{
        timer_table[SWAP_TIMER].expires = jiffies;
        timer_table[SWAP_TIMER].fn = swap_tick;
        timer_active |= (1<<SWAP_TIMER);
}