/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/pgtable.h>

/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
        unsigned long address, pte_t * page_table, int gfp_mask)
{
        pte_t pte;
        unsigned long entry;
        unsigned long page_addr;
        struct page * page;

        pte = *page_table;
        if (!pte_present(pte))
                goto out_failed;
        page_addr = pte_page(pte);
        if (MAP_NR(page_addr) >= max_mapnr)
                goto out_failed;

        page = mem_map + MAP_NR(page_addr);
        write_lock(&tsk->mm->page_table_lock);
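        /*
         * Re-check the pte under the page table lock: the mapping may
         * have changed while we looked at it without the lock held.
         */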
        if (pte_val(pte) != pte_val(*page_table))
                goto out_failed_unlock;

        /*
         * Don't be too eager to get aging right if
         * memory is dangerously low.
         */
        if (!low_on_memory && pte_young(pte)) {
                /*
                 * Transfer the "accessed" bit from the page
                 * tables to the global page map.
                 */
                set_pte(page_table, pte_mkold(pte));
                set_bit(PG_referenced, &page->flags);
                goto out_failed_unlock;
        }
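
        /*
         * Leave reserved and locked pages alone, and skip pages that
         * cannot satisfy a __GFP_DMA allocation.
         */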
        if (PageReserved(page)
            || PageLocked(page)
            || ((gfp_mask & __GFP_DMA) && !PageDMA(page)))
                goto out_failed_unlock;

        /*
         * Is the page already in the swap cache? If so, then
         * we can just drop our reference to it without doing
         * any IO - it's already up-to-date on disk.
         *
         * Return 0, as we didn't actually free any real
         * memory, and we should just continue our scan.
         */
        if (PageSwapCache(page)) {
                entry = page->offset;
                swap_duplicate(entry);
                set_pte(page_table, __pte(entry));
        drop_pte:
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                __free_page(page);
                goto out_failed_unlock;
        }

        /*
         * Is it a clean page? Then it must be recoverable
         * by just paging it in again, and we can just drop
         * it..
         *
         * However, this won't actually free any real
         * memory, as the page will just be in the page cache
         * somewhere, and as such we should just continue
         * our scan.
         *
         * Basically, this just makes it possible for us to do
         * some real work in the future in "shrink_mmap()".
         */
        if (!pte_dirty(pte)) {
                pte_clear(page_table);
                goto drop_pte;
        }

        /*
         * Don't go down into the swap-out stuff if
         * we cannot do I/O! Avoid recursing on FS
         * locks etc.
         */
        if (!(gfp_mask & __GFP_IO))
                goto out_failed_unlock;

        /*
         * Ok, it's really dirty. That means that
         * we should either create a new swap cache
         * entry for it, or we should write it back
         * to its own backing store.
         *
         * Note that in neither case do we actually
         * know that we make a page available, but
         * as we potentially sleep we can no longer
         * continue scanning, so we might as well
         * assume we freed something.
         *
         * NOTE NOTE NOTE! This should just set a
         * dirty bit in 'page', and just drop the
         * pte. All the hard work would be done by
         * shrink_mmap().
         *
         * That would get rid of a lot of problems.
         */
        flush_cache_page(vma, address);
        if (vma->vm_ops && vma->vm_ops->swapout) {
                pid_t pid = tsk->pid;
                pte_clear(page_table);
                write_unlock(&tsk->mm->page_table_lock);
                flush_tlb_page(vma, address);
                vma->vm_mm->rss--;

                if (vma->vm_ops->swapout(vma, page))
                        kill_proc(pid, SIGBUS, 1);
                goto out_free_success;
        }

        /*
         * This is a dirty, swappable page. First of all,
         * get a suitable swap entry for it, and make sure
         * we have the swap cache set up to associate the
         * page with that swap entry.
         */
        entry = get_swap_page();
        if (!entry)
                goto out_failed_unlock;         /* No swap space left */

        vma->vm_mm->rss--;
        tsk->mm->nswap++;
        set_pte(page_table, __pte(entry));
        write_unlock(&tsk->mm->page_table_lock);

        flush_tlb_page(vma, address);
        swap_duplicate(entry);  /* One for the process, one for the swap cache */

        /* This will also lock the page */
        add_to_swap_cache(page, entry);

        /* OK, do a physical asynchronous write to swap. */
        rw_swap_page(WRITE, page, 0);

out_free_success:
        __free_page(page);
        return 1;
out_failed_unlock:
        write_unlock(&tsk->mm->page_table_lock);
out_failed:
        return 0;
}

/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste on the
 *       swap block search, not a hint of how many blocks to swap for
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
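
/*
 * The swap-out walkers below go swap_out() -> swap_out_process() ->
 * swap_out_vma() -> swap_out_pgd() -> swap_out_pmd() -> try_to_swap_out(),
 * scanning a process' page tables one pte at a time and remembering the
 * next address in mm->swap_address so a later call resumes where this
 * one left off.
 */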

static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
        pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
                pmd_clear(dir);
                return 0;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                int result;
                tsk->mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
                if (result)
                        return result;
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}

static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
        pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (end > pgd_end)
                end = pgd_end;

        do {
                int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
                if (result)
                        return result;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}

static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, int gfp_mask)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas which are locked down */
        if (vma->vm_flags & VM_LOCKED)
                return 0;

        pgdir = pgd_offset(tsk->mm, address);

        end = vma->vm_end;
        while (address < end) {
                int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
                if (result)
                        return result;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        }
        return 0;
}

static int swap_out_process(struct task_struct * p, int gfp_mask)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = p->mm->swap_address;

        /*
         * Find the proper vm-area
         */
        vma = find_vma(p->mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        int result = swap_out_vma(p, vma, address, gfp_mask);
                        if (result)
                                return result;
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        address = vma->vm_start;
                }
        }

        /* We didn't find anything for the process */
        p->mm->swap_cnt = 0;
        p->mm->swap_address = 0;
        return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
        struct task_struct * p, * pbest;
        int counter, assign, max_cnt;

        /*
         * We make one or two passes through the task list, indexed by
         * assign = {0, 1}:
         *   Pass 1: select the swappable task with maximal RSS that has
         *           not yet been swapped out.
         *   Pass 2: re-assign rss swap_cnt values, then select as above.
         *
         * With this approach, there's no need to remember the last task
         * swapped out.  If the swap-out fails, we clear swap_cnt so the
         * task won't be selected again until all others have been tried.
         *
         * Think of swap_cnt as a "shadow rss" - it tells us which process
         * we want to page out (always try largest first).
         */
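        /*
         * A lower numerical priority means more urgency: scan a larger
         * share of the task list before giving up.
         */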
        counter = nr_tasks / (priority+1);
        if (counter < 1)
                counter = 1;
        if (counter > nr_tasks)
                counter = nr_tasks;

        for (; counter >= 0; counter--) {
                assign = 0;
                max_cnt = 0;
                pbest = NULL;
        select:
                read_lock(&tasklist_lock);
                p = init_task.next_task;
                for (; p != &init_task; p = p->next_task) {
                        if (!p->mm->swappable)
                                continue;
                        if (p->mm->rss <= 0)
                                continue;
                        /* Refresh swap_cnt? */
                        if (assign)
                                p->mm->swap_cnt = p->mm->rss;
                        if (p->mm->swap_cnt > max_cnt) {
                                max_cnt = p->mm->swap_cnt;
                                pbest = p;
                        }
                }
                read_unlock(&tasklist_lock);
                if (!pbest) {
                        if (!assign) {
                                assign = 1;
                                goto select;
                        }
                        goto out;
                }

                if (swap_out_process(pbest, gfp_mask))
                        return 1;
        }
out:
        return 0;
}

/*
 * We need to make the locks finer-grained, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
        int priority;
        int count = SWAP_CLUSTER_MAX;

        lock_kernel();

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(gfp_mask);
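
        /*
         * Work from the least aggressive priority (6) down towards 0,
         * stopping as soon as "count" pages have been freed.
         */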
        priority = 6;
        do {
                while (shrink_mmap(priority, gfp_mask)) {
                        if (!--count)
                                goto done;
                }

                /* Try to get rid of some shared memory pages.. */
                if (gfp_mask & __GFP_IO) {
                        while (shm_swap(priority, gfp_mask)) {
                                if (!--count)
                                        goto done;
                        }
                }

                /* Then, try to page stuff out.. */
                while (swap_out(priority, gfp_mask)) {
                        if (!--count)
                                goto done;
                }

                shrink_dcache_memory(priority, gfp_mask);
        } while (--priority >= 0);
done:
        unlock_kernel();

        return priority >= 0;
}

/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message).  It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
        int i;
        char *revision = "$Revision: 1.5 $", *s, *e;

        swap_setup();
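
        /*
         * Pull the version number out of the CVS $Revision$ keyword
         * for the boot message; fall back to printing the raw string
         * if the keyword is malformed.
         */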
        if ((s = strchr(revision, ':')) &&
            (e = strchr(s, '$')))
                s++, i = e - s;
        else
                s = revision, i = -1;
        printk ("Starting kswapd v%.*s\n", i, s);
}

static struct task_struct *kswapd_process;

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically executes once a second, trickling out pages
 * so that we have _some_ free memory available even if there
 * is no other activity that frees anything up. This is needed
 * for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot
 * page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
        struct task_struct *tsk = current;

        kswapd_process = tsk;
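        /*
         * Give the daemon a recognizable identity: put it in init's
         * session and process group, name it "kswapd", and block all
         * signals.
         */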
        tsk->session = 1;
        tsk->pgrp = 1;
        strcpy(tsk->comm, "kswapd");
        sigfillset(&tsk->blocked);

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__get_free_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC;

        while (1) {
                /*
                 * Wake up once a second to see if we need to make
                 * more memory available.
                 *
                 * If we actually get into a low-memory situation,
                 * the processes needing more memory will wake us
                 * up on a more timely basis.
                 */
                do {
                        if (nr_free_pages >= freepages.high)
                                break;

                        if (!do_try_to_free_pages(GFP_KSWAPD))
                                break;
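                        /*
                         * Kick the disk queue so the swap writes we just
                         * queued actually get submitted to the device.
                         */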
                        run_task_queue(&tq_disk);
                } while (!tsk->need_resched);
                tsk->state = TASK_INTERRUPTIBLE;
                schedule_timeout(HZ);
        }
}

/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
        int retval = 1;

        wake_up_process(kswapd_process);
        if (gfp_mask & __GFP_WAIT)
                retval = do_try_to_free_pages(gfp_mask);
        return retval;
}