Import 2.3.13pre3
[davej-history.git] / mm / vmscan.c
blob73a4c912a029985cc65186e39e4be7b68a071229
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
13 #include <linux/slab.h>
14 #include <linux/kernel_stat.h>
15 #include <linux/swap.h>
16 #include <linux/swapctl.h>
17 #include <linux/smp_lock.h>
18 #include <linux/pagemap.h>
19 #include <linux/init.h>
21 #include <asm/pgtable.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased, but the page was shared.
 *
 * NOTE! If a swap-out function sleeps, it *must* return 1 to
 * make sure we don't continue with the swap-out. Otherwise we
 * may be using a process that no longer actually exists (it
 * might have died while we slept).
 */
34 static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
36 pte_t pte;
37 unsigned long entry;
38 unsigned long page_addr;
39 struct page * page;
41 pte = *page_table;
42 if (!pte_present(pte))
43 goto out_failed;
44 page_addr = pte_page(pte);
45 if (MAP_NR(page_addr) >= max_mapnr)
46 goto out_failed;
48 page = mem_map + MAP_NR(page_addr);
49 spin_lock(&vma->vm_mm->page_table_lock);
50 if (pte_val(pte) != pte_val(*page_table))
51 goto out_failed_unlock;
54 * Dont be too eager to get aging right if
55 * memory is dangerously low.
57 if (!low_on_memory && pte_young(pte)) {
59 * Transfer the "accessed" bit from the page
60 * tables to the global page map.
62 set_pte(page_table, pte_mkold(pte));
63 set_bit(PG_referenced, &page->flags);
64 goto out_failed_unlock;
67 if (PageReserved(page)
68 || PageLocked(page)
69 || ((gfp_mask & __GFP_DMA) && !PageDMA(page)))
70 goto out_failed_unlock;
73 * Is the page already in the swap cache? If so, then
74 * we can just drop our reference to it without doing
75 * any IO - it's already up-to-date on disk.
77 * Return 0, as we didn't actually free any real
78 * memory, and we should just continue our scan.
80 if (PageSwapCache(page)) {
81 entry = page->offset;
82 swap_duplicate(entry);
83 set_pte(page_table, __pte(entry));
84 drop_pte:
85 vma->vm_mm->rss--;
86 flush_tlb_page(vma, address);
87 __free_page(page);
88 goto out_failed_unlock;
92 * Is it a clean page? Then it must be recoverable
93 * by just paging it in again, and we can just drop
94 * it..
96 * However, this won't actually free any real
97 * memory, as the page will just be in the page cache
98 * somewhere, and as such we should just continue
99 * our scan.
101 * Basically, this just makes it possible for us to do
102 * some real work in the future in "shrink_mmap()".
104 if (!pte_dirty(pte)) {
105 pte_clear(page_table);
106 goto drop_pte;
110 * Don't go down into the swap-out stuff if
111 * we cannot do I/O! Avoid recursing on FS
112 * locks etc.
114 if (!(gfp_mask & __GFP_IO))
115 goto out_failed_unlock;
118 * Ok, it's really dirty. That means that
119 * we should either create a new swap cache
120 * entry for it, or we should write it back
121 * to its own backing store.
123 * Note that in neither case do we actually
124 * know that we make a page available, but
125 * as we potentially sleep we can no longer
126 * continue scanning, so we migth as well
127 * assume we free'd something.
129 * NOTE NOTE NOTE! This should just set a
130 * dirty bit in 'page', and just drop the
131 * pte. All the hard work would be done by
132 * shrink_mmap().
134 * That would get rid of a lot of problems.
136 flush_cache_page(vma, address);
137 if (vma->vm_ops && vma->vm_ops->swapout) {
138 int error;
139 pte_clear(page_table);
140 spin_unlock(&vma->vm_mm->page_table_lock);
141 flush_tlb_page(vma, address);
142 vma->vm_mm->rss--;
143 error = vma->vm_ops->swapout(vma, page);
144 if (!error)
145 goto out_free_success;
146 __free_page(page);
147 return error;
151 * This is a dirty, swappable page. First of all,
152 * get a suitable swap entry for it, and make sure
153 * we have the swap cache set up to associate the
154 * page with that swap entry.
156 entry = get_swap_page();
157 if (!entry)
158 goto out_failed_unlock; /* No swap space left */
160 vma->vm_mm->rss--;
161 set_pte(page_table, __pte(entry));
162 spin_unlock(&vma->vm_mm->page_table_lock);
164 flush_tlb_page(vma, address);
165 swap_duplicate(entry); /* One for the process, one for the swap cache */
167 /* This will also lock the page */
168 add_to_swap_cache(page, entry);
170 /* OK, do a physical asynchronous write to swap. */
171 rw_swap_page(WRITE, page, 0);
173 out_free_success:
174 __free_page(page);
175 return 1;
176 out_failed_unlock:
177 spin_unlock(&vma->vm_mm->page_table_lock);
178 out_failed:
179 return 0;
/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste on the
 * swap block search, not a hint on how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
196 static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
198 pte_t * pte;
199 unsigned long pmd_end;
201 if (pmd_none(*dir))
202 return 0;
203 if (pmd_bad(*dir)) {
204 printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
205 pmd_clear(dir);
206 return 0;
209 pte = pte_offset(dir, address);
211 pmd_end = (address + PMD_SIZE) & PMD_MASK;
212 if (end > pmd_end)
213 end = pmd_end;
215 do {
216 int result;
217 vma->vm_mm->swap_address = address + PAGE_SIZE;
218 result = try_to_swap_out(vma, address, pte, gfp_mask);
219 if (result)
220 return result;
221 address += PAGE_SIZE;
222 pte++;
223 } while (address < end);
224 return 0;
227 static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
229 pmd_t * pmd;
230 unsigned long pgd_end;
232 if (pgd_none(*dir))
233 return 0;
234 if (pgd_bad(*dir)) {
235 printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
236 pgd_clear(dir);
237 return 0;
240 pmd = pmd_offset(dir, address);
242 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
243 if (end > pgd_end)
244 end = pgd_end;
246 do {
247 int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
248 if (result)
249 return result;
250 address = (address + PMD_SIZE) & PMD_MASK;
251 pmd++;
252 } while (address < end);
253 return 0;
256 static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
258 pgd_t *pgdir;
259 unsigned long end;
261 /* Don't swap out areas which are locked down */
262 if (vma->vm_flags & VM_LOCKED)
263 return 0;
265 pgdir = pgd_offset(vma->vm_mm, address);
267 end = vma->vm_end;
268 while (address < end) {
269 int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
270 if (result)
271 return result;
272 address = (address + PGDIR_SIZE) & PGDIR_MASK;
273 pgdir++;
275 return 0;
278 static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
280 unsigned long address;
281 struct vm_area_struct* vma;
284 * Go through process' page directory.
286 address = mm->swap_address;
289 * Find the proper vm-area
291 vma = find_vma(mm, address);
292 if (vma) {
293 if (address < vma->vm_start)
294 address = vma->vm_start;
296 for (;;) {
297 int result = swap_out_vma(vma, address, gfp_mask);
298 if (result)
299 return result;
300 vma = vma->vm_next;
301 if (!vma)
302 break;
303 address = vma->vm_start;
307 /* We didn't find anything for the process */
308 mm->swap_cnt = 0;
309 mm->swap_address = 0;
310 return 0;
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B.  This function returns only 0 or 1.  A zero return from the
 * lower level routines results in continued processing; any non-zero
 * return ends the scan.
 */
318 static int swap_out(unsigned int priority, int gfp_mask)
320 struct task_struct * p;
321 int counter;
324 * We make one or two passes through the task list, indexed by
325 * assign = {0, 1}:
326 * Pass 1: select the swappable task with maximal RSS that has
327 * not yet been swapped out.
328 * Pass 2: re-assign rss swap_cnt values, then select as above.
330 * With this approach, there's no need to remember the last task
331 * swapped out. If the swap-out fails, we clear swap_cnt so the
332 * task won't be selected again until all others have been tried.
334 * Think of swap_cnt as a "shadow rss" - it tells us which process
335 * we want to page out (always try largest first).
337 counter = nr_threads / (priority+1);
338 if (counter < 1)
339 counter = 1;
340 if (counter > nr_threads)
341 counter = nr_threads;
343 for (; counter >= 0; counter--) {
344 int assign = 0;
345 int max_cnt = 0;
346 struct mm_struct *best = NULL;
347 int pid = 0;
348 select:
349 read_lock(&tasklist_lock);
350 p = init_task.next_task;
351 for (; p != &init_task; p = p->next_task) {
352 struct mm_struct *mm = p->mm;
353 if (!p->swappable || !mm)
354 continue;
355 if (mm->rss <= 0)
356 continue;
357 /* Refresh swap_cnt? */
358 if (assign)
359 mm->swap_cnt = mm->rss;
360 if (mm->swap_cnt > max_cnt) {
361 max_cnt = mm->swap_cnt;
362 best = mm;
363 pid = p->pid;
366 read_unlock(&tasklist_lock);
367 if (!best) {
368 if (!assign) {
369 assign = 1;
370 goto select;
372 goto out;
373 } else {
374 int ret;
376 atomic_inc(&best->mm_count);
377 ret = swap_out_mm(best, gfp_mask);
378 mmdrop(best);
380 if (!ret)
381 continue;
383 if (ret < 0)
384 kill_proc(pid, SIGBUS, 1);
385 return 1;
388 out:
389 return 0;
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
401 static int do_try_to_free_pages(unsigned int gfp_mask)
403 int priority;
404 int count = SWAP_CLUSTER_MAX;
406 lock_kernel();
408 /* Always trim SLAB caches when memory gets low. */
409 kmem_cache_reap(gfp_mask);
411 priority = 6;
412 do {
413 while (shrink_mmap(priority, gfp_mask)) {
414 if (!--count)
415 goto done;
418 /* Try to get rid of some shared memory pages.. */
419 if (gfp_mask & __GFP_IO) {
420 while (shm_swap(priority, gfp_mask)) {
421 if (!--count)
422 goto done;
426 /* Then, try to page stuff out.. */
427 while (swap_out(priority, gfp_mask)) {
428 if (!--count)
429 goto done;
432 shrink_dcache_memory(priority, gfp_mask);
433 } while (--priority >= 0);
434 done:
435 unlock_kernel();
437 return priority >= 0;
/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message).  It looks very bad when that happens.
 */
446 void __init kswapd_setup(void)
448 int i;
449 char *revision="$Revision: 1.5 $", *s, *e;
451 swap_setup();
453 if ((s = strchr(revision, ':')) &&
454 (e = strchr(s, '$')))
455 s++, i = e - s;
456 else
457 s = revision, i = -1;
458 printk ("Starting kswapd v%.*s\n", i, s);
461 static struct task_struct *kswapd_process;
464 * The background pageout daemon, started as a kernel thread
465 * from the init process.
467 * This basically executes once a second, trickling out pages
468 * so that we have _some_ free memory available even if there
469 * is no other activity that frees anything up. This is needed
470 * for things like routing etc, where we otherwise might have
471 * all activity going on in asynchronous contexts that cannot
472 * page things out.
474 * If there are applications that are active memory-allocators
475 * (most normal use), this basically shouldn't matter.
477 int kswapd(void *unused)
479 struct task_struct *tsk = current;
481 kswapd_process = tsk;
482 tsk->session = 1;
483 tsk->pgrp = 1;
484 strcpy(tsk->comm, "kswapd");
485 sigfillset(&tsk->blocked);
488 * Tell the memory management that we're a "memory allocator",
489 * and that if we need more memory we should get access to it
490 * regardless (see "__get_free_pages()"). "kswapd" should
491 * never get caught in the normal page freeing logic.
493 * (Kswapd normally doesn't need memory anyway, but sometimes
494 * you need a small amount of memory in order to be able to
495 * page out something else, and this flag essentially protects
496 * us from recursively trying to free more memory as we're
497 * trying to free the first piece of memory in the first place).
499 tsk->flags |= PF_MEMALLOC;
501 while (1) {
503 * Wake up once a second to see if we need to make
504 * more memory available.
506 * If we actually get into a low-memory situation,
507 * the processes needing more memory will wake us
508 * up on a more timely basis.
510 do {
511 if (nr_free_pages >= freepages.high)
512 break;
514 if (!do_try_to_free_pages(GFP_KSWAPD))
515 break;
516 run_task_queue(&tq_disk);
517 } while (!tsk->need_resched);
518 tsk->state = TASK_INTERRUPTIBLE;
519 schedule_timeout(HZ);
/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking).
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
538 int try_to_free_pages(unsigned int gfp_mask)
540 int retval = 1;
542 wake_up_process(kswapd_process);
543 if (gfp_mask & __GFP_WAIT)
544 retval = do_try_to_free_pages(gfp_mask);
545 return retval;