Linux 2.2.0
[davej-history.git] / mm / vmscan.c
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/pgtable.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates they decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
	unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page;
	struct page * page_map;

	pte = *page_table;
	if (!pte_present(pte))
		return 0;
	page = pte_page(pte);
	if (MAP_NR(page) >= max_mapnr)
		return 0;

	page_map = mem_map + MAP_NR(page);
	if (PageReserved(page_map)
	    || PageLocked(page_map)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
		return 0;
	if (pte_young(pte)) {
		/*
		 * Transfer the "accessed" bit from the page
		 * tables to the global page map.
		 */
		set_pte(page_table, pte_mkold(pte));
		set_bit(PG_referenced, &page_map->flags);
		return 0;
	}

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page_map)) {
		entry = page_map->offset;
		swap_duplicate(entry);
		set_pte(page_table, __pte(entry));
drop_pte:
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		__free_page(page_map);
		return 0;
	}
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "shrink_mmap()".
	 */
	if (!pte_dirty(pte)) {
		pte_clear(page_table);
		goto drop_pte;
	}

	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks etc.
	 */
	if (!(gfp_mask & __GFP_IO))
		return 0;
	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we free'd something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in page_map, and just drop the
	 * pte. All the hard work would be done by
	 * shrink_mmap().
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && vma->vm_ops->swapout) {
		pid_t pid = tsk->pid;
		pte_clear(page_table);
		flush_tlb_page(vma, address);
		vma->vm_mm->rss--;

		if (vma->vm_ops->swapout(vma, page_map))
			kill_proc(pid, SIGBUS, 1);
		__free_page(page_map);
		return 1;
	}
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry)
		return 0; /* No swap space left */

	vma->vm_mm->rss--;
	tsk->nswap++;
	set_pte(page_table, __pte(entry));
	flush_tlb_page(vma, address);
	swap_duplicate(entry);	/* One for the process, one for the swap cache */
	add_to_swap_cache(page_map, entry);
	/* We checked we were unlocked way up above, and we
	   have been careful not to stall until here */
	set_bit(PG_locked, &page_map->flags);

	/* OK, do a physical asynchronous write to swap. */
	rw_swap_page(WRITE, entry, (char *) page, 0);

	__free_page(page_map);
	return 1;
}
/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process recently had, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste on the
 *       swap block search, not a hint of how many blocks to swap from
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
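/*
 * Walk the ptes mapped by a single pmd entry, remembering where to
 * resume in tsk->swap_address, and stop at the first page we make
 * progress on.
 */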
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		tsk->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
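/* Walk the pmds covered by one page-directory entry, clipped to this vma. */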
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
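/* Try to swap something out of a single vma, one page-directory entry at a time. */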
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas like shared memory which have their
	   own separate swapping mechanism or areas which are locked down */
	if (vma->vm_flags & (VM_SHM | VM_LOCKED))
		return 0;

	pgdir = pgd_offset(tsk->mm, address);

	end = vma->vm_end;
	while (address < end) {
		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}
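/*
 * Resume the scan of this process at p->swap_address and walk its vmas.
 * If nothing can be freed, reset swap_cnt and swap_address so that the
 * next call picks another task.
 */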
static int swap_out_process(struct task_struct * p, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = p->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(p->mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(p, vma, address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}

	/* We didn't find anything for the process */
	p->swap_cnt = 0;
	p->swap_address = 0;
	return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p, * pbest;
	int counter, assign, max_cnt;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign swap_cnt from rss, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out.  If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
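	/*
	 * The priority handed down from do_try_to_free_pages() runs from
	 * 6 (gentle) to 0 (desperate): the lower it gets, the more tasks
	 * we are willing to examine in one call.
	 */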
	counter = nr_tasks / (priority+1);
	if (counter < 1)
		counter = 1;
	if (counter > nr_tasks)
		counter = nr_tasks;

	for (; counter >= 0; counter--) {
		assign = 0;
		max_cnt = 0;
		pbest = NULL;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			if (!p->swappable)
				continue;
			if (p->mm->rss <= 0)
				continue;
			/* Refresh swap_cnt? */
			if (assign)
				p->swap_cnt = p->mm->rss;
			if (p->swap_cnt > max_cnt) {
				max_cnt = p->swap_cnt;
				pbest = p;
			}
		}
		read_unlock(&tasklist_lock);
		if (!pbest) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			goto out;
		}

		if (swap_out_process(pbest, gfp_mask))
			return 1;
	}
out:
	return 0;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
	int priority;
	int count = SWAP_CLUSTER_MAX;

	lock_kernel();

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);
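	/*
	 * Work through the reclaim stages at decreasing priority: first the
	 * page cache (shrink_mmap), then SysV shared memory (shm_swap),
	 * then process pages (swap_out), and finally the dentry cache.
	 */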
	priority = 6;
	do {
		while (shrink_mmap(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		/* Try to get rid of some shared memory pages.. */
		if (gfp_mask & __GFP_IO) {
			while (shm_swap(priority, gfp_mask)) {
				if (!--count)
					goto done;
			}
		}

		/* Then, try to page stuff out.. */
		while (swap_out(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		shrink_dcache_memory(priority, gfp_mask);
	} while (--priority >= 0);
done:
	unlock_kernel();

	return priority >= 0;
}
/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message).  It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
	int i;
	char *revision="$Revision: 1.5 $", *s, *e;

	swap_setup();
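	/* Pull the version number out of the RCS $Revision$ keyword for the boot message. */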
	if ((s = strchr(revision, ':')) &&
	    (e = strchr(s, '$')))
		s++, i = e - s;
	else
		s = revision, i = -1;
	printk ("Starting kswapd v%.*s\n", i, s);
}
static struct task_struct *kswapd_process;

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically executes once a second, trickling out pages
 * so that we have _some_ free memory available even if there
 * is no other activity that frees anything up. This is needed
 * for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot
 * page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	kswapd_process = tsk;
	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__get_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
	while (1) {
		/*
		 * Wake up once a second to see if we need to make
		 * more memory available.
		 *
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		do {
			if (nr_free_pages >= freepages.high)
				break;

			if (!do_try_to_free_pages(GFP_KSWAPD))
				break;
		} while (!tsk->need_resched);
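		/* Start any queued disk I/O, then doze for up to a second. */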
		run_task_queue(&tq_disk);
		tsk->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ);
	}
}
/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int retval = 1;

	wake_up_process(kswapd_process);
	if (gfp_mask & __GFP_WAIT)
		retval = do_try_to_free_pages(gfp_mask);
	return retval;
}