/*
 * linux/mm/vmscan.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96  sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates they decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
        pte_t pte;
        swp_entry_t entry;
        struct page * page;
        int (*swapout)(struct page *, struct file *);

        pte = *page_table;
        if (!pte_present(pte))
                goto out_failed;
        page = pte_page(pte);
        if ((page-mem_map >= max_mapnr) || PageReserved(page))
                goto out_failed;

        mm->swap_cnt--;

        /* Don't look at this pte if it's been accessed recently. */
        if (pte_young(pte)) {
                /*
                 * Transfer the "accessed" bit from the page
                 * tables to the global page map.
                 */
                set_pte(page_table, pte_mkold(pte));
                set_bit(PG_referenced, &page->flags);
                goto out_failed;
        }

        if (PageLocked(page))
                goto out_failed;

        /*
         * Is the page already in the swap cache? If so, then
         * we can just drop our reference to it without doing
         * any IO - it's already up-to-date on disk.
         *
         * Return 0, as we didn't actually free any real
         * memory, and we should just continue our scan.
         */
        if (PageSwapCache(page)) {
                entry.val = page->index;
                swap_duplicate(entry);
                set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                __free_page(page);
                goto out_failed;
        }

        /*
         * Is it a clean page? Then it must be recoverable
         * by just paging it in again, and we can just drop
         * it..
         *
         * However, this won't actually free any real
         * memory, as the page will just be in the page cache
         * somewhere, and as such we should just continue
         * our scan.
         *
         * Basically, this just makes it possible for us to do
         * some real work in the future in "shrink_mmap()".
         */
        if (!pte_dirty(pte)) {
                flush_cache_page(vma, address);
                pte_clear(page_table);
                goto drop_pte;
        }

        /*
         * Don't go down into the swap-out stuff if
         * we cannot do I/O! Avoid recursing on FS
         * locks etc.
         */
        if (!(gfp_mask & __GFP_IO))
                goto out_failed;

        /*
         * Ok, it's really dirty. That means that
         * we should either create a new swap cache
         * entry for it, or we should write it back
         * to its own backing store.
         *
         * Note that in neither case do we actually
         * know that we make a page available, but
         * as we potentially sleep we can no longer
         * continue scanning, so we might as well
         * assume we freed something.
         *
         * NOTE NOTE NOTE! This should just set a
         * dirty bit in 'page', and just drop the
         * pte. All the hard work would be done by
         * shrink_mmap().
         *
         * That would get rid of a lot of problems.
         */
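        /*
         * If the vma provides its own swapout() operation (e.g. a shared
         * file mapping), the page is written back through the mapping's
         * backing store rather than through the swap cache.
         */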
        flush_cache_page(vma, address);
        if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
                int error;
                struct file *file = vma->vm_file;
                if (file) get_file(file);
                pte_clear(page_table);
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                vmlist_access_unlock(vma->vm_mm);
                error = swapout(page, file);
                if (file) fput(file);
                if (!error)
                        goto out_free_success;
                __free_page(page);
                return error;
        }

        /*
         * This is a dirty, swappable page. First of all,
         * get a suitable swap entry for it, and make sure
         * we have the swap cache set up to associate the
         * page with that swap entry.
         */
        entry = acquire_swap_entry(page);
        if (!entry.val)
                goto out_failed;        /* No swap space left */

        if (!(page = prepare_highmem_swapout(page)))
                goto out_swap_free;

        swap_duplicate(entry);  /* One for the process, one for the swap cache */

        /* This will also lock the page */
        add_to_swap_cache(page, entry);
        /* Put the swap entry into the pte after the page is in swapcache */
        vma->vm_mm->rss--;
        set_pte(page_table, swp_entry_to_pte(entry));
        flush_tlb_page(vma, address);
        vmlist_access_unlock(vma->vm_mm);

        /* OK, do a physical asynchronous write to swap. */
        rw_swap_page(WRITE, page, 0);

out_free_success:
        __free_page(page);
        return 1;
out_swap_free:
        swap_free(entry);
out_failed:
        return 0;
}

/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint about how much CPU to waste on
 * the swap block search, not a hint about how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */

static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                pmd_ERROR(*dir);
                pmd_clear(dir);
                return 0;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;
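
        /*
         * Scan the ptes one page at a time. swap_address records where
         * the next scan should resume; once this mm has used up its
         * swap_cnt quota we stop scanning it.
         */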
        do {
                int result;
                vma->vm_mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        return 0;
}

static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                pgd_ERROR(*dir);
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (pgd_end && (end > pgd_end))
                end = pgd_end;
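
        /* Walk the pmds covering [address, end), one PMD_SIZE step at a time. */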
        do {
                int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas which are locked down */
        if (vma->vm_flags & VM_LOCKED)
                return 0;

        pgdir = pgd_offset(vma->vm_mm, address);

        end = vma->vm_end;
        if (address >= end)
                BUG();
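
        /* Walk every page directory entry that spans this vma. */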
        do {
                int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}

static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
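        /*
         * swap_address is where the previous scan of this mm left off,
         * so successive calls work through the whole address space
         * instead of rescanning the same pages.
         */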
        address = mm->swap_address;

        /*
         * Find the proper vm-area after freezing the vma chain
         * and ptes.
         */
        vmlist_access_lock(mm);
        vma = find_vma(mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        int result = swap_out_vma(mm, vma, address, gfp_mask);
                        if (result)
                                return result;
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        address = vma->vm_start;
                }
        }
        vmlist_access_unlock(mm);

        /* We didn't find anything for the process */
        mm->swap_cnt = 0;
        mm->swap_address = 0;
        return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
int swap_out(unsigned int priority, int gfp_mask)
{
        struct task_struct * p;
        int counter;
        int __ret = 0;
        int assign = 0;

        lock_kernel();
        /*
         * We make one or two passes through the task list, indexed by
         * assign = {0, 1}:
         *   Pass 1: select the swappable task with maximal RSS that has
         *           not yet been swapped out.
         *   Pass 2: re-assign rss swap_cnt values, then select as above.
         *
         * With this approach, there's no need to remember the last task
         * swapped out. If the swap-out fails, we clear swap_cnt so the
         * task won't be selected again until all others have been tried.
         *
         * Think of swap_cnt as a "shadow rss" - it tells us which process
         * we want to page out (always try largest first).
         */
        counter = nr_threads / (priority+1);
        if (counter < 1)
                counter = 1;

        for (; counter >= 0; counter--) {
                unsigned long max_cnt = 0;
                struct mm_struct *best = NULL;
                int pid = 0;
        select:
                read_lock(&tasklist_lock);
                p = init_task.next_task;
                for (; p != &init_task; p = p->next_task) {
                        struct mm_struct *mm = p->mm;
                        p->hog = 0;
                        if (!p->swappable || !mm)
                                continue;
                        if (mm->rss <= 0)
                                continue;
                        /* Refresh swap_cnt? */
                        if (assign == 1)
                                mm->swap_cnt = mm->rss;
                        if (mm->swap_cnt > max_cnt) {
                                max_cnt = mm->swap_cnt;
                                best = mm;
                                pid = p->pid;
                        }
                }
                if (assign == 1) {
                        /* we just assigned swap_cnt, normalise values */
                        assign = 2;
                        p = init_task.next_task;
                        for (; p != &init_task; p = p->next_task) {
                                int i = 0;
                                struct mm_struct *mm = p->mm;
                                if (!p->swappable || !mm || mm->rss <= 0)
                                        continue;
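                                /*
                                 * Shift swap_cnt down for tasks whose rss is
                                 * far below the current maximum, so the
                                 * largest resident sets keep winning the
                                 * selection; a task that needs no scaling at
                                 * all is marked as a memory hog.
                                 */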
                                /* small processes are swapped out less */
                                while ((mm->swap_cnt << 2 * (i + 1) < max_cnt)
                                                && i++ < 10)
                                        mm->swap_cnt >>= i;
                                mm->swap_cnt += i; /* if swap_cnt reaches 0 */
                                /* we're big -> hog treatment */
                                if (!i)
                                        p->hog = 1;
                        }
                }
                read_unlock(&tasklist_lock);
                if (!best) {
                        if (!assign) {
                                assign = 1;
                                goto select;
                        }
                        goto out;
                } else {
                        int ret;

                        atomic_inc(&best->mm_count);
                        ret = swap_out_mm(best, gfp_mask);
                        mmdrop(best);

                        if (!ret)
                                continue;

                        if (ret < 0)
                                kill_proc(pid, SIGBUS, 1);
                        __ret = 1;
                        goto out;
                }
        }
out:
        unlock_kernel();
        return __ret;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
{
        int priority;
        int count = SWAP_CLUSTER_MAX;

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(gfp_mask);
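
        /*
         * Start at the lowest urgency and retry with increasing priority
         * (smaller numbers scan harder) until "count" pages have been
         * freed or every level has been tried.
         */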
        priority = 6;
        do {
                while (shrink_mmap(priority, gfp_mask, zone)) {
                        if (!--count)
                                goto done;
                }

                /* Try to get rid of some shared memory pages.. */
                if (gfp_mask & __GFP_IO) {
                        /*
                         * don't be too light against the d/i cache since
                         * shrink_mmap() almost never fails when there's
                         * really plenty of memory free.
                         */
                        count -= shrink_dcache_memory(priority, gfp_mask, zone);
                        count -= shrink_icache_memory(priority, gfp_mask, zone);
                        if (count <= 0)
                                goto done;
                        while (shm_swap(priority, gfp_mask, zone)) {
                                if (!--count)
                                        goto done;
                        }
                }

                /* Then, try to page stuff out.. */
                while (swap_out(priority, gfp_mask)) {
                        if (!--count)
                                goto done;
                }
        } while (--priority >= 0);
done:
        return priority >= 0;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
        int i;
        struct task_struct *tsk = current;
        pg_data_t *pgdat;
        zone_t *zone;

        tsk->session = 1;
        tsk->pgrp = 1;
        strcpy(tsk->comm, "kswapd");
        sigfillset(&tsk->blocked);

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC;

        while (1) {
                /*
                 * If we actually get into a low-memory situation,
                 * the processes needing more memory will wake us
                 * up on a more timely basis.
                 */
                pgdat = pgdat_list;
                while (pgdat) {
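                        /*
                         * Check every zone of this node and, while a zone
                         * sits below its pages_low watermark, run up to
                         * SWAP_CLUSTER_MAX passes of do_try_to_free_pages()
                         * on it.
                         */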
                        for (i = 0; i < MAX_NR_ZONES; i++) {
                                int count = SWAP_CLUSTER_MAX;
                                zone = pgdat->node_zones + i;
                                do {
                                        if (tsk->need_resched)
                                                schedule();
                                        if ((!zone->size) || (!zone->zone_wake_kswapd))
                                                continue;
                                        do_try_to_free_pages(GFP_KSWAPD, zone);
                                } while (zone->free_pages < zone->pages_low &&
                                                --count);
                        }
                        pgdat = pgdat->node_next;
                }
                run_task_queue(&tq_disk);
                tsk->state = TASK_INTERRUPTIBLE;
                interruptible_sleep_on(&kswapd_wait);
        }
}

/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
{
        int retval = 1;
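
        /*
         * Mark ourselves PF_MEMALLOC for the duration of the reclaim so
         * that any allocations we make along the way do not recurse back
         * into page reclaim.
         */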
        if (gfp_mask & __GFP_WAIT) {
                current->flags |= PF_MEMALLOC;
                retval = do_try_to_free_pages(gfp_mask, zone);
                current->flags &= ~PF_MEMALLOC;
        }
        return retval;
}

static int __init kswapd_init(void)
{
        printk("Starting kswapd v1.6\n");
        swap_setup();
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
        return 0;
}

module_init(kswapd_init)