/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page.  They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out.  Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
        pte_t pte;
        swp_entry_t entry;
        struct page * page;
        int (*swapout)(struct page *, struct file *);

        pte = *page_table;
        if (!pte_present(pte))
                goto out_failed;
        page = pte_page(pte);
        if ((!VALID_PAGE(page)) || PageReserved(page))
                goto out_failed;
        if (mm->swap_cnt)
                mm->swap_cnt--;

        /* Don't look at this pte if it's been accessed recently. */
        if (pte_young(pte)) {
                /*
                 * Transfer the "accessed" bit from the page
                 * tables to the global page map.
                 */
                set_pte(page_table, pte_mkold(pte));
                SetPageReferenced(page);
                goto out_failed;
        }

        if (TryLockPage(page))
                goto out_failed;

        /*
         * Is the page already in the swap cache? If so, then
         * we can just drop our reference to it without doing
         * any IO - it's already up-to-date on disk.
         *
         * Return 0, as we didn't actually free any real
         * memory, and we should just continue our scan.
         */
        if (PageSwapCache(page)) {
                entry.val = page->index;
                swap_duplicate(entry);
                set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
                UnlockPage(page);
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                page_cache_release(page);
                goto out_failed;
        }

        /*
         * Is it a clean page? Then it must be recoverable
         * by just paging it in again, and we can just drop
         * it..
         *
         * However, this won't actually free any real
         * memory, as the page will just be in the page cache
         * somewhere, and as such we should just continue
         * our scan.
         *
         * Basically, this just makes it possible for us to do
         * some real work in the future in "shrink_mmap()".
         */
        if (!pte_dirty(pte)) {
                flush_cache_page(vma, address);
                pte_clear(page_table);
                goto drop_pte;
        }

        /*
         * Don't go down into the swap-out stuff if
         * we cannot do I/O! Avoid recursing on FS
         * locks etc.
         */
        if (!(gfp_mask & __GFP_IO))
                goto out_unlock;

        /*
         * Don't do any of the expensive stuff if
         * we're not really interested in this zone.
         */
        if (page->zone->free_pages > page->zone->pages_high)
                goto out_unlock;

        /*
         * Ok, it's really dirty. That means that
         * we should either create a new swap cache
         * entry for it, or we should write it back
         * to its own backing store.
         *
         * Note that in neither case do we actually
         * know that we make a page available, but
         * as we potentially sleep we can no longer
         * continue scanning, so we might as well
         * assume we free'd something.
         *
         * NOTE NOTE NOTE! This should just set a
         * dirty bit in 'page', and just drop the
         * pte. All the hard work would be done by
         * shrink_mmap().
         *
         * That would get rid of a lot of problems.
         */
        flush_cache_page(vma, address);
        if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
                int error;
                struct file *file = vma->vm_file;
                if (file) get_file(file);
                pte_clear(page_table);
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                vmlist_access_unlock(vma->vm_mm);
                error = swapout(page, file);
                UnlockPage(page);
                if (file) fput(file);
                if (!error)
                        goto out_free_success;
                page_cache_release(page);
                return error;
        }

        /*
         * This is a dirty, swappable page. First of all,
         * get a suitable swap entry for it, and make sure
         * we have the swap cache set up to associate the
         * page with that swap entry.
         */
        entry = get_swap_page();
        if (!entry.val)
                goto out_unlock; /* No swap space left */

        if (!(page = prepare_highmem_swapout(page)))
                goto out_swap_free;

        swap_duplicate(entry);  /* One for the process, one for the swap cache */

        /* Add it to the swap cache */
        add_to_swap_cache(page, entry);

        /* Put the swap entry into the pte after the page is in swapcache */
        vma->vm_mm->rss--;
        set_pte(page_table, swp_entry_to_pte(entry));
        flush_tlb_page(vma, address);
        vmlist_access_unlock(vma->vm_mm);

        /* OK, do a physical asynchronous write to swap. */
        rw_swap_page(WRITE, page, 0);

out_free_success:
        page_cache_release(page);
        return 1;
out_swap_free:
        swap_free(entry);
out_failed:
        return 0;
out_unlock:
        UnlockPage(page);
        return 0;
}

/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults this process has had recently, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 *       swap block search, not a hint of how many blocks to swap with
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
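
/*
 * swap_out_pmd/swap_out_pgd/swap_out_vma walk one VMA's page tables
 * top-down (pgd -> pmd -> pte) and hand each present pte to
 * try_to_swap_out().  The walk stops as soon as a page has been dealt
 * with or the per-mm swap_cnt quota runs out.
 */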
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                pmd_ERROR(*dir);
                pmd_clear(dir);
                return 0;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                int result;
                vma->vm_mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        return 0;
}

static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                pgd_ERROR(*dir);
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas which are locked down */
        if (vma->vm_flags & VM_LOCKED)
                return 0;

        pgdir = pgd_offset(vma->vm_mm, address);

        end = vma->vm_end;
        if (address >= end)
                BUG();
        do {
                int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}
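
/*
 * Scan one process' address space, resuming where the previous scan of
 * this mm left off (mm->swap_address), and return as soon as one page
 * has been dealt with.  If the whole space is scanned without progress,
 * swap_cnt and swap_address are reset so swap_out() will skip this mm
 * until its swap_cnt is refreshed again.
 */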
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = mm->swap_address;

        /*
         * Find the proper vm-area after freezing the vma chain
         * and ptes.
         */
        vmlist_access_lock(mm);
        vma = find_vma(mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        int result = swap_out_vma(mm, vma, address, gfp_mask);
                        if (result)
                                return result;
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        address = vma->vm_start;
                }
        }
        vmlist_access_unlock(mm);

        /* We didn't find anything for the process */
        mm->swap_cnt = 0;
        mm->swap_address = 0;
        return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
        struct task_struct * p;
        int counter;
        int __ret = 0;

        lock_kernel();
        /*
         * We make one or two passes through the task list, indexed by
         * assign = {0, 1}:
         *   Pass 1: select the swappable task with maximal RSS that has
         *           not yet been swapped out.
         *   Pass 2: re-assign rss swap_cnt values, then select as above.
         *
         * With this approach, there's no need to remember the last task
         * swapped out.  If the swap-out fails, we clear swap_cnt so the
         * task won't be selected again until all others have been tried.
         *
         * Think of swap_cnt as a "shadow rss" - it tells us which process
         * we want to page out (always try largest first).
         */
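        /*
         * The number of selection rounds scales with the number of
         * threads and grows as the priority value drops towards zero,
         * i.e. as memory pressure becomes more desperate.
         */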
        counter = (nr_threads << 2) >> (priority >> 2);
        if (counter < 1)
                counter = 1;

        for (; counter >= 0; counter--) {
                unsigned long max_cnt = 0;
                struct mm_struct *best = NULL;
                int pid = 0;
                int assign = 0;
        select:
                read_lock(&tasklist_lock);
                p = init_task.next_task;
                for (; p != &init_task; p = p->next_task) {
                        struct mm_struct *mm = p->mm;
                        if (!p->swappable || !mm)
                                continue;
                        if (mm->rss <= 0)
                                continue;
                        /* Refresh swap_cnt? */
                        if (assign == 1)
                                mm->swap_cnt = mm->rss;
                        if (mm->swap_cnt > max_cnt) {
                                max_cnt = mm->swap_cnt;
                                best = mm;
                                pid = p->pid;
                        }
                }
                read_unlock(&tasklist_lock);
                if (!best) {
                        if (!assign) {
                                assign = 1;
                                goto select;
                        }
                        goto out;
                } else {
                        int ret;

                        atomic_inc(&best->mm_count);
                        ret = swap_out_mm(best, gfp_mask);
                        mmdrop(best);

                        if (!ret)
                                continue;

                        if (ret < 0)
                                kill_proc(pid, SIGBUS, 1);
                        __ret = 1;
                        goto out;
                }
        }
out:
        unlock_kernel();
        return __ret;
}

/*
 * Check if there is any memory pressure (free_pages < pages_low)
 */
static inline int memory_pressure(void)
{
        pg_data_t *pgdat = pgdat_list;

        do {
                int i;
                for(i = 0; i < MAX_NR_ZONES; i++) {
                        zone_t *zone = pgdat->node_zones + i;
                        if (zone->size &&
                                        zone->free_pages < zone->pages_low)
                                return 1;
                }
                pgdat = pgdat->node_next;
        } while (pgdat);

        return 0;
}

/*
 * Check if all zones have recently had memory_pressure (zone_wake_kswapd)
 */
static inline int keep_kswapd_awake(void)
{
        int all_recent = 1;
        pg_data_t *pgdat = pgdat_list;

        do {
                int i;
                for(i = 0; i < MAX_NR_ZONES; i++) {
                        zone_t *zone = pgdat->node_zones + i;
                        if (zone->size) {
                                if (zone->free_pages < zone->pages_min)
                                        return 1;
                                if (!zone->zone_wake_kswapd)
                                        all_recent = 0;
                        }
                }
                pgdat = pgdat->node_next;
        } while (pgdat);

        return all_recent;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * Don't try _too_ hard, though. We don't want to have bad
 * latency.
 *
 * Note: only called by kswapd and try_to_free_pages,
 *       both can WAIT at top level.
 */
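/*
 * FREE_COUNT is how many pages each call tries to free outright;
 * SWAP_COUNT caps how many swap_out() attempts are made per priority
 * pass, since those only move pages into the swap cache.
 */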
#define FREE_COUNT      8
#define SWAP_COUNT      16
static int do_try_to_free_pages(unsigned int gfp_mask)
{
        int priority;
        int count = FREE_COUNT;
        int swap_count;

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(gfp_mask);
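
        /*
         * Scan with decreasing priority: 64 is the gentlest pass and 0
         * the most aggressive.  Each pass shrinks the page cache first,
         * then the dentry/inode caches and SysV shm, then tries to
         * unmap process pages via swap_out().
         */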
        priority = 64;
        do {
                if (current->need_resched) {
                        schedule();
                        /* time has passed - pressure too? */
                        if (!memory_pressure())
                                goto done;
                }

                while (shrink_mmap(priority, gfp_mask)) {
                        if (!--count)
                                goto done;
                }

                /* check if mission completed */
                if (!keep_kswapd_awake())
                        goto done;

                /* Try to get rid of some shared memory pages.. */
                if (gfp_mask & __GFP_IO) {
                        /*
                         * don't be too light against the d/i cache since
                         * shrink_mmap() almost never fails when there's
                         * really plenty of memory free.
                         */
                        count -= shrink_dcache_memory(priority, gfp_mask);
                        count -= shrink_icache_memory(priority, gfp_mask);
                        /*
                         * Not currently working, see fixme in shrink_?cache_memory
                         * In the inner functions there is a comment:
                         * "To help debugging, a zero exit status indicates
                         *  all slabs were released." (-arca?)
                         * let's handle it in a primitive but working way...
                         *      if (count <= 0)
                         *              goto done;
                         */
                        if (!keep_kswapd_awake())
                                goto done;

                        while (shm_swap(priority, gfp_mask)) {
                                if (!--count)
                                        goto done;
                        }
                }

                /*
                 * Then, try to page stuff out..
                 *
                 * This will not actually free any pages (they get
                 * put in the swap cache), so we must not count this
                 * as a "count" success.
                 */
                swap_count = SWAP_COUNT;
                while (swap_out(priority, gfp_mask))
                        if (--swap_count < 0)
                                break;

        } while (--priority >= 0);

        /* Always end on a shrink_mmap.., may sleep... */
        while (shrink_mmap(0, gfp_mask)) {
                if (!--count)
                        goto done;
        }
        /* Return 1 if any page was freed, or
         * if there is no more memory pressure */
        return (count < FREE_COUNT || !keep_kswapd_awake());

done:
        return 1;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
        struct task_struct *tsk = current;

        tsk->session = 1;
        tsk->pgrp = 1;
        strcpy(tsk->comm, "kswapd");
        sigfillset(&tsk->blocked);

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC;
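
        /*
         * Sleep until some zone reports memory pressure, then run the
         * page freeing loop with the kswapd gfp mask.
         */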
        for (;;) {
                if (!keep_kswapd_awake()) {
                        interruptible_sleep_on(&kswapd_wait);
                }

                do_try_to_free_pages(GFP_KSWAPD);
        }
}

/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
        int retval = 1;

        if (gfp_mask & __GFP_WAIT) {
                current->state = TASK_RUNNING;
                current->flags |= PF_MEMALLOC;
                retval = do_try_to_free_pages(gfp_mask);
                current->flags &= ~PF_MEMALLOC;
        }

        /* Someone needed memory that kswapd had not provided;
         * make sure kswapd runs.  Should not happen often. */
        if (waitqueue_active(&kswapd_wait))
                wake_up_interruptible(&kswapd_wait);

        return retval;
}

static int __init kswapd_init(void)
{
        printk("Starting kswapd v1.7\n");
        swap_setup();
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
        return 0;
}

module_init(kswapd_init)