/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates they decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
        pte_t pte;
        swp_entry_t entry;
        struct page * page;
        int (*swapout)(struct page *, struct file *);

        pte = *page_table;
        if (!pte_present(pte))
                goto out_failed;
        page = pte_page(pte);
        if ((page - mem_map >= max_mapnr) || PageReserved(page))
                goto out_failed;

        if (mm->swap_cnt)
                mm->swap_cnt--;
        /* Don't look at this pte if it's been accessed recently. */
        if (pte_young(pte)) {
                /*
                 * Transfer the "accessed" bit from the page
                 * tables to the global page map.
                 */
                set_pte(page_table, pte_mkold(pte));
                SetPageReferenced(page);
                goto out_failed;
        }
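        /*
         * We need the page lock before touching the page's state; if
         * somebody else already holds it, skip this page rather than
         * block on it.
         */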
        if (TryLockPage(page))
                goto out_failed;
        /*
         * Is the page already in the swap cache? If so, then
         * we can just drop our reference to it without doing
         * any IO - it's already up-to-date on disk.
         *
         * Return 0, as we didn't actually free any real
         * memory, and we should just continue our scan.
         */
        if (PageSwapCache(page)) {
                entry.val = page->index;
                swap_duplicate(entry);
                set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
                UnlockPage(page);
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                page_cache_release(page);
                goto out_failed;
        }
        /*
         * Is it a clean page? Then it must be recoverable
         * by just paging it in again, and we can just drop
         * it..
         *
         * However, this won't actually free any real
         * memory, as the page will just be in the page cache
         * somewhere, and as such we should just continue
         * our scan.
         *
         * Basically, this just makes it possible for us to do
         * some real work in the future in "shrink_mmap()".
         */
        if (!pte_dirty(pte)) {
                flush_cache_page(vma, address);
                pte_clear(page_table);
                goto drop_pte;
        }
        /*
         * Don't go down into the swap-out stuff if
         * we cannot do I/O! Avoid recursing on FS
         * locks etc.
         */
        if (!(gfp_mask & __GFP_IO))
                goto out_unlock;

        /*
         * Don't do any of the expensive stuff if
         * we're not really interested in this zone.
         */
        if (page->zone->free_pages > page->zone->pages_high)
                goto out_unlock;
        /*
         * Ok, it's really dirty. That means that
         * we should either create a new swap cache
         * entry for it, or we should write it back
         * to its own backing store.
         *
         * Note that in neither case do we actually
         * know that we make a page available, but
         * as we potentially sleep we can no longer
         * continue scanning, so we might as well
         * assume we free'd something.
         *
         * NOTE NOTE NOTE! This should just set a
         * dirty bit in 'page', and just drop the
         * pte. All the hard work would be done by
         * "shrink_mmap()".
         *
         * That would get rid of a lot of problems.
         */
        flush_cache_page(vma, address);
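        /*
         * If the vma has its own swapout operation (a shared file
         * mapping, for instance), unmap the page here and let that
         * operation write the page back to its backing store.
         */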
        if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
                int error;
                struct file *file = vma->vm_file;
                if (file) get_file(file);
                pte_clear(page_table);
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                vmlist_access_unlock(vma->vm_mm);
                error = swapout(page, file);
                UnlockPage(page);
                if (file) fput(file);
                if (!error)
                        goto out_free_success;
                page_cache_release(page);
                return error;
        }
        /*
         * This is a dirty, swappable page.  First of all,
         * get a suitable swap entry for it, and make sure
         * we have the swap cache set up to associate the
         * page with that swap entry.
         */
        entry = get_swap_page();
        if (!entry.val)
                goto out_unlock;        /* No swap space left */

        if (!(page = prepare_highmem_swapout(page)))
                goto out_swap_free;

        swap_duplicate(entry);  /* One for the process, one for the swap cache */

        /* Add it to the swap cache */
        add_to_swap_cache(page, entry);

        /* Put the swap entry into the pte after the page is in swapcache */
        vma->vm_mm->rss--;
        set_pte(page_table, swp_entry_to_pte(entry));
        flush_tlb_page(vma, address);
        vmlist_access_unlock(vma->vm_mm);

        /* OK, do a physical asynchronous write to swap.  */
        rw_swap_page(WRITE, page, 0);

out_free_success:
        page_cache_release(page);
        return 1;
out_swap_free:
        swap_free(entry);
        return 0;
out_unlock:
        UnlockPage(page);
out_failed:
        return 0;
}
/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults that this process recently had, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 *       swap block search, not a hint of how many blocks to swap with
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                pmd_ERROR(*dir);
                pmd_clear(dir);
                return 0;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;
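        /*
         * Walk the ptes under this pmd one page at a time.  The next
         * address to look at is remembered in mm->swap_address, so a
         * later call can resume the scan where this one stopped.
         */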
        do {
                int result;
                vma->vm_mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        return 0;
}
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                pgd_ERROR(*dir);
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}
static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas which are locked down */
        if (vma->vm_flags & VM_LOCKED)
                return 0;
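        /*
         * Walk the page directory entries that map this vma, handing
         * each one to swap_out_pgd() until something gets swapped out
         * or the vma is exhausted.
         */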
        pgdir = pgd_offset(vma->vm_mm, address);

        end = vma->vm_end;
        do {
                int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
        unsigned long address;
        struct vm_area_struct * vma;

        /*
         * Go through process' page directory.
         */
        address = mm->swap_address;
        /*
         * Find the proper vm-area after freezing the vma chain.
         */
        vmlist_access_lock(mm);
        vma = find_vma(mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        int result = swap_out_vma(mm, vma, address, gfp_mask);
                        if (result)
                                return result;
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        address = vma->vm_start;
                }
        }
        vmlist_access_unlock(mm);

        /* We didn't find anything for the process */
        mm->swap_cnt = 0;
        mm->swap_address = 0;
        return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
        struct task_struct * p;
        int counter;
        /*
         * We make one or two passes through the task list, indexed by
         * assign = {0, 1}:
         *   Pass 1: select the swappable task with maximal RSS that has
         *           not yet been swapped out.
         *   Pass 2: re-assign rss swap_cnt values, then select as above.
         *
         * With this approach, there's no need to remember the last task
         * swapped out.  If the swap-out fails, we clear swap_cnt so the
         * task won't be selected again until all others have been tried.
         *
         * Think of swap_cnt as a "shadow rss" - it tells us which process
         * we want to page out (always try largest first).
         */
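        /*
         * The number of selection rounds scales with the number of
         * threads in the system and grows as the priority value drops
         * towards zero: the more urgent the request, the more tasks we
         * are willing to examine.
         */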
        counter = (nr_threads << 2) >> (priority >> 2);

        for (; counter >= 0; counter--) {
                int assign = 0;
                int pid = 0;
                unsigned long max_cnt = 0;
                struct mm_struct *best = NULL;
        select:
                read_lock(&tasklist_lock);
                p = init_task.next_task;
                for (; p != &init_task; p = p->next_task) {
                        struct mm_struct *mm = p->mm;
                        if (!p->swappable || !mm)
                                continue;
                        /* Refresh swap_cnt? */
                        if (assign == 1)
                                mm->swap_cnt = mm->rss;
                        if (mm->swap_cnt > max_cnt) {
                                max_cnt = mm->swap_cnt;
                                best = mm;
                                pid = p->pid;
                        }
                }
                read_unlock(&tasklist_lock);
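                /*
                 * If no candidate was found, either switch to the second
                 * (re-assign) pass or give up.  Otherwise pin the chosen
                 * mm so it cannot go away while we scan its page tables.
                 */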
                if (!best) {
                        if (!assign) {
                                assign = 1;
                                goto select;
                        }
                        break;
                } else {
                        int ret;

                        atomic_inc(&best->mm_count);
                        ret = swap_out_mm(best, gfp_mask);
                        mmdrop(best);

                        if (!ret)
                                continue;
                        if (ret < 0)
                                kill_proc(pid, SIGBUS, 1);
                        return 1;
                }
        }
        return 0;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * Don't try _too_ hard, though. We don't want to have bad
 * latency.
 */
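/*
 * FREE_COUNT is the number of pages we aim to free per call, while
 * SWAP_COUNT bounds how many swap_out() attempts we make at each
 * priority level so the page-out stage cannot run away.
 */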
#define FREE_COUNT      8
#define SWAP_COUNT      16
static int do_try_to_free_pages(unsigned int gfp_mask)
{
        int priority;
        int count = FREE_COUNT;
        int swap_count;

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(gfp_mask);

        priority = 6;
        do {
                while (shrink_mmap(priority, gfp_mask)) {
                        if (!--count)
                                goto done;
                }
                /* Try to get rid of some shared memory pages.. */
                if (gfp_mask & __GFP_IO) {
                        /*
                         * don't be too light against the d/i cache since
                         * shrink_mmap() almost never fails when there's
                         * really plenty of memory free.
                         */
                        count -= shrink_dcache_memory(priority, gfp_mask);
                        count -= shrink_icache_memory(priority, gfp_mask);
                        if (count <= 0)
                                goto done;
                        while (shm_swap(priority, gfp_mask)) {
                                if (!--count)
                                        goto done;
                        }
                }
                /*
                 * Then, try to page stuff out..
                 *
                 * This will not actually free any pages (they get
                 * put in the swap cache), so we must not count this
                 * as a "count" success.
                 */
                swap_count = SWAP_COUNT;
                while (swap_out(priority, gfp_mask))
                        if (--swap_count < 0)
                                break;

        } while (--priority >= 0);
        /* Always end on a shrink_mmap.. */
        while (shrink_mmap(0, gfp_mask)) {
                if (!--count)
                        goto done;
        }
done:

        /* We return 1 if we freed some pages */
        return (count != FREE_COUNT);
}
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
        struct task_struct *tsk = current;
        pg_data_t *pgdat;
        int i;

        strcpy(tsk->comm, "kswapd");
        sigfillset(&tsk->blocked);
        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC;
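        /*
         * Main loop: sweep every zone of every node and trickle pages
         * out; when no zone is below its pages_low watermark, go back
         * to sleep on kswapd_wait until somebody wakes us.
         */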
        for (;;) {
                int something_to_do = 0;

                pgdat = pgdat_list;
                do {
                        for (i = 0; i < MAX_NR_ZONES; i++) {
                                zone_t *zone = pgdat->node_zones + i;
                                if (tsk->need_resched)
                                        schedule();
                                if (!zone->size || !zone->zone_wake_kswapd)
                                        continue;
                                if (zone->free_pages < zone->pages_low)
                                        something_to_do = 1;
                                do_try_to_free_pages(GFP_KSWAPD);
                        }
                        pgdat = pgdat->node_next;
                } while (pgdat);
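                /* Nothing was low during this sweep - sleep until woken. */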
                if (!something_to_do) {
                        tsk->state = TASK_INTERRUPTIBLE;
                        interruptible_sleep_on(&kswapd_wait);
                }
        }
}
/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
        int retval = 1;

        if (gfp_mask & __GFP_WAIT) {
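                /*
                 * PF_MEMALLOC is set around the direct reclaim so that
                 * any allocations made while freeing memory do not
                 * recurse back into here (see the kswapd comment above).
                 */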
                current->flags |= PF_MEMALLOC;
                retval = do_try_to_free_pages(gfp_mask);
                current->flags &= ~PF_MEMALLOC;
        }
        return retval;
}
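/*
 * Kick off the pageout daemon at boot: print the version banner and
 * start kswapd as a kernel thread that shares fs, files and signal
 * state with its creator.
 */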
static int __init kswapd_init(void)
{
        printk("Starting kswapd v1.6\n");
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
        return 0;
}

module_init(kswapd_init)