/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgtable.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct vm_area_struct * vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int (*swapout)(struct page *, struct file *);
	pte = *page_table;
	if (!pte_present(pte))
		goto out_failed;
	page = pte_page(pte);
	if (page - mem_map >= max_mapnr)
		goto out_failed;
	/* Don't look at this pte if it's been accessed recently. */
	if (pte_young(pte)) {
		/*
		 * Transfer the "accessed" bit from the page
		 * tables to the global page map.
		 */
		set_pte(page_table, pte_mkold(pte));
		set_bit(PG_referenced, &page->flags);
		goto out_failed;
	}
	/* Skip pages we must not touch, and pages that cannot satisfy
	   this particular allocation request (wrong zone for __GFP_DMA
	   or non-__GFP_HIGHMEM requests). */
	if (PageReserved(page)
	    || PageLocked(page)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
	    || (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page)))
		goto out_failed;
	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->pg_offset;
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		__free_page(page);
		goto out_failed;
	}
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "shrink_mmap()".
	 */
	if (!pte_dirty(pte)) {
		pte_clear(page_table);
		goto drop_pte;
	}
	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks etc.
	 */
	if (!(gfp_mask & __GFP_IO))
		goto out_failed;
	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we freed something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in 'page', and just drop the
	 * pte. All the hard work would be done by
	 * shrink_mmap().
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
		int error;
		struct file *file = vma->vm_file;
		if (file) get_file(file);
		pte_clear(page_table);
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		vmlist_access_unlock(vma->vm_mm);
		error = swapout(page, file);
		if (file) fput(file);
		if (!error)
			goto out_free_success;
		__free_page(page);
		return error;
	}
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = acquire_swap_entry(page);
	if (!entry.val)
		goto out_failed;	/* No swap space left */

	if (!(page = prepare_highmem_swapout(page)))
		goto out_swap_free;
	vma->vm_mm->rss--;
	set_pte(page_table, swp_entry_to_pte(entry));
	vmlist_access_unlock(vma->vm_mm);

	flush_tlb_page(vma, address);
	swap_duplicate(entry);	/* One for the process, one for the swap cache */

	/* This will also lock the page */
	add_to_swap_cache(page, entry);

	/* OK, do a physical asynchronous write to swap. */
	rw_swap_page(WRITE, page, 0);

out_free_success:
	__free_page(page);
	return 1;
out_swap_free:
	swap_free(entry);
out_failed:
	return 0;
}
/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults the process recently had, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to spend on the
 * swap block search, not on how many blocks to swap with each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
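
/*
 * The walkers below mirror the pgd -> pmd -> pte hierarchy:
 * swap_out_mm() picks the vma and resumes at mm->swap_address,
 * swap_out_vma() walks that vma's page directory, swap_out_pgd()
 * walks the middle-level tables, and swap_out_pmd() finally calls
 * try_to_swap_out() on each resident pte.  A non-zero result from
 * any level propagates back up and ends the walk.
 */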
static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}
	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;
	do {
		int result;

		vma->vm_mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	return 0;
}
static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}
	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;
	do {
		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}
static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;
	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & VM_LOCKED)
		return 0;
	pgdir = pgd_offset(vma->vm_mm, address);

	end = vma->vm_end;
	do {
		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return 0;
}
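
/*
 * Walk one whole address space, vma by vma.  The scan resumes at
 * mm->swap_address, so successive calls make incremental progress
 * rather than rescanning from the bottom every time.
 */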
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct * vma;
	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;
	/*
	 * Find the proper vm-area after freezing the vma chain
	 */
	vmlist_access_lock(mm);
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(vma, address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}
	vmlist_access_unlock(mm);

	/* We didn't find anything for the process */
	mm->swap_address = 0;
	return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p;
	int counter;
	/*
	 * We make one or two passes through the task list, indexed by
	 * assignments to swap_cnt.
	 * Pass 1: select the swappable task with maximal RSS that has
	 * not yet been swapped out.
	 * Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out.  If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = nr_threads / (priority+1);
	if (counter < 1)
		counter = 1;
	if (counter > nr_threads)
		counter = nr_threads;

	for (; counter >= 0; counter--) {
		int assign = 0;
		int max_cnt = 0;
		struct mm_struct *best = NULL;
		int pid = 0;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)
				continue;
			if (mm->rss <= 0)
				continue;
			/* Refresh swap_cnt? */
			if (assign)
				mm->swap_cnt = mm->rss;
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
				pid = p->pid;
			}
		}
		read_unlock(&tasklist_lock);
		if (!best) {
			/* Nothing left to try: start pass 2, or give up. */
			if (!assign) {
				assign = 1;
				goto select;
			}
			break;
		} else {
			int ret;

			atomic_inc(&best->mm_count);
			ret = swap_out_mm(best, gfp_mask);
			mmdrop(best);

			if (!ret)
				continue;

			if (ret < 0)
				kill_proc(pid, SIGBUS, 1);
			return 1;
		}
	}
	return 0;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
	int priority;
	int count = SWAP_CLUSTER_MAX;
	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);
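
	/*
	 * Scan with increasing urgency: priority starts at 6 and falls
	 * towards 0.  Each pass goes after the page cache first, then
	 * the dentry and inode caches, then SysV shared memory, and
	 * finally process pages via swap_out().
	 */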
	priority = 6;
	do {
		while (shrink_mmap(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		/* Don't be too light on the d/i caches, since
		   shrink_mmap() almost never fails when there's
		   really plenty of memory free. */
		count -= shrink_dcache_memory(priority, gfp_mask);
		count -= shrink_icache_memory(priority, gfp_mask);
		if (count <= 0)
			goto done;
		/* Try to get rid of some shared memory pages.. */
		if (gfp_mask & __GFP_IO) {
			while (shm_swap(priority, gfp_mask)) {
				if (!--count)
					goto done;
			}
		}
		/* Then, try to page stuff out.. */
		while (swap_out(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}
	} while (--priority >= 0);
done:
	return priority >= 0;
}
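
/*
 * Handle on the background pageout daemon, so that try_to_free_pages()
 * below can wake it when a normal allocation runs short of memory.
 */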
static struct task_struct *kswapd_process;
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically executes once a second, trickling out pages
 * so that we have _some_ free memory available even if there
 * is no other activity that frees anything up. This is needed
 * for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot
 * page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	kswapd_process = tsk;
	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__get_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
	while (1) {
		/*
		 * Wake up once a second to see if we need to make
		 * more memory available.
		 *
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		do {
			/* kswapd is critical to provide GFP_ATOMIC
			   allocations (not GFP_HIGHMEM ones). */
			if (nr_free_pages - nr_free_highpages >= freepages.high)
				break;

			if (!do_try_to_free_pages(GFP_KSWAPD))
				break;
			run_task_queue(&tq_disk);
		} while (!tsk->need_resched);
		tsk->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ);
	}
}
/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int retval = 1;

	wake_up_process(kswapd_process);
	if (gfp_mask & __GFP_WAIT)
		retval = do_try_to_free_pages(gfp_mask);
	return retval;
}
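
/*
 * Typical caller (a sketch, not code from this file): an allocation
 * path that is about to fail does roughly
 *
 *	if (!(current->flags & PF_MEMALLOC))
 *		try_to_free_pages(gfp_mask);
 *
 * i.e. everyone except kswapd itself (which runs with PF_MEMALLOC set,
 * see above) pokes the daemon, and only __GFP_WAIT allocations block
 * in do_try_to_free_pages() on their own behalf.
 */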
static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.6\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
	return 0;
}
module_init(kswapd_init)