/*
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/pgtable.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased, but the page was shared.
 *
 * NOTE! If a swap-out function sleeps, it *must* return 1 to make
 * sure we don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page;
	struct page * page_map;
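	/*
	 * What follows is a sequence of early-outs: ptes that are not
	 * present, out-of-range pages, recently referenced pages and
	 * reserved/locked pages are all skipped before we commit to
	 * any real swap-out work.
	 */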
	pte = *page_table;
	if (!pte_present(pte))
		return 0;
	page = pte_page(pte);
	if (MAP_NR(page) >= max_mapnr)
		return 0;

	page_map = mem_map + MAP_NR(page);
	if (pte_young(pte)) {
		/*
		 * Transfer the "accessed" bit from the page
		 * tables to the global page map.
		 */
		set_pte(page_table, pte_mkold(pte));
		set_bit(PG_referenced, &page_map->flags);
		return 0;
	}
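	/*
	 * This gives a referenced page a second chance: it is aged
	 * rather than swapped, and only becomes a candidate again if
	 * it is still unreferenced when the scan comes back around.
	 */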
	if (PageReserved(page_map)
	    || PageLocked(page_map)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
		return 0;
	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page_map)) {
		entry = page_map->offset;
		swap_duplicate(entry);
		set_pte(page_table, __pte(entry));
		flush_tlb_page(vma, address);
		__free_page(page_map);
		return 0;
	}
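	/*
	 * Note the accounting above: swap_duplicate() took a reference
	 * on the swap entry for the pte that now holds it, while
	 * __free_page() dropped only this mapping's reference to the
	 * physical page, which lives on in the swap cache.
	 */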
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "shrink_mmap()".
	 */
	if (!pte_dirty(pte)) {
		pte_clear(page_table);
		flush_tlb_page(vma, address);
		__free_page(page_map);
		return 0;
	}
	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks etc.
	 */
	if (!(gfp_mask & __GFP_IO))
		return 0;
	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we freed something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in page_map, and just drop the
	 * pte. All the hard work would be done by
	 * "shrink_mmap()".
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && vma->vm_ops->swapout) {
		pid_t pid = tsk->pid;
		pte_clear(page_table);
		flush_tlb_page(vma, address);
		if (vma->vm_ops->swapout(vma, page_map))
			kill_proc(pid, SIGBUS, 1);
		__free_page(page_map);
		return 1;
	}
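	/*
	 * The pid was sampled before swapout() could block, so a
	 * failed write-out can still signal the right process by pid
	 * rather than through a possibly stale task pointer.
	 */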
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry)
		return 0; /* No swap space left */

	set_pte(page_table, __pte(entry));
	flush_tlb_page(vma, address);
	swap_duplicate(entry);	/* One for the process, one for the swap cache */
	add_to_swap_cache(page_map, entry);
	/* We checked we were unlocked way up above, and we
	   have been careful not to stall until here */
	set_bit(PG_locked, &page_map->flags);
	/* OK, do a physical asynchronous write to swap. */
	rw_swap_page(WRITE, entry, (char *) page, 0);
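	/*
	 * The write is asynchronous: the page stays locked (PG_locked)
	 * until the I/O completes, and the swap cache keeps it alive
	 * until shrink_mmap() can reclaim it.
	 */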
	__free_page(page_map);
	return 1;
}
/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults the process recently had, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint about how much CPU to spend
 * searching for swap blocks, not about how many blocks to swap.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		tsk->mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
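/*
 * One level up: swap_out_pgd() walks the pmds spanned by one pgd entry,
 * clipping "end" to the pgd boundary just as swap_out_pmd() clips it to
 * the pmd boundary before walking the ptes.
 */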
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & VM_LOCKED)
		return 0;

	pgdir = pgd_offset(tsk->mm, address);

	end = vma->vm_end;
	while (address < end) {
		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}
static int swap_out_process(struct task_struct * p, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct * vma;

	/*
	 * Go through the process' page directory.
	 */
	address = p->mm->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(p->mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(p, vma, address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}

	/* We didn't find anything for the process */
	p->mm->swap_address = 0;
	return 0;
}
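/*
 * Note that mm->swap_address acts as a resume cursor: swap_out_pmd()
 * advances it a page at a time, so each call picks up the scan where
 * the previous one stopped, and clearing it here restarts the scan
 * from the bottom of the address space.
 */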
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p, * pbest;
	int counter, assign, max_cnt;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = nr_tasks / (priority+1);
	if (counter < 1)
		counter = 1;
	if (counter > nr_tasks)
		counter = nr_tasks;
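	/*
	 * Example: with 300 tasks, priority 6 yields counter = 300/7
	 * = 42 selection rounds, while priority 0 yields the maximum,
	 * counter = nr_tasks.
	 */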
	for (; counter >= 0; counter--) {
		assign = 0;
		max_cnt = 0;
		pbest = NULL;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			if (!p->swappable)
				continue;
			if (p->mm->rss <= 0)
				continue;
			/* Refresh swap_cnt? */
			if (assign)
				p->mm->swap_cnt = p->mm->rss;
			if (p->mm->swap_cnt > max_cnt) {
				max_cnt = p->mm->swap_cnt;
				pbest = p;
			}
		}
		read_unlock(&tasklist_lock);

		if (!pbest) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			goto out;
		}
		pbest->mm->swap_cnt = 0;
		if (swap_out_process(pbest, gfp_mask))
			return 1;
	}
out:
	return 0;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
	int priority;
	int count = SWAP_CLUSTER_MAX;

	lock_kernel();
	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	priority = 6;
	do {
		while (shrink_mmap(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}
		/* Try to get rid of some shared memory pages.. */
		if (gfp_mask & __GFP_IO) {
			while (shm_swap(priority, gfp_mask)) {
				if (!--count)
					goto done;
			}
		}
		/* Then, try to page stuff out.. */
		while (swap_out(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}
		shrink_dcache_memory(priority, gfp_mask);
	} while (--priority >= 0);
done:
	unlock_kernel();
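	/*
	 * priority fell below zero only if every pass failed to free a
	 * full cluster, so this reports whether we freed enough pages.
	 */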
	return priority >= 0;
}
/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message). It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
	int i;
	char *revision = "$Revision: 1.5 $", *s, *e;
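	/* Extract just the version number between the ':' and the
	   trailing '$' of the RCS revision string for the printk below. */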
	if ((s = strchr(revision, ':')) &&
	    (e = strchr(s, '$')))
		s++, i = e - s;
	else
		s = revision, i = -1;

	printk("Starting kswapd v%.*s\n", i, s);
}
static struct task_struct *kswapd_process;
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically executes once a second, trickling out pages
 * so that we have _some_ free memory available even if there
 * is no other activity that frees anything up. This is needed
 * for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot
 * page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	kswapd_process = tsk;
	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__get_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
	while (1) {
		/*
		 * Wake up once a second to see if we need to make
		 * more memory available.
		 *
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		do {
			if (nr_free_pages >= freepages.high)
				break;

			if (!do_try_to_free_pages(GFP_KSWAPD))
				break;
		} while (!tsk->need_resched);
		run_task_queue(&tq_disk);
		tsk->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ);
	}
}
/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int retval = 1;

	wake_up_process(kswapd_process);
	if (gfp_mask & __GFP_WAIT)
		retval = do_try_to_free_pages(gfp_mask);
	return retval;
}
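/*
 * Callers that cannot sleep (no __GFP_WAIT) therefore only nudge kswapd
 * and report success; synchronous reclaim is done only for sleeping
 * allocators.
 */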