/*
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/dcache.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/bitops.h>
#include <asm/pgtable.h>
/*
 * When are we next due for a page scan?
 */
static unsigned long next_swap_jiffies = 0;
/*
 * How often do we do a pageout scan during normal conditions?
 * Default is four times a second.
 */
int swapout_interval = HZ / 4;
/*
 * The wait queue for waking up the pageout daemon:
 */
struct wait_queue * kswapd_wait = NULL;

static void init_swap_timer(void);
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page;
	struct page * page_map;

	pte = *page_table;
	if (!pte_present(pte))
		return 0;
	page = pte_page(pte);
	if (MAP_NR(page) >= max_mapnr)
		return 0;

	page_map = mem_map + MAP_NR(page);
	if (PageReserved(page_map)
	    || PageLocked(page_map)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
		return 0;
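	/*
	 * Note on the checks above: reserved pages belong to the kernel
	 * and must never be paged out; a locked page already has I/O in
	 * flight on it; and a __GFP_DMA caller gains nothing from
	 * freeing a page that is not DMA-capable, so all three are
	 * skipped.
	 */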
	/*
	 * Deal with page aging. There are several special cases to
	 * consider:
	 *
	 * Page has been accessed, but is swap cached. If the page is
	 * getting sufficiently "interesting" --- its age is getting
	 * high --- then if we are sufficiently short of free swap
	 * pages, then delete the swap cache. We can only do this if
	 * the swap page's reference count is one: ie. there are no
	 * other references to it beyond the swap cache (as there must
	 * still be PTEs pointing to it if count > 1).
	 *
	 * If the page has NOT been touched, and its age reaches zero,
	 * then we are swapping it out:
	 *
	 * If there is already a swap cache page for this page, then
	 * another process has already allocated swap space, so just
	 * dereference the physical page and copy in the swap entry
	 * from the swap cache.
	 *
	 * Note, we rely on all pages read in from swap either having
	 * the swap cache flag set, OR being marked writable in the pte,
	 * but NEVER BOTH. (It IS legal to be neither cached nor dirty.)
	 *
	 * -- Stephen Tweedie 1998
	 */
	if (PageSwapCache(page_map)) {
		if (pte_write(pte)) {
			struct page *found;
			printk ("VM: Found a writable swap-cached page!\n");
			/* Try to diagnose the problem ... */
			found = find_page(&swapper_inode, page_map->offset);
			if (found) {
				printk("page=%p@%08lx, found=%p, count=%d\n",
					page_map, page_map->offset,
					found, atomic_read(&found->count));
				/* find_page() took a reference; drop it again. */
				__free_page(found);
			} else
				printk ("Spurious, page not in cache\n");
			return 0;
		}
	}
	if (pte_young(pte)) {
		set_pte(page_table, pte_mkold(pte));
		touch_page(page_map);
		/*
		 * We should test here to see if we want to recover any
		 * swap cache page here. We do this if the page is seeing
		 * enough activity, AND we are sufficiently low on swap.
		 *
		 * We need to track both the number of available swap
		 * pages and the total number present before we can do
		 * this...
		 */
		return 0;
	}
	if (pte_dirty(pte)) {
		if (vma->vm_ops && vma->vm_ops->swapout) {
			pid_t pid = tsk->pid;
			vma->vm_mm->rss--;
			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
				kill_proc(pid, SIGBUS, 1);
		} else {
			/*
			 * This is a dirty, swappable page. First of all,
			 * get a suitable swap entry for it, and make sure
			 * we have the swap cache set up to associate the
			 * page with that swap entry.
			 */
			entry = in_swap_cache(page_map);
			if (!entry) {
				entry = get_swap_page();
				if (!entry)
					return 0; /* No swap space left */
			}

			vma->vm_mm->rss--;
			flush_cache_page(vma, address);
			set_pte(page_table, __pte(entry));
			flush_tlb_page(vma, address);
			swap_duplicate(entry);
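			/*
			 * At this point the pte no longer holds a
			 * physical address: __pte(entry) stores the swap
			 * entry itself, with the present bit clear, so
			 * the next fault on this address is handled as a
			 * swap-in. swap_duplicate() raised the entry's
			 * use count to cover the reference we just
			 * planted in the pte.
			 */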
			/* Now to write back the page. We have two
			 * cases: if the page is already part of the
			 * swap cache, then it is already on disk. Just
			 * free the page and return (we release the swap
			 * cache on the last accessor too).
			 *
			 * If we have made a new swap entry, then we
			 * start the write out to disk. If the page is
			 * shared, however, we still need to keep the
			 * copy in memory, so we add it to the swap
			 * cache. */
			if (PageSwapCache(page_map)) {
				free_page_and_swap_cache(page);
				return (atomic_read(&page_map->count) == 0);
			}
			add_to_swap_cache(page_map, entry);
			/* We checked we were unlocked way up above, and we
			   have been careful not to stall until here */
			set_bit(PG_locked, &page_map->flags);

			/* OK, do a physical write to swap. */
			rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
		}
		/* Now we can free the current physical page. We also
		 * free up the swap cache if this is the last use of the
		 * page. Note that there is a race here: the page may
		 * still be shared COW by another process, but that
		 * process may exit while we are writing out the page
		 * asynchronously. That's no problem, shrink_mmap() can
		 * correctly clean up the occasional unshared page
		 * which gets left behind in the swap cache. */
		free_page_and_swap_cache(page);
		return 1; /* we slept: the process may not exist any more */
	}
	/* The page was _not_ dirty, but still has a zero age. It must
	 * already be uptodate on disk. If it is in the swap cache,
	 * then we can just unlink the page now. Remove the swap cache
	 * too if this is the last user. */
	if ((entry = in_swap_cache(page_map))) {
		vma->vm_mm->rss--;
		flush_cache_page(vma, address);
		set_pte(page_table, __pte(entry));
		flush_tlb_page(vma, address);
		swap_duplicate(entry);
		free_page_and_swap_cache(page);
		return (atomic_read(&page_map->count) == 0);
	}
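	/*
	 * The expression above returns 1 only if dropping our reference
	 * in free_page_and_swap_cache() actually freed the page; if the
	 * page is still shared we return 0 and the scan moves on.
	 */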
	/*
	 * A clean page to be discarded? Must be mmap()ed from
	 * somewhere. Unlink the pte, and tell the filemap code to
	 * discard any cached backing page if this is the last user.
	 */
	if (PageSwapCache(page_map)) {
		printk ("VM: How can this page _still_ be cached?");
		return 0;
	}
	vma->vm_mm->rss--;
	flush_cache_page(vma, address);
	pte_clear(page_table);
	flush_tlb_page(vma, address);
	entry = page_unuse(page_map);
	__free_page(page_map);
	return entry;
}
/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint of how much CPU to waste on
 * the swap block search, not of how many blocks to swap from each
 * process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
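/*
 * The three helpers below walk one process' page tables top-down:
 * swap_out_vma() steps through the page directory one pgd entry at a
 * time, swap_out_pgd() descends into the pmd level, and swap_out_pmd()
 * finally visits the individual ptes, calling try_to_swap_out() on
 * each page. tsk->swap_address records how far we got, so the next
 * scan of this process resumes where the previous one stopped.
 */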
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		tsk->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *pgdir, unsigned long start, int gfp_mask)
{
	unsigned long end;

	/* Don't swap out areas like shared memory which have their
	   own separate swapping mechanism or areas which are locked down */
	if (vma->vm_flags & (VM_SHM | VM_LOCKED))
		return 0;

	end = vma->vm_end;
	while (start < end) {
		int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask);
		if (result)
			return result;
		start = (start + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}
static int swap_out_process(struct task_struct * p, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct * vma;

	/*
	 * Go through process' page directory.
	 */
	address = p->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(p->mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}

	/* We didn't find anything for the process */
	p->swap_cnt = 0;
	p->swap_address = 0;
	return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p, * pbest;
	int counter, assign, max_cnt;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal swap_cnt.
	 *   Pass 2: assign new swap_cnt values, then select as above.
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 */
	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
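	/*
	 * The effort spent here scales linearly with the number of
	 * tasks ((PAGEOUT_WEIGHT * nr_tasks) >> 10) and halves for each
	 * step of priority, so an unhurried background scan is far
	 * cheaper than an urgent one at priority 0.
	 */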
	for (; counter >= 0; counter--) {
		assign = 0;
		max_cnt = 0;
		pbest = NULL;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			if (!p->swappable)
				continue;
			if (p->mm->rss <= 0)
				continue;
			if (assign) {
				/*
				 * If we didn't select a task on pass 1,
				 * assign each task a new swap_cnt.
				 * Normalise the number of pages swapped
				 * by multiplying by (RSS / 1MB)
				 */
				p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
			}
			if (p->swap_cnt > max_cnt) {
				max_cnt = p->swap_cnt;
				pbest = p;
			}
		}
		read_unlock(&tasklist_lock);
		if (!pbest) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			goto out;
		}

		switch (swap_out_process(pbest, gfp_mask)) {
		case 0:
			/*
			 * Clear swap_cnt so we don't look at this task
			 * again until we've tried all of the others.
			 * (We didn't block, so the task is still here.)
			 */
			pbest->swap_cnt = 0;
			break;
		case 1:
			return 1;
		default:
			break;
		}
	}
out:
	return 0;
}
/*
 * We are much more aggressive about trying to swap out than we used
 * to be. This works out OK, because we now do proper aging on page
 * contents.
 */
static int do_try_to_free_page(int gfp_mask)
{
	static int state = 0;
	int i = 6;
	int stop;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/* We try harder if we are waiting .. */
	stop = 3;
	if (gfp_mask & __GFP_WAIT)
		stop = 0;

	if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
	    || (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
		shrink_mmap(i, gfp_mask);
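	/*
	 * The switch below jumps into the middle of a do-while loop:
	 * "state" remembers which reclaim source (page cache, SysV
	 * shared memory, process pages, dcache) the previous call
	 * stopped at, so successive calls rotate through the sources
	 * instead of always hammering the first one.
	 */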
	switch (state) {
		do {
		case 0:
			if (shrink_mmap(i, gfp_mask))
				return 1;
			state = 1;
		case 1:
			if (shm_swap(i, gfp_mask))
				return 1;
			state = 2;
		case 2:
			if (swap_out(i, gfp_mask))
				return 1;
			state = 3;
		case 3:
			shrink_dcache_memory(i, gfp_mask);
			state = 0;
		i--;
		} while ((i - stop) >= 0);
	}
	return 0;
}
/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message). It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
	int i;
	char *revision = "$Revision: 1.5 $", *s, *e;
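	/*
	 * Pull the version number out of the CVS $Revision$ keyword:
	 * take the text between the ':' and the closing '$'. The
	 * printk below uses "%.*s", which reads the precision from the
	 * int argument i; a precision of -1 is treated as absent, so
	 * the fallback prints the whole string.
	 */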
	if ((s = strchr(revision, ':')) &&
	    (e = strchr(s, '$')))
		s++, i = e - s;
	else
		s = revision, i = -1;
	printk ("Starting kswapd v%.*s\n", i, s);
}
/*
 * The background pageout daemon.
 * Started as a kernel thread from the init process.
 */
int kswapd(void *unused)
{
	struct wait_queue wait = { current, NULL };

	current->session = 1;
	current->pgrp = 1;
	sprintf(current->comm, "kswapd");
	sigfillset(&current->blocked);

	/*
	 * As a kernel thread we want to tamper with system buffers
	 * and other internals and thus be subject to the SMP locking
	 * rules. (On a uniprocessor box this does nothing).
	 */
	lock_kernel();

	/* Give kswapd a realtime priority. */
	current->policy = SCHED_FIFO;
	current->rt_priority = 32; /* Fixme --- we need to standardise our
				      namings for POSIX.4 realtime scheduling
				      priorities. */

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "try_to_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	current->flags |= PF_MEMALLOC;

	init_swap_timer();
	add_wait_queue(&kswapd_wait, &wait);
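	/*
	 * kswapd parks itself on kswapd_wait once and stays there for
	 * its whole lifetime; each loop iteration below just marks the
	 * task TASK_INTERRUPTIBLE and schedules away, and swap_tick()'s
	 * wake_up() makes it runnable again whenever a scan is due.
	 */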
	while (1) {
		int tries;

		current->state = TASK_INTERRUPTIBLE;
		flush_signals(current);
		run_task_queue(&tq_disk);
		schedule();

		/*
		 * Do the background pageout: be
		 * more aggressive if we're really
		 * low on free memory.
		 *
		 * We try pager_daemon.tries_base times, divided by
		 * an 'urgency factor'. In practice this will mean
		 * a value of pager_daemon.tries_base / 8 or 4 = 64
		 * or 128 pages at a time.
		 * This gives us 64 (or 128) * 4k * 4 (times/sec) =
		 * 1 (or 2) MB/s swapping bandwidth in low-priority
		 * background paging. This number rises to 8 MB/s
		 * when the priority is highest (but then we'll be
		 * woken up more often and the rate will be even
		 * higher).
		 */
		tries = pager_daemon.tries_base;
		tries >>= 4*free_memory_available();
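		/*
		 * free_memory_available() reports how comfortable the
		 * free lists are, so the shift above drops the effort
		 * to 1/16th when memory is merely low and to 1/256th
		 * when it is plentiful; only a really tight situation
		 * gets the full tries_base worth of attempts.
		 */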
		do {
			do_try_to_free_page(0);
			/*
			 * Syncing large chunks is faster than swapping
			 * synchronously (less head movement). -- Rik.
			 */
			if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
				run_task_queue(&tq_disk);
			if (free_memory_available() > 1)
				break;
		} while (--tries > 0);
	}
	/* As if we could ever get here - maybe we want to make this killable */
	remove_wait_queue(&kswapd_wait, &wait);
	unlock_kernel();
	return 0;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * The "PF_MEMALLOC" flag protects us against recursion:
 * if we need more memory as part of a swap-out effort we
 * will just silently return "success" to tell the page
 * allocator to accept the allocation.
 */
int try_to_free_pages(unsigned int gfp_mask, int count)
{
	int retval = 1;

	lock_kernel();
	if (!(current->flags & PF_MEMALLOC)) {
		current->flags |= PF_MEMALLOC;
		do {
			retval = do_try_to_free_page(gfp_mask);
			if (!retval)
				break;
			count--;
		} while (count > 0);
		current->flags &= ~PF_MEMALLOC;
	}
	unlock_kernel();
	return retval;
}
/*
 * The swap_tick function gets called on every clock tick.
 */
void swap_tick(void)
{
	unsigned long now, want;
	int want_wakeup = 0;

	want = next_swap_jiffies;
	now = jiffies;

	/*
	 * Examine the memory queues. Mark memory low
	 * if there is nothing available in the three
	 * highest queues.
	 *
	 * Schedule for wakeup if there isn't lots
	 * of free memory.
	 */
	switch (free_memory_available()) {
	case 0:
		want = now;
		/* Fall through */
	case 1:
		want_wakeup = 1;
	default:
		break;
	}
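	/*
	 * The signed subtraction below is the usual jiffies trick for
	 * "now >= want": it stays correct across the wrap of the
	 * unsigned tick counter, as long as the two values are less
	 * than half the counter range apart.
	 */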
	if ((long) (now - want) >= 0) {
		if (want_wakeup
		    || ((num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100)
		    || (num_physpages * page_cache.max_percent < page_cache_size * 100)) {
			/* Set the next wake-up time */
			next_swap_jiffies = now + swapout_interval;
			wake_up(&kswapd_wait);
		}
	}
	timer_active |= (1<<SWAP_TIMER);
}
/*
 * Initialise the swap timer
 */

void init_swap_timer(void)
{
	timer_table[SWAP_TIMER].expires = 0;
	timer_table[SWAP_TIMER].fn = swap_tick;
	timer_active |= (1<<SWAP_TIMER);
}