/*
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96  sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 * Multiqueue VM started 5.8.00, Rik van Riel.
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	int (*swapout)(struct page *, struct file *);

	if (!pte_present(pte))

	if ((!VALID_PAGE(page)) || PageReserved(page))

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {

	/* The page is still mapped, so it can't be freeable... */
	age_page_down_ageonly(page);

	/*
	 * If the page is in active use by us, or if the page
	 * is in active use by others, don't unmap it or
	 * (worse) start unneeded IO.
	 */
	if (TryLockPage(page))
	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	pte = ptep_get_and_clear(page_table);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));

		flush_tlb_page(vma, address);
		deactivate_page(page);
		page_cache_release(page);
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it...
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	if (!pte_dirty(pte)) {
		flush_cache_page(vma, address);

	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks.
	 */
	if (!(gfp_mask & __GFP_IO))
		goto out_unlock_restore;
	/*
	 * Don't do any of the expensive stuff if
	 * we're not really interested in this zone.
	 */
	if (page->zone->free_pages + page->zone->inactive_clean_pages
			+ page->zone->inactive_dirty_pages
			> page->zone->pages_high + inactive_target)
		goto out_unlock_restore;
	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we free'd something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in 'page', and just drop the
	 * pte. All the hard work would be done by
	 * refill_inactive().
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
		struct file *file = vma->vm_file;
		if (file) get_file(file);

		flush_tlb_page(vma, address);
		spin_unlock(&mm->page_table_lock);
		error = swapout(page, file);
		if (file) fput(file);

			goto out_unlock_restore;

		deactivate_page(page);
		page_cache_release(page);
		return 1;	/* We released page_table_lock */
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();

		goto out_unlock_restore; /* No swap space left */

	/* Add it to the swap cache and mark it dirty */
	add_to_swap_cache(page, entry);

out_unlock_restore:
	set_pte(page_table, pte);
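
/*
 * Illustrative sketch, not part of the original file: the zone-interest
 * test used above, written out as a stand-alone helper. A zone is left
 * alone once its free + inactive_clean + inactive_dirty pages already
 * exceed pages_high plus the global inactive_target, so the expensive
 * swap-out path is only entered for zones that are actually short.
 * The helper name is made up; the zone_t fields and inactive_target are
 * the ones this file already uses.
 */
static inline int zone_wants_swapout_sketch(zone_t * zone)
{
	unsigned long have = zone->free_pages + zone->inactive_clean_pages +
			     zone->inactive_dirty_pages;
	unsigned long want = zone->pages_high + inactive_target;

	/* Only a zone below its target is worth the expensive work. */
	return have <= want;
}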
/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently,
 * so we won't swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 *       swap block search, not a hint of how many blocks to swap with
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	unsigned long pmd_end;

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;

		mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);

		address += PAGE_SIZE;
	} while (address && (address < end));
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	unsigned long pgd_end;

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))

		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);

		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address && (address < end));
static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))

	pgdir = pgd_offset(mm, address);

		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);

		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	} while (address && (address < end));
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct * vma;

	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and the page tables.
	 */
	spin_lock(&mm->page_table_lock);
	vma = find_vma(mm, address);

		if (address < vma->vm_start)
			address = vma->vm_start;

			int result = swap_out_vma(mm, vma, address, gfp_mask);

			address = vma->vm_start;

	/* Reset to 0 when we reach the end of address space */
	mm->swap_address = 0;

	spin_unlock(&mm->page_table_lock);

	/* We didn't find anything for the process */
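
/*
 * Illustrative sketch, not part of the original file: how the per-mm scan
 * cursor behaves. swap_out_mm() resumes at mm->swap_address, the pte walk
 * advances it one page at a time, and it is reset to 0 once the end of the
 * address space is reached, so successive calls walk the process round-robin.
 * advance_swap_cursor_sketch() and the task_size parameter are made up for
 * the example; mm_struct and PAGE_SIZE are the ones used in this file.
 */
static inline void advance_swap_cursor_sketch(struct mm_struct * mm,
					      unsigned long address,
					      unsigned long task_size)
{
	mm->swap_address = address + PAGE_SIZE;
	if (mm->swap_address >= task_size)
		/* Wrapped around: start from the beginning next time. */
		mm->swap_address = 0;
}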
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
{
	struct task_struct * p;
	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = (nr_threads << SWAP_SHIFT) >> priority;

	for (; counter >= 0; counter--) {
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;

		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)

			/* Skip tasks which haven't slept long enough yet when idle-swapping. */
			if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
					time_after(p->sleep_time + idle_time * HZ, jiffies)))

			/* Refresh swap_cnt? */
				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
			if (mm->swap_cnt < SWAP_MIN)
				mm->swap_cnt = SWAP_MIN;

			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
		}
		read_unlock(&tasklist_lock);

		if (!assign && found_task > 0) {

		atomic_inc(&best->mm_count);
		ret = swap_out_mm(best, gfp_mask);

			kill_proc(pid, SIGBUS, 1);
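
/*
 * Illustrative sketch, not part of the original file: the "shadow rss"
 * selection described in the comment above, with the tasklist locking and
 * the idle-swapping test left out. Pass 1 picks the mm with the largest
 * remaining swap_cnt; if nothing is left, pass 2 refreshes swap_cnt from
 * rss (never below SWAP_MIN) and selects again. The helper name and the
 * array argument are made up; swap_cnt, rss, SWAP_SHIFT and SWAP_MIN are
 * the ones used in this file.
 */
static struct mm_struct * pick_victim_mm_sketch(struct mm_struct ** mms, int nr)
{
	int assign, i;

	for (assign = 0; assign <= 1; assign++) {
		unsigned long max_cnt = 0;
		struct mm_struct * best = NULL;

		for (i = 0; i < nr; i++) {
			struct mm_struct * mm = mms[i];

			if (assign == 1) {
				/* Pass 2: refresh the shadow rss. */
				mm->swap_cnt = mm->rss >> SWAP_SHIFT;
				if (mm->swap_cnt < SWAP_MIN)
					mm->swap_cnt = SWAP_MIN;
			}
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
			}
		}
		if (best)
			return best;	/* largest "shadow rss" wins */
	}
	return NULL;
}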
/**
 * reclaim_page - reclaims one page from the inactive_clean list
 * @zone: reclaim a page from this zone
 *
 * The pages on the inactive_clean list can be instantly reclaimed.
 * The tests look impressive, but most of the time we'll grab
 * the first page of the list and exit successfully.
 */
struct page * reclaim_page(zone_t * zone)
{
	struct page * page = NULL;
	struct list_head * page_lru;

	/*
	 * We only need the pagemap_lru_lock if we don't reclaim the page,
	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
	 * to avoid deadlocks and most of the time we'll succeed anyway.
	 */
	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);
	maxscan = zone->inactive_clean_pages;
	while ((page_lru = zone->inactive_clean_list.prev) !=
			&zone->inactive_clean_list && maxscan--) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveClean(page)) {
			printk("VM: reclaim_page, wrong page on list.\n");
			page->zone->inactive_clean_pages--;

		/* Page is or was in use? Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_active_list(page);

		/* The page is dirty, or locked, move to inactive_dirty list. */
		if (page->buffers || TryLockPage(page)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_inactive_dirty_list(page);

		/* OK, remove the page from the caches. */
		if (PageSwapCache(page)) {
			__delete_from_swap_cache(page);

			__remove_inode_page(page);

		/* We should never ever get here. */
		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");

		zone->inactive_clean_pages--;

	/* Reset page pointer, maybe we encountered an unfreeable page. */

	del_page_from_inactive_clean_list(page);

	page->age = PAGE_AGE_START;
	if (page_count(page) != 1)
		printk("VM: reclaim_page, found page with count %d!\n",

	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
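
/*
 * Illustrative sketch, not part of the original file: the tests the loop
 * above applies before a page on the inactive_clean list is handed back.
 * A page is only reclaimed if it was not referenced, has aged down to
 * zero, carries no buffers, is not already locked, and has no users left
 * besides the page cache. As in the loop above, TryLockPage() leaves the
 * page locked on the success path. The helper name is made up.
 */
static inline int page_is_reclaimable_sketch(struct page * page)
{
	if (PageTestandClearReferenced(page) || page->age > 0)
		return 0;	/* recently used: belongs on the active list */
	if (page->buffers || TryLockPage(page))
		return 0;	/* has buffers or is busy: inactive_dirty list */
	if (page_count(page) > 1)
		return 0;	/* someone else still holds a reference */
	return 1;
}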
/**
 * page_launder - clean dirty inactive pages, move to inactive_clean list
 * @gfp_mask: what operations we are allowed to do
 * @sync: should we wait synchronously for the cleaning of pages
 *
 * When this function is called, we are most likely low on free +
 * inactive_clean pages. Since we want to refill those pages as
 * soon as possible, we'll make two loops over the inactive list,
 * one to move the already cleaned pages to the inactive_clean lists
 * and one to (often asynchronously) clean the dirty inactive pages.
 *
 * In situations where kswapd cannot keep up, user processes will
 * end up calling this function. Since the user process needs to
 * have a page before it can continue with its allocation, we'll
 * do synchronous page flushing in that case.
 *
 * This code is heavily inspired by the FreeBSD source code. Thanks
 * go out to Matthew Dillon.
 */
#define MAX_LAUNDER		(4 * (1 << page_cluster))
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

	spin_lock(&pagemap_lru_lock);
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&

		page = list_entry(page_lru, struct page, lru);
		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {
			printk("VM: page_launder, wrong page on list.\n");
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;

		/* Page is or was in use? Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1) ||
				page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {
			list_add(page_lru, &inactive_dirty_list);
		/*
		 * Dirty swap-cache page? Write it out if
		 */
		if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;

			/* Can't start IO? Move it to the back of the list */
			if (!can_get_io_locks) {
				list_add(page_lru, &inactive_dirty_list);

			/* OK, do a physical asynchronous write to swap. */
			ClearPageDirty(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			page_cache_release(page);

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we either free
		 * the page (in case it was a buffercache only page) or we
		 * move the page to the inactive_clean list.
		 *
		 * On the first round, we should free all previously cleaned
		 */
			int wait, clearedbuf;
			/*
			 * Since we might be doing disk IO, we have to
			 * drop the spinlock and take an extra reference
			 * on the page so it doesn't go away from under us.
			 */
			del_page_from_inactive_dirty_list(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			/* Will we do (asynchronous) IO? */
			if (launder_loop && maxlaunder == 0 && sync)
				wait = 2;	/* Synchronous IO */
			else if (launder_loop && maxlaunder-- > 0)
				wait = 1;	/* Async IO */
			else
				wait = 0;	/* No IO */

			/* Try to free the page buffers. */
			clearedbuf = try_to_free_buffers(page, wait);
			/*
			 * Re-take the spinlock. Note that we cannot
			 * unlock the page yet since we're still
			 * accessing the page_struct here...
			 */
			spin_lock(&pagemap_lru_lock);

			/* The buffers were not freed. */
				add_page_to_inactive_dirty_list(page);

			/* The page was only in the buffer cache. */
			} else if (!page->mapping) {
				atomic_dec(&buffermem_pages);

			/* The page has more users besides the cache and us. */
			} else if (page_count(page) > 2) {
				add_page_to_active_list(page);

			/* OK, we "created" a freeable page. */
			} else /* page->mapping && page_count(page) == 2 */ {
				add_page_to_inactive_clean_list(page);

			/*
			 * Unlock the page and drop the extra reference.
			 * We can only do it here because we are accessing
			 * the page struct above.
			 */
			page_cache_release(page);
			/*
			 * If we're freeing buffer cache pages, stop when
			 * we've got enough free memory.
			 */
			if (freed_page && !free_shortage())

		} else if (page->mapping && !PageDirty(page)) {
			/*
			 * If a page had an extra reference in
			 * deactivate_page(), we will find it here.
			 * Now the page is really freeable, so we
			 * move it to the inactive_clean list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);

			/*
			 * OK, we don't know what to do with the page.
			 * It's no use keeping it here, so we move it to
			 * the active list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);

	spin_unlock(&pagemap_lru_lock);
	/*
	 * If we don't have enough free pages, we loop back once
	 * to queue the dirty pages for writeout. When we were called
	 * by a user process (that /needs/ a free page) and we didn't
	 * free anything yet, we wait synchronously on the writeout of
	 * MAX_SYNC_LAUNDER pages.
	 *
	 * We also wake up bdflush, since bdflush should, under most
	 * loads, flush out the dirty pages before we have to wait on
	 * IO.
	 */
	if (can_get_io_locks && !launder_loop && free_shortage()) {
		/* If we cleaned pages, never do synchronous IO. */
		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;
		/* Kflushd takes care of the rest. */
		goto dirty_page_rescan;
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}
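
/*
 * Illustrative sketch, not part of the original file: the three IO levels
 * page_launder() passes to try_to_free_buffers() above. On the first scan
 * (launder_loop == 0) no IO is started at all; on the writeout scan a
 * limited budget of pages gets asynchronous IO, and only a caller that
 * asked for synchronous behaviour waits once that budget is used up.
 * The helper name is made up, and decrementing the maxlaunder budget is
 * left to the caller.
 */
static inline int launder_wait_level_sketch(int launder_loop, int maxlaunder, int sync)
{
	if (launder_loop && maxlaunder == 0 && sync)
		return 2;	/* synchronous IO: wait for the buffers */
	if (launder_loop && maxlaunder > 0)
		return 1;	/* asynchronous IO: start it, don't wait */
	return 0;		/* no IO at all */
}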
/**
 * refill_inactive_scan - scan the active list and find pages to deactivate
 * @priority: the priority at which to scan
 * @oneshot: exit after deactivating one page
 *
 * This function will scan a portion of the active list to find
 * unused pages, those pages will then be moved to the inactive list.
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
	struct list_head * page_lru;
	int maxscan, page_active = 0;

	/* Take the lock while messing with the list... */
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_active_pages >> priority;
	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageActive(page)) {
			printk("VM: refill_inactive, wrong page on list.\n");

		/* Do aging on the pages. */
		if (PageTestandClearReferenced(page)) {
			age_page_up_nolock(page);

			age_page_down_ageonly(page);
			/*
			 * Since we don't hold a reference on the page
			 * ourselves, we have to do our test a bit more
			 * strictly than deactivate_page(). This is needed
			 * since otherwise the system could hang shuffling
			 * unfreeable pages from the active list to the
			 * inactive_dirty list and back again...
			 *
			 * SUBTLE: we can have buffer pages with count 1.
			 */
			if (page->age == 0 && page_count(page) <=
						(page->buffers ? 2 : 1)) {
				deactivate_page_nolock(page);
		/*
		 * If the page is still on the active list, move it
		 * to the other end of the list. Otherwise it was
		 * deactivated by age_page_down and we exit successfully.
		 */
		if (page_active || PageActive(page)) {
			list_add(page_lru, &active_list);

	spin_unlock(&pagemap_lru_lock);
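
/*
 * Illustrative sketch, not part of the original file: the deactivation
 * test used by refill_inactive_scan() above. Without holding a reference
 * of our own we only deactivate a page whose age has dropped to zero and
 * whose only remaining user is the page cache; pages with buffer heads
 * are allowed one extra reference for the buffers. The helper name is
 * made up.
 */
static inline int page_is_deactivatable_sketch(struct page * page)
{
	return page->age == 0 &&
	       page_count(page) <= (page->buffers ? 2 : 1);
}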
/*
 * Check if there are zones with a severe shortage of free pages,
 * or if all zones have a minor shortage.
 */
int free_shortage(void)
{
	pg_data_t *pgdat = pgdat_list;

	int freeable = nr_free_pages() + nr_inactive_clean_pages();
	int freetarget = freepages.high + inactive_target / 3;

	/* Are we low on free pages globally? */
	if (freeable < freetarget)
		return freetarget - freeable;
	/* If not, are we very low on any particular zone? */
		for(i = 0; i < MAX_NR_ZONES; i++) {
			zone_t *zone = pgdat->node_zones + i;
			if (zone->size && (zone->inactive_clean_pages +
					zone->free_pages < zone->pages_min+1)) {
				/* + 1 to have overlap with alloc_pages() !! */
				sum += zone->pages_min + 1;
				sum -= zone->free_pages;
				sum -= zone->inactive_clean_pages;

		pgdat = pgdat->node_next;
/*
 * How many inactive pages are we short?
 */
int inactive_shortage(void)
{
	shortage += freepages.high;
	shortage += inactive_target;
	shortage -= nr_free_pages();
	shortage -= nr_inactive_clean_pages();
	shortage -= nr_inactive_dirty_pages;
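
/*
 * Illustrative sketch, not part of the original file: the arithmetic of
 * inactive_shortage() above as a single expression. The target is
 * freepages.high + inactive_target pages; everything that is already
 * free, inactive clean or inactive dirty counts against it, and a result
 * of zero or less means there is no shortage. All counters are the ones
 * used in this file; only the helper name is made up.
 */
static inline int inactive_shortage_sketch(void)
{
	int shortage = freepages.high + inactive_target
			- nr_free_pages()
			- nr_inactive_clean_pages()
			- nr_inactive_dirty_pages;

	return shortage > 0 ? shortage : 0;
}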
/*
 * We need to make the locks finer grained, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * OTOH, if we're a user process (and not kswapd), we
 * really care about latency. In that case we don't try
 * to free too many pages.
 */
static int refill_inactive(unsigned int gfp_mask, int user)
{
	int priority, count, start_count, made_progress;
	unsigned long idle_time;

	count = inactive_shortage() + free_shortage();
		count = (1 << page_cluster);

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/*
	 * Calculate the minimum time (in seconds) a process must
	 * have slept before we consider it for idle swapping.
	 * This must be the number of seconds it takes to go through
	 * all of the cache. Doing this idle swapping makes the VM
	 * smoother once we start hitting swap.
	 */
	idle_time = atomic_read(&page_cache_size);
	idle_time += atomic_read(&buffermem_pages);
	idle_time /= (inactive_target + 1);
		if (current->need_resched) {
			__set_current_state(TASK_RUNNING);

		while (refill_inactive_scan(priority, 1) ||
				swap_out(priority, gfp_mask, idle_time)) {

		/*
		 * don't be too light against the d/i cache since
		 * refill_inactive() almost never fails when there's
		 * really plenty of memory free.
		 */
		shrink_dcache_memory(priority, gfp_mask);
		shrink_icache_memory(priority, gfp_mask);

		/*
		 * Then, try to page stuff out..
		 */
		while (swap_out(priority, gfp_mask, 0)) {

		/*
		 * If we either have enough free memory, or if
		 * page_launder() will be able to make enough
		 * free memory, then stop.
		 */
		if (!inactive_shortage() || !free_shortage())

		/*
		 * Only switch to a lower "priority" if we
		 * didn't make any useful progress in the
		 * last loop.
		 */
	} while (priority >= 0);

	/* Always end on a refill_inactive.., may sleep... */
	while (refill_inactive_scan(0, 1)) {

	return (count < start_count);
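
/*
 * Illustrative sketch, not part of the original file: the idle-swapping
 * threshold computed in refill_inactive() above. Dividing the number of
 * cached pages (page cache plus buffer cache) by inactive_target gives
 * roughly the number of seconds it takes to go through all of the cache,
 * and a task must have slept at least that long before idle swapping
 * touches it. Only the helper name is made up.
 */
static inline unsigned long idle_swap_threshold_sketch(void)
{
	unsigned long cached = atomic_read(&page_cache_size) +
			       atomic_read(&buffermem_pages);

	/* +1 avoids a division by zero when there is no inactive target. */
	return cached / (inactive_target + 1);
}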
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	/*
	 * If we're low on free pages, move pages from the
	 * inactive_dirty list to the inactive_clean list.
	 *
	 * Usually bdflush will have pre-cleaned the pages
	 * before we get around to moving them to the other
	 * list, so this is a relatively cheap operation.
	 */
	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
			nr_inactive_clean_pages())
		ret += page_launder(gfp_mask, user);

	/*
	 * If needed, we move pages from the active list
	 * to the inactive list. We also "eat" pages from
	 * the inode and dentry cache whenever we do this.
	 */
	if (free_shortage() || inactive_shortage()) {
		shrink_dcache_memory(6, gfp_mask);
		shrink_icache_memory(6, gfp_mask);
		ret += refill_inactive(gfp_mask, user);
	}

	/*
	 * Reclaim unused slab cache memory.
	 */
	kmem_cache_reap(gfp_mask);
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
struct task_struct *kswapd_task;
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
		static int recalc = 0;

		/* If needed, try to free some memory. */
		if (inactive_shortage() || free_shortage()) {

			/* Do we need to do some synchronous flushing? */
			if (waitqueue_active(&kswapd_done))

			do_try_to_free_pages(GFP_KSWAPD, wait);

		/*
		 * Do some (very minimal) background scanning. This
		 * will scan all pages on the active list once
		 * every minute. This clears old referenced bits
		 * and moves unused pages to the inactive list.
		 */
		refill_inactive_scan(6, 0);

		/* Once a second, recalculate some VM stats. */
		if (time_after(jiffies, recalc + HZ)) {

			recalculate_vm_stats();

		/*
		 * Wake up everybody waiting for free memory
		 * and unplug the disk queue.
		 */
		wake_up_all(&kswapd_done);
		run_task_queue(&tq_disk);
		/*
		 * We go to sleep if either the free page shortage
		 * or the inactive page shortage is gone. We do this
		 *
		 * 1) we need no more free pages   or
		 * 2) the inactive pages need to be flushed to disk,
		 *    it wouldn't help to eat CPU time now ...
		 *
		 * We go to sleep for one second, but if it's needed
		 * we'll be woken up earlier...
		 */
		if (!free_shortage() || !inactive_shortage()) {
			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
		/*
		 * If we couldn't free enough memory, we see if it was
		 * due to the system just not having enough memory.
		 * If that is the case, the only solution is to kill
		 * a process (the alternative is eternal deadlock).
		 *
		 * If there still is enough memory around, we just loop
		 * and try to free some more memory...
		 */
		} else if (out_of_memory()) {
void wakeup_kswapd(int block)
{
	DECLARE_WAITQUEUE(wait, current);

	if (current == kswapd_task)

	if (waitqueue_active(&kswapd_wait))
		wake_up(&kswapd_wait);

	/*
	 * Kswapd could wake us up before we get a chance
	 * to sleep, so we have to be very careful here to
	 * prevent SMP races...
	 */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&kswapd_done, &wait);

	if (waitqueue_active(&kswapd_wait))
		wake_up(&kswapd_wait);

	remove_wait_queue(&kswapd_done, &wait);
	__set_current_state(TASK_RUNNING);
}
/*
 * Called by non-kswapd processes when they want more
 * memory but are unable to sleep on kswapd because
 * they might be holding some IO locks ...
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	if (gfp_mask & __GFP_WAIT) {
		current->flags |= PF_MEMALLOC;
		ret = do_try_to_free_pages(gfp_mask, 1);
		current->flags &= ~PF_MEMALLOC;
	}
DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
/*
 * Kreclaimd will move pages from the inactive_clean list to the
 * free list, in order to keep atomic allocations possible under
 * all circumstances. Even when kswapd is blocked on IO.
 */
int kreclaimd(void *unused)
{
	struct task_struct *tsk = current;

	strcpy(tsk->comm, "kreclaimd");
	sigfillset(&tsk->blocked);
	current->flags |= PF_MEMALLOC;

		/*
		 * We sleep until someone wakes us up from
		 * page_alloc.c::__alloc_pages().
		 */
		interruptible_sleep_on(&kreclaimd_wait);

		/*
		 * Move some pages from the inactive_clean lists to
		 * the free lists, if it is needed.
		 */
			for(i = 0; i < MAX_NR_ZONES; i++) {
				zone_t *zone = pgdat->node_zones + i;

				while (zone->free_pages < zone->pages_low) {

					page = reclaim_page(zone);

			pgdat = pgdat->node_next;
static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.8\n");

	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);

module_init(kswapd_init)