/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that they decreased rss, but the page was shared.
 *
 * NOTE! If they sleep, they *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	if (!pte_present(pte))

	if ((!VALID_PAGE(page)) || PageReserved(page))

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {

	/* The page is still mapped, so it can't be freeable... */
	age_page_down_ageonly(page);

	/*
	 * If the page is in active use by us, or if the page
	 * is in active use by others, don't unmap it or
	 * (worse) start unneeded IO.
	 */
	if (TryLockPage(page))
	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	pte = ptep_get_and_clear(page_table);
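	/*
	 * Note: after the ptep_get_and_clear() above the pte no longer maps
	 * the page.  Below we either re-point it at a swap entry (the swap
	 * cache case, or once a new swap entry has been allocated), or we
	 * put the saved value back at out_unlock_restore when we give up.
	 */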
	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
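		/*
		 * For a swap cache page, page->index holds the swap entry.
		 * swap_duplicate() takes an extra reference on that entry,
		 * so the on-disk slot stays allocated for as long as this
		 * pte points at it.
		 */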
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
		flush_tlb_page(vma, address);
		deactivate_page(page);
		page_cache_release(page);

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	flush_cache_page(vma, address);

	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 */
	set_page_dirty(page);

	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry.val)
		goto out_unlock_restore;	/* No swap space left */

	/* Add it to the swap cache and mark it dirty */
	add_to_swap_cache(page, entry);
	set_page_dirty(page);
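	/*
	 * Bail-out path: we could not (or did not need to) swap the page
	 * out, so the pte that was cleared above is put back exactly as it
	 * was before we return.
	 */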
out_unlock_restore:
	set_pte(page_table, pte);
}

/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults that this process recently had, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 * swap block search, not a hint of how many blocks to swap with
 * each turn.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	unsigned long pmd_end;

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
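	/*
	 * The loop below records in mm->swap_address how far the scan got
	 * before each attempt, so that the next call (see swap_out_mm())
	 * can resume at that point instead of starting from scratch.
	 */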
	do {
		mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
		address += PAGE_SIZE;
	} while (address && (address < end));
}

static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	unsigned long pgd_end;

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address && (address < end));
}

static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
		return 0;

	pgdir = pgd_offset(mm, address);

	do {
		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	} while (address && (address < end));
}
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct * vma;

	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;

	/*
	 * Find the proper vm-area after freezing the vma chain.
	 */
	spin_lock(&mm->page_table_lock);
	vma = find_vma(mm, address);
	if (address < vma->vm_start)
		address = vma->vm_start;

	int result = swap_out_vma(mm, vma, address, gfp_mask);
	address = vma->vm_start;

	/* Reset to 0 when we reach the end of address space */
	mm->swap_address = 0;

	spin_unlock(&mm->page_table_lock);

	/* We didn't find anything for the process */
	return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
{
	struct task_struct * p;
	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out.  If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = (nr_threads << SWAP_SHIFT) >> priority;
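	/*
	 * The priority argument only controls how much work we are willing
	 * to do here: each step down in priority doubles the number of
	 * selection attempts, so an urgent caller (priority 0) may walk the
	 * task list many more times than a background scan at priority 6.
	 */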
	for (; counter >= 0; counter--) {
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;

		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)
				continue;

			/* Skip tasks which haven't slept long enough yet when idle-swapping. */
			if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
					time_after(p->sleep_time + idle_time * HZ, jiffies)))
				continue;

			/* Refresh swap_cnt? */
			mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
			if (mm->swap_cnt < SWAP_MIN)
				mm->swap_cnt = SWAP_MIN;

			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
			}
		}
		read_unlock(&tasklist_lock);

		if (!assign && found_task > 0) {

		atomic_inc(&best->mm_count);
		ret = swap_out_mm(best, gfp_mask);
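		/*
		 * The atomic_inc(&best->mm_count) above pins the selected mm
		 * so it cannot be freed while swap_out_mm() is scanning it;
		 * the reference has to be dropped again once the scan is done.
		 */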
/**
 * reclaim_page -	reclaims one page from the inactive_clean list
 * @zone: reclaim a page from this zone
 *
 * The pages on the inactive_clean list can be instantly reclaimed.
 * The tests look impressive, but most of the time we'll grab
 * the first page of the list and exit successfully.
 */
struct page * reclaim_page(zone_t * zone)
{
	struct page * page = NULL;
	struct list_head * page_lru;

	/*
	 * We only need the pagemap_lru_lock if we don't reclaim the page,
	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
	 * to avoid deadlocks and most of the time we'll succeed anyway.
	 */
	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);
	maxscan = zone->inactive_clean_pages;
	while ((page_lru = zone->inactive_clean_list.prev) !=
			&zone->inactive_clean_list && maxscan--) {
		page = list_entry(page_lru, struct page, lru);
		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveClean(page)) {
			printk("VM: reclaim_page, wrong page on list.\n");
			page->zone->inactive_clean_pages--;

		/* Page is or was in use?  Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_active_list(page);

		/* The page is dirty, or locked, move to inactive_dirty list. */
		if (page->buffers || PageDirty(page) || TryLockPage(page)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_inactive_dirty_list(page);

		/* OK, remove the page from the caches. */
		if (PageSwapCache(page)) {
			__delete_from_swap_cache(page);

		__remove_inode_page(page);

		/* We should never ever get here. */
		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
		zone->inactive_clean_pages--;

		/* Reset page pointer, maybe we encountered an unfreeable page. */
		page = NULL;

	del_page_from_inactive_clean_list(page);
	page->age = PAGE_AGE_START;
	if (page_count(page) != 1)
		printk("VM: reclaim_page, found page with count %d!\n",
			page_count(page));

	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
	return page;
}
/**
 * page_launder - clean dirty inactive pages, move to inactive_clean list
 * @gfp_mask: what operations we are allowed to do
 * @sync: should we wait synchronously for the cleaning of pages
 *
 * When this function is called, we are most likely low on free +
 * inactive_clean pages.  Since we want to refill those pages as
 * soon as possible, we'll make two loops over the inactive list,
 * one to move the already cleaned pages to the inactive_clean lists
 * and one to (often asynchronously) clean the dirty inactive pages.
 *
 * In situations where kswapd cannot keep up, user processes will
 * end up calling this function.  Since the user process needs to
 * have a page before it can continue with its allocation, we'll
 * do synchronous page flushing in that case.
 *
 * This code is heavily inspired by the FreeBSD source code. Thanks
 * go out to Matthew Dillon.
 */
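/*
 * MAX_LAUNDER is an upper bound on the number of "out of order" flushes
 * (pages we start IO on ourselves) done in a single laundering pass; see
 * the use of maxlaunder below.  It scales with the VM's page clustering
 * factor.
 */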
#define MAX_LAUNDER		(4 * (1 << page_cluster))
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

dirty_page_rescan:
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
				maxscan-- > 0) {
		page = list_entry(page_lru, struct page, lru);
		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {
			printk("VM: page_launder, wrong page on list.\n");
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;

		/* Page is or was in use?  Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1) ||
				page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {
			list_add(page_lru, &inactive_dirty_list);

		/*
		 * Dirty swap-cache page? Write it out if
		 * last copy..
		 */
		if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;

			/* First time through? Move it to the back of the list */
			list_add(page_lru, &inactive_dirty_list);
			/* OK, do a physical asynchronous write to swap. */
			ClearPageDirty(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			result = writepage(page);
			page_cache_release(page);
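			/*
			 * The page_cache_get()/page_cache_release() pair around
			 * writepage() holds an extra reference on the page, so
			 * it cannot go away while the lru lock is dropped for
			 * the duration of the write.
			 */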
			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);

			/* writepage refused to do anything */
			set_page_dirty(page);

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we either free
		 * the page (in case it was a buffercache only page) or we
		 * move the page to the inactive_clean list.
		 *
		 * On the first round, we should free all previously cleaned
		 * pages.
		 */
		if (page->buffers) {
			int wait, clearedbuf;

			/*
			 * Since we might be doing disk IO, we have to
			 * drop the spinlock and take an extra reference
			 * on the page so it doesn't go away from under us.
			 */
			del_page_from_inactive_dirty_list(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			/* Will we do (asynchronous) IO? */
			if (launder_loop && maxlaunder == 0 && sync)
				wait = 2;	/* Synchronous IO */
			else if (launder_loop && maxlaunder-- > 0)
				wait = 1;	/* Async IO */
			else
				wait = 0;	/* No IO */
			/* Try to free the page buffers. */
			clearedbuf = try_to_free_buffers(page, wait);
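			/*
			 * try_to_free_buffers() returns true once every buffer
			 * head on the page has been freed; with wait != 0 it
			 * may block on buffer IO, which is why the lru lock was
			 * dropped above.
			 */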
			/*
			 * Re-take the spinlock. Note that we cannot
			 * unlock the page yet since we're still
			 * accessing the page_struct here...
			 */
			spin_lock(&pagemap_lru_lock);

			/* The buffers were not freed. */
			if (!clearedbuf) {
				add_page_to_inactive_dirty_list(page);

			/* The page was only in the buffer cache. */
			} else if (!page->mapping) {
				atomic_dec(&buffermem_pages);

			/* The page has more users besides the cache and us. */
			} else if (page_count(page) > 2) {
				add_page_to_active_list(page);

			/* OK, we "created" a freeable page. */
			} else /* page->mapping && page_count(page) == 2 */ {
				add_page_to_inactive_clean_list(page);
			}

			/*
			 * Unlock the page and drop the extra reference.
			 * We can only do it here because we are still
			 * accessing the page struct above.
			 */
			UnlockPage(page);
			page_cache_release(page);
			/*
			 * If we're freeing buffer cache pages, stop when
			 * we've got enough free memory.
			 */
			if (freed_page && !free_shortage())
				break;

		} else if (page->mapping && !PageDirty(page)) {
			/*
			 * If a page had an extra reference in
			 * deactivate_page(), we will find it here.
			 * Now the page is really freeable, so we
			 * move it to the inactive_clean list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);

		} else {
			/*
			 * OK, we don't know what to do with the page.
			 * It's no use keeping it here, so we move it to
			 * the active list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
		}
	}
	spin_unlock(&pagemap_lru_lock);

	/*
	 * If we don't have enough free pages, we loop back once
	 * to queue the dirty pages for writeout. When we were called
	 * by a user process (that /needs/ a free page) and we didn't
	 * free anything yet, we wait synchronously on the writeout of
	 * MAX_SYNC_LAUNDER pages.
	 *
	 * We also wake up bdflush, since bdflush should, under most
	 * loads, flush out the dirty pages before we have to wait on
	 * IO.
	 */
	if (can_get_io_locks && !launder_loop && free_shortage()) {
		launder_loop = 1;

		/* If we cleaned pages, never do synchronous IO. */

		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;

		/* Kflushd takes care of the rest. */
		goto dirty_page_rescan;
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}
/**
 * refill_inactive_scan - scan the active list and find pages to deactivate
 * @priority: the priority at which to scan
 * @oneshot: exit after deactivating one page
 *
 * This function will scan a portion of the active list to find
 * unused pages, those pages will then be moved to the inactive list.
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
	struct list_head * page_lru;
	int maxscan, page_active = 0;

	/* Take the lock while messing with the list... */
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_active_pages >> priority;
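	/*
	 * maxscan limits how much of the active list a single call will
	 * look at: at the gentlest setting (priority 6) only 1/64th of the
	 * active pages are scanned, while priority 0 allows a sweep of the
	 * whole list.
	 */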
	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageActive(page)) {
			printk("VM: refill_inactive, wrong page on list.\n");

		/* Do aging on the pages. */
		if (PageTestandClearReferenced(page)) {
			age_page_up_nolock(page);
		} else {
			age_page_down_ageonly(page);

			/*
			 * Since we don't hold a reference on the page
			 * ourselves, we have to do our test a bit more
			 * strict than deactivate_page(). This is needed
			 * since otherwise the system could hang shuffling
			 * unfreeable pages from the active list to the
			 * inactive_dirty list and back again...
			 *
			 * SUBTLE: we can have buffer pages with count 1.
			 */
			if (page->age == 0 && page_count(page) <=
						(page->buffers ? 2 : 1)) {
				deactivate_page_nolock(page);
			}
		}

		/*
		 * If the page is still on the active list, move it
		 * to the other end of the list. Otherwise it was
		 * deactivated by age_page_down and we exit successfully.
		 */
		if (page_active || PageActive(page)) {
			list_add(page_lru, &active_list);
		}
	}
	spin_unlock(&pagemap_lru_lock);
/*
 * Check if there are zones with a severe shortage of free pages,
 * or if all zones have a minor shortage.
 */
int free_shortage(void)
{
	pg_data_t *pgdat = pgdat_list;
	int sum = 0;
	int freeable = nr_free_pages() + nr_inactive_clean_pages();
	int freetarget = freepages.high + inactive_target / 3;

	/* Are we low on free pages globally? */
	if (freeable < freetarget)
		return freetarget - freeable;
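	/*
	 * Note that the return value is not just a boolean: it is the number
	 * of pages we are short, so callers like refill_inactive() can use
	 * it directly as a page count.
	 */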
	/* If not, are we very low on any particular zone? */
	for(i = 0; i < MAX_NR_ZONES; i++) {
		zone_t *zone = pgdat->node_zones + i;
		if (zone->size && (zone->inactive_clean_pages +
				zone->free_pages < zone->pages_min+1)) {
			/* + 1 to have overlap with alloc_pages() !! */
			sum += zone->pages_min + 1;
			sum -= zone->free_pages;
			sum -= zone->inactive_clean_pages;
		}
	}
	pgdat = pgdat->node_next;

/*
 * How many inactive pages are we short?
 */
int inactive_shortage(void)
{
	int shortage = 0;

	shortage += freepages.high;
	shortage += inactive_target;
	shortage -= nr_free_pages();
	shortage -= nr_inactive_clean_pages();
	shortage -= nr_inactive_dirty_pages;
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * OTOH, if we're a user process (and not kswapd), we
 * really care about latency. In that case we don't try
 * to free too many pages.
 */
static int refill_inactive(unsigned int gfp_mask, int user)
{
	int priority, count, start_count, made_progress;
	unsigned long idle_time;

	count = inactive_shortage() + free_shortage();
	if (user)
		count = (1 << page_cluster);
	start_count = count;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/*
	 * Calculate the minimum time (in seconds) a process must
	 * have slept before we consider it for idle swapping.
	 * This must be the number of seconds it takes to go through
	 * all of the cache. Doing this idle swapping makes the VM
	 * smoother once we start hitting swap.
	 */
	idle_time = atomic_read(&page_cache_size);
	idle_time += atomic_read(&buffermem_pages);
	idle_time /= (inactive_target + 1);
	do {
		if (current->need_resched) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		while (refill_inactive_scan(priority, 1) ||
				swap_out(priority, gfp_mask, idle_time)) {

		/*
		 * don't be too light against the d/i cache since
		 * refill_inactive() almost never fails when there's
		 * really plenty of memory free.
		 */
		shrink_dcache_memory(priority, gfp_mask);
		shrink_icache_memory(priority, gfp_mask);

		/*
		 * Then, try to page stuff out..
		 */
		while (swap_out(priority, gfp_mask, 0)) {

		/*
		 * If we either have enough free memory, or if
		 * page_launder() will be able to make enough
		 * free memory, then stop.
		 */
		if (!inactive_shortage() || !free_shortage())
			break;

		/*
		 * Only switch to a lower "priority" if we
		 * didn't make any useful progress in the
		 * last loop.
		 */
	} while (priority >= 0);

	/* Always end on a refill_inactive.., may sleep... */
	while (refill_inactive_scan(0, 1)) {

	return (count < start_count);
}
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	int ret = 0;

	/*
	 * If we're low on free pages, move pages from the
	 * inactive_dirty list to the inactive_clean list.
	 *
	 * Usually bdflush will have pre-cleaned the pages
	 * before we get around to moving them to the other
	 * list, so this is a relatively cheap operation.
	 */
	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
			nr_inactive_clean_pages())
		ret += page_launder(gfp_mask, user);

	/*
	 * If needed, we move pages from the active list
	 * to the inactive list. We also "eat" pages from
	 * the inode and dentry cache whenever we do this.
	 */
	if (free_shortage() || inactive_shortage()) {
		shrink_dcache_memory(6, gfp_mask);
		shrink_icache_memory(6, gfp_mask);
		ret += refill_inactive(gfp_mask, user);
	}

	/*
	 * Reclaim unused slab cache memory.
	 */
	kmem_cache_reap(gfp_mask);

	return ret;
}
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
struct task_struct *kswapd_task;

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
	for (;;) {
		static int recalc = 0;

		/* If needed, try to free some memory. */
		if (inactive_shortage() || free_shortage()) {
			int wait = 0;
			/* Do we need to do some synchronous flushing? */
			if (waitqueue_active(&kswapd_done))
				wait = 1;
			do_try_to_free_pages(GFP_KSWAPD, wait);
		}

		/*
		 * Do some (very minimal) background scanning. This
		 * will scan all pages on the active list once
		 * every minute. This clears old referenced bits
		 * and moves unused pages to the inactive list.
		 */
		refill_inactive_scan(6, 0);

		/* Once a second, recalculate some VM stats. */
		if (time_after(jiffies, recalc + HZ)) {
			recalc = jiffies;
			recalculate_vm_stats();
		}

		/*
		 * Wake up everybody waiting for free memory
		 * and unplug the disk queue.
		 */
		wake_up_all(&kswapd_done);
		run_task_queue(&tq_disk);
		/*
		 * We go to sleep if either the free page shortage
		 * or the inactive page shortage is gone. We do this
		 * because:
		 * 1) we need no more free pages   or
		 * 2) the inactive pages need to be flushed to disk,
		 *    it wouldn't help to eat CPU time now ...
		 *
		 * We go to sleep for one second, but if it's needed
		 * we'll be woken up earlier...
		 */
		if (!free_shortage() || !inactive_shortage()) {
			interruptible_sleep_on_timeout(&kswapd_wait, HZ);

		/*
		 * If we couldn't free enough memory, we see if it was
		 * due to the system just not having enough memory.
		 * If that is the case, the only solution is to kill
		 * a process (the alternative is eternal deadlock).
		 *
		 * If there still is enough memory around, we just loop
		 * and try free some more memory...
		 */
		} else if (out_of_memory()) {
void wakeup_kswapd(int block)
{
	DECLARE_WAITQUEUE(wait, current);

	if (current == kswapd_task)
		return;

	if (waitqueue_active(&kswapd_wait))
		wake_up(&kswapd_wait);

	/*
	 * Kswapd could wake us up before we get a chance
	 * to sleep, so we have to be very careful here to
	 * prevent SMP races...
	 */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&kswapd_done, &wait);

	if (waitqueue_active(&kswapd_wait))
		wake_up(&kswapd_wait);
	schedule();

	remove_wait_queue(&kswapd_done, &wait);
	__set_current_state(TASK_RUNNING);
}
/*
 * Called by non-kswapd processes when they want more
 * memory but are unable to sleep on kswapd because
 * they might be holding some IO locks ...
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int ret;

	if (gfp_mask & __GFP_WAIT) {
		current->flags |= PF_MEMALLOC;
		ret = do_try_to_free_pages(gfp_mask, 1);
		current->flags &= ~PF_MEMALLOC;
	}

	return ret;
}
DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
/*
 * Kreclaimd will move pages from the inactive_clean list to the
 * free list, in order to keep atomic allocations possible under
 * all circumstances. Even when kswapd is blocked on IO.
 */
int kreclaimd(void *unused)
{
	struct task_struct *tsk = current;

	strcpy(tsk->comm, "kreclaimd");
	sigfillset(&tsk->blocked);
	current->flags |= PF_MEMALLOC;

	for (;;) {
		/*
		 * We sleep until someone wakes us up from
		 * page_alloc.c::__alloc_pages().
		 */
		interruptible_sleep_on(&kreclaimd_wait);

		/*
		 * Move some pages from the inactive_clean lists to
		 * the free lists, if it is needed.
		 */
		for(i = 0; i < MAX_NR_ZONES; i++) {
			zone_t *zone = pgdat->node_zones + i;

			while (zone->free_pages < zone->pages_low) {
				page = reclaim_page(zone);
				if (!page)
					break;
				__free_page(page);
			}
		}
		pgdat = pgdat->node_next;
	}
}

static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.8\n");
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}

module_init(kswapd_init)