/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int onlist;

	pte = *page_table;
	if (!pte_present(pte))
		goto out_failed;
	page = pte_page(pte);
	if ((!VALID_PAGE(page)) || PageReserved(page))
		goto out_failed;

	if (mm->swap_cnt)
		mm->swap_cnt--;

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {
		age_page_up(page);
		goto out_failed;
	}
	if (!onlist)
		/* The page is still mapped, so it can't be freeable... */
		age_page_down_ageonly(page);

	/*
	 * If the page is in active use by us, or if the page
	 * is in active use by others, don't unmap it or
	 * (worse) start unneeded IO.
	 */
	if (page->age > 0)
		goto out_failed;

	if (TryLockPage(page))
		goto out_failed;

	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	pte = ptep_get_and_clear(page_table);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		if (pte_dirty(pte))
			set_page_dirty(page);
set_swap_pte:
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		UnlockPage(page);
		mm->rss--;
		flush_tlb_page(vma, address);
		deactivate_page(page);
		page_cache_release(page);
out_failed:
		return 0;
	}

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	flush_cache_page(vma, address);
	if (!pte_dirty(pte))
		goto drop_pte;

	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 */
	if (page->mapping) {
		set_page_dirty(page);
		goto drop_pte;
	}

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry.val)
		goto out_unlock_restore; /* No swap space left */

	/* Add it to the swap cache and mark it dirty */
	add_to_swap_cache(page, entry);
	set_page_dirty(page);
	goto set_swap_pte;

out_unlock_restore:
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;
}

/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults this process had recently, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 * swap block search, not a hint of how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	return 0;
}

static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}

static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
		return 0;

	pgdir = pgd_offset(mm, address);

	end = vma->vm_end;
	if (address >= end)
		BUG();
	do {
		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return 0;
}

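/*
 * Scan one address space for pages to unmap, starting at mm->swap_address
 * and stopping when the per-mm swap_cnt budget runs out or we reach the
 * end of the address space.
 */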
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(mm, vma, address, gfp_mask);
			if (result)
				return result;
			if (!mm->swap_cnt)
				goto out_unlock;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}
	/* Reset to 0 when we reach the end of address space */
	mm->swap_address = 0;
	mm->swap_cnt = 0;

out_unlock:
	spin_unlock(&mm->page_table_lock);

	/* We didn't find anything for the process */
	return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
#define SWAP_SHIFT 5
#define SWAP_MIN 8

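/*
 * swap_cnt is refreshed to (rss >> SWAP_SHIFT), with SWAP_MIN as a lower
 * bound, so larger processes get a proportionally larger swap-out budget;
 * the same shift scales the scan counter below up with nr_threads.
 */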
static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
{
	struct task_struct * p;
	int counter;
	int __ret = 0;

	lock_kernel();
	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = (nr_threads << SWAP_SHIFT) >> priority;
	if (counter < 1)
		counter = 1;

	for (; counter >= 0; counter--) {
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;
		int pid = 0;
		int assign = 0;
		int found_task = 0;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)
				continue;
			if (mm->rss <= 0)
				continue;
			/* Skip tasks which haven't slept long enough yet when idle-swapping. */
			if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
					time_after(p->sleep_time + idle_time * HZ, jiffies)))
				continue;
			found_task++;
			/* Refresh swap_cnt? */
			if (assign == 1) {
				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
				if (mm->swap_cnt < SWAP_MIN)
					mm->swap_cnt = SWAP_MIN;
			}
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
				pid = p->pid;
			}
		}
		read_unlock(&tasklist_lock);
		if (!best) {
			if (!assign && found_task > 0) {
				assign = 1;
				goto select;
			}
			goto out;
		} else {
			int ret;

			atomic_inc(&best->mm_count);
			ret = swap_out_mm(best, gfp_mask);
			mmdrop(best);

			__ret = 1;
			goto out;
		}
	}
out:
	unlock_kernel();
	return __ret;
}

/**
 * reclaim_page - reclaims one page from the inactive_clean list
 * @zone: reclaim a page from this zone
 *
 * The pages on the inactive_clean list can be instantly reclaimed.
 * The tests look impressive, but most of the time we'll grab
 * the first page of the list and exit successfully.
 */
struct page * reclaim_page(zone_t * zone)
{
	struct page * page = NULL;
	struct list_head * page_lru;
	int maxscan;

	/*
	 * We only need the pagemap_lru_lock if we don't reclaim the page,
	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
	 * to avoid deadlocks and most of the time we'll succeed anyway.
	 */
	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);
	maxscan = zone->inactive_clean_pages;
	while ((page_lru = zone->inactive_clean_list.prev) !=
			&zone->inactive_clean_list && maxscan--) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveClean(page)) {
			printk("VM: reclaim_page, wrong page on list.\n");
			list_del(page_lru);
			page->zone->inactive_clean_pages--;
			continue;
		}

		/* Page is or was in use? Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/* The page is dirty, or locked, move to inactive_dirty list. */
		if (page->buffers || PageDirty(page) || TryLockPage(page)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_inactive_dirty_list(page);
			continue;
		}

		/* OK, remove the page from the caches. */
		if (PageSwapCache(page)) {
			__delete_from_swap_cache(page);
			goto found_page;
		}

		if (page->mapping) {
			__remove_inode_page(page);
			goto found_page;
		}

		/* We should never ever get here. */
		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
		list_del(page_lru);
		zone->inactive_clean_pages--;
		UnlockPage(page);
	}
	/* Reset page pointer, maybe we encountered an unfreeable page. */
	page = NULL;
	goto out;

found_page:
	del_page_from_inactive_clean_list(page);
	UnlockPage(page);
	page->age = PAGE_AGE_START;
	if (page_count(page) != 1)
		printk("VM: reclaim_page, found page with count %d!\n",
				page_count(page));
out:
	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
	memory_pressure++;
	return page;
}

/**
 * page_launder - clean dirty inactive pages, move to inactive_clean list
 * @gfp_mask: what operations we are allowed to do
 * @sync: should we wait synchronously for the cleaning of pages
 *
 * When this function is called, we are most likely low on free +
 * inactive_clean pages. Since we want to refill those pages as
 * soon as possible, we'll make two loops over the inactive list,
 * one to move the already cleaned pages to the inactive_clean lists
 * and one to (often asynchronously) clean the dirty inactive pages.
 *
 * In situations where kswapd cannot keep up, user processes will
 * end up calling this function. Since the user process needs to
 * have a page before it can continue with its allocation, we'll
 * do synchronous page flushing in that case.
 *
 * This code is heavily inspired by the FreeBSD source code. Thanks
 * go out to Matthew Dillon.
 */
#define MAX_LAUNDER		(4 * (1 << page_cluster))

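/*
 * MAX_LAUNDER bounds how many "out of order" buffer flushes we queue in one
 * laundering pass; once maxlaunder is used up we fall back to no-IO (or, when
 * called synchronously, blocking) buffer freeing below.
 */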
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;
	struct page * page;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

	launder_loop = 0;
	maxlaunder = 0;
	cleaned_pages = 0;

dirty_page_rescan:
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
				maxscan-- > 0) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {
			printk("VM: page_launder, wrong page on list.\n");
			list_del(page_lru);
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;
			continue;
		}

		/* Page is or was in use? Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1) ||
				page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {
			list_del(page_lru);
			list_add(page_lru, &inactive_dirty_list);
			continue;
		}

		/*
		 * Dirty swap-cache page? Write it out if
		 * last copy..
		 */
		if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
			int result;

			if (!writepage)
				goto page_active;

			/* First time through? Move it to the back of the list */
			if (!launder_loop) {
				list_del(page_lru);
				list_add(page_lru, &inactive_dirty_list);
				UnlockPage(page);
				continue;
			}

			/* OK, do a physical asynchronous write to swap. */
			ClearPageDirty(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			result = writepage(page);
			page_cache_release(page);

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
			if (result != 1)
				continue;
			/* writepage refused to do anything */
			set_page_dirty(page);
			goto page_active;
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we either free
		 * the page (in case it was a buffercache only page) or we
		 * move the page to the inactive_clean list.
		 *
		 * On the first round, we should free all previously cleaned
		 * buffer pages
		 */
		if (page->buffers) {
			int wait, clearedbuf;
			int freed_page = 0;
			/*
			 * Since we might be doing disk IO, we have to
			 * drop the spinlock and take an extra reference
			 * on the page so it doesn't go away from under us.
			 */
			del_page_from_inactive_dirty_list(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			/* Will we do (asynchronous) IO? */
			if (launder_loop && maxlaunder == 0 && sync)
				wait = 2;	/* Synchronous IO */
			else if (launder_loop && maxlaunder-- > 0)
				wait = 1;	/* Async IO */
			else
				wait = 0;	/* No IO */

			/* Try to free the page buffers. */
			clearedbuf = try_to_free_buffers(page, wait);

			/*
			 * Re-take the spinlock. Note that we cannot
			 * unlock the page yet since we're still
			 * accessing the page_struct here...
			 */
			spin_lock(&pagemap_lru_lock);

			/* The buffers were not freed. */
			if (!clearedbuf) {
				add_page_to_inactive_dirty_list(page);

			/* The page was only in the buffer cache. */
			} else if (!page->mapping) {
				atomic_dec(&buffermem_pages);
				freed_page = 1;
				cleaned_pages++;

			/* The page has more users besides the cache and us. */
			} else if (page_count(page) > 2) {
				add_page_to_active_list(page);

			/* OK, we "created" a freeable page. */
			} else /* page->mapping && page_count(page) == 2 */ {
				add_page_to_inactive_clean_list(page);
				cleaned_pages++;
			}

			/*
			 * Unlock the page and drop the extra reference.
			 * We can only do it here because we are accessing
			 * the page struct above.
			 */
			UnlockPage(page);
			page_cache_release(page);

			/*
			 * If we're freeing buffer cache pages, stop when
			 * we've got enough free memory.
			 */
			if (freed_page && !free_shortage())
				break;
			continue;
		} else if (page->mapping && !PageDirty(page)) {
			/*
			 * If a page had an extra reference in
			 * deactivate_page(), we will find it here.
			 * Now the page is really freeable, so we
			 * move it to the inactive_clean list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);
			UnlockPage(page);
			cleaned_pages++;
		} else {
page_active:
			/*
			 * OK, we don't know what to do with the page.
			 * It's no use keeping it here, so we move it to
			 * the active list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			UnlockPage(page);
		}
	}
	spin_unlock(&pagemap_lru_lock);

	/*
	 * If we don't have enough free pages, we loop back once
	 * to queue the dirty pages for writeout. When we were called
	 * by a user process (that /needs/ a free page) and we didn't
	 * free anything yet, we wait synchronously on the writeout of
	 * MAX_SYNC_LAUNDER pages.
	 *
	 * We also wake up bdflush, since bdflush should, under most
	 * loads, flush out the dirty pages before we have to wait on
	 * IO.
	 */
	if (can_get_io_locks && !launder_loop && free_shortage()) {
		launder_loop = 1;
		/* If we cleaned pages, never do synchronous IO. */
		if (cleaned_pages)
			sync = 0;
		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;
		/* Kflushd takes care of the rest. */
		wakeup_bdflush(0);
		goto dirty_page_rescan;
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}

/**
 * refill_inactive_scan - scan the active list and find pages to deactivate
 * @priority: the priority at which to scan
 * @oneshot: exit after deactivating one page
 *
 * This function will scan a portion of the active list to find
 * unused pages, those pages will then be moved to the inactive list.
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
	struct list_head * page_lru;
	struct page * page;
	int maxscan, page_active = 0;
	int ret = 0;

	/* Take the lock while messing with the list... */
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_active_pages >> priority;
	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageActive(page)) {
			printk("VM: refill_inactive, wrong page on list.\n");
			list_del(page_lru);
			nr_active_pages--;
			continue;
		}

		/* Do aging on the pages. */
		if (PageTestandClearReferenced(page)) {
			age_page_up_nolock(page);
			page_active = 1;
		} else {
			age_page_down_ageonly(page);
			/*
			 * Since we don't hold a reference on the page
			 * ourselves, we have to do our test a bit more
			 * strictly than deactivate_page(). This is needed
			 * since otherwise the system could hang shuffling
			 * unfreeable pages from the active list to the
			 * inactive_dirty list and back again...
			 *
			 * SUBTLE: we can have buffer pages with count 1.
			 */
			if (page->age == 0 && page_count(page) <=
						(page->buffers ? 2 : 1)) {
				deactivate_page_nolock(page);
				page_active = 0;
			} else {
				page_active = 1;
			}
		}
		/*
		 * If the page is still on the active list, move it
		 * to the other end of the list. Otherwise it was
		 * deactivated by age_page_down and we exit successfully.
		 */
		if (page_active || PageActive(page)) {
			list_del(page_lru);
			list_add(page_lru, &active_list);
		} else {
			ret = 1;
			if (oneshot)
				break;
		}
	}
	spin_unlock(&pagemap_lru_lock);

	return ret;
}

/*
 * Check if there are zones with a severe shortage of free pages,
 * or if all zones have a minor shortage.
 */
int free_shortage(void)
{
	pg_data_t *pgdat = pgdat_list;
	int sum = 0;
	int freeable = nr_free_pages() + nr_inactive_clean_pages();
	int freetarget = freepages.high + inactive_target / 3;

	/* Are we low on free pages globally? */
	if (freeable < freetarget)
		return freetarget - freeable;

	/* If not, are we very low on any particular zone? */
	do {
		int i;
		for (i = 0; i < MAX_NR_ZONES; i++) {
			zone_t *zone = pgdat->node_zones + i;
			if (zone->size && (zone->inactive_clean_pages +
					zone->free_pages < zone->pages_min + 1)) {
				/* + 1 to have overlap with alloc_pages() !! */
				sum += zone->pages_min + 1;
				sum -= zone->free_pages;
				sum -= zone->inactive_clean_pages;
			}
		}
		pgdat = pgdat->node_next;
	} while (pgdat);

	return sum;
}

/*
 * How many inactive pages are we short?
 */
int inactive_shortage(void)
{
	int shortage = 0;

	shortage += freepages.high;
	shortage += inactive_target;
	shortage -= nr_free_pages();
	shortage -= nr_inactive_clean_pages();
	shortage -= nr_inactive_dirty_pages;

	if (shortage > 0)
		return shortage;

	return 0;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * OTOH, if we're a user process (and not kswapd), we
 * really care about latency. In that case we don't try
 * to free too many pages.
 */
static int refill_inactive(unsigned int gfp_mask, int user)
{
	int priority, count, start_count, made_progress;
	unsigned long idle_time;

	count = inactive_shortage() + free_shortage();
	if (user)
		count = (1 << page_cluster);
	start_count = count;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/*
	 * Calculate the minimum time (in seconds) a process must
	 * have slept before we consider it for idle swapping.
	 * This must be the number of seconds it takes to go through
	 * all of the cache. Doing this idle swapping makes the VM
	 * smoother once we start hitting swap.
	 */
	idle_time = atomic_read(&page_cache_size);
	idle_time += atomic_read(&buffermem_pages);
	idle_time /= (inactive_target + 1);

	priority = 6;
	do {
		made_progress = 0;

		if (current->need_resched) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		while (refill_inactive_scan(priority, 1) ||
				swap_out(priority, gfp_mask, idle_time)) {
			made_progress = 1;
			if (--count <= 0)
				goto done;
		}

		/*
		 * Don't be too light on the d/i cache, since
		 * refill_inactive() almost never fails when there's
		 * really plenty of memory free.
		 */
		shrink_dcache_memory(priority, gfp_mask);
		shrink_icache_memory(priority, gfp_mask);

		/*
		 * Then, try to page stuff out..
		 */
		while (swap_out(priority, gfp_mask, 0)) {
			made_progress = 1;
			if (--count <= 0)
				goto done;
		}

		/*
		 * If we either have enough free memory, or if
		 * page_launder() will be able to make enough
		 * free memory, then stop.
		 */
		if (!inactive_shortage() || !free_shortage())
			goto done;

		/*
		 * Only switch to a lower "priority" if we
		 * didn't make any useful progress in the
		 * last loop.
		 */
		if (!made_progress)
			priority--;
	} while (priority >= 0);

	/* Always end on a refill_inactive.., may sleep... */
	while (refill_inactive_scan(0, 1)) {
		if (--count <= 0)
			goto done;
	}

done:
	return (count < start_count);
}

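/*
 * Worker shared by kswapd and try_to_free_pages(): launder dirty inactive
 * pages when free memory is short, then refill the inactive list (and trim
 * the dentry/inode caches), or just reap unused slab memory if there is no
 * shortage left.
 */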
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	int ret = 0;

	/*
	 * If we're low on free pages, move pages from the
	 * inactive_dirty list to the inactive_clean list.
	 *
	 * Usually bdflush will have pre-cleaned the pages
	 * before we get around to moving them to the other
	 * list, so this is a relatively cheap operation.
	 */
	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
			nr_inactive_clean_pages())
		ret += page_launder(gfp_mask, user);

	/*
	 * If needed, we move pages from the active list
	 * to the inactive list. We also "eat" pages from
	 * the inode and dentry cache whenever we do this.
	 */
	if (free_shortage() || inactive_shortage()) {
		shrink_dcache_memory(6, gfp_mask);
		shrink_icache_memory(6, gfp_mask);
		ret += refill_inactive(gfp_mask, user);
	} else {
		/*
		 * Reclaim unused slab cache memory.
		 */
		kmem_cache_reap(gfp_mask);
		ret = 1;
	}

	return ret;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
struct task_struct *kswapd_task;

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);
	kswapd_task = tsk;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		static int recalc = 0;

		/* If needed, try to free some memory. */
		if (inactive_shortage() || free_shortage()) {
			int wait = 0;
			/* Do we need to do some synchronous flushing? */
			if (waitqueue_active(&kswapd_done))
				wait = 1;
			do_try_to_free_pages(GFP_KSWAPD, wait);
		}

		/*
		 * Do some (very minimal) background scanning. This
		 * will scan all pages on the active list once
		 * every minute. This clears old referenced bits
		 * and moves unused pages to the inactive list.
		 */
		refill_inactive_scan(6, 0);

		/* Once a second, recalculate some VM stats. */
		if (time_after(jiffies, recalc + HZ)) {
			recalc = jiffies;
			recalculate_vm_stats();
		}

		/*
		 * Wake up everybody waiting for free memory
		 * and unplug the disk queue.
		 */
		wake_up_all(&kswapd_done);
		run_task_queue(&tq_disk);

		/*
		 * We go to sleep if either the free page shortage
		 * or the inactive page shortage is gone. We do this
		 * because:
		 *   1) we need no more free pages, or
		 *   2) the inactive pages need to be flushed to disk,
		 *      and it wouldn't help to eat CPU time now ...
		 *
		 * We go to sleep for one second, but if it's needed
		 * we'll be woken up earlier...
		 */
		if (!free_shortage() || !inactive_shortage()) {
			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
		/*
		 * If we couldn't free enough memory, we see if it was
		 * due to the system just not having enough memory.
		 * If that is the case, the only solution is to kill
		 * a process (the alternative is eternal deadlock).
		 *
		 * If there still is enough memory around, we just loop
		 * and try to free some more memory...
		 */
		} else if (out_of_memory()) {
			oom_kill();
		}
	}
}

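/*
 * Wake up kswapd. With @block set the caller also goes to sleep on
 * kswapd_done, so it is throttled until kswapd has completed a pass.
 */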
void wakeup_kswapd(int block)
{
	DECLARE_WAITQUEUE(wait, current);

	if (current == kswapd_task)
		return;

	if (!block) {
		if (waitqueue_active(&kswapd_wait))
			wake_up(&kswapd_wait);
		return;
	}

	/*
	 * Kswapd could wake us up before we get a chance
	 * to sleep, so we have to be very careful here to
	 * prevent SMP races...
	 */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&kswapd_done, &wait);

	if (waitqueue_active(&kswapd_wait))
		wake_up(&kswapd_wait);
	schedule();

	remove_wait_queue(&kswapd_done, &wait);
	__set_current_state(TASK_RUNNING);
}

/*
 * Called by non-kswapd processes when they want more
 * memory but are unable to sleep on kswapd because
 * they might be holding some IO locks ...
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int ret = 1;

	if (gfp_mask & __GFP_WAIT) {
		current->flags |= PF_MEMALLOC;
		ret = do_try_to_free_pages(gfp_mask, 1);
		current->flags &= ~PF_MEMALLOC;
	}

	return ret;
}

DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
/*
 * Kreclaimd will move pages from the inactive_clean list to the
 * free list, in order to keep atomic allocations possible under
 * all circumstances. Even when kswapd is blocked on IO.
 */
int kreclaimd(void *unused)
{
	struct task_struct *tsk = current;
	pg_data_t *pgdat;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kreclaimd");
	sigfillset(&tsk->blocked);
	current->flags |= PF_MEMALLOC;

	while (1) {
		/*
		 * We sleep until someone wakes us up from
		 * page_alloc.c::__alloc_pages().
		 */
		interruptible_sleep_on(&kreclaimd_wait);

		/*
		 * Move some pages from the inactive_clean lists to
		 * the free lists, if it is needed.
		 */
		pgdat = pgdat_list;
		do {
			int i;
			for (i = 0; i < MAX_NR_ZONES; i++) {
				zone_t *zone = pgdat->node_zones + i;
				if (!zone->size)
					continue;

				while (zone->free_pages < zone->pages_low) {
					struct page * page;
					page = reclaim_page(zone);
					if (!page)
						break;
					__free_page(page);
				}
			}
			pgdat = pgdat->node_next;
		} while (pgdat);
	}
}

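/*
 * Start the background pageout daemons at boot: one kswapd thread and one
 * kreclaimd thread.
 */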
static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.8\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}

module_init(kswapd_init)