[davej-history.git] / mm / vmscan.c
1 /*
2 * linux/mm/vmscan.c
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
11 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
12 * Multiqueue VM started 5.8.00, Rik van Riel.
15 #include <linux/slab.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/swap.h>
18 #include <linux/swapctl.h>
19 #include <linux/smp_lock.h>
20 #include <linux/pagemap.h>
21 #include <linux/init.h>
22 #include <linux/highmem.h>
23 #include <linux/file.h>
25 #include <asm/pgalloc.h>
28 * The swap-out functions return 1 if they successfully
29  * threw something out, and we got a free page. They return
30  * zero if they couldn't do anything, and any other value
31  * indicates they decreased rss, but the page was shared.
33 * NOTE! If it sleeps, it *must* return 1 to make sure we
34 * don't continue with the swap-out. Otherwise we may be
35 * using a process that no longer actually exists (it might
36 * have died while we slept).
38 static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
40 pte_t pte;
41 swp_entry_t entry;
42 struct page * page;
43 int (*swapout)(struct page *, struct file *);
44 int onlist;
46 pte = *page_table;
47 if (!pte_present(pte))
48 goto out_failed;
49 page = pte_page(pte);
50 if ((!VALID_PAGE(page)) || PageReserved(page))
51 goto out_failed;
53 if (mm->swap_cnt)
54 mm->swap_cnt--;
56 onlist = PageActive(page);
57 /* Don't look at this pte if it's been accessed recently. */
58 if (ptep_test_and_clear_young(page_table)) {
59 age_page_up(page);
60 goto out_failed;
62 if (!onlist)
63 /* The page is still mapped, so it can't be freeable... */
64 age_page_down_ageonly(page);
67 * If the page is in active use by us, or if the page
68 * is in active use by others, don't unmap it or
69 * (worse) start unneeded IO.
71 if (page->age > 0)
72 goto out_failed;
74 if (TryLockPage(page))
75 goto out_failed;
77 /* From this point on, the odds are that we're going to
78 * nuke this pte, so read and clear the pte. This hook
79 * is needed on CPUs which update the accessed and dirty
80 * bits in hardware.
82 pte = ptep_get_and_clear(page_table);
85 * Is the page already in the swap cache? If so, then
86 * we can just drop our reference to it without doing
87 * any IO - it's already up-to-date on disk.
89 * Return 0, as we didn't actually free any real
90 * memory, and we should just continue our scan.
92 if (PageSwapCache(page)) {
93 entry.val = page->index;
94 if (pte_dirty(pte))
95 SetPageDirty(page);
96 set_swap_pte:
97 swap_duplicate(entry);
98 set_pte(page_table, swp_entry_to_pte(entry));
99 drop_pte:
100 UnlockPage(page);
101 mm->rss--;
102 flush_tlb_page(vma, address);
103 deactivate_page(page);
104 page_cache_release(page);
105 out_failed:
106 return 0;
110 * Is it a clean page? Then it must be recoverable
111 * by just paging it in again, and we can just drop
112 * it..
114 * However, this won't actually free any real
115 * memory, as the page will just be in the page cache
116 * somewhere, and as such we should just continue
117 * our scan.
119 * Basically, this just makes it possible for us to do
120 * some real work in the future in "refill_inactive()".
122 if (!pte_dirty(pte)) {
123 flush_cache_page(vma, address);
124 goto drop_pte;
128 * Don't go down into the swap-out stuff if
129 * we cannot do I/O! Avoid recursing on FS
130 * locks etc.
132 if (!(gfp_mask & __GFP_IO))
133 goto out_unlock_restore;
136 * Don't do any of the expensive stuff if
137 * we're not really interested in this zone.
139 if (page->zone->free_pages + page->zone->inactive_clean_pages
140 + page->zone->inactive_dirty_pages
141 > page->zone->pages_high + inactive_target)
142 goto out_unlock_restore;
145 * Ok, it's really dirty. That means that
146 * we should either create a new swap cache
147 * entry for it, or we should write it back
148 * to its own backing store.
150 * Note that in neither case do we actually
151 * know that we make a page available, but
152 * as we potentially sleep we can no longer
153  * continue scanning, so we might as well
154 * assume we free'd something.
156 * NOTE NOTE NOTE! This should just set a
157 * dirty bit in 'page', and just drop the
158 * pte. All the hard work would be done by
159 * refill_inactive().
161 * That would get rid of a lot of problems.
163 flush_cache_page(vma, address);
164 if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
165 int error;
166 struct file *file = vma->vm_file;
167 if (file) get_file(file);
169 mm->rss--;
170 flush_tlb_page(vma, address);
171 spin_unlock(&mm->page_table_lock);
172 error = swapout(page, file);
173 if (file) fput(file);
174 if (error < 0)
175 goto out_unlock_restore;
176 UnlockPage(page);
177 deactivate_page(page);
178 page_cache_release(page);
179 return 1; /* We released page_table_lock */
183 * This is a dirty, swappable page. First of all,
184 * get a suitable swap entry for it, and make sure
185 * we have the swap cache set up to associate the
186 * page with that swap entry.
188 entry = get_swap_page();
189 if (!entry.val)
190 goto out_unlock_restore; /* No swap space left */
192 /* Add it to the swap cache and mark it dirty */
193 add_to_swap_cache(page, entry);
194 SetPageDirty(page);
195 goto set_swap_pte;
197 out_unlock_restore:
198 set_pte(page_table, pte);
199 UnlockPage(page);
200 return 0;
204 * A new implementation of swap_out(). We do not swap complete processes,
205 * but only a small number of blocks, before we continue with the next
206  * process. The number of blocks actually swapped is determined by the
207  * number of page faults this process has recently had,
208  * so we won't swap heavily used processes all the time ...
210  * Note: the priority argument is a hint on how much CPU to waste on the
211  * swap block search, not a hint of how many blocks to swap with
212  * each process.
214 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
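/*
 * Overview of the swap-out walk: swap_out_mm() resumes at
 * mm->swap_address, swap_out_vma() skips VM_LOCKED and VM_RESERVED
 * areas, and swap_out_pgd()/swap_out_pmd() walk the page tables down
 * to the individual ptes, each of which is handed to try_to_swap_out().
 * The walk stops early when mm->swap_cnt reaches zero or when a pte
 * was successfully unmapped (try_to_swap_out() returned nonzero).
 */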
217 static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
219 pte_t * pte;
220 unsigned long pmd_end;
222 if (pmd_none(*dir))
223 return 0;
224 if (pmd_bad(*dir)) {
225 pmd_ERROR(*dir);
226 pmd_clear(dir);
227 return 0;
230 pte = pte_offset(dir, address);
232 pmd_end = (address + PMD_SIZE) & PMD_MASK;
233 if (end > pmd_end)
234 end = pmd_end;
236 do {
237 int result;
238 mm->swap_address = address + PAGE_SIZE;
239 result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
240 if (result)
241 return result;
242 if (!mm->swap_cnt)
243 return 0;
244 address += PAGE_SIZE;
245 pte++;
246 } while (address && (address < end));
247 return 0;
250 static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
252 pmd_t * pmd;
253 unsigned long pgd_end;
255 if (pgd_none(*dir))
256 return 0;
257 if (pgd_bad(*dir)) {
258 pgd_ERROR(*dir);
259 pgd_clear(dir);
260 return 0;
263 pmd = pmd_offset(dir, address);
265 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
266 if (pgd_end && (end > pgd_end))
267 end = pgd_end;
269 do {
270 int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
271 if (result)
272 return result;
273 if (!mm->swap_cnt)
274 return 0;
275 address = (address + PMD_SIZE) & PMD_MASK;
276 pmd++;
277 } while (address && (address < end));
278 return 0;
281 static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
283 pgd_t *pgdir;
284 unsigned long end;
286 /* Don't swap out areas which are locked down */
287 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
288 return 0;
290 pgdir = pgd_offset(mm, address);
292 end = vma->vm_end;
293 if (address >= end)
294 BUG();
295 do {
296 int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
297 if (result)
298 return result;
299 if (!mm->swap_cnt)
300 return 0;
301 address = (address + PGDIR_SIZE) & PGDIR_MASK;
302 pgdir++;
303 } while (address && (address < end));
304 return 0;
307 static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
309 unsigned long address;
310 struct vm_area_struct* vma;
313 * Go through process' page directory.
315 address = mm->swap_address;
318 * Find the proper vm-area after freezing the vma chain
319 * and ptes.
321 spin_lock(&mm->page_table_lock);
322 vma = find_vma(mm, address);
323 if (vma) {
324 if (address < vma->vm_start)
325 address = vma->vm_start;
327 for (;;) {
328 int result = swap_out_vma(mm, vma, address, gfp_mask);
329 if (result)
330 return result;
331 if (!mm->swap_cnt)
332 goto out_unlock;
333 vma = vma->vm_next;
334 if (!vma)
335 break;
336 address = vma->vm_start;
339 /* Reset to 0 when we reach the end of address space */
340 mm->swap_address = 0;
341 mm->swap_cnt = 0;
343 out_unlock:
344 spin_unlock(&mm->page_table_lock);
346 /* We didn't find anything for the process */
347 return 0;
351 * Select the task with maximal swap_cnt and try to swap out a page.
352 * N.B. This function returns only 0 or 1. Return values != 1 from
353 * the lower level routines result in continued processing.
355 #define SWAP_SHIFT 5
356 #define SWAP_MIN 8
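/*
 * SWAP_SHIFT and SWAP_MIN tune how hard we push on a single mm:
 * swap_out() refreshes mm->swap_cnt to rss >> SWAP_SHIFT (1/32nd of
 * the resident set), never less than SWAP_MIN pages, and makes on the
 * order of (nr_threads << SWAP_SHIFT) >> priority passes over the
 * task list.  As an illustration (numbers purely for example),
 * 100 threads at priority 6 gives (100 * 32) / 64 = 50 passes.
 */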
358 static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
360 struct task_struct * p;
361 int counter;
362 int __ret = 0;
364 lock_kernel();
366 * We make one or two passes through the task list, indexed by
367 * assign = {0, 1}:
368 * Pass 1: select the swappable task with maximal RSS that has
369 * not yet been swapped out.
370 * Pass 2: re-assign rss swap_cnt values, then select as above.
372 * With this approach, there's no need to remember the last task
373 * swapped out. If the swap-out fails, we clear swap_cnt so the
374 * task won't be selected again until all others have been tried.
376 * Think of swap_cnt as a "shadow rss" - it tells us which process
377 * we want to page out (always try largest first).
379 counter = (nr_threads << SWAP_SHIFT) >> priority;
380 if (counter < 1)
381 counter = 1;
383 for (; counter >= 0; counter--) {
384 unsigned long max_cnt = 0;
385 struct mm_struct *best = NULL;
386 int pid = 0;
387 int assign = 0;
388 int found_task = 0;
389 select:
390 read_lock(&tasklist_lock);
391 p = init_task.next_task;
392 for (; p != &init_task; p = p->next_task) {
393 struct mm_struct *mm = p->mm;
394 if (!p->swappable || !mm)
395 continue;
396 if (mm->rss <= 0)
397 continue;
398 /* Skip tasks which haven't slept long enough yet when idle-swapping. */
399 if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
400 time_after(p->sleep_time + idle_time * HZ, jiffies)))
401 continue;
402 found_task++;
403 /* Refresh swap_cnt? */
404 if (assign == 1) {
405 mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
406 if (mm->swap_cnt < SWAP_MIN)
407 mm->swap_cnt = SWAP_MIN;
409 if (mm->swap_cnt > max_cnt) {
410 max_cnt = mm->swap_cnt;
411 best = mm;
412 pid = p->pid;
415 read_unlock(&tasklist_lock);
416 if (!best) {
417 if (!assign && found_task > 0) {
418 assign = 1;
419 goto select;
421 goto out;
422 } else {
423 int ret;
425 atomic_inc(&best->mm_count);
426 ret = swap_out_mm(best, gfp_mask);
427 mmdrop(best);
429 if (!ret)
430 continue;
432 if (ret < 0)
433 kill_proc(pid, SIGBUS, 1);
434 __ret = 1;
435 goto out;
438 out:
439 unlock_kernel();
440 return __ret;
445 * reclaim_page - reclaims one page from the inactive_clean list
446 * @zone: reclaim a page from this zone
448  * The pages on the inactive_clean list can be instantly reclaimed.
449 * The tests look impressive, but most of the time we'll grab
450 * the first page of the list and exit successfully.
452 struct page * reclaim_page(zone_t * zone)
454 struct page * page = NULL;
455 struct list_head * page_lru;
456 int maxscan;
459 * We only need the pagemap_lru_lock if we don't reclaim the page,
460 * but we have to grab the pagecache_lock before the pagemap_lru_lock
461 * to avoid deadlocks and most of the time we'll succeed anyway.
463 spin_lock(&pagecache_lock);
464 spin_lock(&pagemap_lru_lock);
465 maxscan = zone->inactive_clean_pages;
466 while ((page_lru = zone->inactive_clean_list.prev) !=
467 &zone->inactive_clean_list && maxscan--) {
468 page = list_entry(page_lru, struct page, lru);
470 /* Wrong page on list?! (list corruption, should not happen) */
471 if (!PageInactiveClean(page)) {
472 printk("VM: reclaim_page, wrong page on list.\n");
473 list_del(page_lru);
474 page->zone->inactive_clean_pages--;
475 continue;
478 /* Page is or was in use? Move it to the active list. */
479 if (PageTestandClearReferenced(page) || page->age > 0 ||
480 (!page->buffers && page_count(page) > 1)) {
481 del_page_from_inactive_clean_list(page);
482 add_page_to_active_list(page);
483 continue;
486 /* The page is dirty, or locked, move to inactive_dirty list. */
487 if (page->buffers || TryLockPage(page)) {
488 del_page_from_inactive_clean_list(page);
489 add_page_to_inactive_dirty_list(page);
490 continue;
493 /* OK, remove the page from the caches. */
494 if (PageSwapCache(page)) {
495 __delete_from_swap_cache(page);
496 goto found_page;
499 if (page->mapping) {
500 __remove_inode_page(page);
501 goto found_page;
504 /* We should never ever get here. */
505 printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
506 list_del(page_lru);
507 zone->inactive_clean_pages--;
508 UnlockPage(page);
510 /* Reset page pointer, maybe we encountered an unfreeable page. */
511 page = NULL;
512 goto out;
514 found_page:
515 del_page_from_inactive_clean_list(page);
516 UnlockPage(page);
517 page->age = PAGE_AGE_START;
518 if (page_count(page) != 1)
519 printk("VM: reclaim_page, found page with count %d!\n",
520 page_count(page));
521 out:
522 spin_unlock(&pagemap_lru_lock);
523 spin_unlock(&pagecache_lock);
524 memory_pressure++;
525 return page;
529 * page_launder - clean dirty inactive pages, move to inactive_clean list
530 * @gfp_mask: what operations we are allowed to do
531 * @sync: should we wait synchronously for the cleaning of pages
533 * When this function is called, we are most likely low on free +
534 * inactive_clean pages. Since we want to refill those pages as
535 * soon as possible, we'll make two loops over the inactive list,
536 * one to move the already cleaned pages to the inactive_clean lists
537 * and one to (often asynchronously) clean the dirty inactive pages.
539 * In situations where kswapd cannot keep up, user processes will
540 * end up calling this function. Since the user process needs to
541 * have a page before it can continue with its allocation, we'll
542 * do synchronous page flushing in that case.
544 * This code is heavily inspired by the FreeBSD source code. Thanks
545 * go out to Matthew Dillon.
547 #define MAX_LAUNDER (4 * (1 << page_cluster))
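/*
 * MAX_LAUNDER bounds the number of "out of order" asynchronous buffer
 * flushes per launder loop; once maxlaunder is used up and the caller
 * asked for synchronous behaviour, the remaining try_to_free_buffers()
 * calls run synchronously (wait == 2).  For example, with
 * page_cluster == 4 this works out to 4 * (1 << 4) = 64 pages.
 */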
548 int page_launder(int gfp_mask, int sync)
550 int launder_loop, maxscan, cleaned_pages, maxlaunder;
551 int can_get_io_locks;
552 struct list_head * page_lru;
553 struct page * page;
556 * We can only grab the IO locks (eg. for flushing dirty
557 * buffers to disk) if __GFP_IO is set.
559 can_get_io_locks = gfp_mask & __GFP_IO;
561 launder_loop = 0;
562 maxlaunder = 0;
563 cleaned_pages = 0;
565 dirty_page_rescan:
566 spin_lock(&pagemap_lru_lock);
567 maxscan = nr_inactive_dirty_pages;
568 while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
569 maxscan-- > 0) {
570 page = list_entry(page_lru, struct page, lru);
572 /* Wrong page on list?! (list corruption, should not happen) */
573 if (!PageInactiveDirty(page)) {
574 printk("VM: page_launder, wrong page on list.\n");
575 list_del(page_lru);
576 nr_inactive_dirty_pages--;
577 page->zone->inactive_dirty_pages--;
578 continue;
581 /* Page is or was in use? Move it to the active list. */
582 if (PageTestandClearReferenced(page) || page->age > 0 ||
583 (!page->buffers && page_count(page) > 1) ||
584 page_ramdisk(page)) {
585 del_page_from_inactive_dirty_list(page);
586 add_page_to_active_list(page);
587 continue;
591 * The page is locked. IO in progress?
592 * Move it to the back of the list.
594 if (TryLockPage(page)) {
595 list_del(page_lru);
596 list_add(page_lru, &inactive_dirty_list);
597 continue;
601 * Dirty swap-cache page? Write it out if
602 * last copy..
604 if (PageDirty(page)) {
605 int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
606 if (!writepage)
607 goto page_active;
609 /* Can't start IO? Move it to the back of the list */
610 if (!can_get_io_locks) {
611 list_del(page_lru);
612 list_add(page_lru, &inactive_dirty_list);
613 UnlockPage(page);
614 continue;
617 /* OK, do a physical asynchronous write to swap. */
618 ClearPageDirty(page);
619 page_cache_get(page);
620 spin_unlock(&pagemap_lru_lock);
622 writepage(page);
623 page_cache_release(page);
625 /* And re-start the thing.. */
626 spin_lock(&pagemap_lru_lock);
627 continue;
631 * If the page has buffers, try to free the buffer mappings
632 * associated with this page. If we succeed we either free
633 * the page (in case it was a buffercache only page) or we
634 * move the page to the inactive_clean list.
636 * On the first round, we should free all previously cleaned
637 * buffer pages
639 if (page->buffers) {
640 int wait, clearedbuf;
641 int freed_page = 0;
643 * Since we might be doing disk IO, we have to
644 * drop the spinlock and take an extra reference
645 * on the page so it doesn't go away from under us.
647 del_page_from_inactive_dirty_list(page);
648 page_cache_get(page);
649 spin_unlock(&pagemap_lru_lock);
651 /* Will we do (asynchronous) IO? */
652 if (launder_loop && maxlaunder == 0 && sync)
653 wait = 2; /* Synchronous IO */
654 else if (launder_loop && maxlaunder-- > 0)
655 wait = 1; /* Async IO */
656 else
657 wait = 0; /* No IO */
659 /* Try to free the page buffers. */
660 clearedbuf = try_to_free_buffers(page, wait);
663 * Re-take the spinlock. Note that we cannot
664 * unlock the page yet since we're still
665 * accessing the page_struct here...
667 spin_lock(&pagemap_lru_lock);
669 /* The buffers were not freed. */
670 if (!clearedbuf) {
671 add_page_to_inactive_dirty_list(page);
673 /* The page was only in the buffer cache. */
674 } else if (!page->mapping) {
675 atomic_dec(&buffermem_pages);
676 freed_page = 1;
677 cleaned_pages++;
679 /* The page has more users besides the cache and us. */
680 } else if (page_count(page) > 2) {
681 add_page_to_active_list(page);
683 /* OK, we "created" a freeable page. */
684 } else /* page->mapping && page_count(page) == 2 */ {
685 add_page_to_inactive_clean_list(page);
686 cleaned_pages++;
690 * Unlock the page and drop the extra reference.
691  * We can only do it here because we are accessing
692 * the page struct above.
694 UnlockPage(page);
695 page_cache_release(page);
698 * If we're freeing buffer cache pages, stop when
699 * we've got enough free memory.
701 if (freed_page && !free_shortage())
702 break;
703 continue;
704 } else if (page->mapping && !PageDirty(page)) {
706 * If a page had an extra reference in
707 * deactivate_page(), we will find it here.
708 * Now the page is really freeable, so we
709 * move it to the inactive_clean list.
711 del_page_from_inactive_dirty_list(page);
712 add_page_to_inactive_clean_list(page);
713 UnlockPage(page);
714 cleaned_pages++;
715 } else {
716 page_active:
718 * OK, we don't know what to do with the page.
719 * It's no use keeping it here, so we move it to
720 * the active list.
722 del_page_from_inactive_dirty_list(page);
723 add_page_to_active_list(page);
724 UnlockPage(page);
727 spin_unlock(&pagemap_lru_lock);
730 * If we don't have enough free pages, we loop back once
731 * to queue the dirty pages for writeout. When we were called
732 * by a user process (that /needs/ a free page) and we didn't
733 * free anything yet, we wait synchronously on the writeout of
734  * the remaining dirty pages.
736 * We also wake up bdflush, since bdflush should, under most
737 * loads, flush out the dirty pages before we have to wait on
738 * IO.
740 if (can_get_io_locks && !launder_loop && free_shortage()) {
741 launder_loop = 1;
742 /* If we cleaned pages, never do synchronous IO. */
743 if (cleaned_pages)
744 sync = 0;
745 /* We only do a few "out of order" flushes. */
746 maxlaunder = MAX_LAUNDER;
747 /* Kflushd takes care of the rest. */
748 wakeup_bdflush(0);
749 goto dirty_page_rescan;
752 /* Return the number of pages moved to the inactive_clean list. */
753 return cleaned_pages;
757 * refill_inactive_scan - scan the active list and find pages to deactivate
758 * @priority: the priority at which to scan
759 * @oneshot: exit after deactivating one page
761 * This function will scan a portion of the active list to find
762  * unused pages; those pages will then be moved to the inactive list.
764 int refill_inactive_scan(unsigned int priority, int oneshot)
766 struct list_head * page_lru;
767 struct page * page;
768 int maxscan, page_active = 0;
769 int ret = 0;
771 /* Take the lock while messing with the list... */
772 spin_lock(&pagemap_lru_lock);
773 maxscan = nr_active_pages >> priority;
774 while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
775 page = list_entry(page_lru, struct page, lru);
777 /* Wrong page on list?! (list corruption, should not happen) */
778 if (!PageActive(page)) {
779 printk("VM: refill_inactive, wrong page on list.\n");
780 list_del(page_lru);
781 nr_active_pages--;
782 continue;
785 /* Do aging on the pages. */
786 if (PageTestandClearReferenced(page)) {
787 age_page_up_nolock(page);
788 page_active = 1;
789 } else {
790 age_page_down_ageonly(page);
792 * Since we don't hold a reference on the page
793 * ourselves, we have to do our test a bit more
794  * strict than deactivate_page(). This is needed
795 * since otherwise the system could hang shuffling
796 * unfreeable pages from the active list to the
797 * inactive_dirty list and back again...
799 * SUBTLE: we can have buffer pages with count 1.
801 if (page->age == 0 && page_count(page) <=
802 (page->buffers ? 2 : 1)) {
803 deactivate_page_nolock(page);
804 page_active = 0;
805 } else {
806 page_active = 1;
810 * If the page is still on the active list, move it
811 * to the other end of the list. Otherwise it was
812  * deactivated above and we exit successfully.
814 if (page_active || PageActive(page)) {
815 list_del(page_lru);
816 list_add(page_lru, &active_list);
817 } else {
818 ret = 1;
819 if (oneshot)
820 break;
823 spin_unlock(&pagemap_lru_lock);
825 return ret;
829 * Check if there are zones with a severe shortage of free pages,
830 * or if all zones have a minor shortage.
832 int free_shortage(void)
834 pg_data_t *pgdat = pgdat_list;
835 int sum = 0;
836 int freeable = nr_free_pages() + nr_inactive_clean_pages();
837 int freetarget = freepages.high + inactive_target / 3;
839 /* Are we low on free pages globally? */
840 if (freeable < freetarget)
841 return freetarget - freeable;
843 /* If not, are we very low on any particular zone? */
844 do {
845 int i;
846 for(i = 0; i < MAX_NR_ZONES; i++) {
847 zone_t *zone = pgdat->node_zones+ i;
848 if (zone->size && (zone->inactive_clean_pages +
849 zone->free_pages < zone->pages_min+1)) {
850 /* + 1 to have overlap with alloc_pages() !! */
851 sum += zone->pages_min + 1;
852 sum -= zone->free_pages;
853 sum -= zone->inactive_clean_pages;
856 pgdat = pgdat->node_next;
857 } while (pgdat);
859 return sum;
863 * How many inactive pages are we short?
865 int inactive_shortage(void)
867 int shortage = 0;
869 shortage += freepages.high;
870 shortage += inactive_target;
871 shortage -= nr_free_pages();
872 shortage -= nr_inactive_clean_pages();
873 shortage -= nr_inactive_dirty_pages;
875 if (shortage > 0)
876 return shortage;
878 return 0;
882  * We need to make the locks finer grained, but right
883 * now we need this so that we can do page allocations
884 * without holding the kernel lock etc.
886 * We want to try to free "count" pages, and we want to
887 * cluster them so that we get good swap-out behaviour.
889 * OTOH, if we're a user process (and not kswapd), we
890 * really care about latency. In that case we don't try
891 * to free too many pages.
893 static int refill_inactive(unsigned int gfp_mask, int user)
895 int priority, count, start_count, made_progress;
896 unsigned long idle_time;
898 count = inactive_shortage() + free_shortage();
899 if (user)
900 count = (1 << page_cluster);
901 start_count = count;
903 /* Always trim SLAB caches when memory gets low. */
904 kmem_cache_reap(gfp_mask);
907 * Calculate the minimum time (in seconds) a process must
908 * have slept before we consider it for idle swapping.
909 * This must be the number of seconds it takes to go through
910 * all of the cache. Doing this idle swapping makes the VM
911 * smoother once we start hitting swap.
913 idle_time = atomic_read(&page_cache_size);
914 idle_time += atomic_read(&buffermem_pages);
915 idle_time /= (inactive_target + 1);
917 priority = 6;
918 do {
919 made_progress = 0;
921 if (current->need_resched) {
922 __set_current_state(TASK_RUNNING);
923 schedule();
926 while (refill_inactive_scan(priority, 1) ||
927 swap_out(priority, gfp_mask, idle_time)) {
928 made_progress = 1;
929 if (--count <= 0)
930 goto done;
934  * Don't go too easy on the dentry/inode caches, since
935  * refill_inactive() almost never fails when there's
936 * really plenty of memory free.
938 shrink_dcache_memory(priority, gfp_mask);
939 shrink_icache_memory(priority, gfp_mask);
942 * Then, try to page stuff out..
944 while (swap_out(priority, gfp_mask, 0)) {
945 made_progress = 1;
946 if (--count <= 0)
947 goto done;
951 * If we either have enough free memory, or if
952 * page_launder() will be able to make enough
953 * free memory, then stop.
955 if (!inactive_shortage() || !free_shortage())
956 goto done;
959 * Only switch to a lower "priority" if we
960 * didn't make any useful progress in the
961 * last loop.
963 if (!made_progress)
964 priority--;
965 } while (priority >= 0);
967 /* Always end on a refill_inactive.., may sleep... */
968 while (refill_inactive_scan(0, 1)) {
969 if (--count <= 0)
970 goto done;
973 done:
974 return (count < start_count);
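/*
 * do_try_to_free_pages() is shared by kswapd and by allocators that
 * call try_to_free_pages() directly; both run it with PF_MEMALLOC set
 * so that the freeing path itself can still allocate the small amounts
 * of memory it needs along the way.
 */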
977 static int do_try_to_free_pages(unsigned int gfp_mask, int user)
979 int ret = 0;
982 * If we're low on free pages, move pages from the
983 * inactive_dirty list to the inactive_clean list.
985 * Usually bdflush will have pre-cleaned the pages
986 * before we get around to moving them to the other
987 * list, so this is a relatively cheap operation.
989 if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
990 nr_inactive_clean_pages())
991 ret += page_launder(gfp_mask, user);
994 * If needed, we move pages from the active list
995 * to the inactive list. We also "eat" pages from
996 * the inode and dentry cache whenever we do this.
998 if (free_shortage() || inactive_shortage()) {
999 shrink_dcache_memory(6, gfp_mask);
1000 shrink_icache_memory(6, gfp_mask);
1001 ret += refill_inactive(gfp_mask, user);
1002 } else {
1004 * Reclaim unused slab cache memory.
1006 kmem_cache_reap(gfp_mask);
1007 ret = 1;
1010 return ret;
1013 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
1014 DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
1015 struct task_struct *kswapd_task;
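/*
 * Two wait queues tie kswapd to its callers: allocators wake kswapd
 * through kswapd_wait (see wakeup_kswapd() below), and callers that
 * asked to block sleep on kswapd_done until kswapd finishes a pass
 * and calls wake_up_all(&kswapd_done).  kswapd_task lets
 * wakeup_kswapd() recognise when it is called from kswapd itself
 * and bail out.
 */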
1018 * The background pageout daemon, started as a kernel thread
1019 * from the init process.
1021 * This basically trickles out pages so that we have _some_
1022 * free memory available even if there is no other activity
1023 * that frees anything up. This is needed for things like routing
1024 * etc, where we otherwise might have all activity going on in
1025 * asynchronous contexts that cannot page things out.
1027 * If there are applications that are active memory-allocators
1028 * (most normal use), this basically shouldn't matter.
1030 int kswapd(void *unused)
1032 struct task_struct *tsk = current;
1034 tsk->session = 1;
1035 tsk->pgrp = 1;
1036 strcpy(tsk->comm, "kswapd");
1037 sigfillset(&tsk->blocked);
1038 kswapd_task = tsk;
1041 * Tell the memory management that we're a "memory allocator",
1042 * and that if we need more memory we should get access to it
1043 * regardless (see "__alloc_pages()"). "kswapd" should
1044 * never get caught in the normal page freeing logic.
1046 * (Kswapd normally doesn't need memory anyway, but sometimes
1047 * you need a small amount of memory in order to be able to
1048 * page out something else, and this flag essentially protects
1049 * us from recursively trying to free more memory as we're
1050 * trying to free the first piece of memory in the first place).
1052 tsk->flags |= PF_MEMALLOC;
1055 * Kswapd main loop.
1057 for (;;) {
1058 static int recalc = 0;
1060 /* If needed, try to free some memory. */
1061 if (inactive_shortage() || free_shortage()) {
1062 int wait = 0;
1063 /* Do we need to do some synchronous flushing? */
1064 if (waitqueue_active(&kswapd_done))
1065 wait = 1;
1066 do_try_to_free_pages(GFP_KSWAPD, wait);
1070 * Do some (very minimal) background scanning. This
1071 * will scan all pages on the active list once
1072 * every minute. This clears old referenced bits
1073 * and moves unused pages to the inactive list.
1075 refill_inactive_scan(6, 0);
1077 /* Once a second, recalculate some VM stats. */
1078 if (time_after(jiffies, recalc + HZ)) {
1079 recalc = jiffies;
1080 recalculate_vm_stats();
1084 * Wake up everybody waiting for free memory
1085 * and unplug the disk queue.
1087 wake_up_all(&kswapd_done);
1088 run_task_queue(&tq_disk);
1091 * We go to sleep if either the free page shortage
1092 * or the inactive page shortage is gone. We do this
1093 * because:
1094 * 1) we need no more free pages or
1095 * 2) the inactive pages need to be flushed to disk,
1096 * it wouldn't help to eat CPU time now ...
1098 * We go to sleep for one second, but if it's needed
1099 * we'll be woken up earlier...
1101 if (!free_shortage() || !inactive_shortage()) {
1102 interruptible_sleep_on_timeout(&kswapd_wait, HZ);
1104 * If we couldn't free enough memory, we see if it was
1105 * due to the system just not having enough memory.
1106 * If that is the case, the only solution is to kill
1107  * a process (the alternative is eternal deadlock).
1109 * If there still is enough memory around, we just loop
1110  * and try to free some more memory...
1112 } else if (out_of_memory()) {
1113 oom_kill();
1118 void wakeup_kswapd(int block)
1120 DECLARE_WAITQUEUE(wait, current);
1122 if (current == kswapd_task)
1123 return;
1125 if (!block) {
1126 if (waitqueue_active(&kswapd_wait))
1127 wake_up(&kswapd_wait);
1128 return;
1132 * Kswapd could wake us up before we get a chance
1133 * to sleep, so we have to be very careful here to
1134 * prevent SMP races...
1136 __set_current_state(TASK_UNINTERRUPTIBLE);
1137 add_wait_queue(&kswapd_done, &wait);
1139 if (waitqueue_active(&kswapd_wait))
1140 wake_up(&kswapd_wait);
1141 schedule();
1143 remove_wait_queue(&kswapd_done, &wait);
1144 __set_current_state(TASK_RUNNING);
1148 * Called by non-kswapd processes when they want more
1149 * memory but are unable to sleep on kswapd because
1150 * they might be holding some IO locks ...
1152 int try_to_free_pages(unsigned int gfp_mask)
1154 int ret = 1;
1156 if (gfp_mask & __GFP_WAIT) {
1157 current->flags |= PF_MEMALLOC;
1158 ret = do_try_to_free_pages(gfp_mask, 1);
1159 current->flags &= ~PF_MEMALLOC;
1162 return ret;
1165 DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
1167 * Kreclaimd will move pages from the inactive_clean list to the
1168 * free list, in order to keep atomic allocations possible under
1169 * all circumstances. Even when kswapd is blocked on IO.
1171 int kreclaimd(void *unused)
1173 struct task_struct *tsk = current;
1174 pg_data_t *pgdat;
1176 tsk->session = 1;
1177 tsk->pgrp = 1;
1178 strcpy(tsk->comm, "kreclaimd");
1179 sigfillset(&tsk->blocked);
1180 current->flags |= PF_MEMALLOC;
1182 while (1) {
1185 * We sleep until someone wakes us up from
1186 * page_alloc.c::__alloc_pages().
1188 interruptible_sleep_on(&kreclaimd_wait);
1191 * Move some pages from the inactive_clean lists to
1192 * the free lists, if it is needed.
1194 pgdat = pgdat_list;
1195 do {
1196 int i;
1197 for(i = 0; i < MAX_NR_ZONES; i++) {
1198 zone_t *zone = pgdat->node_zones + i;
1199 if (!zone->size)
1200 continue;
1202 while (zone->free_pages < zone->pages_low) {
1203 struct page * page;
1204 page = reclaim_page(zone);
1205 if (!page)
1206 break;
1207 __free_page(page);
1210 pgdat = pgdat->node_next;
1211 } while (pgdat);
1216 static int __init kswapd_init(void)
1218 printk("Starting kswapd v1.8\n");
1219 swap_setup();
1220 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1221 kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1222 return 0;
1225 module_init(kswapd_init)