/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int onlist;

	pte = *page_table;
	if (!pte_present(pte))
		goto out_failed;
	page = pte_page(pte);
	if ((!VALID_PAGE(page)) || PageReserved(page))
		goto out_failed;

	if (mm->swap_cnt)
		mm->swap_cnt--;

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {
		age_page_up(page);
		goto out_failed;
	}
	if (!onlist)
		/* The page is still mapped, so it can't be freeable... */
		age_page_down_ageonly(page);

	/*
	 * If the page is in active use by us, or if the page
	 * is in active use by others, don't unmap it or
	 * (worse) start unneeded IO.
	 */
	if (page->age > 0)
		goto out_failed;

	if (TryLockPage(page))
		goto out_failed;

	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	pte = ptep_get_and_clear(page_table);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		if (pte_dirty(pte))
			set_page_dirty(page);
set_swap_pte:
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		UnlockPage(page);
		mm->rss--;
		flush_tlb_page(vma, address);
		deactivate_page(page);
		page_cache_release(page);
out_failed:
		return 0;
	}

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	flush_cache_page(vma, address);
	if (!pte_dirty(pte))
		goto drop_pte;

	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 */
	if (page->mapping) {
		set_page_dirty(page);
		goto drop_pte;
	}

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry.val)
		goto out_unlock_restore; /* No swap space left */

	/* Add it to the swap cache and mark it dirty */
	add_to_swap_cache(page, entry);
	set_page_dirty(page);
	goto set_swap_pte;

out_unlock_restore:
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;
}

/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults this process had recently, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 * swap block search, not a hint of how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	return 0;
}

static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}

static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
		return 0;

	pgdir = pgd_offset(mm, address);

	end = vma->vm_end;
	if (address >= end)
		BUG();
	do {
		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		if (!mm->swap_cnt)
			return 0;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return 0;
}

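/*
 * Scan one address space for pages to unmap, starting at mm->swap_address
 * and stopping when the per-mm swap_cnt budget runs out or we reach the
 * end of the address space.
 */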
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(mm, vma, address, gfp_mask);
			if (result)
				return result;
			if (!mm->swap_cnt)
				goto out_unlock;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}
	/* Reset to 0 when we reach the end of address space */
	mm->swap_address = 0;
	mm->swap_cnt = 0;

out_unlock:
	spin_unlock(&mm->page_table_lock);

	/* We didn't find anything for the process */
	return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
#define SWAP_SHIFT 5
#define SWAP_MIN 8

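/*
 * swap_cnt is refreshed to (rss >> SWAP_SHIFT), with SWAP_MIN as a lower
 * bound, so larger processes get a proportionally larger swap-out budget;
 * the same shift scales the scan counter below up with nr_threads.
 */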
static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
{
	struct task_struct * p;
	int counter;
	int __ret = 0;

	lock_kernel();
	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = (nr_threads << SWAP_SHIFT) >> priority;
	if (counter < 1)
		counter = 1;

	for (; counter >= 0; counter--) {
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;
		int pid = 0;
		int assign = 0;
		int found_task = 0;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)
				continue;
			if (mm->rss <= 0)
				continue;
			/* Skip tasks which haven't slept long enough yet when idle-swapping. */
			if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
					time_after(p->sleep_time + idle_time * HZ, jiffies)))
				continue;
			found_task++;
			/* Refresh swap_cnt? */
			if (assign == 1) {
				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
				if (mm->swap_cnt < SWAP_MIN)
					mm->swap_cnt = SWAP_MIN;
			}
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
				pid = p->pid;
			}
		}
		read_unlock(&tasklist_lock);
		if (!best) {
			if (!assign && found_task > 0) {
				assign = 1;
				goto select;
			}
			goto out;
		} else {
			int ret;

			atomic_inc(&best->mm_count);
			ret = swap_out_mm(best, gfp_mask);
			mmdrop(best);

			__ret = 1;
			goto out;
		}
	}
out:
	unlock_kernel();
	return __ret;
}

/**
 * reclaim_page - reclaims one page from the inactive_clean list
 * @zone: reclaim a page from this zone
 *
 * The pages on the inactive_clean list can be instantly reclaimed.
 * The tests look impressive, but most of the time we'll grab
 * the first page of the list and exit successfully.
 */
struct page * reclaim_page(zone_t * zone)
{
	struct page * page = NULL;
	struct list_head * page_lru;
	int maxscan;

	/*
	 * We only need the pagemap_lru_lock if we don't reclaim the page,
	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
	 * to avoid deadlocks and most of the time we'll succeed anyway.
	 */
	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);
	maxscan = zone->inactive_clean_pages;
	while ((page_lru = zone->inactive_clean_list.prev) !=
			&zone->inactive_clean_list && maxscan--) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveClean(page)) {
			printk("VM: reclaim_page, wrong page on list.\n");
			list_del(page_lru);
			page->zone->inactive_clean_pages--;
			continue;
		}

		/* Page is or was in use? Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/* The page is dirty, or locked, move to inactive_dirty list. */
		if (page->buffers || PageDirty(page) || TryLockPage(page)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_inactive_dirty_list(page);
			continue;
		}

		/* OK, remove the page from the caches. */
		if (PageSwapCache(page)) {
			__delete_from_swap_cache(page);
			goto found_page;
		}

		if (page->mapping) {
			__remove_inode_page(page);
			goto found_page;
		}

		/* We should never ever get here. */
		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
		list_del(page_lru);
		zone->inactive_clean_pages--;
		UnlockPage(page);
	}
	/* Reset page pointer, maybe we encountered an unfreeable page. */
	page = NULL;
	goto out;

found_page:
	del_page_from_inactive_clean_list(page);
	UnlockPage(page);
	page->age = PAGE_AGE_START;
	if (page_count(page) != 1)
		printk("VM: reclaim_page, found page with count %d!\n",
				page_count(page));
out:
	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
	memory_pressure++;
	return page;
}

/**
 * page_launder - clean dirty inactive pages, move to inactive_clean list
 * @gfp_mask: what operations we are allowed to do
 * @sync: should we wait synchronously for the cleaning of pages
 *
 * When this function is called, we are most likely low on free +
 * inactive_clean pages. Since we want to refill those pages as
 * soon as possible, we'll make two loops over the inactive list,
 * one to move the already cleaned pages to the inactive_clean lists
 * and one to (often asynchronously) clean the dirty inactive pages.
 *
 * In situations where kswapd cannot keep up, user processes will
 * end up calling this function. Since the user process needs to
 * have a page before it can continue with its allocation, we'll
 * do synchronous page flushing in that case.
 *
 * This code is heavily inspired by the FreeBSD source code. Thanks
 * go out to Matthew Dillon.
 */
#define MAX_LAUNDER		(4 * (1 << page_cluster))

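/*
 * MAX_LAUNDER bounds how many "out of order" buffer flushes we queue in one
 * laundering pass; once maxlaunder is used up we fall back to no-IO (or, when
 * called synchronously, blocking) buffer freeing below.
 */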
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;
	struct page * page;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

	launder_loop = 0;
	maxlaunder = 0;
	cleaned_pages = 0;

dirty_page_rescan:
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
				maxscan-- > 0) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {
			printk("VM: page_launder, wrong page on list.\n");
			list_del(page_lru);
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;
			continue;
		}

		/* Page is or was in use? Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1) ||
				page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {
			list_del(page_lru);
			list_add(page_lru, &inactive_dirty_list);
			continue;
		}

		/*
		 * Dirty swap-cache page? Write it out if
		 * last copy..
		 */
		if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
			int result;

			if (!writepage)
				goto page_active;

			/* First time through? Move it to the back of the list */
			if (!launder_loop) {
				list_del(page_lru);
				list_add(page_lru, &inactive_dirty_list);
				UnlockPage(page);
				continue;
			}

			/* OK, do a physical asynchronous write to swap. */
			ClearPageDirty(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			result = writepage(page);
			page_cache_release(page);

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
			if (result != 1)
				continue;
			/* writepage refused to do anything */
			set_page_dirty(page);
			goto page_active;
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we either free
		 * the page (in case it was a buffercache only page) or we
		 * move the page to the inactive_clean list.
		 *
		 * On the first round, we should free all previously cleaned
		 * buffer pages
		 */
		if (page->buffers) {
			int wait, clearedbuf;
			int freed_page = 0;
			/*
			 * Since we might be doing disk IO, we have to
			 * drop the spinlock and take an extra reference
			 * on the page so it doesn't go away from under us.
			 */
			del_page_from_inactive_dirty_list(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			/* Will we do (asynchronous) IO? */
			if (launder_loop && maxlaunder == 0 && sync)
				wait = 2;	/* Synchronous IO */
			else if (launder_loop && maxlaunder-- > 0)
				wait = 1;	/* Async IO */
			else
				wait = 0;	/* No IO */

			/* Try to free the page buffers. */
			clearedbuf = try_to_free_buffers(page, wait);

			/*
			 * Re-take the spinlock. Note that we cannot
			 * unlock the page yet since we're still
			 * accessing the page_struct here...
			 */
			spin_lock(&pagemap_lru_lock);

			/* The buffers were not freed. */
			if (!clearedbuf) {
				add_page_to_inactive_dirty_list(page);

			/* The page was only in the buffer cache. */
			} else if (!page->mapping) {
				atomic_dec(&buffermem_pages);
				freed_page = 1;
				cleaned_pages++;

			/* The page has more users besides the cache and us. */
			} else if (page_count(page) > 2) {
				add_page_to_active_list(page);

			/* OK, we "created" a freeable page. */
			} else /* page->mapping && page_count(page) == 2 */ {
				add_page_to_inactive_clean_list(page);
				cleaned_pages++;
			}

			/*
			 * Unlock the page and drop the extra reference.
			 * We can only do it here because we are accessing
			 * the page struct above.
			 */
			UnlockPage(page);
			page_cache_release(page);

			/*
			 * If we're freeing buffer cache pages, stop when
			 * we've got enough free memory.
			 */
			if (freed_page && !free_shortage())
				break;
			continue;
		} else if (page->mapping && !PageDirty(page)) {
			/*
			 * If a page had an extra reference in
			 * deactivate_page(), we will find it here.
			 * Now the page is really freeable, so we
			 * move it to the inactive_clean list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);
			UnlockPage(page);
			cleaned_pages++;
		} else {
page_active:
			/*
			 * OK, we don't know what to do with the page.
			 * It's no use keeping it here, so we move it to
			 * the active list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			UnlockPage(page);
		}
	}
	spin_unlock(&pagemap_lru_lock);

	/*
	 * If we don't have enough free pages, we loop back once
	 * to queue the dirty pages for writeout. When we were called
	 * by a user process (that /needs/ a free page) and we didn't
	 * free anything yet, we wait synchronously on the writeout of
	 * MAX_SYNC_LAUNDER pages.
	 *
	 * We also wake up bdflush, since bdflush should, under most
	 * loads, flush out the dirty pages before we have to wait on
	 * IO.
	 */
	if (can_get_io_locks && !launder_loop && free_shortage()) {
		launder_loop = 1;
		/* If we cleaned pages, never do synchronous IO. */
		if (cleaned_pages)
			sync = 0;
		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;
		/* Kflushd takes care of the rest. */
		wakeup_bdflush(0);
		goto dirty_page_rescan;
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}

/**
 * refill_inactive_scan - scan the active list and find pages to deactivate
 * @priority: the priority at which to scan
 * @oneshot: exit after deactivating one page
 *
 * This function will scan a portion of the active list to find
 * unused pages, those pages will then be moved to the inactive list.
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
	struct list_head * page_lru;
	struct page * page;
	int maxscan, page_active = 0;
	int ret = 0;

	/* Take the lock while messing with the list... */
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_active_pages >> priority;
	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageActive(page)) {
			printk("VM: refill_inactive, wrong page on list.\n");
			list_del(page_lru);
			nr_active_pages--;
			continue;
		}

		/* Do aging on the pages. */
		if (PageTestandClearReferenced(page)) {
			age_page_up_nolock(page);
			page_active = 1;
		} else {
			age_page_down_ageonly(page);
			/*
			 * Since we don't hold a reference on the page
			 * ourselves, we have to do our test a bit more
			 * strictly than deactivate_page(). This is needed
			 * since otherwise the system could hang shuffling
			 * unfreeable pages from the active list to the
			 * inactive_dirty list and back again...
			 *
			 * SUBTLE: we can have buffer pages with count 1.
			 */
			if (page->age == 0 && page_count(page) <=
						(page->buffers ? 2 : 1)) {
				deactivate_page_nolock(page);
				page_active = 0;
			} else {
				page_active = 1;
			}
		}
		/*
		 * If the page is still on the active list, move it
		 * to the other end of the list. Otherwise it was
		 * deactivated by age_page_down and we exit successfully.
		 */
		if (page_active || PageActive(page)) {
			list_del(page_lru);
			list_add(page_lru, &active_list);
		} else {
			ret = 1;
			if (oneshot)
				break;
		}
	}
	spin_unlock(&pagemap_lru_lock);

	return ret;
}

/*
 * Check if there are zones with a severe shortage of free pages,
 * or if all zones have a minor shortage.
 */
int free_shortage(void)
{
	pg_data_t *pgdat = pgdat_list;
	int sum = 0;
	int freeable = nr_free_pages() + nr_inactive_clean_pages();
	int freetarget = freepages.high + inactive_target / 3;

	/* Are we low on free pages globally? */
	if (freeable < freetarget)
		return freetarget - freeable;

	/* If not, are we very low on any particular zone? */
	do {
		int i;
		for (i = 0; i < MAX_NR_ZONES; i++) {
			zone_t *zone = pgdat->node_zones + i;
			if (zone->size && (zone->inactive_clean_pages +
					zone->free_pages < zone->pages_min + 1)) {
				/* + 1 to have overlap with alloc_pages() !! */
				sum += zone->pages_min + 1;
				sum -= zone->free_pages;
				sum -= zone->inactive_clean_pages;
			}
		}
		pgdat = pgdat->node_next;
	} while (pgdat);

	return sum;
}

/*
 * How many inactive pages are we short?
 */
int inactive_shortage(void)
{
	int shortage = 0;

	shortage += freepages.high;
	shortage += inactive_target;
	shortage -= nr_free_pages();
	shortage -= nr_inactive_clean_pages();
	shortage -= nr_inactive_dirty_pages;

	if (shortage > 0)
		return shortage;

	return 0;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * OTOH, if we're a user process (and not kswapd), we
 * really care about latency. In that case we don't try
 * to free too many pages.
 */
static int refill_inactive(unsigned int gfp_mask, int user)
{
	int priority, count, start_count, made_progress;
	unsigned long idle_time;

	count = inactive_shortage() + free_shortage();
	if (user)
		count = (1 << page_cluster);
	start_count = count;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/*
	 * Calculate the minimum time (in seconds) a process must
	 * have slept before we consider it for idle swapping.
	 * This must be the number of seconds it takes to go through
	 * all of the cache. Doing this idle swapping makes the VM
	 * smoother once we start hitting swap.
	 */
	idle_time = atomic_read(&page_cache_size);
	idle_time += atomic_read(&buffermem_pages);
	idle_time /= (inactive_target + 1);

	priority = 6;
	do {
		made_progress = 0;

		if (current->need_resched) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		while (refill_inactive_scan(priority, 1) ||
				swap_out(priority, gfp_mask, idle_time)) {
			made_progress = 1;
			if (--count <= 0)
				goto done;
		}

		/*
		 * Don't be too light on the d/i cache, since
		 * refill_inactive() almost never fails when there's
		 * really plenty of memory free.
		 */
		shrink_dcache_memory(priority, gfp_mask);
		shrink_icache_memory(priority, gfp_mask);

		/*
		 * Then, try to page stuff out..
		 */
		while (swap_out(priority, gfp_mask, 0)) {
			made_progress = 1;
			if (--count <= 0)
				goto done;
		}

		/*
		 * If we either have enough free memory, or if
		 * page_launder() will be able to make enough
		 * free memory, then stop.
		 */
		if (!inactive_shortage() || !free_shortage())
			goto done;

		/*
		 * Only switch to a lower "priority" if we
		 * didn't make any useful progress in the
		 * last loop.
		 */
		if (!made_progress)
			priority--;
	} while (priority >= 0);

	/* Always end on a refill_inactive.., may sleep... */
	while (refill_inactive_scan(0, 1)) {
		if (--count <= 0)
			goto done;
	}

done:
	return (count < start_count);
}

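/*
 * Worker shared by kswapd and try_to_free_pages(): launder dirty inactive
 * pages when free memory is short, then refill the inactive list (and trim
 * the dentry/inode caches), or just reap unused slab memory if there is no
 * shortage left.
 */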
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	int ret = 0;

	/*
	 * If we're low on free pages, move pages from the
	 * inactive_dirty list to the inactive_clean list.
	 *
	 * Usually bdflush will have pre-cleaned the pages
	 * before we get around to moving them to the other
	 * list, so this is a relatively cheap operation.
	 */
	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
			nr_inactive_clean_pages())
		ret += page_launder(gfp_mask, user);

	/*
	 * If needed, we move pages from the active list
	 * to the inactive list. We also "eat" pages from
	 * the inode and dentry cache whenever we do this.
	 */
	if (free_shortage() || inactive_shortage()) {
		shrink_dcache_memory(6, gfp_mask);
		shrink_icache_memory(6, gfp_mask);
		ret += refill_inactive(gfp_mask, user);
	} else {
		/*
		 * Reclaim unused slab cache memory.
		 */
		kmem_cache_reap(gfp_mask);
		ret = 1;
	}

	return ret;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
struct task_struct *kswapd_task;

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);
	kswapd_task = tsk;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		static int recalc = 0;

		/* If needed, try to free some memory. */
		if (inactive_shortage() || free_shortage()) {
			int wait = 0;
			/* Do we need to do some synchronous flushing? */
			if (waitqueue_active(&kswapd_done))
				wait = 1;
			do_try_to_free_pages(GFP_KSWAPD, wait);
		}

		/*
		 * Do some (very minimal) background scanning. This
		 * will scan all pages on the active list once
		 * every minute. This clears old referenced bits
		 * and moves unused pages to the inactive list.
		 */
		refill_inactive_scan(6, 0);

		/* Once a second, recalculate some VM stats. */
		if (time_after(jiffies, recalc + HZ)) {
			recalc = jiffies;
			recalculate_vm_stats();
		}

		/*
		 * Wake up everybody waiting for free memory
		 * and unplug the disk queue.
		 */
		wake_up_all(&kswapd_done);
		run_task_queue(&tq_disk);

		/*
		 * We go to sleep if either the free page shortage
		 * or the inactive page shortage is gone. We do this
		 * because:
		 *   1) we need no more free pages, or
		 *   2) the inactive pages need to be flushed to disk,
		 *      and it wouldn't help to eat CPU time now ...
		 *
		 * We go to sleep for one second, but if it's needed
		 * we'll be woken up earlier...
		 */
		if (!free_shortage() || !inactive_shortage()) {
			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
		/*
		 * If we couldn't free enough memory, we see if it was
		 * due to the system just not having enough memory.
		 * If that is the case, the only solution is to kill
		 * a process (the alternative is eternal deadlock).
		 *
		 * If there still is enough memory around, we just loop
		 * and try to free some more memory...
		 */
		} else if (out_of_memory()) {
			oom_kill();
		}
	}
}

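/*
 * Wake up kswapd. With @block set the caller also goes to sleep on
 * kswapd_done, so it is throttled until kswapd has completed a pass.
 */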
void wakeup_kswapd(int block)
{
	DECLARE_WAITQUEUE(wait, current);

	if (current == kswapd_task)
		return;

	if (!block) {
		if (waitqueue_active(&kswapd_wait))
			wake_up(&kswapd_wait);
		return;
	}

	/*
	 * Kswapd could wake us up before we get a chance
	 * to sleep, so we have to be very careful here to
	 * prevent SMP races...
	 */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&kswapd_done, &wait);

	if (waitqueue_active(&kswapd_wait))
		wake_up(&kswapd_wait);
	schedule();

	remove_wait_queue(&kswapd_done, &wait);
	__set_current_state(TASK_RUNNING);
}

/*
 * Called by non-kswapd processes when they want more
 * memory but are unable to sleep on kswapd because
 * they might be holding some IO locks ...
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int ret = 1;

	if (gfp_mask & __GFP_WAIT) {
		current->flags |= PF_MEMALLOC;
		ret = do_try_to_free_pages(gfp_mask, 1);
		current->flags &= ~PF_MEMALLOC;
	}

	return ret;
}

DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
/*
 * Kreclaimd will move pages from the inactive_clean list to the
 * free list, in order to keep atomic allocations possible under
 * all circumstances. Even when kswapd is blocked on IO.
 */
int kreclaimd(void *unused)
{
	struct task_struct *tsk = current;
	pg_data_t *pgdat;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kreclaimd");
	sigfillset(&tsk->blocked);
	current->flags |= PF_MEMALLOC;

	while (1) {
		/*
		 * We sleep until someone wakes us up from
		 * page_alloc.c::__alloc_pages().
		 */
		interruptible_sleep_on(&kreclaimd_wait);

		/*
		 * Move some pages from the inactive_clean lists to
		 * the free lists, if it is needed.
		 */
		pgdat = pgdat_list;
		do {
			int i;
			for (i = 0; i < MAX_NR_ZONES; i++) {
				zone_t *zone = pgdat->node_zones + i;
				if (!zone->size)
					continue;

				while (zone->free_pages < zone->pages_low) {
					struct page * page;
					page = reclaim_page(zone);
					if (!page)
						break;
					__free_page(page);
				}
			}
			pgdat = pgdat->node_next;
		} while (pgdat);
	}
}

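/*
 * Start the background pageout daemons at boot: one kswapd thread and one
 * kreclaimd thread.
 */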
static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.8\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}

module_init(kswapd_init)