[davej-history.git] / mm / vmscan.c
1 /*
2 * linux/mm/vmscan.c
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
11 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
12 * Multiqueue VM started 5.8.00, Rik van Riel.
15 #include <linux/slab.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/swap.h>
18 #include <linux/swapctl.h>
19 #include <linux/smp_lock.h>
20 #include <linux/pagemap.h>
21 #include <linux/init.h>
22 #include <linux/highmem.h>
23 #include <linux/file.h>
25 #include <asm/pgalloc.h>
28 * The swap-out functions return 1 if they successfully
29  * threw something out, and we got a free page. They return
30  * zero if they couldn't do anything, and any other value
31  * indicates they decreased rss, but the page was shared.
33 * NOTE! If it sleeps, it *must* return 1 to make sure we
34 * don't continue with the swap-out. Otherwise we may be
35 * using a process that no longer actually exists (it might
36 * have died while we slept).
38 static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
40 pte_t pte;
41 swp_entry_t entry;
42 struct page * page;
43 int (*swapout)(struct page *, struct file *);
44 int onlist;
46 pte = *page_table;
47 if (!pte_present(pte))
48 goto out_failed;
49 page = pte_page(pte);
50 if ((!VALID_PAGE(page)) || PageReserved(page))
51 goto out_failed;
53 if (mm->swap_cnt)
54 mm->swap_cnt--;
56 onlist = PageActive(page);
57 /* Don't look at this pte if it's been accessed recently. */
58 if (ptep_test_and_clear_young(page_table)) {
59 age_page_up(page);
60 goto out_failed;
62 if (!onlist)
63 /* The page is still mapped, so it can't be freeable... */
64 age_page_down_ageonly(page);
67 * If the page is in active use by us, or if the page
68 * is in active use by others, don't unmap it or
69 * (worse) start unneeded IO.
71 if (page->age > 0)
72 goto out_failed;
74 if (TryLockPage(page))
75 goto out_failed;
77 /* From this point on, the odds are that we're going to
78 * nuke this pte, so read and clear the pte. This hook
79 * is needed on CPUs which update the accessed and dirty
80 * bits in hardware.
82 pte = ptep_get_and_clear(page_table);
85 * Is the page already in the swap cache? If so, then
86 * we can just drop our reference to it without doing
87 * any IO - it's already up-to-date on disk.
89 * Return 0, as we didn't actually free any real
90 * memory, and we should just continue our scan.
92 if (PageSwapCache(page)) {
93 entry.val = page->index;
94 if (pte_dirty(pte))
95 SetPageDirty(page);
96 set_swap_pte:
97 swap_duplicate(entry);
98 set_pte(page_table, swp_entry_to_pte(entry));
99 drop_pte:
100 UnlockPage(page);
101 mm->rss--;
102 flush_tlb_page(vma, address);
103 deactivate_page(page);
104 page_cache_release(page);
105 out_failed:
106 return 0;
110 * Is it a clean page? Then it must be recoverable
111 * by just paging it in again, and we can just drop
112 * it..
114 * However, this won't actually free any real
115 * memory, as the page will just be in the page cache
116 * somewhere, and as such we should just continue
117 * our scan.
119 * Basically, this just makes it possible for us to do
120 * some real work in the future in "refill_inactive()".
122 if (!pte_dirty(pte)) {
123 flush_cache_page(vma, address);
124 goto drop_pte;
128 * Don't go down into the swap-out stuff if
129 * we cannot do I/O! Avoid recursing on FS
130 * locks etc.
132 if (!(gfp_mask & __GFP_IO))
133 goto out_unlock_restore;
136 * Don't do any of the expensive stuff if
137 * we're not really interested in this zone.
139 if (page->zone->free_pages + page->zone->inactive_clean_pages
140 + page->zone->inactive_dirty_pages
141 > page->zone->pages_high + inactive_target)
142 goto out_unlock_restore;
145 * Ok, it's really dirty. That means that
146 * we should either create a new swap cache
147 * entry for it, or we should write it back
148 * to its own backing store.
150 * Note that in neither case do we actually
151 * know that we make a page available, but
152 * as we potentially sleep we can no longer
153  * continue scanning, so we might as well
154 * assume we free'd something.
156 * NOTE NOTE NOTE! This should just set a
157 * dirty bit in 'page', and just drop the
158 * pte. All the hard work would be done by
159 * refill_inactive().
161 * That would get rid of a lot of problems.
163 flush_cache_page(vma, address);
164 if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
165 int error;
166 struct file *file = vma->vm_file;
167 if (file) get_file(file);
169 mm->rss--;
170 flush_tlb_page(vma, address);
171 spin_unlock(&mm->page_table_lock);
172 error = swapout(page, file);
173 if (file) fput(file);
174 if (error < 0)
175 goto out_unlock_restore;
176 UnlockPage(page);
177 deactivate_page(page);
178 page_cache_release(page);
179 return 1; /* We released page_table_lock */
183 * This is a dirty, swappable page. First of all,
184 * get a suitable swap entry for it, and make sure
185 * we have the swap cache set up to associate the
186 * page with that swap entry.
188 entry = get_swap_page();
189 if (!entry.val)
190 goto out_unlock_restore; /* No swap space left */
192 /* Add it to the swap cache and mark it dirty */
193 add_to_swap_cache(page, entry);
194 SetPageDirty(page);
195 goto set_swap_pte;
197 out_unlock_restore:
198 set_pte(page_table, pte);
199 UnlockPage(page);
200 return 0;
204 * A new implementation of swap_out(). We do not swap complete processes,
205 * but only a small number of blocks, before we continue with the next
206  * process. The number of blocks actually swapped is determined by the
207  * number of page faults this process has recently had,
208  * so we won't swap heavily used processes all the time ...
210  * Note: the priority argument is a hint on how much CPU to waste on the
211  * swap block search, not a hint of how many blocks to swap with
212  * each process.
214 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
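/*
 * Overview of the swap-out walk: swap_out_mm() resumes at
 * mm->swap_address, swap_out_vma() skips VM_LOCKED and VM_RESERVED
 * areas, and swap_out_pgd()/swap_out_pmd() walk the page tables down
 * to the individual ptes, each of which is handed to try_to_swap_out().
 * The walk stops early when mm->swap_cnt reaches zero or when a pte
 * was successfully unmapped (try_to_swap_out() returned nonzero).
 */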
217 static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
219 pte_t * pte;
220 unsigned long pmd_end;
222 if (pmd_none(*dir))
223 return 0;
224 if (pmd_bad(*dir)) {
225 pmd_ERROR(*dir);
226 pmd_clear(dir);
227 return 0;
230 pte = pte_offset(dir, address);
232 pmd_end = (address + PMD_SIZE) & PMD_MASK;
233 if (end > pmd_end)
234 end = pmd_end;
236 do {
237 int result;
238 mm->swap_address = address + PAGE_SIZE;
239 result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
240 if (result)
241 return result;
242 if (!mm->swap_cnt)
243 return 0;
244 address += PAGE_SIZE;
245 pte++;
246 } while (address && (address < end));
247 return 0;
250 static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
252 pmd_t * pmd;
253 unsigned long pgd_end;
255 if (pgd_none(*dir))
256 return 0;
257 if (pgd_bad(*dir)) {
258 pgd_ERROR(*dir);
259 pgd_clear(dir);
260 return 0;
263 pmd = pmd_offset(dir, address);
265 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
266 if (pgd_end && (end > pgd_end))
267 end = pgd_end;
269 do {
270 int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
271 if (result)
272 return result;
273 if (!mm->swap_cnt)
274 return 0;
275 address = (address + PMD_SIZE) & PMD_MASK;
276 pmd++;
277 } while (address && (address < end));
278 return 0;
281 static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
283 pgd_t *pgdir;
284 unsigned long end;
286 /* Don't swap out areas which are locked down */
287 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
288 return 0;
290 pgdir = pgd_offset(mm, address);
292 end = vma->vm_end;
293 if (address >= end)
294 BUG();
295 do {
296 int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
297 if (result)
298 return result;
299 if (!mm->swap_cnt)
300 return 0;
301 address = (address + PGDIR_SIZE) & PGDIR_MASK;
302 pgdir++;
303 } while (address && (address < end));
304 return 0;
307 static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
309 unsigned long address;
310 struct vm_area_struct* vma;
313 * Go through process' page directory.
315 address = mm->swap_address;
318 * Find the proper vm-area after freezing the vma chain
319 * and ptes.
321 spin_lock(&mm->page_table_lock);
322 vma = find_vma(mm, address);
323 if (vma) {
324 if (address < vma->vm_start)
325 address = vma->vm_start;
327 for (;;) {
328 int result = swap_out_vma(mm, vma, address, gfp_mask);
329 if (result)
330 return result;
331 if (!mm->swap_cnt)
332 goto out_unlock;
333 vma = vma->vm_next;
334 if (!vma)
335 break;
336 address = vma->vm_start;
339 /* Reset to 0 when we reach the end of address space */
340 mm->swap_address = 0;
341 mm->swap_cnt = 0;
343 out_unlock:
344 spin_unlock(&mm->page_table_lock);
346 /* We didn't find anything for the process */
347 return 0;
351 * Select the task with maximal swap_cnt and try to swap out a page.
352 * N.B. This function returns only 0 or 1. Return values != 1 from
353 * the lower level routines result in continued processing.
355 #define SWAP_SHIFT 5
356 #define SWAP_MIN 8
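/*
 * SWAP_SHIFT and SWAP_MIN tune how hard we push on a single mm:
 * swap_out() refreshes mm->swap_cnt to rss >> SWAP_SHIFT (1/32nd of
 * the resident set), never less than SWAP_MIN pages, and makes on the
 * order of (nr_threads << SWAP_SHIFT) >> priority passes over the
 * task list.  As an illustration (numbers purely for example),
 * 100 threads at priority 6 gives (100 * 32) / 64 = 50 passes.
 */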
358 static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
360 struct task_struct * p;
361 int counter;
362 int __ret = 0;
364 lock_kernel();
366 * We make one or two passes through the task list, indexed by
367 * assign = {0, 1}:
368 * Pass 1: select the swappable task with maximal RSS that has
369 * not yet been swapped out.
370 * Pass 2: re-assign rss swap_cnt values, then select as above.
372 * With this approach, there's no need to remember the last task
373 * swapped out. If the swap-out fails, we clear swap_cnt so the
374 * task won't be selected again until all others have been tried.
376 * Think of swap_cnt as a "shadow rss" - it tells us which process
377 * we want to page out (always try largest first).
379 counter = (nr_threads << SWAP_SHIFT) >> priority;
380 if (counter < 1)
381 counter = 1;
383 for (; counter >= 0; counter--) {
384 unsigned long max_cnt = 0;
385 struct mm_struct *best = NULL;
386 int pid = 0;
387 int assign = 0;
388 int found_task = 0;
389 select:
390 read_lock(&tasklist_lock);
391 p = init_task.next_task;
392 for (; p != &init_task; p = p->next_task) {
393 struct mm_struct *mm = p->mm;
394 if (!p->swappable || !mm)
395 continue;
396 if (mm->rss <= 0)
397 continue;
398 /* Skip tasks which haven't slept long enough yet when idle-swapping. */
399 if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
400 time_after(p->sleep_time + idle_time * HZ, jiffies)))
401 continue;
402 found_task++;
403 /* Refresh swap_cnt? */
404 if (assign == 1) {
405 mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
406 if (mm->swap_cnt < SWAP_MIN)
407 mm->swap_cnt = SWAP_MIN;
409 if (mm->swap_cnt > max_cnt) {
410 max_cnt = mm->swap_cnt;
411 best = mm;
412 pid = p->pid;
415 read_unlock(&tasklist_lock);
416 if (!best) {
417 if (!assign && found_task > 0) {
418 assign = 1;
419 goto select;
421 goto out;
422 } else {
423 int ret;
425 atomic_inc(&best->mm_count);
426 ret = swap_out_mm(best, gfp_mask);
427 mmdrop(best);
429 if (!ret)
430 continue;
432 if (ret < 0)
433 kill_proc(pid, SIGBUS, 1);
434 __ret = 1;
435 goto out;
438 out:
439 unlock_kernel();
440 return __ret;
445 * reclaim_page - reclaims one page from the inactive_clean list
446 * @zone: reclaim a page from this zone
448  * The pages on the inactive_clean list can be instantly reclaimed.
449 * The tests look impressive, but most of the time we'll grab
450 * the first page of the list and exit successfully.
452 struct page * reclaim_page(zone_t * zone)
454 struct page * page = NULL;
455 struct list_head * page_lru;
456 int maxscan;
459 * We only need the pagemap_lru_lock if we don't reclaim the page,
460 * but we have to grab the pagecache_lock before the pagemap_lru_lock
461 * to avoid deadlocks and most of the time we'll succeed anyway.
463 spin_lock(&pagecache_lock);
464 spin_lock(&pagemap_lru_lock);
465 maxscan = zone->inactive_clean_pages;
466 while ((page_lru = zone->inactive_clean_list.prev) !=
467 &zone->inactive_clean_list && maxscan--) {
468 page = list_entry(page_lru, struct page, lru);
470 /* Wrong page on list?! (list corruption, should not happen) */
471 if (!PageInactiveClean(page)) {
472 printk("VM: reclaim_page, wrong page on list.\n");
473 list_del(page_lru);
474 page->zone->inactive_clean_pages--;
475 continue;
478 /* Page is or was in use? Move it to the active list. */
479 if (PageTestandClearReferenced(page) || page->age > 0 ||
480 (!page->buffers && page_count(page) > 1)) {
481 del_page_from_inactive_clean_list(page);
482 add_page_to_active_list(page);
483 continue;
486 /* The page is dirty, or locked, move to inactive_dirty list. */
487 if (page->buffers || TryLockPage(page)) {
488 del_page_from_inactive_clean_list(page);
489 add_page_to_inactive_dirty_list(page);
490 continue;
493 /* OK, remove the page from the caches. */
494 if (PageSwapCache(page)) {
495 __delete_from_swap_cache(page);
496 goto found_page;
499 if (page->mapping) {
500 __remove_inode_page(page);
501 goto found_page;
504 /* We should never ever get here. */
505 printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
506 list_del(page_lru);
507 zone->inactive_clean_pages--;
508 UnlockPage(page);
510 /* Reset page pointer, maybe we encountered an unfreeable page. */
511 page = NULL;
512 goto out;
514 found_page:
515 del_page_from_inactive_clean_list(page);
516 UnlockPage(page);
517 page->age = PAGE_AGE_START;
518 if (page_count(page) != 1)
519 printk("VM: reclaim_page, found page with count %d!\n",
520 page_count(page));
521 out:
522 spin_unlock(&pagemap_lru_lock);
523 spin_unlock(&pagecache_lock);
524 memory_pressure++;
525 return page;
529 * page_launder - clean dirty inactive pages, move to inactive_clean list
530 * @gfp_mask: what operations we are allowed to do
531 * @sync: should we wait synchronously for the cleaning of pages
533 * When this function is called, we are most likely low on free +
534 * inactive_clean pages. Since we want to refill those pages as
535 * soon as possible, we'll make two loops over the inactive list,
536 * one to move the already cleaned pages to the inactive_clean lists
537 * and one to (often asynchronously) clean the dirty inactive pages.
539 * In situations where kswapd cannot keep up, user processes will
540 * end up calling this function. Since the user process needs to
541 * have a page before it can continue with its allocation, we'll
542 * do synchronous page flushing in that case.
544 * This code is heavily inspired by the FreeBSD source code. Thanks
545 * go out to Matthew Dillon.
547 #define MAX_LAUNDER (4 * (1 << page_cluster))
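/*
 * MAX_LAUNDER bounds the number of "out of order" asynchronous buffer
 * flushes per launder loop; once maxlaunder is used up and the caller
 * asked for synchronous behaviour, the remaining try_to_free_buffers()
 * calls run synchronously (wait == 2).  For example, with
 * page_cluster == 4 this works out to 4 * (1 << 4) = 64 pages.
 */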
548 int page_launder(int gfp_mask, int sync)
550 int launder_loop, maxscan, cleaned_pages, maxlaunder;
551 int can_get_io_locks;
552 struct list_head * page_lru;
553 struct page * page;
556 * We can only grab the IO locks (eg. for flushing dirty
557 * buffers to disk) if __GFP_IO is set.
559 can_get_io_locks = gfp_mask & __GFP_IO;
561 launder_loop = 0;
562 maxlaunder = 0;
563 cleaned_pages = 0;
565 dirty_page_rescan:
566 spin_lock(&pagemap_lru_lock);
567 maxscan = nr_inactive_dirty_pages;
568 while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
569 maxscan-- > 0) {
570 page = list_entry(page_lru, struct page, lru);
572 /* Wrong page on list?! (list corruption, should not happen) */
573 if (!PageInactiveDirty(page)) {
574 printk("VM: page_launder, wrong page on list.\n");
575 list_del(page_lru);
576 nr_inactive_dirty_pages--;
577 page->zone->inactive_dirty_pages--;
578 continue;
581 /* Page is or was in use? Move it to the active list. */
582 if (PageTestandClearReferenced(page) || page->age > 0 ||
583 (!page->buffers && page_count(page) > 1) ||
584 page_ramdisk(page)) {
585 del_page_from_inactive_dirty_list(page);
586 add_page_to_active_list(page);
587 continue;
591 * The page is locked. IO in progress?
592 * Move it to the back of the list.
594 if (TryLockPage(page)) {
595 list_del(page_lru);
596 list_add(page_lru, &inactive_dirty_list);
597 continue;
601 * Dirty swap-cache page? Write it out if
602 * last copy..
604 if (PageDirty(page)) {
605 int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
606 if (!writepage)
607 goto page_active;
609 /* Can't start IO? Move it to the back of the list */
610 if (!can_get_io_locks) {
611 list_del(page_lru);
612 list_add(page_lru, &inactive_dirty_list);
613 UnlockPage(page);
614 continue;
617 /* OK, do a physical asynchronous write to swap. */
618 ClearPageDirty(page);
619 page_cache_get(page);
620 spin_unlock(&pagemap_lru_lock);
622 writepage(page);
623 page_cache_release(page);
625 /* And re-start the thing.. */
626 spin_lock(&pagemap_lru_lock);
627 continue;
631 * If the page has buffers, try to free the buffer mappings
632 * associated with this page. If we succeed we either free
633 * the page (in case it was a buffercache only page) or we
634 * move the page to the inactive_clean list.
636 * On the first round, we should free all previously cleaned
637 * buffer pages
639 if (page->buffers) {
640 int wait, clearedbuf;
641 int freed_page = 0;
643 * Since we might be doing disk IO, we have to
644 * drop the spinlock and take an extra reference
645 * on the page so it doesn't go away from under us.
647 del_page_from_inactive_dirty_list(page);
648 page_cache_get(page);
649 spin_unlock(&pagemap_lru_lock);
651 /* Will we do (asynchronous) IO? */
652 if (launder_loop && maxlaunder == 0 && sync)
653 wait = 2; /* Synchronous IO */
654 else if (launder_loop && maxlaunder-- > 0)
655 wait = 1; /* Async IO */
656 else
657 wait = 0; /* No IO */
659 /* Try to free the page buffers. */
660 clearedbuf = try_to_free_buffers(page, wait);
663 * Re-take the spinlock. Note that we cannot
664 * unlock the page yet since we're still
665 * accessing the page_struct here...
667 spin_lock(&pagemap_lru_lock);
669 /* The buffers were not freed. */
670 if (!clearedbuf) {
671 add_page_to_inactive_dirty_list(page);
673 /* The page was only in the buffer cache. */
674 } else if (!page->mapping) {
675 atomic_dec(&buffermem_pages);
676 freed_page = 1;
677 cleaned_pages++;
679 /* The page has more users besides the cache and us. */
680 } else if (page_count(page) > 2) {
681 add_page_to_active_list(page);
683 /* OK, we "created" a freeable page. */
684 } else /* page->mapping && page_count(page) == 2 */ {
685 add_page_to_inactive_clean_list(page);
686 cleaned_pages++;
690 * Unlock the page and drop the extra reference.
691  * We can only do it here because we are accessing
692 * the page struct above.
694 UnlockPage(page);
695 page_cache_release(page);
698 * If we're freeing buffer cache pages, stop when
699 * we've got enough free memory.
701 if (freed_page && !free_shortage())
702 break;
703 continue;
704 } else if (page->mapping && !PageDirty(page)) {
706 * If a page had an extra reference in
707 * deactivate_page(), we will find it here.
708 * Now the page is really freeable, so we
709 * move it to the inactive_clean list.
711 del_page_from_inactive_dirty_list(page);
712 add_page_to_inactive_clean_list(page);
713 UnlockPage(page);
714 cleaned_pages++;
715 } else {
716 page_active:
718 * OK, we don't know what to do with the page.
719 * It's no use keeping it here, so we move it to
720 * the active list.
722 del_page_from_inactive_dirty_list(page);
723 add_page_to_active_list(page);
724 UnlockPage(page);
727 spin_unlock(&pagemap_lru_lock);
730 * If we don't have enough free pages, we loop back once
731 * to queue the dirty pages for writeout. When we were called
732 * by a user process (that /needs/ a free page) and we didn't
733 * free anything yet, we wait synchronously on the writeout of
734  * the remaining dirty pages.
736 * We also wake up bdflush, since bdflush should, under most
737 * loads, flush out the dirty pages before we have to wait on
738 * IO.
740 if (can_get_io_locks && !launder_loop && free_shortage()) {
741 launder_loop = 1;
742 /* If we cleaned pages, never do synchronous IO. */
743 if (cleaned_pages)
744 sync = 0;
745 /* We only do a few "out of order" flushes. */
746 maxlaunder = MAX_LAUNDER;
747 /* Kflushd takes care of the rest. */
748 wakeup_bdflush(0);
749 goto dirty_page_rescan;
752 /* Return the number of pages moved to the inactive_clean list. */
753 return cleaned_pages;
757 * refill_inactive_scan - scan the active list and find pages to deactivate
758 * @priority: the priority at which to scan
759 * @oneshot: exit after deactivating one page
761 * This function will scan a portion of the active list to find
762  * unused pages; those pages will then be moved to the inactive list.
764 int refill_inactive_scan(unsigned int priority, int oneshot)
766 struct list_head * page_lru;
767 struct page * page;
768 int maxscan, page_active = 0;
769 int ret = 0;
771 /* Take the lock while messing with the list... */
772 spin_lock(&pagemap_lru_lock);
773 maxscan = nr_active_pages >> priority;
774 while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
775 page = list_entry(page_lru, struct page, lru);
777 /* Wrong page on list?! (list corruption, should not happen) */
778 if (!PageActive(page)) {
779 printk("VM: refill_inactive, wrong page on list.\n");
780 list_del(page_lru);
781 nr_active_pages--;
782 continue;
785 /* Do aging on the pages. */
786 if (PageTestandClearReferenced(page)) {
787 age_page_up_nolock(page);
788 page_active = 1;
789 } else {
790 age_page_down_ageonly(page);
792 * Since we don't hold a reference on the page
793 * ourselves, we have to do our test a bit more
794  * strict than deactivate_page(). This is needed
795 * since otherwise the system could hang shuffling
796 * unfreeable pages from the active list to the
797 * inactive_dirty list and back again...
799 * SUBTLE: we can have buffer pages with count 1.
801 if (page->age == 0 && page_count(page) <=
802 (page->buffers ? 2 : 1)) {
803 deactivate_page_nolock(page);
804 page_active = 0;
805 } else {
806 page_active = 1;
810 * If the page is still on the active list, move it
811 * to the other end of the list. Otherwise it was
812  * deactivated above and we exit successfully.
814 if (page_active || PageActive(page)) {
815 list_del(page_lru);
816 list_add(page_lru, &active_list);
817 } else {
818 ret = 1;
819 if (oneshot)
820 break;
823 spin_unlock(&pagemap_lru_lock);
825 return ret;
829 * Check if there are zones with a severe shortage of free pages,
830 * or if all zones have a minor shortage.
832 int free_shortage(void)
834 pg_data_t *pgdat = pgdat_list;
835 int sum = 0;
836 int freeable = nr_free_pages() + nr_inactive_clean_pages();
837 int freetarget = freepages.high + inactive_target / 3;
839 /* Are we low on free pages globally? */
840 if (freeable < freetarget)
841 return freetarget - freeable;
843 /* If not, are we very low on any particular zone? */
844 do {
845 int i;
846 for(i = 0; i < MAX_NR_ZONES; i++) {
847 zone_t *zone = pgdat->node_zones+ i;
848 if (zone->size && (zone->inactive_clean_pages +
849 zone->free_pages < zone->pages_min+1)) {
850 /* + 1 to have overlap with alloc_pages() !! */
851 sum += zone->pages_min + 1;
852 sum -= zone->free_pages;
853 sum -= zone->inactive_clean_pages;
856 pgdat = pgdat->node_next;
857 } while (pgdat);
859 return sum;
863 * How many inactive pages are we short?
865 int inactive_shortage(void)
867 int shortage = 0;
869 shortage += freepages.high;
870 shortage += inactive_target;
871 shortage -= nr_free_pages();
872 shortage -= nr_inactive_clean_pages();
873 shortage -= nr_inactive_dirty_pages;
875 if (shortage > 0)
876 return shortage;
878 return 0;
882  * We need to make the locks finer grained, but right
883 * now we need this so that we can do page allocations
884 * without holding the kernel lock etc.
886 * We want to try to free "count" pages, and we want to
887 * cluster them so that we get good swap-out behaviour.
889 * OTOH, if we're a user process (and not kswapd), we
890 * really care about latency. In that case we don't try
891 * to free too many pages.
893 static int refill_inactive(unsigned int gfp_mask, int user)
895 int priority, count, start_count, made_progress;
896 unsigned long idle_time;
898 count = inactive_shortage() + free_shortage();
899 if (user)
900 count = (1 << page_cluster);
901 start_count = count;
903 /* Always trim SLAB caches when memory gets low. */
904 kmem_cache_reap(gfp_mask);
907 * Calculate the minimum time (in seconds) a process must
908 * have slept before we consider it for idle swapping.
909 * This must be the number of seconds it takes to go through
910 * all of the cache. Doing this idle swapping makes the VM
911 * smoother once we start hitting swap.
913 idle_time = atomic_read(&page_cache_size);
914 idle_time += atomic_read(&buffermem_pages);
915 idle_time /= (inactive_target + 1);
917 priority = 6;
918 do {
919 made_progress = 0;
921 if (current->need_resched) {
922 __set_current_state(TASK_RUNNING);
923 schedule();
926 while (refill_inactive_scan(priority, 1) ||
927 swap_out(priority, gfp_mask, idle_time)) {
928 made_progress = 1;
929 if (--count <= 0)
930 goto done;
934  * Don't go too easy on the dentry/inode caches, since
935  * refill_inactive() almost never fails when there's
936 * really plenty of memory free.
938 shrink_dcache_memory(priority, gfp_mask);
939 shrink_icache_memory(priority, gfp_mask);
942 * Then, try to page stuff out..
944 while (swap_out(priority, gfp_mask, 0)) {
945 made_progress = 1;
946 if (--count <= 0)
947 goto done;
951 * If we either have enough free memory, or if
952 * page_launder() will be able to make enough
953 * free memory, then stop.
955 if (!inactive_shortage() || !free_shortage())
956 goto done;
959 * Only switch to a lower "priority" if we
960 * didn't make any useful progress in the
961 * last loop.
963 if (!made_progress)
964 priority--;
965 } while (priority >= 0);
967 /* Always end on a refill_inactive.., may sleep... */
968 while (refill_inactive_scan(0, 1)) {
969 if (--count <= 0)
970 goto done;
973 done:
974 return (count < start_count);
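/*
 * do_try_to_free_pages() is shared by kswapd and by allocators that
 * call try_to_free_pages() directly; both run it with PF_MEMALLOC set
 * so that the freeing path itself can still allocate the small amounts
 * of memory it needs along the way.
 */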
977 static int do_try_to_free_pages(unsigned int gfp_mask, int user)
979 int ret = 0;
982 * If we're low on free pages, move pages from the
983 * inactive_dirty list to the inactive_clean list.
985 * Usually bdflush will have pre-cleaned the pages
986 * before we get around to moving them to the other
987 * list, so this is a relatively cheap operation.
989 if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
990 nr_inactive_clean_pages())
991 ret += page_launder(gfp_mask, user);
994 * If needed, we move pages from the active list
995 * to the inactive list. We also "eat" pages from
996 * the inode and dentry cache whenever we do this.
998 if (free_shortage() || inactive_shortage()) {
999 shrink_dcache_memory(6, gfp_mask);
1000 shrink_icache_memory(6, gfp_mask);
1001 ret += refill_inactive(gfp_mask, user);
1002 } else {
1004 * Reclaim unused slab cache memory.
1006 kmem_cache_reap(gfp_mask);
1007 ret = 1;
1010 return ret;
1013 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
1014 DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
1015 struct task_struct *kswapd_task;
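/*
 * Two wait queues tie kswapd to its callers: allocators wake kswapd
 * through kswapd_wait (see wakeup_kswapd() below), and callers that
 * asked to block sleep on kswapd_done until kswapd finishes a pass
 * and calls wake_up_all(&kswapd_done).  kswapd_task lets
 * wakeup_kswapd() recognise when it is called from kswapd itself
 * and bail out.
 */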
1018 * The background pageout daemon, started as a kernel thread
1019 * from the init process.
1021 * This basically trickles out pages so that we have _some_
1022 * free memory available even if there is no other activity
1023 * that frees anything up. This is needed for things like routing
1024 * etc, where we otherwise might have all activity going on in
1025 * asynchronous contexts that cannot page things out.
1027 * If there are applications that are active memory-allocators
1028 * (most normal use), this basically shouldn't matter.
1030 int kswapd(void *unused)
1032 struct task_struct *tsk = current;
1034 tsk->session = 1;
1035 tsk->pgrp = 1;
1036 strcpy(tsk->comm, "kswapd");
1037 sigfillset(&tsk->blocked);
1038 kswapd_task = tsk;
1041 * Tell the memory management that we're a "memory allocator",
1042 * and that if we need more memory we should get access to it
1043 * regardless (see "__alloc_pages()"). "kswapd" should
1044 * never get caught in the normal page freeing logic.
1046 * (Kswapd normally doesn't need memory anyway, but sometimes
1047 * you need a small amount of memory in order to be able to
1048 * page out something else, and this flag essentially protects
1049 * us from recursively trying to free more memory as we're
1050 * trying to free the first piece of memory in the first place).
1052 tsk->flags |= PF_MEMALLOC;
1055 * Kswapd main loop.
1057 for (;;) {
1058 static int recalc = 0;
1060 /* If needed, try to free some memory. */
1061 if (inactive_shortage() || free_shortage()) {
1062 int wait = 0;
1063 /* Do we need to do some synchronous flushing? */
1064 if (waitqueue_active(&kswapd_done))
1065 wait = 1;
1066 do_try_to_free_pages(GFP_KSWAPD, wait);
1070 * Do some (very minimal) background scanning. This
1071 * will scan all pages on the active list once
1072 * every minute. This clears old referenced bits
1073 * and moves unused pages to the inactive list.
1075 refill_inactive_scan(6, 0);
1077 /* Once a second, recalculate some VM stats. */
1078 if (time_after(jiffies, recalc + HZ)) {
1079 recalc = jiffies;
1080 recalculate_vm_stats();
1084 * Wake up everybody waiting for free memory
1085 * and unplug the disk queue.
1087 wake_up_all(&kswapd_done);
1088 run_task_queue(&tq_disk);
1091 * We go to sleep if either the free page shortage
1092 * or the inactive page shortage is gone. We do this
1093 * because:
1094 * 1) we need no more free pages or
1095 * 2) the inactive pages need to be flushed to disk,
1096 * it wouldn't help to eat CPU time now ...
1098 * We go to sleep for one second, but if it's needed
1099 * we'll be woken up earlier...
1101 if (!free_shortage() || !inactive_shortage()) {
1102 interruptible_sleep_on_timeout(&kswapd_wait, HZ);
1104 * If we couldn't free enough memory, we see if it was
1105 * due to the system just not having enough memory.
1106 * If that is the case, the only solution is to kill
1107  * a process (the alternative is eternal deadlock).
1109 * If there still is enough memory around, we just loop
1110  * and try to free some more memory...
1112 } else if (out_of_memory()) {
1113 oom_kill();
1118 void wakeup_kswapd(int block)
1120 DECLARE_WAITQUEUE(wait, current);
1122 if (current == kswapd_task)
1123 return;
1125 if (!block) {
1126 if (waitqueue_active(&kswapd_wait))
1127 wake_up(&kswapd_wait);
1128 return;
1132 * Kswapd could wake us up before we get a chance
1133 * to sleep, so we have to be very careful here to
1134 * prevent SMP races...
1136 __set_current_state(TASK_UNINTERRUPTIBLE);
1137 add_wait_queue(&kswapd_done, &wait);
1139 if (waitqueue_active(&kswapd_wait))
1140 wake_up(&kswapd_wait);
1141 schedule();
1143 remove_wait_queue(&kswapd_done, &wait);
1144 __set_current_state(TASK_RUNNING);
1148 * Called by non-kswapd processes when they want more
1149 * memory but are unable to sleep on kswapd because
1150 * they might be holding some IO locks ...
1152 int try_to_free_pages(unsigned int gfp_mask)
1154 int ret = 1;
1156 if (gfp_mask & __GFP_WAIT) {
1157 current->flags |= PF_MEMALLOC;
1158 ret = do_try_to_free_pages(gfp_mask, 1);
1159 current->flags &= ~PF_MEMALLOC;
1162 return ret;
1165 DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
1167 * Kreclaimd will move pages from the inactive_clean list to the
1168 * free list, in order to keep atomic allocations possible under
1169 * all circumstances. Even when kswapd is blocked on IO.
1171 int kreclaimd(void *unused)
1173 struct task_struct *tsk = current;
1174 pg_data_t *pgdat;
1176 tsk->session = 1;
1177 tsk->pgrp = 1;
1178 strcpy(tsk->comm, "kreclaimd");
1179 sigfillset(&tsk->blocked);
1180 current->flags |= PF_MEMALLOC;
1182 while (1) {
1185 * We sleep until someone wakes us up from
1186 * page_alloc.c::__alloc_pages().
1188 interruptible_sleep_on(&kreclaimd_wait);
1191 * Move some pages from the inactive_clean lists to
1192 * the free lists, if it is needed.
1194 pgdat = pgdat_list;
1195 do {
1196 int i;
1197 for(i = 0; i < MAX_NR_ZONES; i++) {
1198 zone_t *zone = pgdat->node_zones + i;
1199 if (!zone->size)
1200 continue;
1202 while (zone->free_pages < zone->pages_low) {
1203 struct page * page;
1204 page = reclaim_page(zone);
1205 if (!page)
1206 break;
1207 __free_page(page);
1210 pgdat = pgdat->node_next;
1211 } while (pgdat);
1216 static int __init kswapd_init(void)
1218 printk("Starting kswapd v1.8\n");
1219 swap_setup();
1220 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1221 kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1222 return 0;
1225 module_init(kswapd_init)