/*
 * linux/mm/vmscan.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96 sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/bitops.h>
#include <asm/pgtable.h>
/*
 * When are we next due for a page scan?
 */
static unsigned long next_swap_jiffies = 0;

/*
 * How often do we do a pageout scan during normal conditions?
 * Default is four times a second.
 */
int swapout_interval = HZ / 4;

/*
 * The wait queue for waking up the pageout daemon:
 */
struct wait_queue * kswapd_wait = NULL;

static void init_swap_timer(void);
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
	unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page;
	struct page * page_map;

	pte = *page_table;
	if (!pte_present(pte))
		return 0;
	page = pte_page(pte);
	if (MAP_NR(page) >= max_mapnr)
		return 0;

	page_map = mem_map + MAP_NR(page);
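	/* Leave the page alone if it is reserved, currently locked for
	 * I/O, or unsuitable for a DMA-constrained allocation. */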
	if (PageReserved(page_map)
	    || PageLocked(page_map)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
		return 0;

	/*
	 * Deal with page aging. There are several special cases to
	 * consider:
	 *
	 * Page has been accessed, but is swap cached. If the page is
	 * getting sufficiently "interesting" --- its age is getting
	 * high --- then, if we are sufficiently short of free swap
	 * pages, we delete the swap cache. We can only do this if
	 * the swap page's reference count is one: ie. there are no
	 * other references to it beyond the swap cache (as there must
	 * still be PTEs pointing to it if count > 1).
	 *
	 * If the page has NOT been touched, and its age reaches zero,
	 * then we are swapping it out:
	 *
	 * If there is already a swap cache page for this page, then
	 * another process has already allocated swap space, so just
	 * dereference the physical page and copy in the swap entry
	 * from the swap cache.
	 *
	 * Note, we rely on all pages read in from swap either having
	 * the swap cache flag set, OR being marked writable in the pte,
	 * but NEVER BOTH. (It IS legal to be neither cached nor dirty,
	 * however.)
	 *
	 * -- Stephen Tweedie 1998
	 */
	if (PageSwapCache(page_map)) {
		if (pte_write(pte)) {
			struct page *found;
			printk ("VM: Found a writable swap-cached page!\n");
			/* Try to diagnose the problem ... */
			found = find_page(&swapper_inode, page_map->offset);
			if (found) {
				printk("page=%p@%08lx, found=%p, count=%d\n",
					page_map, page_map->offset,
					found, atomic_read(&found->count));
				__free_page(found);
			} else
				printk ("Spurious, page not in cache\n");
			return 0;
		}
	}

	if (pte_young(pte)) {
		set_pte(page_table, pte_mkold(pte));
		touch_page(page_map);
		/*
		 * We should test here to see if we want to recover any
		 * swap cache page here. We do this if the page is seeing
		 * enough activity, AND we are sufficiently low on swap.
		 *
		 * We need to track both the number of available swap
		 * pages and the total number present before we can do
		 * this...
		 */
		return 0;
	}

	age_page(page_map);
	if (page_map->age)
		return 0;
	if (pte_dirty(pte)) {
		if (vma->vm_ops && vma->vm_ops->swapout) {
			pid_t pid = tsk->pid;
			vma->vm_mm->rss--;
			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
				kill_proc(pid, SIGBUS, 1);
		} else {
			/*
			 * This is a dirty, swappable page. First of all,
			 * get a suitable swap entry for it, and make sure
			 * we have the swap cache set up to associate the
			 * page with that swap entry.
			 */
			entry = in_swap_cache(page_map);
			if (!entry) {
				entry = get_swap_page();
				if (!entry)
					return 0; /* No swap space left */
			}

			vma->vm_mm->rss--;
			tsk->nswap++;
			flush_cache_page(vma, address);
			set_pte(page_table, __pte(entry));
			flush_tlb_page(vma, address);
			swap_duplicate(entry);

			/* Now to write back the page. We have two
			 * cases: if the page is already part of the
			 * swap cache, then it is already on disk. Just
			 * free the page and return (we release the swap
			 * cache on the last accessor too).
			 *
			 * If we have made a new swap entry, then we
			 * start the write out to disk. If the page is
			 * shared, however, we still need to keep the
			 * copy in memory, so we add it to the swap
			 * cache. */
			if (PageSwapCache(page_map)) {
				free_page_and_swap_cache(page);
				return (atomic_read(&page_map->count) == 0);
			}
			add_to_swap_cache(page_map, entry);
			/* We checked we were unlocked way up above, and we
			   have been careful not to stall until here */
			set_bit(PG_locked, &page_map->flags);
			/* OK, do a physical write to swap. */
			rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
		}
		/* Now we can free the current physical page. We also
		 * free up the swap cache if this is the last use of the
		 * page. Note that there is a race here: the page may
		 * still be shared COW by another process, but that
		 * process may exit while we are writing out the page
		 * asynchronously. That's no problem, shrink_mmap() can
		 * correctly clean up the occasional unshared page
		 * which gets left behind in the swap cache. */
		free_page_and_swap_cache(page);
		return 1;	/* we slept: the process may not exist any more */
	}
	/* The page was _not_ dirty, but still has a zero age. It must
	 * already be uptodate on disk. If it is in the swap cache,
	 * then we can just unlink the page now. Remove the swap cache
	 * too if this is the last user. */
	if ((entry = in_swap_cache(page_map))) {
		vma->vm_mm->rss--;
		flush_cache_page(vma, address);
		set_pte(page_table, __pte(entry));
		flush_tlb_page(vma, address);
		swap_duplicate(entry);
		free_page_and_swap_cache(page);
		return (atomic_read(&page_map->count) == 0);
	}
	/*
	 * A clean page to be discarded? Must be mmap()ed from
	 * somewhere. Unlink the pte, and tell the filemap code to
	 * discard any cached backing page if this is the last user.
	 */
	if (PageSwapCache(page_map)) {
		printk ("VM: How can this page _still_ be cached?\n");
		return 0;
	}
	vma->vm_mm->rss--;
	flush_cache_page(vma, address);
	pte_clear(page_table);
	flush_tlb_page(vma, address);
	entry = page_unuse(page_map);
	__free_page(page_map);
	return entry;
}
/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults the process recently had, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint of how much CPU to waste on the
 *       swap block search, not a hint of how many blocks to swap with
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
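		/* Record where the next scan should resume before we
		 * possibly sleep inside try_to_swap_out(). */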
		tsk->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *pgdir, unsigned long start, int gfp_mask)
{
	unsigned long end;

	/* Don't swap out areas like shared memory which have their
	   own separate swapping mechanism or areas which are locked down */
	if (vma->vm_flags & (VM_SHM | VM_LOCKED))
		return 0;

	end = vma->vm_end;
	while (start < end) {
		int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask);
		if (result)
			return result;
		start = (start + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}
static int swap_out_process(struct task_struct * p, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = p->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(p->mm, address);
	if (!vma) {
		p->swap_address = 0;
		return 0;
	}
	if (address < vma->vm_start)
		address = vma->vm_start;

	for (;;) {
		int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask);
		if (result)
			return result;
		vma = vma->vm_next;
		if (!vma)
			break;
		address = vma->vm_start;
	}
	p->swap_address = 0;
	return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p, * pbest;
	int counter, assign, max_cnt;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal swap_cnt.
	 *   Pass 2: assign new swap_cnt values, then select as above.
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 */
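	/* The scan budget scales with the number of tasks and shrinks
	 * exponentially as the priority value rises (6 = least urgent,
	 * 0 = most urgent). */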
	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
	for (; counter >= 0; counter--) {
		assign = 0;
		max_cnt = 0;
		pbest = NULL;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			if (!p->swappable)
				continue;
			if (p->mm->rss <= 0)
				continue;
			if (assign) {
				/*
				 * If we didn't select a task on pass 1,
				 * assign each task a new swap_cnt.
				 * Normalise the number of pages swapped
				 * by multiplying by (RSS / 1MB)
				 */
				p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
			}
			if (p->swap_cnt > max_cnt) {
				max_cnt = p->swap_cnt;
				pbest = p;
			}
		}
		read_unlock(&tasklist_lock);
		if (!pbest) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			goto out;
		}
		pbest->swap_cnt--;

		switch (swap_out_process(pbest, gfp_mask)) {
		case 0:
			/*
			 * Clear swap_cnt so we don't look at this task
			 * again until we've tried all of the others.
			 * (We didn't block, so the task is still here.)
			 */
			pbest->swap_cnt = 0;
			break;
		case 1:
			return 1;
		default:
			break;
		}
	}
out:
	return 0;
}
/*
 * We are much more aggressive about trying to swap out than we used
 * to be. This works out OK, because we now do proper aging on page
 * contents.
 */
static int do_try_to_free_page(int gfp_mask)
{
	static int state = 0;
	int i=6;
	int stop;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/* We try harder if we are waiting .. */
	stop = 3;
	if (gfp_mask & __GFP_WAIT)
		stop = 0;

	if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
	    || (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
		shrink_mmap(i, gfp_mask);
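	/* The case labels below jump into the middle of the do-while
	 * loop: the static 'state' remembers which pool we tried last,
	 * so successive calls rotate between the page cache, SysV shared
	 * memory, process pages and the dentry cache instead of always
	 * starting from the same place. */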
	switch (state) {
		do {
		case 0:
			if (shrink_mmap(i, gfp_mask))
				return 1;
			state = 1;
		case 1:
			if (shm_swap(i, gfp_mask))
				return 1;
			state = 2;
		case 2:
			if (swap_out(i, gfp_mask))
				return 1;
			state = 3;
		case 3:
			shrink_dcache_memory(i, gfp_mask);
			state = 0;
			i--;
		} while ((i - stop) >= 0);
	}
	return 0;
}
/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message). It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
	int i;
	char *revision="$Revision: 1.5 $", *s, *e;

	if ((s = strchr(revision, ':')) &&
	    (e = strchr(s, '$')))
		s++, i = e - s;
	else
		s = revision, i = -1;

	printk ("Starting kswapd v%.*s\n", i, s);
}
/*
 * The background pageout daemon.
 * Started as a kernel thread from the init process.
 */
int kswapd(void *unused)
{
	struct wait_queue wait = { current, NULL };

	current->session = 1;
	current->pgrp = 1;
	sprintf(current->comm, "kswapd");
	sigfillset(&current->blocked);

	/*
	 * As a kernel thread we want to tamper with system buffers
	 * and other internals and thus be subject to the SMP locking
	 * rules. (On a uniprocessor box this does nothing).
	 */
	lock_kernel();

	/* Give kswapd a realtime priority. */
	current->policy = SCHED_FIFO;
	current->rt_priority = 32;	/* Fixme --- we need to standardise our
					   namings for POSIX.4 realtime scheduling
					   priorities. */

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "try_to_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	current->flags |= PF_MEMALLOC;

	init_swap_timer();
	add_wait_queue(&kswapd_wait, &wait);
	while (1) {
		int tries;

		current->state = TASK_INTERRUPTIBLE;
		flush_signals(current);
		run_task_queue(&tq_disk);
		schedule();
		swapstats.wakeups++;
		/*
		 * Do the background pageout: be
		 * more aggressive if we're really
		 * low on free memory.
		 *
		 * We try pager_daemon.tries_base times, divided by
		 * an 'urgency factor'. In practice this will mean
		 * a value of pager_daemon.tries_base / 8 or 4 = 64
		 * or 128 pages at a time.
		 * This gives us 64 (or 128) * 4k * 4 (times/sec) =
		 * 1 (or 2) MB/s swapping bandwidth in low-priority
		 * background paging. This number rises to 8 MB/s
		 * when the priority is highest (but then we'll be
		 * woken up more often and the rate will be even
		 * higher).
		 */
		tries = pager_daemon.tries_base;
		tries >>= 4*free_memory_available();
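		/* Each level of free-memory headroom reported by
		 * free_memory_available() divides the number of attempts
		 * by 16 (shift right by 4 per level). */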
		do {
			do_try_to_free_page(0);
			/*
			 * Syncing large chunks is faster than swapping
			 * synchronously (less head movement). -- Rik.
			 */
			if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
				run_task_queue(&tq_disk);
			if (free_memory_available() > 1)
				break;
		} while (--tries > 0);
	}
	/* As if we could ever get here - maybe we want to make this killable */
	remove_wait_queue(&kswapd_wait, &wait);
	unlock_kernel();
	return 0;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * The "PF_MEMALLOC" flag protects us against recursion:
 * if we need more memory as part of a swap-out effort we
 * will just silently return "success" to tell the page
 * allocator to accept the allocation.
 */
int try_to_free_pages(unsigned int gfp_mask, int count)
{
	int retval = 1;

	lock_kernel();
	if (!(current->flags & PF_MEMALLOC)) {
		current->flags |= PF_MEMALLOC;
		do {
			retval = do_try_to_free_page(gfp_mask);
			if (!retval)
				break;
			count--;
		} while (count > 0);
		current->flags &= ~PF_MEMALLOC;
	}
	unlock_kernel();
	return retval;
}
/*
 * The swap_tick function gets called on every clock tick.
 */
void swap_tick(void)
{
	unsigned long now, want;
	int want_wakeup = 0;

	want = next_swap_jiffies;
	now = jiffies;

	/*
	 * Examine the memory queues. Mark memory low
	 * if there is nothing available in the three
	 * highest queues.
	 *
	 * Schedule for wakeup if there isn't lots
	 * of free memory.
	 */
	switch (free_memory_available()) {
	case 0:
		want = now;
		/* Fall through */
	case 1:
		want_wakeup = 1;
	default:
		break;
	}
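	/* Signed difference, so the comparison stays correct even when
	 * the jiffies counter wraps around. */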
	if ((long) (now - want) >= 0) {
		if (want_wakeup || (num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100
			|| (num_physpages * page_cache.max_percent < page_cache_size * 100)) {
			/* Set the next wake-up time */
			next_swap_jiffies = now + swapout_interval;
			wake_up(&kswapd_wait);
		}
	}
	timer_active |= (1<<SWAP_TIMER);
}
/*
 * Initialise the swap timer
 */
void init_swap_timer(void)
{
	timer_table[SWAP_TIMER].expires = 0;
	timer_table[SWAP_TIMER].fn = swap_tick;
	timer_active |= (1<<SWAP_TIMER);
}