/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/pgtable.h>
/*
 * The wait queue for waking up the pageout daemon:
 */
static struct task_struct * kswapd_task = NULL;

static void init_swap_timer(void);
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page.  It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
        unsigned long address, pte_t * page_table, int gfp_mask)
{
        pte_t pte;
        unsigned long entry;
        unsigned long page;
        struct page * page_map;

        pte = *page_table;
        if (!pte_present(pte))
                return 0;
        page = pte_page(pte);
        if (MAP_NR(page) >= max_mapnr)
                return 0;

        page_map = mem_map + MAP_NR(page);
        if (PageReserved(page_map)
            || PageLocked(page_map)
            || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
                return 0;
        /*
         * Deal with page aging.  There are several special cases to
         * consider:
         *
         * Page has been accessed, but is swap cached.  If the page is
         * getting sufficiently "interesting" --- its age is getting
         * high --- then if we are sufficiently short of free swap
         * pages, then delete the swap cache.  We can only do this if
         * the swap page's reference count is one: ie. there are no
         * other references to it beyond the swap cache (as there must
         * still be PTEs pointing to it if count > 1).
         *
         * If the page has NOT been touched, and its age reaches zero,
         * then we are swapping it out:
         *
         *   If there is already a swap cache page for this page, then
         *   another process has already allocated swap space, so just
         *   dereference the physical page and copy in the swap entry
         *   from the swap cache.
         *
         * Note, we rely on all pages read in from swap either having
         * the swap cache flag set, OR being marked writable in the pte,
         * but NEVER BOTH.  (It IS legal to be neither cached nor dirty,
         * however.)
         *
         * -- Stephen Tweedie 1998
         */
        if (PageSwapCache(page_map)) {
                if (pte_write(pte)) {
                        struct page *found;
                        printk ("VM: Found a writable swap-cached page!\n");
                        /* Try to diagnose the problem ... */
                        found = find_page(&swapper_inode, page_map->offset);
                        if (found) {
                                printk("page=%p@%08lx, found=%p, count=%d\n",
                                        page_map, page_map->offset,
                                        found, atomic_read(&found->count));
                                __free_page(found);
                        } else
                                printk ("Spurious, page not in cache\n");
                        return 0;
                }
        }

        if (pte_young(pte)) {
                /*
                 * Transfer the "accessed" bit from the page
                 * tables to the global page map.
                 */
                set_pte(page_table, pte_mkold(pte));
                set_bit(PG_referenced, &page_map->flags);

                /*
                 * We should test here to see if we want to recover any
                 * swap cache page here.  We do this if the page is seeing
                 * enough activity, AND we are sufficiently low on swap.
                 *
                 * We need to track both the number of available swap
                 * pages and the total number present before we can do
                 * this...
                 */
                return 0;
        }
        if (pte_dirty(pte)) {
                if (vma->vm_ops && vma->vm_ops->swapout) {
                        pid_t pid = tsk->pid;
                        vma->vm_mm->rss--;
                        if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
                                kill_proc(pid, SIGBUS, 1);
                } else {
                        /*
                         * This is a dirty, swappable page.  First of all,
                         * get a suitable swap entry for it, and make sure
                         * we have the swap cache set up to associate the
                         * page with that swap entry.
                         */
                        entry = in_swap_cache(page_map);
                        if (!entry) {
                                entry = get_swap_page();
                                if (!entry)
                                        return 0; /* No swap space left */
                        }

                        vma->vm_mm->rss--;
                        tsk->nswap++;
                        flush_cache_page(vma, address);
                        set_pte(page_table, __pte(entry));
                        flush_tlb_page(vma, address);
                        swap_duplicate(entry);

                        /* Now to write back the page.  We have two
                         * cases: if the page is already part of the
                         * swap cache, then it is already on disk.  Just
                         * free the page and return (we release the swap
                         * cache on the last accessor too).
                         *
                         * If we have made a new swap entry, then we
                         * start the write out to disk.  If the page is
                         * shared, however, we still need to keep the
                         * copy in memory, so we add it to the swap
                         * cache. */
                        if (PageSwapCache(page_map)) {
                                free_page(page);
                                return (atomic_read(&page_map->count) == 0);
                        }
                        add_to_swap_cache(page_map, entry);
                        /* We checked we were unlocked way up above, and we
                           have been careful not to stall until here */
                        set_bit(PG_locked, &page_map->flags);
                        /* OK, do a physical write to swap. */
                        rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
                }
                /* Now we can free the current physical page.  We also
                 * free up the swap cache if this is the last use of the
                 * page.  Note that there is a race here: the page may
                 * still be shared COW by another process, but that
                 * process may exit while we are writing out the page
                 * asynchronously.  That's no problem, shrink_mmap() can
                 * correctly clean up the occasional unshared page
                 * which gets left behind in the swap cache. */
                free_page(page);
                return 1;       /* we slept: the process may not exist any more */
        }
        /* The page was _not_ dirty, but still has a zero age.  It must
         * already be uptodate on disk.  If it is in the swap cache,
         * then we can just unlink the page now.  Remove the swap cache
         * too if this is the last user. */
        if ((entry = in_swap_cache(page_map))) {
                vma->vm_mm->rss--;
                flush_cache_page(vma, address);
                set_pte(page_table, __pte(entry));
                flush_tlb_page(vma, address);
                swap_duplicate(entry);
                free_page(page);
                return (atomic_read(&page_map->count) == 0);
        }
        /*
         * A clean page to be discarded?  Must be mmap()ed from
         * somewhere.  Unlink the pte, and tell the filemap code to
         * discard any cached backing page if this is the last user.
         */
        if (PageSwapCache(page_map)) {
                printk ("VM: How can this page _still_ be cached?");
                return 0;
        }
        vma->vm_mm->rss--;
        flush_cache_page(vma, address);
        pte_clear(page_table);
        flush_tlb_page(vma, address);
        entry = (atomic_read(&page_map->count) == 1);
        __free_page(page_map);
        return entry;
}

/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 * swap block search, not a hint of how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
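/*
 * Scan the ptes mapped by one pmd entry and hand each present page to
 * try_to_swap_out().  The next address to look at is remembered in
 * tsk->swap_address so that a later scan can resume where this one
 * stopped.
 */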
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
        pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
                pmd_clear(dir);
                return 0;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                int result;
                tsk->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
                if (result)
                        return result;
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}

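/*
 * Scan the pmds covered by one pgd entry, delegating the actual work
 * to swap_out_pmd().
 */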
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
        pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (end > pgd_end)
                end = pgd_end;

        do {
                int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
                if (result)
                        return result;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}

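/*
 * Scan a single vma, one page directory entry at a time, starting at
 * 'address'.
 */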
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, int gfp_mask)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas like shared memory which have their
           own separate swapping mechanism or areas which are locked down */
        if (vma->vm_flags & (VM_SHM | VM_LOCKED))
                return 0;

        pgdir = pgd_offset(tsk->mm, address);

        end = vma->vm_end;
        while (address < end) {
                int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
                if (result)
                        return result;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        }
        return 0;
}

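/*
 * Scan one process for a page to swap out, resuming from the address
 * saved in p->swap_address.  Any nonzero result from the lower-level
 * routines is passed straight back; if the whole address space is
 * scanned without success, swap_cnt and swap_address are reset so the
 * next pass starts from the beginning.
 */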
static int swap_out_process(struct task_struct * p, int gfp_mask)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = p->swap_address;

        /*
         * Find the proper vm-area
         */
        vma = find_vma(p->mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        int result = swap_out_vma(p, vma, address, gfp_mask);
                        if (result)
                                return result;
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        address = vma->vm_start;
                }
        }

        /* We didn't find anything for the process */
        p->swap_cnt = 0;
        p->swap_address = 0;
        return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
        struct task_struct * p, * pbest;
        int counter, assign, max_cnt;

        /*
         * We make one or two passes through the task list, indexed by
         * assign = {0, 1}:
         *   Pass 1: select the swappable task with maximal swap_cnt.
         *   Pass 2: assign new swap_cnt values, then select as above.
         * With this approach, there's no need to remember the last task
         * swapped out.  If the swap-out fails, we clear swap_cnt so the
         * task won't be selected again until all others have been tried.
         */
        counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
        for (; counter >= 0; counter--) {
                assign = 0;
                max_cnt = 0;
                pbest = NULL;
        select:
                read_lock(&tasklist_lock);
                p = init_task.next_task;
                for (; p != &init_task; p = p->next_task) {
                        if (!p->swappable)
                                continue;
                        if (p->mm->rss <= 0)
                                continue;
                        if (assign) {
                                /*
                                 * If we didn't select a task on pass 1,
                                 * assign each task a new swap_cnt.
                                 * Normalise the number of pages swapped
                                 * by multiplying by (RSS / 1MB)
                                 */
                                p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
                        }
                        if (p->swap_cnt > max_cnt) {
                                max_cnt = p->swap_cnt;
                                pbest = p;
                        }
                }
                read_unlock(&tasklist_lock);
                if (!pbest) {
                        if (!assign) {
                                assign = 1;
                                goto select;
                        }
                        goto out;
                }
                pbest->swap_cnt--;

                /*
                 * Nonzero means we cleared out something, but only "1" means
                 * that we actually free'd up a page as a result.
                 */
                if (swap_out_process(pbest, gfp_mask) == 1)
                        return 1;
        }
out:
        return 0;
}

/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message).  It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
        int i;
        char *revision = "$Revision: 1.5 $", *s, *e;

        swap_setup();

        if ((s = strchr(revision, ':')) &&
            (e = strchr(s, '$')))
                s++, i = e - s;
        else
                s = revision, i = -1;
        printk ("Starting kswapd v%.*s\n", i, s);
}

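/*
 * Helper for the priority loops below: call "fn" repeatedly until it
 * reports no more progress (returns 0), or until the caller's local
 * "count" budget of freed pages is used up, in which case we jump to
 * the caller's local "done:" label.  Both "count" and "done:" must
 * exist at the expansion site.
 */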
#define free_memory(fn) \
        count++; do { if (!--count) goto done; } while (fn)

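/*
 * Free pages until nr_free_pages is comfortably above freepages.high,
 * or until roughly one hundredth of a second has gone by.  The switch
 * interleaved with the do-while lets a pass start at the stage recorded
 * in kswapd_state (page cache, SHM segments, or process pages) instead
 * of always starting from the beginning.
 */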
static int kswapd_free_pages(int kswapd_state)
{
        unsigned long end_time;

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(0);

        /* max one hundredth of a second */
        end_time = jiffies + (HZ-1)/100;
        do {
                int priority = 5;
                int count = pager_daemon.swap_cluster;

                switch (kswapd_state) {
                        do {
                        default:
                                free_memory(shrink_mmap(priority, 0));
                                kswapd_state++;
                        case 1:
                                free_memory(shm_swap(priority, 0));
                                kswapd_state++;
                        case 2:
                                free_memory(swap_out(priority, 0));
                                shrink_dcache_memory(priority, 0);
                                kswapd_state = 0;
                        } while (--priority >= 0);
                        return kswapd_state;
                }
done:
                if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
                        break;
        } while (time_before_eq(jiffies, end_time));
        return kswapd_state;
}

/*
 * The background pageout daemon.
 * Started as a kernel thread from the init process.
 */
int kswapd(void *unused)
{
        current->session = 1;
        current->pgrp = 1;
        strcpy(current->comm, "kswapd");
        sigfillset(&current->blocked);

        /*
         * As a kernel thread we want to tamper with system buffers
         * and other internals and thus be subject to the SMP locking
         * rules. (On a uniprocessor box this does nothing).
         */
        lock_kernel();

        /*
         * Set the base priority to something smaller than a
         * regular process.  We will scale up the priority
         * dynamically depending on how much memory we need.
         */
        current->priority = (DEF_PRIORITY * 2) / 3;

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "try_to_free_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        current->flags |= PF_MEMALLOC;

        init_swap_timer();
        kswapd_task = current;
        while (1) {
                int state = 0;

                current->state = TASK_INTERRUPTIBLE;
                flush_signals(current);
                run_task_queue(&tq_disk);
                schedule();
                swapstats.wakeups++;
                state = kswapd_free_pages(state);
        }
        /* As if we could ever get here - maybe we want to make this killable */
        kswapd_task = NULL;
        unlock_kernel();
        return 0;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * The "PF_MEMALLOC" flag protects us against recursion:
 * if we need more memory as part of a swap-out effort we
 * will just silently return "success" to tell the page
 * allocator to accept the allocation.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour.  See
 * the "free_memory()" macro for details.
 */
int try_to_free_pages(unsigned int gfp_mask, int count)
{
        int retval;

        lock_kernel();

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(gfp_mask);

        retval = 1;
        if (!(current->flags & PF_MEMALLOC)) {
                int priority;

                current->flags |= PF_MEMALLOC;

                priority = 5;
                do {
                        free_memory(shrink_mmap(priority, gfp_mask));
                        free_memory(shm_swap(priority, gfp_mask));
                        free_memory(swap_out(priority, gfp_mask));
                        shrink_dcache_memory(priority, gfp_mask);
                } while (--priority >= 0);
                retval = 0;
done:
                current->flags &= ~PF_MEMALLOC;
        }
        unlock_kernel();

        return retval;
}

/*
 * Wake up kswapd according to the priority
 *      0 - no wakeup
 *      1 - wake up as a low-priority process
 *      2 - wake up as a normal process
 *      3 - wake up as an almost real-time process
 *
 * This plays mind-games with the "goodness()"
 * function in kernel/sched.c.
 */
static inline void kswapd_wakeup(struct task_struct *p, int priority)
{
        if (priority) {
                p->counter = p->priority << priority;
                wake_up_process(p);
        }
}

/*
 * The swap_tick function gets called on every clock tick.
 */
void swap_tick(void)
{
        struct task_struct *p = kswapd_task;

        /*
         * Only bother to try to wake kswapd up
         * if the task exists and can be woken.
         */
        if (p && (p->state & TASK_INTERRUPTIBLE)) {
                unsigned int pages;
                int want_wakeup;

                /*
                 * Schedule for wakeup if there isn't lots
                 * of free memory or if there is too much
                 * of it used for buffers or pgcache.
                 *
                 * "want_wakeup" is our priority: 0 means
                 * not to wake anything up, while 3 means
                 * that we'd better give kswapd a realtime
                 * priority.
                 */
                want_wakeup = 0;
                pages = nr_free_pages;
                if (pages < freepages.high)
                        want_wakeup = 1;
                if (pages < freepages.low)
                        want_wakeup = 2;
                if (pages < freepages.min)
                        want_wakeup = 3;

                kswapd_wakeup(p, want_wakeup);
        }

        timer_active |= (1<<SWAP_TIMER);
}

/*
 * Initialise the swap timer
 */
void init_swap_timer(void)
{
        timer_table[SWAP_TIMER].expires = jiffies;
        timer_table[SWAP_TIMER].fn = swap_tick;
        timer_active |= (1<<SWAP_TIMER);
}