/*
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/pgtable.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
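/*
 * Caller-side sketch (illustrative): every level of the scan below
 * propagates this convention unchanged, e.g.
 *
 *	result = try_to_swap_out(vma, address, pte, gfp_mask);
 *	if (result)
 *		return result;
 *
 * so a 1 (or anything that slept) unwinds the whole scan, while 0
 * lets the walk continue with the next pte.
 */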
static int try_to_swap_out(struct vm_area_struct * vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page_addr;
	struct page * page;

	pte = *page_table;
	if (!pte_present(pte))
		goto out_failed;
	page_addr = pte_page(pte);
	if (MAP_NR(page_addr) >= max_mapnr)
		goto out_failed;

	page = mem_map + MAP_NR(page_addr);
	spin_lock(&vma->vm_mm->page_table_lock);
	if (pte_val(pte) != pte_val(*page_table))
		goto out_failed_unlock;
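	/*
	 * Note: "pte" was sampled before page_table_lock was taken, so
	 * the comparison above re-validates it under the lock; if some
	 * other CPU changed the mapping in the meantime we back off
	 * rather than act on a stale entry.
	 */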
	/*
	 * Don't be too eager to get aging right if
	 * memory is dangerously low.
	 */
	if (!low_on_memory && pte_young(pte)) {
		/*
		 * Transfer the "accessed" bit from the page
		 * tables to the global page map.
		 */
		set_pte(page_table, pte_mkold(pte));
		set_bit(PG_referenced, &page->flags);
		goto out_failed_unlock;
	}
	if (PageReserved(page)
	    || PageLocked(page)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page)))
		goto out_failed_unlock;
	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		entry = page->offset;
		swap_duplicate(entry);
		set_pte(page_table, __pte(entry));
drop_pte:
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		__free_page(page);
		goto out_failed_unlock;
	}
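	/*
	 * Reference bookkeeping for the swap-cache case above (a sketch,
	 * following this file's conventions): the pte now holds the swap
	 * entry, swap_duplicate() added the process' reference to that
	 * entry, rss-- accounts for the unmapped page, and __free_page()
	 * drops the reference the pte used to hold. No I/O is needed,
	 * which is why we return 0 and keep scanning.
	 */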
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "shrink_mmap()".
	 */
	if (!pte_dirty(pte)) {
		pte_clear(page_table);
		goto drop_pte;
	}
	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks etc.
	 */
	if (!(gfp_mask & __GFP_IO))
		goto out_failed_unlock;
	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we free'd something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in 'page', and just drop the
	 * pte. All the hard work would be done by
	 * "shrink_mmap()".
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && vma->vm_ops->swapout) {
		int error;
		pte_clear(page_table);
		spin_unlock(&vma->vm_mm->page_table_lock);
		flush_tlb_page(vma, address);
		vma->vm_mm->rss--;
		error = vma->vm_ops->swapout(vma, page);
		if (!error)
			goto out_free_success;
		__free_page(page);
		return error;
	}
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry)
		goto out_failed_unlock;	/* No swap space left */
	vma->vm_mm->rss--;
	set_pte(page_table, __pte(entry));
	spin_unlock(&vma->vm_mm->page_table_lock);

	flush_tlb_page(vma, address);
	swap_duplicate(entry);	/* One for the process, one for the swap cache */

	/* This will also lock the page */
	add_to_swap_cache(page, entry);

	/* OK, do a physical asynchronous write to swap. */
	rw_swap_page(WRITE, page, 0);

out_free_success:
	__free_page(page);
	return 1;
out_failed_unlock:
	spin_unlock(&vma->vm_mm->page_table_lock);
out_failed:
	return 0;
}

/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process recently had, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with
 * the swap block search, not a hint of how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
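/*
 * Illustrative arithmetic (not part of the original comment): swap_out()
 * below turns priority into a scan budget with
 *	counter = nr_threads / (priority+1);
 * so with nr_threads == 100, priority 6 allows about 14 task-selection
 * rounds while priority 0 allows 100 - a lower priority means we are
 * willing to burn more CPU looking for pages to evict.
 */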
static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		vma->vm_mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}

static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}

static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & VM_LOCKED)
		return 0;

	pgdir = pgd_offset(vma->vm_mm, address);

	end = vma->vm_end;
	while (address < end) {
		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}

static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct * vma;

	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(vma, address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}

	/* We didn't find anything for the process */
	mm->swap_address = 0;
	return 0;
}

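/*
 * Note (illustrative): mm->swap_address acts as a per-mm resume cursor.
 * Each call picks up scanning where the previous one left off (the
 * cursor is advanced in swap_out_pmd() before every attempt), and it
 * is reset to 0 above once the whole address space has been walked,
 * so the next call starts over from the first vma.
 */
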
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p;
	int counter;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign swap_cnt values from rss, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
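	/*
	 * Worked example (illustrative, not from the original): three
	 * swappable tasks with rss 10, 50 and 30. If every swap_cnt is
	 * still 0, pass 1 finds no candidate, so pass 2 re-assigns
	 * swap_cnt = rss and the task with swap_cnt == 50 is selected;
	 * it remains the preferred victim until its swap_cnt has been
	 * consumed down below the others' values.
	 */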
	counter = nr_threads / (priority+1);
	if (counter < 1)
		counter = 1;
	if (counter > nr_threads)
		counter = nr_threads;

	for (; counter >= 0; counter--) {
		int assign = 0;
		int max_cnt = 0;
		struct mm_struct *best = NULL;
		int pid = 0;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)
				continue;
			if (mm->rss <= 0)
				continue;
			/* Refresh swap_cnt? */
			if (assign)
				mm->swap_cnt = mm->rss;
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
				pid = p->pid;
			}
		}
		read_unlock(&tasklist_lock);
		if (!best) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			break;
		} else {
			int ret;

			atomic_inc(&best->mm_count);
			ret = swap_out_mm(best, gfp_mask);
			mmdrop(best);

			if (!ret)
				continue;

			if (ret < 0)
				kill_proc(pid, SIGBUS, 1);
			return 1;
		}
	}
	return 0;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
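/*
 * Clustering note (illustrative): SWAP_CLUSTER_MAX bounds how many
 * pages one invocation tries to free, and freeing them in one batch
 * tends to allocate consecutive swap entries, so the resulting writes
 * go out as one mostly-sequential burst instead of scattered seeks.
 */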
static int do_try_to_free_pages(unsigned int gfp_mask)
{
	int priority;
	int count = SWAP_CLUSTER_MAX;

	lock_kernel();

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	priority = 6;
	do {
		while (shrink_mmap(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		/* Try to get rid of some shared memory pages.. */
		if (gfp_mask & __GFP_IO) {
			while (shm_swap(priority, gfp_mask)) {
				if (!--count)
					goto done;
			}
		}

		/* Then, try to page stuff out.. */
		while (swap_out(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		shrink_dcache_memory(priority, gfp_mask);
	} while (--priority >= 0);
done:
	unlock_kernel();

	return priority >= 0;
}

/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message). It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
	int i;
	char *revision = "$Revision: 1.5 $", *s, *e;

	swap_setup();

	if ((s = strchr(revision, ':')) &&
	    (e = strchr(s, '$')))
		s++, i = e - s;
	else
		s = revision, i = -1;
	printk("Starting kswapd v%.*s\n", i, s);
}

static struct task_struct *kswapd_process;

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically executes once a second, trickling out pages
 * so that we have _some_ free memory available even if there
 * is no other activity that frees anything up. This is needed
 * for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot
 * page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	kswapd_process = tsk;
	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__get_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
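	/*
	 * Sketch of the convention referenced above (the exact check
	 * lives in the page allocator): a task with PF_MEMALLOC set is
	 * allowed to dip into the remaining free pages instead of being
	 * routed into try_to_free_pages(), which is what keeps kswapd
	 * from recursing into itself here.
	 */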
	while (1) {
		/*
		 * Wake up once a second to see if we need to make
		 * more memory available.
		 *
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		do {
			if (nr_free_pages >= freepages.high)
				break;

			if (!do_try_to_free_pages(GFP_KSWAPD))
				break;
			run_task_queue(&tq_disk);
		} while (!tsk->need_resched);
		tsk->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ);
	}
}

/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int retval = 1;

	wake_up_process(kswapd_process);
	if (gfp_mask & __GFP_WAIT)
		retval = do_try_to_free_pages(gfp_mask);
	return retval;
}