mm/vmscan.c

   1 /*
   2  *  linux/mm/vmscan.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  *
   6  *  Swap reorganised 29.12.95, Stephen Tweedie.
   7  *  kswapd added: 7.1.96  sct
   8  *  Removed kswapd_ctl limits, and swap out as many pages as needed
   9  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  10  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  11  */
  12
  13 #include <linux/slab.h>
  14 #include <linux/kernel_stat.h>
  15 #include <linux/swap.h>
  16 #include <linux/swapctl.h>
  17 #include <linux/smp_lock.h>
  18 #include <linux/pagemap.h>
  19 #include <linux/init.h>
  20
  21 #include <asm/pgtable.h>
  22
  23 /*
  24  * The swap-out functions return 1 if they successfully
  25  * threw something out, and we got a free page. It returns
  26  * zero if it couldn't do anything, and any other value
  27  * indicates it decreased rss, but the page was shared.
  28  *
  29  * NOTE! If it sleeps, it *must* return 1 to make sure we
  30  * don't continue with the swap-out. Otherwise we may be
  31  * using a process that no longer actually exists (it might
  32  * have died while we slept).
  33  */
  34 static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
  35         unsigned long address, pte_t * page_table, int gfp_mask)
  36 {
  37         pte_t pte;
  38         unsigned long entry;
  39         unsigned long page_addr;
  40         struct page * page;
  41
  42         pte = *page_table;
  43         if (!pte_present(pte))
  44                 goto out_failed;
  45         page_addr = pte_page(pte);
  46         if (MAP_NR(page_addr) >= max_mapnr)
  47                 goto out_failed;
  48
  49         page = mem_map + MAP_NR(page_addr);
  50         write_lock(&tsk->mm->page_table_lock);
  51         if (pte_val(pte) != pte_val(*page_table))
  52                 goto out_failed_unlock;
  53
  54         /*
  55          * Dont be too eager to get aging right if
  56          * memory is dangerously low.
  57          */
  58         if (!low_on_memory && pte_young(pte)) {
  59                 /*
  60                  * Transfer the "accessed" bit from the page
  61                  * tables to the global page map.
  62                  */
  63                 set_pte(page_table, pte_mkold(pte));
  64                 set_bit(PG_referenced, &page->flags);
  65                 goto out_failed_unlock;
  66         }
  67
  68         if (PageReserved(page)
  69             || PageLocked(page)
  70             || ((gfp_mask & __GFP_DMA) && !PageDMA(page)))
  71                 goto out_failed_unlock;
  72
  73         /*
  74          * Is the page already in the swap cache? If so, then
  75          * we can just drop our reference to it without doing
  76          * any IO - it's already up-to-date on disk.
  77          *
  78          * Return 0, as we didn't actually free any real
  79          * memory, and we should just continue our scan.
  80          */
  81         if (PageSwapCache(page)) {
  82                 entry = page->offset;
  83                 swap_duplicate(entry);
  84                 set_pte(page_table, __pte(entry));
  85 drop_pte:
  86                 vma->vm_mm->rss--;
  87                 flush_tlb_page(vma, address);
  88                 __free_page(page);
  89                 goto out_failed_unlock;
  90         }
  91
  92         /*
  93          * Is it a clean page? Then it must be recoverable
  94          * by just paging it in again, and we can just drop
  95          * it..
  96          *
  97          * However, this won't actually free any real
  98          * memory, as the page will just be in the page cache
  99          * somewhere, and as such we should just continue
 100          * our scan.
 101          *
 102          * Basically, this just makes it possible for us to do
 103          * some real work in the future in "shrink_mmap()".
 104          */
 105         if (!pte_dirty(pte)) {
 106                 pte_clear(page_table);
 107                 goto drop_pte;
 108         }
 109
 110         /*
 111          * Don't go down into the swap-out stuff if
 112          * we cannot do I/O! Avoid recursing on FS
 113          * locks etc.
 114          */
 115         if (!(gfp_mask & __GFP_IO))
 116                 goto out_failed_unlock;
 117
 118         /*
 119          * Ok, it's really dirty. That means that
 120          * we should either create a new swap cache
 121          * entry for it, or we should write it back
 122          * to its own backing store.
 123          *
 124          * Note that in neither case do we actually
 125          * know that we make a page available, but
 126          * as we potentially sleep we can no longer
 127          * continue scanning, so we migth as well
 128          * assume we free'd something.
 129          *
 130          * NOTE NOTE NOTE! This should just set a
 131          * dirty bit in 'page', and just drop the
 132          * pte. All the hard work would be done by
 133          * shrink_mmap().
 134          *
 135          * That would get rid of a lot of problems.
 136          */
 137         flush_cache_page(vma, address);
 138         if (vma->vm_ops && vma->vm_ops->swapout) {
 139                 pid_t pid = tsk->pid;
 140                 pte_clear(page_table);
 141                 write_unlock(&tsk->mm->page_table_lock);
 142                 flush_tlb_page(vma, address);
 143                 vma->vm_mm->rss--;
 144
 145                 if (vma->vm_ops->swapout(vma, page))
 146                         kill_proc(pid, SIGBUS, 1);
 147                 goto out_free_success;
 148         }
 149
 150         /*
 151          * This is a dirty, swappable page.  First of all,
 152          * get a suitable swap entry for it, and make sure
 153          * we have the swap cache set up to associate the
 154          * page with that swap entry.
 155          */
 156         entry = get_swap_page();
 157         if (!entry)
 158                 goto out_failed; /* No swap space left */
 159
 160         vma->vm_mm->rss--;
 161         tsk->mm->nswap++;
 162         set_pte(page_table, __pte(entry));
 163         write_unlock(&tsk->mm->page_table_lock);
 164
 165         flush_tlb_page(vma, address);
 166         swap_duplicate(entry);  /* One for the process, one for the swap cache */
 167
 168         /* This will also lock the page */
 169         add_to_swap_cache(page, entry);
 170
 171         /* OK, do a physical asynchronous write to swap.  */
 172         rw_swap_page(WRITE, page, 0);
 173
 174 out_free_success:
 175         __free_page(page);
 176         return 1;
 177 out_failed_unlock:
 178         write_unlock(&tsk->mm->page_table_lock);
 179 out_failed:
 180         return 0;
 181 }
 182
 183 /*
 184  * A new implementation of swap_out().  We do not swap complete processes,
 185  * but only a small number of blocks, before we continue with the next
 186  * process.  The number of blocks actually swapped is determined on the
 187  * number of page faults, that this process actually had in the last time,
 188  * so we won't swap heavily used processes all the time ...
 189  *
 190  * Note: the priority argument is a hint on much CPU to waste with the
 191  *       swap block search, not a hint, of how much blocks to swap with
 192  *       each process.
 193  *
 194  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 195  */
 196
 197 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
 198         pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 199 {
 200         pte_t * pte;
 201         unsigned long pmd_end;
 202
 203         if (pmd_none(*dir))
 204                 return 0;
 205         if (pmd_bad(*dir)) {
 206                 printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
 207                 pmd_clear(dir);
 208                 return 0;
 209         }
 210
 211         pte = pte_offset(dir, address);
 212
 213         pmd_end = (address + PMD_SIZE) & PMD_MASK;
 214         if (end > pmd_end)
 215                 end = pmd_end;
 216
 217         do {
 218                 int result;
 219                 tsk->mm->swap_address = address + PAGE_SIZE;
 220                 result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 221                 if (result)
 222                         return result;
 223                 address += PAGE_SIZE;
 224                 pte++;
 225         } while (address < end);
 226         return 0;
 227 }
 228
 229 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
 230         pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 231 {
 232         pmd_t * pmd;
 233         unsigned long pgd_end;
 234
 235         if (pgd_none(*dir))
 236                 return 0;
 237         if (pgd_bad(*dir)) {
 238                 printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
 239                 pgd_clear(dir);
 240                 return 0;
 241         }
 242
 243         pmd = pmd_offset(dir, address);
 244
 245         pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
 246         if (end > pgd_end)
 247                 end = pgd_end;
 248
 249         do {
 250                 int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
 251                 if (result)
 252                         return result;
 253                 address = (address + PMD_SIZE) & PMD_MASK;
 254                 pmd++;
 255         } while (address < end);
 256         return 0;
 257 }
 258
 259 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 260         unsigned long address, int gfp_mask)
 261 {
 262         pgd_t *pgdir;
 263         unsigned long end;
 264
 265         /* Don't swap out areas which are locked down */
 266         if (vma->vm_flags & VM_LOCKED)
 267                 return 0;
 268
 269         pgdir = pgd_offset(tsk->mm, address);
 270
 271         end = vma->vm_end;
 272         while (address < end) {
 273                 int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
 274                 if (result)
 275                         return result;
 276                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 277                 pgdir++;
 278         }
 279         return 0;
 280 }
 281
 282 static int swap_out_process(struct task_struct * p, int gfp_mask)
 283 {
 284         unsigned long address;
 285         struct vm_area_struct* vma;
 286
 287         /*
 288          * Go through process' page directory.
 289          */
 290         address = p->mm->swap_address;
 291
 292         /*
 293          * Find the proper vm-area
 294          */
 295         vma = find_vma(p->mm, address);
 296         if (vma) {
 297                 if (address < vma->vm_start)
 298                         address = vma->vm_start;
 299
 300                 for (;;) {
 301                         int result = swap_out_vma(p, vma, address, gfp_mask);
 302                         if (result)
 303                                 return result;
 304                         vma = vma->vm_next;
 305                         if (!vma)
 306                                 break;
 307                         address = vma->vm_start;
 308                 }
 309         }
 310
 311         /* We didn't find anything for the process */
 312         p->mm->swap_cnt = 0;
 313         p->mm->swap_address = 0;
 314         return 0;
 315 }
 316
 317 /*
 318  * Select the task with maximal swap_cnt and try to swap out a page.
 319  * N.B. This function returns only 0 or 1.  Return values != 1 from
 320  * the lower level routines result in continued processing.
 321  */
 322 static int swap_out(unsigned int priority, int gfp_mask)
 323 {
 324         struct task_struct * p, * pbest;
 325         int counter, assign, max_cnt;
 326
 327         /*
 328          * We make one or two passes through the task list, indexed by
 329          * assign = {0, 1}:
 330          *   Pass 1: select the swappable task with maximal RSS that has
 331          *         not yet been swapped out.
 332          *   Pass 2: re-assign rss swap_cnt values, then select as above.
 333          *
 334          * With this approach, there's no need to remember the last task
 335          * swapped out.  If the swap-out fails, we clear swap_cnt so the
 336          * task won't be selected again until all others have been tried.
 337          *
 338          * Think of swap_cnt as a "shadow rss" - it tells us which process
 339          * we want to page out (always try largest first).
 340          */
 341         counter = nr_tasks / (priority+1);
 342         if (counter < 1)
 343                 counter = 1;
 344         if (counter > nr_tasks)
 345                 counter = nr_tasks;
 346
 347         for (; counter >= 0; counter--) {
 348                 assign = 0;
 349                 max_cnt = 0;
 350                 pbest = NULL;
 351         select:
 352                 read_lock(&tasklist_lock);
 353                 p = init_task.next_task;
 354                 for (; p != &init_task; p = p->next_task) {
 355                         if (!p->mm->swappable)
 356                                 continue;
 357                         if (p->mm->rss <= 0)
 358                                 continue;
 359                         /* Refresh swap_cnt? */
 360                         if (assign)
 361                                 p->mm->swap_cnt = p->mm->rss;
 362                         if (p->mm->swap_cnt > max_cnt) {
 363                                 max_cnt = p->mm->swap_cnt;
 364                                 pbest = p;
 365                         }
 366                 }
 367                 read_unlock(&tasklist_lock);
 368                 if (!pbest) {
 369                         if (!assign) {
 370                                 assign = 1;
 371                                 goto select;
 372                         }
 373                         goto out;
 374                 }
 375
 376                 if (swap_out_process(pbest, gfp_mask))
 377                         return 1;
 378         }
 379 out:
 380         return 0;
 381 }
 382
 383 /*
 384  * We need to make the locks finer granularity, but right
 385  * now we need this so that we can do page allocations
 386  * without holding the kernel lock etc.
 387  *
 388  * We want to try to free "count" pages, and we need to
 389  * cluster them so that we get good swap-out behaviour. See
 390  * the "free_memory()" macro for details.
 391  */
 392 static int do_try_to_free_pages(unsigned int gfp_mask)
 393 {
 394         int priority;
 395         int count = SWAP_CLUSTER_MAX;
 396
 397         lock_kernel();
 398
 399         /* Always trim SLAB caches when memory gets low. */
 400         kmem_cache_reap(gfp_mask);
 401
 402         priority = 6;
 403         do {
 404                 while (shrink_mmap(priority, gfp_mask)) {
 405                         if (!--count)
 406                                 goto done;
 407                 }
 408
 409                 /* Try to get rid of some shared memory pages.. */
 410                 if (gfp_mask & __GFP_IO) {
 411                         while (shm_swap(priority, gfp_mask)) {
 412                                 if (!--count)
 413                                         goto done;
 414                         }
 415                 }
 416
 417                 /* Then, try to page stuff out.. */
 418                 while (swap_out(priority, gfp_mask)) {
 419                         if (!--count)
 420                                 goto done;
 421                 }
 422
 423                 shrink_dcache_memory(priority, gfp_mask);
 424         } while (--priority >= 0);
 425 done:
 426         unlock_kernel();
 427
 428         return priority >= 0;
 429 }
 430
 431 /*
 432  * Before we start the kernel thread, print out the
 433  * kswapd initialization message (otherwise the init message
 434  * may be printed in the middle of another driver's init
 435  * message).  It looks very bad when that happens.
 436  */
 437 void __init kswapd_setup(void)
 438 {
 439        int i;
 440        char *revision="$Revision: 1.5 $", *s, *e;
 441
 442        swap_setup();
 443
 444        if ((s = strchr(revision, ':')) &&
 445            (e = strchr(s, '$')))
 446                s++, i = e - s;
 447        else
 448                s = revision, i = -1;
 449        printk ("Starting kswapd v%.*s\n", i, s);
 450 }
 451
 452 static struct task_struct *kswapd_process;
 453
 454 /*
 455  * The background pageout daemon, started as a kernel thread
 456  * from the init process.
 457  *
 458  * This basically executes once a second, trickling out pages
 459  * so that we have _some_ free memory available even if there
 460  * is no other activity that frees anything up. This is needed
 461  * for things like routing etc, where we otherwise might have
 462  * all activity going on in asynchronous contexts that cannot
 463  * page things out.
 464  *
 465  * If there are applications that are active memory-allocators
 466  * (most normal use), this basically shouldn't matter.
 467  */
 468 int kswapd(void *unused)
 469 {
 470         struct task_struct *tsk = current;
 471
 472         kswapd_process = tsk;
 473         tsk->session = 1;
 474         tsk->pgrp = 1;
 475         strcpy(tsk->comm, "kswapd");
 476         sigfillset(&tsk->blocked);
 477
 478         /*
 479          * Tell the memory management that we're a "memory allocator",
 480          * and that if we need more memory we should get access to it
 481          * regardless (see "__get_free_pages()"). "kswapd" should
 482          * never get caught in the normal page freeing logic.
 483          *
 484          * (Kswapd normally doesn't need memory anyway, but sometimes
 485          * you need a small amount of memory in order to be able to
 486          * page out something else, and this flag essentially protects
 487          * us from recursively trying to free more memory as we're
 488          * trying to free the first piece of memory in the first place).
 489          */
 490         tsk->flags |= PF_MEMALLOC;
 491
 492         while (1) {
 493                 /*
 494                  * Wake up once a second to see if we need to make
 495                  * more memory available.
 496                  *
 497                  * If we actually get into a low-memory situation,
 498                  * the processes needing more memory will wake us
 499                  * up on a more timely basis.
 500                  */
 501                 do {
 502                         if (nr_free_pages >= freepages.high)
 503                                 break;
 504
 505                         if (!do_try_to_free_pages(GFP_KSWAPD))
 506                                 break;
 507                         run_task_queue(&tq_disk);
 508                 } while (!tsk->need_resched);
 509                 tsk->state = TASK_INTERRUPTIBLE;
 510                 schedule_timeout(HZ);
 511         }
 512 }
 513
 514 /*
 515  * Called by non-kswapd processes when they want more
 516  * memory.
 517  *
 518  * In a perfect world, this should just wake up kswapd
 519  * and return. We don't actually want to swap stuff out
 520  * from user processes, because the locking issues are
 521  * nasty to the extreme (file write locks, and MM locking)
 522  *
 523  * One option might be to let kswapd do all the page-out
 524  * and VM page table scanning that needs locking, and this
 525  * process thread could do just the mmap shrink stage that
 526  * can be done by just dropping cached pages without having
 527  * any deadlock issues.
 528  */
 529 int try_to_free_pages(unsigned int gfp_mask)
 530 {
 531         int retval = 1;
 532
 533         wake_up_process(kswapd_process);
 534         if (gfp_mask & __GFP_WAIT)
 535                 retval = do_try_to_free_pages(gfp_mask);
 536         return retval;
 537 }
 538