Linux 2.2.0
[davej-history.git] / mm / vmscan.c
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/pgtable.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates they decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
	unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page;
	struct page * page_map;

	pte = *page_table;
	if (!pte_present(pte))
		return 0;
	page = pte_page(pte);
	if (MAP_NR(page) >= max_mapnr)
		return 0;

	page_map = mem_map + MAP_NR(page);
	if (PageReserved(page_map)
	    || PageLocked(page_map)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
		return 0;
	if (pte_young(pte)) {
		/*
		 * Transfer the "accessed" bit from the page
		 * tables to the global page map.
		 */
		set_pte(page_table, pte_mkold(pte));
		set_bit(PG_referenced, &page_map->flags);
		return 0;
	}

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page_map)) {
		entry = page_map->offset;
		swap_duplicate(entry);
		set_pte(page_table, __pte(entry));
drop_pte:
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		__free_page(page_map);
		return 0;
	}
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "shrink_mmap()".
	 */
	if (!pte_dirty(pte)) {
		pte_clear(page_table);
		goto drop_pte;
	}

	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks etc.
	 */
	if (!(gfp_mask & __GFP_IO))
		return 0;
	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we free'd something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in page_map, and just drop the
	 * pte. All the hard work would be done by
	 * shrink_mmap().
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && vma->vm_ops->swapout) {
		pid_t pid = tsk->pid;
		pte_clear(page_table);
		flush_tlb_page(vma, address);
		vma->vm_mm->rss--;

		if (vma->vm_ops->swapout(vma, page_map))
			kill_proc(pid, SIGBUS, 1);
		__free_page(page_map);
		return 1;
	}
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry)
		return 0; /* No swap space left */

	vma->vm_mm->rss--;
	tsk->nswap++;
	set_pte(page_table, __pte(entry));
	flush_tlb_page(vma, address);
	swap_duplicate(entry);	/* One for the process, one for the swap cache */
	add_to_swap_cache(page_map, entry);
	/* We checked we were unlocked way up above, and we
	   have been careful not to stall until here */
	set_bit(PG_locked, &page_map->flags);

	/* OK, do a physical asynchronous write to swap. */
	rw_swap_page(WRITE, entry, (char *) page, 0);

	__free_page(page_map);
	return 1;
}
/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process recently had, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste on the
 *       swap block search, not a hint of how many blocks to swap from
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
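/*
 * Walk the ptes mapped by a single pmd entry, remembering where to
 * resume in tsk->swap_address, and stop at the first page we make
 * progress on.
 */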
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		tsk->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
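/* Walk the pmds covered by one page-directory entry, clipped to this vma. */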
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
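/* Try to swap something out of a single vma, one page-directory entry at a time. */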
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
	unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas like shared memory which have their
	   own separate swapping mechanism or areas which are locked down */
	if (vma->vm_flags & (VM_SHM | VM_LOCKED))
		return 0;

	pgdir = pgd_offset(tsk->mm, address);

	end = vma->vm_end;
	while (address < end) {
		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}
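/*
 * Resume the scan of this process at p->swap_address and walk its vmas.
 * If nothing can be freed, reset swap_cnt and swap_address so that the
 * next call picks another task.
 */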
static int swap_out_process(struct task_struct * p, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = p->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(p->mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(p, vma, address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}

	/* We didn't find anything for the process */
	p->swap_cnt = 0;
	p->swap_address = 0;
	return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p, * pbest;
	int counter, assign, max_cnt;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign swap_cnt from rss, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out.  If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
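	/*
	 * The priority handed down from do_try_to_free_pages() runs from
	 * 6 (gentle) to 0 (desperate): the lower it gets, the more tasks
	 * we are willing to examine in one call.
	 */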
	counter = nr_tasks / (priority+1);
	if (counter < 1)
		counter = 1;
	if (counter > nr_tasks)
		counter = nr_tasks;

	for (; counter >= 0; counter--) {
		assign = 0;
		max_cnt = 0;
		pbest = NULL;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			if (!p->swappable)
				continue;
			if (p->mm->rss <= 0)
				continue;
			/* Refresh swap_cnt? */
			if (assign)
				p->swap_cnt = p->mm->rss;
			if (p->swap_cnt > max_cnt) {
				max_cnt = p->swap_cnt;
				pbest = p;
			}
		}
		read_unlock(&tasklist_lock);
		if (!pbest) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			goto out;
		}

		if (swap_out_process(pbest, gfp_mask))
			return 1;
	}
out:
	return 0;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
	int priority;
	int count = SWAP_CLUSTER_MAX;

	lock_kernel();

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);
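	/*
	 * Work through the reclaim stages at decreasing priority: first the
	 * page cache (shrink_mmap), then SysV shared memory (shm_swap),
	 * then process pages (swap_out), and finally the dentry cache.
	 */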
	priority = 6;
	do {
		while (shrink_mmap(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		/* Try to get rid of some shared memory pages.. */
		if (gfp_mask & __GFP_IO) {
			while (shm_swap(priority, gfp_mask)) {
				if (!--count)
					goto done;
			}
		}

		/* Then, try to page stuff out.. */
		while (swap_out(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		shrink_dcache_memory(priority, gfp_mask);
	} while (--priority >= 0);
done:
	unlock_kernel();

	return priority >= 0;
}
/*
 * Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init message
 * may be printed in the middle of another driver's init
 * message).  It looks very bad when that happens.
 */
void __init kswapd_setup(void)
{
	int i;
	char *revision="$Revision: 1.5 $", *s, *e;

	swap_setup();
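	/* Pull the version number out of the RCS $Revision$ keyword for the boot message. */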
	if ((s = strchr(revision, ':')) &&
	    (e = strchr(s, '$')))
		s++, i = e - s;
	else
		s = revision, i = -1;
	printk ("Starting kswapd v%.*s\n", i, s);
}
static struct task_struct *kswapd_process;

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically executes once a second, trickling out pages
 * so that we have _some_ free memory available even if there
 * is no other activity that frees anything up. This is needed
 * for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot
 * page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	kswapd_process = tsk;
	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__get_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
	while (1) {
		/*
		 * Wake up once a second to see if we need to make
		 * more memory available.
		 *
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		do {
			if (nr_free_pages >= freepages.high)
				break;

			if (!do_try_to_free_pages(GFP_KSWAPD))
				break;
		} while (!tsk->need_resched);
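		/* Start any queued disk I/O, then doze for up to a second. */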
		run_task_queue(&tq_disk);
		tsk->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ);
	}
}
/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int retval = 1;

	wake_up_process(kswapd_process);
	if (gfp_mask & __GFP_WAIT)
		retval = do_try_to_free_pages(gfp_mask);
	return retval;
}