/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page.  They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out.  Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
        pte_t pte;
        swp_entry_t entry;
        struct page * page;
        int (*swapout)(struct page *, struct file *);

        pte = *page_table;
        if (!pte_present(pte))
                goto out_failed;
        page = pte_page(pte);
        if ((!VALID_PAGE(page)) || PageReserved(page))
                goto out_failed;
        if (mm->swap_cnt)
                mm->swap_cnt--;

        /* Don't look at this pte if it's been accessed recently. */
        if (pte_young(pte)) {
                /*
                 * Transfer the "accessed" bit from the page
                 * tables to the global page map.
                 */
                set_pte(page_table, pte_mkold(pte));
                SetPageReferenced(page);
                goto out_failed;
        }

        if (TryLockPage(page))
                goto out_failed;

        /*
         * Is the page already in the swap cache? If so, then
         * we can just drop our reference to it without doing
         * any IO - it's already up-to-date on disk.
         *
         * Return 0, as we didn't actually free any real
         * memory, and we should just continue our scan.
         */
        if (PageSwapCache(page)) {
                entry.val = page->index;
                swap_duplicate(entry);
                set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
                UnlockPage(page);
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                page_cache_release(page);
                goto out_failed;
        }

        /*
         * Is it a clean page? Then it must be recoverable
         * by just paging it in again, and we can just drop
         * it..
         *
         * However, this won't actually free any real
         * memory, as the page will just be in the page cache
         * somewhere, and as such we should just continue
         * our scan.
         *
         * Basically, this just makes it possible for us to do
         * some real work in the future in "shrink_mmap()".
         */
        if (!pte_dirty(pte)) {
                flush_cache_page(vma, address);
                pte_clear(page_table);
                goto drop_pte;
        }

        /*
         * Don't go down into the swap-out stuff if
         * we cannot do I/O! Avoid recursing on FS
         * locks etc.
         */
        if (!(gfp_mask & __GFP_IO))
                goto out_unlock;

        /*
         * Don't do any of the expensive stuff if
         * we're not really interested in this zone.
         */
        if (page->zone->free_pages > page->zone->pages_high)
                goto out_unlock;

        /*
         * Ok, it's really dirty. That means that
         * we should either create a new swap cache
         * entry for it, or we should write it back
         * to its own backing store.
         *
         * Note that in neither case do we actually
         * know that we make a page available, but
         * as we potentially sleep we can no longer
         * continue scanning, so we might as well
         * assume we free'd something.
         *
         * NOTE NOTE NOTE! This should just set a
         * dirty bit in 'page', and just drop the
         * pte. All the hard work would be done by
         * shrink_mmap().
         *
         * That would get rid of a lot of problems.
         */
        flush_cache_page(vma, address);
        if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
                int error;
                struct file *file = vma->vm_file;
                if (file) get_file(file);
                pte_clear(page_table);
                vma->vm_mm->rss--;
                flush_tlb_page(vma, address);
                vmlist_access_unlock(vma->vm_mm);
                error = swapout(page, file);
                UnlockPage(page);
                if (file) fput(file);
                if (!error)
                        goto out_free_success;
                page_cache_release(page);
                return error;
        }

        /*
         * This is a dirty, swappable page. First of all,
         * get a suitable swap entry for it, and make sure
         * we have the swap cache set up to associate the
         * page with that swap entry.
         */
        entry = get_swap_page();
        if (!entry.val)
                goto out_unlock; /* No swap space left */

        if (!(page = prepare_highmem_swapout(page)))
                goto out_swap_free;

        swap_duplicate(entry);  /* One for the process, one for the swap cache */

        /* Add it to the swap cache */
        add_to_swap_cache(page, entry);

        /* Put the swap entry into the pte after the page is in swapcache */
        vma->vm_mm->rss--;
        set_pte(page_table, swp_entry_to_pte(entry));
        flush_tlb_page(vma, address);
        vmlist_access_unlock(vma->vm_mm);

        /* OK, do a physical asynchronous write to swap. */
        rw_swap_page(WRITE, page, 0);

out_free_success:
        page_cache_release(page);
        return 1;
out_swap_free:
        swap_free(entry);
out_failed:
        return 0;
out_unlock:
        UnlockPage(page);
        return 0;
}

/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults this process has had recently, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 *       swap block search, not a hint of how many blocks to swap with
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
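
/*
 * swap_out_pmd/swap_out_pgd/swap_out_vma walk one VMA's page tables
 * top-down (pgd -> pmd -> pte) and hand each present pte to
 * try_to_swap_out().  The walk stops as soon as a page has been dealt
 * with or the per-mm swap_cnt quota runs out.
 */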
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                pmd_ERROR(*dir);
                pmd_clear(dir);
                return 0;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                int result;
                vma->vm_mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        return 0;
}

static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                pgd_ERROR(*dir);
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas which are locked down */
        if (vma->vm_flags & VM_LOCKED)
                return 0;

        pgdir = pgd_offset(vma->vm_mm, address);

        end = vma->vm_end;
        if (address >= end)
                BUG();
        do {
                int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
                if (result)
                        return result;
                if (!mm->swap_cnt)
                        return 0;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}
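
/*
 * Scan one process' address space, resuming where the previous scan of
 * this mm left off (mm->swap_address), and return as soon as one page
 * has been dealt with.  If the whole space is scanned without progress,
 * swap_cnt and swap_address are reset so swap_out() will skip this mm
 * until its swap_cnt is refreshed again.
 */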
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = mm->swap_address;

        /*
         * Find the proper vm-area after freezing the vma chain
         * and ptes.
         */
        vmlist_access_lock(mm);
        vma = find_vma(mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        int result = swap_out_vma(mm, vma, address, gfp_mask);
                        if (result)
                                return result;
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        address = vma->vm_start;
                }
        }
        vmlist_access_unlock(mm);

        /* We didn't find anything for the process */
        mm->swap_cnt = 0;
        mm->swap_address = 0;
        return 0;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
        struct task_struct * p;
        int counter;
        int __ret = 0;

        lock_kernel();
        /*
         * We make one or two passes through the task list, indexed by
         * assign = {0, 1}:
         *   Pass 1: select the swappable task with maximal RSS that has
         *           not yet been swapped out.
         *   Pass 2: re-assign rss swap_cnt values, then select as above.
         *
         * With this approach, there's no need to remember the last task
         * swapped out.  If the swap-out fails, we clear swap_cnt so the
         * task won't be selected again until all others have been tried.
         *
         * Think of swap_cnt as a "shadow rss" - it tells us which process
         * we want to page out (always try largest first).
         */
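        /*
         * The number of selection rounds scales with the number of
         * threads and grows as the priority value drops towards zero,
         * i.e. as memory pressure becomes more desperate.
         */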
        counter = (nr_threads << 2) >> (priority >> 2);
        if (counter < 1)
                counter = 1;

        for (; counter >= 0; counter--) {
                unsigned long max_cnt = 0;
                struct mm_struct *best = NULL;
                int pid = 0;
                int assign = 0;
        select:
                read_lock(&tasklist_lock);
                p = init_task.next_task;
                for (; p != &init_task; p = p->next_task) {
                        struct mm_struct *mm = p->mm;
                        if (!p->swappable || !mm)
                                continue;
                        if (mm->rss <= 0)
                                continue;
                        /* Refresh swap_cnt? */
                        if (assign == 1)
                                mm->swap_cnt = mm->rss;
                        if (mm->swap_cnt > max_cnt) {
                                max_cnt = mm->swap_cnt;
                                best = mm;
                                pid = p->pid;
                        }
                }
                read_unlock(&tasklist_lock);
                if (!best) {
                        if (!assign) {
                                assign = 1;
                                goto select;
                        }
                        goto out;
                } else {
                        int ret;

                        atomic_inc(&best->mm_count);
                        ret = swap_out_mm(best, gfp_mask);
                        mmdrop(best);

                        if (!ret)
                                continue;

                        if (ret < 0)
                                kill_proc(pid, SIGBUS, 1);
                        __ret = 1;
                        goto out;
                }
        }
out:
        unlock_kernel();
        return __ret;
}

/*
 * Check if there is any memory pressure (free_pages < pages_low)
 */
static inline int memory_pressure(void)
{
        pg_data_t *pgdat = pgdat_list;

        do {
                int i;
                for(i = 0; i < MAX_NR_ZONES; i++) {
                        zone_t *zone = pgdat->node_zones + i;
                        if (zone->size &&
                                        zone->free_pages < zone->pages_low)
                                return 1;
                }
                pgdat = pgdat->node_next;
        } while (pgdat);

        return 0;
}

/*
 * Check if all zones have recently had memory_pressure (zone_wake_kswapd)
 */
static inline int keep_kswapd_awake(void)
{
        int all_recent = 1;
        pg_data_t *pgdat = pgdat_list;

        do {
                int i;
                for(i = 0; i < MAX_NR_ZONES; i++) {
                        zone_t *zone = pgdat->node_zones + i;
                        if (zone->size) {
                                if (zone->free_pages < zone->pages_min)
                                        return 1;
                                if (!zone->zone_wake_kswapd)
                                        all_recent = 0;
                        }
                }
                pgdat = pgdat->node_next;
        } while (pgdat);

        return all_recent;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * Don't try _too_ hard, though. We don't want to have bad
 * latency.
 *
 * Note: only called by kswapd and try_to_free_pages,
 *       both can WAIT at top level.
 */
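/*
 * FREE_COUNT is how many pages each call tries to free outright;
 * SWAP_COUNT caps how many swap_out() attempts are made per priority
 * pass, since those only move pages into the swap cache.
 */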
#define FREE_COUNT      8
#define SWAP_COUNT      16
static int do_try_to_free_pages(unsigned int gfp_mask)
{
        int priority;
        int count = FREE_COUNT;
        int swap_count;

        /* Always trim SLAB caches when memory gets low. */
        kmem_cache_reap(gfp_mask);
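
        /*
         * Scan with decreasing priority: 64 is the gentlest pass and 0
         * the most aggressive.  Each pass shrinks the page cache first,
         * then the dentry/inode caches and SysV shm, then tries to
         * unmap process pages via swap_out().
         */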
        priority = 64;
        do {
                if (current->need_resched) {
                        schedule();
                        /* time has passed - pressure too? */
                        if (!memory_pressure())
                                goto done;
                }

                while (shrink_mmap(priority, gfp_mask)) {
                        if (!--count)
                                goto done;
                }

                /* check if mission completed */
                if (!keep_kswapd_awake())
                        goto done;

                /* Try to get rid of some shared memory pages.. */
                if (gfp_mask & __GFP_IO) {
                        /*
                         * don't be too light against the d/i cache since
                         * shrink_mmap() almost never fails when there's
                         * really plenty of memory free.
                         */
                        count -= shrink_dcache_memory(priority, gfp_mask);
                        count -= shrink_icache_memory(priority, gfp_mask);
                        /*
                         * Not currently working, see fixme in shrink_?cache_memory
                         * In the inner functions there is a comment:
                         * "To help debugging, a zero exit status indicates
                         *  all slabs were released." (-arca?)
                         * let's handle it in a primitive but working way...
                         *      if (count <= 0)
                         *              goto done;
                         */
                        if (!keep_kswapd_awake())
                                goto done;

                        while (shm_swap(priority, gfp_mask)) {
                                if (!--count)
                                        goto done;
                        }
                }

                /*
                 * Then, try to page stuff out..
                 *
                 * This will not actually free any pages (they get
                 * put in the swap cache), so we must not count this
                 * as a "count" success.
                 */
                swap_count = SWAP_COUNT;
                while (swap_out(priority, gfp_mask))
                        if (--swap_count < 0)
                                break;

        } while (--priority >= 0);

        /* Always end on a shrink_mmap.., may sleep... */
        while (shrink_mmap(0, gfp_mask)) {
                if (!--count)
                        goto done;
        }
        /* Return 1 if any page was freed, or
         * if there is no more memory pressure */
        return (count < FREE_COUNT || !keep_kswapd_awake());

done:
        return 1;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
        struct task_struct *tsk = current;

        tsk->session = 1;
        tsk->pgrp = 1;
        strcpy(tsk->comm, "kswapd");
        sigfillset(&tsk->blocked);

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC;
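
        /*
         * Sleep until some zone reports memory pressure, then run the
         * page freeing loop with the kswapd gfp mask.
         */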
        for (;;) {
                if (!keep_kswapd_awake()) {
                        interruptible_sleep_on(&kswapd_wait);
                }

                do_try_to_free_pages(GFP_KSWAPD);
        }
}

/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
        int retval = 1;

        if (gfp_mask & __GFP_WAIT) {
                current->state = TASK_RUNNING;
                current->flags |= PF_MEMALLOC;
                retval = do_try_to_free_pages(gfp_mask);
                current->flags &= ~PF_MEMALLOC;
        }

        /* Someone needed memory that kswapd had not provided;
         * make sure kswapd runs.  Should not happen often. */
        if (waitqueue_active(&kswapd_wait))
                wake_up_interruptible(&kswapd_wait);

        return retval;
}

static int __init kswapd_init(void)
{
        printk("Starting kswapd v1.7\n");
        swap_setup();
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
        return 0;
}

module_init(kswapd_init)