mm/vmscan.c

   1 /*
   2  *  linux/mm/vmscan.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  *
   6  *  Swap reorganised 29.12.95, Stephen Tweedie.
   7  *  kswapd added: 7.1.96  sct
   8  *  Removed kswapd_ctl limits, and swap out as many pages as needed
   9  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  10  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  11  *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  12  */
  13
  14 #include <linux/slab.h>
  15 #include <linux/kernel_stat.h>
  16 #include <linux/swap.h>
  17 #include <linux/swapctl.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/init.h>
  21 #include <linux/highmem.h>
  22 #include <linux/file.h>
  23
  24 #include <asm/pgalloc.h>
  25
  26 /*
  27  * The swap-out functions return 1 if they successfully
  28  * threw something out, and we got a free page. It returns
  29  * zero if it couldn't do anything, and any other value
  30  * indicates it decreased rss, but the page was shared.
  31  *
  32  * NOTE! If it sleeps, it *must* return 1 to make sure we
  33  * don't continue with the swap-out. Otherwise we may be
  34  * using a process that no longer actually exists (it might
  35  * have died while we slept).
  36  */
  37 static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
  38 {
  39         pte_t pte;
  40         swp_entry_t entry;
  41         struct page * page;
  42         int (*swapout)(struct page *, struct file *);
  43
  44         pte = *page_table;
  45         if (!pte_present(pte))
  46                 goto out_failed;
  47         page = pte_page(pte);
  48         if ((page-mem_map >= max_mapnr) || PageReserved(page))
  49                 goto out_failed;
  50
  51         mm->swap_cnt--;
  52         /* Don't look at this pte if it's been accessed recently. */
  53         if (pte_young(pte)) {
  54                 /*
  55                  * Transfer the "accessed" bit from the page
  56                  * tables to the global page map.
  57                  */
  58                 set_pte(page_table, pte_mkold(pte));
  59                 set_bit(PG_referenced, &page->flags);
  60                 goto out_failed;
  61         }
  62
  63         if (PageLocked(page))
  64                 goto out_failed;
  65
  66         /*
  67          * Is the page already in the swap cache? If so, then
  68          * we can just drop our reference to it without doing
  69          * any IO - it's already up-to-date on disk.
  70          *
  71          * Return 0, as we didn't actually free any real
  72          * memory, and we should just continue our scan.
  73          */
  74         if (PageSwapCache(page)) {
  75                 entry.val = page->index;
  76                 swap_duplicate(entry);
  77                 set_pte(page_table, swp_entry_to_pte(entry));
  78 drop_pte:
  79                 vma->vm_mm->rss--;
  80                 flush_tlb_page(vma, address);
  81                 __free_page(page);
  82                 goto out_failed;
  83         }
  84
  85         /*
  86          * Is it a clean page? Then it must be recoverable
  87          * by just paging it in again, and we can just drop
  88          * it..
  89          *
  90          * However, this won't actually free any real
  91          * memory, as the page will just be in the page cache
  92          * somewhere, and as such we should just continue
  93          * our scan.
  94          *
  95          * Basically, this just makes it possible for us to do
  96          * some real work in the future in "shrink_mmap()".
  97          */
  98         if (!pte_dirty(pte)) {
  99                 flush_cache_page(vma, address);
 100                 pte_clear(page_table);
 101                 goto drop_pte;
 102         }
 103
 104         /*
 105          * Don't go down into the swap-out stuff if
 106          * we cannot do I/O! Avoid recursing on FS
 107          * locks etc.
 108          */
 109         if (!(gfp_mask & __GFP_IO))
 110                 goto out_failed;
 111
 112         /*
 113          * Ok, it's really dirty. That means that
 114          * we should either create a new swap cache
 115          * entry for it, or we should write it back
 116          * to its own backing store.
 117          *
 118          * Note that in neither case do we actually
 119          * know that we make a page available, but
 120          * as we potentially sleep we can no longer
 121          * continue scanning, so we migth as well
 122          * assume we free'd something.
 123          *
 124          * NOTE NOTE NOTE! This should just set a
 125          * dirty bit in 'page', and just drop the
 126          * pte. All the hard work would be done by
 127          * shrink_mmap().
 128          *
 129          * That would get rid of a lot of problems.
 130          */
 131         flush_cache_page(vma, address);
 132         if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
 133                 int error;
 134                 struct file *file = vma->vm_file;
 135                 if (file) get_file(file);
 136                 pte_clear(page_table);
 137                 vma->vm_mm->rss--;
 138                 flush_tlb_page(vma, address);
 139                 vmlist_access_unlock(vma->vm_mm);
 140                 error = swapout(page, file);
 141                 if (file) fput(file);
 142                 if (!error)
 143                         goto out_free_success;
 144                 __free_page(page);
 145                 return error;
 146         }
 147
 148         /*
 149          * This is a dirty, swappable page.  First of all,
 150          * get a suitable swap entry for it, and make sure
 151          * we have the swap cache set up to associate the
 152          * page with that swap entry.
 153          */
 154         entry = acquire_swap_entry(page);
 155         if (!entry.val)
 156                 goto out_failed; /* No swap space left */
 157
 158         if (!(page = prepare_highmem_swapout(page)))
 159                 goto out_swap_free;
 160
 161         swap_duplicate(entry);  /* One for the process, one for the swap cache */
 162
 163         /* This will also lock the page */
 164         add_to_swap_cache(page, entry);
 165         /* Put the swap entry into the pte after the page is in swapcache */
 166         vma->vm_mm->rss--;
 167         set_pte(page_table, swp_entry_to_pte(entry));
 168         flush_tlb_page(vma, address);
 169         vmlist_access_unlock(vma->vm_mm);
 170
 171         /* OK, do a physical asynchronous write to swap.  */
 172         rw_swap_page(WRITE, page, 0);
 173
 174 out_free_success:
 175         __free_page(page);
 176         return 1;
 177 out_swap_free:
 178         swap_free(entry);
 179 out_failed:
 180         return 0;
 181
 182 }
 183
 184 /*
 185  * A new implementation of swap_out().  We do not swap complete processes,
 186  * but only a small number of blocks, before we continue with the next
 187  * process.  The number of blocks actually swapped is determined on the
 188  * number of page faults, that this process actually had in the last time,
 189  * so we won't swap heavily used processes all the time ...
 190  *
 191  * Note: the priority argument is a hint on much CPU to waste with the
 192  *       swap block search, not a hint, of how much blocks to swap with
 193  *       each process.
 194  *
 195  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 196  */
 197
 198 static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 199 {
 200         pte_t * pte;
 201         unsigned long pmd_end;
 202
 203         if (pmd_none(*dir))
 204                 return 0;
 205         if (pmd_bad(*dir)) {
 206                 pmd_ERROR(*dir);
 207                 pmd_clear(dir);
 208                 return 0;
 209         }
 210
 211         pte = pte_offset(dir, address);
 212
 213         pmd_end = (address + PMD_SIZE) & PMD_MASK;
 214         if (end > pmd_end)
 215                 end = pmd_end;
 216
 217         do {
 218                 int result;
 219                 vma->vm_mm->swap_address = address + PAGE_SIZE;
 220                 result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
 221                 if (result)
 222                         return result;
 223                 if (!mm->swap_cnt)
 224                         return 0;
 225                 address += PAGE_SIZE;
 226                 pte++;
 227         } while (address && (address < end));
 228         return 0;
 229 }
 230
 231 static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 232 {
 233         pmd_t * pmd;
 234         unsigned long pgd_end;
 235
 236         if (pgd_none(*dir))
 237                 return 0;
 238         if (pgd_bad(*dir)) {
 239                 pgd_ERROR(*dir);
 240                 pgd_clear(dir);
 241                 return 0;
 242         }
 243
 244         pmd = pmd_offset(dir, address);
 245
 246         pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
 247         if (pgd_end && (end > pgd_end))
 248                 end = pgd_end;
 249
 250         do {
 251                 int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
 252                 if (result)
 253                         return result;
 254                 if (!mm->swap_cnt)
 255                         return 0;
 256                 address = (address + PMD_SIZE) & PMD_MASK;
 257                 pmd++;
 258         } while (address && (address < end));
 259         return 0;
 260 }
 261
 262 static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
 263 {
 264         pgd_t *pgdir;
 265         unsigned long end;
 266
 267         /* Don't swap out areas which are locked down */
 268         if (vma->vm_flags & VM_LOCKED)
 269                 return 0;
 270
 271         pgdir = pgd_offset(vma->vm_mm, address);
 272
 273         end = vma->vm_end;
 274         if (address >= end)
 275                 BUG();
 276         do {
 277                 int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
 278                 if (result)
 279                         return result;
 280                 if (!mm->swap_cnt)
 281                         return 0;
 282                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 283                 pgdir++;
 284         } while (address && (address < end));
 285         return 0;
 286 }
 287
 288 static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
 289 {
 290         unsigned long address;
 291         struct vm_area_struct* vma;
 292
 293         /*
 294          * Go through process' page directory.
 295          */
 296         address = mm->swap_address;
 297
 298         /*
 299          * Find the proper vm-area after freezing the vma chain
 300          * and ptes.
 301          */
 302         vmlist_access_lock(mm);
 303         vma = find_vma(mm, address);
 304         if (vma) {
 305                 if (address < vma->vm_start)
 306                         address = vma->vm_start;
 307
 308                 for (;;) {
 309                         int result = swap_out_vma(mm, vma, address, gfp_mask);
 310                         if (result)
 311                                 return result;
 312                         vma = vma->vm_next;
 313                         if (!vma)
 314                                 break;
 315                         address = vma->vm_start;
 316                 }
 317         }
 318         vmlist_access_unlock(mm);
 319
 320         /* We didn't find anything for the process */
 321         mm->swap_cnt = 0;
 322         mm->swap_address = 0;
 323         return 0;
 324 }
 325
 326 /*
 327  * Select the task with maximal swap_cnt and try to swap out a page.
 328  * N.B. This function returns only 0 or 1.  Return values != 1 from
 329  * the lower level routines result in continued processing.
 330  */
 331 int swap_out(unsigned int priority, int gfp_mask)
 332 {
 333         struct task_struct * p;
 334         int counter;
 335         int __ret = 0;
 336         int assign = 0;
 337
 338         lock_kernel();
 339         /*
 340          * We make one or two passes through the task list, indexed by
 341          * assign = {0, 1}:
 342          *   Pass 1: select the swappable task with maximal RSS that has
 343          *         not yet been swapped out.
 344          *   Pass 2: re-assign rss swap_cnt values, then select as above.
 345          *
 346          * With this approach, there's no need to remember the last task
 347          * swapped out.  If the swap-out fails, we clear swap_cnt so the
 348          * task won't be selected again until all others have been tried.
 349          *
 350          * Think of swap_cnt as a "shadow rss" - it tells us which process
 351          * we want to page out (always try largest first).
 352          */
 353         counter = nr_threads / (priority+1);
 354         if (counter < 1)
 355                 counter = 1;
 356
 357         for (; counter >= 0; counter--) {
 358                 unsigned long max_cnt = 0;
 359                 struct mm_struct *best = NULL;
 360                 int pid = 0;
 361         select:
 362                 read_lock(&tasklist_lock);
 363                 p = init_task.next_task;
 364                 for (; p != &init_task; p = p->next_task) {
 365                         struct mm_struct *mm = p->mm;
 366                         p->hog = 0;
 367                         if (!p->swappable || !mm)
 368                                 continue;
 369                         if (mm->rss <= 0)
 370                                 continue;
 371                         /* Refresh swap_cnt? */
 372                         if (assign == 1)
 373                                 mm->swap_cnt = mm->rss;
 374                         if (mm->swap_cnt > max_cnt) {
 375                                 max_cnt = mm->swap_cnt;
 376                                 best = mm;
 377                                 pid = p->pid;
 378                         }
 379                 }
 380                 if (assign == 1) {
 381                         /* we just assigned swap_cnt, normalise values */
 382                         assign = 2;
 383                         p = init_task.next_task;
 384                         for (; p != &init_task; p = p->next_task) {
 385                                 int i = 0;
 386                                 struct mm_struct *mm = p->mm;
 387                                 if (!p->swappable || !mm || mm->rss <= 0)
 388                                         continue;
 389                                 /* small processes are swapped out less */
 390                                 while ((mm->swap_cnt << 2 * (i + 1) < max_cnt)
 391                                                 && i++ < 10)
 392                                 mm->swap_cnt >>= i;
 393                                 mm->swap_cnt += i; /* if swap_cnt reaches 0 */
 394                                 /* we're big -> hog treatment */
 395                                 if (!i)
 396                                         p->hog = 1;
 397                         }
 398                 }
 399                 read_unlock(&tasklist_lock);
 400                 if (!best) {
 401                         if (!assign) {
 402                                 assign = 1;
 403                                 goto select;
 404                         }
 405                         goto out;
 406                 } else {
 407                         int ret;
 408
 409                         atomic_inc(&best->mm_count);
 410                         ret = swap_out_mm(best, gfp_mask);
 411                         mmdrop(best);
 412
 413                         if (!ret)
 414                                 continue;
 415
 416                         if (ret < 0)
 417                                 kill_proc(pid, SIGBUS, 1);
 418                         __ret = 1;
 419                         goto out;
 420                 }
 421         }
 422 out:
 423         unlock_kernel();
 424         return __ret;
 425 }
 426
 427 /*
 428  * We need to make the locks finer granularity, but right
 429  * now we need this so that we can do page allocations
 430  * without holding the kernel lock etc.
 431  *
 432  * We want to try to free "count" pages, and we need to
 433  * cluster them so that we get good swap-out behaviour. See
 434  * the "free_memory()" macro for details.
 435  */
 436 static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 437 {
 438         int priority;
 439         int count = SWAP_CLUSTER_MAX;
 440
 441         /* Always trim SLAB caches when memory gets low. */
 442         kmem_cache_reap(gfp_mask);
 443
 444         priority = 6;
 445         do {
 446                 while (shrink_mmap(priority, gfp_mask, zone)) {
 447                         if (!--count)
 448                                 goto done;
 449                 }
 450
 451
 452                 /* Try to get rid of some shared memory pages.. */
 453                 if (gfp_mask & __GFP_IO) {
 454                         /*
 455                          * don't be too light against the d/i cache since
 456                          * shrink_mmap() almost never fail when there's
 457                          * really plenty of memory free.
 458                          */
 459                         count -= shrink_dcache_memory(priority, gfp_mask, zone);
 460                         count -= shrink_icache_memory(priority, gfp_mask, zone);
 461                         if (count <= 0)
 462                                 goto done;
 463                         while (shm_swap(priority, gfp_mask, zone)) {
 464                                 if (!--count)
 465                                         goto done;
 466                         }
 467                 }
 468
 469                 /* Then, try to page stuff out.. */
 470                 while (swap_out(priority, gfp_mask)) {
 471                         if (!--count)
 472                                 goto done;
 473                 }
 474         } while (--priority >= 0);
 475 done:
 476
 477         return priority >= 0;
 478 }
 479
 480 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
 481
 482 /*
 483  * The background pageout daemon, started as a kernel thread
 484  * from the init process.
 485  *
 486  * This basically trickles out pages so that we have _some_
 487  * free memory available even if there is no other activity
 488  * that frees anything up. This is needed for things like routing
 489  * etc, where we otherwise might have all activity going on in
 490  * asynchronous contexts that cannot page things out.
 491  *
 492  * If there are applications that are active memory-allocators
 493  * (most normal use), this basically shouldn't matter.
 494  */
 495 int kswapd(void *unused)
 496 {
 497         int i;
 498         struct task_struct *tsk = current;
 499         pg_data_t *pgdat;
 500         zone_t *zone;
 501
 502         tsk->session = 1;
 503         tsk->pgrp = 1;
 504         strcpy(tsk->comm, "kswapd");
 505         sigfillset(&tsk->blocked);
 506
 507         /*
 508          * Tell the memory management that we're a "memory allocator",
 509          * and that if we need more memory we should get access to it
 510          * regardless (see "__alloc_pages()"). "kswapd" should
 511          * never get caught in the normal page freeing logic.
 512          *
 513          * (Kswapd normally doesn't need memory anyway, but sometimes
 514          * you need a small amount of memory in order to be able to
 515          * page out something else, and this flag essentially protects
 516          * us from recursively trying to free more memory as we're
 517          * trying to free the first piece of memory in the first place).
 518          */
 519         tsk->flags |= PF_MEMALLOC;
 520
 521         while (1) {
 522                 /*
 523                  * If we actually get into a low-memory situation,
 524                  * the processes needing more memory will wake us
 525                  * up on a more timely basis.
 526                  */
 527                 pgdat = pgdat_list;
 528                 while (pgdat) {
 529                         for (i = 0; i < MAX_NR_ZONES; i++) {
 530                             int count = SWAP_CLUSTER_MAX;
 531                             zone = pgdat->node_zones + i;
 532                             do {
 533                                 if (tsk->need_resched)
 534                                         schedule();
 535                                 if ((!zone->size) || (!zone->zone_wake_kswapd))
 536                                         continue;
 537                                 do_try_to_free_pages(GFP_KSWAPD, zone);
 538                            } while (zone->free_pages < zone->pages_low &&
 539                                            --count);
 540                         }
 541                         pgdat = pgdat->node_next;
 542                 }
 543                 run_task_queue(&tq_disk);
 544                 tsk->state = TASK_INTERRUPTIBLE;
 545                 interruptible_sleep_on(&kswapd_wait);
 546         }
 547 }
 548
 549 /*
 550  * Called by non-kswapd processes when they want more
 551  * memory.
 552  *
 553  * In a perfect world, this should just wake up kswapd
 554  * and return. We don't actually want to swap stuff out
 555  * from user processes, because the locking issues are
 556  * nasty to the extreme (file write locks, and MM locking)
 557  *
 558  * One option might be to let kswapd do all the page-out
 559  * and VM page table scanning that needs locking, and this
 560  * process thread could do just the mmap shrink stage that
 561  * can be done by just dropping cached pages without having
 562  * any deadlock issues.
 563  */
 564 int try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 565 {
 566         int retval = 1;
 567
 568         if (gfp_mask & __GFP_WAIT) {
 569                 current->flags |= PF_MEMALLOC;
 570                 retval = do_try_to_free_pages(gfp_mask, zone);
 571                 current->flags &= ~PF_MEMALLOC;
 572         }
 573         return retval;
 574 }
 575
 576 static int __init kswapd_init(void)
 577 {
 578         printk("Starting kswapd v1.6\n");
 579         swap_setup();
 580         kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 581         return 0;
 582 }
 583
 584 module_init(kswapd_init)