mm/rmap.c

   1 /*
   2  * mm/rmap.c - physical to virtual reverse mappings
   3  *
   4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5  * Released under the General Public License (GPL).
   6  *
   7  * Simple, low overhead reverse mapping scheme.
   8  * Please try to keep this thing as modular as possible.
   9  *
  10  * Provides methods for unmapping each kind of mapped page:
  11  * the anon methods track anonymous pages, and
  12  * the file methods track pages belonging to an inode.
  13  *
  14  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17  * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18  */
  19
  20 /*
  21  * Lock ordering in mm:
  22  *
  23  * inode->i_sem (while writing or truncating, not reading or faulting)
  24  *   inode->i_alloc_sem
  25  *
  26  * When a page fault occurs in writing from user to file, down_read
  27  * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
  28  * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
  29  * taken together; in truncation, i_sem is taken outermost.
  30  *
  31  * mm->mmap_sem
  32  *   page->flags PG_locked (lock_page)
  33  *     mapping->i_mmap_lock
  34  *       anon_vma->lock
  35  *         mm->page_table_lock
  36  *           zone->lru_lock (in mark_page_accessed)
  37  *           swap_list_lock (in swap_free etc's swap_info_get)
  38  *             swap_device_lock (in swap_duplicate, swap_info_get)
  39  *             mapping->private_lock (in __set_page_dirty_buffers)
  40  *             inode_lock (in set_page_dirty's __mark_inode_dirty)
  41  *               sb_lock (within inode_lock in fs/fs-writeback.c)
  42  *               mapping->tree_lock (widely used, in set_page_dirty,
  43  *                         in arch-dependent flush_dcache_mmap_lock,
  44  *                         within inode_lock in __sync_single_inode)
  45  */
  46
  47 #include <linux/mm.h>
  48 #include <linux/pagemap.h>
  49 #include <linux/swap.h>
  50 #include <linux/swapops.h>
  51 #include <linux/slab.h>
  52 #include <linux/init.h>
  53 #include <linux/rmap.h>
  54 #include <linux/rcupdate.h>
  55
  56 #include <asm/tlbflush.h>
  57
  58 //#define RMAP_DEBUG /* can be enabled only for debugging */
  59
  60 kmem_cache_t *anon_vma_cachep;      /* slab cache sized for struct anon_vma; created by anon_vma_init() */
  61
  62 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  63 {
  64 #ifdef RMAP_DEBUG
  65         struct anon_vma *anon_vma = find_vma->anon_vma;
  66         struct vm_area_struct *vma;
  67         unsigned int mapcount = 0;
  68         int found = 0;
  69
  70         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  71                 mapcount++;
  72                 BUG_ON(mapcount > 100000);
  73                 if (vma == find_vma)
  74                         found = 1;
  75         }
  76         BUG_ON(!found);
  77 #endif
  78 }
  79
  80 /* This must be called under the mmap_sem. */
  81 int anon_vma_prepare(struct vm_area_struct *vma)
  82 {
  83         struct anon_vma *anon_vma = vma->anon_vma;
  84
  85         might_sleep();
  86         if (unlikely(!anon_vma)) {
  87                 struct mm_struct *mm = vma->vm_mm;
  88                 struct anon_vma *allocated, *locked;
  89
  90                 anon_vma = find_mergeable_anon_vma(vma);
  91                 if (anon_vma) {
  92                         allocated = NULL;
  93                         locked = anon_vma;
  94                         spin_lock(&locked->lock);
  95                 } else {
  96                         anon_vma = anon_vma_alloc();
  97                         if (unlikely(!anon_vma))
  98                                 return -ENOMEM;
  99                         allocated = anon_vma;
 100                         locked = NULL;
 101                 }
 102
 103                 /* page_table_lock to protect against threads */
 104                 spin_lock(&mm->page_table_lock);
 105                 if (likely(!vma->anon_vma)) {
 106                         vma->anon_vma = anon_vma;
 107                         list_add(&vma->anon_vma_node, &anon_vma->head);
 108                         allocated = NULL;
 109                 }
 110                 spin_unlock(&mm->page_table_lock);
 111
 112                 if (locked)
 113                         spin_unlock(&locked->lock);
 114                 if (unlikely(allocated))
 115                         anon_vma_free(allocated);
 116         }
 117         return 0;
 118 }
 119
 120 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
 121 {
 122         if (!vma->anon_vma) {
 123                 BUG_ON(!next->anon_vma);
 124                 vma->anon_vma = next->anon_vma;
 125                 list_add(&vma->anon_vma_node, &next->anon_vma_node);
 126         } else {
 127                 /* if they're both non-null they must be the same */
 128                 BUG_ON(vma->anon_vma != next->anon_vma);
 129         }
 130         list_del(&next->anon_vma_node);
 131 }
 132
 133 void __anon_vma_link(struct vm_area_struct *vma)
 134 {
 135         struct anon_vma *anon_vma = vma->anon_vma;
 136
 137         if (anon_vma) {
 138                 list_add(&vma->anon_vma_node, &anon_vma->head);
 139                 validate_anon_vma(vma);
 140         }
 141 }
 142
 143 void anon_vma_link(struct vm_area_struct *vma)
 144 {
 145         struct anon_vma *anon_vma = vma->anon_vma;
 146
 147         if (anon_vma) {
 148                 spin_lock(&anon_vma->lock);
 149                 list_add(&vma->anon_vma_node, &anon_vma->head);
 150                 validate_anon_vma(vma);
 151                 spin_unlock(&anon_vma->lock);
 152         }
 153 }
 154
 155 void anon_vma_unlink(struct vm_area_struct *vma)
 156 {
 157         struct anon_vma *anon_vma = vma->anon_vma;
 158         int empty;
 159
 160         if (!anon_vma)
 161                 return;
 162
 163         spin_lock(&anon_vma->lock);
 164         validate_anon_vma(vma);
 165         list_del(&vma->anon_vma_node);
 166
 167         /* We must garbage collect the anon_vma if it's empty */
 168         empty = list_empty(&anon_vma->head);
 169         spin_unlock(&anon_vma->lock);
 170
 171         if (empty)
 172                 anon_vma_free(anon_vma);
 173 }
 174
 175 static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 176 {
 177         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 178                                                 SLAB_CTOR_CONSTRUCTOR) {
 179                 struct anon_vma *anon_vma = data;
 180
 181                 spin_lock_init(&anon_vma->lock);
 182                 INIT_LIST_HEAD(&anon_vma->head);
 183         }
 184 }
 185
 186 void __init anon_vma_init(void)
 187 {
 188         anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
 189                         0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
 190 }
 191
 192 /*
 193  * Getting a lock on a stable anon_vma from a page off the LRU is
 194  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
 195  */
 196 static struct anon_vma *page_lock_anon_vma(struct page *page)
 197 {
 198         struct anon_vma *anon_vma = NULL;
 199         unsigned long anon_mapping;
 200
 201         rcu_read_lock();
 202         anon_mapping = (unsigned long) page->mapping;
 203         if (!(anon_mapping & PAGE_MAPPING_ANON))
 204                 goto out;
 205         if (!page_mapped(page))
 206                 goto out;
 207
 208         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
 209         spin_lock(&anon_vma->lock);
 210 out:
 211         rcu_read_unlock();
 212         return anon_vma;
 213 }
 214
 215 /*
 216  * At what user virtual address is page expected in vma?
 217  */
 218 static inline unsigned long
 219 vma_address(struct page *page, struct vm_area_struct *vma)
 220 {
 221         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 222         unsigned long address;
 223
 224         address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 225         if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 226                 /* page should be within any vma from prio_tree_next */
 227                 BUG_ON(!PageAnon(page));
 228                 return -EFAULT;
 229         }
 230         return address;
 231 }
 232
 233 /*
 234  * At what user virtual address is page expected in vma? checking that the
 235  * page matches the vma: currently only used by unuse_process, on anon pages.
 236  */
 237 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 238 {
 239         if (PageAnon(page)) {
 240                 if ((void *)vma->anon_vma !=
 241                     (void *)page->mapping - PAGE_MAPPING_ANON)
 242                         return -EFAULT;
 243         } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
 244                 if (vma->vm_file->f_mapping != page->mapping)
 245                         return -EFAULT;
 246         } else
 247                 return -EFAULT;
 248         return vma_address(page, vma);
 249 }
 250
 251 /*
 252  * Subfunctions of page_referenced: page_referenced_one called
 253  * repeatedly from either page_referenced_anon or page_referenced_file.
 254  */
 255 static int page_referenced_one(struct page *page,
 256         struct vm_area_struct *vma, unsigned int *mapcount)
 257 {
 258         struct mm_struct *mm = vma->vm_mm;
 259         unsigned long address;
 260         pgd_t *pgd;
 261         pmd_t *pmd;
 262         pte_t *pte;
 263         int referenced = 0;
 264
 265         if (!mm->rss)
 266                 goto out;
 267         address = vma_address(page, vma);
 268         if (address == -EFAULT)
 269                 goto out;
 270
 271         spin_lock(&mm->page_table_lock);
 272
 273         pgd = pgd_offset(mm, address);
 274         if (!pgd_present(*pgd))
 275                 goto out_unlock;
 276
 277         pmd = pmd_offset(pgd, address);
 278         if (!pmd_present(*pmd))
 279                 goto out_unlock;
 280
 281         pte = pte_offset_map(pmd, address);
 282         if (!pte_present(*pte))
 283                 goto out_unmap;
 284
 285         if (page_to_pfn(page) != pte_pfn(*pte))
 286                 goto out_unmap;
 287
 288         if (ptep_clear_flush_young(vma, address, pte))
 289                 referenced++;
 290
 291         if (mm != current->mm && has_swap_token(mm))
 292                 referenced++;
 293
 294         (*mapcount)--;
 295
 296 out_unmap:
 297         pte_unmap(pte);
 298 out_unlock:
 299         spin_unlock(&mm->page_table_lock);
 300 out:
 301         return referenced;
 302 }
 303
 304 static int page_referenced_anon(struct page *page)
 305 {
 306         unsigned int mapcount;
 307         struct anon_vma *anon_vma;
 308         struct vm_area_struct *vma;
 309         int referenced = 0;
 310
 311         anon_vma = page_lock_anon_vma(page);
 312         if (!anon_vma)
 313                 return referenced;
 314
 315         mapcount = page_mapcount(page);
 316         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 317                 referenced += page_referenced_one(page, vma, &mapcount);
 318                 if (!mapcount)
 319                         break;
 320         }
 321         spin_unlock(&anon_vma->lock);
 322         return referenced;
 323 }
 324
 325 /**
 326  * page_referenced_file - referenced check for object-based rmap
 327  * @page: the page we're checking references on.
 328  *
 329  * For an object-based mapped page, find all the places it is mapped and
 330  * check/clear the referenced flag.  This is done by following the page->mapping
 331  * pointer, then walking the chain of vmas it holds.  It returns the number
 332  * of references it found.
 333  *
 334  * This function is only called from page_referenced for object-based pages.
 335  */
 336 static int page_referenced_file(struct page *page)
 337 {
 338         unsigned int mapcount;
 339         struct address_space *mapping = page->mapping;
 340         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 341         struct vm_area_struct *vma;
 342         struct prio_tree_iter iter;
 343         int referenced = 0;
 344
 345         /*
 346          * The caller's checks on page->mapping and !PageAnon have made
 347          * sure that this is a file page: the check for page->mapping
 348          * excludes the case just before it gets set on an anon page.
 349          */
 350         BUG_ON(PageAnon(page));
 351
 352         /*
 353          * The page lock not only makes sure that page->mapping cannot
 354          * suddenly be NULLified by truncation, it makes sure that the
 355          * structure at mapping cannot be freed and reused yet,
 356          * so we can safely take mapping->i_mmap_lock.
 357          */
 358         BUG_ON(!PageLocked(page));
 359
 360         spin_lock(&mapping->i_mmap_lock);
 361
 362         /*
 363          * i_mmap_lock does not stabilize mapcount at all, but mapcount
 364          * is more likely to be accurate if we note it after spinning.
 365          */
 366         mapcount = page_mapcount(page);
 367
 368         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 369                 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 370                                   == (VM_LOCKED|VM_MAYSHARE)) {
 371                         referenced++;
 372                         break;
 373                 }
 374                 referenced += page_referenced_one(page, vma, &mapcount);
 375                 if (!mapcount)
 376                         break;
 377         }
 378
 379         spin_unlock(&mapping->i_mmap_lock);
 380         return referenced;
 381 }
 382
 383 /**
 384  * page_referenced - test if the page was referenced
 385  * @page: the page to test
 386  * @is_locked: caller holds lock on the page
 387  *
 388  * Quick test_and_clear_referenced for all mappings to a page,
 389  * returns the number of ptes which referenced the page.
 390  */
 391 int page_referenced(struct page *page, int is_locked)
 392 {
 393         int referenced = 0;
 394
 395         if (page_test_and_clear_young(page))
 396                 referenced++;
 397
 398         if (TestClearPageReferenced(page))
 399                 referenced++;
 400
 401         if (page_mapped(page) && page->mapping) {
 402                 if (PageAnon(page))
 403                         referenced += page_referenced_anon(page);
 404                 else if (is_locked)
 405                         referenced += page_referenced_file(page);
 406                 else if (TestSetPageLocked(page))
 407                         referenced++;
 408                 else {
 409                         if (page->mapping)
 410                                 referenced += page_referenced_file(page);
 411                         unlock_page(page);
 412                 }
 413         }
 414         return referenced;
 415 }
 416
 417 /**
 418  * page_add_anon_rmap - add pte mapping to an anonymous page
 419  * @page:       the page to add the mapping to
 420  * @vma:        the vm area in which the mapping is added
 421  * @address:    the user virtual address mapped
 422  *
 423  * The caller needs to hold the mm->page_table_lock.
 424  */
 425 void page_add_anon_rmap(struct page *page,
 426         struct vm_area_struct *vma, unsigned long address)
 427 {
 428         struct anon_vma *anon_vma = vma->anon_vma;
 429         pgoff_t index;
 430
 431         BUG_ON(PageReserved(page));
 432         BUG_ON(!anon_vma);
 433
 434         anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 435         index = (address - vma->vm_start) >> PAGE_SHIFT;
 436         index += vma->vm_pgoff;
 437         index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 438
 439         if (atomic_inc_and_test(&page->_mapcount)) {
 440                 page->index = index;
 441                 page->mapping = (struct address_space *) anon_vma;
 442                 inc_page_state(nr_mapped);
 443         }
 444         /* else checking page index and mapping is racy */
 445 }
 446
 447 /**
 448  * page_add_file_rmap - add pte mapping to a file page
 449  * @page: the page to add the mapping to
 450  *
 451  * The caller needs to hold the mm->page_table_lock.
 452  */
 453 void page_add_file_rmap(struct page *page)
 454 {
 455         BUG_ON(PageAnon(page));
 456         if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 457                 return;
 458
 459         if (atomic_inc_and_test(&page->_mapcount))
 460                 inc_page_state(nr_mapped);
 461 }
 462
 463 /**
 464  * page_remove_rmap - take down pte mapping from a page
 465  * @page: page to remove mapping from
 466  *
 467  * Caller needs to hold the mm->page_table_lock.
 468  */
 469 void page_remove_rmap(struct page *page)
 470 {
 471         BUG_ON(PageReserved(page));
 472
 473         if (atomic_add_negative(-1, &page->_mapcount)) {
 474                 BUG_ON(page_mapcount(page) < 0);
 475                 /*
 476                  * It would be tidy to reset the PageAnon mapping here,
 477                  * but that might overwrite a racing page_add_anon_rmap
 478                  * which increments mapcount after us but sets mapping
 479                  * before us: so leave the reset to free_hot_cold_page,
 480                  * and remember that it's only reliable while mapped.
 481                  * Leaving it set also helps swapoff to reinstate ptes
 482                  * faster for those pages still in swapcache.
 483                  */
 484                 if (page_test_and_clear_dirty(page))
 485                         set_page_dirty(page);
 486                 dec_page_state(nr_mapped);
 487         }
 488 }
 489
 490 /*
 491  * Subfunctions of try_to_unmap: try_to_unmap_one called
 492  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 493  */
 494 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 495 {
 496         struct mm_struct *mm = vma->vm_mm;
 497         unsigned long address;
 498         pgd_t *pgd;
 499         pmd_t *pmd;
 500         pte_t *pte;
 501         pte_t pteval;
 502         int ret = SWAP_AGAIN;
 503
 504         if (!mm->rss)
 505                 goto out;
 506         address = vma_address(page, vma);
 507         if (address == -EFAULT)
 508                 goto out;
 509
 510         /*
 511          * We need the page_table_lock to protect us from page faults,
 512          * munmap, fork, etc...
 513          */
 514         spin_lock(&mm->page_table_lock);
 515
 516         pgd = pgd_offset(mm, address);
 517         if (!pgd_present(*pgd))
 518                 goto out_unlock;
 519
 520         pmd = pmd_offset(pgd, address);
 521         if (!pmd_present(*pmd))
 522                 goto out_unlock;
 523
 524         pte = pte_offset_map(pmd, address);
 525         if (!pte_present(*pte))
 526                 goto out_unmap;
 527
 528         if (page_to_pfn(page) != pte_pfn(*pte))
 529                 goto out_unmap;
 530
 531         /*
 532          * If the page is mlock()d, we cannot swap it out.
 533          * If it's recently referenced (perhaps page_referenced
 534          * skipped over this mm) then we should reactivate it.
 535          */
 536         if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
 537                         ptep_clear_flush_young(vma, address, pte)) {
 538                 ret = SWAP_FAIL;
 539                 goto out_unmap;
 540         }
 541
 542         /*
 543          * Don't pull an anonymous page out from under get_user_pages.
 544          * GUP carefully breaks COW and raises page count (while holding
 545          * page_table_lock, as we have here) to make sure that the page
 546          * cannot be freed.  If we unmap that page here, a user write
 547          * access to the virtual address will bring back the page, but
 548          * its raised count will (ironically) be taken to mean it's not
 549          * an exclusive swap page, do_wp_page will replace it by a copy
 550          * page, and the user never get to see the data GUP was holding
 551          * the original page for.
 552          *
 553          * This test is also useful for when swapoff (unuse_process) has
 554          * to drop page lock: its reference to the page stops existing
 555          * ptes from being unmapped, so swapoff can make progress.
 556          */
 557         if (PageSwapCache(page) &&
 558             page_count(page) != page_mapcount(page) + 2) {
 559                 ret = SWAP_FAIL;
 560                 goto out_unmap;
 561         }
 562
 563         /* Nuke the page table entry. */
 564         flush_cache_page(vma, address);
 565         pteval = ptep_clear_flush(vma, address, pte);
 566
 567         /* Move the dirty bit to the physical page now the pte is gone. */
 568         if (pte_dirty(pteval))
 569                 set_page_dirty(page);
 570
 571         if (PageAnon(page)) {
 572                 swp_entry_t entry = { .val = page->private };
 573                 /*
 574                  * Store the swap location in the pte.
 575                  * See handle_pte_fault() ...
 576                  */
 577                 BUG_ON(!PageSwapCache(page));
 578                 swap_duplicate(entry);
 579                 set_pte(pte, swp_entry_to_pte(entry));
 580                 BUG_ON(pte_file(*pte));
 581         }
 582
 583         mm->rss--;
 584         page_remove_rmap(page);
 585         page_cache_release(page);
 586
 587 out_unmap:
 588         pte_unmap(pte);
 589 out_unlock:
 590         spin_unlock(&mm->page_table_lock);
 591 out:
 592         return ret;
 593 }
 594
 595 /*
 596  * objrmap doesn't work for nonlinear VMAs because the assumption that
 597  * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 598  * Consequently, given a particular page and its ->index, we cannot locate the
 599  * ptes which are mapping that page without an exhaustive linear search.
 600  *
 601  * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 602  * maps the file to which the target page belongs.  The ->vm_private_data field
 603  * holds the current cursor into that scan.  Successive searches will circulate
 604  * around the vma's virtual address space.
 605  *
 606  * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 607  * more scanning pressure is placed against them as well.   Eventually pages
 608  * will become fully unmapped and are eligible for eviction.
 609  *
 610  * For very sparsely populated VMAs this is a little inefficient - chances are
 611  * there there won't be many ptes located within the scan cluster.  In this case
 612  * maybe we could scan further - to the end of the pte page, perhaps.
 613  */
 614 #define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
 615 #define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
 616
 617 static void try_to_unmap_cluster(unsigned long cursor,
 618         unsigned int *mapcount, struct vm_area_struct *vma)
 619 {
 620         struct mm_struct *mm = vma->vm_mm;
 621         pgd_t *pgd;
 622         pmd_t *pmd;
 623         pte_t *pte;
 624         pte_t pteval;
 625         struct page *page;
 626         unsigned long address;
 627         unsigned long end;
 628         unsigned long pfn;
 629
 630         /*
 631          * We need the page_table_lock to protect us from page faults,
 632          * munmap, fork, etc...
 633          */
 634         spin_lock(&mm->page_table_lock);
 635
 636         address = (vma->vm_start + cursor) & CLUSTER_MASK;
 637         end = address + CLUSTER_SIZE;
 638         if (address < vma->vm_start)
 639                 address = vma->vm_start;
 640         if (end > vma->vm_end)
 641                 end = vma->vm_end;
 642
 643         pgd = pgd_offset(mm, address);
 644         if (!pgd_present(*pgd))
 645                 goto out_unlock;
 646
 647         pmd = pmd_offset(pgd, address);
 648         if (!pmd_present(*pmd))
 649                 goto out_unlock;
 650
 651         for (pte = pte_offset_map(pmd, address);
 652                         address < end; pte++, address += PAGE_SIZE) {
 653
 654                 if (!pte_present(*pte))
 655                         continue;
 656
 657                 pfn = pte_pfn(*pte);
 658                 if (!pfn_valid(pfn))
 659                         continue;
 660
 661                 page = pfn_to_page(pfn);
 662                 BUG_ON(PageAnon(page));
 663                 if (PageReserved(page))
 664                         continue;
 665
 666                 if (ptep_clear_flush_young(vma, address, pte))
 667                         continue;
 668
 669                 /* Nuke the page table entry. */
 670                 flush_cache_page(vma, address);
 671                 pteval = ptep_clear_flush(vma, address, pte);
 672
 673                 /* If nonlinear, store the file page offset in the pte. */
 674                 if (page->index != linear_page_index(vma, address))
 675                         set_pte(pte, pgoff_to_pte(page->index));
 676
 677                 /* Move the dirty bit to the physical page now the pte is gone. */
 678                 if (pte_dirty(pteval))
 679                         set_page_dirty(page);
 680
 681                 page_remove_rmap(page);
 682                 page_cache_release(page);
 683                 mm->rss--;
 684                 (*mapcount)--;
 685         }
 686
 687         pte_unmap(pte);
 688
 689 out_unlock:
 690         spin_unlock(&mm->page_table_lock);
 691 }
 692
 693 static int try_to_unmap_anon(struct page *page)
 694 {
 695         struct anon_vma *anon_vma;
 696         struct vm_area_struct *vma;
 697         int ret = SWAP_AGAIN;
 698
 699         anon_vma = page_lock_anon_vma(page);
 700         if (!anon_vma)
 701                 return ret;
 702
 703         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 704                 ret = try_to_unmap_one(page, vma);
 705                 if (ret == SWAP_FAIL || !page_mapped(page))
 706                         break;
 707         }
 708         spin_unlock(&anon_vma->lock);
 709         return ret;
 710 }
 711
 712 /**
 713  * try_to_unmap_file - unmap file page using the object-based rmap method
 714  * @page: the page to unmap
 715  *
 716  * Find all the mappings of a page using the mapping pointer and the vma chains
 717  * contained in the address_space struct it points to.
 718  *
 719  * This function is only called from try_to_unmap for object-based pages.
 720  */
 721 static int try_to_unmap_file(struct page *page)
 722 {
 723         struct address_space *mapping = page->mapping;
 724         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 725         struct vm_area_struct *vma;
 726         struct prio_tree_iter iter;
 727         int ret = SWAP_AGAIN;
 728         unsigned long cursor;
 729         unsigned long max_nl_cursor = 0;
 730         unsigned long max_nl_size = 0;
 731         unsigned int mapcount;
 732
 733         spin_lock(&mapping->i_mmap_lock);
 734         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 735                 ret = try_to_unmap_one(page, vma);
 736                 if (ret == SWAP_FAIL || !page_mapped(page))
 737                         goto out;
 738         }
 739
 740         if (list_empty(&mapping->i_mmap_nonlinear))
 741                 goto out;
 742
 743         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 744                                                 shared.vm_set.list) {
 745                 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 746                         continue;
 747                 cursor = (unsigned long) vma->vm_private_data;
 748                 if (cursor > max_nl_cursor)
 749                         max_nl_cursor = cursor;
 750                 cursor = vma->vm_end - vma->vm_start;
 751                 if (cursor > max_nl_size)
 752                         max_nl_size = cursor;
 753         }
 754
 755         if (max_nl_size == 0) { /* any nonlinears locked or reserved */
 756                 ret = SWAP_FAIL;
 757                 goto out;
 758         }
 759
 760         /*
 761          * We don't try to search for this page in the nonlinear vmas,
 762          * and page_referenced wouldn't have found it anyway.  Instead
 763          * just walk the nonlinear vmas trying to age and unmap some.
 764          * The mapcount of the page we came in with is irrelevant,
 765          * but even so use it as a guide to how hard we should try?
 766          */
 767         mapcount = page_mapcount(page);
 768         if (!mapcount)
 769                 goto out;
 770         cond_resched_lock(&mapping->i_mmap_lock);
 771
 772         max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 773         if (max_nl_cursor == 0)
 774                 max_nl_cursor = CLUSTER_SIZE;
 775
 776         do {
 777                 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 778                                                 shared.vm_set.list) {
 779                         if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 780                                 continue;
 781                         cursor = (unsigned long) vma->vm_private_data;
 782                         while (vma->vm_mm->rss &&
 783                                 cursor < max_nl_cursor &&
 784                                 cursor < vma->vm_end - vma->vm_start) {
 785                                 try_to_unmap_cluster(cursor, &mapcount, vma);
 786                                 cursor += CLUSTER_SIZE;
 787                                 vma->vm_private_data = (void *) cursor;
 788                                 if ((int)mapcount <= 0)
 789                                         goto out;
 790                         }
 791                         vma->vm_private_data = (void *) max_nl_cursor;
 792                 }
 793                 cond_resched_lock(&mapping->i_mmap_lock);
 794                 max_nl_cursor += CLUSTER_SIZE;
 795         } while (max_nl_cursor <= max_nl_size);
 796
 797         /*
 798          * Don't loop forever (perhaps all the remaining pages are
 799          * in locked vmas).  Reset cursor on all unreserved nonlinear
 800          * vmas, now forgetting on which ones it had fallen behind.
 801          */
 802         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 803                                                 shared.vm_set.list) {
 804                 if (!(vma->vm_flags & VM_RESERVED))
 805                         vma->vm_private_data = NULL;
 806         }
 807 out:
 808         spin_unlock(&mapping->i_mmap_lock);
 809         return ret;
 810 }
 811
 812 /**
 813  * try_to_unmap - try to remove all page table mappings to a page
 814  * @page: the page to get unmapped
 815  *
 816  * Tries to remove all the page table entries which are mapping this
 817  * page, used in the pageout path.  Caller must hold the page lock.
 818  * Return values are:
 819  *
 820  * SWAP_SUCCESS - we succeeded in removing all mappings
 821  * SWAP_AGAIN   - we missed a mapping, try again later
 822  * SWAP_FAIL    - the page is unswappable
 823  */
 824 int try_to_unmap(struct page *page)
 825 {
 826         int ret;
 827
 828         BUG_ON(PageReserved(page));
 829         BUG_ON(!PageLocked(page));
 830
 831         if (PageAnon(page))
 832                 ret = try_to_unmap_anon(page);
 833         else
 834                 ret = try_to_unmap_file(page);
 835
 836         if (!page_mapped(page))
 837                 ret = SWAP_SUCCESS;
 838         return ret;
 839 }