mm/rmap.c

   1 /*
   2  * mm/rmap.c - physical to virtual reverse mappings
   3  *
   4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5  * Released under the General Public License (GPL).
   6  *
   7  *
   8  * Simple, low overhead pte-based reverse mapping scheme.
   9  * This is kept modular because we may want to experiment
  10  * with object-based reverse mapping schemes. Please try
  11  * to keep this thing as modular as possible.
  12  */
  13
  14 /*
  15  * Locking:
  16  * - the page->pte.chain is protected by the PG_chainlock bit,
  17  *   which nests within the the mm->page_table_lock,
  18  *   which nests within the page lock.
  19  * - because swapout locking is opposite to the locking order
  20  *   in the page fault path, the swapout path uses trylocks
  21  *   on the mm->page_table_lock
  22  */
  23 #include <linux/mm.h>
  24 #include <linux/pagemap.h>
  25 #include <linux/swap.h>
  26 #include <linux/swapops.h>
  27 #include <linux/slab.h>
  28 #include <linux/init.h>
  29 #include <linux/rmap-locking.h>
  30 #include <linux/cache.h>
  31 #include <linux/percpu.h>
  32
  33 #include <asm/pgalloc.h>
  34 #include <asm/rmap.h>
  35 #include <asm/tlb.h>
  36 #include <asm/tlbflush.h>
  37
  38 /* #define DEBUG_RMAP */
  39
  40 /*
  41  * Shared pages have a chain of pte_chain structures, used to locate
  42  * all the mappings to this page. We only need a pointer to the pte
  43  * here, the page struct for the page table page contains the process
  44  * it belongs to and the offset within that process.
  45  *
  46  * We use an array of pte pointers in this structure to minimise cache misses
  47  * while traversing reverse maps.
  48  */
  49 #define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
  50
  51 /*
  52  * next_and_idx encodes both the address of the next pte_chain and the
  53  * offset of the highest-index used pte in ptes[].
  54  */
  55 struct pte_chain {
  56         unsigned long next_and_idx;
  57         pte_addr_t ptes[NRPTE];
  58 } ____cacheline_aligned;
  59
  60 kmem_cache_t    *pte_chain_cache;
  61
  62 static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
  63 {
  64         return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
  65 }
  66
  67 static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
  68 {
  69         return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
  70 }
  71
  72 static inline int pte_chain_idx(struct pte_chain *pte_chain)
  73 {
  74         return pte_chain->next_and_idx & NRPTE;
  75 }
  76
  77 static inline unsigned long
  78 pte_chain_encode(struct pte_chain *pte_chain, int idx)
  79 {
  80         return (unsigned long)pte_chain | idx;
  81 }
  82
  83 /*
  84  * pte_chain list management policy:
  85  *
  86  * - If a page has a pte_chain list then it is shared by at least two processes,
  87  *   because a single sharing uses PageDirect. (Well, this isn't true yet,
  88  *   coz this code doesn't collapse singletons back to PageDirect on the remove
  89  *   path).
  90  * - A pte_chain list has free space only in the head member - all succeeding
  91  *   members are 100% full.
  92  * - If the head element has free space, it occurs in its leading slots.
  93  * - All free space in the pte_chain is at the start of the head member.
  94  * - Insertion into the pte_chain puts a pte pointer in the last free slot of
  95  *   the head member.
  96  * - Removal from a pte chain moves the head pte of the head member onto the
  97  *   victim pte and frees the head member if it became empty.
  98  */
  99
 100 /**
 101  ** VM stuff below this comment
 102  **/
 103
 104 /**
 105  * page_referenced - test if the page was referenced
 106  * @page: the page to test
 107  *
 108  * Quick test_and_clear_referenced for all mappings to a page,
 109  * returns the number of processes which referenced the page.
 110  * Caller needs to hold the pte_chain_lock.
 111  *
 112  * If the page has a single-entry pte_chain, collapse that back to a PageDirect
 113  * representation.  This way, it's only done under memory pressure.
 114  */
 115 int page_referenced(struct page * page)
 116 {
 117         struct pte_chain *pc;
 118         int referenced = 0;
 119
 120         if (page_test_and_clear_young(page))
 121                 mark_page_accessed(page);
 122
 123         if (TestClearPageReferenced(page))
 124                 referenced++;
 125
 126         if (PageDirect(page)) {
 127                 pte_t *pte = rmap_ptep_map(page->pte.direct);
 128                 if (ptep_test_and_clear_young(pte))
 129                         referenced++;
 130                 rmap_ptep_unmap(pte);
 131         } else {
 132                 int nr_chains = 0;
 133
 134                 /* Check all the page tables mapping this page. */
 135                 for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
 136                         int i;
 137
 138                         for (i = pte_chain_idx(pc); i < NRPTE; i++) {
 139                                 pte_addr_t pte_paddr = pc->ptes[i];
 140                                 pte_t *p;
 141
 142                                 p = rmap_ptep_map(pte_paddr);
 143                                 if (ptep_test_and_clear_young(p))
 144                                         referenced++;
 145                                 rmap_ptep_unmap(p);
 146                                 nr_chains++;
 147                         }
 148                 }
 149                 if (nr_chains == 1) {
 150                         pc = page->pte.chain;
 151                         page->pte.direct = pc->ptes[NRPTE-1];
 152                         SetPageDirect(page);
 153                         pc->ptes[NRPTE-1] = 0;
 154                         __pte_chain_free(pc);
 155                 }
 156         }
 157         return referenced;
 158 }
 159
 160 /**
 161  * page_add_rmap - add reverse mapping entry to a page
 162  * @page: the page to add the mapping to
 163  * @ptep: the page table entry mapping this page
 164  *
 165  * Add a new pte reverse mapping to a page.
 166  * The caller needs to hold the mm->page_table_lock.
 167  */
 168 struct pte_chain *
 169 page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
 170 {
 171         pte_addr_t pte_paddr = ptep_to_paddr(ptep);
 172         struct pte_chain *cur_pte_chain;
 173
 174         if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 175                 return pte_chain;
 176
 177         pte_chain_lock(page);
 178
 179         if (page->pte.direct == 0) {
 180                 page->pte.direct = pte_paddr;
 181                 SetPageDirect(page);
 182                 inc_page_state(nr_mapped);
 183                 goto out;
 184         }
 185
 186         if (PageDirect(page)) {
 187                 /* Convert a direct pointer into a pte_chain */
 188                 ClearPageDirect(page);
 189                 pte_chain->ptes[NRPTE-1] = page->pte.direct;
 190                 pte_chain->ptes[NRPTE-2] = pte_paddr;
 191                 pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2);
 192                 page->pte.direct = 0;
 193                 page->pte.chain = pte_chain;
 194                 pte_chain = NULL;       /* We consumed it */
 195                 goto out;
 196         }
 197
 198         cur_pte_chain = page->pte.chain;
 199         if (cur_pte_chain->ptes[0]) {   /* It's full */
 200                 pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain,
 201                                                                 NRPTE - 1);
 202                 page->pte.chain = pte_chain;
 203                 pte_chain->ptes[NRPTE-1] = pte_paddr;
 204                 pte_chain = NULL;       /* We consumed it */
 205                 goto out;
 206         }
 207         cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
 208         cur_pte_chain->next_and_idx--;
 209 out:
 210         pte_chain_unlock(page);
 211         return pte_chain;
 212 }
 213
 214 /**
 215  * page_remove_rmap - take down reverse mapping to a page
 216  * @page: page to remove mapping from
 217  * @ptep: page table entry to remove
 218  *
 219  * Removes the reverse mapping from the pte_chain of the page,
 220  * after that the caller can clear the page table entry and free
 221  * the page.
 222  * Caller needs to hold the mm->page_table_lock.
 223  */
 224 void page_remove_rmap(struct page *page, pte_t *ptep)
 225 {
 226         pte_addr_t pte_paddr = ptep_to_paddr(ptep);
 227         struct pte_chain *pc;
 228
 229         if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 230                 return;
 231
 232         pte_chain_lock(page);
 233
 234         if (!page_mapped(page))
 235                 goto out_unlock;        /* remap_page_range() from a driver? */
 236
 237         if (PageDirect(page)) {
 238                 if (page->pte.direct == pte_paddr) {
 239                         page->pte.direct = 0;
 240                         ClearPageDirect(page);
 241                         goto out;
 242                 }
 243         } else {
 244                 struct pte_chain *start = page->pte.chain;
 245                 struct pte_chain *next;
 246                 int victim_i = pte_chain_idx(start);
 247
 248                 for (pc = start; pc; pc = next) {
 249                         int i;
 250
 251                         next = pte_chain_next(pc);
 252                         if (next)
 253                                 prefetch(next);
 254                         for (i = pte_chain_idx(pc); i < NRPTE; i++) {
 255                                 pte_addr_t pa = pc->ptes[i];
 256
 257                                 if (pa != pte_paddr)
 258                                         continue;
 259                                 pc->ptes[i] = start->ptes[victim_i];
 260                                 start->ptes[victim_i] = 0;
 261                                 if (victim_i == NRPTE-1) {
 262                                         /* Emptied a pte_chain */
 263                                         page->pte.chain = pte_chain_next(start);
 264                                         __pte_chain_free(start);
 265                                 } else {
 266                                         start->next_and_idx++;
 267                                 }
 268                                 goto out;
 269                         }
 270                 }
 271         }
 272 out:
 273         if (page->pte.direct == 0 && page_test_and_clear_dirty(page))
 274                 set_page_dirty(page);
 275         if (!page_mapped(page))
 276                 dec_page_state(nr_mapped);
 277 out_unlock:
 278         pte_chain_unlock(page);
 279         return;
 280 }
 281
 282 /**
 283  * try_to_unmap_one - worker function for try_to_unmap
 284  * @page: page to unmap
 285  * @ptep: page table entry to unmap from page
 286  *
 287  * Internal helper function for try_to_unmap, called for each page
 288  * table entry mapping a page. Because locking order here is opposite
 289  * to the locking order used by the page fault path, we use trylocks.
 290  * Locking:
 291  *          page lock                   shrink_list(), trylock
 292  *              pte_chain_lock          shrink_list()
 293  *                  mm->page_table_lock try_to_unmap_one(), trylock
 294  */
 295 static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
 296 static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
 297 {
 298         pte_t *ptep = rmap_ptep_map(paddr);
 299         unsigned long address = ptep_to_address(ptep);
 300         struct mm_struct * mm = ptep_to_mm(ptep);
 301         struct vm_area_struct * vma;
 302         pte_t pte;
 303         int ret;
 304
 305         if (!mm)
 306                 BUG();
 307
 308         /*
 309          * We need the page_table_lock to protect us from page faults,
 310          * munmap, fork, etc...
 311          */
 312         if (!spin_trylock(&mm->page_table_lock)) {
 313                 rmap_ptep_unmap(ptep);
 314                 return SWAP_AGAIN;
 315         }
 316
 317
 318         /* During mremap, it's possible pages are not in a VMA. */
 319         vma = find_vma(mm, address);
 320         if (!vma) {
 321                 ret = SWAP_FAIL;
 322                 goto out_unlock;
 323         }
 324
 325         /* The page is mlock()d, we cannot swap it out. */
 326         if (vma->vm_flags & VM_LOCKED) {
 327                 ret = SWAP_FAIL;
 328                 goto out_unlock;
 329         }
 330
 331         /* Nuke the page table entry. */
 332         flush_cache_page(vma, address);
 333         pte = ptep_clear_flush(vma, address, ptep);
 334
 335         if (PageSwapCache(page)) {
 336                 /*
 337                  * Store the swap location in the pte.
 338                  * See handle_pte_fault() ...
 339                  */
 340                 swp_entry_t entry = { .val = page->index };
 341                 swap_duplicate(entry);
 342                 set_pte(ptep, swp_entry_to_pte(entry));
 343                 BUG_ON(pte_file(*ptep));
 344         } else {
 345                 unsigned long pgidx;
 346                 /*
 347                  * If a nonlinear mapping then store the file page offset
 348                  * in the pte.
 349                  */
 350                 pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
 351                 pgidx += vma->vm_pgoff;
 352                 pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 353                 if (page->index != pgidx) {
 354                         set_pte(ptep, pgoff_to_pte(page->index));
 355                         BUG_ON(!pte_file(*ptep));
 356                 }
 357         }
 358
 359         /* Move the dirty bit to the physical page now the pte is gone. */
 360         if (pte_dirty(pte))
 361                 set_page_dirty(page);
 362
 363         mm->rss--;
 364         page_cache_release(page);
 365         ret = SWAP_SUCCESS;
 366
 367 out_unlock:
 368         rmap_ptep_unmap(ptep);
 369         spin_unlock(&mm->page_table_lock);
 370         return ret;
 371 }
 372
 373 /**
 374  * try_to_unmap - try to remove all page table mappings to a page
 375  * @page: the page to get unmapped
 376  *
 377  * Tries to remove all the page table entries which are mapping this
 378  * page, used in the pageout path.  Caller must hold the page lock
 379  * and its pte chain lock.  Return values are:
 380  *
 381  * SWAP_SUCCESS - we succeeded in removing all mappings
 382  * SWAP_AGAIN   - we missed a trylock, try again later
 383  * SWAP_FAIL    - the page is unswappable
 384  */
 385 int try_to_unmap(struct page * page)
 386 {
 387         struct pte_chain *pc, *next_pc, *start;
 388         int ret = SWAP_SUCCESS;
 389         int victim_i;
 390
 391         /* This page should not be on the pageout lists. */
 392         if (PageReserved(page))
 393                 BUG();
 394         if (!PageLocked(page))
 395                 BUG();
 396         /* We need backing store to swap out a page. */
 397         if (!page->mapping)
 398                 BUG();
 399
 400         if (PageDirect(page)) {
 401                 ret = try_to_unmap_one(page, page->pte.direct);
 402                 if (ret == SWAP_SUCCESS) {
 403                         if (page_test_and_clear_dirty(page))
 404                                 set_page_dirty(page);
 405                         page->pte.direct = 0;
 406                         ClearPageDirect(page);
 407                 }
 408                 goto out;
 409         }
 410
 411         start = page->pte.chain;
 412         victim_i = pte_chain_idx(start);
 413         for (pc = start; pc; pc = next_pc) {
 414                 int i;
 415
 416                 next_pc = pte_chain_next(pc);
 417                 if (next_pc)
 418                         prefetch(next_pc);
 419                 for (i = pte_chain_idx(pc); i < NRPTE; i++) {
 420                         pte_addr_t pte_paddr = pc->ptes[i];
 421
 422                         switch (try_to_unmap_one(page, pte_paddr)) {
 423                         case SWAP_SUCCESS:
 424                                 /*
 425                                  * Release a slot.  If we're releasing the
 426                                  * first pte in the first pte_chain then
 427                                  * pc->ptes[i] and start->ptes[victim_i] both
 428                                  * refer to the same thing.  It works out.
 429                                  */
 430                                 pc->ptes[i] = start->ptes[victim_i];
 431                                 start->ptes[victim_i] = 0;
 432                                 victim_i++;
 433                                 if (victim_i == NRPTE) {
 434                                         page->pte.chain = pte_chain_next(start);
 435                                         __pte_chain_free(start);
 436                                         start = page->pte.chain;
 437                                         victim_i = 0;
 438                                 } else {
 439                                         start->next_and_idx++;
 440                                 }
 441                                 if (page->pte.direct == 0 &&
 442                                     page_test_and_clear_dirty(page))
 443                                         set_page_dirty(page);
 444                                 break;
 445                         case SWAP_AGAIN:
 446                                 /* Skip this pte, remembering status. */
 447                                 ret = SWAP_AGAIN;
 448                                 continue;
 449                         case SWAP_FAIL:
 450                                 ret = SWAP_FAIL;
 451                                 goto out;
 452                         }
 453                 }
 454         }
 455 out:
 456         if (!page_mapped(page))
 457                 dec_page_state(nr_mapped);
 458         return ret;
 459 }
 460
 461 /**
 462  ** No more VM stuff below this comment, only pte_chain helper
 463  ** functions.
 464  **/
 465
 466 static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
 467 {
 468         struct pte_chain *pc = p;
 469
 470         memset(pc, 0, sizeof(*pc));
 471 }
 472
 473 DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0;
 474
 475 /**
 476  * __pte_chain_free - free pte_chain structure
 477  * @pte_chain: pte_chain struct to free
 478  */
 479 void __pte_chain_free(struct pte_chain *pte_chain)
 480 {
 481         struct pte_chain **pte_chainp;
 482
 483         pte_chainp = &get_cpu_var(local_pte_chain);
 484         if (pte_chain->next_and_idx)
 485                 pte_chain->next_and_idx = 0;
 486         if (*pte_chainp)
 487                 kmem_cache_free(pte_chain_cache, *pte_chainp);
 488         *pte_chainp = pte_chain;
 489         put_cpu_var(local_pte_chain);
 490 }
 491
 492 /*
 493  * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
 494  *
 495  * The caller of page_add_rmap() must perform the allocation because
 496  * page_add_rmap() is invariably called under spinlock.  Often, page_add_rmap()
 497  * will not actually use the pte_chain, because there is space available in one
 498  * of the existing pte_chains which are attached to the page.  So the case of
 499  * allocating and then freeing a single pte_chain is specially optimised here,
 500  * with a one-deep per-cpu cache.
 501  */
 502 struct pte_chain *pte_chain_alloc(int gfp_flags)
 503 {
 504         struct pte_chain *ret;
 505         struct pte_chain **pte_chainp;
 506
 507         might_sleep_if(gfp_flags & __GFP_WAIT);
 508
 509         pte_chainp = &get_cpu_var(local_pte_chain);
 510         if (*pte_chainp) {
 511                 ret = *pte_chainp;
 512                 *pte_chainp = NULL;
 513                 put_cpu_var(local_pte_chain);
 514         } else {
 515                 put_cpu_var(local_pte_chain);
 516                 ret = kmem_cache_alloc(pte_chain_cache, gfp_flags);
 517         }
 518         return ret;
 519 }
 520
 521 void __init pte_chain_init(void)
 522 {
 523         pte_chain_cache = kmem_cache_create(    "pte_chain",
 524                                                 sizeof(struct pte_chain),
 525                                                 0,
 526                                                 SLAB_MUST_HWCACHE_ALIGN,
 527                                                 pte_chain_ctor,
 528                                                 NULL);
 529
 530         if (!pte_chain_cache)
 531                 panic("failed to create pte_chain cache!\n");
 532 }