/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead pte-based reverse mapping scheme.
 * This is kept modular because we may want to experiment
 * with object-based reverse mapping schemes. Please try
 * to keep this thing as modular as possible.
 */

/*
 * Locking:
 * - the page->pte.chain is protected by the PG_chainlock bit,
 *   which nests within the mm->page_table_lock,
 *   which nests within the page lock.
 * - because swapout locking is opposite to the locking order
 *   in the page fault path, the swapout path uses trylocks
 *   on the mm->page_table_lock
 */
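/*
 * Illustrative sketch (not part of the original file): under the ordering
 * above, the page fault path can take the locks straight:
 *
 *	spin_lock(&mm->page_table_lock);
 *	pte_chain_lock(page);
 *	... manipulate page->pte.direct / page->pte.chain ...
 *	pte_chain_unlock(page);
 *	spin_unlock(&mm->page_table_lock);
 *
 * The swapout path already holds the page lock and the pte_chain lock, so
 * try_to_unmap_one() below must use spin_trylock() on mm->page_table_lock
 * and report SWAP_AGAIN if the lock is contended.
 */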
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap-locking.h>
#include <linux/cache.h>
#include <linux/percpu.h>

#include <asm/pgalloc.h>
#include <asm/rmap.h>
#include <asm/tlbflush.h>

/* #define DEBUG_RMAP */
/*
 * Shared pages have a chain of pte_chain structures, used to locate
 * all the mappings to this page. We only need a pointer to the pte
 * here, the page struct for the page table page contains the process
 * it belongs to and the offset within that process.
 *
 * We use an array of pte pointers in this structure to minimise cache misses
 * while traversing reverse maps.
 */
#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
/*
 * next_and_idx encodes both the address of the next pte_chain and the
 * offset of the lowest-index used pte in ptes[].
 */
struct pte_chain {
	unsigned long next_and_idx;
	pte_addr_t ptes[NRPTE];
} ____cacheline_aligned;

kmem_cache_t *pte_chain_cache;
static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
{
	return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
}

static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
{
	return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
}

static inline int pte_chain_idx(struct pte_chain *pte_chain)
{
	return pte_chain->next_and_idx & NRPTE;
}

static inline unsigned long
pte_chain_encode(struct pte_chain *pte_chain, int idx)
{
	return (unsigned long)pte_chain | idx;
}
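/*
 * Illustrative sketch, not part of the original file: struct pte_chain is
 * ____cacheline_aligned and, for the configurations this code targets,
 * NRPTE works out to a value of the form 2^n - 1, so the low bits of a
 * pte_chain pointer are always zero and a pointer plus an index in
 * [0, NRPTE] can share one unsigned long.  The demo function below is
 * hypothetical and only built when DEBUG_RMAP is defined.
 */
#ifdef DEBUG_RMAP
static void pte_chain_encode_demo(struct pte_chain *pc)
{
	unsigned long packed = pte_chain_encode(pc, 3);

	BUG_ON(pte_chain_ptr(packed) != pc);	/* the pointer survives */
	BUG_ON((packed & NRPTE) != 3);		/* and so does the index */
}
#endif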
/*
 * pte_chain list management policy:
 *
 * - If a page has a pte_chain list then it is shared by at least two processes,
 *   because a single sharing uses PageDirect. (Well, this isn't true yet,
 *   coz this code doesn't collapse singletons back to PageDirect on the remove
 *   path).
 * - A pte_chain list has free space only in the head member - all succeeding
 *   members are 100% full.
 * - If the head element has free space, it occurs in its leading slots.
 * - All free space in the pte_chain is at the start of the head member.
 * - Insertion into the pte_chain puts a pte pointer in the last free slot of
 *   the head member.
 * - Removal from a pte chain moves the head pte of the head member onto the
 *   victim pte and frees the head member if it became empty.
 *   (A worked example of this layout follows below.)
 */
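/*
 * Worked example (not part of the original file), assuming NRPTE == 7:
 * a page with nine mappings has a two-member chain.  The older member is
 * 100% full; the head member holds the two newest ptes in its trailing
 * slots and records its lowest used index (5) in next_and_idx:
 *
 *	page->pte.chain -> { next_and_idx = (full member) | 5,
 *	                     ptes = { 0, 0, 0, 0, 0, p9, p8 } }
 *	full member     -> { next_and_idx = 0,
 *	                     ptes = { p7, p6, p5, p4, p3, p2, p1 } }
 *
 * The next insertion goes into ptes[4] of the head and decrements its
 * index to 4.  A removal copies the head's ptes[5] over whichever slot
 * held the departing pte, clears ptes[5] and increments the index,
 * freeing the head member once it empties.
 */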
/**
 ** VM stuff below this comment
 **/
/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of processes which referenced the page.
 * Caller needs to hold the pte_chain_lock.
 *
 * If the page has a single-entry pte_chain, collapse that back to a PageDirect
 * representation. This way, it's only done under memory pressure.
 */
int page_referenced(struct page * page)
{
	struct pte_chain *pc;
	int referenced = 0;

	if (TestClearPageReferenced(page))
		referenced++;

	if (PageDirect(page)) {
		pte_t *pte = rmap_ptep_map(page->pte.direct);
		if (ptep_test_and_clear_young(pte))
			referenced++;
		rmap_ptep_unmap(pte);
	} else {
		int nr_chains = 0;

		/* Check all the page tables mapping this page. */
		for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
			int i;

			for (i = NRPTE-1; i >= 0; i--) {
				pte_addr_t pte_paddr = pc->ptes[i];
				pte_t *p;

				if (!pte_paddr)
					break;
				p = rmap_ptep_map(pte_paddr);
				if (ptep_test_and_clear_young(p))
					referenced++;
				rmap_ptep_unmap(p);
				nr_chains++;
			}
		}
		if (nr_chains == 1) {
			pc = page->pte.chain;
			page->pte.direct = pc->ptes[NRPTE-1];
			SetPageDirect(page);
			pc->ptes[NRPTE-1] = 0;
			__pte_chain_free(pc);
		}
	}
	return referenced;
}
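/*
 * Illustrative sketch, not part of the original file: the expected caller
 * pattern, modelled on the page reclaim path.  The pte_chain lock is taken
 * around the call and a non-zero return means the page was recently used.
 * The helper name is hypothetical and it is only built with DEBUG_RMAP.
 */
#ifdef DEBUG_RMAP
static int page_recently_used(struct page *page)
{
	int referenced;

	pte_chain_lock(page);
	referenced = page_referenced(page);
	pte_chain_unlock(page);

	return referenced != 0;
}
#endif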
/**
 * page_add_rmap - add reverse mapping entry to a page
 * @page: the page to add the mapping to
 * @ptep: the page table entry mapping this page
 *
 * Add a new pte reverse mapping to a page.
 * The caller needs to hold the mm->page_table_lock.
 */
struct pte_chain *
page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
{
	pte_addr_t pte_paddr = ptep_to_paddr(ptep);
	struct pte_chain *cur_pte_chain;

	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
		return pte_chain;

	pte_chain_lock(page);

	if (page->pte.direct == 0) {
		page->pte.direct = pte_paddr;
		SetPageDirect(page);
		inc_page_state(nr_mapped);
		goto out;
	}

	if (PageDirect(page)) {
		/* Convert a direct pointer into a pte_chain */
		ClearPageDirect(page);
		pte_chain->ptes[NRPTE-1] = page->pte.direct;
		pte_chain->ptes[NRPTE-2] = pte_paddr;
		pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2);
		page->pte.direct = 0;
		page->pte.chain = pte_chain;
		pte_chain = NULL;	/* We consumed it */
		goto out;
	}

	cur_pte_chain = page->pte.chain;
	if (cur_pte_chain->ptes[0]) {	/* It's full */
		pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain,
							NRPTE - 1);
		page->pte.chain = pte_chain;
		pte_chain->ptes[NRPTE-1] = pte_paddr;
		pte_chain = NULL;	/* We consumed it */
		goto out;
	}
	cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
	cur_pte_chain->next_and_idx--;
out:
	pte_chain_unlock(page);
	return pte_chain;
}
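/*
 * Illustrative sketch, not part of the original file: roughly how a fault
 * handler is expected to use page_add_rmap().  The pte_chain is allocated
 * before mm->page_table_lock is taken (allocation may sleep); if the new
 * mapping fits into an existing chain member, the pre-allocated structure
 * comes straight back and the caller frees it afterwards:
 *
 *	pte_chain = pte_chain_alloc(GFP_KERNEL);
 *	spin_lock(&mm->page_table_lock);
 *	set_pte(ptep, entry);
 *	pte_chain = page_add_rmap(page, ptep, pte_chain);
 *	spin_unlock(&mm->page_table_lock);
 *	if (pte_chain)
 *		__pte_chain_free(pte_chain);	/- unconsumed, give it back -/
 */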
/**
 * page_remove_rmap - take down reverse mapping to a page
 * @page: page to remove mapping from
 * @ptep: page table entry to remove
 *
 * Removes the reverse mapping from the pte_chain of the page,
 * after that the caller can clear the page table entry and free
 * the page.
 * Caller needs to hold the mm->page_table_lock.
 */
void page_remove_rmap(struct page *page, pte_t *ptep)
{
	pte_addr_t pte_paddr = ptep_to_paddr(ptep);
	struct pte_chain *pc;

	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
		return;

	pte_chain_lock(page);

	if (!page_mapped(page))
		goto out_unlock;	/* remap_page_range() from a driver? */

	if (PageDirect(page)) {
		if (page->pte.direct == pte_paddr) {
			page->pte.direct = 0;
			ClearPageDirect(page);
			goto out;
		}
	} else {
		struct pte_chain *start = page->pte.chain;
		struct pte_chain *next;
		int victim_i = -1;

		for (pc = start; pc; pc = next) {
			int i;

			next = pte_chain_next(pc);
			for (i = pte_chain_idx(pc); i < NRPTE; i++) {
				pte_addr_t pa = pc->ptes[i];

				if (victim_i == -1)
					victim_i = i;
				if (pa != pte_paddr)
					continue;
				pc->ptes[i] = start->ptes[victim_i];
				start->ptes[victim_i] = 0;
				if (victim_i == NRPTE-1) {
					/* Emptied a pte_chain */
					page->pte.chain = pte_chain_next(start);
					__pte_chain_free(start);
				} else {
					start->next_and_idx++;
				}
				goto out;
			}
		}
	}
out:
	if (!page_mapped(page))
		dec_page_state(nr_mapped);
out_unlock:
	pte_chain_unlock(page);
}
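/*
 * Illustrative sketch, not part of the original file: the teardown order
 * the comment above describes when a mapping goes away (simplified, under
 * mm->page_table_lock):
 *
 *	page_remove_rmap(page, ptep);	/- drop the reverse mapping first -/
 *	pte_clear(ptep);		/- then clear the page table entry -/
 *	page_cache_release(page);	/- and release the page -/
 */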
/**
 * try_to_unmap_one - worker function for try_to_unmap
 * @page: page to unmap
 * @ptep: page table entry to unmap from page
 *
 * Internal helper function for try_to_unmap, called for each page
 * table entry mapping a page. Because locking order here is opposite
 * to the locking order used by the page fault path, we use trylocks.
 * Locking:
 *	page lock			shrink_list(), trylock
 *	pte_chain_lock			shrink_list()
 *	mm->page_table_lock		try_to_unmap_one(), trylock
 */
static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
{
	pte_t *ptep = rmap_ptep_map(paddr);
	unsigned long address = ptep_to_address(ptep);
	struct mm_struct * mm = ptep_to_mm(ptep);
	struct vm_area_struct * vma;
	pte_t pte;
	int ret;

	/*
	 * We need the page_table_lock to protect us from page faults,
	 * munmap, fork, etc...
	 */
	if (!spin_trylock(&mm->page_table_lock)) {
		rmap_ptep_unmap(ptep);
		return SWAP_AGAIN;
	}

	/* During mremap, it's possible pages are not in a VMA. */
	vma = find_vma(mm, address);
	if (!vma) {
		ret = SWAP_FAIL;
		goto out_unlock;
	}

	/* The page is mlock()d, we cannot swap it out. */
	if (vma->vm_flags & VM_LOCKED) {
		ret = SWAP_FAIL;
		goto out_unlock;
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address);
	pte = ptep_get_and_clear(ptep);
	flush_tlb_page(vma, address);

	if (PageSwapCache(page)) {
		/*
		 * Store the swap location in the pte.
		 * See handle_pte_fault() ...
		 */
		swp_entry_t entry = { .val = page->index };
		swap_duplicate(entry);
		set_pte(ptep, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*ptep));
	} else {
		unsigned long pgidx;
		/*
		 * If a nonlinear mapping then store the file page offset
		 * in the pte.
		 */
		pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
		pgidx += vma->vm_pgoff;
		pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
		if (page->index != pgidx) {
			set_pte(ptep, pgoff_to_pte(page->index));
			BUG_ON(!pte_file(*ptep));
		}
	}

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pte))
		set_page_dirty(page);

	mm->rss--;
	page_cache_release(page);
	ret = SWAP_SUCCESS;

out_unlock:
	rmap_ptep_unmap(ptep);
	spin_unlock(&mm->page_table_lock);
	return ret;
}
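/*
 * Worked example (not part of the original file) for the nonlinear check
 * above, assuming PAGE_CACHE_SHIFT == PAGE_SHIFT: a pte three pages past
 * vma->vm_start in a VMA with vm_pgoff == 10 yields pgidx == 13.  If
 * page->index is also 13 the mapping is linear and nothing needs to be
 * kept in the cleared pte, since the file offset can be recomputed from
 * the VMA on the next fault; otherwise the offset is preserved in the pte
 * with pgoff_to_pte() so the nonlinear mapping can be re-established.
 */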
/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock
 * and its pte chain lock.  Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a trylock, try again later
 * SWAP_FAIL	- the page is unswappable
 */
int try_to_unmap(struct page * page)
{
	struct pte_chain *pc, *next_pc, *start;
	int ret = SWAP_SUCCESS;
	int victim_i = -1;

	/* This page should not be on the pageout lists. */
	if (PageReserved(page))
		BUG();
	if (!PageLocked(page))
		BUG();
	/* We need backing store to swap out a page. */
	if (!page->mapping)
		BUG();

	if (PageDirect(page)) {
		ret = try_to_unmap_one(page, page->pte.direct);
		if (ret == SWAP_SUCCESS) {
			page->pte.direct = 0;
			ClearPageDirect(page);
		}
		goto out;
	}

	start = page->pte.chain;
	for (pc = start; pc; pc = next_pc) {
		int i;

		next_pc = pte_chain_next(pc);
		for (i = pte_chain_idx(pc); i < NRPTE; i++) {
			pte_addr_t pte_paddr = pc->ptes[i];

			if (!pte_paddr)
				continue;
			if (victim_i == -1)
				victim_i = i;

			switch (try_to_unmap_one(page, pte_paddr)) {
			case SWAP_SUCCESS:
				/*
				 * Release a slot. If we're releasing the
				 * first pte in the first pte_chain then
				 * pc->ptes[i] and start->ptes[victim_i] both
				 * refer to the same thing. It works out.
				 */
				pc->ptes[i] = start->ptes[victim_i];
				start->ptes[victim_i] = 0;
				victim_i++;
				if (victim_i == NRPTE) {
					page->pte.chain = pte_chain_next(start);
					__pte_chain_free(start);
					start = page->pte.chain;
					victim_i = 0;
				} else {
					start->next_and_idx++;
				}
				break;
			case SWAP_AGAIN:
				/* Skip this pte, remembering status. */
				ret = SWAP_AGAIN;
				continue;
			case SWAP_FAIL:
				ret = SWAP_FAIL;
				goto out;
			}
		}
	}
out:
	if (!page_mapped(page))
		dec_page_state(nr_mapped);
	return ret;
}
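/*
 * Illustrative sketch, not part of the original file: how the pageout path
 * is expected to dispatch on the return value (simplified; the page lock
 * and the pte_chain lock are held around the call).  The label names are
 * hypothetical:
 *
 *	switch (try_to_unmap(page)) {
 *	case SWAP_FAIL:		unswappable, put it back on the active list
 *		goto activate;
 *	case SWAP_AGAIN:	missed a trylock, revisit the page later
 *		goto keep;
 *	case SWAP_SUCCESS:
 *		break;		all mappings gone, go on to write/free it
 *	}
 */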
/**
 ** No more VM stuff below this comment, only pte_chain helper
 ** functions.
 **/

static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
{
	struct pte_chain *pc = p;

	memset(pc, 0, sizeof(*pc));
}

DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0;

/**
 * __pte_chain_free - free pte_chain structure
 * @pte_chain: pte_chain struct to free
 */
void __pte_chain_free(struct pte_chain *pte_chain)
{
	struct pte_chain **pte_chainp;

	pte_chainp = &get_cpu_var(local_pte_chain);
	if (pte_chain->next_and_idx)
		pte_chain->next_and_idx = 0;
	if (*pte_chainp)
		kmem_cache_free(pte_chain_cache, *pte_chainp);
	*pte_chainp = pte_chain;
	put_cpu_var(local_pte_chain);
}
/*
 * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
 *
 * The caller of page_add_rmap() must perform the allocation because
 * page_add_rmap() is invariably called under spinlock.  Often, page_add_rmap()
 * will not actually use the pte_chain, because there is space available in one
 * of the existing pte_chains which are attached to the page.  So the case of
 * allocating and then freeing a single pte_chain is specially optimised here,
 * with a one-deep per-cpu cache.
 */
struct pte_chain *pte_chain_alloc(int gfp_flags)
{
	struct pte_chain *ret;
	struct pte_chain **pte_chainp;

	if (gfp_flags & __GFP_WAIT)
		might_sleep();

	pte_chainp = &get_cpu_var(local_pte_chain);
	if (*pte_chainp) {
		ret = *pte_chainp;
		*pte_chainp = NULL;
		put_cpu_var(local_pte_chain);
	} else {
		put_cpu_var(local_pte_chain);
		ret = kmem_cache_alloc(pte_chain_cache, gfp_flags);
	}
	return ret;
}

void __init pte_chain_init(void)
{
	pte_chain_cache = kmem_cache_create("pte_chain",
						sizeof(struct pte_chain),
						0,
						SLAB_MUST_HWCACHE_ALIGN,
						pte_chain_ctor,
						NULL);

	if (!pte_chain_cache)
		panic("failed to create pte_chain cache!\n");
}
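/*
 * Illustrative sketch, not part of the original file: the effect of the
 * one-deep per-cpu cache.  Freeing a pte_chain parks it in local_pte_chain,
 * so an allocation that follows soon after on the same CPU is typically
 * satisfied without touching the slab allocator.  The function name is
 * hypothetical and it is only built with DEBUG_RMAP defined.
 */
#ifdef DEBUG_RMAP
static void pte_chain_cache_demo(void)
{
	struct pte_chain *pc = pte_chain_alloc(GFP_KERNEL);

	if (!pc)
		return;
	__pte_chain_free(pc);			/* parked in local_pte_chain */
	pc = pte_chain_alloc(GFP_ATOMIC);	/* typically reuses the parked entry */
	__pte_chain_free(pc);
}
#endif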