/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead pte-based reverse mapping scheme.
 * This is kept modular because we may want to experiment
 * with object-based reverse mapping schemes. Please try
 * to keep this thing as modular as possible.
 *
 * Locking:
 * - the page->pte.chain is protected by the PG_chainlock bit,
 *   which nests within the zone->lru_lock, then the
 *   mm->page_table_lock, and then the page lock.
 * - because swapout locking is opposite to the locking order
 *   in the page fault path, the swapout path uses trylocks
 *   on the mm->page_table_lock
 */
24 #include <linux/pagemap.h>
25 #include <linux/swapops.h>
26 #include <linux/slab.h>
27 #include <linux/init.h>
28 #include <linux/rmap-locking.h>
29 #include <linux/cache.h>
30 #include <linux/percpu.h>
32 #include <asm/pgalloc.h>
35 #include <asm/tlbflush.h>
37 /* #define DEBUG_RMAP */
/*
 * Shared pages have a chain of pte_chain structures, used to locate
 * all the mappings to this page. We only need a pointer to the pte
 * here, the page struct for the page table page contains the process
 * it belongs to and the offset within that process.
 *
 * We use an array of pte pointers in this structure to minimise cache misses
 * while traversing reverse maps.
 */
48 #define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t))
51 struct pte_chain
*next
;
52 pte_addr_t ptes
[NRPTE
];
53 } ____cacheline_aligned
;
55 kmem_cache_t
*pte_chain_cache
;
/*
 * pte_chain list management policy:
 *
 * - If a page has a pte_chain list then it is shared by at least two processes,
 *   because a single sharing uses PageDirect. (Well, this isn't true yet,
 *   coz this code doesn't collapse singletons back to PageDirect on the remove
 *   path.)
 * - A pte_chain list has free space only in the head member - all succeeding
 *   members are 100% full.
 * - If the head element has free space, it occurs in its leading slots.
 * - All free space in the pte_chain is at the start of the head member.
 * - Insertion into the pte_chain puts a pte pointer in the last free slot of
 *   the head member.
 * - Removal from a pte chain moves the head pte of the head member onto the
 *   victim pte and frees the head member if it became empty.
 */

/*
 **** VM stuff below this comment
 */
79 * page_referenced - test if the page was referenced
80 * @page: the page to test
82 * Quick test_and_clear_referenced for all mappings to a page,
83 * returns the number of processes which referenced the page.
84 * Caller needs to hold the pte_chain_lock.
86 * If the page has a single-entry pte_chain, collapse that back to a PageDirect
87 * representation. This way, it's only done under memory pressure.
89 int page_referenced(struct page
* page
)
91 struct pte_chain
* pc
;
94 if (TestClearPageReferenced(page
))
97 if (PageDirect(page
)) {
98 pte_t
*pte
= rmap_ptep_map(page
->pte
.direct
);
99 if (ptep_test_and_clear_young(pte
))
101 rmap_ptep_unmap(pte
);
105 /* Check all the page tables mapping this page. */
106 for (pc
= page
->pte
.chain
; pc
; pc
= pc
->next
) {
109 for (i
= NRPTE
-1; i
>= 0; i
--) {
110 pte_addr_t pte_paddr
= pc
->ptes
[i
];
115 p
= rmap_ptep_map(pte_paddr
);
116 if (ptep_test_and_clear_young(p
))
122 if (nr_chains
== 1) {
123 pc
= page
->pte
.chain
;
124 page
->pte
.direct
= pc
->ptes
[NRPTE
-1];
126 pc
->ptes
[NRPTE
-1] = 0;
127 __pte_chain_free(pc
);
134 * page_add_rmap - add reverse mapping entry to a page
135 * @page: the page to add the mapping to
136 * @ptep: the page table entry mapping this page
138 * Add a new pte reverse mapping to a page.
139 * The caller needs to hold the mm->page_table_lock.
142 page_add_rmap(struct page
*page
, pte_t
*ptep
, struct pte_chain
*pte_chain
)
144 pte_addr_t pte_paddr
= ptep_to_paddr(ptep
);
145 struct pte_chain
*cur_pte_chain
;
151 if (!pte_present(*ptep
))
153 if (!ptep_to_mm(ptep
))
157 if (!pfn_valid(page_to_pfn(page
)) || PageReserved(page
))
160 pte_chain_lock(page
);
164 * This stuff needs help to get up to highmem speed.
167 struct pte_chain
* pc
;
168 if (PageDirect(page
)) {
169 if (page
->pte
.direct
== pte_paddr
)
172 for (pc
= page
->pte
.chain
; pc
; pc
= pc
->next
) {
173 for (i
= 0; i
< NRPTE
; i
++) {
174 pte_addr_t p
= pc
->ptes
[i
];
176 if (p
&& p
== pte_paddr
)
184 if (page
->pte
.direct
== 0) {
185 page
->pte
.direct
= pte_paddr
;
187 inc_page_state(nr_mapped
);
191 if (PageDirect(page
)) {
192 /* Convert a direct pointer into a pte_chain */
193 ClearPageDirect(page
);
194 pte_chain
->ptes
[NRPTE
-1] = page
->pte
.direct
;
195 pte_chain
->ptes
[NRPTE
-2] = pte_paddr
;
196 page
->pte
.direct
= 0;
197 page
->pte
.chain
= pte_chain
;
198 pte_chain
= NULL
; /* We consumed it */
202 cur_pte_chain
= page
->pte
.chain
;
203 if (cur_pte_chain
->ptes
[0]) { /* It's full */
204 pte_chain
->next
= cur_pte_chain
;
205 page
->pte
.chain
= pte_chain
;
206 pte_chain
->ptes
[NRPTE
-1] = pte_paddr
;
207 pte_chain
= NULL
; /* We consumed it */
211 BUG_ON(!cur_pte_chain
->ptes
[NRPTE
-1]);
213 for (i
= NRPTE
-2; i
>= 0; i
--) {
214 if (!cur_pte_chain
->ptes
[i
]) {
215 cur_pte_chain
->ptes
[i
] = pte_paddr
;
221 pte_chain_unlock(page
);
222 inc_page_state(nr_reverse_maps
);
227 * page_remove_rmap - take down reverse mapping to a page
228 * @page: page to remove mapping from
229 * @ptep: page table entry to remove
231 * Removes the reverse mapping from the pte_chain of the page,
232 * after that the caller can clear the page table entry and free
234 * Caller needs to hold the mm->page_table_lock.
236 void page_remove_rmap(struct page
* page
, pte_t
* ptep
)
238 pte_addr_t pte_paddr
= ptep_to_paddr(ptep
);
239 struct pte_chain
*pc
;
243 if (!pfn_valid(page_to_pfn(page
)) || PageReserved(page
))
245 if (!page_mapped(page
))
246 return; /* remap_page_range() from a driver? */
248 pte_chain_lock(page
);
250 if (PageDirect(page
)) {
251 if (page
->pte
.direct
== pte_paddr
) {
252 page
->pte
.direct
= 0;
253 dec_page_state(nr_reverse_maps
);
254 ClearPageDirect(page
);
258 struct pte_chain
*start
= page
->pte
.chain
;
261 for (pc
= start
; pc
; pc
= pc
->next
) {
266 for (i
= 0; i
< NRPTE
; i
++) {
267 pte_addr_t pa
= pc
->ptes
[i
];
275 pc
->ptes
[i
] = start
->ptes
[victim_i
];
276 dec_page_state(nr_reverse_maps
);
277 start
->ptes
[victim_i
] = 0;
278 if (victim_i
== NRPTE
-1) {
279 /* Emptied a pte_chain */
280 page
->pte
.chain
= start
->next
;
281 __pte_chain_free(start
);
283 /* Do singleton->PageDirect here */
290 /* Not found. This should NEVER happen! */
291 printk(KERN_ERR
"page_remove_rmap: pte_chain %p not present.\n", ptep
);
292 printk(KERN_ERR
"page_remove_rmap: only found: ");
293 if (PageDirect(page
)) {
294 printk("%llx", (u64
)page
->pte
.direct
);
296 for (pc
= page
->pte
.chain
; pc
; pc
= pc
->next
) {
298 for (i
= 0; i
< NRPTE
; i
++)
299 printk(" %d:%llx", i
, (u64
)pc
->ptes
[i
]);
303 printk(KERN_ERR
"page_remove_rmap: driver cleared PG_reserved ?\n");
307 pte_chain_unlock(page
);
308 if (!page_mapped(page
))
309 dec_page_state(nr_mapped
);
314 * try_to_unmap_one - worker function for try_to_unmap
315 * @page: page to unmap
316 * @ptep: page table entry to unmap from page
318 * Internal helper function for try_to_unmap, called for each page
319 * table entry mapping a page. Because locking order here is opposite
320 * to the locking order used by the page fault path, we use trylocks.
322 * zone->lru_lock page_launder()
323 * page lock page_launder(), trylock
324 * pte_chain_lock page_launder()
325 * mm->page_table_lock try_to_unmap_one(), trylock
327 static int FASTCALL(try_to_unmap_one(struct page
*, pte_addr_t
));
328 static int try_to_unmap_one(struct page
* page
, pte_addr_t paddr
)
330 pte_t
*ptep
= rmap_ptep_map(paddr
);
331 unsigned long address
= ptep_to_address(ptep
);
332 struct mm_struct
* mm
= ptep_to_mm(ptep
);
333 struct vm_area_struct
* vma
;
341 * We need the page_table_lock to protect us from page faults,
342 * munmap, fork, etc...
344 if (!spin_trylock(&mm
->page_table_lock
)) {
345 rmap_ptep_unmap(ptep
);
350 /* During mremap, it's possible pages are not in a VMA. */
351 vma
= find_vma(mm
, address
);
357 /* The page is mlock()d, we cannot swap it out. */
358 if (vma
->vm_flags
& VM_LOCKED
) {
363 /* Nuke the page table entry. */
364 flush_cache_page(vma
, address
);
365 pte
= ptep_get_and_clear(ptep
);
366 flush_tlb_page(vma
, address
);
368 /* Store the swap location in the pte. See handle_pte_fault() ... */
369 if (PageSwapCache(page
)) {
370 swp_entry_t entry
= { .val
= page
->index
};
371 swap_duplicate(entry
);
372 set_pte(ptep
, swp_entry_to_pte(entry
));
375 /* Move the dirty bit to the physical page now the pte is gone. */
377 set_page_dirty(page
);
380 page_cache_release(page
);
384 rmap_ptep_unmap(ptep
);
385 spin_unlock(&mm
->page_table_lock
);
390 * try_to_unmap - try to remove all page table mappings to a page
391 * @page: the page to get unmapped
393 * Tries to remove all the page table entries which are mapping this
394 * page, used in the pageout path. Caller must hold zone->lru_lock
395 * and the page lock. Return values are:
397 * SWAP_SUCCESS - we succeeded in removing all mappings
398 * SWAP_AGAIN - we missed a trylock, try again later
399 * SWAP_FAIL - the page is unswappable
400 * SWAP_ERROR - an error occurred
402 int try_to_unmap(struct page
* page
)
404 struct pte_chain
*pc
, *next_pc
, *start
;
405 int ret
= SWAP_SUCCESS
;
408 /* This page should not be on the pageout lists. */
409 if (PageReserved(page
))
411 if (!PageLocked(page
))
413 /* We need backing store to swap out a page. */
417 if (PageDirect(page
)) {
418 ret
= try_to_unmap_one(page
, page
->pte
.direct
);
419 if (ret
== SWAP_SUCCESS
) {
420 page
->pte
.direct
= 0;
421 dec_page_state(nr_reverse_maps
);
422 ClearPageDirect(page
);
427 start
= page
->pte
.chain
;
428 for (pc
= start
; pc
; pc
= next_pc
) {
434 for (i
= 0; i
< NRPTE
; i
++) {
435 pte_addr_t pte_paddr
= pc
->ptes
[i
];
442 switch (try_to_unmap_one(page
, pte_paddr
)) {
445 * Release a slot. If we're releasing the
446 * first pte in the first pte_chain then
447 * pc->ptes[i] and start->ptes[victim_i] both
448 * refer to the same thing. It works out.
450 pc
->ptes
[i
] = start
->ptes
[victim_i
];
451 start
->ptes
[victim_i
] = 0;
452 dec_page_state(nr_reverse_maps
);
454 if (victim_i
== NRPTE
) {
455 page
->pte
.chain
= start
->next
;
456 __pte_chain_free(start
);
457 start
= page
->pte
.chain
;
462 /* Skip this pte, remembering status. */
475 if (!page_mapped(page
))
476 dec_page_state(nr_mapped
);
/*
 **** No more VM stuff below this comment, only pte_chain helper
 **** functions.
 */
485 static void pte_chain_ctor(void *p
, kmem_cache_t
*cachep
, unsigned long flags
)
487 struct pte_chain
*pc
= p
;
489 memset(pc
, 0, sizeof(*pc
));
492 DEFINE_PER_CPU(struct pte_chain
*, local_pte_chain
) = 0;
495 * __pte_chain_free - free pte_chain structure
496 * @pte_chain: pte_chain struct to free
498 void __pte_chain_free(struct pte_chain
*pte_chain
)
501 struct pte_chain
**pte_chainp
;
504 pte_chain
->next
= NULL
;
505 pte_chainp
= &per_cpu(local_pte_chain
, cpu
);
507 kmem_cache_free(pte_chain_cache
, *pte_chainp
);
508 *pte_chainp
= pte_chain
;
513 * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
515 * The caller of page_add_rmap() must perform the allocation because
516 * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap()
517 * will not actually use the pte_chain, because there is space available in one
518 * of the existing pte_chains which are attached to the page. So the case of
519 * allocating and then freeing a single pte_chain is specially optimised here,
520 * with a one-deep per-cpu cache.
522 struct pte_chain
*pte_chain_alloc(int gfp_flags
)
525 struct pte_chain
*ret
;
526 struct pte_chain
**pte_chainp
;
528 if (gfp_flags
& __GFP_WAIT
)
532 pte_chainp
= &per_cpu(local_pte_chain
, cpu
);
539 ret
= kmem_cache_alloc(pte_chain_cache
, gfp_flags
);
544 void __init
pte_chain_init(void)
546 pte_chain_cache
= kmem_cache_create( "pte_chain",
547 sizeof(struct pte_chain
),
553 if (!pte_chain_cache
)
554 panic("failed to create pte_chain cache!\n");