/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead pte-based reverse mapping scheme.
 * This is kept modular because we may want to experiment
 * with object-based reverse mapping schemes. Please try
 * to keep this thing as modular as possible.
 *
 * Locking:
 * - the page->pte.chain is protected by the PG_chainlock bit,
 *   which nests within the mm->page_table_lock,
 *   which nests within the page lock.
 * - because swapout locking is opposite to the locking order
 *   in the page fault path, the swapout path uses trylocks
 *   on the mm->page_table_lock
 */
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap-locking.h>
#include <linux/cache.h>
#include <linux/percpu.h>

#include <asm/pgalloc.h>
#include <asm/rmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

/* #define DEBUG_RMAP */
/*
 * Shared pages have a chain of pte_chain structures, used to locate
 * all the mappings to this page. We only need a pointer to the pte
 * here, the page struct for the page table page contains the process
 * it belongs to and the offset within that process.
 *
 * We use an array of pte pointers in this structure to minimise cache misses
 * while traversing reverse maps.
 */
#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
/*
 * next_and_idx encodes both the address of the next pte_chain and the
 * offset of the lowest-index used pte in ptes[] (which is equal also
 * to the offset of the highest-index unused pte in ptes[], plus 1).
 */
struct pte_chain {
        unsigned long next_and_idx;
        pte_addr_t ptes[NRPTE];
} ____cacheline_aligned;
kmem_cache_t *pte_chain_cache;

static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
{
        return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
}

static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
{
        return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
}

static inline int pte_chain_idx(struct pte_chain *pte_chain)
{
        return pte_chain->next_and_idx & NRPTE;
}

static inline unsigned long
pte_chain_encode(struct pte_chain *pte_chain, int idx)
{
        return (unsigned long)pte_chain | idx;
}
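
/*
 * Worked example of the encoding (illustrative numbers): with 64-byte
 * cache lines and a 4-byte pte_addr_t, NRPTE = (64 - 4)/4 = 15.
 * struct pte_chain is cacheline aligned, so the low four bits of its
 * address are always zero, and idx (at most NRPTE == 0xf) fits exactly
 * in those bits.  pte_chain_encode() can therefore pack pointer and
 * index into one word, and the ~NRPTE/NRPTE masks above split them
 * apart again.  Note the trick relies on NRPTE being one less than a
 * power of two, which holds for the usual cache line and pte_addr_t
 * sizes.
 */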
/*
 * pte_chain list management policy:
 *
 * - If a page has a pte_chain list then it is shared by at least two processes,
 *   because a single sharing uses PageDirect. (Well, this isn't true yet,
 *   because this code doesn't collapse singletons back to PageDirect on the
 *   remove path).
 * - A pte_chain list has free space only in the head member - all succeeding
 *   members are 100% full.
 * - If the head element has free space, it occurs in its leading slots.
 * - All free space in the pte_chain is at the start of the head member.
 * - Insertion into the pte_chain puts a pte pointer in the last free slot of
 *   the head member.
 * - Removal from a pte chain moves the head pte of the head member onto the
 *   victim pte and frees the head member if it became empty.
 */
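
/*
 * Illustration of the invariants above (assuming NRPTE == 15): a page
 * mapped 20 times might look like
 *
 *   page->pte.chain -> [ -  -  -  -  -  -  -  -  -  -  p  p  p  p  p ]
 *                        (head: idx == 10, five used slots at the end)
 *                   -> [ p  p  p  p  p  p  p  p  p  p  p  p  p  p  p ]
 *                        (tail member: 100% full)
 *
 * where '-' is a free slot and 'p' a pte address.  The next insertion
 * lands in slot 9 of the head; a removal copies the head's slot 10
 * over the victim slot and bumps the head's idx to 11.
 */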
/**
 ** VM stuff below this comment
 **/

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of processes which referenced the page.
 * Caller needs to hold the pte_chain_lock.
 *
 * If the page has a single-entry pte_chain, collapse that back to a PageDirect
 * representation.  This way, it's only done under memory pressure.
 */
int page_referenced(struct page * page)
{
        struct pte_chain *pc;
        int referenced = 0;

        if (page_test_and_clear_young(page))
                mark_page_accessed(page);

        if (TestClearPageReferenced(page))
                referenced++;

        if (PageDirect(page)) {
                pte_t *pte = rmap_ptep_map(page->pte.direct);
                if (ptep_test_and_clear_young(pte))
                        referenced++;
                rmap_ptep_unmap(pte);
        } else {
                int nr_chains = 0;

                /* Check all the page tables mapping this page. */
                for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
                        int i;

                        for (i = pte_chain_idx(pc); i < NRPTE; i++) {
                                pte_addr_t pte_paddr = pc->ptes[i];
                                pte_t *p;

                                p = rmap_ptep_map(pte_paddr);
                                if (ptep_test_and_clear_young(p))
                                        referenced++;
                                rmap_ptep_unmap(p);
                                nr_chains++;
                        }
                }
                if (nr_chains == 1) {
                        pc = page->pte.chain;
                        page->pte.direct = pc->ptes[NRPTE-1];
                        SetPageDirect(page);
                        pc->ptes[NRPTE-1] = 0;
                        __pte_chain_free(pc);
                }
        }
        return referenced;
}
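
/*
 * The expected caller here is the reclaim path (shrink_list() and
 * friends in mm/vmscan.c), which holds pte_chain_lock(page) around the
 * call and uses the returned count to judge how actively the page is
 * being used.
 */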
/**
 * page_add_rmap - add reverse mapping entry to a page
 * @page: the page to add the mapping to
 * @ptep: the page table entry mapping this page
 * @pte_chain: a pte_chain preallocated by the caller, in case one is needed
 *
 * Add a new pte reverse mapping to a page.
 * The caller needs to hold the mm->page_table_lock.
 */
struct pte_chain *
page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
{
        pte_addr_t pte_paddr = ptep_to_paddr(ptep);
        struct pte_chain *cur_pte_chain;

        if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
                return pte_chain;

        pte_chain_lock(page);

        if (page->pte.direct == 0) {
                page->pte.direct = pte_paddr;
                SetPageDirect(page);
                inc_page_state(nr_mapped);
                goto out;
        }

        if (PageDirect(page)) {
                /* Convert a direct pointer into a pte_chain */
                ClearPageDirect(page);
                pte_chain->ptes[NRPTE-1] = page->pte.direct;
                pte_chain->ptes[NRPTE-2] = pte_paddr;
                pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2);
                page->pte.direct = 0;
                page->pte.chain = pte_chain;
                pte_chain = NULL;       /* We consumed it */
                goto out;
        }

        cur_pte_chain = page->pte.chain;
        if (cur_pte_chain->ptes[0]) {   /* It's full */
                pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain,
                                                           NRPTE - 1);
                page->pte.chain = pte_chain;
                pte_chain->ptes[NRPTE-1] = pte_paddr;
                pte_chain = NULL;       /* We consumed it */
                goto out;
        }
        cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
        cur_pte_chain->next_and_idx--;
out:
        pte_chain_unlock(page);
        return pte_chain;
}
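
/*
 * A sketch of the calling convention (illustrative only; the real
 * callers live in mm/memory.c and differ in detail): the pte_chain is
 * allocated before the spinlock is taken, and freed afterwards if
 * page_add_rmap() did not consume it.
 *
 *      struct pte_chain *pte_chain = pte_chain_alloc(GFP_KERNEL);
 *
 *      spin_lock(&mm->page_table_lock);
 *      ... install the pte ...
 *      pte_chain = page_add_rmap(page, ptep, pte_chain);
 *      spin_unlock(&mm->page_table_lock);
 *      pte_chain_free(pte_chain);      (a NULL-tolerant wrapper around
 *                                       __pte_chain_free, in rmap-locking.h)
 */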
/**
 * page_remove_rmap - take down reverse mapping to a page
 * @page: page to remove mapping from
 * @ptep: page table entry to remove
 *
 * Removes the reverse mapping from the pte_chain of the page,
 * after that the caller can clear the page table entry and free
 * the page.
 * Caller needs to hold the mm->page_table_lock.
 */
void page_remove_rmap(struct page *page, pte_t *ptep)
{
        pte_addr_t pte_paddr = ptep_to_paddr(ptep);
        struct pte_chain *pc;

        if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
                return;

        pte_chain_lock(page);

        if (!page_mapped(page))
                goto out_unlock;        /* remap_page_range() from a driver? */

        if (PageDirect(page)) {
                if (page->pte.direct == pte_paddr) {
                        page->pte.direct = 0;
                        ClearPageDirect(page);
                        goto out;
                }
        } else {
                struct pte_chain *start = page->pte.chain;
                struct pte_chain *next;
                int victim_i = pte_chain_idx(start);

                for (pc = start; pc; pc = next) {
                        int i;

                        next = pte_chain_next(pc);
                        if (next)
                                prefetch(next);
                        for (i = pte_chain_idx(pc); i < NRPTE; i++) {
                                pte_addr_t pa = pc->ptes[i];

                                if (pa != pte_paddr)
                                        continue;
                                pc->ptes[i] = start->ptes[victim_i];
                                start->ptes[victim_i] = 0;
                                if (victim_i == NRPTE-1) {
                                        /* Emptied a pte_chain */
                                        page->pte.chain = pte_chain_next(start);
                                        __pte_chain_free(start);
                                } else {
                                        start->next_and_idx++;
                                }
                                goto out;
                        }
                }
        }
out:
        if (page->pte.direct == 0 && page_test_and_clear_dirty(page))
                set_page_dirty(page);
        if (!page_mapped(page))
                dec_page_state(nr_mapped);
out_unlock:
        pte_chain_unlock(page);
        return;
}
/**
 * try_to_unmap_one - worker function for try_to_unmap
 * @page: page to unmap
 * @paddr: page table entry to unmap from page
 *
 * Internal helper function for try_to_unmap, called for each page
 * table entry mapping a page. Because locking order here is opposite
 * to the locking order used by the page fault path, we use trylocks.
 * Locking:
 *      page lock                       shrink_list(), trylock
 *      pte_chain_lock                  shrink_list()
 *      mm->page_table_lock             try_to_unmap_one(), trylock
 */
static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
{
        pte_t *ptep = rmap_ptep_map(paddr);
        unsigned long address = ptep_to_address(ptep);
        struct mm_struct * mm = ptep_to_mm(ptep);
        struct vm_area_struct * vma;
        pte_t pte;
        int ret;

        if (!mm)
                BUG();

        /*
         * We need the page_table_lock to protect us from page faults,
         * munmap, fork, etc...
         */
        if (!spin_trylock(&mm->page_table_lock)) {
                rmap_ptep_unmap(ptep);
                return SWAP_AGAIN;
        }

        /* During mremap, it's possible pages are not in a VMA. */
        vma = find_vma(mm, address);
        if (!vma) {
                ret = SWAP_FAIL;
                goto out_unlock;
        }

        /* The page is mlock()d, we cannot swap it out. */
        if (vma->vm_flags & VM_LOCKED) {
                ret = SWAP_FAIL;
                goto out_unlock;
        }

        /* Nuke the page table entry. */
        flush_cache_page(vma, address);
        pte = ptep_clear_flush(vma, address, ptep);

        if (PageSwapCache(page)) {
                /*
                 * Store the swap location in the pte.
                 * See handle_pte_fault() ...
                 */
                swp_entry_t entry = { .val = page->index };
                swap_duplicate(entry);
                set_pte(ptep, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*ptep));
        } else {
                unsigned long pgidx;
                /*
                 * If a nonlinear mapping then store the file page offset
                 * in the pte.
                 */
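                /*
                 * Worked example (illustrative numbers, 4K pages and
                 * PAGE_CACHE_SIZE == PAGE_SIZE): with vm_start ==
                 * 0x40000000, vm_pgoff == 10 and address == 0x40003000,
                 * pgidx becomes 3 + 10 == 13.  If page->index == 13 the
                 * mapping is linear and the pte can be rebuilt from the
                 * file offset at fault time; only if it differs must the
                 * offset be preserved in the pte via pgoff_to_pte().
                 */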
                pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
                pgidx += vma->vm_pgoff;
                pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
                if (page->index != pgidx) {
                        set_pte(ptep, pgoff_to_pte(page->index));
                        BUG_ON(!pte_file(*ptep));
                }
        }

        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pte))
                set_page_dirty(page);

        mm->rss--;
        page_cache_release(page);
        ret = SWAP_SUCCESS;

out_unlock:
        rmap_ptep_unmap(ptep);
        spin_unlock(&mm->page_table_lock);
        return ret;
}
/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock
 * and its pte chain lock. Return values are:
 *
 * SWAP_SUCCESS - we succeeded in removing all mappings
 * SWAP_AGAIN   - we missed a trylock, try again later
 * SWAP_FAIL    - the page is unswappable
 */
int try_to_unmap(struct page * page)
{
        struct pte_chain *pc, *next_pc, *start;
        int ret = SWAP_SUCCESS;
        int victim_i;

        /* This page should not be on the pageout lists. */
        if (PageReserved(page))
                BUG();
        if (!PageLocked(page))
                BUG();
        /* We need backing store to swap out a page. */
        if (!page->mapping)
                BUG();

        if (PageDirect(page)) {
                ret = try_to_unmap_one(page, page->pte.direct);
                if (ret == SWAP_SUCCESS) {
                        if (page_test_and_clear_dirty(page))
                                set_page_dirty(page);
                        page->pte.direct = 0;
                        ClearPageDirect(page);
                }
                goto out;
        }

        start = page->pte.chain;
        victim_i = pte_chain_idx(start);
        for (pc = start; pc; pc = next_pc) {
                int i;

                next_pc = pte_chain_next(pc);
                if (next_pc)
                        prefetch(next_pc);
                for (i = pte_chain_idx(pc); i < NRPTE; i++) {
                        pte_addr_t pte_paddr = pc->ptes[i];

                        switch (try_to_unmap_one(page, pte_paddr)) {
                        case SWAP_SUCCESS:
                                /*
                                 * Release a slot.  If we're releasing the
                                 * first pte in the first pte_chain then
                                 * pc->ptes[i] and start->ptes[victim_i] both
                                 * refer to the same thing.  It works out.
                                 */
                                pc->ptes[i] = start->ptes[victim_i];
                                start->ptes[victim_i] = 0;
                                victim_i++;
                                if (victim_i == NRPTE) {
                                        page->pte.chain = pte_chain_next(start);
                                        __pte_chain_free(start);
                                        start = page->pte.chain;
                                        victim_i = 0;
                                } else {
                                        start->next_and_idx++;
                                }
                                if (page->pte.direct == 0 &&
                                    page_test_and_clear_dirty(page))
                                        set_page_dirty(page);
                                break;
                        case SWAP_AGAIN:
                                /* Skip this pte, remembering status. */
                                ret = SWAP_AGAIN;
                                continue;
                        case SWAP_FAIL:
                                ret = SWAP_FAIL;
                                goto out;
                        }
                }
        }
out:
        if (!page_mapped(page))
                dec_page_state(nr_mapped);
        return ret;
}
/**
 ** No more VM stuff below this comment, only pte_chain helper
 ** functions.
 **/

static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
{
        struct pte_chain *pc = p;

        memset(pc, 0, sizeof(*pc));
}

DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0;

/**
 * __pte_chain_free - free pte_chain structure
 * @pte_chain: pte_chain struct to free
 */
void __pte_chain_free(struct pte_chain *pte_chain)
{
        struct pte_chain **pte_chainp;

        pte_chainp = &get_cpu_var(local_pte_chain);
        if (pte_chain->next_and_idx)
                pte_chain->next_and_idx = 0;
        if (*pte_chainp)
                kmem_cache_free(pte_chain_cache, *pte_chainp);
        *pte_chainp = pte_chain;
        put_cpu_var(local_pte_chain);
}
/*
 * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
 *
 * The caller of page_add_rmap() must perform the allocation because
 * page_add_rmap() is invariably called under spinlock.  Often, page_add_rmap()
 * will not actually use the pte_chain, because there is space available in one
 * of the existing pte_chains which are attached to the page.  So the case of
 * allocating and then freeing a single pte_chain is specially optimised here,
 * with a one-deep per-cpu cache.
 */
struct pte_chain *pte_chain_alloc(int gfp_flags)
{
        struct pte_chain *ret;
        struct pte_chain **pte_chainp;

        might_sleep_if(gfp_flags & __GFP_WAIT);

        pte_chainp = &get_cpu_var(local_pte_chain);
        if (*pte_chainp) {
                ret = *pte_chainp;
                *pte_chainp = NULL;
                put_cpu_var(local_pte_chain);
        } else {
                put_cpu_var(local_pte_chain);
                ret = kmem_cache_alloc(pte_chain_cache, gfp_flags);
        }
        return ret;
}
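
/*
 * The one-deep cache pairs with __pte_chain_free() above: the common
 * "alloc, page_add_rmap() finds room in an existing chain, free"
 * sequence never touches the slab allocator.  A plausible fault-path
 * usage (sketch only; see mm/memory.c for the real callers):
 *
 *      pte_chain = pte_chain_alloc(GFP_ATOMIC);
 *      if (!pte_chain) {
 *              ... drop the page_table_lock ...
 *              pte_chain = pte_chain_alloc(GFP_KERNEL);
 *              ... retake the lock and revalidate the pte ...
 *      }
 */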
void __init pte_chain_init(void)
{
        pte_chain_cache = kmem_cache_create("pte_chain",
                                            sizeof(struct pte_chain),
                                            0,
                                            SLAB_MUST_HWCACHE_ALIGN,
                                            pte_chain_ctor,
                                            NULL);

        if (!pte_chain_cache)
                panic("failed to create pte_chain cache!\n");
}