USB: serial: usb_debug: Make static
[linux-2.6/mini2440.git] / arch / x86 / xen / mmu.c
blob503c240e26c73539c2d4061f0d186b92a7c20c83
1 /*
2 * Xen mmu operations
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
41 #include <linux/sched.h>
42 #include <linux/highmem.h>
43 #include <linux/debugfs.h>
44 #include <linux/bug.h>
46 #include <asm/pgtable.h>
47 #include <asm/tlbflush.h>
48 #include <asm/fixmap.h>
49 #include <asm/mmu_context.h>
50 #include <asm/paravirt.h>
51 #include <asm/linkage.h>
53 #include <asm/xen/hypercall.h>
54 #include <asm/xen/hypervisor.h>
56 #include <xen/page.h>
57 #include <xen/interface/xen.h>
59 #include "multicalls.h"
60 #include "mmu.h"
61 #include "debugfs.h"
/* Number of buckets in the mmu_update batch-size histogram. */
#define MMU_UPDATE_HISTO	30

#ifdef CONFIG_XEN_DEBUG_FS

/*
 * Event counters for the pagetable update paths in this file.  They
 * are only ever modified through ADD_STATS().
 */
static struct {
	u32 pgd_update;
	u32 pgd_update_pinned;
	u32 pgd_update_batched;

	u32 pud_update;
	u32 pud_update_pinned;
	u32 pud_update_batched;

	u32 pmd_update;
	u32 pmd_update_pinned;
	u32 pmd_update_batched;

	u32 pte_update;
	u32 pte_update_pinned;
	u32 pte_update_batched;

	u32 mmu_update;
	u32 mmu_update_extended;
	u32 mmu_update_histo[MMU_UPDATE_HISTO];

	u32 prot_commit;
	u32 prot_commit_batched;

	u32 set_pte_at;
	u32 set_pte_at_batched;
	u32 set_pte_at_pinned;
	u32 set_pte_at_current;
	u32 set_pte_at_kernel;
} mmu_stats;

/* When non-zero, the next ADD_STATS() wipes every counter first. */
static u8 zero_stats;

/* Honour a pending reset request: clear all counters and the flag. */
static inline void check_zero(void)
{
	if (likely(!zero_stats))
		return;

	memset(&mmu_stats, 0, sizeof(mmu_stats));
	zero_stats = 0;
}

/* Add @val to counter @elem, applying any pending reset beforehand. */
#define ADD_STATS(elem, val)			\
	do { check_zero(); mmu_stats.elem += (val); } while(0)

#else  /* !CONFIG_XEN_DEBUG_FS */

/* Stats compiled out: @val is still evaluated, @elem is not. */
#define ADD_STATS(elem, val)	do { (void)(val); } while(0)

#endif /* CONFIG_XEN_DEBUG_FS */
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary.
121 #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
124 #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
125 #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
127 /* Placeholder for holes in the address space */
128 static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
129 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
131 /* Array of pointers to pages containing p2m entries */
132 static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
133 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
135 /* Arrays of p2m arrays expressed in mfns used for save/restore */
136 static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
138 static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
139 __page_aligned_bss;
141 static inline unsigned p2m_top_index(unsigned long pfn)
143 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
144 return pfn / P2M_ENTRIES_PER_PAGE;
147 static inline unsigned p2m_index(unsigned long pfn)
149 return pfn % P2M_ENTRIES_PER_PAGE;
152 /* Build the parallel p2m_top_mfn structures */
153 void xen_setup_mfn_list_list(void)
155 unsigned pfn, idx;
157 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
158 unsigned topidx = p2m_top_index(pfn);
160 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
163 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
164 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
165 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
168 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
170 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
171 virt_to_mfn(p2m_top_mfn_list);
172 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
175 /* Set up p2m_top to point to the domain-builder provided p2m pages */
176 void __init xen_build_dynamic_phys_to_machine(void)
178 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
179 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
180 unsigned pfn;
182 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
183 unsigned topidx = p2m_top_index(pfn);
185 p2m_top[topidx] = &mfn_list[pfn];
189 unsigned long get_phys_to_machine(unsigned long pfn)
191 unsigned topidx, idx;
193 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
194 return INVALID_P2M_ENTRY;
196 topidx = p2m_top_index(pfn);
197 idx = p2m_index(pfn);
198 return p2m_top[topidx][idx];
200 EXPORT_SYMBOL_GPL(get_phys_to_machine);
202 static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
204 unsigned long *p;
205 unsigned i;
207 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
208 BUG_ON(p == NULL);
210 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
211 p[i] = INVALID_P2M_ENTRY;
213 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
214 free_page((unsigned long)p);
215 else
216 *mfnp = virt_to_mfn(p);
219 void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
221 unsigned topidx, idx;
223 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
224 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
225 return;
228 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
229 BUG_ON(mfn != INVALID_P2M_ENTRY);
230 return;
233 topidx = p2m_top_index(pfn);
234 if (p2m_top[topidx] == p2m_missing) {
235 /* no need to allocate a page to store an invalid entry */
236 if (mfn == INVALID_P2M_ENTRY)
237 return;
238 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
241 idx = p2m_index(pfn);
242 p2m_top[topidx][idx] = mfn;
245 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
247 unsigned long address = (unsigned long)vaddr;
248 unsigned int level;
249 pte_t *pte;
250 unsigned offset;
253 * if the PFN is in the linear mapped vaddr range, we can just use
254 * the (quick) virt_to_machine() p2m lookup
256 if (virt_addr_valid(vaddr))
257 return virt_to_machine(vaddr);
259 /* otherwise we have to do a (slower) full page-table walk */
261 pte = lookup_address(address, &level);
262 BUG_ON(pte == NULL);
263 offset = address & ~PAGE_MASK;
264 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
267 void make_lowmem_page_readonly(void *vaddr)
269 pte_t *pte, ptev;
270 unsigned long address = (unsigned long)vaddr;
271 unsigned int level;
273 pte = lookup_address(address, &level);
274 BUG_ON(pte == NULL);
276 ptev = pte_wrprotect(*pte);
278 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
279 BUG();
282 void make_lowmem_page_readwrite(void *vaddr)
284 pte_t *pte, ptev;
285 unsigned long address = (unsigned long)vaddr;
286 unsigned int level;
288 pte = lookup_address(address, &level);
289 BUG_ON(pte == NULL);
291 ptev = pte_mkwrite(*pte);
293 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
294 BUG();
298 static bool xen_page_pinned(void *ptr)
300 struct page *page = virt_to_page(ptr);
302 return PagePinned(page);
305 static void xen_extend_mmu_update(const struct mmu_update *update)
307 struct multicall_space mcs;
308 struct mmu_update *u;
310 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
312 if (mcs.mc != NULL) {
313 ADD_STATS(mmu_update_extended, 1);
314 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
316 mcs.mc->args[1]++;
318 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
319 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
320 else
321 ADD_STATS(mmu_update_histo[0], 1);
322 } else {
323 ADD_STATS(mmu_update, 1);
324 mcs = __xen_mc_entry(sizeof(*u));
325 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
326 ADD_STATS(mmu_update_histo[1], 1);
329 u = mcs.args;
330 *u = *update;
333 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
335 struct mmu_update u;
337 preempt_disable();
339 xen_mc_batch();
341 /* ptr may be ioremapped for 64-bit pagetable setup */
342 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
343 u.val = pmd_val_ma(val);
344 xen_extend_mmu_update(&u);
346 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
348 xen_mc_issue(PARAVIRT_LAZY_MMU);
350 preempt_enable();
353 void xen_set_pmd(pmd_t *ptr, pmd_t val)
355 ADD_STATS(pmd_update, 1);
357 /* If page is not pinned, we can just update the entry
358 directly */
359 if (!xen_page_pinned(ptr)) {
360 *ptr = val;
361 return;
364 ADD_STATS(pmd_update_pinned, 1);
366 xen_set_pmd_hyper(ptr, val);
370 * Associate a virtual page frame with a given physical page frame
371 * and protection flags for that frame.
373 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
375 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
378 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
379 pte_t *ptep, pte_t pteval)
381 /* updates to init_mm may be done without lock */
382 if (mm == &init_mm)
383 preempt_disable();
385 ADD_STATS(set_pte_at, 1);
386 // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
387 ADD_STATS(set_pte_at_current, mm == current->mm);
388 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
390 if (mm == current->mm || mm == &init_mm) {
391 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
392 struct multicall_space mcs;
393 mcs = xen_mc_entry(0);
395 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
396 ADD_STATS(set_pte_at_batched, 1);
397 xen_mc_issue(PARAVIRT_LAZY_MMU);
398 goto out;
399 } else
400 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
401 goto out;
403 xen_set_pte(ptep, pteval);
405 out:
406 if (mm == &init_mm)
407 preempt_enable();
410 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
411 unsigned long addr, pte_t *ptep)
413 /* Just return the pte as-is. We preserve the bits on commit */
414 return *ptep;
417 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
418 pte_t *ptep, pte_t pte)
420 struct mmu_update u;
422 xen_mc_batch();
424 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
425 u.val = pte_val_ma(pte);
426 xen_extend_mmu_update(&u);
428 ADD_STATS(prot_commit, 1);
429 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
431 xen_mc_issue(PARAVIRT_LAZY_MMU);
434 /* Assume pteval_t is equivalent to all the other *val_t types. */
435 static pteval_t pte_mfn_to_pfn(pteval_t val)
437 if (val & _PAGE_PRESENT) {
438 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
439 pteval_t flags = val & PTE_FLAGS_MASK;
440 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
443 return val;
446 static pteval_t pte_pfn_to_mfn(pteval_t val)
448 if (val & _PAGE_PRESENT) {
449 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
450 pteval_t flags = val & PTE_FLAGS_MASK;
451 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
454 return val;
457 pteval_t xen_pte_val(pte_t pte)
459 return pte_mfn_to_pfn(pte.pte);
462 pgdval_t xen_pgd_val(pgd_t pgd)
464 return pte_mfn_to_pfn(pgd.pgd);
467 pte_t xen_make_pte(pteval_t pte)
469 pte = pte_pfn_to_mfn(pte);
470 return native_make_pte(pte);
473 pgd_t xen_make_pgd(pgdval_t pgd)
475 pgd = pte_pfn_to_mfn(pgd);
476 return native_make_pgd(pgd);
479 pmdval_t xen_pmd_val(pmd_t pmd)
481 return pte_mfn_to_pfn(pmd.pmd);
484 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
486 struct mmu_update u;
488 preempt_disable();
490 xen_mc_batch();
492 /* ptr may be ioremapped for 64-bit pagetable setup */
493 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
494 u.val = pud_val_ma(val);
495 xen_extend_mmu_update(&u);
497 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
499 xen_mc_issue(PARAVIRT_LAZY_MMU);
501 preempt_enable();
504 void xen_set_pud(pud_t *ptr, pud_t val)
506 ADD_STATS(pud_update, 1);
508 /* If page is not pinned, we can just update the entry
509 directly */
510 if (!xen_page_pinned(ptr)) {
511 *ptr = val;
512 return;
515 ADD_STATS(pud_update_pinned, 1);
517 xen_set_pud_hyper(ptr, val);
520 void xen_set_pte(pte_t *ptep, pte_t pte)
522 ADD_STATS(pte_update, 1);
523 // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
524 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
526 #ifdef CONFIG_X86_PAE
527 ptep->pte_high = pte.pte_high;
528 smp_wmb();
529 ptep->pte_low = pte.pte_low;
530 #else
531 *ptep = pte;
532 #endif
535 #ifdef CONFIG_X86_PAE
536 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
538 set_64bit((u64 *)ptep, native_pte_val(pte));
541 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
543 ptep->pte_low = 0;
544 smp_wmb(); /* make sure low gets written first */
545 ptep->pte_high = 0;
548 void xen_pmd_clear(pmd_t *pmdp)
550 set_pmd(pmdp, __pmd(0));
552 #endif /* CONFIG_X86_PAE */
554 pmd_t xen_make_pmd(pmdval_t pmd)
556 pmd = pte_pfn_to_mfn(pmd);
557 return native_make_pmd(pmd);
560 #if PAGETABLE_LEVELS == 4
561 pudval_t xen_pud_val(pud_t pud)
563 return pte_mfn_to_pfn(pud.pud);
566 pud_t xen_make_pud(pudval_t pud)
568 pud = pte_pfn_to_mfn(pud);
570 return native_make_pud(pud);
573 pgd_t *xen_get_user_pgd(pgd_t *pgd)
575 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
576 unsigned offset = pgd - pgd_page;
577 pgd_t *user_ptr = NULL;
579 if (offset < pgd_index(USER_LIMIT)) {
580 struct page *page = virt_to_page(pgd_page);
581 user_ptr = (pgd_t *)page->private;
582 if (user_ptr)
583 user_ptr += offset;
586 return user_ptr;
589 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
591 struct mmu_update u;
593 u.ptr = virt_to_machine(ptr).maddr;
594 u.val = pgd_val_ma(val);
595 xen_extend_mmu_update(&u);
599 * Raw hypercall-based set_pgd, intended for in early boot before
600 * there's a page structure. This implies:
601 * 1. The only existing pagetable is the kernel's
602 * 2. It is always pinned
603 * 3. It has no user pagetable attached to it
605 void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
607 preempt_disable();
609 xen_mc_batch();
611 __xen_set_pgd_hyper(ptr, val);
613 xen_mc_issue(PARAVIRT_LAZY_MMU);
615 preempt_enable();
618 void xen_set_pgd(pgd_t *ptr, pgd_t val)
620 pgd_t *user_ptr = xen_get_user_pgd(ptr);
622 ADD_STATS(pgd_update, 1);
624 /* If page is not pinned, we can just update the entry
625 directly */
626 if (!xen_page_pinned(ptr)) {
627 *ptr = val;
628 if (user_ptr) {
629 WARN_ON(xen_page_pinned(user_ptr));
630 *user_ptr = val;
632 return;
635 ADD_STATS(pgd_update_pinned, 1);
636 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
638 /* If it's pinned, then we can at least batch the kernel and
639 user updates together. */
640 xen_mc_batch();
642 __xen_set_pgd_hyper(ptr, val);
643 if (user_ptr)
644 __xen_set_pgd_hyper(user_ptr, val);
646 xen_mc_issue(PARAVIRT_LAZY_MMU);
648 #endif /* PAGETABLE_LEVELS == 4 */
651 * (Yet another) pagetable walker. This one is intended for pinning a
652 * pagetable. This means that it walks a pagetable and calls the
653 * callback function on each page it finds making up the page table,
654 * at every level. It walks the entire pagetable, but it only bothers
655 * pinning pte pages which are below limit. In the normal case this
656 * will be STACK_TOP_MAX, but at boot we need to pin up to
657 * FIXADDR_TOP.
659 * For 32-bit the important bit is that we don't pin beyond there,
660 * because then we start getting into Xen's ptes.
662 * For 64-bit, we must skip the Xen hole in the middle of the address
663 * space, just after the big x86-64 virtual hole.
665 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
666 int (*func)(struct mm_struct *mm, struct page *,
667 enum pt_level),
668 unsigned long limit)
670 int flush = 0;
671 unsigned hole_low, hole_high;
672 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
673 unsigned pgdidx, pudidx, pmdidx;
675 /* The limit is the last byte to be touched */
676 limit--;
677 BUG_ON(limit >= FIXADDR_TOP);
679 if (xen_feature(XENFEAT_auto_translated_physmap))
680 return 0;
683 * 64-bit has a great big hole in the middle of the address
684 * space, which contains the Xen mappings. On 32-bit these
685 * will end up making a zero-sized hole and so is a no-op.
687 hole_low = pgd_index(USER_LIMIT);
688 hole_high = pgd_index(PAGE_OFFSET);
690 pgdidx_limit = pgd_index(limit);
691 #if PTRS_PER_PUD > 1
692 pudidx_limit = pud_index(limit);
693 #else
694 pudidx_limit = 0;
695 #endif
696 #if PTRS_PER_PMD > 1
697 pmdidx_limit = pmd_index(limit);
698 #else
699 pmdidx_limit = 0;
700 #endif
702 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
703 pud_t *pud;
705 if (pgdidx >= hole_low && pgdidx < hole_high)
706 continue;
708 if (!pgd_val(pgd[pgdidx]))
709 continue;
711 pud = pud_offset(&pgd[pgdidx], 0);
713 if (PTRS_PER_PUD > 1) /* not folded */
714 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
716 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
717 pmd_t *pmd;
719 if (pgdidx == pgdidx_limit &&
720 pudidx > pudidx_limit)
721 goto out;
723 if (pud_none(pud[pudidx]))
724 continue;
726 pmd = pmd_offset(&pud[pudidx], 0);
728 if (PTRS_PER_PMD > 1) /* not folded */
729 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
731 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
732 struct page *pte;
734 if (pgdidx == pgdidx_limit &&
735 pudidx == pudidx_limit &&
736 pmdidx > pmdidx_limit)
737 goto out;
739 if (pmd_none(pmd[pmdidx]))
740 continue;
742 pte = pmd_page(pmd[pmdidx]);
743 flush |= (*func)(mm, pte, PT_PTE);
748 out:
749 /* Do the top level last, so that the callbacks can use it as
750 a cue to do final things like tlb flushes. */
751 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
753 return flush;
756 static int xen_pgd_walk(struct mm_struct *mm,
757 int (*func)(struct mm_struct *mm, struct page *,
758 enum pt_level),
759 unsigned long limit)
761 return __xen_pgd_walk(mm, mm->pgd, func, limit);
764 /* If we're using split pte locks, then take the page's lock and
765 return a pointer to it. Otherwise return NULL. */
766 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
768 spinlock_t *ptl = NULL;
770 #if USE_SPLIT_PTLOCKS
771 ptl = __pte_lockptr(page);
772 spin_lock_nest_lock(ptl, &mm->page_table_lock);
773 #endif
775 return ptl;
778 static void xen_pte_unlock(void *v)
780 spinlock_t *ptl = v;
781 spin_unlock(ptl);
784 static void xen_do_pin(unsigned level, unsigned long pfn)
786 struct mmuext_op *op;
787 struct multicall_space mcs;
789 mcs = __xen_mc_entry(sizeof(*op));
790 op = mcs.args;
791 op->cmd = level;
792 op->arg1.mfn = pfn_to_mfn(pfn);
793 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
796 static int xen_pin_page(struct mm_struct *mm, struct page *page,
797 enum pt_level level)
799 unsigned pgfl = TestSetPagePinned(page);
800 int flush;
802 if (pgfl)
803 flush = 0; /* already pinned */
804 else if (PageHighMem(page))
805 /* kmaps need flushing if we found an unpinned
806 highpage */
807 flush = 1;
808 else {
809 void *pt = lowmem_page_address(page);
810 unsigned long pfn = page_to_pfn(page);
811 struct multicall_space mcs = __xen_mc_entry(0);
812 spinlock_t *ptl;
814 flush = 0;
817 * We need to hold the pagetable lock between the time
818 * we make the pagetable RO and when we actually pin
819 * it. If we don't, then other users may come in and
820 * attempt to update the pagetable by writing it,
821 * which will fail because the memory is RO but not
822 * pinned, so Xen won't do the trap'n'emulate.
824 * If we're using split pte locks, we can't hold the
825 * entire pagetable's worth of locks during the
826 * traverse, because we may wrap the preempt count (8
827 * bits). The solution is to mark RO and pin each PTE
828 * page while holding the lock. This means the number
829 * of locks we end up holding is never more than a
830 * batch size (~32 entries, at present).
832 * If we're not using split pte locks, we needn't pin
833 * the PTE pages independently, because we're
834 * protected by the overall pagetable lock.
836 ptl = NULL;
837 if (level == PT_PTE)
838 ptl = xen_pte_lock(page, mm);
840 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
841 pfn_pte(pfn, PAGE_KERNEL_RO),
842 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
844 if (ptl) {
845 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
847 /* Queue a deferred unlock for when this batch
848 is completed. */
849 xen_mc_callback(xen_pte_unlock, ptl);
853 return flush;
856 /* This is called just after a mm has been created, but it has not
857 been used yet. We need to make sure that its pagetable is all
858 read-only, and can be pinned. */
859 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
861 vm_unmap_aliases();
863 xen_mc_batch();
865 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
866 /* re-enable interrupts for flushing */
867 xen_mc_issue(0);
869 kmap_flush_unused();
871 xen_mc_batch();
874 #ifdef CONFIG_X86_64
876 pgd_t *user_pgd = xen_get_user_pgd(pgd);
878 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
880 if (user_pgd) {
881 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
882 xen_do_pin(MMUEXT_PIN_L4_TABLE,
883 PFN_DOWN(__pa(user_pgd)));
886 #else /* CONFIG_X86_32 */
887 #ifdef CONFIG_X86_PAE
888 /* Need to make sure unshared kernel PMD is pinnable */
889 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
890 PT_PMD);
891 #endif
892 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
893 #endif /* CONFIG_X86_64 */
894 xen_mc_issue(0);
897 static void xen_pgd_pin(struct mm_struct *mm)
899 __xen_pgd_pin(mm, mm->pgd);
903 * On save, we need to pin all pagetables to make sure they get their
904 * mfns turned into pfns. Search the list for any unpinned pgds and pin
905 * them (unpinned pgds are not currently in use, probably because the
906 * process is under construction or destruction).
908 * Expected to be called in stop_machine() ("equivalent to taking
909 * every spinlock in the system"), so the locking doesn't really
910 * matter all that much.
912 void xen_mm_pin_all(void)
914 unsigned long flags;
915 struct page *page;
917 spin_lock_irqsave(&pgd_lock, flags);
919 list_for_each_entry(page, &pgd_list, lru) {
920 if (!PagePinned(page)) {
921 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
922 SetPageSavePinned(page);
926 spin_unlock_irqrestore(&pgd_lock, flags);
930 * The init_mm pagetable is really pinned as soon as its created, but
931 * that's before we have page structures to store the bits. So do all
932 * the book-keeping now.
934 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
935 enum pt_level level)
937 SetPagePinned(page);
938 return 0;
941 void __init xen_mark_init_mm_pinned(void)
943 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
946 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
947 enum pt_level level)
949 unsigned pgfl = TestClearPagePinned(page);
951 if (pgfl && !PageHighMem(page)) {
952 void *pt = lowmem_page_address(page);
953 unsigned long pfn = page_to_pfn(page);
954 spinlock_t *ptl = NULL;
955 struct multicall_space mcs;
958 * Do the converse to pin_page. If we're using split
959 * pte locks, we must be holding the lock for while
960 * the pte page is unpinned but still RO to prevent
961 * concurrent updates from seeing it in this
962 * partially-pinned state.
964 if (level == PT_PTE) {
965 ptl = xen_pte_lock(page, mm);
967 if (ptl)
968 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
971 mcs = __xen_mc_entry(0);
973 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
974 pfn_pte(pfn, PAGE_KERNEL),
975 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
977 if (ptl) {
978 /* unlock when batch completed */
979 xen_mc_callback(xen_pte_unlock, ptl);
983 return 0; /* never need to flush on unpin */
986 /* Release a pagetables pages back as normal RW */
987 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
989 xen_mc_batch();
991 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
993 #ifdef CONFIG_X86_64
995 pgd_t *user_pgd = xen_get_user_pgd(pgd);
997 if (user_pgd) {
998 xen_do_pin(MMUEXT_UNPIN_TABLE,
999 PFN_DOWN(__pa(user_pgd)));
1000 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1003 #endif
1005 #ifdef CONFIG_X86_PAE
1006 /* Need to make sure unshared kernel PMD is unpinned */
1007 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1008 PT_PMD);
1009 #endif
1011 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1013 xen_mc_issue(0);
1016 static void xen_pgd_unpin(struct mm_struct *mm)
1018 __xen_pgd_unpin(mm, mm->pgd);
1022 * On resume, undo any pinning done at save, so that the rest of the
1023 * kernel doesn't see any unexpected pinned pagetables.
1025 void xen_mm_unpin_all(void)
1027 unsigned long flags;
1028 struct page *page;
1030 spin_lock_irqsave(&pgd_lock, flags);
1032 list_for_each_entry(page, &pgd_list, lru) {
1033 if (PageSavePinned(page)) {
1034 BUG_ON(!PagePinned(page));
1035 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1036 ClearPageSavePinned(page);
1040 spin_unlock_irqrestore(&pgd_lock, flags);
1043 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1045 spin_lock(&next->page_table_lock);
1046 xen_pgd_pin(next);
1047 spin_unlock(&next->page_table_lock);
1050 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1052 spin_lock(&mm->page_table_lock);
1053 xen_pgd_pin(mm);
1054 spin_unlock(&mm->page_table_lock);
1058 #ifdef CONFIG_SMP
1059 /* Another cpu may still have their %cr3 pointing at the pagetable, so
1060 we need to repoint it somewhere else before we can unpin it. */
1061 static void drop_other_mm_ref(void *info)
1063 struct mm_struct *mm = info;
1064 struct mm_struct *active_mm;
1066 #ifdef CONFIG_X86_64
1067 active_mm = read_pda(active_mm);
1068 #else
1069 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1070 #endif
1072 if (active_mm == mm)
1073 leave_mm(smp_processor_id());
1075 /* If this cpu still has a stale cr3 reference, then make sure
1076 it has been flushed. */
1077 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
1078 load_cr3(swapper_pg_dir);
1079 arch_flush_lazy_cpu_mode();
1083 static void xen_drop_mm_ref(struct mm_struct *mm)
1085 cpumask_var_t mask;
1086 unsigned cpu;
1088 if (current->active_mm == mm) {
1089 if (current->mm == mm)
1090 load_cr3(swapper_pg_dir);
1091 else
1092 leave_mm(smp_processor_id());
1093 arch_flush_lazy_cpu_mode();
1096 /* Get the "official" set of cpus referring to our pagetable. */
1097 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1098 for_each_online_cpu(cpu) {
1099 if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
1100 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1101 continue;
1102 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1104 return;
1106 cpumask_copy(mask, &mm->cpu_vm_mask);
1108 /* It's possible that a vcpu may have a stale reference to our
1109 cr3, because its in lazy mode, and it hasn't yet flushed
1110 its set of pending hypercalls yet. In this case, we can
1111 look at its actual current cr3 value, and force it to flush
1112 if needed. */
1113 for_each_online_cpu(cpu) {
1114 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1115 cpumask_set_cpu(cpu, mask);
1118 if (!cpumask_empty(mask))
1119 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1120 free_cpumask_var(mask);
1122 #else
1123 static void xen_drop_mm_ref(struct mm_struct *mm)
1125 if (current->active_mm == mm)
1126 load_cr3(swapper_pg_dir);
1128 #endif
1131 * While a process runs, Xen pins its pagetables, which means that the
1132 * hypervisor forces it to be read-only, and it controls all updates
1133 * to it. This means that all pagetable updates have to go via the
1134 * hypervisor, which is moderately expensive.
1136 * Since we're pulling the pagetable down, we switch to use init_mm,
1137 * unpin old process pagetable and mark it all read-write, which
1138 * allows further operations on it to be simple memory accesses.
1140 * The only subtle point is that another CPU may be still using the
1141 * pagetable because of lazy tlb flushing. This means we need need to
1142 * switch all CPUs off this pagetable before we can unpin it.
1144 void xen_exit_mmap(struct mm_struct *mm)
1146 get_cpu(); /* make sure we don't move around */
1147 xen_drop_mm_ref(mm);
1148 put_cpu();
1150 spin_lock(&mm->page_table_lock);
1152 /* pgd may not be pinned in the error exit path of execve */
1153 if (xen_page_pinned(mm->pgd))
1154 xen_pgd_unpin(mm);
1156 spin_unlock(&mm->page_table_lock);
#ifdef CONFIG_XEN_DEBUG_FS

/* The "mmu" directory created under the Xen debugfs root */
static struct dentry *d_mmu_debug;

/*
 * Export the mmu_stats counters through debugfs.
 *
 * Creates an "mmu" directory holding one read-only file per counter,
 * the batch-size histogram, and a writable "zero_stats" file backed by
 * the zero_stats flag that check_zero() consumes.
 *
 * Returns 0 on success, or -ENOMEM if the Xen debugfs root is not
 * available.
 */
static int __init xen_mmu_debugfs(void)
{
	struct dentry *d_xen = xen_init_debugfs();

	if (d_xen == NULL)
		return -ENOMEM;

	d_mmu_debug = debugfs_create_dir("mmu", d_xen);

	debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);

	debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
	debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
			   &mmu_stats.pgd_update_pinned);
	debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
			   &mmu_stats.pgd_update_batched);

	debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
	debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
			   &mmu_stats.pud_update_pinned);
	debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
			   &mmu_stats.pud_update_batched);

	debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
	debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
			   &mmu_stats.pmd_update_pinned);
	debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
			   &mmu_stats.pmd_update_batched);

	/* pte_update_pinned is not collected, so it is not exported */
	debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
	debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
			   &mmu_stats.pte_update_batched);

	debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
	debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
			   &mmu_stats.mmu_update_extended);
	/* Export every bucket of the histogram, not just a prefix */
	xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
				     mmu_stats.mmu_update_histo,
				     MMU_UPDATE_HISTO);

	debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
	debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_batched);
	debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_current);
	debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_kernel);

	debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
			   &mmu_stats.prot_commit_batched);

	return 0;
}
fs_initcall(xen_mmu_debugfs);

#endif	/* CONFIG_XEN_DEBUG_FS */