xen64: use arbitrary_virt_to_machine for xen_set_pmd
arch/x86/xen/mmu.c

/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)

/* Placeholder for holes in the address space */
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };

/* Array of pointers to pages containing p2m entries */
static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };

/* Arrays of p2m arrays expressed in mfns used for save/restore */
static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;

static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
	__page_aligned_bss;

static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
	return pfn / P2M_ENTRIES_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_ENTRIES_PER_PAGE;
}
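
/*
 * Example: on a 64-bit build with 4K pages, P2M_ENTRIES_PER_PAGE is
 * PAGE_SIZE / sizeof(unsigned long) = 4096 / 8 = 512, so pfn 0x12345 is
 * found at p2m_top[0x12345 / 512][0x12345 % 512], i.e. p2m_top[0x91][0x145].
 * (A 32-bit build has 1024 entries per p2m page.)
 */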

/* Build the parallel p2m_top_mfn structures */
void xen_setup_mfn_list_list(void)
{
	unsigned pfn, idx;

	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
	}

	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
	}

	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn_list);
	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
	unsigned pfn;

	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top[topidx] = &mfn_list[pfn];
	}
}

unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, idx;

	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	idx = p2m_index(pfn);
	return p2m_top[topidx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
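
/*
 * Replace the p2m_missing placeholder at *pp with a freshly allocated
 * page full of INVALID_P2M_ENTRY.  The cmpxchg means that if another
 * CPU raced us and installed its own page first, we simply free ours;
 * otherwise we also record the new page's mfn for the save/restore
 * lists.
 */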
static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
{
	unsigned long *p;
	unsigned i;

	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
	BUG_ON(p == NULL);

	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
		p[i] = INVALID_P2M_ENTRY;

	if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
		free_page((unsigned long)p);
	else
		*mfnp = virt_to_mfn(p);
}

void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, idx;

	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return;
	}

	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return;
	}

	topidx = p2m_top_index(pfn);
	if (p2m_top[topidx] == p2m_missing) {
		/* no need to allocate a page to store an invalid entry */
		if (mfn == INVALID_P2M_ENTRY)
			return;
		alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
	}

	idx = p2m_index(pfn);
	p2m_top[topidx][idx] = mfn;
}
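
/*
 * Translate an arbitrary kernel virtual address into a machine address.
 * Unlike virt_to_machine(), which only works for the direct (lowmem)
 * mapping, this walks the current pagetable with lookup_address(), so
 * it also handles ioremapped and fixmap addresses - which is why the
 * set_pmd/set_pud paths below use it.
 */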
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;
	pte_t *pte = lookup_address(address, &level);
	unsigned offset = address & ~PAGE_MASK;

	BUG_ON(pte == NULL);

	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

static bool page_pinned(void *ptr)
{
	struct page *page = virt_to_page(ptr);

	return PagePinned(page);
}
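
/*
 * Add an mmu_update entry to the pending multicall batch.  If the last
 * queued multicall is already an mmu_update hypercall, just extend its
 * argument list and bump its count; otherwise start a new one.
 */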
static void extend_mmu_update(const struct mmu_update *update)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));

	if (mcs.mc != NULL)
		mcs.mc->args[1]++;
	else {
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	u = mcs.args;
	*u = *update;
}

void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pmd_val_ma(val);
	extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	/* If page is not pinned, we can just update the entry
	   directly */
	if (!page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pmd_hyper(ptr, val);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <mfn,flags> stored as-is, to permit clearing entries */
	xen_set_pte(pte, mfn_pte(mfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	/* updates to init_mm may be done without lock */
	if (mm == &init_mm)
		preempt_disable();

	if (mm == current->mm || mm == &init_mm) {
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			struct multicall_space mcs;
			mcs = xen_mc_entry(0);

			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			xen_mc_issue(PARAVIRT_LAZY_MMU);
			goto out;
		} else
			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
				goto out;
	}
	xen_set_pte(ptep, pteval);

out:
	if (mm == &init_mm)
		preempt_enable();
}
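
/*
 * ptep_modify_prot transaction: start just snapshots the pte; commit
 * writes the new value back with MMU_PT_UPDATE_PRESERVE_AD set, so the
 * hypervisor keeps any Accessed/Dirty bits the hardware has set in the
 * meantime rather than letting us clobber them.
 */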
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is.  We preserve the bits on commit */
	return *ptep;
}

void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

/* Assume pteval_t is equivalent to all the other *val_t types. */

static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & ~PTE_MASK;
		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & ~PTE_MASK;
		val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
	}

	return val;
}
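
/*
 * The following are the pv-ops hooks behind pte_val/pgd_val/pmd_val and
 * __pte/__pgd/__pmd: values read out of a pagetable have their mfn
 * converted back to a pfn, and values about to be written have their
 * pfn converted to an mfn, using the helpers above.
 */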
pteval_t xen_pte_val(pte_t pte)
{
	return pte_mfn_to_pfn(pte.pte);
}

pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}

pte_t xen_make_pte(pteval_t pte)
{
	pte = pte_pfn_to_mfn(pte);
	return native_make_pte(pte);
}

pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}

pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}

void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pud(pud_t *ptr, pud_t val)
{
	/* If page is not pinned, we can just update the entry
	   directly */
	if (!page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pud_hyper(ptr, val);
}
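
/*
 * On PAE a pte is two words.  Write the high word first and the low
 * word (which holds the present bit) last, with a barrier in between,
 * so another CPU walking the pagetable never sees a present pte whose
 * high half is stale.
 */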
void xen_set_pte(pte_t *ptep, pte_t pte)
{
#ifdef CONFIG_X86_PAE
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
#else
	*ptep = pte;
#endif
}

#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, native_pte_val(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();		/* make sure low gets written first */
	ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
	set_pmd(pmdp, __pmd(0));
}
#endif	/* CONFIG_X86_PAE */

pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}

#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}

pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}

void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = pgd_val_ma(val);
	extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	/* If page is not pinned, we can just update the entry
	   directly */
	if (!page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pgd_hyper(ptr, val);
}
#endif	/* PAGETABLE_LEVELS == 4 */

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below pte_limit.  In the normal case
 * this will be TASK_SIZE, but at boot we need to pin up to
 * FIXADDR_TOP.  But the important bit is that we don't pin beyond
 * there, because then we start getting into Xen's ptes.
 */
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
		    unsigned long limit)
{
	pgd_t *pgd = pgd_base;
	int flush = 0;
	unsigned long addr = 0;
	unsigned long pgd_next;

	BUG_ON(limit > FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
		pud_t *pud;
		unsigned long pud_limit, pud_next;

		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

		if (!pgd_val(*pgd))
			continue;

		pud = pud_offset(pgd, 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(virt_to_page(pud), PT_PUD);

		for (; addr != pud_limit; pud++, addr = pud_next) {
			pmd_t *pmd;
			unsigned long pmd_limit;

			pud_next = pud_addr_end(addr, pud_limit);

			if (pud_next < limit)
				pmd_limit = pud_next;
			else
				pmd_limit = limit;

			if (pud_none(*pud))
				continue;

			pmd = pmd_offset(pud, 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(virt_to_page(pmd), PT_PMD);

			for (; addr != pmd_limit; pmd++) {
				addr += (PAGE_SIZE * PTRS_PER_PTE);
				if ((pmd_limit-1) < (addr-1)) {
					addr = pmd_limit;
					break;
				}

				if (pmd_none(*pmd))
					continue;

				flush |= (*func)(pmd_page(*pmd), PT_PTE);
			}
		}
	}

	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

	return flush;
}
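
/*
 * With split pte locks configured, take the per-page pte lock so the
 * pte page can't be modified while we turn it read-only and pin it;
 * otherwise return NULL and lock nothing.
 */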
static spinlock_t *lock_pte(struct page *page)
{
	spinlock_t *ptl = NULL;

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	ptl = __pte_lockptr(page);
	spin_lock(ptl);
#endif

	return ptl;
}

static void do_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}
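
/* Queue a single MMUEXT pin/unpin operation for this frame as part of
   the current multicall batch. */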
static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

static int pin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		ptl = NULL;
		if (level == PT_PTE)
			ptl = lock_pte(page);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (level == PT_PTE)
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

		if (ptl) {
			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
	xen_mc_batch();

	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
		/* re-enable interrupts for kmap_flush_unused */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
	xen_mc_issue(0);
}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 */
void xen_mm_pin_all(void)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&pgd_lock, flags);

	list_for_each_entry(page, &pgd_list, lru) {
		if (!PagePinned(page)) {
			xen_pgd_pin((pgd_t *)page_address(page));
			SetPageSavePinned(page);
		}
	}

	spin_unlock_irqrestore(&pgd_lock, flags);
}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits.  So do all
 * the book-keeping now.
 */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

static int unpin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		if (level == PT_PTE) {
			ptl = lock_pte(page);

			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pgd_walk(pgd, unpin_page, TASK_SIZE);

	xen_mc_issue(0);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&pgd_lock, flags);

	list_for_each_entry(page, &pgd_list, lru) {
		if (PageSavePinned(page)) {
			BUG_ON(!PagePinned(page));
			printk("unpinning pinned %p\n", page_address(page));
			xen_pgd_unpin((pgd_t *)page_address(page));
			ClearPageSavePinned(page);
		}
	}

	spin_unlock_irqrestore(&pgd_lock, flags);
}
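
/*
 * Pin the pagetable of an mm that is about to be used: when it becomes
 * the active mm, and when it is duplicated by fork.  Done under the
 * mm's page_table_lock so the pagetable can't change underneath us.
 */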
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next->pgd);
	spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}

#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;
	struct mm_struct *active_mm;

#ifdef CONFIG_X86_64
	active_mm = read_pda(active_mm);
#else
	active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
#endif

	if (active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
		load_cr3(swapper_pg_dir);
		arch_flush_lazy_cpu_mode();
	}
}

static void drop_mm_ref(struct mm_struct *mm)
{
	cpumask_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
		arch_flush_lazy_cpu_mode();
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	mask = mm->cpu_vm_mask;

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and hasn't yet flushed its set
	   of pending hypercalls.  In this case, we can look at its
	   actual current cr3 value, and force it to flush if needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpu_set(cpu, mask);
	}

	if (!cpus_empty(mask))
		smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (page_pinned(mm->pgd))
		xen_pgd_unpin(mm->pgd);

	spin_unlock(&mm->page_table_lock);
}