From b443039b5d8fa7246ce7be3e04756121508c48eb Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Tue, 31 Jan 2017 17:10:44 -0800
Subject: [PATCH] kernel - Augment vm_fault_page() for vkernel operations

* Augment vm_fault_page(), adding required elements from vm_fault() that
  were missing.

* In particular, this fixes a bug in any copyout to a growable stack
  segment (including the copyouts the exec code does), where the stack
  was not being properly grown and the copyout/suword64/etc calls were
  failing when they shouldn't have.

* Note optimization in pmap_clean_pte().  When turning off VPTE_RW, we
  do not have to MADV_INVAL to the real host if we determine that VPTE_M
  is not set.  Due to the way the vkernel and the host work, the host
  will utilize a read-only real PTE for read faults on VPTE_RW entries,
  in order to be able to detect writes by forcing a write fault when the
  write occurs.

* Fix a race between pmap_enter() and pmap_page_protect().  The vkernel
  doesn't have a convenient pv-lock like the real kernel pmap code, so
  use the vm_page spinlock instead.

* Generally use atomic ops for other operations on ptes that should be
  using atomic ops.

* Only clear PG_WRITEABLE while holding the vm_page spinlock.
---
 sys/platform/pc64/x86_64/pmap.c              |   9 +-
 sys/platform/vkernel64/platform/pmap.c       |  49 +++++++----
 sys/platform/vkernel64/platform/pmap_inval.c |  37 ++++++--
 sys/vm/vm_fault.c                            | 124 ++++++++++++++++++++-------
 4 files changed, 157 insertions(+), 62 deletions(-)

diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index 73b1439eab..204d0f0a72 100644
--- a/sys/platform/pc64/x86_64/pmap.c
+++ b/sys/platform/pc64/x86_64/pmap.c
@@ -5368,10 +5368,11 @@ pmap_clearbit(vm_page_t m, int bit_index)
         pt_entry_t pbits;
         pmap_t pmap;
 
-        if (bit_index == PG_RW_IDX)
-                vm_page_flag_clear(m, PG_WRITEABLE);
-        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
+        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
+                if (bit_index == PG_RW_IDX)
+                        vm_page_flag_clear(m, PG_WRITEABLE);
                 return;
+        }
 
         /*
          * PG_M or PG_A case
@@ -5490,6 +5491,8 @@ restart:
                         pv_put(pv);
                         goto restart;
                 }
+                if (bit_index == PG_RW_IDX)
+                        vm_page_flag_clear(m, PG_WRITEABLE);
                 vm_page_spin_unlock(m);
         }
 
diff --git a/sys/platform/vkernel64/platform/pmap.c b/sys/platform/vkernel64/platform/pmap.c
index b01a054de3..175625c76e 100644
--- a/sys/platform/vkernel64/platform/pmap.c
+++ b/sys/platform/vkernel64/platform/pmap.c
@@ -1122,7 +1122,7 @@ pmap_unwire_pgtable(pmap_t pmap, vm_offset_t va, vm_page_t m)
                 KKASSERT(m->wire_count == 0);
                 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                 vm_page_flash(m);
-                vm_page_free_zero(m);
+                vm_page_free(m);
                 return 1;
         } else {
                 /* XXX SMP race to 1 if not holding vmobj */
@@ -1215,7 +1215,7 @@ pmap_pinit(struct pmap *pmap)
                                      VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
                                      VM_ALLOC_ZERO);
                 pmap->pm_pdirm = ptdpg;
-                vm_page_flag_clear(ptdpg, PG_MAPPED);
+                vm_page_flag_clear(ptdpg, PG_MAPPED | PG_WRITEABLE);
                 vm_page_wire(ptdpg);
                 vm_page_wakeup(ptdpg);
                 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg));
@@ -1248,9 +1248,9 @@ pmap_puninit(pmap_t pmap)
                 KKASSERT(pmap->pm_pml4 != NULL);
                 pmap_kremove((vm_offset_t)pmap->pm_pml4);
                 vm_page_busy_wait(p, FALSE, "pgpun");
-                atomic_add_int(&p->wire_count, -1);
-                atomic_add_int(&vmstats.v_wire_count, -1);
-                vm_page_free_zero(p);
+                vm_page_unwire(p, 0);
+                vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
+                vm_page_free(p);
                 pmap->pm_pdirm = NULL;
                 atomic_add_long(&pmap->pm_stats.wired_count, -1);
                 KKASSERT(pmap->pm_stats.wired_count == 0);
@@ -1415,6 +1415,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
          * Map the page table page into its parent, giving it 1 wire count.
          */
         vm_page_wire(m);
+        vm_page_unmanage(m);
         atomic_add_long(&pmap->pm_stats.resident_count, 1);
         vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
 
@@ -1818,10 +1819,10 @@ pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va)
         rtval = 0;
         if (pv) {
                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-                m->md.pv_list_count--;
-                KKASSERT(m->md.pv_list_count >= 0);
                 if (TAILQ_EMPTY(&m->md.pv_list))
                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+                m->md.pv_list_count--;
+                KKASSERT(m->md.pv_list_count >= 0);
                 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
                 atomic_add_int(&pmap->pm_generation, 1);
                 vm_page_spin_unlock(m);
@@ -2128,16 +2129,17 @@ restart:
                         vm_page_dirty(m);
                 }
                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-                pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
-                atomic_add_int(&pmap->pm_generation, 1);
-                m->md.pv_list_count--;
-                KKASSERT(m->md.pv_list_count >= 0);
                 if (TAILQ_EMPTY(&m->md.pv_list))
                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+                m->md.pv_list_count--;
+                KKASSERT(m->md.pv_list_count >= 0);
+                pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
+                atomic_add_int(&pmap->pm_generation, 1);
                 vm_page_spin_unlock(m);
                 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
-                vm_object_drop(pmobj);
                 free_pv_entry(pv);
+
+                vm_object_drop(pmobj);
                 vm_page_spin_lock(m);
         }
         KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
@@ -2338,6 +2340,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
         pt_entry_t origpte, newpte;
         vm_paddr_t opa;
         vm_page_t mpte;
+        int spun = 0;
 
         if (pmap == NULL)
                 return;
@@ -2431,12 +2434,20 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
          */
         if (pmap_initialized) {
                 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
+                        /*
+                         * WARNING! We are using m's spin-lock as a poor
+                         *          man's pte lock to interlock against
+                         *          pmap_page_protect() operations.
+                         *
+                         *          This is a bad hack (obviously).
+                         */
                         pv = get_pv_entry();
                         vm_page_spin_lock(m);
                         pmap_insert_entry(pmap, va, mpte, m, pv);
                         pa |= VPTE_MANAGED;
                         vm_page_flag_set(m, PG_MAPPED);
-                        vm_page_spin_unlock(m);
+                        spun = 1;
+                        /* vm_page_spin_unlock(m); */
                 }
         }
 
@@ -2468,6 +2479,8 @@ validate:
                         kprintf("pmap [M] race @ %016jx\n", va);
                 atomic_set_long(pte, VPTE_M);
         }
+        if (spun)
+                vm_page_spin_unlock(m);
         if (mpte)
                 vm_page_wakeup(mpte);
 
@@ -2867,7 +2880,6 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
                 atomic_add_int(&pmap->pm_generation, 1);
                 save_generation = pmap->pm_generation;
-                m->md.pv_list_count--;
                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
                 if (TAILQ_EMPTY(&m->md.pv_list))
                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
@@ -2948,10 +2960,11 @@ pmap_clearbit(vm_page_t m, int bit)
         pt_entry_t *pte;
         pt_entry_t pbits;
 
-        if (bit == VPTE_RW)
-                vm_page_flag_clear(m, PG_WRITEABLE);
-        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
+        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
+                if (bit == VPTE_RW)
+                        vm_page_flag_clear(m, PG_WRITEABLE);
                 return;
+        }
 
         /*
          * Loop over all current mappings setting/clearing as appropos If
@@ -3033,6 +3046,8 @@ restart:
                         }
                 }
         }
+        if (bit == VPTE_RW)
+                vm_page_flag_clear(m, PG_WRITEABLE);
         vm_page_spin_unlock(m);
 }
 
diff --git a/sys/platform/vkernel64/platform/pmap_inval.c b/sys/platform/vkernel64/platform/pmap_inval.c
index 68c6d8c606..7242026663 100644
--- a/sys/platform/vkernel64/platform/pmap_inval.c
+++ b/sys/platform/vkernel64/platform/pmap_inval.c
@@ -272,9 +272,14 @@ pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
 /*
  * These carefully handle interactions with other cpus and return
  * the original vpte.  Clearing VPTE_RW prevents us from racing the
- * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
+ * setting of VPTE_M, allowing us to invalidate the TLB (the real cpu's
  * pmap) and get good status for VPTE_M.
  *
+ * By using an atomic op we can detect if the real PTE is writable by
+ * testing whether VPTE_M was set.  If it wasn't set, the real PTE is
+ * already read-only and we do not have to waste time invalidating it
+ * further.
+ *
  * clean: clear VPTE_M and VPTE_RW
  * setro: clear VPTE_RW
  * load&clear: clear entire field
@@ -285,9 +290,18 @@
         vpte_t pte;
 
         if (vmm_enabled == 0) {
-                pte = atomic_swap_long(ptep, *ptep & ~(VPTE_RW | VPTE_M));
-                if (pte & VPTE_RW)
-                        pmap_inval_cpu(pmap, va, PAGE_SIZE);
+                for (;;) {
+                        pte = *ptep;
+                        cpu_ccfence();
+                        if ((pte & VPTE_RW) == 0)
+                                break;
+                        if (atomic_cmpset_long(ptep,
+                                               pte,
+                                               pte & ~(VPTE_RW | VPTE_M))) {
+                                pmap_inval_cpu(pmap, va, PAGE_SIZE);
+                                break;
+                        }
+                }
         } else {
                 pte = *ptep & ~(VPTE_RW | VPTE_M);
                 guest_sync_addr(pmap, ptep, &pte);
@@ -335,11 +349,16 @@ pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
         vpte_t pte;
 
         if (vmm_enabled == 0) {
-                pte = atomic_swap_long(ptep, *ptep & ~VPTE_RW);
-                if (pte & VPTE_M)
-                        atomic_set_long(ptep, VPTE_M);
-                if (pte & VPTE_RW)
-                        pmap_inval_cpu(pmap, va, PAGE_SIZE);
+                for (;;) {
+                        pte = *ptep;
+                        cpu_ccfence();
+                        if ((pte & VPTE_RW) == 0)
+                                break;
+                        if (atomic_cmpset_long(ptep, pte, pte & ~VPTE_RW)) {
+                                pmap_inval_cpu(pmap, va, PAGE_SIZE);
+                                break;
+                        }
+                }
         } else {
                 pte = *ptep & ~(VPTE_RW | VPTE_M);
                 guest_sync_addr(pmap, ptep, &pte);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index a749bd36d7..e8a5ac6ac8 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -369,12 +369,12 @@ RetryFault:
                 }
 
                 /*
-                 * If we are user-wiring a r/w segment, and it is COW, then
-                 * we need to do the COW operation.  Note that we don't
+                 * If we are user-wiring a r/w segment, and it is COW, then
+                 * we need to do the COW operation.  Note that we don't
                  * currently COW RO sections now, because it is NOT desirable
-                 * to COW .text.  We simply keep .text from ever being COW'ed
-                 * and take the heat that one cannot debug wired .text sections.
-                 */
+                 * to COW .text.  We simply keep .text from ever being COW'ed
+                 * and take the heat that one cannot debug wired .text sections.
+                 */
                 result = vm_map_lookup(&fs.map, vaddr,
                                        VM_PROT_READ|VM_PROT_WRITE|
                                        VM_PROT_OVERRIDE_WRITE,
@@ -395,8 +395,10 @@ RetryFault:
                  * XXX We have a shared lock, this will have a MP race but
                  * I don't see how it can hurt anything.
                  */
-                if ((fs.entry->protection & VM_PROT_WRITE) == 0)
-                        fs.entry->max_protection &= ~VM_PROT_WRITE;
+                if ((fs.entry->protection & VM_PROT_WRITE) == 0) {
+                        atomic_clear_char(&fs.entry->max_protection,
+                                          VM_PROT_WRITE);
+                }
         }
 
         /*
@@ -746,6 +748,7 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
         struct faultstate fs;
         int result;
         int retry;
+        int growstack;
         vm_prot_t orig_fault_type = fault_type;
 
         retry = 0;
@@ -762,6 +765,8 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
          */
         fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type, busyp);
         if (fs.m) {
+                if (fault_type & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE))
+                        vm_page_dirty(fs.m);
                 *errorp = 0;
                 return(fs.m);
         }
@@ -770,9 +775,10 @@
          * Otherwise take a concurrency hit and do a formal page
          * fault.
          */
+        fs.vp = NULL;
         fs.shared = vm_shared_fault;
         fs.first_shared = vm_shared_fault;
-        fs.vp = NULL;
+        growstack = 1;
         lwkt_gettoken(&map->token);
 
         /*
@@ -805,9 +811,61 @@ RetryFault:
                                &first_pindex, &fs.first_prot, &fs.wired);
 
         if (result != KERN_SUCCESS) {
-                *errorp = result;
-                fs.m = NULL;
-                goto done;
+                if (result == KERN_FAILURE_NOFAULT) {
+                        *errorp = KERN_FAILURE;
+                        fs.m = NULL;
+                        goto done;
+                }
+                if (result != KERN_PROTECTION_FAILURE ||
+                    (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
+                {
+                        if (result == KERN_INVALID_ADDRESS && growstack &&
+                            map != &kernel_map && curproc != NULL) {
+                                result = vm_map_growstack(curproc, vaddr);
+                                if (result == KERN_SUCCESS) {
+                                        growstack = 0;
+                                        ++retry;
+                                        goto RetryFault;
+                                }
+                                result = KERN_FAILURE;
+                        }
+                        fs.m = NULL;
+                        *errorp = result;
+                        goto done;
+                }
+
+                /*
+                 * If we are user-wiring a r/w segment, and it is COW, then
+                 * we need to do the COW operation.  Note that we don't
+                 * currently COW RO sections now, because it is NOT desirable
+                 * to COW .text.  We simply keep .text from ever being COW'ed
+                 * and take the heat that one cannot debug wired .text sections.
+                 */
+                result = vm_map_lookup(&fs.map, vaddr,
+                                       VM_PROT_READ|VM_PROT_WRITE|
+                                       VM_PROT_OVERRIDE_WRITE,
+                                       &fs.entry, &fs.first_object,
+                                       &first_pindex, &fs.first_prot,
+                                       &fs.wired);
+                if (result != KERN_SUCCESS) {
+                        /* could also be KERN_FAILURE_NOFAULT */
+                        *errorp = KERN_FAILURE;
+                        fs.m = NULL;
+                        goto done;
+                }
+
+                /*
+                 * If we don't COW now, on a user wire, the user will never
+                 * be able to write to the mapping.  If we don't make this
+                 * restriction, the bookkeeping would be nearly impossible.
+                 *
+                 * XXX We have a shared lock, this will have a MP race but
+                 * I don't see how it can hurt anything.
+                 */
+                if ((fs.entry->protection & VM_PROT_WRITE) == 0) {
+                        atomic_clear_char(&fs.entry->max_protection,
+                                          VM_PROT_WRITE);
+                }
         }
 
         /*
@@ -1017,7 +1075,8 @@ RetryFault:
          * Unlock everything, and return the held or busied page.
          */
         if (busyp) {
-                if (fault_type & VM_PROT_WRITE) {
+                if (fault_type & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE)) {
+                        vm_page_dirty(fs.m);
                         *busyp = 1;
                 } else {
                         *busyp = 0;
@@ -1245,7 +1304,6 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                 }
                 if ((vpte & VPTE_PS) || vshift == 0)
                         break;
-                KKASSERT(vshift >= VPTE_PAGE_BITS);
 
                 /*
                  * Get the page table page.  Nominally we only read the page
@@ -1285,17 +1343,21 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                 for (;;) {
                         vpte_t nvpte;
 
+                        /*
+                         * Reload for the cmpset, but make sure the pte is
+                         * still valid.
+                         */
                         vpte = *ptep;
                         cpu_ccfence();
                         nvpte = vpte;
 
-                        if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) &&
-                            (vpte & VPTE_RW)) {
+                        if ((vpte & VPTE_V) == 0)
+                                break;
+
+                        if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_RW))
                                 nvpte |= VPTE_M | VPTE_A;
-                        }
-                        if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V)) {
+                        if (fault_type & VM_PROT_READ)
                                 nvpte |= VPTE_A;
-                        }
                         if (vpte == nvpte)
                                 break;
                         if (atomic_cmpset_long(ptep, vpte, nvpte)) {
@@ -1311,20 +1373,16 @@
         }
 
         /*
-         * We can't reconcile the real page table's M bit back to the
-         * virtual page table, so instead we force the real page table pte
-         * to be read-only on a read-fault for a VPTE_RW vpagetable
-         * fault, and set the VPTE_M bit manually (above) for a write-fault
-         * on a VPTE_RW vpagetable fault.
-         *
-         * The virtual kernel must MADV_INVAL areas that it clears the
-         * VPTE_M bit on to ensure that the real kernel is able to take
-         * a write fault to set VPTE_M again.
-         *
-         * If we find the VPTE_M bit already set above, we don't have to
-         * downgrade here.
+         * When the vkernel sets VPTE_RW it expects the real kernel to
+         * reflect VPTE_M back when the page is modified via the mapping.
+         * In order to accomplish this the real kernel must map the page
+         * read-only for read faults and use write faults to reflect VPTE_M
+         * back.
          *
-         * This guarantees that the M bit will be set.
+         * Once VPTE_M has been set, the real kernel's pte allows writing.
+         * If the vkernel clears VPTE_M the vkernel must be sure to
+         * MADV_INVAL the real kernel's mappings to force the real kernel
+         * to re-fault on the next write so it can set VPTE_M again.
          */
         if ((fault_type & VM_PROT_WRITE) == 0 &&
             (vpte & (VPTE_RW | VPTE_M)) != (VPTE_RW | VPTE_M)) {
@@ -2306,8 +2364,8 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
         vm_object_hold(src_object);
         vm_object_hold(dst_object);
         for (vaddr = dst_entry->start, dst_offset = 0;
-             vaddr < dst_entry->end;
-             vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) {
+             vaddr < dst_entry->end;
+             vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) {
 
                 /*
                  * Allocate a page in the destination object
-- 
2.11.4.GIT
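Illustrative sketch (not part of the patch, and not DragonFly kernel code): the pmap_clean_pte()/pmap_setro_pte() changes above replace an unconditional atomic swap with a compare-and-set loop that skips the expensive invalidation entirely when VPTE_RW is already clear.  The stand-alone C11 program below mimics that pattern with user-space atomics; the VPTE_RW/VPTE_M bit values and the invalidate() stub are made up for the demo and do not match the real vpte layout.

/* cmpset_clean_pte.c -- build with: cc -std=c11 -o demo cmpset_clean_pte.c */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define VPTE_RW         0x0000000000000002UL    /* made-up bit positions */
#define VPTE_M          0x0000000000000400UL

static void
invalidate(void)
{
        /* stands in for pmap_inval_cpu() / the MADV_INVAL round trip */
        printf("  (invalidate)\n");
}

/*
 * Clear VPTE_RW and VPTE_M atomically.  If the entry is already
 * read-only there is nothing to downgrade, so the invalidation is
 * skipped entirely -- the optimization the commit message describes
 * for pmap_clean_pte().
 */
static uint64_t
clean_pte(_Atomic uint64_t *ptep)
{
        uint64_t pte;

        for (;;) {
                pte = atomic_load(ptep);
                if ((pte & VPTE_RW) == 0)
                        break;                  /* already read-only */
                if (atomic_compare_exchange_weak(ptep, &pte,
                    pte & ~(VPTE_RW | VPTE_M))) {
                        invalidate();
                        break;
                }
                /* lost a race with another updater, retry */
        }
        return pte;
}

int
main(void)
{
        _Atomic uint64_t pte = 0x1000 | VPTE_RW | VPTE_M;

        printf("before %#jx\n", (uintmax_t)atomic_load(&pte));
        clean_pte(&pte);        /* clears RW|M, invalidates once */
        clean_pte(&pte);        /* no-op, no invalidation */
        printf("after  %#jx\n", (uintmax_t)atomic_load(&pte));
        return 0;
}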
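A second sketch, also hypothetical and user-space only: it models the "poor man's pte lock" idea behind the pmap_enter()/pmap_page_protect() race fix, where the code that installs a writable mapping and the code that revokes write access serialize on the same per-page spin lock, and the writeable flag is only cleared while that lock is held.  The struct vpage type, the flag values, and the function names are invented for the illustration.

/* vpage_spin.c -- build with: cc -std=c11 -pthread -o demo vpage_spin.c */
#include <pthread.h>
#include <stdio.h>

#define PG_MAPPED       0x0001u         /* made-up flag values */
#define PG_WRITEABLE    0x0002u

struct vpage {                          /* stand-in for a vm_page */
        pthread_spinlock_t spin;
        unsigned int    flags;
        int             writable_mappings;
};

static struct vpage page;

/* analogue of pmap_enter() installing a managed, writable mapping */
static void
enter_mapping(struct vpage *m)
{
        pthread_spin_lock(&m->spin);
        m->writable_mappings++;
        m->flags |= PG_MAPPED | PG_WRITEABLE;
        pthread_spin_unlock(&m->spin);
}

/* analogue of pmap_page_protect() downgrading the page to read-only */
static void
protect_page(struct vpage *m)
{
        pthread_spin_lock(&m->spin);
        m->writable_mappings = 0;       /* strip write access ... */
        m->flags &= ~PG_WRITEABLE;      /* ... clearing the flag only
                                         * while the lock is held */
        pthread_spin_unlock(&m->spin);
}

static void *
enter_loop(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++)
                enter_mapping(&page);
        return NULL;
}

int
main(void)
{
        pthread_t t;

        pthread_spin_init(&page.spin, PTHREAD_PROCESS_PRIVATE);
        pthread_create(&t, NULL, enter_loop, NULL);
        for (int i = 0; i < 100000; i++)
                protect_page(&page);
        pthread_join(t, NULL);

        /* The flag and the mapping count can never disagree. */
        printf("PG_WRITEABLE=%d writable_mappings=%d\n",
            (page.flags & PG_WRITEABLE) != 0, page.writable_mappings);
        return 0;
}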