From a7a03a5f23a9828719d348cb3c6c880d28003738 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 30 Dec 2016 12:21:26 -0800 Subject: [PATCH] kernel - Fix swap issue, implement dynamic pmap PT/PD/PDP deletion * The pmap code is now able to dynamically destroy PT, PD, and PDP page table pages when they become empty. To do this we had to recode the higher-level page tables to wire on creation of a lower-level pv_entry instead of wiring on pte entry. DragonFly previously left PD and PDP pages intact, and possibly also PTs, until process exit. In normal operation this had no real impact since most programs don't bloat up enough for the extra page table pages to matter, but its good to finally fix it as it allows the pmap footprint to be significantly reduced in the very few situations where a program bloats and unbloats during operation. * Fix an issue with recent swap changes. We must increase the stripe between multiple swap devices to match the number of entries available on a radix leaf, which increased from 32 to 64. This fixes a pstat -s accounting error that would sometimes attribute swap frees to the wrong device. * Refactor the RSS limiting code to scan the pmap instead of scan the vm_map and related underlying objects. This greatly enhances performance because the underlying objects might have many pages that are not mapped. By scanning the pmap, we avoid having to sift through them all. Also makes use of the dynamic removal feature in the pmap code to restrict the effort required to do the pmap scan, and allows us to avoid most of the issues related to stacked VM objects. --- sys/platform/pc64/x86_64/pmap.c | 475 +++++++++++++++++++++++++++------ sys/platform/vkernel64/platform/pmap.c | 135 ++++++++++ sys/sys/dmap.h | 3 - sys/vm/pmap.h | 20 ++ sys/vm/swap_pager.c | 37 +-- sys/vm/swap_pager.h | 13 +- sys/vm/vm_map.h | 2 +- sys/vm/vm_pageout.c | 369 +++++++------------------ sys/vm/vm_swap.c | 29 +- 9 files changed, 675 insertions(+), 408 deletions(-) diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c index 7b0c9cc322..6647e51e58 100644 --- a/sys/platform/pc64/x86_64/pmap.c +++ b/sys/platform/pc64/x86_64/pmap.c @@ -230,6 +230,9 @@ uint64_t pmap_bits_default[] = { static pt_entry_t *pt_crashdumpmap; static caddr_t crashdumpmap; +static int pmap_debug = 0; +SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW, + &pmap_debug, 0, "Debug pmap's"); #ifdef PMAP_DEBUG2 static int pmap_enter_debug = 0; SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW, @@ -272,14 +275,14 @@ static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp); static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex); static void pv_put(pv_entry_t pv); -static void pv_free(pv_entry_t pv); +static void pv_free(pv_entry_t pv, pv_entry_t pvp, int putaway); static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex); static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp); static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va); static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, - pmap_inval_bulk_t *bulk); + pmap_inval_bulk_t *bulk, int destroy); static vm_page_t pmap_remove_pv_page(pv_entry_t pv); static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk); @@ -1829,7 +1832,8 @@ pmap_puninit(pmap_t pmap) pv_lock(pv); KKASSERT(pv == pmap->pm_pmlpv); p = 
pmap_remove_pv_page(pv); - pv_free(pv); + pv_free(pv, NULL, 1); + pv = NULL; /* safety */ pmap_kremove((vm_offset_t)pmap->pm_pml4); vm_page_busy_wait(p, FALSE, "pgpun"); KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED)); @@ -1872,8 +1876,9 @@ pmap_pinit2(struct pmap *pmap) * * This function returns two locked pv_entry's, one representing the * requested pv and one representing the requested pv's parent pv. If - * the pv did not previously exist it will be mapped into its parent - * and wired, otherwise no additional wire count will be added. + * an intermediate page table does not exist it will be created, mapped, + * wired, and the parent page table will be given an additional hold + * count representing the presence of the child pv_entry. */ static pv_entry_t @@ -1912,14 +1917,16 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) KKASSERT(pvpp == NULL); else KKASSERT(pvpp != NULL); - if (pvpp) { - pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT); - pvp = pmap_allocpte(pmap, pt_pindex, NULL); - if (isnew) - vm_page_wire_quick(pvp->pv_m); - *pvpp = pvp; + pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT); + pvp = pmap_allocpte(pmap, pt_pindex, NULL); + if (isnew) { + vm_page_wire_quick(pvp->pv_m); + if (pvpp) + *pvpp = pvp; + else + pv_put(pvp); } else { - pvp = NULL; + *pvpp = pvp; } return(pv); } @@ -1993,12 +2000,17 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) } /* - * This code is only reached if isnew is TRUE and this is not a - * terminal PV. We need to allocate a vm_page for the page table - * at this level and enter it into the parent page table. + * (isnew) is TRUE, pv is not terminal. + * + * (1) Add a wire count to the parent page table (pvp). + * (2) Allocate a VM page for the page table. + * (3) Enter the VM page into the parent page table. * * page table pages are marked PG_WRITEABLE and PG_MAPPED. */ + if (pvp) + vm_page_wire_quick(pvp->pv_m); + for (;;) { m = vm_page_alloc(NULL, pv->pv_pindex, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | @@ -2020,9 +2032,9 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) vm_page_wire(m); /* wire for mapping in parent */ /* - * Wire the page into pvp, bump the wire-count for pvp's page table - * page. Bump the resident_count for the pmap. There is no pvp - * for the top level, address the pm_pml4[] array directly. + * Wire the page into pvp. Bump the resident_count for the pmap. + * There is no pvp for the top level, address the pm_pml4[] array + * directly. * * If the caller wants the parent we return it, otherwise * we just put it away. 
@@ -2049,15 +2061,13 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) "pg bad wirecount"); } atomic_add_long(&pmap->pm_stats.resident_count, -1); - } else { - vm_page_wire_quick(pvp->pv_m); } *ptep = VM_PAGE_TO_PHYS(m) | - (pmap->pmap_bits[PG_U_IDX] | - pmap->pmap_bits[PG_RW_IDX] | - pmap->pmap_bits[PG_V_IDX] | - pmap->pmap_bits[PG_A_IDX] | - pmap->pmap_bits[PG_M_IDX]); + (pmap->pmap_bits[PG_U_IDX] | + pmap->pmap_bits[PG_RW_IDX] | + pmap->pmap_bits[PG_V_IDX] | + pmap->pmap_bits[PG_A_IDX] | + pmap->pmap_bits[PG_M_IDX]); } vm_page_wakeup(m); notnew: @@ -2237,11 +2247,11 @@ retry: */ pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); npte = VM_PAGE_TO_PHYS(xpv->pv_m) | - (pmap->pmap_bits[PG_U_IDX] | - pmap->pmap_bits[PG_RW_IDX] | - pmap->pmap_bits[PG_V_IDX] | - pmap->pmap_bits[PG_A_IDX] | - pmap->pmap_bits[PG_M_IDX]); + (pmap->pmap_bits[PG_U_IDX] | + pmap->pmap_bits[PG_RW_IDX] | + pmap->pmap_bits[PG_V_IDX] | + pmap->pmap_bits[PG_A_IDX] | + pmap->pmap_bits[PG_M_IDX]); /* * Dispose of previous page table page if it was local to the @@ -2279,8 +2289,8 @@ retry: */ if (*pt == 0) { *pt = npte; - vm_page_wire_quick(xpv->pv_m); - vm_page_wire_quick(proc_pd_pv->pv_m); + vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ + vm_page_wire_quick(proc_pd_pv->pv_m); /* proc pd for sh pt */ atomic_add_long(&pmap->pm_stats.resident_count, 1); } else if (*pt != npte) { opte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, pt, npte); @@ -2291,7 +2301,7 @@ retry: *pt = npte; #endif - vm_page_wire_quick(xpv->pv_m); /* pgtable pg that is npte */ + vm_page_wire_quick(xpv->pv_m); /* shared pt -> proc */ /* * Clean up opte, bump the wire_count for the process @@ -2340,6 +2350,7 @@ retry: struct pmap_release_info { pmap_t pmap; int retry; + pv_entry_t pvp; }; static int pmap_release_callback(pv_entry_t pv, void *data); @@ -2364,10 +2375,15 @@ pmap_release(struct pmap *pmap) info.pmap = pmap; do { info.retry = 0; + info.pvp = NULL; + spin_lock(&pmap->pm_spin); RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, pmap_release_callback, &info); spin_unlock(&pmap->pm_spin); + + if (info.pvp) + pv_put(info.pvp); } while (info.retry); @@ -2381,14 +2397,22 @@ pmap_release(struct pmap *pmap) KKASSERT(pmap->pm_stats.wired_count == 0); } +/* + * Called from low to high. We must cache the proper parent pv so we + * can adjust its wired count. 
+ */ static int pmap_release_callback(pv_entry_t pv, void *data) { struct pmap_release_info *info = data; pmap_t pmap = info->pmap; + vm_pindex_t pindex; int r; - if (pv_hold_try(pv)) { + if (info->pvp == pv) { + spin_unlock(&pmap->pm_spin); + info->pvp = NULL; + } else if (pv_hold_try(pv)) { spin_unlock(&pmap->pm_spin); } else { spin_unlock(&pmap->pm_spin); @@ -2400,7 +2424,60 @@ pmap_release_callback(pv_entry_t pv, void *data) info->retry = 1; return(-1); } - r = pmap_release_pv(pv, NULL, NULL); + + if (pv->pv_pindex < pmap_pt_pindex(0)) { + /* + * parent is PT + */ + pindex = pv->pv_pindex >> NPTEPGSHIFT; + pindex += NUPTE_TOTAL; + } else if (pv->pv_pindex < pmap_pd_pindex(0)) { + /* + * parent is PD + */ + pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT; + pindex += NUPTE_TOTAL + NUPT_TOTAL; + } else if (pv->pv_pindex < pmap_pdp_pindex(0)) { + /* + * parent is PDP + */ + pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >> + NPDPEPGSHIFT; + pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; + } else if (pv->pv_pindex < pmap_pml4_pindex()) { + /* + * parent is PML4 (there's only one) + */ +#if 0 + pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL - + NUPD_TOTAL) >> NPML4EPGSHIFT; + pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL; +#endif + pindex = pmap_pml4_pindex(); + } else { + /* + * parent is NULL + */ + if (info->pvp) { + pv_put(info->pvp); + info->pvp = NULL; + } + pindex = 0; + } + if (pindex) { + if (info->pvp && info->pvp->pv_pindex != pindex) { + pv_put(info->pvp); + info->pvp = NULL; + } + if (info->pvp == NULL) + info->pvp = pv_get(pmap, pindex); + } else { + if (info->pvp) { + pv_put(info->pvp); + info->pvp = NULL; + } + } + r = pmap_release_pv(pv, info->pvp, NULL); spin_lock(&pmap->pm_spin); return(r); } @@ -2425,8 +2502,11 @@ pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) * * This will clean out the pte at any level of the page table. * If smp != 0 all cpus are affected. + * + * Do not tear-down recursively, its faster to just let the + * release run its course. */ - pmap_remove_pv_pte(pv, pvp, bulk); + pmap_remove_pv_pte(pv, pvp, bulk, 0); /* * Terminal pvs are unhooked from their vm_pages. Because @@ -2471,13 +2551,15 @@ pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) vm_page_free(p); skip: - pv_free(pv); + pv_free(pv, pvp, 1); + return 0; } /* * This function will remove the pte associated with a pv from its parent. - * Terminal pv's are supported. All cpus are affected if smp != 0. + * Terminal pv's are supported. All cpus specified by (bulk) are properly + * invalidated. * * The wire count will be dropped on the parent page table. The wire * count on the page being removed (pv->pv_m) from the parent page table @@ -2488,13 +2570,15 @@ skip: * NOTE: Cannot be called on kernel page table pages, only KVM terminal * pages and user page table and terminal pages. * - * The pv must be locked. + * The pv must be locked. The pvp, if supplied, must be locked. All + * supplied pv's will remain locked on return. 
* * XXX must lock parent pv's if they exist to remove pte XXX */ static void -pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) +pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk, + int destroy) { vm_pindex_t ptepindex = pv->pv_pindex; pmap_t pmap = pv->pv_pmap; @@ -2547,8 +2631,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + (pd_index >> NPML4EPGSHIFT); pvp = pv_get(pv->pv_pmap, pdp_pindex); - if (pvp) - gotpvp = 1; + gotpvp = 1; } if (pvp) { pd = pv_pte_lookup(pvp, pd_index & @@ -2647,19 +2730,60 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) if (pte & pmap->pmap_bits[PG_G_IDX]) cpu_invlpg((void *)va); } + KKASSERT(pv->pv_m == p); /* XXX remove me later */ /* - * Unwire the parent page table page. The wire_count cannot go below - * 1 here because the parent page table page is itself still mapped. + * If requested, scrap the underlying pv->pv_m and the underlying + * pv. If this is a page-table-page we must also free the page. * - * XXX remove the assertions later. + * pvp must be returned locked. */ - KKASSERT(pv->pv_m == p); - if (pvp && vm_page_unwire_quick(pvp->pv_m)) - panic("pmap_remove_pv_pte: Insufficient wire_count"); + if (destroy == 1) { + /* + * page table page (PT, PD, PDP, PML4), caller was responsible + * for testing wired_count. + */ + vm_page_t p; - if (gotpvp) - pv_put(pvp); + KKASSERT(pv->pv_m->wire_count == 1); + p = pmap_remove_pv_page(pv); + pv_free(pv, pvp, 1); + pv = NULL; + + KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED)); + vm_page_busy_wait(p, FALSE, "pgpun"); + vm_page_unwire(p, 0); + vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); + vm_page_free(p); + } else if (destroy == 2) { + /* + * Normal page (leave page untouched) + */ + pmap_remove_pv_page(pv); + pv_free(pv, pvp, 1); + pv = NULL; /* safety */ + } + + /* + * If we acquired pvp ourselves then we are responsible for + * recursively deleting it. + */ + if (pvp && gotpvp) { + /* + * Recursively destroy higher-level page tables. + * + * This is optional. If we do not, they will still + * be destroyed when the process exits. + */ + if (pvp->pv_m && + pvp->pv_m->wire_count == 1 && + pvp->pv_pindex != pmap_pml4_pindex()) { + pmap_remove_pv_pte(pvp, NULL, bulk, 1); + pvp = NULL; /* safety */ + } else { + pv_put(pvp); + } + } } /* @@ -2684,6 +2808,7 @@ pmap_remove_pv_page(pv_entry_t pv) if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); vm_page_spin_unlock(m); + return(m); } @@ -3187,7 +3312,7 @@ pv_put(pv_entry_t pv) */ static void -pv_free(pv_entry_t pv) +pv_free(pv_entry_t pv, pv_entry_t pvp, int putaway) { pmap_t pmap; @@ -3209,7 +3334,8 @@ pv_free(pv_entry_t pv) * and do it normally. Drop two refs and the lock all in * one go. 
*/ - if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) { + if (putaway && + atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) { #ifdef PMAP_DEBUG2 if (pmap_enter_debug > 0) { --pmap_enter_debug; @@ -3217,11 +3343,16 @@ pv_free(pv_entry_t pv) } #endif zfree(pvzone, pv); + if (pvp) + vm_page_unwire_quick(pvp->pv_m); return; } pv_drop(pv); /* ref for pv_pmap */ } - pv_put(pv); + if (putaway) + pv_put(pv); + if (pvp) + vm_page_unwire_quick(pvp->pv_m); } /* @@ -3289,6 +3420,7 @@ struct pmap_scan_info { pmap_inval_bulk_t bulk_core; pmap_inval_bulk_t *bulk; int count; + int stop; }; static int pmap_scan_cmp(pv_entry_t pv, void *data); @@ -3306,6 +3438,7 @@ pmap_scan(struct pmap_scan_info *info, int smp_inval) struct pv_entry dummy_pv; int generation; + info->stop = 0; if (pmap == NULL) return; if (smp_inval) { @@ -3517,6 +3650,12 @@ pmap_scan_callback(pv_entry_t pv, void *data) int generation; /* + * Stop if requested + */ + if (info->stop) + return -1; + + /* * Pull the PD pindex from the pv before releasing the spinlock. * * WARNING: pv is faked for kernel pmap scans. @@ -3555,6 +3694,8 @@ pmap_scan_callback(pv_entry_t pv, void *data) pt_pv = NULL; for (; sva < eva; sva = va_next) { + if (info->stop) + break; if (sva >= VM_MAX_USER_ADDRESS) { if (pt_pv) { pv_put(pt_pv); @@ -3584,18 +3725,18 @@ pmap_scan_callback(pv_entry_t pv, void *data) * PT cache */ if (pt_pv == NULL) { - if (pd_pv) { - pv_put(pd_pv); - pd_pv = NULL; - } + vm_page_wire_quick(pd_pv->pv_m); + pv_unlock(pd_pv); pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); + pv_lock(pd_pv); + vm_page_unwire_quick(pd_pv->pv_m); } else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) { - if (pd_pv) { - pv_put(pd_pv); - pd_pv = NULL; - } + vm_page_wire_quick(pd_pv->pv_m); + pv_unlock(pd_pv); pv_put(pt_pv); pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); + pv_lock(pd_pv); + vm_page_unwire_quick(pd_pv->pv_m); } /* @@ -3610,9 +3751,6 @@ pmap_scan_callback(pv_entry_t pv, void *data) * Possible unmanaged (shared from another pmap) * page table page. */ - if (pd_pv == NULL) - pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); - KKASSERT(pd_pv != NULL); ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); if (*ptep & pmap->pmap_bits[PG_V_IDX]) { info->func(pmap, info, NULL, pd_pv, 1, @@ -3663,10 +3801,19 @@ kernel_skip: while (sva < va_next) { /* - * Yield every 64 pages. + * Yield every 64 pages, stop if requested. */ if ((++info->count & 63) == 0) lwkt_user_yield(); + if (info->stop) + break; + + /* + * Check if pt_pv has been lost (probably due to + * a remove of the underlying pages). + */ + if (pt_pv && pt_pv->pv_pmap == NULL) + break; /* * Acquire the related pte_pv, if any. If *ptep == 0 @@ -3686,22 +3833,27 @@ kernel_skip: &error); if (error) { if (pd_pv) { - pv_put(pd_pv); - pd_pv = NULL; + vm_page_wire_quick(pd_pv->pv_m); + pv_unlock(pd_pv); } - pv_put(pt_pv); /* must be non-NULL */ - pt_pv = NULL; + vm_page_wire_quick(pt_pv->pv_m); + pv_unlock(pt_pv);/* must be non-NULL */ pv_lock(pte_pv); /* safe to block now */ pv_put(pte_pv); pte_pv = NULL; - pt_pv = pv_get(pmap, - pmap_pt_pindex(sva)); + pv_lock(pt_pv); + vm_page_unwire_quick(pt_pv->pv_m); + /* * pt_pv reloaded, need new ptep */ KKASSERT(pt_pv != NULL); ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); + if (pd_pv) { + pv_lock(pd_pv); + vm_page_unwire_quick(pd_pv->pv_m); + } continue; } } else { @@ -3728,7 +3880,7 @@ kernel_skip: /* * Ready for the callback. The locked pte_pv (if any) * is consumed by the callback. 
pte_pv will exist if - * the page is managed, and will not exist if it + * the page is managed, and will not exist if it * isn't. */ if (pte_pv) { @@ -3738,8 +3890,23 @@ kernel_skip: "pte_pv %p pm_generation %d/%d", *ptep, oldpte, sva, pte_pv, generation, pmap->pm_generation)); + /* + * We must unlock pd_pv across the callback + * to avoid deadlocks on any recursive + * disposal. Re-check that it still exists + * after re-locking. + */ + if (pd_pv) + pv_unlock(pd_pv); info->func(pmap, info, pte_pv, pt_pv, 0, - sva, ptep, info->arg); + sva, ptep, info->arg); + if (pd_pv) { + pv_lock(pd_pv); + if (pd_pv->pv_pmap == NULL) { + pv_put(pd_pv); + pd_pv = NULL; + } + } } else { /* * Check for insertion race. Since there is no @@ -3770,6 +3937,11 @@ kernel_skip: /* * Didn't race + * + * We must unlock pd_pv across the callback + * to avoid deadlocks on any recursive + * disposal. Re-check that it still exists + * after re-locking. */ KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) == pmap->pmap_bits[PG_V_IDX], @@ -3777,8 +3949,17 @@ kernel_skip: "pte_pv NULL pm_generation %d/%d", *ptep, oldpte, sva, generation, pmap->pm_generation)); + if (pd_pv) + pv_unlock(pd_pv); info->func(pmap, info, NULL, pt_pv, 0, - sva, ptep, info->arg); + sva, ptep, info->arg); + if (pd_pv) { + pv_lock(pd_pv); + if (pd_pv->pv_pmap == NULL) { + pv_put(pd_pv); + pd_pv = NULL; + } + } } pte_pv = NULL; sva += PAGE_SIZE; @@ -3840,10 +4021,24 @@ pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, /* * This will also drop pt_pv's wire_count. Note that * terminal pages are not wired based on mmu presence. + * + * NOTE: If this is the kernel_pmap, pt_pv can be NULL. + */ + pmap_remove_pv_pte(pte_pv, pt_pv, info->bulk, 2); + pte_pv = NULL; /* safety */ + + /* + * Recursively destroy higher-level page tables. + * + * This is optional. If we do not, they will still + * be destroyed when the process exits. */ - pmap_remove_pv_pte(pte_pv, pt_pv, info->bulk); - pmap_remove_pv_page(pte_pv); - pv_free(pte_pv); + if (pt_pv && pt_pv->pv_m && pt_pv->pv_m->wire_count == 1 && + pt_pv->pv_pindex != pmap_pml4_pindex()) { + pv_hold(pt_pv); + pmap_remove_pv_pte(pt_pv, NULL, info->bulk, 1); + pv_lock(pt_pv); + } } else if (sharept == 0) { /* * Unmanaged page table (pt, pd, or pdp. Not pte). @@ -3921,10 +4116,13 @@ pmap_remove_all(vm_page_t m) * Holding no spinlocks, pv is locked. */ pmap_inval_bulk_init(&bulk, pv->pv_pmap); - pmap_remove_pv_pte(pv, NULL, &bulk); + pmap_remove_pv_pte(pv, NULL, &bulk, 2); + pv = NULL; /* safety */ pmap_inval_bulk_flush(&bulk); +#if 0 pmap_remove_pv_page(pv); - pv_free(pv); + pv_free(pv, 1); +#endif vm_page_spin_lock(m); } KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); @@ -3932,6 +4130,51 @@ pmap_remove_all(vm_page_t m) } /* + * Removes the page from a particular pmap + */ +void +pmap_remove_specific(pmap_t pmap, vm_page_t m) +{ + pv_entry_t pv; + pmap_inval_bulk_t bulk; + + if (!pmap_initialized) + return; + +again: + vm_page_spin_lock(m); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (pv->pv_pmap != pmap) + continue; + KKASSERT(pv->pv_m == m); + if (pv_hold_try(pv)) { + vm_page_spin_unlock(m); + } else { + vm_page_spin_unlock(m); + pv_lock(pv); + } + if (pv->pv_m != m) { + pv_put(pv); + goto again; + } + + /* + * Holding no spinlocks, pv is locked. 
+ */ + pmap_inval_bulk_init(&bulk, pv->pv_pmap); + pmap_remove_pv_pte(pv, NULL, &bulk, 2); + pv = NULL; /* safety */ + pmap_inval_bulk_flush(&bulk); +#if 0 + pmap_remove_pv_page(pv); + pv_free(pv, 1); +#endif + goto again; + } + vm_page_spin_unlock(m); +} + +/* * Set the physical protection on the specified range of this map * as requested. This function is typically only used for debug watchpoints * and COW pages. @@ -4194,22 +4437,16 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, if (opa) { if (pte_pv) { /* - * pmap_remove_pv_pte() unwires pt_pv and assumes - * we will free pte_pv, but since we are reusing - * pte_pv we want to retain the wire count. - * * pt_pv won't exist for a kernel page (managed or * otherwise). */ - if (pt_pv) - vm_page_wire_quick(pt_pv->pv_m); if (prot & VM_PROT_NOSYNC) { - pmap_remove_pv_pte(pte_pv, pt_pv, NULL); + pmap_remove_pv_pte(pte_pv, pt_pv, NULL, 0); } else { pmap_inval_bulk_t bulk; pmap_inval_bulk_init(&bulk, pmap); - pmap_remove_pv_pte(pte_pv, pt_pv, &bulk); + pmap_remove_pv_pte(pte_pv, pt_pv, &bulk, 0); pmap_inval_bulk_flush(&bulk); } if (pte_pv->pv_m) @@ -5342,3 +5579,67 @@ pmap_object_free(vm_object_t object) kfree(pmap, M_OBJPMAP); } } + +/* + * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related + * VM page and issue a pginfo->callback. + * + * We are expected to dispose of any non-NULL pte_pv. + */ +static +void +pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info, + pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, + vm_offset_t va, pt_entry_t *ptep, void *arg) +{ + struct pmap_pgscan_info *pginfo = arg; + vm_page_t m; + + if (pte_pv) { + /* + * Try to busy the page while we hold the pte_pv locked. + */ + m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME); + if (vm_page_busy_try(m, TRUE) == 0) { + if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) { + /* + * The callback is issued with the pte_pv + * unlocked and put away, and the pt_pv + * unlocked. 
+ */ + pv_put(pte_pv); + if (pt_pv) + pv_unlock(pt_pv); + if (pginfo->callback(pginfo, va, m) < 0) + info->stop = 1; + if (pt_pv) + pv_lock(pt_pv); + } else { + pv_put(pte_pv); + } + } else { + ++pginfo->busycount; + pv_put(pte_pv); + } + } else if (sharept) { + /* shared page table */ + } else { + /* else unmanaged page */ + } +} + +void +pmap_pgscan(struct pmap_pgscan_info *pginfo) +{ + struct pmap_scan_info info; + + pginfo->offset = pginfo->beg_addr; + info.pmap = pginfo->pmap; + info.sva = pginfo->beg_addr; + info.eva = pginfo->end_addr; + info.func = pmap_pgscan_callback; + info.arg = pginfo; + pmap_scan(&info, 0); + if (info.stop == 0) + pginfo->offset = pginfo->end_addr; +} diff --git a/sys/platform/vkernel64/platform/pmap.c b/sys/platform/vkernel64/platform/pmap.c index f2df3b6409..9a2fee1b1d 100644 --- a/sys/platform/vkernel64/platform/pmap.c +++ b/sys/platform/vkernel64/platform/pmap.c @@ -2232,6 +2232,59 @@ pmap_remove_all(vm_page_t m) } /* + * Removes the page from a particular pmap + */ +void +pmap_remove_specific(pmap_t pmap, vm_page_t m) +{ + pt_entry_t *pte, tpte; + pv_entry_t pv; + + lwkt_gettoken(&vm_token); +again: + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (pv->pv_pmap != pmap) + continue; + + KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0); + --pv->pv_pmap->pm_stats.resident_count; + + pte = pmap_pte(pv->pv_pmap, pv->pv_va); + KKASSERT(pte != NULL); + + tpte = pmap_inval_loadandclear(pte, pv->pv_pmap, pv->pv_va); + if (tpte & VPTE_WIRED) + pv->pv_pmap->pm_stats.wired_count--; + KKASSERT(pv->pv_pmap->pm_stats.wired_count >= 0); + + if (tpte & VPTE_A) + vm_page_flag_set(m, PG_REFERENCED); + + /* + * Update the vm_page_t clean and reference bits. + */ + if (tpte & VPTE_M) { + if (pmap_track_modified(pv->pv_pmap, pv->pv_va)) + vm_page_dirty(m); + } + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); + ++pv->pv_pmap->pm_generation; + m->md.pv_list_count--; + atomic_add_int(&m->object->agg_pv_list_count, -1); + KKASSERT(m->md.pv_list_count >= 0); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); + vm_object_hold(pv->pv_pmap->pm_pteobj); + pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); + vm_object_drop(pv->pv_pmap->pm_pteobj); + free_pv_entry(pv); + goto again; + } + lwkt_reltoken(&vm_token); +} + +/* * Set the physical protection on the specified range of this map * as requested. * @@ -3476,3 +3529,85 @@ pmap_object_free(vm_object_t object) { /* empty */ } + +void +pmap_pgscan(struct pmap_pgscan_info *pginfo) +{ + pmap_t pmap = pginfo->pmap; + vm_offset_t sva = pginfo->beg_addr; + vm_offset_t eva = pginfo->end_addr; + vm_offset_t va_next; + pml4_entry_t *pml4e; + pdp_entry_t *pdpe; + pd_entry_t ptpaddr, *pde; + pt_entry_t *pte; + int stop = 0; + + lwkt_gettoken(&vm_token); + + for (; sva < eva; sva = va_next) { + if (stop) + break; + + pml4e = pmap_pml4e(pmap, sva); + if ((*pml4e & VPTE_V) == 0) { + va_next = (sva + NBPML4) & ~PML4MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + pdpe = pmap_pml4e_to_pdpe(pml4e, sva); + if ((*pdpe & VPTE_V) == 0) { + va_next = (sva + NBPDP) & ~PDPMASK; + if (va_next < sva) + va_next = eva; + continue; + } + + va_next = (sva + NBPDR) & ~PDRMASK; + if (va_next < sva) + va_next = eva; + + pde = pmap_pdpe_to_pde(pdpe, sva); + ptpaddr = *pde; + + /* + * Check for large page (ignore). 
+ */ + if ((ptpaddr & VPTE_PS) != 0) { +#if 0 + pmap_clean_pde(pde, pmap, sva); + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; +#endif + continue; + } + + /* + * Weed out invalid mappings. Note: we assume that the page + * directory table is always allocated, and in kernel virtual. + */ + if (ptpaddr == 0) + continue; + + if (va_next > eva) + va_next = eva; + + for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + vm_page_t m; + + if (stop) + break; + if ((*pte & VPTE_MANAGED) == 0) + continue; + + m = PHYS_TO_VM_PAGE(*pte & VPTE_FRAME); + if (vm_page_busy_try(m, TRUE) == 0) { + if (pginfo->callback(pginfo, sva, m) < 0) + stop = 1; + } + } + } + lwkt_reltoken(&vm_token); +} diff --git a/sys/sys/dmap.h b/sys/sys/dmap.h index 28c3d984a3..da2119d2e1 100644 --- a/sys/sys/dmap.h +++ b/sys/sys/dmap.h @@ -51,9 +51,6 @@ struct dmap { swblk_t dm_alloc; /* amount of physical swap space allocated */ swblk_t dm_map[NDMAP]; /* first disk block number in each chunk */ }; -#ifdef _KERNEL -extern int dmmax; -#endif /* The following structure is that ``returned'' from a call to vstodb(). */ struct dblock { diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h index 7d534c76c2..b0b2d14385 100644 --- a/sys/vm/pmap.h +++ b/sys/vm/pmap.h @@ -94,6 +94,20 @@ struct vmspace; struct vmspace_entry; struct vm_map_entry; +struct pmap_pgscan_info { + struct pmap *pmap; + vm_offset_t beg_addr; + vm_offset_t end_addr; + vm_offset_t offset; + vm_pindex_t limit; + vm_pindex_t busycount; + vm_pindex_t cleancount; + vm_pindex_t actioncount; + int (*callback)(struct pmap_pgscan_info *, + vm_offset_t va, + struct vm_page *); +}; + /* * Most of these variables represent parameters set up by low level MD kernel * boot code to be used by higher level MI initialization code to identify @@ -123,6 +137,11 @@ extern vm_offset_t virtual2_end; extern vm_paddr_t phys_avail[]; /* + * High-level pmap scan + */ +void pmap_pgscan(struct pmap_pgscan_info *info); + +/* * Return true if the passed address is in the kernel address space. * This is mainly a check that the address is NOT in the user address space. * @@ -170,6 +189,7 @@ void pmap_puninit (pmap_t); void pmap_pinit0 (pmap_t); void pmap_pinit2 (pmap_t); void pmap_protect (pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); +void pmap_remove_specific (pmap_t, vm_page_t); void pmap_qenter (vm_offset_t, struct vm_page **, int); void pmap_qremove (vm_offset_t, int); void pmap_qremove_quick (vm_offset_t, int); diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 8597b70da7..5ff09eae41 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -178,7 +178,7 @@ extern struct vnode *swapdev_vp; extern struct swdevt *swdevt; extern int nswdev; -#define BLK2DEVIDX(blk) (nswdev > 1 ? blk / dmmax % nswdev : 0) +#define BLK2DEVIDX(blk) (nswdev > 1 ? blk / SWB_DMMAX % nswdev : 0) SYSCTL_INT(_vm, OID_AUTO, swap_async_max, CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops"); @@ -267,15 +267,13 @@ struct pagerops swappagerops = { }; /* - * dmmax is in page-sized chunks with the new swap system. It was - * dev-bsized chunks in the old. dmmax is always a power of 2. + * SWB_DMMAX is in page-sized chunks with the new swap system. It was + * dev-bsized chunks in the old. SWB_DMMAX is always a power of 2. * * swap_*() routines are externally accessible. swp_*() routines are * internal. 
*/ -int dmmax; -static int dmmax_mask; int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ @@ -340,11 +338,6 @@ swp_sizecheck(void) static void swap_pager_init(void *arg __unused) { - /* - * Device Stripe, in PAGE_SIZE'd blocks - */ - dmmax = SWB_NPAGES * 2; - dmmax_mask = ~(dmmax - 1); } SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL); @@ -540,9 +533,6 @@ swp_pager_getswapspace(vm_object_t object, int npages) * Note: This routine may not block (it could in the old swap code), * and through the use of the new blist routines it does not block. * - * We must be called at splvm() to avoid races with bitmap frees from - * vm_page_remove() aka swap_pager_page_removed(). - * * This routine may not block. */ @@ -1032,7 +1022,7 @@ swap_pager_strategy(vm_object_t object, struct bio *bio) */ if ( biox && (biox_blkno + btoc(bufx->b_bcount) != blk || - ((biox_blkno ^ blk) & dmmax_mask) + ((biox_blkno ^ blk) & ~SWB_DMMASK) ) ) { if (bp->b_cmd == BUF_CMD_READ) { @@ -1060,9 +1050,7 @@ swap_pager_strategy(vm_object_t object, struct bio *bio) */ if (blk == SWAPBLK_NONE) { /* - * We can only get here if we are reading. Since - * we are at splvm() we can safely modify b_resid, - * even if chain ops are in progress. + * We can only get here if we are reading. */ bzero(data, PAGE_SIZE); bp->b_resid -= PAGE_SIZE; @@ -1317,7 +1305,7 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess) iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0); if (iblk != blk + i) break; - if ((blk ^ iblk) & dmmax_mask) + if ((blk ^ iblk) & ~SWB_DMMASK) break; m = vm_page_lookup_busy_try(object, mreq->pindex + i, TRUE, &error); @@ -1615,11 +1603,10 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, /* * The I/O we are constructing cannot cross a physical - * disk boundry in the swap stripe. Note: we are still - * at splvm(). + * disk boundry in the swap stripe. */ - if ((blk ^ (blk + n)) & dmmax_mask) { - j = ((blk + dmmax) & dmmax_mask) - blk; + if ((blk ^ (blk + n)) & ~SWB_DMMASK) { + j = ((blk + SWB_DMMAX) & ~SWB_DMMASK) - blk; swp_pager_freeswapspace(object, blk + j, n - j); n = j; } @@ -1770,7 +1757,7 @@ swp_pager_async_iodone(struct bio *bio) } /* - * set object, raise to splvm(). + * set object. */ if (bp->b_xio.xio_npages) object = bp->b_xio.xio_pages[0]->object; @@ -2122,9 +2109,7 @@ swp_pager_swapoff_callback(struct swblock *swap, void *data) ************************************************************************ * * These routines manipulate the swap metadata stored in the - * OBJT_SWAP object. All swp_*() routines must be called at - * splvm() because swap can be freed up by the low level vm_page - * code which might be called from interrupts beyond what splbio() covers. + * OBJT_SWAP object. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index 6ad21d68f6..3bb1414a44 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -54,13 +54,20 @@ #endif /* - * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, or 16 - * pages per allocation. We recommend you stick with the default of 8. - * The 16-page limit is due to the radix code (kern/subr_blist.c). + * SWB_NPAGES must be a power of 2. Note that DMMAX may not exceed + * SWBLK_BITS, so the limit for SWB_NPAGES is (SWBLK_BITS / 2). 
*/ #define SWB_NPAGES 16 /* + * DMMAX is the stripe size and must be a power of 2 >= SWBLK_BITS to ensure + * that the blist code does not allocate a contiguous range that crosses a + * stripe. + */ +#define SWB_DMMAX SWBLK_BITS +#define SWB_DMMASK (SWB_DMMAX - 1) + +/* * Piecemeal swap metadata structure. Swap is stored in a RBTREE. Swap * blocks are page-sized. e.g. block 1 is offset +4096 from block 0. * diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index 8ebdea6780..44b5b1a7b6 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -274,7 +274,7 @@ struct vm_map { u_int president_cache; /* Remember president count */ u_int president_ticks; /* Save ticks for cache */ struct lwkt_token token; /* Soft serializer */ - vm_ooffset_t pgout_offset; /* for RLIMIT_RSS scans */ + vm_offset_t pgout_offset; /* for RLIMIT_RSS scans */ #define min_offset header.start #define max_offset header.end }; diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 6d8694b7b4..1b1fc1ad70 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -200,9 +200,6 @@ SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, int vm_page_max_wired; /* XXX max # of wired pages system-wide */ #if !defined(NO_SWAPPING) -static vm_pindex_t vm_pageout_object_deactivate_pages(vm_map_t map, - vm_object_t object, vm_pindex_t limit, - vm_pindex_t obj_beg, vm_pindex_t obj_end); static void vm_req_vmdaemon (void); #endif static void vm_pageout_page_stats(int q); @@ -528,136 +525,14 @@ vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) #if !defined(NO_SWAPPING) /* - * Deactivate pages until the map RSS falls below the specified limit. - * - * This code is part of the process rlimit and vm_daemon handler and not - * part of the normal demand-paging code. We only check the top-level - * object. - * - * The map must be locked. - * The caller must hold the vm_object. - */ -static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *); -static int vm_pageout_object_deactivate_pages_cmp(vm_page_t, void *); - -static vm_pindex_t -vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object, - vm_pindex_t limit, - vm_pindex_t obj_beg, - vm_pindex_t obj_end) -{ - struct rb_vm_page_scan_info info; - int remove_mode; - - ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); - - info.count = 0; - info.backing_offset_index = obj_beg; - info.backing_object = object; - - for (;;) { - vm_pindex_t advance; - - if (pmap_resident_tlnw_count(vm_map_pmap(map)) <= limit) - break; - if (object->type == OBJT_DEVICE || - object->type == OBJT_MGTDEVICE || - object->type == OBJT_PHYS) { - break; - } -#if 0 - if (object->paging_in_progress) - break; -#endif - - remove_mode = 0; - if (object->shadow_count > 1) - remove_mode = 1; - - /* - * scan the objects entire memory queue. We hold the - * object's token so the scan should not race anything. - * - * The callback will adjust backing_offset_index past the - * last index scanned. This value only matters if we - * terminate early. - */ - info.limit = remove_mode; - info.map = map; - info.desired = limit; - info.start_pindex = obj_beg; - info.end_pindex = obj_end; - info.object = object; - - vm_page_rb_tree_RB_SCAN(&object->rb_memq, - vm_pageout_object_deactivate_pages_cmp, - vm_pageout_object_deactivate_pages_callback, - &info); - - /* - * Backing object recursion (we will loop up). 
- */ - while ((object = info.object->backing_object) != NULL) { - vm_object_hold(object); - if (object != info.object->backing_object) { - vm_object_drop(object); - continue; - } - break; - } - if (object == NULL) { - if (info.object != info.backing_object) - vm_object_drop(info.object); - break; - } - advance = OFF_TO_IDX(info.object->backing_object_offset); - info.start_pindex += advance; - info.end_pindex += advance; - info.backing_offset_index += advance; - if (info.object != info.backing_object) { - vm_object_lock_swap(); - vm_object_drop(info.object); - } - info.object = object; - } - - /* - * Return how far we want the caller to advance. The caller will - * ignore this value and use obj_end if the RSS limit is still not - * satisfied. - */ - return (info.backing_offset_index - info.start_pindex); -} - -/* - * Only page indices above start_pindex + * Callback function, page busied for us. We must dispose of the busy + * condition. Any related pmap pages may be held but will not be locked. */ static int -vm_pageout_object_deactivate_pages_cmp(vm_page_t p, void *data) -{ - struct rb_vm_page_scan_info *info = data; - - if (p->pindex < info->start_pindex) - return -1; - if (p->pindex >= info->end_pindex) - return +1; - return 0; -} - -/* - * The caller must hold the vm_object. - * - * info->count is bumped for every page removed from the process pmap. - * - * info->backing_offset_index is updated past the last scanned page. - * This value will be ignored and the scan forced to the mapent boundary - * by the caller if the resident count remains too high. - */ -static int -vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data) +vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, + vm_page_t p) { - struct rb_vm_page_scan_info *info = data; int actcount; int cleanit = 0; @@ -666,30 +541,24 @@ vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data) * once the RSS is below the required level. */ KKASSERT((p->flags & PG_MARKER) == 0); - if (pmap_resident_tlnw_count(vm_map_pmap(info->map)) <= - info->desired) { + if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { + vm_page_wakeup(p); return(-1); } mycpu->gd_cnt.v_pdpages++; - info->backing_offset_index = p->pindex + 1; - if (vm_page_busy_try(p, TRUE)) - return(0); - - if (p->object != info->object) { - vm_page_wakeup(p); - return(0); - } if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) { vm_page_wakeup(p); goto done; } - if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) { - vm_page_wakeup(p); - goto done; - } + ++info->actioncount; + + /* + * Check if the page has been referened recently. If it has, + * activate it and skip. 
+ */ actcount = pmap_ts_referenced(p); if (actcount) { vm_page_flag_set(p, PG_REFERENCED); @@ -697,59 +566,52 @@ vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data) actcount = 1; } - vm_page_and_queue_spin_lock(p); - if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) { - vm_page_and_queue_spin_unlock(p); - vm_page_activate(p); - p->act_count += actcount; - vm_page_flag_clear(p, PG_REFERENCED); - } else if (p->queue - p->pc == PQ_ACTIVE) { - if ((p->flags & PG_REFERENCED) == 0) { - /* use ACT_ADVANCE for a faster decline */ - p->act_count -= min(p->act_count, ACT_ADVANCE); - if (!info->limit && - (vm_pageout_algorithm || (p->act_count == 0))) { + if (actcount) { + if (p->queue - p->pc != PQ_ACTIVE) { + vm_page_and_queue_spin_lock(p); + if (p->queue - p->pc != PQ_ACTIVE) { vm_page_and_queue_spin_unlock(p); - vm_page_deactivate(p); - cleanit = 1; + vm_page_activate(p); } else { - TAILQ_REMOVE(&vm_page_queues[p->queue].pl, - p, pageq); - TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl, - p, pageq); vm_page_and_queue_spin_unlock(p); } } else { - vm_page_and_queue_spin_unlock(p); - vm_page_activate(p); - vm_page_flag_clear(p, PG_REFERENCED); + p->act_count += actcount; + if (p->act_count > ACT_MAX) + p->act_count = ACT_MAX; + } + vm_page_flag_clear(p, PG_REFERENCED); + vm_page_wakeup(p); + goto done; + } - vm_page_and_queue_spin_lock(p); - if (p->queue - p->pc == PQ_ACTIVE) { - if (p->act_count < (ACT_MAX - ACT_ADVANCE)) - p->act_count += ACT_ADVANCE; - TAILQ_REMOVE(&vm_page_queues[p->queue].pl, - p, pageq); - TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl, - p, pageq); - } - vm_page_and_queue_spin_unlock(p); + /* + * Remove the page from this particular pmap. Once we do this, our + * pmap scans will not see it again (unless it gets faulted in), so + * we must actively dispose of or deal with the page. + */ + pmap_remove_specific(info->pmap, p); + + /* + * If the page is not mapped to another process (i.e. as would be + * typical if this were a shared page from a library) then deactivate + * the page and clean it in two passes only. + * + * If the page hasn't been referenced since the last check, remove it + * from the pmap. If it is no longer mapped, deactivate it + * immediately, accelerating the normal decline. + * + * Once the page has been removed from the pmap the RSS code no + * longer tracks it so we have to make sure that it is staged for + * potential flush action. 
+ */ + if ((p->flags & PG_MAPPED) == 0) { + if (p->queue - p->pc == PQ_ACTIVE) { + vm_page_deactivate(p); } - } else if (p->queue - p->pc == PQ_INACTIVE) { -#if 0 - TAILQ_REMOVE(&vm_page_queues[p->queue].pl, - p, pageq); - TAILQ_INSERT_HEAD(&vm_page_queues[p->queue].pl, - p, pageq); -#endif - /* use ACT_ADVANCE for a faster decline */ - p->act_count -= min(p->act_count, ACT_ADVANCE); - vm_page_and_queue_spin_unlock(p); - if (p->act_count == 0) { + if (p->queue - p->pc == PQ_INACTIVE) { cleanit = 1; } - } else { - vm_page_and_queue_spin_unlock(p); } /* @@ -768,120 +630,78 @@ vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data) int vmflush_flags; struct vnode *vpfailed = NULL; - vmflush_flags = VM_PAGER_TRY_TO_CACHE | VM_PAGER_ALLOW_ACTIVE; - if (swap_user_async == 0) - vmflush_flags |= VM_PAGER_PUT_SYNC; + info->offset = va; - if (vm_pageout_memuse_mode >= 1) - vm_page_protect(p, VM_PROT_NONE); if (vm_pageout_memuse_mode >= 2) { + vmflush_flags = VM_PAGER_TRY_TO_CACHE | + VM_PAGER_ALLOW_ACTIVE; + if (swap_user_async == 0) + vmflush_flags |= VM_PAGER_PUT_SYNC; vm_page_flag_set(p, PG_WINATCFLS); - info->count += vm_pageout_page(p, &max_launder, - &vnodes_skipped, - &vpfailed, 1, vmflush_flags); + info->cleancount += + vm_pageout_page(p, &max_launder, + &vnodes_skipped, + &vpfailed, 1, vmflush_flags); } else { - ++info->count; vm_page_wakeup(p); + ++info->cleancount; } } else { vm_page_wakeup(p); } - done: lwkt_user_yield(); - return(0); + return 0; } /* * Deactivate some number of pages in a map due to set RLIMIT_RSS limits. - * that is relatively difficult to do. + * that is relatively difficult to do. We try to keep track of where we + * left off last time to reduce scan overhead. * * Called when vm_pageout_memuse_mode is >= 1. */ void vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit) { - vm_map_entry_t tmpe; - vm_object_t obj; - vm_ooffset_t pgout_offset; - vm_ooffset_t tmpe_end; - vm_pindex_t obj_beg; - vm_pindex_t obj_end; - vm_pindex_t count; + vm_offset_t pgout_offset; + struct pmap_pgscan_info info; int retries = 3; - lockmgr(&map->lock, LK_EXCLUSIVE); - - /* - * Scan the map incrementally. - */ pgout_offset = map->pgout_offset; again: - tmpe = map->header.next; - obj_beg = 0; - obj_end = 0; - tmpe_end = 0; - obj = NULL; - - while (tmpe != &map->header) { - if (tmpe->end <= pgout_offset) { - tmpe = tmpe->next; - continue; - } - if (tmpe->maptype == VM_MAPTYPE_NORMAL || - tmpe->maptype == VM_MAPTYPE_VPAGETABLE) { - obj = tmpe->object.vm_object; - if (obj && obj->shadow_count <= 1) { - if (pgout_offset < tmpe->start) { - obj_beg = tmpe->offset >> PAGE_SHIFT; - obj_end = ((tmpe->end - tmpe->start) + - tmpe->offset) >> PAGE_SHIFT; - } else { - obj_beg = (pgout_offset - tmpe->start + - tmpe->offset) >> PAGE_SHIFT; - obj_end = (tmpe->end - tmpe->start + - tmpe->offset) >> PAGE_SHIFT; - } - tmpe_end = tmpe->end; - break; - } - obj = NULL; - } - tmpe = tmpe->next; - } - - /* - * Attempt to continue where we left off until the RLIMIT is - * satisfied or we run out of retries. Note that the map remains - * locked, so the program is not going to be taking any faults - * while we are doing this. - * - * Only circle around in this particular function when the - * memuse_mode is >= 2. 
- */ - if (obj) { - vm_object_hold(obj); - count = vm_pageout_object_deactivate_pages(map, obj, limit, - obj_beg, obj_end); - vm_object_drop(obj); - if (pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { - pgout_offset = tmpe_end; - goto again; - } - pgout_offset += count << PAGE_SHIFT; - } else { +#if 0 + kprintf("%016jx ", pgout_offset); +#endif + if (pgout_offset < VM_MIN_USER_ADDRESS) + pgout_offset = VM_MIN_USER_ADDRESS; + if (pgout_offset >= VM_MAX_USER_ADDRESS) pgout_offset = 0; - if (pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { - if (retries && vm_pageout_memuse_mode >= 2) { - --retries; - goto again; - } - } - } + info.pmap = vm_map_pmap(map); + info.limit = limit; + info.beg_addr = pgout_offset; + info.end_addr = VM_MAX_USER_ADDRESS; + info.callback = vm_pageout_mdp_callback; + info.cleancount = 0; + info.actioncount = 0; + info.busycount = 0; + + pmap_pgscan(&info); + pgout_offset = info.offset; +#if 0 + kprintf("%016jx %08lx %08lx\n", pgout_offset, + info.cleancount, info.actioncount); +#endif + if (pgout_offset != VM_MAX_USER_ADDRESS && + pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { + goto again; + } else if (retries && + pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { + --retries; + goto again; + } map->pgout_offset = pgout_offset; - - vm_map_unlock(map); } #endif @@ -2399,7 +2219,8 @@ vm_daemon_callback(struct proc *p, void *data __unused) vm = p->p_vmspace; vmspace_hold(vm); size = pmap_resident_tlnw_count(&vm->vm_pmap); - if (limit >= 0 && size >= limit && vm_pageout_memuse_mode >= 1) { + if (limit >= 0 && size > 4096 && + size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { vm_pageout_map_deactivate_pages(&vm->vm_map, limit); } vmspace_drop(vm); diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c index cab7c13865..c4c8fe0db1 100644 --- a/sys/vm/vm_swap.c +++ b/sys/vm/vm_swap.c @@ -107,17 +107,17 @@ swapdev_strategy(struct vop_strategy_args *ap) */ nbio = push_bio(bio); if (nswdev > 1) { - off = blkno % dmmax; - if (off + sz > dmmax) { + off = blkno % SWB_DMMAX; + if (off + sz > SWB_DMMAX) { bp->b_error = EINVAL; bp->b_flags |= B_ERROR; biodone(bio); return 0; } - seg = blkno / dmmax; + seg = blkno / SWB_DMMAX; index = seg % nswdev; seg /= nswdev; - nbio->bio_offset = (off_t)(seg * dmmax + off) << PAGE_SHIFT; + nbio->bio_offset = (off_t)(seg * SWB_DMMAX + off) << PAGE_SHIFT; } else { index = 0; nbio->bio_offset = bio->bio_offset; @@ -237,7 +237,7 @@ sys_swapon(struct swapon_args *uap) /* * Swfree(index) frees the index'th portion of the swap map. * Each of the nswdev devices provides 1/nswdev'th of the swap - * space, which is laid out with blocks of dmmax pages circularly + * space, which is laid out with blocks of SWB_DMMAX pages circularly * among the devices. * * The new swap code uses page-sized blocks. The old swap code used @@ -354,11 +354,12 @@ swaponvp(struct thread *td, struct vnode *vp, u_quad_t nblks) sp->sw_nused = 0; /* - * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not + * nblks, nswap, and SWB_DMMAX are PAGE_SIZE'd parameters now, not * DEV_BSIZE'd. aligned_nblks is used to calculate the * size of the swap bitmap, taking into account the stripe size. 
*/ - aligned_nblks = (swblk_t)((nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1)); + aligned_nblks = (swblk_t)((nblks + SWB_DMMASK) & + ~(u_swblk_t)SWB_DMMASK); sp->sw_nblks = aligned_nblks; if (aligned_nblks * nswdev > nswap) @@ -369,9 +370,9 @@ swaponvp(struct thread *td, struct vnode *vp, u_quad_t nblks) else blist_resize(&swapblist, nswap, 0); - for (dvbase = dmmax; dvbase < aligned_nblks; dvbase += dmmax) { - blk = min(aligned_nblks - dvbase, dmmax); - vsbase = index * dmmax + dvbase * nswdev; + for (dvbase = SWB_DMMAX; dvbase < aligned_nblks; dvbase += SWB_DMMAX) { + blk = min(aligned_nblks - dvbase, SWB_DMMAX); + vsbase = index * SWB_DMMAX + dvbase * nswdev; blist_free(swapblist, vsbase, blk); vm_swap_size += blk; vm_swap_max += blk; @@ -532,9 +533,9 @@ swapoff_one(int index) * Prevent further allocations on this device */ sp->sw_flags |= SW_CLOSING; - for (dvbase = dmmax; dvbase < aligned_nblks; dvbase += dmmax) { - blk = min(aligned_nblks - dvbase, dmmax); - vsbase = index * dmmax + dvbase * nswdev; + for (dvbase = SWB_DMMAX; dvbase < aligned_nblks; dvbase += SWB_DMMAX) { + blk = min(aligned_nblks - dvbase, SWB_DMMAX); + vsbase = index * SWB_DMMAX + dvbase * nswdev; vm_swap_size -= blist_fill(swapblist, vsbase, blk); vm_swap_max -= blk; } @@ -591,7 +592,7 @@ swapacctspace(swblk_t base, swblk_t count) swblk_t seg; vm_swap_size += count; - seg = base / dmmax; + seg = base / SWB_DMMAX; index = seg % nswdev; swdevt[index].sw_nused -= count; } -- 2.11.4.GIT
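
A minimal userland sketch of the swap-stripe interleave that this patch converts from the old dmmax globals to the compile-time SWB_DMMAX/SWB_DMMASK constants. It mirrors the arithmetic in swapdev_strategy() and BLK2DEVIDX(); the 64-page stripe (standing in for SWBLK_BITS), PAGE_SHIFT value, and two-device configuration are illustrative values, not kernel state.

/*
 * Sketch of the per-device interleave math used by swapdev_strategy()
 * and BLK2DEVIDX() with the new SWB_DMMAX stripe.  Illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

#define SWB_DMMAX	64		/* stripe size in pages (== SWBLK_BITS) */
#define SWB_DMMASK	(SWB_DMMAX - 1)
#define PAGE_SHIFT	12		/* illustrative */

int
main(void)
{
	int nswdev = 2;			/* illustrative device count */
	int64_t blkno;

	for (blkno = 0; blkno < 4 * SWB_DMMAX; blkno += SWB_DMMAX / 2) {
		int64_t off = blkno % SWB_DMMAX;  /* offset within a stripe */
		int64_t seg = blkno / SWB_DMMAX;  /* which stripe */
		int index = (nswdev > 1) ? seg % nswdev : 0; /* device */
		int64_t devseg = seg / nswdev;    /* stripe # on that device */
		int64_t byteoff = (devseg * SWB_DMMAX + off) << PAGE_SHIFT;

		printf("blk %4jd -> dev %d, byte offset %jd\n",
		       (intmax_t)blkno, index, (intmax_t)byteoff);
	}
	return 0;
}

With the stripe tied to the 64-entry radix leaf, a leaf's worth of contiguous blocks always lands on a single device, which is the property the commit message relies on to keep the per-device pstat -s accounting consistent.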
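
A condensed sketch of how the new pmap_pgscan() interface is driven, modeled on the vm_pageout_map_deactivate_pages()/vm_pageout_mdp_callback() pair added by this patch. The example_* names are hypothetical; the callback receives a busied page and must dispose of the busy state itself, and a negative return stops the scan. This is kernel-only code shown as an API sketch, not a stand-alone buildable unit.

/*
 * Hypothetical consumer of the pmap_pgscan() interface added by this
 * patch.  The callback owns the page's busy state and must release it.
 */
static int
example_pgscan_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	/*
	 * Below the target RSS?  Record the resume point and stop;
	 * pmap_pgscan() leaves info->offset alone when stopped early.
	 */
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		info->offset = va;
		vm_page_wakeup(p);	/* dispose of the busy state */
		return -1;
	}
	++info->actioncount;		/* deactivate/flush would go here */
	vm_page_wakeup(p);
	return 0;			/* keep scanning */
}

static void
example_trim_rss(vm_map_t map, vm_pindex_t limit)
{
	struct pmap_pgscan_info info;

	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = VM_MIN_USER_ADDRESS;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = example_pgscan_callback;
	info.busycount = 0;
	info.cleancount = 0;
	info.actioncount = 0;

	pmap_pgscan(&info);
	/* info.offset == end_addr if the scan ran to completion */
}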
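
The dynamic tear-down depends on each page-table level occupying a distinct band of the pv_entry pindex space, so the parent of any pv can be derived arithmetically, exactly as the reworked pmap_release_callback() does. The helper below is a hypothetical restatement of that mapping for readers following the first bullet of the commit message; NUPTE_TOTAL, NUPT_TOTAL, NUPD_TOTAL, the NP*PGSHIFT shifts, and the pmap_*_pindex() macros are the existing kernel definitions.

/*
 * Given a pv_entry pindex, return the pindex of its parent page-table
 * pv, or 0 if the pv is the PML4 itself (which has no parent).
 * Bands: [0, NUPTE_TOTAL) terminal PTEs, then PT, PD, and PDP pages,
 * and finally the single PML4 page at pmap_pml4_pindex().
 */
static vm_pindex_t
example_parent_pindex(vm_pindex_t pindex)
{
	if (pindex < pmap_pt_pindex(0)) {
		/* terminal pte -> owning PT page */
		return (pindex >> NPTEPGSHIFT) + NUPTE_TOTAL;
	} else if (pindex < pmap_pd_pindex(0)) {
		/* PT page -> owning PD page */
		return ((pindex - NUPTE_TOTAL) >> NPDEPGSHIFT) +
		       NUPTE_TOTAL + NUPT_TOTAL;
	} else if (pindex < pmap_pdp_pindex(0)) {
		/* PD page -> owning PDP page */
		return ((pindex - NUPTE_TOTAL - NUPT_TOTAL) >> NPDPEPGSHIFT) +
		       NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
	} else if (pindex < pmap_pml4_pindex()) {
		/* PDP page -> the PML4 (there is only one) */
		return pmap_pml4_pindex();
	}
	return 0;			/* PML4 itself, no parent */
}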