From e32fb2aa5ec45f533085d913b32f3b5b441047fe Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sun, 19 May 2019 12:59:49 -0700
Subject: [PATCH] kernel - VM rework part 13 - Core pmap work, stabilize &
 optimize

* Refactor the vm_page_hash hash again to get a better distribution.

* I tried to only hash shared objects, but this resulted in a number of
  edge cases where program re-use could miss the optimization.

* Add a sysctl vm.page_hash_vnode_only (default off).  If turned on,
  only vm_page's associated with vnodes will be hashed.  This should
  generally not be necessary.

* Refactor vm_page_list_find2() again to avoid all duplicate queue
  checks.  This time I mocked the algorithm up in userland and twisted
  it until it did what I wanted.

* VM_FAULT_QUICK_DEBUG was accidentally left on, turn it off.

* Do not remove the original page from the pmap when vm_fault_object()
  must do a COW.  And just in case this is ever added back in later,
  don't do it using pmap_remove_specific() !!!  Use pmap_remove_pages()
  to avoid the backing scan lock.

  vm_fault_page() will now do this removal (for procfs rwmem), the
  normal vm_fault will of course replace the page anyway, and the umtx
  code uses different recovery mechanisms now and should be ok.

* Optimize vm_map_entry_shadow() for the situation where the old object
  is no longer shared.  Get rid of an unnecessary transient kmalloc()
  and vm_object_hold_shared().
---
 sys/vm/vm_fault.c       |  49 +++++++-----------
 sys/vm/vm_map.c         |   9 ++--
 sys/vm/vm_page.c        | 129 ++++++++++++++++++++++++++++++++++--------------
 test/debug/vmpageinfo.c |   2 +-
 4 files changed, 116 insertions(+), 73 deletions(-)

diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index f7801c1ab2..73755e1b07 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -161,10 +161,17 @@ __read_mostly int vm_shared_fault = 1;
 TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
 SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW,
 	   &vm_shared_fault, 0, "Allow shared token on vm_object");
-__read_mostly static int vm_fault_quick_enable = 0;
+__read_mostly static int vm_fault_quick_enable = 1;
 TUNABLE_INT("vm.fault_quick", &vm_fault_quick_enable);
 SYSCTL_INT(_vm, OID_AUTO, fault_quick, CTLFLAG_RW,
 	   &vm_fault_quick_enable, 0, "Allow fast vm_fault shortcut");
+
+/*
+ * Define here for debugging ioctls.  Note that these are globals, so
+ * they will cause a ton of cache line bouncing.  Only use for debugging
+ * purposes.
+ */
+/*#define VM_FAULT_QUICK_DEBUG */
 #ifdef VM_FAULT_QUICK_DEBUG
 static long vm_fault_quick_success_count = 0;
 SYSCTL_LONG(_vm, OID_AUTO, fault_quick_success_count, CTLFLAG_RW,
@@ -865,14 +872,6 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
 
 	/*
 	 * Don't waste time if the object is only being used by one vm_map.
-	 *
-	 * WARNING! We can't rely on obj->ref_count here because it might
-	 *	    be part of a shared ba chain, and we can't rely on
-	 *	    ba->refs for the same reason.  The combination of it
-	 *	    being the ba embedded in the entry (aka first_ba) AND
-	 *	    ref_count == 1 would work, but OBJ_ONEMAPPING is better
-	 *	    because it will remain flagged even when ref_count > 1
-	 *	    for situations where entries are clipped.
 	 */
 	obj = fs->first_ba->object;
 	if (obj->flags & OBJ_ONEMAPPING)
@@ -2265,26 +2264,15 @@ next:
 			/*
 			 * Oh, well, lets copy it.
 			 *
-			 * Why are we unmapping the original page
-			 * here?  Well, in short, not all accessors
-			 * of user memory go through the pmap.  The
-			 * procfs code doesn't have access user memory
-			 * via a local pmap, so vm_fault_page*()
-			 * can't call pmap_enter().  And the umtx*()
-			 * code may modify the COW'd page via a DMAP
-			 * or kernel mapping and not via the pmap,
-			 * leaving the original page still mapped
-			 * read-only into the pmap.
-			 *
-			 * So we have to remove the page from at
-			 * least the current pmap if it is in it.
+			 * We used to unmap the original page here
+			 * because vm_fault_page() didn't and this
+			 * would cause havoc for the umtx*() code
+			 * and the procfs code.
 			 *
-			 * We used to just remove it from all pmaps
-			 * but that creates inefficiencies on SMP,
-			 * particularly for COW program & library
-			 * mappings that are concurrently exec'd.
-			 * Only remove the page from the current
-			 * pmap.
+			 * This is no longer necessary.  The
+			 * vm_fault_page() routine will now unmap the
+			 * page after a COW, and the umtx code will
+			 * recover on its own.
 			 */
 			/*
 			 * NOTE: Since fs->m is a backing page, it
@@ -2293,10 +2281,9 @@ next:
 			 */
 			KKASSERT(fs->first_shared == 0);
 			vm_page_copy(fs->m, fs->first_m);
-			/*vm_page_protect(fs->m, VM_PROT_NONE);*/
-			pmap_remove_specific(
+			/* pmap_remove_specific(
 			    &curthread->td_lwp->lwp_vmspace->vm_pmap,
-			    fs->m);
+			    fs->m); */
 		}
 
 		/*
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index c478b62417..aa3071c435 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -740,28 +740,29 @@ vm_map_entry_shadow(vm_map_entry_t entry)
 		length = 0x7FFFFFFF;
 	else
 		length = atop(entry->ba.end - entry->ba.start);
-	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
 
 	/*
 	 * Don't create the new object if the old object isn't shared.
+	 * This case occurs quite often when programs fork/exec/wait.
 	 *
 	 * Caller ensures source exists (all backing_ba's must have objects),
 	 * typically indirectly by virtue of the NEEDS_COPY flag being set.
+	 * We have a ref on source by virtue of the entry and do not need
+	 * to lock it to do this test.
 	 */
 	source = entry->ba.object;
 	KKASSERT(source);
-	vm_object_hold_shared(source);
 
 	if (source->type != OBJT_VNODE) {
 		if (source->ref_count == 1 &&
 		    source->handle == NULL &&
 		    (source->type == OBJT_DEFAULT ||
 		     source->type == OBJT_SWAP)) {
-			vm_object_drop(source);
-			kfree(ba, M_MAP_BACKING);
 			goto done;
 		}
 	}
+	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
+	vm_object_hold_shared(source);
 
 	/*
 	 * Once it becomes part of a backing_ba chain it can wind up anywhere,
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 583619dd14..9da2327b7a 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -106,6 +106,9 @@ struct vm_page_hash_elm {
 	int	unused01;
 };
 
+#define VM_PAGE_HASH_SET	4		/* power of 2, set-assoc */
+#define VM_PAGE_HASH_MAX	(1024 * 1024)	/* power of 2, max size */
+
 /*
  * SET - Minimum required set associative size, must be a power of 2.  We
  * want this to match or exceed the set-associativeness of the cpu,
@@ -130,8 +133,9 @@ static struct alist vm_contig_alist;
 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
 static struct spinlock vm_contig_spin =
 	SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
 
-static struct vm_page_hash_elm *vm_page_hash;
+__read_mostly static int vm_page_hash_vnode_only;
 __read_mostly static int vm_page_hash_size;
+__read_mostly static struct vm_page_hash_elm *vm_page_hash;
 static u_long vm_dma_reserved = 0;
 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
@@ -140,6 +144,14 @@ SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
 	    &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
 
+SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
+	    &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
+#if 0
+static int vm_page_hash_debug;
+SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
+	    &vm_page_hash_debug, 0, "Debug the vm_page hash table");
+#endif
+
 static int vm_contig_verbose = 0;
 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
 
@@ -706,15 +718,18 @@ vm_page_startup_finish(void *dummy __unused)
 	 * Set the set_assoc_mask based on the fitted number of CPUs.
	 * This is a mask, so we subject 1.
 	 *
-	 * w/PQ_L2_SIZE = 1024:
+	 * w/PQ_L2_SIZE = 1024, don't let the associativity drop below 8.
+	 * So if we have 256 CPUs, two hyper-threads will wind up sharing.
 	 *
-	 * Don't let the associativity drop below 8.  So if we have
-	 * 256 CPUs, two hyper-threads will wind up sharing.  The
-	 * maximum is PQ_L2_SIZE.
+	 * The maximum is PQ_L2_SIZE.  However, we limit the starting
+	 * maximum to 16 (mask = 15) in order to improve the cache locality
+	 * of related kernel data structures.
 	 */
 	mask = PQ_L2_SIZE / ncpus_fit - 1;
 	if (mask < 7)		/* minimum is 8-way w/256 CPU threads */
 		mask = 7;
+	if (mask < 15)
+		mask = 15;
 	cpu_ccfence();
 	set_assoc_mask = mask;
 
@@ -795,8 +810,8 @@ vm_page_startup_finish(void *dummy __unused)
 	vm_page_hash_size = 4096;
 	while (vm_page_hash_size < (vm_page_array_size / 16))
 		vm_page_hash_size <<= 1;
-	if (vm_page_hash_size > 1024*1024)
-		vm_page_hash_size = 1024*1024;
+	if (vm_page_hash_size > VM_PAGE_HASH_MAX)
+		vm_page_hash_size = VM_PAGE_HASH_MAX;
 
 	/*
 	 * hash table for vm_page_lookup_quick()
@@ -1500,9 +1515,12 @@ vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
 {
 	size_t hi;
 
-	hi = ((object->pg_color << 8) ^ (uintptr_t)object) + (pindex << 2);
-	hi &= vm_page_hash_size - 1;
-	hi &= ~3;
+	/* mix it up */
+	hi = (intptr_t)object ^ object->pg_color ^ pindex;
+	hi += object->pg_color * pindex;
+	hi = hi ^ (hi >> 20);
+	hi &= vm_page_hash_size - 1;		/* bounds */
+	hi &= ~(VM_PAGE_HASH_SET - 1);		/* set-assoc */
 	return (&vm_page_hash[hi]);
 }
 
@@ -1522,7 +1540,7 @@ vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
 	if (vm_page_hash == NULL)
 		return NULL;
 	mp = vm_page_hash_hash(object, pindex);
-	for (i = 0; i < 4; ++i) {
+	for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
 		m = mp[i].m;
 		cpu_ccfence();
 		if (m == NULL)
@@ -1552,28 +1570,49 @@ vm_page_hash_enter(vm_page_t m)
 	struct vm_page_hash_elm *best;
 	int i;
 
-	if (vm_page_hash &&
-	    m > &vm_page_array[0] &&
-	    m < &vm_page_array[vm_page_array_size]) {
-		mp = vm_page_hash_hash(m->object, m->pindex);
-		best = mp;
-		for (i = 0; i < 4; ++i) {
-			if (mp[i].m == m) {
-				mp[i].ticks = ticks;
-				return;
-			}
+	/*
+	 * Only enter type-stable vm_pages with well-shared objects.
+	 */
+	if (vm_page_hash == NULL ||
+	    m < &vm_page_array[0] ||
+	    m >= &vm_page_array[vm_page_array_size])
+		return;
+	if (m->object == NULL)
+		return;
+#if 0
+	/*
+	 * Disabled at the moment, there are some degenerate conditions
+	 * with often-exec'd programs that get ignored.  In particular,
+	 * the kernel's elf loader does a vn_rdwr() on the first page of
+	 * a binary.
+	 */
+	if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
+		return;
+#endif
+	if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
+		return;
 
-			/*
-			 * The best choice is the oldest entry
-			 */
-			if ((ticks - best->ticks) < (ticks - mp[i].ticks) ||
-			    (int)(ticks - mp[i].ticks) < 0) {
-				best = &mp[i];
-			}
+	/*
+	 * Locate the set for this page and enter it, replacing the oldest slot.
+	 */
+	mp = vm_page_hash_hash(m->object, m->pindex);
+	best = mp;
+	for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
+		if (mp[i].m == m) {
+			mp[i].ticks = ticks;
+			return;
+		}
+
+		/*
+		 * The best choice is the oldest entry
+		 */
+		if ((ticks - best->ticks) < (ticks - mp[i].ticks) ||
+		    (int)(ticks - mp[i].ticks) < 0) {
+			best = &mp[i];
 		}
-		best->m = m;
-		best->ticks = ticks;
 	}
+	best->m = m;
+	best->ticks = ticks;
 }
 
 /*
@@ -1902,12 +1941,17 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
 	struct vpgqueues *pq;
 	vm_page_t m = NULL;
 	int pqmask = set_assoc_mask >> 1;
-	int pqstart = 0;
 	int pqi;
-	int i;
+	int range;
+	int skip_start;
+	int skip_next;
+	int count;
 
 	index &= PQ_L2_MASK;
 	pq = &vm_page_queues[basequeue];
+	count = 0;
+	skip_start = -1;
+	skip_next = -1;
 
 	/*
 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
@@ -1922,9 +1966,16 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
 	 */
 	do {
 		pqmask = (pqmask << 1) | 1;
-		for (i = pqstart; i <= pqmask; ++i) {
-			pqi = (index & ~pqmask) | ((index + i) & pqmask);
-			if (TAILQ_FIRST(&pq[pqi].pl)) {
+
+		pqi = index;
+		range = pqmask + 1;
+
+		while (range > 0) {
+			if (pqi >= skip_start && pqi < skip_next) {
+				range -= skip_next - pqi;
+				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
+			}
+			if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
 				spin_lock(&pq[pqi].spin);
 				TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
 					if (spin_trylock(&m->spin) == 0)
@@ -1936,14 +1987,18 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
 					 * If we had to wander too far, set
 					 * *lastp to skip past empty queues.
 					 */
-					if (i >= 8)
+					if (count >= 8)
 						*lastp = pqi & PQ_L2_MASK;
 					return(m);
 				}
 				spin_unlock(&pq[pqi].spin);
 			}
+			--range;
+			++count;
+			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
 		}
-		pqstart = i;
+		skip_start = pqi & ~pqmask;
+		skip_next = (pqi | pqmask) + 1;
 	} while (pqmask != PQ_L2_MASK);
 
 	return(m);
diff --git a/test/debug/vmpageinfo.c b/test/debug/vmpageinfo.c
index 77419afe99..c7d0a042df 100644
--- a/test/debug/vmpageinfo.c
+++ b/test/debug/vmpageinfo.c
@@ -1,7 +1,7 @@
 /*
  * VMPAGEINFO.C
  *
- * cc -I/usr/src/sys vmpageinfo.c -o /usr/local/bin/vmpageinfo -lkvm
+ * cc -I/usr/src/sys vmpageinfo.c -o ~/bin/vmpageinfo -lkvm
  *
  * vmpageinfo
  *
-- 
2.11.4.GIT
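
A minimal userland sketch of the new vm_page_hash_hash() mixing and set
selection follows.  It is illustrative only and is not part of the patch:
struct obj, HASH_SIZE, HASH_SET, and hash_index() are stand-ins invented
here; only the body of hash_index() mirrors the kernel change above.

/*
 * Userland sketch (not kernel code).  Mirrors the mixing and
 * set-associative masking added to vm_page_hash_hash(): the object
 * pointer, pg_color, and pindex are mixed, high bits are folded down,
 * the result is bounded to the table size, and the low bits are
 * cleared so the index points at the start of a 4-entry set.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define HASH_SIZE	(64 * 1024)	/* stand-in for vm_page_hash_size */
#define HASH_SET	4		/* mirrors VM_PAGE_HASH_SET */

struct obj {				/* stand-in for vm_object */
	int	pg_color;
};

static size_t
hash_index(struct obj *object, uint64_t pindex)
{
	size_t hi;

	/* mix it up */
	hi = (intptr_t)object ^ object->pg_color ^ pindex;
	hi += (size_t)object->pg_color * pindex;
	hi = hi ^ (hi >> 20);
	hi &= HASH_SIZE - 1;			/* bounds */
	hi &= ~((size_t)HASH_SET - 1);		/* set-assoc */

	return hi;
}

int
main(void)
{
	struct obj o = { .pg_color = 5 };
	uint64_t pindex;

	/* adjacent pindexes should spread across different sets */
	for (pindex = 0; pindex < 8; ++pindex) {
		printf("pindex %llu -> set base %zu\n",
		       (unsigned long long)pindex, hash_index(&o, pindex));
	}
	return 0;
}

Clearing the low bits in the last step is what makes the table
set-associative: vm_page_hash_get() and vm_page_hash_enter() both start
at the returned element and scan VM_PAGE_HASH_SET consecutive entries,
so a lookup probes the same small, cache-friendly set that the enter
path filled.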
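
The reworked _vm_page_list_find2() scan can likewise be mocked up in
userland, much as the commit message describes.  The sketch below is
illustrative only: the PQ_L2_SIZE and set_assoc_mask values, scan(), and
visited[] are stand-ins, and the per-queue spinlock/trylock work is
omitted.  It reproduces just the doubling-mask walk and the
skip_start/skip_next bookkeeping to show that, with all queues empty, no
queue index is ever examined twice.

/*
 * Userland mock (not kernel code) of the reworked _vm_page_list_find2()
 * scan order: start with a small aligned block around the requested
 * index, then keep doubling the block size while skipping the sub-block
 * the previous pass already covered.
 */
#include <assert.h>
#include <stdio.h>

#define PQ_L2_SIZE	1024			/* stand-in queue count */
#define PQ_L2_MASK	(PQ_L2_SIZE - 1)

static int set_assoc_mask = 15;			/* stand-in: 16-way sets */
static int visited[PQ_L2_SIZE];

static void
scan(int index)
{
	int pqmask = set_assoc_mask >> 1;
	int pqi;
	int range;
	int skip_start = -1;
	int skip_next = -1;

	index &= PQ_L2_MASK;
	do {
		pqmask = (pqmask << 1) | 1;

		pqi = index;
		range = pqmask + 1;

		while (range > 0) {
			/* skip the block handled by the previous pass */
			if (pqi >= skip_start && pqi < skip_next) {
				range -= skip_next - pqi;
				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
			}
			if (range > 0)
				++visited[pqi];	/* kernel probes pq[pqi] here */
			--range;
			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
		}
		skip_start = pqi & ~pqmask;
		skip_next = (pqi | pqmask) + 1;
	} while (pqmask != PQ_L2_MASK);
}

int
main(void)
{
	int i;

	scan(37);				/* arbitrary starting index */
	for (i = 0; i < PQ_L2_SIZE; ++i)
		assert(visited[i] == 1);
	printf("each of the %d queues examined exactly once\n", PQ_L2_SIZE);
	return 0;
}

Recording the block just covered in skip_start/skip_next lets each
doubling pass restart from the faulting index yet only touch the half of
the enlarged block that earlier passes did not visit, which is what the
commit message means by avoiding all duplicate queue checks.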