From 31efdff0873efcd437dc814d7de1c4bf1fc1f61f Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 15 Oct 2017 10:54:59 -0700 Subject: [PATCH] kernel - Simplify umtx_sleep and umtx_wakeup support * Rip out the vm_page_action / vm_page_event() API. This code was fairly SMP unfriendly and created serious bottlenecks with large threaded user programs using mutexes. * Replace with a simpler mechanism that simply wakes up any UMTX domain tsleeps after a fork(). * Implement a 4uS spin loop in umtx_sleep() similar to what the pipe code does. --- sys/kern/kern_fork.c | 22 +++++++++ sys/kern/kern_umtx.c | 97 +++++++++++++++++++++++++++---------- sys/vm/vm_fault.c | 15 +++--- sys/vm/vm_page.c | 133 --------------------------------------------------- sys/vm/vm_page.h | 21 +------- sys/vm/vm_page2.h | 18 ------- 6 files changed, 104 insertions(+), 202 deletions(-) diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 6dd53d371d..d6d5e5a9d1 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -107,6 +107,26 @@ rb_lwp_compare(struct lwp *lp1, struct lwp *lp2) RB_GENERATE2(lwp_rb_tree, lwp, u.lwp_rbnode, rb_lwp_compare, lwpid_t, lwp_tid); /* + * When forking, memory underpinning umtx-supported mutexes may be set + * COW causing the physical address to change. We must wakeup any threads + * blocked on the physical address to allow them to re-resolve their VM. + */ +static void +wake_umtx_threads(struct proc *p1) +{ + struct lwp *lp; + struct thread *td; + + RB_FOREACH(lp, lwp_rb_tree, &p1->p_lwp_tree) { + td = lp->lwp_thread; + if (td && (td->td_flags & TDF_TSLEEPQ) && + (td->td_wdomain & PDOMAIN_MASK) == PDOMAIN_UMTX) { + wakeup_domain(td->td_wchan, PDOMAIN_UMTX); + } + } +} + +/* * fork() system call */ int @@ -312,6 +332,7 @@ fork1(struct lwp *lp1, int flags, struct proc **procp) } vm_fork(p1, 0, flags); + wake_umtx_threads(p1); /* * Close all file descriptors. @@ -634,6 +655,7 @@ fork1(struct lwp *lp1, int flags, struct proc **procp) PHOLD(p1); vm_fork(p1, p2, flags); + wake_umtx_threads(p1); /* * Create the first lwp associated with the new proc. diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index a2ce14f13a..ec3af2c62b 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -67,7 +68,14 @@ #include -static void umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action); +/* + * Improve umtx performance by polling for 4uS before going to sleep. + * This can avoid many IPIs in typical pthreads mutex situations. + */ +#ifdef _RDTSC_SUPPORTED_ +static int umtx_delay = 4000; +SYSCTL_INT(_kern, OID_AUTO, umtx_delay, CTLFLAG_RW, &umtx_delay, 0, ""); +#endif /* * If the contents of the userland-supplied pointer matches the specified @@ -101,12 +109,11 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) { struct lwbuf lwb_cache; struct lwbuf *lwb; - struct vm_page_action action; vm_page_t m; void *waddr; int offset; int timeout; - int error = EBUSY; + int error; if (uap->timeout < 0) return (EINVAL); @@ -125,12 +132,23 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) * Otherwise the physical page we sleep on my not match the page * being woken up. * + * The returned page is held, and this hold count prevents it from + * being paged out. This is important since we are sleeping on what + * is essentially a physical address (which avoids all sorts of + * collision-space issues). + * * WARNING! We can only use vm_fault_page*() for reading data. 
We * cannot use it for writing data because there is no pmap * interlock to protect against flushes/pageouts. + * + * (XXX with recent code work, this may no longer be an issue) + * + * WARNING! If the user program replaces the mapping of the underlying + * uap->ptr, the physical address may change and the umtx code + * will not be able to match wakeups with tsleeps. */ m = vm_fault_page_quick((vm_offset_t)uap->ptr, - VM_PROT_READ|VM_PROT_WRITE, &error, NULL); + VM_PROT_READ | VM_PROT_WRITE, &error, NULL); if (m == NULL) { error = EFAULT; goto done; @@ -138,38 +156,72 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) lwb = lwbuf_alloc(m, &lwb_cache); offset = (vm_offset_t)uap->ptr & PAGE_MASK; - /* - * The critical section is required to interlock the tsleep against - * a wakeup from another cpu. The lfence forces synchronization. - */ + error = EBUSY; if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) { +#ifdef _RDTSC_SUPPORTED_ + /* + * Poll a little while before sleeping, most mutexes are + * short-lived. + */ + if (umtx_delay) { + int64_t tsc_target; + int good = 0; + + tsc_target = tsc_get_target(umtx_delay); + while (tsc_test_target(tsc_target) == 0) { + cpu_lfence(); + if (*(int *)(lwbuf_kva(lwb) + offset) != uap->value) { + good = 1; + break; + } + cpu_pause(); + } + if (good) { + error = EBUSY; + goto skip; + } + } +#endif + /* + * Calculate the timeout. This will be acccurate to within ~2 ticks. + */ if ((timeout = uap->timeout) != 0) { timeout = (timeout / 1000000) * hz + ((timeout % 1000000) * hz + 999999) / 1000000; } + + /* + * Calculate the physical address of the mutex. This gives us + * good distribution between unrelated processes using the + * feature. + */ waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset); /* * Wake us up if the memory location COWs while we are sleeping. + * Use a critical section to tighten up the interlock. Also, + * tsleep_remove() requires the caller be in a critical section. */ crit_enter(); - vm_page_init_action(m, &action, umtx_sleep_page_action_cow, waddr); - vm_page_register_action(&action, VMEVENT_COW); /* * We must interlock just before sleeping. If we interlock before * registration the lock operations done by the registration can * interfere with it. + * + * We cannot leave our interlock hanging on return because this + * will interfere with umtx_wakeup() calls with limited wakeup + * counts. */ tsleep_interlock(waddr, PCATCH | PDOMAIN_UMTX); - if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value && - action.event == VMEVENT_COW) { + cpu_lfence(); + if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) { error = tsleep(waddr, PCATCH | PINTERLOCKED | PDOMAIN_UMTX, "umtxsl", timeout); } else { + tsleep_remove(curthread); error = EBUSY; } - vm_page_unregister_action(&action); crit_exit(); /* Always break out in case of signal, even if restartable */ if (error == ERESTART) @@ -177,7 +229,7 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) } else { error = EBUSY; } - +skip: lwbuf_free(lwb); vm_page_unhold(m); done: @@ -185,17 +237,6 @@ done: } /* - * If this page is being copied it may no longer represent the page - * underlying our virtual address. Wake up any umtx_sleep()'s - * that were waiting on its physical address to force them to retry. 
- */ -static void -umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action) -{ - wakeup_domain(action->data, PDOMAIN_UMTX); -} - -/* * umtx_wakeup { const int *ptr, int count } * * Wakeup the specified number of processes held in umtx_sleep() on the @@ -227,7 +268,7 @@ sys_umtx_wakeup(struct umtx_wakeup_args *uap) if ((vm_offset_t)uap->ptr & (sizeof(int) - 1)) return (EFAULT); m = vm_fault_page_quick((vm_offset_t)uap->ptr, - VM_PROT_READ, &error, NULL); + VM_PROT_READ | VM_PROT_WRITE, &error, NULL); if (m == NULL) { error = EFAULT; goto done; @@ -235,12 +276,16 @@ sys_umtx_wakeup(struct umtx_wakeup_args *uap) offset = (vm_offset_t)uap->ptr & PAGE_MASK; waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset); +#if 1 if (uap->count == 1) { wakeup_domain_one(waddr, PDOMAIN_UMTX); } else { /* XXX wakes them all up for now */ wakeup_domain(waddr, PDOMAIN_UMTX); } +#else + wakeup_domain(waddr, PDOMAIN_UMTX); +#endif vm_page_unhold(m); error = 0; done: diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 29a1bb2b3b..fb0d3d07c4 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -322,8 +322,9 @@ RetryFault: /* * Find the vm_map_entry representing the backing store and resolve * the top level object and page index. This may have the side - * effect of executing a copy-on-write on the map entry and/or - * creating a shadow object, but will not COW any actual VM pages. + * effect of executing a copy-on-write on the map entry, + * creating a shadow object, or splitting an anonymous entry for + * performance, but will not COW any actual VM pages. * * On success fs.map is left read-locked and various other fields * are initialized but not otherwise referenced or locked. @@ -774,11 +775,15 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, * * This works great for normal programs but will always return * NULL for host lookups of vkernel maps in VMM mode. + * + * NOTE: pmap_fault_page_quick() might not busy the page. If + * VM_PROT_WRITE or VM_PROT_OVERRIDE_WRITE is set in + * fault_type and pmap_fault_page_quick() returns non-NULL, + * it will safely dirty the returned vm_page_t for us. We + * cannot safely dirty it here (it might not be busy). */ fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type, busyp); if (fs.m) { - if (fault_type & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE)) - vm_page_dirty(fs.m); *errorp = 0; return(fs.m); } @@ -2075,7 +2080,6 @@ readrest: KKASSERT(fs->first_shared == 0); vm_page_copy(fs->m, fs->first_m); vm_page_protect(fs->m, VM_PROT_NONE); - vm_page_event(fs->m, VMEVENT_COW); } /* @@ -2415,7 +2419,6 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, panic("vm_fault_copy_wired: page missing"); vm_page_copy(src_m, dst_m); - vm_page_event(src_m, VMEVENT_COW); /* * Enter it in the pmap... diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 08ce9dc747..aa6df20c10 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -115,35 +115,11 @@ static vm_page_t vm_page_select_cache(u_short pg_color); static vm_page_t _vm_page_list_find2(int basequeue, int index); static void _vm_page_deactivate_locked(vm_page_t m, int athead); -MALLOC_DEFINE(M_ACTIONHASH, "acthash", "vmpage action hash"); - /* * Array of tailq lists */ __cachealign struct vpgqueues vm_page_queues[PQ_COUNT]; -LIST_HEAD(vm_page_action_list, vm_page_action); - -/* - * Action hash for user umtx support. Contention is governed by both - * tsleep/wakeup handling (kern/kern_synch.c) and action_hash[] below. 
- * Because action_hash[] represents active table locks, a modest fixed - * value well in excess of MAXCPU works here. - * - * There is also scan overhead depending on the number of threads in - * umtx*() calls, so we also size the hash table based on maxproc. - */ -struct vm_page_action_hash { - struct vm_page_action_list list; - struct lock lk; -} __cachealign; - -#define VMACTION_MINHSIZE 256 - -struct vm_page_action_hash *action_hash; -static int vmaction_hsize; -static int vmaction_hmask; - static volatile int vm_pages_waiting; static struct alist vm_contig_alist; static struct almeta vm_contig_ameta[ALIST_RECORDS_65536]; @@ -562,8 +538,6 @@ vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid) * allocations. * * We leave vm_dma_reserved bytes worth of free pages in the reserve pool. - * - * Also setup the action_hash[] table here (which is only used by userland) */ static void vm_page_startup_finish(void *dummy __unused) @@ -574,7 +548,6 @@ vm_page_startup_finish(void *dummy __unused) alist_blk_t xcount; alist_blk_t bfree; vm_page_t m; - int i; spin_lock(&vm_contig_spin); for (;;) { @@ -643,33 +616,6 @@ vm_page_startup_finish(void *dummy __unused) (intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) * (PAGE_SIZE / 1024), (intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024)); - - /* - * Scale the action_hash[] array. Primary contention occurs due - * to cpu locks, scaled to ncpus, and scan overhead may be incurred - * depending on the number of threads, which we scale to maxproc. - * - * NOTE: Action lock might recurse due to callback, so allow - * recursion. - */ - vmaction_hsize = VMACTION_MINHSIZE; - if (vmaction_hsize < ncpus * 2) - vmaction_hsize = ncpus * 2; - if (vmaction_hsize < maxproc / 16) - vmaction_hsize = maxproc / 16; - vmaction_hmask = 1; - while (vmaction_hmask < vmaction_hsize) - vmaction_hmask = (vmaction_hmask << 1) | 1; - vmaction_hsize = vmaction_hmask + 1; - - action_hash = kmalloc(sizeof(action_hash[0]) * vmaction_hsize, - M_ACTIONHASH, - M_WAITOK | M_ZERO); - - for (i = 0; i < vmaction_hsize; i++) { - LIST_INIT(&action_hash[i].list); - lockinit(&action_hash[i].lk, "actlk", 0, LK_CANRECURSE); - } } SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY, vm_page_startup_finish, NULL); @@ -3274,85 +3220,6 @@ vm_page_test_dirty(vm_page_t m) } } -/* - * Register an action, associating it with its vm_page - */ -void -vm_page_register_action(vm_page_action_t action, vm_page_event_t event) -{ - struct vm_page_action_hash *hash; - int hv; - - hv = (int)((intptr_t)action->m >> 8) & vmaction_hmask; - hash = &action_hash[hv]; - - lockmgr(&hash->lk, LK_EXCLUSIVE); - vm_page_flag_set(action->m, PG_ACTIONLIST); - action->event = event; - LIST_INSERT_HEAD(&hash->list, action, entry); - lockmgr(&hash->lk, LK_RELEASE); -} - -/* - * Unregister an action, disassociating it from its related vm_page - */ -void -vm_page_unregister_action(vm_page_action_t action) -{ - struct vm_page_action_hash *hash; - int hv; - - hv = (int)((intptr_t)action->m >> 8) & vmaction_hmask; - hash = &action_hash[hv]; - lockmgr(&hash->lk, LK_EXCLUSIVE); - if (action->event != VMEVENT_NONE) { - action->event = VMEVENT_NONE; - LIST_REMOVE(action, entry); - - if (LIST_EMPTY(&hash->list)) - vm_page_flag_clear(action->m, PG_ACTIONLIST); - } - lockmgr(&hash->lk, LK_RELEASE); -} - -/* - * Issue an event on a VM page. Corresponding action structures are - * removed from the page's list and called. - * - * If the vm_page has no more pending action events we clear its - * PG_ACTIONLIST flag. 
- */ -void -vm_page_event_internal(vm_page_t m, vm_page_event_t event) -{ - struct vm_page_action_hash *hash; - struct vm_page_action *scan; - struct vm_page_action *next; - int hv; - int all; - - hv = (int)((intptr_t)m >> 8) & vmaction_hmask; - hash = &action_hash[hv]; - all = 1; - - lockmgr(&hash->lk, LK_EXCLUSIVE); - LIST_FOREACH_MUTABLE(scan, &hash->list, entry, next) { - if (scan->m == m) { - if (scan->event == event) { - scan->event = VMEVENT_NONE; - LIST_REMOVE(scan, entry); - scan->func(m, scan); - /* XXX */ - } else { - all = 0; - } - } - } - if (all) - vm_page_flag_clear(m, PG_ACTIONLIST); - lockmgr(&hash->lk, LK_RELEASE); -} - #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 120532804d..3d06ef1e7b 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -99,19 +99,6 @@ #endif -typedef enum vm_page_event { VMEVENT_NONE, VMEVENT_COW } vm_page_event_t; - -struct vm_page_action { - LIST_ENTRY(vm_page_action) entry; - struct vm_page *m; - vm_page_event_t event; - void (*func)(struct vm_page *, - struct vm_page_action *); - void *data; -}; - -typedef struct vm_page_action *vm_page_action_t; - /* * Management of resident (logical) pages. * @@ -301,12 +288,11 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT]; #define PG_RAM 0x00002000 /* read ahead mark */ #define PG_SWAPPED 0x00004000 /* backed by swap */ #define PG_NOTMETA 0x00008000 /* do not back with swap */ -#define PG_ACTIONLIST 0x00010000 /* lookaside action list present */ +#define PG_UNUSED10000 0x00010000 #define PG_SBUSY 0x00020000 /* soft-busy also set */ #define PG_NEED_COMMIT 0x00040000 /* clean page requires commit */ -#define PG_KEEP_NEWPAGE_MASK (PG_BUSY | PG_SBUSY | \ - PG_WANTED | PG_ACTIONLIST) +#define PG_KEEP_NEWPAGE_MASK (PG_BUSY | PG_SBUSY | PG_WANTED) /* @@ -451,10 +437,7 @@ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); void vm_page_free_contig(vm_page_t m, unsigned long size); vm_page_t vm_page_free_fromq_fast(void); -void vm_page_event_internal(vm_page_t, vm_page_event_t); void vm_page_dirty(vm_page_t m); -void vm_page_register_action(vm_page_action_t action, vm_page_event_t event); -void vm_page_unregister_action(vm_page_action_t action); void vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg); void VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m, int also_m_busy, const char *wmsg VM_PAGE_DEBUG_ARGS); diff --git a/sys/vm/vm_page2.h b/sys/vm/vm_page2.h index 8dac2c4a2b..c7b1930a20 100644 --- a/sys/vm/vm_page2.h +++ b/sys/vm/vm_page2.h @@ -166,24 +166,6 @@ vm_paging_needed(void) return 0; } -static __inline -void -vm_page_event(vm_page_t m, vm_page_event_t event) -{ - if (m->flags & PG_ACTIONLIST) - vm_page_event_internal(m, event); -} - -static __inline -void -vm_page_init_action(vm_page_t m, vm_page_action_t action, - void (*func)(vm_page_t, vm_page_action_t), void *data) -{ - action->m = m; - action->func = func; - action->data = data; -} - /* * Clear dirty bits in the VM page but truncate the * end to a DEV_BSIZE'd boundary. -- 2.11.4.GIT