From cff27badea2d316c1dd7581302d824e7a49fa9f3 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 5 Dec 2016 15:07:43 -0800 Subject: [PATCH] kernel - Spiff up locks a bit * Do a little optimization of _spin_lock_contested(). The critical path is able to avoid two atomic ops in the initialization portion of the contested path. * Optimize _spin_lock_shared_contested() to use atomic_fetchadd_long() to add a shared-lock count instead of atomic_cmpset_long(). Shared spinlocks are used heavily and this will prevent a lot of unnecessary spinning when many cpus are using the same lock at the same time. * Hold fdp->fd_spin across fdp->fd_cdir and fdp->fd_ncdir modifications. This completes other work which caches fdp->fd_ncdir and avoids having to obtain the spin-lock when the cache matches. Discussed-with: Mateusz Guzik (mjg_) --- sys/kern/kern_spinlock.c | 13 ++++++++----- sys/kern/lwkt_token.c | 7 +++---- sys/kern/vfs_syscalls.c | 8 ++++++-- sys/sys/spinlock2.h | 13 ++++++++----- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/sys/kern/kern_spinlock.c b/sys/kern/kern_spinlock.c index ce158ddf0d..c086c68b86 100644 --- a/sys/kern/kern_spinlock.c +++ b/sys/kern/kern_spinlock.c @@ -188,7 +188,7 @@ spin_trylock_contested(struct spinlock *spin) * as well (no difference). */ void -_spin_lock_contested(struct spinlock *spin, const char *ident) +_spin_lock_contested(struct spinlock *spin, const char *ident, int value) { struct indefinite_info info = { 0, 0, ident }; int i; @@ -196,8 +196,10 @@ _spin_lock_contested(struct spinlock *spin, const char *ident) /* * Handle degenerate case. */ - if (atomic_cmpset_int(&spin->counta, SPINLOCK_SHARED|0, 1)) - return; + if (value == SPINLOCK_SHARED) { + if (atomic_cmpset_int(&spin->counta, SPINLOCK_SHARED|0, 1)) + return; + } /* * Transfer our count to the high bits, then loop until we can @@ -208,7 +210,8 @@ _spin_lock_contested(struct spinlock *spin, const char *ident) * understands that this may occur. */ atomic_add_int(&spin->counta, SPINLOCK_EXCLWAIT - 1); - atomic_clear_int(&spin->counta, SPINLOCK_SHARED); + if (value & SPINLOCK_SHARED) + atomic_clear_int(&spin->counta, SPINLOCK_SHARED); #ifdef DEBUG_LOCKS_LATENCY long j; @@ -262,7 +265,7 @@ _spin_lock_contested(struct spinlock *spin, const char *ident) * The caller has not modified counta. */ void -_spin_lock_shared_contested(struct spinlock *spin, const char *ident) +_spin_lock_shared_contested(struct spinlock *spin, const char *ident, int value) { struct indefinite_info info = { 0, 0, ident }; int i; diff --git a/sys/kern/lwkt_token.c b/sys/kern/lwkt_token.c index 0e6fe832b8..84cdd83f0e 100644 --- a/sys/kern/lwkt_token.c +++ b/sys/kern/lwkt_token.c @@ -331,14 +331,13 @@ _lwkt_trytokref(lwkt_tokref_t ref, thread_t td, long mode) oref = tok->t_ref; /* can be NULL */ cpu_ccfence(); if ((count & (TOK_EXCLUSIVE/*|TOK_EXCLREQ*/)) == 0) { - /* XXX EXCLREQ should work */ /* - * It is possible to get the token shared. + * It may be possible to get the token shared. 
*/ - if (atomic_cmpset_long(&tok->t_count, count, - count + TOK_INCR)) { + if ((atomic_fetchadd_long(&tok->t_count, TOK_INCR) & TOK_EXCLUSIVE) == 0) { return TRUE; } + atomic_fetchadd_long(&tok->t_count, -TOK_INCR); /* retry */ } else if ((count & TOK_EXCLUSIVE) && oref >= &td->td_toks_base && diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index bbde8f42d3..e445ad7ff1 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1643,11 +1643,13 @@ sys_fchdir(struct fchdir_args *uap) cache_dropmount(mp); } if (error == 0) { + spin_lock(&fdp->fd_spin); ovp = fdp->fd_cdir; onch = fdp->fd_ncdir; - vn_unlock(vp); /* leave ref intact */ fdp->fd_cdir = vp; fdp->fd_ncdir = nch; + spin_unlock(&fdp->fd_spin); + vn_unlock(vp); /* leave ref intact */ cache_drop(&onch); vrele(ovp); } else { @@ -1682,11 +1684,13 @@ kern_chdir(struct nlookupdata *nd) error = checkvp_chdir(vp, td); vn_unlock(vp); if (error == 0) { + spin_lock(&fdp->fd_spin); ovp = fdp->fd_cdir; onch = fdp->fd_ncdir; - cache_unlock(&nd->nl_nch); /* leave reference intact */ fdp->fd_ncdir = nd->nl_nch; fdp->fd_cdir = vp; + spin_unlock(&fdp->fd_spin); + cache_unlock(&nd->nl_nch); /* leave reference intact */ cache_drop(&onch); vrele(ovp); cache_zero(&nd->nl_nch); diff --git a/sys/sys/spinlock2.h b/sys/sys/spinlock2.h index ae1b34dca7..de6e07dad3 100644 --- a/sys/sys/spinlock2.h +++ b/sys/sys/spinlock2.h @@ -54,8 +54,9 @@ extern struct spinlock pmap_spin; int spin_trylock_contested(struct spinlock *spin); -void _spin_lock_contested(struct spinlock *spin, const char *ident); -void _spin_lock_shared_contested(struct spinlock *spin, const char *ident); +void _spin_lock_contested(struct spinlock *spin, const char *ident, int count); +void _spin_lock_shared_contested(struct spinlock *spin, const char *ident, + int count); void _spin_pool_lock(void *chan, const char *ident); void _spin_pool_unlock(void *chan); @@ -111,11 +112,13 @@ spin_held(struct spinlock *spin) static __inline void _spin_lock_quick(globaldata_t gd, struct spinlock *spin, const char *ident) { + int count; + ++gd->gd_curthread->td_critcount; cpu_ccfence(); ++gd->gd_spinlocks; - if (atomic_fetchadd_int(&spin->counta, 1) != 0) - _spin_lock_contested(spin, ident); + if ((count = atomic_fetchadd_int(&spin->counta, 1)) != 0) + _spin_lock_contested(spin, ident, count + 1); #ifdef DEBUG_LOCKS int i; for (i = 0; i < SPINLOCK_DEBUG_ARRAY_SIZE; i++) { @@ -199,7 +202,7 @@ _spin_lock_shared_quick(globaldata_t gd, struct spinlock *spin, atomic_set_int(&spin->counta, SPINLOCK_SHARED); } else if ((counta & SPINLOCK_SHARED) == 0) { atomic_add_int(&spin->counta, -1); - _spin_lock_shared_contested(spin, ident); + _spin_lock_shared_contested(spin, ident, counta); } #ifdef DEBUG_LOCKS int i; -- 2.11.4.GIT
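
Editor's note: the core technique in both the lwkt_token.c and spinlock changes is the same — optimistically bump a shared count with a fetchadd, check the returned old value for an exclusive holder, and back the count out only on conflict, instead of looping on a compare-and-swap that many concurrent readers keep invalidating. The sketch below is a minimal userland illustration of that pattern, not code from this patch: C11 <stdatomic.h> stands in for the kernel's atomic_fetchadd_long()/atomic_fetchadd_int(), and the shtoken type, TOK_* values, and function names are hypothetical.

/*
 * Minimal sketch of the fetchadd-based shared-acquire pattern, assuming
 * a counter word whose low bit marks an exclusive holder and whose upper
 * bits count shared references (names and layout are illustrative only).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOK_EXCLUSIVE	0x00000001UL	/* low bit: held exclusively */
#define TOK_INCR	0x00000100UL	/* one shared reference */

struct shtoken {
	atomic_ulong	count;
};

/*
 * Optimistic shared acquire: add a shared reference first, then check
 * whether an exclusive holder was already present.  On conflict the
 * reference is backed out and the caller retries, rather than spinning
 * on a cmpset whose expected value other readers keep changing.
 */
static bool
shtoken_try_shared(struct shtoken *tok)
{
	unsigned long ocount;

	ocount = atomic_fetch_add(&tok->count, TOK_INCR);
	if ((ocount & TOK_EXCLUSIVE) == 0)
		return true;				/* got it shared */
	atomic_fetch_sub(&tok->count, TOK_INCR);	/* undo, retry later */
	return false;
}

static void
shtoken_rel_shared(struct shtoken *tok)
{
	atomic_fetch_sub(&tok->count, TOK_INCR);
}

int
main(void)
{
	struct shtoken tok = { .count = 0 };

	if (shtoken_try_shared(&tok)) {
		printf("shared acquire ok, count=%#lx\n",
		    atomic_load(&tok.count));
		shtoken_rel_shared(&tok);
	}
	return 0;
}

The win over the cmpset loop is that concurrent readers no longer fail each other's updates: every fetchadd succeeds, and only the rare reader that collides with an exclusive holder has to undo and retry.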