From 3536c341ffda90bfdcc8310ef91231f18c81db52 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 4 Dec 2016 09:21:45 -0800 Subject: [PATCH] kernel - Overhaul namecache operations to reduce SMP contention * Overhaul the namecache code to remove a significant amount of cacheline ping-ponging from the namecache paths. This primarily affects multi-socket systems but also improves multi-core single-socket systems. Cacheline ping-ponging in the critical path can constrict a multi-core system to roughly ~1-2M operations per second running through that path. For example, even if looking up different paths or stating different files, even something as simple as a non-atomic ++global_counter seriously derates performance when it is being executed on all cores at once. In the simple non-conflicting single-component stat() case, this improves performance from ~2.5M/second to ~25M/second on a 4-socket 48-core opteron and has a similar improvement on a 2-socket 32-thread xeon, as well as significantly improves namecache perf on single-socket multi-core systems. * Remove the vfs.cache.numcalls and vfs.cache.numchecks debugging counters. These global counters caused significant cache ping-ponging and were only being used for debugging. * Implement a poor-man's referenced-structure pcpu cache for struct mount and struct namecache. This allows atomic ops on the ref-count for these structures to be avoided in certain critical path cases. For now limit to ncdir and nrdir (nrdir particularly, which is usually the same across nearly all processes in the system). Eventually we will want to expand this cache to handle more cases. Because we are holding refs persistently, add a bit of infrastructure to clear the cache as necessary (e.g. when doing an unmount, for example). * Shift the 'cachedvnodes' global to a per-cpu accumulator, then roll up the counter back to the global approximately once per second. 
The code critical paths adjust only the per-cpu accumulator, removing another global cache ping-pong from nearly all vnode and nlookup paths. * The nlookup structure now 'Borrows' the ucred reference from td->td_ucred instead of crhold()ing it, removing another global ref/unref from all nlookup paths. * We have a large hash table of spinlocks for nchash, add a little pad from 24 to 32 bytes. It's ok that two spin locks share the same cache line (it's a huge table), adding the pad cleans up cacheline-crossing cases. * Add a bit of pad to put mount->mnt_refs on its own cache-line versus prior fields which are accessed shared. But don't bother isolating it completely. --- sys/kern/vfs_cache.c | 216 ++++++++++++++++++++++++++++++++++++++---------- sys/kern/vfs_lock.c | 36 ++++++-- sys/kern/vfs_mount.c | 12 ++- sys/kern/vfs_nlookup.c | 20 +++-- sys/kern/vfs_syscalls.c | 8 ++ sys/sys/globaldata.h | 3 +- sys/sys/mount.h | 1 + sys/sys/namecache.h | 2 + sys/sys/nchstats.h | 4 +- sys/sys/nlookup.h | 3 +- sys/sys/vnode.h | 2 +- 11 files changed, 240 insertions(+), 67 deletions(-) diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 46f5680da5..0354a17318 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -128,9 +128,14 @@ MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); LIST_HEAD(nchash_list, namecache); +/* + * Don't cachealign, but at least pad to 32 bytes so entries + * don't cross a cache line. 
+ */ struct nchash_head { - struct nchash_list list; - struct spinlock spin; + struct nchash_list list; /* 16 bytes */ + struct spinlock spin; /* 8 bytes */ + long pad01; /* 8 bytes */ }; struct ncmount_cache { @@ -209,6 +214,7 @@ static long ncmount_cache_overwrite; SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW, &ncmount_cache_overwrite, 0, "mpcache entry overwrites"); +static __inline void _cache_drop(struct namecache *ncp); static int cache_resolve_mp(struct mount *mp); static struct vnode *cache_dvpref(struct namecache *ncp); static void _cache_lock(struct namecache *ncp); @@ -228,12 +234,6 @@ SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, static int numcache; SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "Number of namecaches entries"); -static u_long numcalls; -SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0, - "Number of namecache lookups"); -static u_long numchecks; -SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0, - "Number of checked entries in namecache lookups"); struct nchstats nchstats[SMP_MAXCPU]; /* @@ -265,6 +265,100 @@ SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD, static struct namecache *cache_zap(struct namecache *ncp, int nonblock); /* + * Cache mount points and namecache records in order to avoid unnecessary + * atomic ops on mnt_refs and ncp->refs. This improves concurrent SMP + * performance and is particularly important on multi-socket systems to + * reduce cache-line ping-ponging. + * + * Try to keep the pcpu structure within one cache line (~64 bytes). 
+ */ +#define MNTCACHE_COUNT 5 + +struct mntcache { + struct mount *mntary[MNTCACHE_COUNT]; + struct namecache *ncp1; + struct namecache *ncp2; + int iter; + int unused01; +} __cachealign; + +static struct mntcache pcpu_mntcache[MAXCPU]; + +static +void +_cache_mntref(struct mount *mp) +{ + struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; + int i; + + for (i = 0; i < MNTCACHE_COUNT; ++i) { + if (cache->mntary[i] != mp) + continue; + if (atomic_cmpset_ptr((void *)&cache->mntary[i], mp, NULL)) + return; + } + atomic_add_int(&mp->mnt_refs, 1); +} + +static +void +_cache_mntrel(struct mount *mp) +{ + struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; + int i; + + for (i = 0; i < MNTCACHE_COUNT; ++i) { + if (cache->mntary[i] == NULL) { + mp = atomic_swap_ptr((void *)&cache->mntary[i], mp); + if (mp == NULL) + return; + } + } + i = (int)((uint32_t)++cache->iter % (uint32_t)MNTCACHE_COUNT); + mp = atomic_swap_ptr((void *)&cache->mntary[i], mp); + if (mp) + atomic_add_int(&mp->mnt_refs, -1); +} + +/* + * Clears all cached mount points on all cpus. This routine should only + * be called when we are waiting for a mount to clear, e.g. so we can + * unmount. + */ +void +cache_clearmntcache(void) +{ + int n; + + for (n = 0; n < ncpus; ++n) { + struct mntcache *cache = &pcpu_mntcache[n]; + struct namecache *ncp; + struct mount *mp; + int i; + + for (i = 0; i < MNTCACHE_COUNT; ++i) { + if (cache->mntary[i]) { + mp = atomic_swap_ptr( + (void *)&cache->mntary[i], NULL); + if (mp) + atomic_add_int(&mp->mnt_refs, -1); + } + } + if (cache->ncp1) { + ncp = atomic_swap_ptr((void *)&cache->ncp1, NULL); + if (ncp) + _cache_drop(ncp); + } + if (cache->ncp2) { + ncp = atomic_swap_ptr((void *)&cache->ncp2, NULL); + if (ncp) + _cache_drop(ncp); + } + } +} + + +/* * Namespace locking. The caller must already hold a reference to the * namecache structure in order to lock/unlock it. 
This function prevents * the namespace from being created or destroyed by accessors other then @@ -903,52 +997,91 @@ cache_zero(struct nchandle *nch) * * WARNING: Caller may hold an unrelated read spinlock, which means we can't * use read spinlocks here. - * - * MPSAFE if nch is */ struct nchandle * cache_hold(struct nchandle *nch) { _cache_hold(nch->ncp); - atomic_add_int(&nch->mount->mnt_refs, 1); + _cache_mntref(nch->mount); return(nch); } /* * Create a copy of a namecache handle for an already-referenced * entry. - * - * MPSAFE if nch is */ void cache_copy(struct nchandle *nch, struct nchandle *target) { + struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; + struct namecache *ncp; + *target = *nch; - if (target->ncp) - _cache_hold(target->ncp); - atomic_add_int(&nch->mount->mnt_refs, 1); + _cache_mntref(target->mount); + ncp = target->ncp; + if (ncp) { + if (ncp == cache->ncp1) { + if (atomic_cmpset_ptr((void *)&cache->ncp1, ncp, NULL)) + return; + } + if (ncp == cache->ncp2) { + if (atomic_cmpset_ptr((void *)&cache->ncp2, ncp, NULL)) + return; + } + _cache_hold(ncp); + } } -/* - * MPSAFE if nch is - */ void cache_changemount(struct nchandle *nch, struct mount *mp) { - atomic_add_int(&nch->mount->mnt_refs, -1); + _cache_mntref(mp); + _cache_mntrel(nch->mount); nch->mount = mp; - atomic_add_int(&nch->mount->mnt_refs, 1); } void cache_drop(struct nchandle *nch) { - atomic_add_int(&nch->mount->mnt_refs, -1); + _cache_mntrel(nch->mount); _cache_drop(nch->ncp); nch->ncp = NULL; nch->mount = NULL; } +/* + * Drop the nchandle, but try to cache the ref to avoid global atomic + * ops. This is typically done on the system root and jail root nchandles. 
+ */ +void +cache_drop_and_cache(struct nchandle *nch) +{ + struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; + struct namecache *ncp; + + _cache_mntrel(nch->mount); + ncp = nch->ncp; + if (cache->ncp1 == NULL) { + ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp); + if (ncp == NULL) + goto done; + } + if (cache->ncp2 == NULL) { + ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp); + if (ncp == NULL) + goto done; + } + if (++cache->iter & 1) + ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp); + else + ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp); + if (ncp) + _cache_drop(ncp); +done: + nch->ncp = NULL; + nch->mount = NULL; +} + int cache_lockstatus(struct nchandle *nch) { @@ -1171,7 +1304,7 @@ cache_get(struct nchandle *nch, struct nchandle *target) KKASSERT(nch->ncp->nc_refs > 0); target->mount = nch->mount; target->ncp = _cache_get(nch->ncp); - atomic_add_int(&target->mount->mnt_refs, 1); + _cache_mntref(target->mount); } void @@ -1180,7 +1313,7 @@ cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) KKASSERT(nch->ncp->nc_refs > 0); target->mount = nch->mount; target->ncp = _cache_get_maybe_shared(nch->ncp, excl); - atomic_add_int(&target->mount->mnt_refs, 1); + _cache_mntref(target->mount); } /* @@ -1200,7 +1333,7 @@ _cache_put(struct namecache *ncp) void cache_put(struct nchandle *nch) { - atomic_add_int(&nch->mount->mnt_refs, -1); + _cache_mntrel(nch->mount); _cache_put(nch->ncp); nch->ncp = NULL; nch->mount = NULL; @@ -2755,7 +2888,6 @@ cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) globaldata_t gd; int par_locked; - numcalls++; gd = mycpu; mp = par_nch->mount; par_locked = 0; @@ -2780,8 +2912,6 @@ restart: spin_lock_shared(&nchpp->spin); LIST_FOREACH(ncp, &nchpp->list, nc_hash) { - numchecks++; - /* * Break out if we find a matching entry. 
Note that * UNRESOLVED entries may match, but DESTROYED entries @@ -2881,7 +3011,8 @@ found: ++gd->gd_nchstats->ncs_neghits; nch.mount = mp; nch.ncp = ncp; - atomic_add_int(&nch.mount->mnt_refs, 1); + _cache_mntref(nch.mount); + return(nch); } @@ -2906,7 +3037,6 @@ cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, if (ncp_shared_lock_disable || excl) return(EWOULDBLOCK); - numcalls++; gd = mycpu; mp = par_nch->mount; @@ -2926,8 +3056,6 @@ cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, spin_lock_shared(&nchpp->spin); LIST_FOREACH(ncp, &nchpp->list, nc_hash) { - numchecks++; - /* * Break out if we find a matching entry. Note that * UNRESOLVED entries may match, but DESTROYED entries @@ -2973,7 +3101,7 @@ found: res_nch->mount = mp; res_nch->ncp = ncp; ++gd->gd_nchstats->ncs_goodhits; - atomic_add_int(&res_nch->mount->mnt_refs, 1); + _cache_mntref(res_nch->mount); KKASSERT(ncp->nc_error != EWOULDBLOCK); return(ncp->nc_error); @@ -2996,7 +3124,6 @@ cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) globaldata_t gd; int par_locked; - numcalls++; gd = mycpu; mp = par_nch->mount; par_locked = 0; @@ -3011,8 +3138,6 @@ cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) restart: spin_lock(&nchpp->spin); LIST_FOREACH(ncp, &nchpp->list, nc_hash) { - numchecks++; - /* * Break out if we find a matching entry. 
Note that * UNRESOLVED entries may match, but DESTROYED entries @@ -3107,7 +3232,8 @@ found: ++gd->gd_nchstats->ncs_neghits; nch.mount = mp; nch.ncp = ncp; - atomic_add_int(&nch.mount->mnt_refs, 1); + _cache_mntref(nch.mount); + return(nch); failed: if (new_ncp) { @@ -3163,7 +3289,7 @@ cache_findmount_callback(struct mount *mp, void *data) mp->mnt_ncmounton.ncp == info->nch_ncp ) { info->result = mp; - atomic_add_int(&mp->mnt_refs, 1); + _cache_mntref(mp); return(-1); } return(0); @@ -3193,7 +3319,7 @@ cache_findmount(struct nchandle *nch) /* * Cache hit (positive) */ - atomic_add_int(&mp->mnt_refs, 1); + _cache_mntref(mp); spin_unlock_shared(&ncc->spin); ++ncmount_cache_hit; return(mp); @@ -3241,7 +3367,7 @@ skip: spin_lock(&ncc->spin); if (info.result == NULL) { if (ncc->isneg == 0 && ncc->mp) - atomic_add_int(&ncc->mp->mnt_refs, -1); + _cache_mntrel(ncc->mp); ncc->ncp = nch->ncp; ncc->mp = nch->mount; ncc->isneg = 1; @@ -3249,8 +3375,8 @@ skip: ++ncmount_cache_overwrite; } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) { if (ncc->isneg == 0 && ncc->mp) - atomic_add_int(&ncc->mp->mnt_refs, -1); - atomic_add_int(&info.result->mnt_refs, 1); + _cache_mntrel(ncc->mp); + _cache_mntref(info.result); ncc->ncp = nch->ncp; ncc->mp = info.result; ncc->isneg = 0; @@ -3267,7 +3393,7 @@ skip: void cache_dropmount(struct mount *mp) { - atomic_add_int(&mp->mnt_refs, -1); + _cache_mntrel(mp); } void @@ -3301,7 +3427,7 @@ cache_unmounting(struct mount *mp) spin_lock(&ncc->spin); if (ncc->isneg == 0 && ncc->ncp == nch->ncp && ncc->mp == mp) { - atomic_add_int(&mp->mnt_refs, -1); + _cache_mntrel(mp); ncc->ncp = NULL; ncc->mp = NULL; } @@ -3725,7 +3851,7 @@ cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) { nch->ncp = cache_alloc(0); nch->mount = mp; - atomic_add_int(&mp->mnt_refs, 1); + _cache_mntref(mp); if (vp) _cache_setvp(nch->mount, nch->ncp, vp); } diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c index 0ae048d4bf..10df38069d 100644 
--- a/sys/kern/vfs_lock.c +++ b/sys/kern/vfs_lock.c @@ -280,6 +280,28 @@ vref(struct vnode *vp) } /* + * Count number of cached vnodes. This is middling expensive so be + * careful not to make this call in the critical path, particularly + * not updating the global. Each cpu tracks its own accumulator. + * The individual accumulators are not accurate and must be summed + * together. + */ +int +countcachedvnodes(int gupdate) +{ + int i; + int n = 0; + + for (i = 0; i < ncpus; ++i) { + globaldata_t gd = globaldata_find(i); + n += gd->gd_cachedvnodes; + } + if (gupdate) + cachedvnodes = n; + return n; +} + +/* * Release a ref on an active or inactive vnode. * * Caller has no other requirements. @@ -331,7 +353,7 @@ vrele(struct vnode *vp) vx_unlock(vp); } else { if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) { - atomic_add_int(&cachedvnodes, 1); + atomic_add_int(&mycpu->gd_cachedvnodes, 1); break; } } @@ -469,7 +491,7 @@ vget(struct vnode *vp, int flags) * not protect our access to the refcnt or other fields. */ if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0) - atomic_add_int(&cachedvnodes, -1); + atomic_add_int(&mycpu->gd_cachedvnodes, -1); if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) { /* @@ -593,7 +615,7 @@ void vx_get(struct vnode *vp) { if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0) - atomic_add_int(&cachedvnodes, -1); + atomic_add_int(&mycpu->gd_cachedvnodes, -1); lockmgr(&vp->v_lock, LK_EXCLUSIVE); } @@ -607,7 +629,7 @@ vx_get_nonblock(struct vnode *vp) error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT); if (error == 0) { if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0) - atomic_add_int(&cachedvnodes, -1); + atomic_add_int(&mycpu->gd_cachedvnodes, -1); } return(error); } @@ -649,7 +671,7 @@ cleanfreevnode(int maxcount) /* * Try to deactivate some vnodes cached on the active list. 
*/ - if (cachedvnodes < inactivevnodes) + if (countcachedvnodes(0) < inactivevnodes) goto skip; for (count = 0; count < maxcount * 2; count++) { @@ -698,7 +720,7 @@ cleanfreevnode(int maxcount) * Try to deactivate the vnode. */ if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0) - atomic_add_int(&cachedvnodes, -1); + atomic_add_int(&mycpu->gd_cachedvnodes, -1); atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); spin_unlock(&vfs_spin); @@ -919,7 +941,7 @@ void allocvnode_gc(void) { if (numvnodes >= maxvnodes && - cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) { + countcachedvnodes(0) + inactivevnodes >= maxvnodes * 5 / 10) { freesomevnodes(batchfreevnodes); } } diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index a5d2b2bbf7..0bae734fb9 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -459,6 +459,8 @@ vnlru_proc(void) SHUTDOWN_PRI_FIRST); for (;;) { + int ncached; + kproc_suspend_loop(); /* @@ -468,12 +470,13 @@ vnlru_proc(void) * * (long) -> deal with 64 bit machines, intermediate overflow */ + ncached = countcachedvnodes(1); if (numvnodes >= maxvnodes * 9 / 10 && - cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) { + ncached + inactivevnodes >= maxvnodes * 5 / 10) { int count = numvnodes - maxvnodes * 9 / 10; - if (count > (cachedvnodes + inactivevnodes) / 100) - count = (cachedvnodes + inactivevnodes) / 100; + if (count > (ncached + inactivevnodes) / 100) + count = (ncached + inactivevnodes) / 100; if (count < 5) count = 5; freesomevnodes(count); @@ -490,8 +493,9 @@ vnlru_proc(void) * Nothing to do if most of our vnodes are already on * the free list. 
*/ + ncached = countcachedvnodes(1); if (numvnodes <= maxvnodes * 9 / 10 || - cachedvnodes + inactivevnodes <= maxvnodes * 5 / 10) { + ncached + inactivevnodes <= maxvnodes * 5 / 10) { tsleep(vnlruthread, 0, "vlruwt", hz); continue; } diff --git a/sys/kern/vfs_nlookup.c b/sys/kern/vfs_nlookup.c index 837ee26a7c..daf07a713d 100644 --- a/sys/kern/vfs_nlookup.c +++ b/sys/kern/vfs_nlookup.c @@ -117,12 +117,14 @@ nlookup_init(struct nlookupdata *nd, cache_copy(&p->p_fd->fd_nrdir, &nd->nl_rootnch); if (p->p_fd->fd_njdir.ncp) cache_copy(&p->p_fd->fd_njdir, &nd->nl_jailnch); - nd->nl_cred = crhold(p->p_ucred); + nd->nl_cred = td->td_ucred; + nd->nl_flags |= NLC_BORROWCRED; } else { cache_copy(&rootnch, &nd->nl_nch); cache_copy(&nd->nl_nch, &nd->nl_rootnch); cache_copy(&nd->nl_nch, &nd->nl_jailnch); - nd->nl_cred = crhold(proc0.p_ucred); + nd->nl_cred = proc0.p_ucred; + nd->nl_flags |= NLC_BORROWCRED; } nd->nl_td = td; nd->nl_flags |= flags; @@ -271,6 +273,7 @@ nlookup_init_root(struct nlookupdata *nd, return(error); } +#if 0 /* * Set a different credential; this credential will be used by future * operations performed on nd.nl_open_vp and nlookupdata structure. @@ -282,10 +285,13 @@ nlookup_set_cred(struct nlookupdata *nd, struct ucred *cred) if (nd->nl_cred != cred) { cred = crhold(cred); - crfree(nd->nl_cred); + if ((nd->nl_flags & NLC_BORROWCRED) == 0) + crfree(nd->nl_cred); + nd->nl_flags &= ~NLC_BORROWCRED; nd->nl_cred = cred; } } +#endif /* * Cleanup a nlookupdata structure after we are through with it. 
This may @@ -305,16 +311,18 @@ nlookup_done(struct nlookupdata *nd) cache_drop(&nd->nl_nch); /* NULL's out the nch */ } if (nd->nl_rootnch.ncp) - cache_drop(&nd->nl_rootnch); + cache_drop_and_cache(&nd->nl_rootnch); if (nd->nl_jailnch.ncp) - cache_drop(&nd->nl_jailnch); + cache_drop_and_cache(&nd->nl_jailnch); if ((nd->nl_flags & NLC_HASBUF) && nd->nl_path) { objcache_put(namei_oc, nd->nl_path); nd->nl_path = NULL; } if (nd->nl_cred) { - crfree(nd->nl_cred); + if ((nd->nl_flags & NLC_BORROWCRED) == 0) + crfree(nd->nl_cred); nd->nl_cred = NULL; + nd->nl_flags &= ~NLC_BORROWCRED; } if (nd->nl_open_vp) { if (nd->nl_flags & NLC_LOCKVP) { diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 3885f9add8..5dba7909b6 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -713,11 +713,13 @@ dounmount(struct mount *mp, int flags) cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN); cache_unlock(&mp->mnt_ncmountpt); + cache_clearmntcache(); if ((ncp = mp->mnt_ncmountpt.ncp) != NULL && (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) { allproc_scan(&unmount_allproc_cb, mp); } + cache_clearmntcache(); if ((ncp = mp->mnt_ncmountpt.ncp) != NULL && (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) { @@ -761,9 +763,12 @@ dounmount(struct mount *mp, int flags) * Scans can get temporary refs on a mountpoint (thought really * heavy duty stuff like cache_findmount() do not). */ + if (mp->mnt_refs != 1) + cache_clearmntcache(); for (retry = 0; retry < 10 && mp->mnt_refs != 1; ++retry) { cache_unmounting(mp); tsleep(&mp->mnt_refs, 0, "mntbsy", hz / 10 + 1); + cache_clearmntcache(); } if (mp->mnt_refs != 1) { if ((flags & MNT_FORCE) == 0) { @@ -860,10 +865,13 @@ dounmount(struct mount *mp, int flags) * to busy the mount after we decided to do the unmount. 
*/ if (freeok) { + if (mp->mnt_refs > 1) + cache_clearmntcache(); while (mp->mnt_refs > 1) { cache_unmounting(mp); wakeup(mp); tsleep(&mp->mnt_refs, 0, "umntrwait", hz / 10 + 1); + cache_clearmntcache(); } lwkt_reltoken(&mp->mnt_token); mount_drop(mp); diff --git a/sys/sys/globaldata.h b/sys/sys/globaldata.h index 091cbbb730..ce6ff18f8a 100644 --- a/sys/sys/globaldata.h +++ b/sys/sys/globaldata.h @@ -168,7 +168,8 @@ struct globaldata { int gd_timer_running; u_int gd_idle_repeat; /* repeated switches to idle */ int gd_quick_color; /* page-coloring helper */ - int gd_ireserved[6]; + int gd_cachedvnodes; /* accum across all cpus */ + int gd_ireserved[5]; const char *gd_infomsg; /* debugging */ struct lwkt_tokref gd_handoff; /* hand-off tokref */ void *gd_delayed_wakeup[2]; diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 0ded258491..8072c7f61a 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -240,6 +240,7 @@ struct mount { struct vop_ops *mnt_vn_fifo_ops; /* for use by the VFS */ struct nchandle mnt_ncmountpt; /* mount point */ struct nchandle mnt_ncmounton; /* mounted on */ + char mnt_pad[24]; /* (try to cache-align refs) */ int mnt_refs; /* nchandle references */ int mnt_hold; /* prevent kfree */ struct lwkt_token mnt_token; /* token lock if not MPSAFE */ diff --git a/sys/sys/namecache.h b/sys/sys/namecache.h index 9c4601d3c7..0890370bdc 100644 --- a/sys/sys/namecache.h +++ b/sys/sys/namecache.h @@ -176,6 +176,7 @@ struct componentname; struct nlcomponent; struct mount; +void cache_clearmntcache(void); void cache_lock(struct nchandle *nch); void cache_lock_maybe_shared(struct nchandle *nch, int excl); void cache_relock(struct nchandle *nch1, struct ucred *cred1, @@ -216,6 +217,7 @@ void cache_copy(struct nchandle *nch, struct nchandle *target); void cache_changemount(struct nchandle *nch, struct mount *mp); void cache_put(struct nchandle *nch); void cache_drop(struct nchandle *nch); +void cache_drop_and_cache(struct nchandle *nch); void cache_zero(struct 
nchandle *nch); void cache_rename(struct nchandle *fnch, struct nchandle *tnch); void cache_unlink(struct nchandle *nch); diff --git a/sys/sys/nchstats.h b/sys/sys/nchstats.h index d77d34aa50..36696ed2d1 100644 --- a/sys/sys/nchstats.h +++ b/sys/sys/nchstats.h @@ -39,6 +39,8 @@ /* * Statistics on the usefulness of namei caches. * (per-cpu) + * + * Allocated in an array so make sure this is cache-aligned. */ struct nchstats { unsigned long ncs_goodhits; /* hits that we can really use */ @@ -49,6 +51,6 @@ struct nchstats { unsigned long ncs_longhits; /* path lookup hits */ unsigned long ncs_longmiss; /* path lookup misses */ unsigned long ncs_unused; /* number of times we attempt it */ -}; +} __cachealign; #endif /* _SYS_NCHSTATS_H_ */ diff --git a/sys/sys/nlookup.h b/sys/sys/nlookup.h index e503d246f2..673e69855d 100644 --- a/sys/sys/nlookup.h +++ b/sys/sys/nlookup.h @@ -130,7 +130,7 @@ struct nlookupdata { #define NLC_EXEC 0x01000000 /* require execute access */ #define NLC_EXCL 0x02000000 /* open check: exclusive */ #define NLC_OWN 0x04000000 /* open check: owner override */ -#define NLC_UNUSED08000000 0x08000000 +#define NLC_BORROWCRED 0x08000000 /* cred ref borrowed */ #define NLC_STICKY 0x10000000 /* indicate sticky case */ #define NLC_APPENDONLY 0x20000000 /* indicate append-only */ #define NLC_IMMUTABLE 0x40000000 /* indicate immutable set */ @@ -151,7 +151,6 @@ int nlookup_init_at(struct nlookupdata *, struct file **, int, const char *, enum uio_seg, int); int nlookup_init_raw(struct nlookupdata *, const char *, enum uio_seg, int, struct ucred *, struct nchandle *); int nlookup_init_root(struct nlookupdata *, const char *, enum uio_seg, int, struct ucred *, struct nchandle *, struct nchandle *); -void nlookup_set_cred(struct nlookupdata *nd, struct ucred *cred); void nlookup_zero(struct nlookupdata *); void nlookup_done(struct nlookupdata *); void nlookup_done_at(struct nlookupdata *, struct file *); diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 
8696da91ad..4fddd2132f 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -356,7 +356,6 @@ extern struct vattr va_null; /* predefined null vattr structure */ extern int numvnodes; extern int inactivevnodes; extern int activevnodes; -extern int cachedvnodes; /* * This macro is very helpful in defining those offsets in the vdesc struct. @@ -565,6 +564,7 @@ void vfs_subr_init(void); void vfs_mount_init(void); void vfs_lock_init(void); void mount_init(struct mount *mp); +int countcachedvnodes(int gupdate); void vn_syncer_add(struct vnode *, int); void vn_syncer_remove(struct vnode *, int); -- 2.11.4.GIT