From 04bd6171a2376758dca089a0ac0148432e60ebfc Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Mon, 22 Feb 2010 07:40:05 -0800
Subject: [PATCH] kernel - Refactor vnode_free_list, vnode reuse algorithm

* Rip out most of the VAGEx stuff. It might come back in another form
  later.

* Split the vnode_free_list into three parts, separated by two markers
  (vnode_free_mid1 and vnode_free_mid2).

* Insert vnodes on the free list as follows. New vnodes are allocated
  from the head of the list.

    at the HEAD           - if the vnode is VRECLAIMED (i.e. dead)
    end of first section  - if the vnode has no cached VM or SWAP data
    end of second section - if the vnode has cached SWAP data and no
                            cached VM data
    at the TAIL           - if the vnode has cached VM data

* Implement a rover that slowly scans vnodes in the list during
  allocation and shifts them to the appropriate section. This fixes a
  degenerate condition in the placement of the markers.

* A vnode is removed and usually immediately reinserted whenever it is
  accessed by userland but not held open, giving us an LRU-like
  algorithm within each section of the list but non-LRU-like transits
  between sections of the list.

  Transits between sections are determined more by how the VM system
  recycles the related VM cache pages. Cached SWAP data only occurs if
  the swapcache is turned on.

* Future: Might use VAGE to implement a second go-around in the queue or
  a burst re-placement in the queue when the data set is found to be too
  big to fit.
---
 sys/kern/vfs_lock.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 90 insertions(+), 7 deletions(-)

diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c
index 99f8419543..242d7e3f6d 100644
--- a/sys/kern/vfs_lock.c
+++ b/sys/kern/vfs_lock.c
@@ -86,8 +86,11 @@ static struct sysref_class vnode_sysref_class = {
  * at the tail.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
-static struct vnode	vnode_free_mid;
+static struct vnode	vnode_free_mid1;
+static struct vnode	vnode_free_mid2;
+static struct vnode	vnode_free_rover;
 static struct spinlock vfs_spin = SPINLOCK_INITIALIZER(vfs_spin);
+static enum { ROVER_MID1, ROVER_MID2 } rover_state = ROVER_MID2;
 
 int freevnodes = 0;
 SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
@@ -108,7 +111,9 @@ void
 vfs_lock_init(void)
 {
 	TAILQ_INIT(&vnode_free_list);
-	TAILQ_INSERT_HEAD(&vnode_free_list, &vnode_free_mid, v_freelist);
+	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_mid1, v_freelist);
+	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_mid2, v_freelist);
+	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_rover, v_freelist);
 	spin_init(&vfs_spin);
 	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
 }
@@ -192,12 +197,21 @@ __vfree(struct vnode *vp)
 #endif
 	spin_lock_wr(&vfs_spin);
 	KKASSERT((vp->v_flag & VFREE) == 0);
-	if (vp->v_flag & VRECLAIMED)
+
+	/*
+	 * Distinguish between basically dead vnodes, vnodes with cached
+	 * data, and vnodes without cached data.  A rover will shift the
+	 * vnodes around as their cache status is lost.
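+	 *
+	 * Resulting layout:
+	 *
+	 *	HEAD		- VRECLAIMED (dead) vnodes
+	 *	before mid1	- no cached VM or SWAP data
+	 *	before mid2	- cached SWAP data, no cached VM data
+	 *	TAIL		- cached VM data (resident pages)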
+	 */
+	if (vp->v_flag & VRECLAIMED) {
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
-	else if (vp->v_flag & (VAGE0 | VAGE1))
-		TAILQ_INSERT_BEFORE(&vnode_free_mid, vp, v_freelist);
-	else
+	} else if (vp->v_object && vp->v_object->resident_page_count) {
 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+	} else if (vp->v_object && vp->v_object->swblock_count) {
+		TAILQ_INSERT_BEFORE(&vnode_free_mid2, vp, v_freelist);
+	} else {
+		TAILQ_INSERT_BEFORE(&vnode_free_mid1, vp, v_freelist);
+	}
 	freevnodes++;
 	_vsetflags(vp, VFREE);
 	spin_unlock_wr(&vfs_spin);
@@ -616,6 +630,69 @@ vx_put(struct vnode *vp)
 }
 
 /*
+ * The rover looks for vnodes past the midline with no cached data and
+ * moves them to before the midline.  If we do not do this the midline
+ * can wind up in a degenerate state.
+ */
+static
+void
+vnode_rover_locked(void)
+{
+	struct vnode *vp;
+
+	/*
+	 * Get the vnode after the rover.  The rover roves between mid1
+	 * and the end so the only special vnode it can encounter is mid2.
+	 */
+	vp = TAILQ_NEXT(&vnode_free_rover, v_freelist);
+	if (vp == &vnode_free_mid2) {
+		vp = TAILQ_NEXT(vp, v_freelist);
+		rover_state = ROVER_MID2;
+	}
+	KKASSERT(vp != &vnode_free_mid1);
+
+	/*
+	 * Start over if we finished the scan.
+	 */
+	TAILQ_REMOVE(&vnode_free_list, &vnode_free_rover, v_freelist);
+	if (vp == NULL) {
+		TAILQ_INSERT_AFTER(&vnode_free_list, &vnode_free_mid1,
+				   &vnode_free_rover, v_freelist);
+		rover_state = ROVER_MID1;
+		return;
+	}
+	TAILQ_INSERT_AFTER(&vnode_free_list, vp, &vnode_free_rover, v_freelist);
+
+	/*
+	 * Shift vp if appropriate.
+	 */
+	if (vp->v_object && vp->v_object->resident_page_count) {
+		/*
+		 * Promote vnode with resident pages to section 3.
+		 * (This case shouldn't happen).
+		 */
+		if (rover_state == ROVER_MID1) {
+			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+		}
+	} else if (vp->v_object && vp->v_object->swblock_count) {
+		/*
+		 * Demote vnode with only swap pages to section 2.
+		 */
+		if (rover_state == ROVER_MID2) {
+			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+			TAILQ_INSERT_BEFORE(&vnode_free_mid2, vp, v_freelist);
+		}
+	} else {
+		/*
+		 * Demote vnode with no cached data to section 1.
+		 */
+		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+		TAILQ_INSERT_BEFORE(&vnode_free_mid1, vp, v_freelist);
+	}
+}
+
+/*
  * Try to reuse a vnode from the free list.
  *
  * NOTE: The returned vnode is not completely initialized.
@@ -643,9 +720,15 @@ allocfreevnode(void)
 	 * vhold here.
 	 */
 	spin_lock_wr(&vfs_spin);
+	vnode_rover_locked();
+	vnode_rover_locked();
 	vp = TAILQ_FIRST(&vnode_free_list);
-	if (vp == &vnode_free_mid)
+	while (vp == &vnode_free_mid1 || vp == &vnode_free_mid2 ||
+	       vp == &vnode_free_rover) {
 		vp = TAILQ_NEXT(vp, v_freelist);
+	}
+	if (vp == NULL)
+		break;
 	if (vx_lock_nonblock(vp)) {
 		KKASSERT(vp->v_flag & VFREE);
 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
-- 
2.11.4.GIT
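
The patch depends on threading dummy "marker" vnodes directly onto the free
list so that TAILQ_INSERT_BEFORE() can address a section boundary in O(1)
and the rover can resume its scan where it left off. The following is a
minimal, self-contained userspace sketch of the three-section placement
policy only; struct demo_node, demo_free(), and the main() driver are
hypothetical stand-ins for struct vnode, __vfree(), and the kernel's
locking and rover machinery, which the sketch omits.

/*
 * Userspace sketch of the three-section free list with marker nodes.
 * demo_node, demo_free() and main() are illustrative assumptions; the
 * kernel version operates on struct vnode under vfs_spin and also runs
 * the rover during allocation.
 */
#include <stdio.h>
#include <sys/queue.h>

struct demo_node {
	TAILQ_ENTRY(demo_node) link;
	int reclaimed;		/* stands in for VRECLAIMED */
	int resident_pages;	/* stands in for v_object->resident_page_count */
	int swap_blocks;	/* stands in for v_object->swblock_count */
	const char *name;
};

static TAILQ_HEAD(demo_list, demo_node) freelist =
	TAILQ_HEAD_INITIALIZER(freelist);

/* Marker nodes threaded into the list as O(1) section boundaries. */
static struct demo_node mid1 = { .name = "mid1" };
static struct demo_node mid2 = { .name = "mid2" };

/*
 * Mirror of __vfree()'s placement policy: dead nodes at the HEAD, nodes
 * with resident VM pages at the TAIL, swap-only nodes at the end of
 * section 2, everything else at the end of section 1.
 */
static void
demo_free(struct demo_node *dn)
{
	if (dn->reclaimed)
		TAILQ_INSERT_HEAD(&freelist, dn, link);
	else if (dn->resident_pages)
		TAILQ_INSERT_TAIL(&freelist, dn, link);
	else if (dn->swap_blocks)
		TAILQ_INSERT_BEFORE(&mid2, dn, link);
	else
		TAILQ_INSERT_BEFORE(&mid1, dn, link);
}

int
main(void)
{
	struct demo_node nodes[] = {
		{ .name = "no-cache" },
		{ .name = "swap-only", .swap_blocks = 4 },
		{ .name = "vm-cached", .resident_pages = 10 },
		{ .name = "dead", .reclaimed = 1 },
	};
	struct demo_node *dn;

	/*
	 * The section markers go in first, as vfs_lock_init() does
	 * (the rover marker is omitted in this sketch).
	 */
	TAILQ_INSERT_TAIL(&freelist, &mid1, link);
	TAILQ_INSERT_TAIL(&freelist, &mid2, link);

	for (size_t i = 0; i < sizeof(nodes) / sizeof(nodes[0]); ++i)
		demo_free(&nodes[i]);

	/* Prints: dead -> no-cache -> mid1 -> swap-only -> mid2 -> vm-cached */
	TAILQ_FOREACH(dn, &freelist, link)
		printf("%s%s", dn->name,
		       TAILQ_NEXT(dn, link) ? " -> " : "\n");
	return 0;
}

Compiled and run, the sketch prints

	dead -> no-cache -> mid1 -> swap-only -> mid2 -> vm-cached

matching the HEAD / before-mid1 / before-mid2 / TAIL layout described in
the commit message.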