From de996e865902051a4e8e4945e7116be8850f871c Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 19 Aug 2009 13:54:29 -0700
Subject: [PATCH] HAMMER - Rework write pipelining

* Rework write pipelining so it is based on pending direct writes on an
  inode-by-inode basis.  ip->rsv_recs and hmp->rsv_recs are now
  decremented after the direct write has completed rather than when the
  sync code has processed the record.  This fixes serious buffer cache
  overloading when doing linear writes.

* Implement write clustering: issue cluster_write() or bawrite() calls
  when a filesystem block fills up, instead of relying on the buffer
  cache's bdwrite() to keep ahead of the mark.

* vfs.hammer.cluster_enable now affects both read and write clustering.
---
 sys/vfs/hammer/hammer.h        |  1 +
 sys/vfs/hammer/hammer_io.c     |  8 ++++++--
 sys/vfs/hammer/hammer_object.c | 35 +++++++++++++++++++++++++----------
 sys/vfs/hammer/hammer_vfsops.c |  3 +++
 sys/vfs/hammer/hammer_vnops.c  | 35 ++++++++++++++++++++++++++++++++++-
 5 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index a2b198a25f..501750cefd 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -859,6 +859,7 @@ extern int hammer_count_io_running_write;
 extern int hammer_count_io_locked;
 extern int hammer_limit_dirtybufspace;
 extern int hammer_limit_recs;
+extern int hammer_limit_inode_recs;
 extern int hammer_bio_count;
 extern int hammer_verify_zone;
 extern int hammer_verify_data;
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index ca44a9baee..a472879015 100644
--- a/sys/vfs/hammer/hammer_io.c
+++ b/sys/vfs/hammer/hammer_io.c
@@ -1294,10 +1294,14 @@ hammer_io_direct_write_complete(struct bio *nbio)
 
 	KKASSERT(record != NULL);
 	KKASSERT(record->flags & HAMMER_RECF_DIRECT_IO);
-	record->flags &= ~HAMMER_RECF_DIRECT_IO;
 	if (record->flags & HAMMER_RECF_DIRECT_WAIT) {
-		record->flags &= ~HAMMER_RECF_DIRECT_WAIT;
+		record->flags &= ~(HAMMER_RECF_DIRECT_IO |
+				   HAMMER_RECF_DIRECT_WAIT);
+		/* record can disappear once DIRECT_IO flag is cleared */
 		wakeup(&record->flags);
+	} else {
+		record->flags &= ~HAMMER_RECF_DIRECT_IO;
+		/* record can disappear once DIRECT_IO flag is cleared */
 	}
 }
 
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c
index 4f3f5c4c94..b54b6bb04b 100644
--- a/sys/vfs/hammer/hammer_object.c
+++ b/sys/vfs/hammer/hammer_object.c
@@ -363,6 +363,7 @@ hammer_rel_mem_record(struct hammer_record *record)
 	hammer_reserve_t resv;
 	hammer_inode_t ip;
 	hammer_inode_t target_ip;
+	int diddrop;
 
 	hammer_unref(&record->lock);
 
@@ -401,21 +402,18 @@ hammer_rel_mem_record(struct hammer_record *record)
 			hammer_ref(&target_ip->lock);
 		}
 
+		/*
+		 * Remove the record from the B-Tree
+		 */
 		if (record->flags & HAMMER_RECF_ONRBTREE) {
 			RB_REMOVE(hammer_rec_rb_tree,
 				  &record->ip->rec_tree,
 				  record);
-			KKASSERT(ip->rsv_recs > 0);
-			--hmp->rsv_recs;
-			--ip->rsv_recs;
-			hmp->rsv_databytes -= record->leaf.data_len;
 			record->flags &= ~HAMMER_RECF_ONRBTREE;
-
-			if (RB_EMPTY(&record->ip->rec_tree)) {
-				record->ip->flags &= ~HAMMER_INODE_XDIRTY;
-				record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
-				hammer_test_inode(record->ip);
-			}
+			KKASSERT(ip->rsv_recs > 0);
+			diddrop = 1;
+		} else {
+			diddrop = 0;
 		}
 
 		/*
@@ -428,6 +426,23 @@ hammer_rel_mem_record(struct hammer_record *record)
 			hammer_io_direct_wait(record);
 		}
 
+		/*
+		 * Account for the completion after the direct IO
+		 * has completed.
+		 */
+		if (diddrop) {
+			--hmp->rsv_recs;
+			--ip->rsv_recs;
+			hmp->rsv_databytes -= record->leaf.data_len;
+
+			if (RB_EMPTY(&record->ip->rec_tree)) {
+				record->ip->flags &= ~HAMMER_INODE_XDIRTY;
+				record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
+				hammer_test_inode(record->ip);
+			}
+			if (ip->rsv_recs == hammer_limit_inode_recs - 1)
+				wakeup(&ip->rsv_recs);
+		}
 
 		/*
 		 * Do this test after removing record from the B-Tree.
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index aa30526dbe..331f43f91a 100644
--- a/sys/vfs/hammer/hammer_vfsops.c
+++ b/sys/vfs/hammer/hammer_vfsops.c
@@ -95,6 +95,7 @@ int hammer_count_io_running_write;
 int hammer_count_io_locked;
 int hammer_limit_dirtybufspace;		/* per-mount */
 int hammer_limit_recs;			/* as a whole XXX */
+int hammer_limit_inode_recs = 1024;	/* per inode */
 int hammer_autoflush = 2000;		/* auto flush */
 int hammer_bio_count;
 int hammer_verify_zone;
@@ -132,6 +133,8 @@
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
 	   &hammer_limit_dirtybufspace, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_recs, CTLFLAG_RW,
 	   &hammer_limit_recs, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_inode_recs, CTLFLAG_RW,
+	   &hammer_limit_inode_recs, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_fsyncs, CTLFLAG_RD,
 	   &hammer_count_fsyncs, 0, "");
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index a92e9c4276..f7d3209648 100644
--- a/sys/vfs/hammer/hammer_vnops.c
+++ b/sys/vfs/hammer/hammer_vnops.c
@@ -379,7 +379,6 @@ hammer_vop_write(struct vop_write_args *ap)
 	int error;
 	int n;
 	int flags;
-	int delta;
 	int seqcount;
 	int bigwrite;
 
@@ -465,6 +464,22 @@ hammer_vop_write(struct vop_write_args *ap)
 		bwillwrite(blksize);
 
 		/*
+		 * Control the number of pending records associated with
+		 * this inode.  If too many have accumulated start a
+		 * flush.  Try to maintain a pipeline with the flusher.
+		 */
+		if (ip->rsv_recs >= hammer_limit_inode_recs) {
+			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+		}
+		if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
+			while (ip->rsv_recs >= hammer_limit_inode_recs) {
+				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
+			}
+			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+		}
+
+#if 0
+		/*
 		 * Do not allow HAMMER to blow out system memory by
 		 * accumulating too many records.  Records are so well
 		 * decoupled from the buffer cache that it is possible
@@ -503,6 +518,7 @@ hammer_vop_write(struct vop_write_args *ap)
 			if (delta > 0)
 				tsleep(&trans, 0, "hmrslo", delta);
 		}
+#endif
 
 		/*
 		 * Calculate the blocksize at the current offset and figure
@@ -602,12 +618,29 @@ hammer_vop_write(struct vop_write_args *ap)
 
 		/*
 		 * Final buffer disposition.
+		 *
+		 * Because meta-data updates are deferred, HAMMER is
+		 * especially sensitive to excessive bdwrite()s because
+		 * the I/O stream is not broken up by disk reads.  So the
+		 * buffer cache simply cannot keep up.
+		 *
+		 * WARNING! blksize is variable. cluster_write() is
+		 * expected to not blow up if it encounters buffers that
+		 * do not match the passed blksize.
 		 */
 		bp->b_flags |= B_AGE;
 		if (ap->a_ioflag & IO_SYNC) {
 			bwrite(bp);
 		} else if (ap->a_ioflag & IO_DIRECT) {
 			bawrite(bp);
+		} else if (offset + n == blksize) {
+			if (hammer_cluster_enable == 0 ||
+			    (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
+				bawrite(bp);
+			} else {
+				cluster_write(bp, ip->ino_data.size,
+					      blksize, seqcount);
+			}
 		} else {
 			bdwrite(bp);
 		}
-- 
2.11.4.GIT
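Note on the throttle added to hammer_vop_write() above: it is a two-threshold
producer/consumer pipeline.  The writer kicks off an inode flush once
hammer_limit_inode_recs records are pending, but only stalls once twice that
many have accumulated, sleeping until the completion path in
hammer_rel_mem_record() drops the count back under the limit and issues the
wakeup().  The userland sketch below illustrates that pattern only; it is not
part of the patch, and the names (LIMIT_RECS, rsv_recs, flusher) and the
pthread condition variables standing in for tsleep()/wakeup() are illustrative
assumptions.

/*
 * Minimal userland sketch of a two-threshold write throttle.
 * Illustrative only: LIMIT_RECS, rsv_recs and the condition variables
 * stand in for hammer_limit_inode_recs, ip->rsv_recs and the kernel's
 * tsleep()/wakeup(); this is not HAMMER code.
 */
#include <pthread.h>
#include <stdio.h>

#define LIMIT_RECS	8		/* soft per-"inode" limit */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake_writer = PTHREAD_COND_INITIALIZER;
static pthread_cond_t  wake_flusher = PTHREAD_COND_INITIALIZER;
static int rsv_recs;			/* records queued, not yet flushed */
static int done;

static void *
flusher(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done) {
		/* sleep only while there is nothing to flush */
		while (rsv_recs == 0 && !done)
			pthread_cond_wait(&wake_flusher, &lock);
		while (rsv_recs > 0) {
			--rsv_recs;	/* one record's I/O completes */
			if (rsv_recs == LIMIT_RECS - 1)
				pthread_cond_broadcast(&wake_writer);
		}
	}
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t td;
	int i;

	pthread_create(&td, NULL, flusher, NULL);
	for (i = 0; i < 100; ++i) {
		pthread_mutex_lock(&lock);
		if (rsv_recs >= LIMIT_RECS)	/* soft limit: kick flusher */
			pthread_cond_signal(&wake_flusher);
		while (rsv_recs >= LIMIT_RECS * 2)	/* hard limit: stall */
			pthread_cond_wait(&wake_writer, &lock);
		++rsv_recs;			/* queue one record */
		pthread_mutex_unlock(&lock);
	}
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&wake_flusher);
	pthread_mutex_unlock(&lock);
	pthread_join(td, NULL);
	printf("records left unflushed: %d\n", rsv_recs);
	return (0);
}

Keeping the stall threshold at twice the kick threshold is what preserves the
pipeline: by the time the writer would have to block, the flusher has normally
been running long enough to stay ahead of it.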