From de996e865902051a4e8e4945e7116be8850f871c Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 19 Aug 2009 13:54:29 -0700
Subject: [PATCH] HAMMER - Rework write pipelining

* Rework write pipelining so it is based on pending direct writes on an
  inode-by-inode basis.  ip->rsv_recs and hmp->rsv_recs are now
  decremented after the direct write has completed rather than when the
  sync code has processed the record.  This fixes serious buffer cache
  overloading when doing linear writes.

* Implement write clustering: issue cluster_write() or bawrite() calls
  when a filesystem block fills up, instead of relying on the buffer
  cache's bdwrite() to keep ahead of the mark.

* vfs.hammer.cluster_enable now affects both read and write clustering.
---
 sys/vfs/hammer/hammer.h        |  1 +
 sys/vfs/hammer/hammer_io.c     |  8 ++++++--
 sys/vfs/hammer/hammer_object.c | 35 +++++++++++++++++++++++++----------
 sys/vfs/hammer/hammer_vfsops.c |  3 +++
 sys/vfs/hammer/hammer_vnops.c  | 35 ++++++++++++++++++++++++++++++++++-
 5 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index a2b198a25f..501750cefd 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -859,6 +859,7 @@ extern int hammer_count_io_running_write;
 extern int hammer_count_io_locked;
 extern int hammer_limit_dirtybufspace;
 extern int hammer_limit_recs;
+extern int hammer_limit_inode_recs;
 extern int hammer_bio_count;
 extern int hammer_verify_zone;
 extern int hammer_verify_data;
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index ca44a9baee..a472879015 100644
--- a/sys/vfs/hammer/hammer_io.c
+++ b/sys/vfs/hammer/hammer_io.c
@@ -1294,10 +1294,14 @@ hammer_io_direct_write_complete(struct bio *nbio)
 
 	KKASSERT(record != NULL);
 	KKASSERT(record->flags & HAMMER_RECF_DIRECT_IO);
-	record->flags &= ~HAMMER_RECF_DIRECT_IO;
 	if (record->flags & HAMMER_RECF_DIRECT_WAIT) {
-		record->flags &= ~HAMMER_RECF_DIRECT_WAIT;
+		record->flags &= ~(HAMMER_RECF_DIRECT_IO |
+				   HAMMER_RECF_DIRECT_WAIT);
+		/* record can disappear once DIRECT_IO flag is cleared */
 		wakeup(&record->flags);
+	} else {
+		record->flags &= ~HAMMER_RECF_DIRECT_IO;
+		/* record can disappear once DIRECT_IO flag is cleared */
 	}
 }
 
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c
index 4f3f5c4c94..b54b6bb04b 100644
--- a/sys/vfs/hammer/hammer_object.c
+++ b/sys/vfs/hammer/hammer_object.c
@@ -363,6 +363,7 @@ hammer_rel_mem_record(struct hammer_record *record)
 	hammer_reserve_t resv;
 	hammer_inode_t ip;
 	hammer_inode_t target_ip;
+	int diddrop;
 
 	hammer_unref(&record->lock);
 
@@ -401,21 +402,18 @@ hammer_rel_mem_record(struct hammer_record *record)
 			hammer_ref(&target_ip->lock);
 		}
 
+		/*
+		 * Remove the record from the B-Tree
+		 */
 		if (record->flags & HAMMER_RECF_ONRBTREE) {
 			RB_REMOVE(hammer_rec_rb_tree,
 				  &record->ip->rec_tree,
 				  record);
-			KKASSERT(ip->rsv_recs > 0);
-			--hmp->rsv_recs;
-			--ip->rsv_recs;
-			hmp->rsv_databytes -= record->leaf.data_len;
 			record->flags &= ~HAMMER_RECF_ONRBTREE;
-
-			if (RB_EMPTY(&record->ip->rec_tree)) {
-				record->ip->flags &= ~HAMMER_INODE_XDIRTY;
-				record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
-				hammer_test_inode(record->ip);
-			}
+			KKASSERT(ip->rsv_recs > 0);
+			diddrop = 1;
+		} else {
+			diddrop = 0;
 		}
 
 		/*
@@ -428,6 +426,23 @@ hammer_rel_mem_record(struct hammer_record *record)
 			hammer_io_direct_wait(record);
 		}
 
+		/*
+		 * Account for the completion after the direct IO
+		 * has completed.
+		 */
+		if (diddrop) {
+			--hmp->rsv_recs;
+			--ip->rsv_recs;
+			hmp->rsv_databytes -= record->leaf.data_len;
+
+			if (RB_EMPTY(&record->ip->rec_tree)) {
+				record->ip->flags &= ~HAMMER_INODE_XDIRTY;
+				record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
+				hammer_test_inode(record->ip);
+			}
+			if (ip->rsv_recs == hammer_limit_inode_recs - 1)
+				wakeup(&ip->rsv_recs);
+		}
 
 		/*
 		 * Do this test after removing record from the B-Tree.
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index aa30526dbe..331f43f91a 100644
--- a/sys/vfs/hammer/hammer_vfsops.c
+++ b/sys/vfs/hammer/hammer_vfsops.c
@@ -95,6 +95,7 @@ int hammer_count_io_running_write;
 int hammer_count_io_locked;
 int hammer_limit_dirtybufspace;		/* per-mount */
 int hammer_limit_recs;			/* as a whole XXX */
+int hammer_limit_inode_recs = 1024;	/* per inode */
 int hammer_autoflush = 2000;		/* auto flush */
 int hammer_bio_count;
 int hammer_verify_zone;
@@ -132,6 +133,8 @@
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
 	   &hammer_limit_dirtybufspace, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_recs, CTLFLAG_RW,
 	   &hammer_limit_recs, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_inode_recs, CTLFLAG_RW,
+	   &hammer_limit_inode_recs, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_fsyncs, CTLFLAG_RD,
 	   &hammer_count_fsyncs, 0, "");
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index a92e9c4276..f7d3209648 100644
--- a/sys/vfs/hammer/hammer_vnops.c
+++ b/sys/vfs/hammer/hammer_vnops.c
@@ -379,7 +379,6 @@ hammer_vop_write(struct vop_write_args *ap)
 	int error;
 	int n;
 	int flags;
-	int delta;
 	int seqcount;
 	int bigwrite;
 
@@ -465,6 +464,22 @@ hammer_vop_write(struct vop_write_args *ap)
 		bwillwrite(blksize);
 
 		/*
+		 * Control the number of pending records associated with
+		 * this inode.  If too many have accumulated start a
+		 * flush.  Try to maintain a pipeline with the flusher.
+		 */
+		if (ip->rsv_recs >= hammer_limit_inode_recs) {
+			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+		}
+		if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
+			while (ip->rsv_recs >= hammer_limit_inode_recs) {
+				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
+			}
+			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+		}
+
+#if 0
+		/*
 		 * Do not allow HAMMER to blow out system memory by
 		 * accumulating too many records.  Records are so well
 		 * decoupled from the buffer cache that it is possible
@@ -503,6 +518,7 @@ hammer_vop_write(struct vop_write_args *ap)
 			if (delta > 0)
 				tsleep(&trans, 0, "hmrslo", delta);
 		}
+#endif
 
 		/*
 		 * Calculate the blocksize at the current offset and figure
@@ -602,12 +618,29 @@ hammer_vop_write(struct vop_write_args *ap)
 
 		/*
 		 * Final buffer disposition.
+		 *
+		 * Because meta-data updates are deferred, HAMMER is
+		 * especially sensitive to excessive bdwrite()s because
+		 * the I/O stream is not broken up by disk reads.  So the
+		 * buffer cache simply cannot keep up.
+		 *
+		 * WARNING! blksize is variable. cluster_write() is
+		 * expected to not blow up if it encounters buffers that
+		 * do not match the passed blksize.
 		 */
 		bp->b_flags |= B_AGE;
 		if (ap->a_ioflag & IO_SYNC) {
 			bwrite(bp);
 		} else if (ap->a_ioflag & IO_DIRECT) {
 			bawrite(bp);
+		} else if (offset + n == blksize) {
+			if (hammer_cluster_enable == 0 ||
+			    (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
+				bawrite(bp);
+			} else {
+				cluster_write(bp, ip->ino_data.size,
+					      blksize, seqcount);
+			}
 		} else {
 			bdwrite(bp);
 		}
-- 
2.11.4.GIT
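Note on the throttle added to hammer_vop_write() above: it is a two-threshold
producer/consumer pipeline.  The writer kicks off an inode flush once
hammer_limit_inode_recs records are pending, but only stalls once twice that
many have accumulated, sleeping until the completion path in
hammer_rel_mem_record() drops the count back under the limit and issues the
wakeup().  The userland sketch below illustrates that pattern only; it is not
part of the patch, and the names (LIMIT_RECS, rsv_recs, flusher) and the
pthread condition variables standing in for tsleep()/wakeup() are illustrative
assumptions.

/*
 * Minimal userland sketch of a two-threshold write throttle.
 * Illustrative only: LIMIT_RECS, rsv_recs and the condition variables
 * stand in for hammer_limit_inode_recs, ip->rsv_recs and the kernel's
 * tsleep()/wakeup(); this is not HAMMER code.
 */
#include <pthread.h>
#include <stdio.h>

#define LIMIT_RECS	8		/* soft per-"inode" limit */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake_writer = PTHREAD_COND_INITIALIZER;
static pthread_cond_t  wake_flusher = PTHREAD_COND_INITIALIZER;
static int rsv_recs;			/* records queued, not yet flushed */
static int done;

static void *
flusher(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done) {
		/* sleep only while there is nothing to flush */
		while (rsv_recs == 0 && !done)
			pthread_cond_wait(&wake_flusher, &lock);
		while (rsv_recs > 0) {
			--rsv_recs;	/* one record's I/O completes */
			if (rsv_recs == LIMIT_RECS - 1)
				pthread_cond_broadcast(&wake_writer);
		}
	}
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t td;
	int i;

	pthread_create(&td, NULL, flusher, NULL);
	for (i = 0; i < 100; ++i) {
		pthread_mutex_lock(&lock);
		if (rsv_recs >= LIMIT_RECS)	/* soft limit: kick flusher */
			pthread_cond_signal(&wake_flusher);
		while (rsv_recs >= LIMIT_RECS * 2)	/* hard limit: stall */
			pthread_cond_wait(&wake_writer, &lock);
		++rsv_recs;			/* queue one record */
		pthread_mutex_unlock(&lock);
	}
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&wake_flusher);
	pthread_mutex_unlock(&lock);
	pthread_join(td, NULL);
	printf("records left unflushed: %d\n", rsv_recs);
	return (0);
}

Keeping the stall threshold at twice the kick threshold is what preserves the
pipeline: by the time the writer would have to block, the flusher has normally
been running long enough to stay ahead of it.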