From a9d52b76688f644d63b53aaf4884cf686b0131ff Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 28 Jun 2008 18:10:55 +0000
Subject: [PATCH] HAMMER 59C/Many: Stabilization pass - fixes for large file issues

* Rename hammer_ip_delete_range_all() to hammer_ip_delete_clean().  This
  function is no longer responsible for deleting data and db records when
  a file is removed.  The normal hammer_ip_delete_range() function now
  does that for all cases.

* Adjust hammer_ip_delete_range() to not restart at offset 0 when it hits
  a deadlock or must defer the remainder of the operation to another flush
  cycle.  When removing very large files (greater than 500GB) the constant
  restarts resulted in exponential delays.  Instead, the function now
  restarts near where it left off on the previous try.

* Change ip->sync_trunc_off's dynamics so it can be used to cache the
  restart offset.  Truncation state can now be left intact on the backend,
  and the frontend merges new truncations into it instead of replacing it.
  Add a new field, ip->save_trunc_off, to handle the append overwrite-check
  optimization (sync_trunc_off was previously used for that, but no longer
  is).

* Fix minor bugs and use the new bwillwrite() API.

* Remove the #if 0'd out cluster_write() code entirely (it never worked
  anyway).  The new buffer cache subsystem in the kernel allows bdwrite()
  to be used with no real penalty.
---
 sys/vfs/hammer/hammer.h        |  5 ++-
 sys/vfs/hammer/hammer_disk.h   |  8 ++--
 sys/vfs/hammer/hammer_inode.c  | 75 ++++++++++++++++++++------------
 sys/vfs/hammer/hammer_object.c | 99 +++++++++++++++++++++++-------------
 sys/vfs/hammer/hammer_vnops.c  | 33 ++------------
 5 files changed, 113 insertions(+), 107 deletions(-)

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 5e01f3304d..17d335f0ab 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.94 2008/06/27 20:56:59 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.95 2008/06/28 18:10:55 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -257,6 +257,7 @@ struct hammer_inode {
 	 */
 	int		sync_flags;	/* to-sync flags cache */
 	off_t		sync_trunc_off;	/* to-sync truncation */
+	off_t		save_trunc_off;	/* write optimization */
 	struct hammer_btree_leaf_elm sync_ino_leaf; /* to-sync cache */
 	struct hammer_inode_data sync_ino_data; /* to-sync cache */
 };
@@ -974,7 +975,7 @@ int hammer_ip_add_record(struct hammer_transaction *trans,
 			hammer_record_t record);
 int hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip,
 			int64_t ran_beg, int64_t ran_end, int truncating);
-int hammer_ip_delete_range_all(hammer_cursor_t cursor, hammer_inode_t ip,
+int hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip,
 			int *countp);
 int hammer_ip_sync_data(hammer_cursor_t cursor, hammer_inode_t ip,
 			int64_t offset, void *data, int bytes);
diff --git a/sys/vfs/hammer/hammer_disk.h b/sys/vfs/hammer/hammer_disk.h
index 84be158942..2ecb72da5e 100644
--- a/sys/vfs/hammer/hammer_disk.h
+++ b/sys/vfs/hammer/hammer_disk.h
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
* - * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.42 2008/06/23 21:42:48 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.43 2008/06/28 18:10:55 dillon Exp $ */ #ifndef VFS_HAMMER_DISK_H_ @@ -526,9 +526,6 @@ typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t; /* * Record types are fairly straightforward. The B-Tree includes the record * type in its index sort. - * - * NOTE: hammer_ip_delete_range_all() deletes all record types greater - * then HAMMER_RECTYPE_INODE. */ #define HAMMER_RECTYPE_UNKNOWN 0 #define HAMMER_RECTYPE_LOWEST 1 /* lowest record type avail */ @@ -541,6 +538,9 @@ typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t; #define HAMMER_RECTYPE_EXT 0x0013 /* ext attributes */ #define HAMMER_RECTYPE_FIX 0x0014 /* fixed attribute */ #define HAMMER_RECTYPE_MOVED 0x8000 /* special recovery flag */ +#define HAMMER_RECTYPE_MAX 0xFFFF + +#define HAMMER_RECTYPE_CLEAN_START HAMMER_RECTYPE_EXT #define HAMMER_FIXKEY_SYMLINK 1 diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index 8100d80edf..35d4963d5d 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.86 2008/06/27 20:56:59 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.87 2008/06/28 18:10:55 dillon Exp $ */ #include "hammer.h" @@ -346,7 +346,8 @@ loop: ip->cache[1].ip = ip; if (hmp->ronly || (hmp->hflags & HMNT_SLAVE)) ip->flags |= HAMMER_INODE_RO; - ip->sync_trunc_off = ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; + ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off = + 0x7FFFFFFFFFFFFFFFLL; RB_INIT(&ip->rec_tree); TAILQ_INIT(&ip->target_list); @@ -395,12 +396,12 @@ retry: /* * The file should not contain any data past the file size - * stored in the inode. Setting sync_trunc_off to the + * stored in the inode. Setting save_trunc_off to the * file size instead of max reduces B-Tree lookup overheads * on append by allowing the flusher to avoid checking for * record overwrites. */ - ip->sync_trunc_off = ip->ino_data.size; + ip->save_trunc_off = ip->ino_data.size; } /* @@ -504,6 +505,7 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap, ip->cache[1].ip = ip; ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; + /* ip->save_trunc_off = 0; (already zero) */ RB_INIT(&ip->rec_tree); TAILQ_INIT(&ip->target_list); @@ -1315,20 +1317,36 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags) /* * Snapshot the state of the inode for the backend flusher. * - * The truncation must be retained in the frontend until after - * we've actually performed the record deletion. - * - * We continue to retain sync_trunc_off even when all truncations + * We continue to retain save_trunc_off even when all truncations * have been resolved as an optimization to determine if we can * skip the B-Tree lookup for overwrite deletions. * * NOTE: The DELETING flag is a mod flag, but it is also sticky, * and stays in ip->flags. Once set, it stays set until the * inode is destroyed. + * + * NOTE: If a truncation from a previous flush cycle had to be + * continued into this one, the TRUNCATED flag will still be + * set in sync_flags. 
*/ - ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK); - if (ip->sync_flags & HAMMER_INODE_TRUNCATED) - ip->sync_trunc_off = ip->trunc_off; + if (ip->flags & HAMMER_INODE_TRUNCATED) { + if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { + if (ip->sync_trunc_off > ip->trunc_off) + ip->sync_trunc_off = ip->trunc_off; + } else { + ip->sync_trunc_off = ip->trunc_off; + } + + /* + * The save_trunc_off used to cache whether the B-Tree + * holds any records past that point is not used until + * after the truncation has succeeded, so we can safely + * set it now. + */ + if (ip->save_trunc_off > ip->sync_trunc_off) + ip->save_trunc_off = ip->sync_trunc_off; + } + ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK); ip->sync_ino_leaf = ip->ino_leaf; ip->sync_ino_data = ip->ino_data; ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; @@ -1526,8 +1544,10 @@ hammer_flush_inode_done(hammer_inode_t ip) /* * Merge left-over flags back into the frontend and fix the state. + * Incomplete truncations are retained by the backend. */ - ip->flags |= ip->sync_flags; + ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED; + ip->sync_flags &= HAMMER_INODE_TRUNCATED; /* * The backend may have adjusted nlinks, so if the adjusted nlinks @@ -1564,6 +1584,14 @@ hammer_flush_inode_done(hammer_inode_t ip) ip->flags |= HAMMER_INODE_REFLUSH; /* + * Clean up the vnode ref + */ + if (ip->flags & HAMMER_INODE_VHELD) { + ip->flags &= ~HAMMER_INODE_VHELD; + vrele(ip->vp); + } + + /* * Adjust flush_state. The target state (idle or setup) shouldn't * be terribly important since we will reflush if we really need * to do anything. @@ -1591,14 +1619,6 @@ hammer_flush_inode_done(hammer_inode_t ip) --hammer_count_iqueued; /* - * Clean up the vnode ref - */ - if (ip->flags & HAMMER_INODE_VHELD) { - ip->flags &= ~HAMMER_INODE_VHELD; - vrele(ip->vp); - } - - /* * If the frontend made more changes and requested another flush, * then try to get it running. */ @@ -1892,7 +1912,8 @@ hammer_sync_inode(hammer_inode_t ip) * while we were blocked so we only use sync_trunc_off. * * This operation can blow out the buffer cache, EWOULDBLOCK - * means we were unable to complete the deletion. + * means we were unable to complete the deletion. The + * deletion will update sync_trunc_off in that case. */ error = hammer_ip_delete_range(&cursor, ip, aligned_trunc_off, @@ -1960,6 +1981,10 @@ hammer_sync_inode(hammer_inode_t ip) /* * If we are deleting the inode the frontend had better not have * any active references on elements making up the inode. + * + * The call to hammer_ip_delete_clean() cleans up auxillary records + * but not DB or DATA records. Those must have already been deleted + * by the normal truncation mechanic. 
*/ if (error == 0 && ip->sync_ino_data.nlinks == 0 && RB_EMPTY(&ip->rec_tree) && @@ -1967,7 +1992,7 @@ hammer_sync_inode(hammer_inode_t ip) (ip->flags & HAMMER_INODE_DELETED) == 0) { int count1 = 0; - error = hammer_ip_delete_range_all(&cursor, ip, &count1); + error = hammer_ip_delete_clean(&cursor, ip, &count1); if (error == 0) { ip->flags |= HAMMER_INODE_DELETED; ip->sync_flags &= ~HAMMER_INODE_DELETING; @@ -1995,12 +2020,8 @@ hammer_sync_inode(hammer_inode_t ip) --ip->hmp->rootvol->ondisk->vol0_stat_inodes; hammer_modify_volume_done(trans.rootvol); } - } else if (error == EWOULDBLOCK) { - ip->flags |= HAMMER_INODE_WOULDBLOCK; - error = 0; - goto defer_buffer_flush; } else { - Debugger("hammer_ip_delete_range_all errored"); + Debugger("hammer_ip_delete_clean errored"); } } diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index e429a66e82..5670a1c28d 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.77 2008/06/27 20:56:59 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.78 2008/06/28 18:10:55 dillon Exp $ */ #include "hammer.h" @@ -969,13 +969,13 @@ hammer_record_needs_overwrite_delete(hammer_record_t record) file_offset = record->leaf.base.key; else file_offset = record->leaf.base.key - record->leaf.data_len; - r = (file_offset < ip->sync_trunc_off); + r = (file_offset < ip->save_trunc_off); if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { - if (ip->sync_trunc_off <= record->leaf.base.key) - ip->sync_trunc_off = record->leaf.base.key + 1; + if (ip->save_trunc_off <= record->leaf.base.key) + ip->save_trunc_off = record->leaf.base.key + 1; } else { - if (ip->sync_trunc_off < record->leaf.base.key) - ip->sync_trunc_off = record->leaf.base.key; + if (ip->save_trunc_off < record->leaf.base.key) + ip->save_trunc_off = record->leaf.base.key; } return(r); } @@ -1581,16 +1581,23 @@ hammer_ip_resolve_data(hammer_cursor_t cursor) * If truncating is non-zero in-memory records associated with the back-end * are ignored. If truncating is > 1 we can return EWOULDBLOCK. * - * NOTE: An unaligned range will cause new records to be added to cover - * the edge cases. (XXX not implemented yet). + * NOTES: * - * NOTE: Replacement via reservations (see hammer_ip_sync_record_cursor()) - * also do not deal with unaligned ranges. + * * An unaligned range will cause new records to be added to cover + * the edge cases. (XXX not implemented yet). * - * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024). + * * Replacement via reservations (see hammer_ip_sync_record_cursor()) + * also do not deal with unaligned ranges. * - * NOTE: Record keys for regular file data have to be special-cased since - * they indicate the end of the range (key = base + bytes). + * * ran_end is inclusive (e.g. 0,1023 instead of 0,1024). + * + * * Record keys for regular file data have to be special-cased since + * they indicate the end of the range (key = base + bytes). + * + * * This function may be asked to delete ridiculously huge ranges, for + * example if someone truncates or removes a 1TB regular file. We + * must be very careful on restarts and we may have to stop w/ + * EWOULDBLOCK to avoid blowing out the buffer cache. 
*/ int hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip, @@ -1600,6 +1607,7 @@ hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip, hammer_btree_leaf_elm_t leaf; int error; int64_t off; + int64_t tmp64; #if 0 kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end); @@ -1614,35 +1622,35 @@ retry: cursor->key_beg.create_tid = 0; cursor->key_beg.delete_tid = 0; cursor->key_beg.obj_type = 0; - cursor->asof = ip->obj_asof; - cursor->flags &= ~HAMMER_CURSOR_INITMASK; - cursor->flags |= HAMMER_CURSOR_ASOF; - cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY; - cursor->flags |= HAMMER_CURSOR_BACKEND; - cursor->key_end = cursor->key_beg; if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { cursor->key_beg.key = ran_beg; cursor->key_beg.rec_type = HAMMER_RECTYPE_DB; - cursor->key_end.rec_type = HAMMER_RECTYPE_DB; - cursor->key_end.key = ran_end; } else { /* * The key in the B-Tree is (base+bytes), so the first possible * matching key is ran_beg + 1. */ - int64_t tmp64; - cursor->key_beg.key = ran_beg + 1; cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA; - cursor->key_end.rec_type = HAMMER_RECTYPE_DATA; + } + cursor->key_end = cursor->key_beg; + if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { + cursor->key_end.key = ran_end; + } else { tmp64 = ran_end + MAXPHYS + 1; /* work around GCC-4 bug */ if (tmp64 < ran_end) cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL; else cursor->key_end.key = ran_end + MAXPHYS + 1; } + + cursor->asof = ip->obj_asof; + cursor->flags &= ~HAMMER_CURSOR_INITMASK; + cursor->flags |= HAMMER_CURSOR_ASOF; + cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY; + cursor->flags |= HAMMER_CURSOR_BACKEND; cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE; error = hammer_ip_first(cursor); @@ -1689,6 +1697,8 @@ retry: break; panic("hammer right edge case\n"); } + } else { + off = leaf->base.key; } /* @@ -1700,20 +1710,27 @@ retry: * data if the retention policy dictates. The function * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next() * uses to perform a fixup. - * - * If we have built up too many meta-buffers we risk - * deadlocking the kernel and must stop. This can occur - * when deleting ridiculously huge files. */ if (truncating == 0 || hammer_cursor_ondisk(cursor)) { error = hammer_ip_delete_record(cursor, ip, trans->tid); + /* + * If we have built up too many meta-buffers we risk + * deadlocking the kernel and must stop. This can + * occur when deleting ridiculously huge files. + * sync_trunc_off is updated so the next cycle does + * not re-iterate records we have already deleted. + * + * This is only done with formal truncations. + */ if (truncating > 1 && error == 0 && hammer_flusher_meta_limit(ip->hmp)) { + ip->sync_trunc_off = off; error = EWOULDBLOCK; } } if (error) break; + ran_beg = off; /* for restart */ error = hammer_ip_next(cursor); } if (cursor->node) @@ -1731,17 +1748,13 @@ retry: } /* - * Backend truncation - delete all records. - * - * Delete all user records associated with an inode except the inode record - * itself. Directory entries are not deleted (they must be properly disposed - * of or nlinks would get upset). - * - * This function can return EWOULDBLOCK. + * This function deletes remaining auxillary records when an inode is + * being deleted. This function explicitly does not delete the + * inode record, directory entry, data, or db records. Those must be + * properly disposed of prior to this call. 
*/ int -hammer_ip_delete_range_all(hammer_cursor_t cursor, hammer_inode_t ip, - int *countp) +hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp) { hammer_transaction_t trans = cursor->trans; hammer_btree_leaf_elm_t leaf; @@ -1756,11 +1769,11 @@ retry: cursor->key_beg.create_tid = 0; cursor->key_beg.delete_tid = 0; cursor->key_beg.obj_type = 0; - cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE + 1; + cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START; cursor->key_beg.key = HAMMER_MIN_KEY; cursor->key_end = cursor->key_beg; - cursor->key_end.rec_type = 0xFFFF; + cursor->key_end.rec_type = HAMMER_RECTYPE_MAX; cursor->key_end.key = HAMMER_MAX_KEY; cursor->asof = ip->obj_asof; @@ -1789,12 +1802,8 @@ retry: * Directory entries (and delete-on-disk directory entries) * must be synced and cannot be deleted. */ - if (leaf->base.rec_type != HAMMER_RECTYPE_DIRENTRY) { - error = hammer_ip_delete_record(cursor, ip, trans->tid); - ++*countp; - if (error == 0 && hammer_flusher_meta_limit(ip->hmp)) - error = EWOULDBLOCK; - } + error = hammer_ip_delete_record(cursor, ip, trans->tid); + ++*countp; if (error) break; error = hammer_ip_next(cursor); diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 4779b294b1..0e9f06a56f 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.77 2008/06/26 04:06:23 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.78 2008/06/28 18:10:55 dillon Exp $ */ #include @@ -359,6 +359,8 @@ hammer_vop_write(struct vop_write_args *ap) if ((error = hammer_checkspace(hmp)) != 0) break; + blksize = hammer_blocksize(uio->uio_offset); + /* * Do not allow HAMMER to blow out the buffer cache. Very * large UIOs can lockout other processes due to bwillwrite() @@ -387,7 +389,7 @@ hammer_vop_write(struct vop_write_args *ap) if (uio->uio_segflg != UIO_NOCOPY) { vn_unlock(ap->a_vp); if ((ap->a_ioflag & IO_NOBWILL) == 0) - bwillwrite(); + bwillwrite(blksize); } /* @@ -431,7 +433,6 @@ hammer_vop_write(struct vop_write_args *ap) * Calculate the blocksize at the current offset and figure * out how much we can actually write. */ - blksize = hammer_blocksize(uio->uio_offset); blkmask = blksize - 1; offset = (int)uio->uio_offset & blkmask; base_offset = uio->uio_offset & ~(int64_t)blkmask; @@ -524,9 +525,6 @@ hammer_vop_write(struct vop_write_args *ap) /* * Final buffer disposition. * - * If write_mode is non-zero we call bawrite() - * unconditionally. Otherwise we only use bawrite() - * if the writes are clearly sequential. */ bp->b_flags |= B_AGE; if (ap->a_ioflag & IO_SYNC) { @@ -536,29 +534,6 @@ hammer_vop_write(struct vop_write_args *ap) } else { bdwrite(bp); } -#if 0 - else if (hammer_write_mode && - ((int)uio->uio_offset & blkmask) == 0) { -#if 0 - bp->b_flags |= B_CLUSTEROK; - cluster_write(bp, ip->ino_data.size, XXX seqcount); -#else - bdwrite(bp); -#endif - } else if ((ap->a_ioflag >> 16) == IO_SEQMAX && - ((int)uio->uio_offset & blkmask) == 0) { - /* - * If seqcount indicates sequential operation and - * we just finished filling a buffer, push it out - * now to prevent the buffer cache from becoming - * too full, which would trigger non-optimal - * flushes. - */ - bawrite(bp); - } else { - bdwrite(bp); - } -#endif } hammer_done_transaction(&trans); return (error); -- 2.11.4.GIT
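
As an illustration of the restart behaviour described in the commit message
(hammer_ip_delete_range() no longer rescanning from offset 0 after a
deferral), here is a minimal, self-contained C sketch.  Every identifier in
it (demo_record, demo_delete_range, and the 'budget' parameter standing in
for the hammer_flusher_meta_limit() check) is a hypothetical simplification
for illustration, not HAMMER's actual API.

    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical stand-in for a file's on-media data records, sorted by offset. */
    struct demo_record {
            int64_t off;            /* base offset of the record's data */
            int     deleted;
    };

    /*
     * Delete every record at or beyond *ran_begp, but stop with EWOULDBLOCK once
     * 'budget' records have been deleted in this pass (standing in for the
     * hammer_flusher_meta_limit() check).  *ran_begp plays the role of
     * ip->sync_trunc_off: it is advanced as records are destroyed, so the next
     * pass resumes near where this one stopped instead of rescanning from 0.
     */
    static int
    demo_delete_range(struct demo_record *recs, size_t nrecs,
                      int64_t *ran_begp, int budget)
    {
            int deletions = 0;
            size_t i;

            for (i = 0; i < nrecs; ++i) {
                    if (recs[i].deleted || recs[i].off < *ran_begp)
                            continue;
                    recs[i].deleted = 1;            /* "hammer_ip_delete_record()" */
                    *ran_begp = recs[i].off;        /* cache the restart offset */
                    if (++deletions >= budget)
                            return (EWOULDBLOCK);   /* defer the rest to the next flush cycle */
            }
            return (0);
    }

A caller truncating a huge file simply re-invokes this until it returns 0.
Each record is then visited roughly once across all passes, whereas the old
behaviour of restarting at offset 0 re-walked already-deleted ranges on every
retry, which is what made removing files in the hundreds of gigabytes so slow.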
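
A second sketch, under the same caveat that all names here are hypothetical
stand-ins, shows the frontend/backend hand-off implied by the new
sync_trunc_off/save_trunc_off dynamics: a new frontend truncation is merged
into (rather than replacing) a truncation the backend is still processing,
and save_trunc_off is pulled down so the append overwrite-check optimization
stays valid.  The real logic is in hammer_flush_inode_core() in
sys/vfs/hammer/hammer_inode.c above.

    #include <stdint.h>

    #define DEMO_NO_TRUNCATION 0x7FFFFFFFFFFFFFFFLL  /* "no truncation pending" sentinel */

    /*
     * Hypothetical, trimmed-down inode state holding only the truncation
     * fields discussed in the commit message.
     */
    struct demo_inode {
            int64_t trunc_off;          /* frontend: lowest new truncation offset */
            int64_t sync_trunc_off;     /* backend: truncation still being synced */
            int64_t save_trunc_off;     /* append/overwrite-check optimization */
            int     frontend_truncated; /* stand-in for HAMMER_INODE_TRUNCATED in ip->flags */
            int     backend_truncated;  /* stand-in for HAMMER_INODE_TRUNCATED in ip->sync_flags */
    };

    /*
     * Sketch of the snapshot step: merge a new frontend truncation into any
     * truncation the backend is still working on, keeping the lower offset,
     * rather than overwriting the backend's state.
     */
    static void
    demo_snapshot_truncation(struct demo_inode *ip)
    {
            if (ip->frontend_truncated) {
                    if (ip->backend_truncated) {
                            if (ip->sync_trunc_off > ip->trunc_off)
                                    ip->sync_trunc_off = ip->trunc_off;
                    } else {
                            ip->sync_trunc_off = ip->trunc_off;
                            ip->backend_truncated = 1;
                    }
                    /*
                     * save_trunc_off caches "no records exist at or beyond this
                     * offset".  Once the truncation completes no records will
                     * remain at or beyond sync_trunc_off, and the cache is only
                     * consulted after the truncation succeeds, so it can safely
                     * be lowered now.
                     */
                    if (ip->save_trunc_off > ip->sync_trunc_off)
                            ip->save_trunc_off = ip->sync_trunc_off;
                    /* the truncation now belongs to the backend */
                    ip->trunc_off = DEMO_NO_TRUNCATION;
                    ip->frontend_truncated = 0;
            }
    }

Keeping the minimum of the two offsets is what lets a truncation that spans
several flush cycles coexist with new, lower truncations issued by the
frontend in the meantime.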