From 67ccf301480f9e849f61618db28f2fc2f383f8c6 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sat, 26 Apr 2008 19:08:14 +0000 Subject: [PATCH] HAMMER 38E/Many: Undo/Synchronization and crash recovery * Clean up interlocks between the frontend and backend. * Deal with the case where the backend needs to sync a record to disk that the frontend wishes to delete. This basically just involves converting the record from a deleted in-memory record to a delete-on-disk record, so the synced record does not become visible to userland. * Deal with the case when an inode is being destroyed where the backend wishes to delete an in-memory record without syncing it to disk. * Document numerous special cases for future attention. --- sys/vfs/hammer/hammer.h | 21 ++++--- sys/vfs/hammer/hammer_btree.c | 5 +- sys/vfs/hammer/hammer_inode.c | 72 +++++++++++++++------- sys/vfs/hammer/hammer_object.c | 128 ++++++++++++++++++++++++++++------------ sys/vfs/hammer/hammer_recover.c | 7 ++- 5 files changed, 163 insertions(+), 70 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 1d73c382c..7110caec4 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.51 2008/04/26 08:02:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.52 2008/04/26 19:08:14 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -275,13 +275,19 @@ struct hammer_record { typedef struct hammer_record *hammer_record_t; +/* + * Record flags. Note that FE can only be set by the frontend if the + * record has not been interlocked by the backend w/ BE. + */ #define HAMMER_RECF_ALLOCDATA 0x0001 #define HAMMER_RECF_ONRBTREE 0x0002 #define HAMMER_RECF_DELETED_FE 0x0004 /* deleted (frontend) */ -#define HAMMER_RECF_INBAND 0x0008 -#define HAMMER_RECF_DELETED_BE 0x0010 /* deleted (backend) */ -#define HAMMER_RECF_WANTED 0x0020 -#define HAMMER_RECF_DELETE_ONDISK 0x0040 +#define HAMMER_RECF_DELETED_BE 0x0008 /* deleted (backend) */ +#define HAMMER_RECF_INBAND 0x0010 +#define HAMMER_RECF_INTERLOCK_BE 0x0020 /* backend interlock */ +#define HAMMER_RECF_WANTED 0x0040 +#define HAMMER_RECF_DELETE_ONDISK 0x0080 +#define HAMMER_RECF_CONVERT_DELETE_ONDISK 0x0100 /* special case */ /* * In-memory structures representing on-disk structures. @@ -567,11 +573,10 @@ int hammer_sync_hmp(hammer_mount_t hmp, int waitfor); hammer_record_t hammer_alloc_mem_record(hammer_inode_t ip); -void hammer_flush_record_done(hammer_record_t record); +void hammer_flush_record_done(hammer_record_t record, int error); void hammer_wait_mem_record(hammer_record_t record); void hammer_rel_mem_record(hammer_record_t record); -void hammer_delete_mem_record(hammer_record_t record); - +void hammer_cleardep_mem_record(struct hammer_record *record); int hammer_cursor_up(hammer_cursor_t cursor); int hammer_cursor_down(hammer_cursor_t cursor); diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index f9cbaa2f1..73205a459 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.38 2008/04/26 08:02:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.39 2008/04/26 19:08:14 dillon Exp $ */ /* @@ -654,6 +654,9 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags) * flag set and that call must return ENOENT before this function can be * called. * + * The caller may depend on the cursor's exclusive lock after return to + * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE_ONDISK). + * * ENOSPC is returned if there is no room to insert a new record. */ int diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index ffedc1456..c681413a3 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.39 2008/04/26 08:02:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.40 2008/04/26 19:08:14 dillon Exp $ */ #include "hammer.h" @@ -464,13 +464,23 @@ retry: record->rec.inode.base.base.create_tid = trans->tid; record->rec.inode.base.data_len = sizeof(ip->sync_ino_data); record->data = (void *)&ip->sync_ino_data; + record->flags |= HAMMER_RECF_INTERLOCK_BE; error = hammer_ip_sync_record(trans, record); if (error) { kprintf("error %d\n", error); Debugger("hammer_update_inode3"); } - hammer_delete_mem_record(record); + + /* + * The record isn't managed by the inode's record tree, + * destroy it whether we succeed or fail. + */ + record->flags &= ~HAMMER_RECF_INTERLOCK_BE; + record->flags |= HAMMER_RECF_DELETED_FE; + record->state = HAMMER_FST_IDLE; + KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL); hammer_rel_mem_record(record); + if (error == 0) { ip->sync_flags &= ~(HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY | @@ -785,7 +795,12 @@ hammer_flush_inode_copysync(hammer_inode_t ip) /* * Mark records for backend flush, accumulate a count of the number of - * records which could not be marked. + * records which could not be marked. Records marked for deletion + * by the frontend never make it to the media. It is possible for + * a record queued to the backend to wind up with FE set after the + * fact, as long as BE has not yet been set. The backend deals with + * this race by syncing the record as if FE had not been set, and + * then converting the record to a delete-on-disk record. */ static int hammer_mark_record_callback(hammer_record_t rec, void *data) @@ -875,21 +890,35 @@ hammer_sync_record_callback(hammer_record_t record, void *data) /* * Skip records that do not belong to the current flush. Records * belonging to the flush will have been referenced for us. - * - * Skip records that were deleted by the backend itself. Records - * deleted by the frontend after their state has changed to FLUSH - * are not considered to be deleted by the backend. - * - * XXX special delete-on-disk records can be deleted by the backend - * prior to the sync due to a truncation operation. This is kinda - * a hack to deal with it. */ if (record->state != HAMMER_FST_FLUSH) return(0); - if (record->flags & HAMMER_RECF_DELETED_BE) { - hammer_flush_record_done(record); + + /* + * Interlock the record using the BE flag. Once BE is set the + * frontend cannot change the state of FE. + * + * NOTE: If FE is set prior to us setting BE we still sync the + * record out, but the flush completion code converts it to + * a delete-on-disk record instead of destroying it. + */ + hammer_lock_ex(&record->lock); + if (record->flags & HAMMER_RECF_INTERLOCK_BE) { + hammer_unlock(&record->lock); return(0); } + record->flags |= HAMMER_RECF_INTERLOCK_BE; + + /* + * If DELETED_FE is set we may have already sent dependant pieces + * to the disk and we must flush the record as if it hadn't been + * deleted. This creates a bit of a mess because we have to + * have ip_sync_record convert the record to DELETE_ONDISK before + * it inserts the B-Tree record. Otherwise the media sync might + * be visible to the frontend. + */ + if (record->flags & HAMMER_RECF_DELETED_FE) + record->flags |= HAMMER_RECF_CONVERT_DELETE_ONDISK; /* * Assign the create_tid for new records. Deletions already @@ -907,7 +936,7 @@ hammer_sync_record_callback(hammer_record_t record, void *data) Debugger("sync failed rec"); } } - hammer_flush_record_done(record); + hammer_flush_record_done(record, error); return(error); } @@ -997,7 +1026,7 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete) hammer_record_t rec; RB_FOREACH(rec, hammer_rec_rb_tree, &ip->rec_tree) { - KKASSERT(rec->flags & HAMMER_RECF_DELETED_BE); + KKASSERT(rec->state == HAMMER_FST_FLUSH); } } @@ -1110,12 +1139,13 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete) ip->sync_flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY| HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES); while (RB_ROOT(&ip->rec_tree)) { - hammer_record_t rec = RB_ROOT(&ip->rec_tree); - hammer_ref(&rec->lock); - KKASSERT(rec->lock.refs == 1); - hammer_delete_mem_record(rec); - rec->flags |= HAMMER_RECF_DELETED_BE; - hammer_rel_mem_record(rec); + hammer_record_t record = RB_ROOT(&ip->rec_tree); + hammer_ref(&record->lock); + KKASSERT(record->lock.refs == 1); + record->flags |= HAMMER_RECF_DELETED_FE; + record->flags |= HAMMER_RECF_DELETED_BE; + hammer_cleardep_mem_record(record); + hammer_rel_mem_record(record); } break; case HAMMER_INODE_ONDISK: diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 14fcc5b59..b92146432 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.44 2008/04/26 08:02:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.45 2008/04/26 19:08:14 dillon Exp $ */ #include "hammer.h" @@ -70,7 +70,7 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) return(1); /* - * Never match against a deleted item. + * Never match against an item deleted by the front-end. */ if (rec1->flags & HAMMER_RECF_DELETED_FE) return(1); @@ -184,16 +184,46 @@ hammer_wait_mem_record(hammer_record_t record) } /* - * Called from the backend, hammer_inode.c, when a record has been - * flushed to disk. + * Called from the backend, hammer_inode.c, after a record has been + * flushed to disk. The record has been exclusively locked by the + * caller and interlocked with BE. * - * The backend has likely marked this record for deletion as well. + * We clean up the state, unlock, and release the record (the record + * was referenced by the fact that it was in the HAMMER_FST_FLUSH state). */ void -hammer_flush_record_done(hammer_record_t record) +hammer_flush_record_done(hammer_record_t record, int error) { KKASSERT(record->state == HAMMER_FST_FLUSH); + KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); + + if (error) { + /* + * An error occured, the backend was unable to sync the + * record to its media. Leave the record intact. + */ + } else if (record->flags & HAMMER_RECF_CONVERT_DELETE_ONDISK) { + /* + * + */ + if (record->flags & HAMMER_RECF_DELETED_BE) { + record->flags |= HAMMER_RECF_DELETED_FE; + hammer_cleardep_mem_record(record); + } else { + KKASSERT(record->flags & HAMMER_RECF_DELETE_ONDISK); + } + } else { + /* + * Normal completion, record has been disposed of (by + * having been synchronized to the media). + */ + record->flags |= HAMMER_RECF_DELETED_FE; + hammer_cleardep_mem_record(record); + } record->state = HAMMER_FST_IDLE; + record->flags &= ~HAMMER_RECF_INTERLOCK_BE; + record->flags &= ~HAMMER_RECF_CONVERT_DELETE_ONDISK; + hammer_unlock(&record->lock); if (record->flags & HAMMER_RECF_WANTED) { record->flags &= ~HAMMER_RECF_WANTED; wakeup(record); @@ -202,19 +232,18 @@ hammer_flush_record_done(hammer_record_t record) } /* - * Destroy a memory record. The record is marked deleted and will - * be freed when the last reference goes away. Any dependancies are - * also destroyed. + * Clear dependancies associated with a memory record. */ void -hammer_delete_mem_record(hammer_record_t record) +hammer_cleardep_mem_record(struct hammer_record *record) { hammer_depend_t depend; - record->flags |= HAMMER_RECF_DELETED_FE; while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) { - TAILQ_REMOVE(&record->depend_list, depend, rec_entry); - TAILQ_REMOVE(&depend->ip->depend_list, depend, ip_entry); + TAILQ_REMOVE(&record->depend_list, depend, + rec_entry); + TAILQ_REMOVE(&depend->ip->depend_list, depend, + ip_entry); --depend->ip->depend_count; /* NOTE: inode is not flushed */ hammer_rel_inode(depend->ip, 0); @@ -236,6 +265,9 @@ hammer_rel_mem_record(struct hammer_record *record) if (record->flags & HAMMER_RECF_DELETED_FE) { if (record->lock.refs == 0) { + KKASSERT(record->state == HAMMER_FST_IDLE); + KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL); + if (record->flags & HAMMER_RECF_ONRBTREE) { RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, @@ -253,14 +285,6 @@ hammer_rel_mem_record(struct hammer_record *record) return; } } - - /* - * If someone wanted the record wake them up. - */ - if (record->flags & HAMMER_RECF_WANTED) { - record->flags &= ~HAMMER_RECF_WANTED; - wakeup(record); - } } /* @@ -272,7 +296,7 @@ int hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t rec) { if (cursor->flags & HAMMER_CURSOR_BACKEND) { - if (rec->flags & HAMMER_RECF_DELETED_BE) + if (rec->flags & HAMMER_RECF_INTERLOCK_BE) return(0); } else { if (rec->flags & HAMMER_RECF_DELETED_FE) @@ -547,13 +571,14 @@ hammer_ip_del_directory(struct hammer_transaction *trans, * only be called from the backend. */ record = cursor->iprec; - if (record->state == HAMMER_FST_FLUSH) { + if (record->flags & HAMMER_RECF_INTERLOCK_BE) { KKASSERT(cursor->deadlk_rec == NULL); hammer_ref(&record->lock); cursor->deadlk_rec = record; error = EDEADLK; } else { - hammer_delete_mem_record(cursor->iprec); + record->flags |= HAMMER_RECF_DELETED_FE; + hammer_cleardep_mem_record(record); error = 0; } } else { @@ -756,10 +781,9 @@ done: * * Any inode dependancies will queue the inode to the backend. * - * NOTE: The frontend can mark the record deleted while it is queued to - * the backend. The deletion applies to a frontend operation and the - * record must be treated as NOT having been deleted on the backend, so - * we ignore the flag. + * This routine can only be called by the backend and the record + * must have been interlocked with BE. It will remain interlocked on + * return. The caller is responsible for the record's disposition. */ int hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record) @@ -773,7 +797,16 @@ hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record) int error; KKASSERT(record->state == HAMMER_FST_FLUSH); + KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); + /* + * XXX A record with a dependancy is typically a directory record. + * The related inode must also be synchronized. This code is not + * currently synchronizing the inode atomically. XXX + * + * XXX Additional dependancies from the frontend might be added while + * the backend is syncing the record? + */ while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) { TAILQ_REMOVE(&record->depend_list, depend, rec_entry); TAILQ_REMOVE(&depend->ip->depend_list, depend, ip_entry); @@ -794,6 +827,7 @@ retry: if (error) return(error); cursor.key_beg = record->rec.base.base; + cursor.flags |= HAMMER_CURSOR_BACKEND; /* * If we are deleting an exact match must be found on-disk. @@ -802,8 +836,6 @@ retry: error = hammer_btree_lookup(&cursor); if (error == 0) error = hammer_ip_delete_record(&cursor, trans->tid); - if (error == 0) - hammer_delete_mem_record(record); goto done; } @@ -906,17 +938,25 @@ retry: error = hammer_btree_insert(&cursor, &elm); /* - * Clean up on success, or fall through on error. + * This occurs when the frontend creates a record and queues it to + * the backend, then tries to delete the record. The backend must + * still sync the record to the media as if it were not deleted, + * but must interlock with the frontend to ensure that the + * synchronized record is not visible to the frontend, which means + * converted the 'deleted' record to a delete-on-disk record. */ - if (error == 0) { - hammer_delete_mem_record(record); - goto done; + if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE_ONDISK)) { + KKASSERT((record->flags & HAMMER_RECF_DELETE_ONDISK) == 0); + record->flags |= HAMMER_RECF_DELETE_ONDISK; + record->flags &= ~HAMMER_RECF_DELETED_FE; } /* - * Try to unwind the allocation + * If the error occured unwind the operation. */ - hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE); + if (error) + hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE); + done: hammer_done_cursor(&cursor); if (error == EDEADLK) @@ -984,7 +1024,8 @@ hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record) */ while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) { if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){ - hammer_delete_mem_record(record); + record->flags |= HAMMER_RECF_DELETED_FE; + KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL); hammer_rel_mem_record(record); return (EEXIST); } @@ -1516,12 +1557,21 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) int error; int dodelete; + KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); + /* - * In-memory (unsynchronized) records can simply be freed. + * In-memory (unsynchronized) records can simply be freed. This + * only occurs in range iterations since all other records are + * individually synchronized. Thus there should be no confusion with + * the interlock. + * + * */ if (cursor->record == &cursor->iprec->rec) { - hammer_delete_mem_record(cursor->iprec); + KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0); + cursor->iprec->flags |= HAMMER_RECF_DELETED_FE; cursor->iprec->flags |= HAMMER_RECF_DELETED_BE; + hammer_cleardep_mem_record(cursor->iprec); return(0); } diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c index 3c92a376b..81126d86f 100644 --- a/sys/vfs/hammer/hammer_recover.c +++ b/sys/vfs/hammer/hammer_recover.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.11 2008/04/26 08:02:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.12 2008/04/26 19:08:14 dillon Exp $ */ #include "hammer.h" @@ -40,7 +40,9 @@ static int hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off); static void hammer_recover_copy_undo(hammer_off_t undo_offset, char *src, char *dst, int bytes); +#if 0 static void hammer_recover_debug_dump(int w, char *buf, int bytes); +#endif static int hammer_recover_undo(hammer_mount_t hmp, hammer_fifo_undo_t undo, int bytes); @@ -317,6 +319,8 @@ hammer_recover_copy_undo(hammer_off_t undo_offset, bcopy(src, dst, bytes); } +#if 0 + static void hammer_recover_debug_dump(int w, char *buf, int bytes) { @@ -330,3 +334,4 @@ hammer_recover_debug_dump(int w, char *buf, int bytes) kprintf("\n"); } +#endif -- 2.11.4.GIT