From c9ce54d64fd77d8901f15263a4acbfda66fc1d0b Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 3 Sep 2009 08:18:43 -0700 Subject: [PATCH] HAMMER - Fix lost inode issue (primarily with nohistory mounts) * When a HAMMER cursor is unlocked it becomes tracked and unrelated B-Tree operations will cause the tracked cursor's nodes and indices to be updated. The cursor structure also has a leaf element pointer which was not being properly updated. This could lead to panics and lost inodes. Properly adjust the leaf element pointer in tracked cursors. * The bug primarily occurs with nohistory mounts or nohistory sub-trees due to the larger number of physical deletions made to the B-Tree, but could also occur (rarely) with normal mounts. * Add additional assertions to catch any further occurrences (though I think all the cases have been covered now). * Add a new sysctl vfs.hammer.error_panic which can be set to e.g. 9 to cause critical errors to panic immediately instead of returning through the call stack, making debugging possible. Reported-by: Numerous people --- sys/vfs/hammer/hammer_btree.c | 7 ++++++ sys/vfs/hammer/hammer_cursor.c | 50 ++++++++++++++++++++++++++++++++++++++- sys/vfs/hammer/hammer_inode.c | 2 ++ sys/vfs/hammer/hammer_mirror.c | 6 +++++ sys/vfs/hammer/hammer_object.c | 13 +++++++++- sys/vfs/hammer/hammer_pfs.c | 2 ++ sys/vfs/hammer/hammer_prune.c | 3 +++ sys/vfs/hammer/hammer_rebalance.c | 3 +++ sys/vfs/hammer/hammer_reblock.c | 10 +++++++- sys/vfs/hammer/hammer_vfsops.c | 14 ++++++++--- sys/vfs/hammer/hammer_vnops.c | 3 +++ 11 files changed, 107 insertions(+), 6 deletions(-) diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 2c05e12798..575a4fbe2a 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -1703,6 +1703,7 @@ btree_split_leaf(hammer_cursor_t cursor) */ leaf = cursor->node; ondisk = leaf->ondisk; + KKASSERT(ondisk->count > 2); split = (ondisk->count + 1) / 2; if (cursor->index <= split) --split; @@ -2296,6 +2297,11 @@ btree_remove(hammer_cursor_t cursor) * The passed inode has no relationship to the cursor position other * then being in the same pseudofs as the insertion or deletion we * are propagating the mirror_tid for. + * + * WARNING! Because we push and pop the passed cursor, it may be + * modified by other B-Tree operations while it is unlocked + * and things like the node & leaf pointers, and indexes might + * change. */ void hammer_btree_do_propagation(hammer_cursor_t cursor, @@ -2333,6 +2339,7 @@ hammer_btree_do_propagation(hammer_cursor_t cursor, error = hammer_btree_mirror_propagate(ncursor, mirror_tid); KKASSERT(error == 0); hammer_pop_cursor(cursor, ncursor); + /* WARNING: cursor's leaf pointer may change after pop */ } diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c index 027a323c8a..a4d08bed9e 100644 --- a/sys/vfs/hammer/hammer_cursor.c +++ b/sys/vfs/hammer/hammer_cursor.c @@ -488,6 +488,15 @@ hammer_cursor_down(hammer_cursor_t cursor) * used for the mirror propagation and physical node removal cases but * ultimately the intention is to use them for all deadlock recovery * operations. + * + * WARNING! The contents of the cursor may be modified while unlocked. + * passive modifications including adjusting the node, parent, + * indexes, and leaf pointer. + * + * An outright removal of the element the cursor was pointing at + * will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set, + * which chains to causing the HAMMER_CURSOR_RETEST to be set + * when the cursor is locked again. */ void hammer_unlock_cursor(hammer_cursor_t cursor) @@ -671,11 +680,18 @@ void hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode) { hammer_cursor_t cursor; + hammer_node_ondisk_t ondisk; + hammer_node_ondisk_t nndisk; + + ondisk = onode->ondisk; + nndisk = nnode->ondisk; while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) { TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); KKASSERT(cursor->node == onode); + if (cursor->leaf == &ondisk->elms[cursor->index].leaf) + cursor->leaf = &nndisk->elms[cursor->index].leaf; cursor->node = nnode; hammer_ref_node(nnode); hammer_rel_node(onode); @@ -690,13 +706,18 @@ void hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index) { hammer_cursor_t cursor; + hammer_node_ondisk_t ondisk; KKASSERT(parent != NULL); + ondisk = node->ondisk; + while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) { KKASSERT(cursor->node == node); KKASSERT(cursor->index == 0); TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry); TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry); + if (cursor->leaf == &ondisk->elms[cursor->index].leaf) + cursor->leaf = NULL; cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT; cursor->node = parent; cursor->index = index; @@ -712,6 +733,11 @@ void hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index) { hammer_cursor_t cursor; + hammer_node_ondisk_t ondisk; + hammer_node_ondisk_t nndisk; + + ondisk = onode->ondisk; + nndisk = nnode->ondisk; again: TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) { @@ -720,6 +746,8 @@ again: continue; TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); + if (cursor->leaf == &ondisk->elms[cursor->index].leaf) + cursor->leaf = &nndisk->elms[cursor->index - index].leaf; cursor->node = nnode; cursor->index -= index; hammer_ref_node(nnode); @@ -741,6 +769,11 @@ hammer_cursor_moved_element(hammer_node_t onode, hammer_node_t nnode, int oindex, int nindex) { hammer_cursor_t cursor; + hammer_node_ondisk_t ondisk; + hammer_node_ondisk_t nndisk; + + ondisk = onode->ondisk; + nndisk = nnode->ondisk; again: TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) { @@ -749,6 +782,8 @@ again: continue; TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); + if (cursor->leaf == &ondisk->elms[oindex].leaf) + cursor->leaf = &nndisk->elms[nindex].leaf; cursor->node = nnode; cursor->index = nindex; hammer_ref_node(nnode); @@ -793,12 +828,19 @@ void hammer_cursor_deleted_element(hammer_node_t node, int index) { hammer_cursor_t cursor; + hammer_node_ondisk_t ondisk; + + ondisk = node->ondisk; TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { KKASSERT(cursor->node == node); if (cursor->index == index) { cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT; + if (cursor->leaf == &ondisk->elms[cursor->index].leaf) + cursor->leaf = NULL; } else if (cursor->index > index) { + if (cursor->leaf == &ondisk->elms[cursor->index].leaf) + cursor->leaf = &ondisk->elms[cursor->index - 1].leaf; --cursor->index; } } @@ -813,11 +855,17 @@ void hammer_cursor_inserted_element(hammer_node_t node, int index) { hammer_cursor_t cursor; + hammer_node_ondisk_t ondisk; + + ondisk = node->ondisk; TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { KKASSERT(cursor->node == node); - if (cursor->index >= index) + if (cursor->index >= index) { + if (cursor->leaf == &ondisk->elms[cursor->index].leaf) + cursor->leaf = &ondisk->elms[cursor->index + 1].leaf; ++cursor->index; + } } } diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index f1ebb7f199..c5794d2214 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -2583,6 +2583,8 @@ done: * * We must release our cursor lock to avoid a 3-way deadlock * due to the exclusive sync lock the finalizer must get. + * + * WARNING: See warnings in hammer_unlock_cursor() function. */ if (hammer_flusher_meta_limit(hmp)) { hammer_unlock_cursor(cursor); diff --git a/sys/vfs/hammer/hammer_mirror.c b/sys/vfs/hammer/hammer_mirror.c index 78f8027d37..06422db7d0 100644 --- a/sys/vfs/hammer/hammer_mirror.c +++ b/sys/vfs/hammer/hammer_mirror.c @@ -382,6 +382,8 @@ hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip, /* * Don't blow out the buffer cache. Leave room for frontend * cache as well. + * + * WARNING: See warnings in hammer_unlock_cursor() function. */ while (hammer_flusher_meta_halflimit(trans->hmp) || hammer_flusher_undo_exhausted(trans, 2)) { @@ -894,6 +896,10 @@ hammer_mirror_write(hammer_cursor_t cursor, hammer_modify_volume_done(trans->rootvol); } + /* + * WARNING! cursor's leaf pointer may have changed after + * do_propagation returns. + */ if (error == 0 && doprop) hammer_btree_do_propagation(cursor, NULL, &mrec->leaf); diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index b54b6bb04b..530713bd73 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -97,6 +97,8 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) /* * Basic record comparison code similar to hammer_btree_cmp(). + * + * obj_id is not compared and may not yet be assigned in the record. */ static int hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec) @@ -785,13 +787,15 @@ hammer_ip_del_directory(struct hammer_transaction *trans, /* * If the record is on-disk we have to queue the deletion by * the record's key. This also causes lookups to skip the - * record. + * record (lookups for the purposes of finding an unused + * directory key do not skip the record). */ KKASSERT(dip->flags & (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK)); record = hammer_alloc_mem_record(dip, 0); record->type = HAMMER_MEM_RECORD_DEL; record->leaf.base = cursor->leaf->base; + KKASSERT(dip->obj_id == record->leaf.base.obj_id); /* * ip may be NULL, indicating the deletion of a directory @@ -1267,6 +1271,9 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) * sync a valid directory entry to disk due to dependancies, * we must convert the record to a covering delete so the * frontend does not have visibility on the synced entry. + * + * WARNING: cursor's leaf pointer may have changed after do_propagation + * returns! */ if (error == 0) { if (doprop) { @@ -1282,6 +1289,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); record->flags &= ~HAMMER_RECF_DELETED_FE; record->type = HAMMER_MEM_RECORD_DEL; + KKASSERT(record->ip->obj_id == record->leaf.base.obj_id); KKASSERT(record->flush_state == HAMMER_FST_FLUSH); record->flags &= ~HAMMER_RECF_CONVERT_DELETE; KKASSERT((record->flags & (HAMMER_RECF_COMMITTED | @@ -2368,6 +2376,9 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags, * cursor->ip is NULL when called from the pruning, mirroring, * and pfs code. If non-NULL propagation will be conditionalized * on whether the PFS is in no-history mode or not. + * + * WARNING: cursor's leaf pointer may have changed after do_propagation + * returns! */ if (doprop) { if (cursor->ip) diff --git a/sys/vfs/hammer/hammer_pfs.c b/sys/vfs/hammer/hammer_pfs.c index 49e8465661..c8ef139366 100644 --- a/sys/vfs/hammer/hammer_pfs.c +++ b/sys/vfs/hammer/hammer_pfs.c @@ -402,6 +402,8 @@ retry: * We only care about leafs. Internal nodes can be returned * in mirror-filtered mode (they are used to generate SKIP * mrecords), but we don't need them for this code. + * + * WARNING: See warnings in hammer_unlock_cursor() function. */ cursor.flags |= HAMMER_CURSOR_ATEDISK; if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF) { diff --git a/sys/vfs/hammer/hammer_prune.c b/sys/vfs/hammer/hammer_prune.c index d266a85649..c0dca4090c 100644 --- a/sys/vfs/hammer/hammer_prune.c +++ b/sys/vfs/hammer/hammer_prune.c @@ -216,6 +216,9 @@ retry: } ++prune->stat_scanrecords; + /* + * WARNING: See warnings in hammer_unlock_cursor() function. + */ while (hammer_flusher_meta_halflimit(trans->hmp) || hammer_flusher_undo_exhausted(trans, 2)) { hammer_unlock_cursor(&cursor); diff --git a/sys/vfs/hammer/hammer_rebalance.c b/sys/vfs/hammer/hammer_rebalance.c index 43fe789a49..012254e536 100644 --- a/sys/vfs/hammer/hammer_rebalance.c +++ b/sys/vfs/hammer/hammer_rebalance.c @@ -146,6 +146,9 @@ retry: /* * Update returned scan position and do a flush if * necessary. + * + * WARNING: See warnings in hammer_unlock_cursor() + * function. */ elm = &cursor.node->ondisk->elms[cursor.index].leaf; rebal->key_cur = elm->base; diff --git a/sys/vfs/hammer/hammer_reblock.c b/sys/vfs/hammer/hammer_reblock.c index 3e4eed4861..23cdba87b5 100644 --- a/sys/vfs/hammer/hammer_reblock.c +++ b/sys/vfs/hammer/hammer_reblock.c @@ -143,6 +143,8 @@ retry: /* * If there is insufficient free space it may be due to * reserved bigblocks, which flushing might fix. + * + * WARNING: See warnings in hammer_unlock_cursor() function. */ if (hammer_checkspace(trans->hmp, slop)) { if (++checkspace_count == 10) { @@ -161,6 +163,8 @@ retry: * crossing a synchronization boundary. * * NOTE: cursor.node may have changed on return. + * + * WARNING: See warnings in hammer_unlock_cursor() function. */ hammer_sync_lock_sh(trans); error = hammer_reblock_helper(reblock, &cursor, elm); @@ -186,7 +190,8 @@ retry: * them. But if we allocate too many we can still deadlock * the buffer cache. * - * (The cursor's node and element may change!) + * WARNING: See warnings in hammer_unlock_cursor() function. + * (The cursor's node and element may change!) */ if (bd_heatup()) { hammer_unlock_cursor(&cursor); @@ -287,6 +292,9 @@ hammer_reblock_helper(struct hammer_ioc_reblock *reblock, * This is nasty, the uncache code may have to get * vnode locks and because of that we can't hold * the cursor locked. + * + * WARNING: See warnings in hammer_unlock_cursor() + * function. */ leaf = elm->leaf; hammer_unlock_cursor(cursor); diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index d8e609fd5d..be8d6429cd 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -57,6 +57,7 @@ int hammer_debug_btree; int hammer_debug_tid; int hammer_debug_recover; /* -1 will disable, +1 will force */ int hammer_debug_recover_faults; +int hammer_error_panic; /* panic on error levels */ int hammer_cluster_enable = 1; /* enable read clustering by default */ int hammer_count_fsyncs; int hammer_count_inodes; @@ -127,6 +128,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_recover, CTLFLAG_RW, &hammer_debug_recover, 0, ""); SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_recover_faults, CTLFLAG_RW, &hammer_debug_recover_faults, 0, ""); +SYSCTL_INT(_vfs_hammer, OID_AUTO, error_panic, CTLFLAG_RW, + &hammer_error_panic, 0, ""); SYSCTL_INT(_vfs_hammer, OID_AUTO, cluster_enable, CTLFLAG_RW, &hammer_cluster_enable, 0, ""); @@ -775,10 +778,13 @@ hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip, int error, const char *msg) { hmp->flags |= HAMMER_MOUNT_CRITICAL_ERROR; + krateprintf(&hmp->krate, - "HAMMER(%s): Critical error inode=%lld %s\n", - hmp->mp->mnt_stat.f_mntfromname, - (long long)(ip ? ip->obj_id : -1), msg); + "HAMMER(%s): Critical error inode=%jd error=%d %s\n", + hmp->mp->mnt_stat.f_mntfromname, + (intmax_t)(ip ? ip->obj_id : -1), + error, msg); + if (hmp->ronly == 0) { hmp->ronly = 2; /* special errored read-only mode */ hmp->mp->mnt_flag |= MNT_RDONLY; @@ -786,6 +792,8 @@ hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip, hmp->mp->mnt_stat.f_mntfromname); } hmp->error = error; + if (hammer_error_panic > 2) + Debugger("Entering debugger"); } diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index a125ba0305..3e4625b6e1 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -3101,6 +3101,9 @@ retry: * * If any changes whatsoever have been made to the cursor * set EDEADLK and retry. + * + * WARNING: See warnings in hammer_unlock_cursor() + * function. */ if (error == 0 && ip && ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { -- 2.11.4.GIT