From f90dde4c06ea86fca65841766b334ff0b4dc03b8 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sat, 26 Apr 2008 02:54:00 +0000 Subject: [PATCH] HAMMER 38D/Many: Undo/Synchronization and crash recovery * The flusher now waits for I/O to complete at the appropriate points. * Implement instant crash recovery. The UNDO FIFO is scanned backwards and reapplied to the filesystem on mount. There is still more work to do here, inode<->inode associations (e.g. directory entry vs file) are not yet bound together. * Clean up I/O sequencing a lot and get rid of a ton of unnecessary flusher wakeups. --- sys/vfs/hammer/hammer.h | 11 +- sys/vfs/hammer/hammer_flusher.c | 54 +- sys/vfs/hammer/hammer_inode.c | 131 +++-- sys/vfs/hammer/hammer_io.c | 19 +- sys/vfs/hammer/hammer_object.c | 5 +- sys/vfs/hammer/hammer_ondisk.c | 25 +- sys/vfs/hammer/hammer_recover.c | 1094 ++++++++++++--------------------------- sys/vfs/hammer/hammer_undo.c | 25 +- sys/vfs/hammer/hammer_vfsops.c | 19 +- sys/vfs/hammer/hammer_vnops.c | 7 +- 10 files changed, 526 insertions(+), 864 deletions(-) rewrite sys/vfs/hammer/hammer_recover.c (90%) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index af82916f3e..83ce4a3aff 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.49 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.50 2008/04/26 02:54:00 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -231,6 +231,9 @@ typedef struct hammer_inode *hammer_inode_t; #define HAMMER_MAX_INODE_CURSORS 4 +#define HAMMER_FLUSH_SIGNAL 0x0001 +#define HAMMER_FLUSH_FORCE 0x0002 + /* * Structure used to represent an unsynchronized record in-memory. This * structure is orgranized in a per-inode RB-tree. If the inode is not @@ -468,6 +471,7 @@ struct hammer_mount { int flusher_seq; int flusher_act; int flusher_exiting; + int reclaim_count; thread_t flusher_td; u_int check_interrupt; uuid_t fsid; @@ -478,6 +482,7 @@ struct hammer_mount { struct hammer_io_list meta_list; /* dirty meta bufs */ struct hammer_io_list lose_list; /* loose buffers */ int locked_dirty_count; /* meta/volu count */ + int io_running_count; hammer_tid_t asof; hammer_off_t next_tid; u_int32_t namekey_iterator; @@ -685,7 +690,7 @@ void hammer_done_transaction(struct hammer_transaction *trans); void hammer_modify_inode(struct hammer_transaction *trans, hammer_inode_t ip, int flags); -void hammer_flush_inode(hammer_inode_t ip, int forceit); +void hammer_flush_inode(hammer_inode_t ip, int flags); void hammer_flush_inode_done(hammer_inode_t ip); void hammer_wait_inode(hammer_inode_t ip); @@ -745,6 +750,8 @@ void hammer_flusher_destroy(hammer_mount_t hmp); void hammer_flusher_sync(hammer_mount_t hmp); void hammer_flusher_async(hammer_mount_t hmp); +int hammer_recover(hammer_mount_t hmp, hammer_volume_t rootvol); + #endif static __inline void diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index 332fb364d4..451695c736 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.3 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.4 2008/04/26 02:54:00 dillon Exp $ */ /* * HAMMER dependancy flusher thread @@ -53,17 +53,21 @@ hammer_flusher_sync(hammer_mount_t hmp) { int seq; - seq = ++hmp->flusher_seq; - wakeup(&hmp->flusher_seq); - while ((int)(seq - hmp->flusher_act) > 0) - tsleep(&hmp->flusher_act, 0, "hmrfls", 0); + if (hmp->flusher_td) { + seq = ++hmp->flusher_seq; + wakeup(&hmp->flusher_seq); + while ((int)(seq - hmp->flusher_act) > 0) + tsleep(&hmp->flusher_act, 0, "hmrfls", 0); + } } void hammer_flusher_async(hammer_mount_t hmp) { - ++hmp->flusher_seq; - wakeup(&hmp->flusher_seq); + if (hmp->flusher_td) { + ++hmp->flusher_seq; + wakeup(&hmp->flusher_seq); + } } void @@ -76,11 +80,13 @@ hammer_flusher_create(hammer_mount_t hmp) void hammer_flusher_destroy(hammer_mount_t hmp) { - hmp->flusher_exiting = 1; - ++hmp->flusher_seq; - wakeup(&hmp->flusher_seq); - while (hmp->flusher_td) - tsleep(&hmp->flusher_exiting, 0, "hmrwex", 0); + if (hmp->flusher_td) { + hmp->flusher_exiting = 1; + ++hmp->flusher_seq; + wakeup(&hmp->flusher_seq); + while (hmp->flusher_td) + tsleep(&hmp->flusher_exiting, 0, "hmrwex", 0); + } } static void @@ -122,7 +128,6 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp) TAILQ_REMOVE(io->mod_list, io, mod_entry); io->mod_list = NULL; hammer_ref(&io->lock); - kprintf("DELETE LOOSE %p\n", io); buffer = (void *)io; hammer_rel_buffer(buffer, 0); } @@ -144,6 +149,9 @@ hammer_flusher_flush(hammer_mount_t hmp) rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; start_offset = rootmap->next_offset; + if (hammer_debug_general & 0x00010000) + kprintf("x"); + while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) { TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry); @@ -177,8 +185,6 @@ hammer_flusher_finalize(hammer_mount_t hmp, hammer_volume_t root_volume, hammer_blockmap_t rootmap; hammer_io_t io; - kprintf("FINALIZE %d\n", hmp->locked_dirty_count); - /* * Flush undo bufs */ @@ -202,26 +208,34 @@ hammer_flusher_finalize(hammer_mount_t hmp, hammer_volume_t root_volume, } /* - * XXX wait for I/O's to complete + * Wait for I/O to complete */ + crit_enter(); + while (hmp->io_running_count) { + kprintf("WAIT1 %d\n", hmp->io_running_count); + tsleep(&hmp->io_running_count, 0, "hmrfl1", 0); + } + crit_exit(); /* * Update the volume header */ rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; if (rootmap->first_offset != start_offset) { - kprintf("FINALIZE: ACTIVE VOLUME STAGE 1\n"); hammer_modify_volume(NULL, root_volume, NULL, 0); rootmap->first_offset = start_offset; hammer_modify_volume_done(root_volume); hammer_io_flush(&root_volume->io); - } else { - kprintf("FINALIZE: ACTIVE VOLUME STAGE 2\n"); } /* - * XXX wait for I/O to complete + * Wait for I/O to complete */ + crit_enter(); + while (hmp->io_running_count) { + tsleep(&hmp->io_running_count, 0, "hmrfl2", 0); + } + crit_exit(); /* * Flush meta-data diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index 3f5b7b7df8..56f7be09a9 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.37 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.38 2008/04/26 02:54:00 dillon Exp $ */ #include "hammer.h" @@ -422,7 +422,7 @@ retry: if (error == 0) { error = hammer_ip_delete_record(&cursor, trans->tid); - if (error) { + if (error && error != EDEADLK) { kprintf("error %d\n", error); Debugger("hammer_update_inode2"); } @@ -471,6 +471,15 @@ retry: } } } + if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { + /* + * Clean out any left-over flags if the inode has been + * destroyed. + */ + ip->sync_flags &= ~(HAMMER_INODE_RDIRTY | + HAMMER_INODE_DDIRTY | + HAMMER_INODE_ITIMES); + } return(error); } @@ -537,28 +546,56 @@ retry: void hammer_rel_inode(struct hammer_inode *ip, int flush) { - if (ip->lock.refs == 1) { + /* + * Handle disposition when dropping the last ref. + */ + while (ip->lock.refs == 1) { if (curthread == ip->hmp->flusher_td) { /* - * We are the flusher, actually dispose of the inode. - * The unload routine inherits our (last) reference. + * We are the flusher, do any required flushes + * before unloading the inode. */ + int error = 0; + KKASSERT(ip->flush_state == HAMMER_FST_IDLE); - KKASSERT(ip->cursor_ip_refs == 0); + while (error == 0 && + (ip->flags & HAMMER_INODE_MODMASK)) { + hammer_ref(&ip->lock); + hammer_flush_inode_copysync(ip); + error = hammer_sync_inode(ip, 1); + hammer_flush_inode_done(ip); + } + if (error) + kprintf("hammer_sync_inode failed error %d\n", + error); + if (ip->lock.refs > 1) + continue; hammer_unload_inode(ip, (void *)MNT_NOWAIT); + return; + } + if ((ip->flags & HAMMER_INODE_MODMASK) == 0) { + hammer_unload_inode(ip, (void *)MNT_NOWAIT); + return; + } + + /* + * Hand the inode over to the flusher, which will + * add another ref to it. + */ + if (++ip->hmp->reclaim_count > 256) { + ip->hmp->reclaim_count = 0; + hammer_flush_inode(ip, HAMMER_FLUSH_FORCE | + HAMMER_FLUSH_SIGNAL); } else { - /* - * flush_list inherits our last reference. - * - * Only the flusher can actually destroy the inode, - * there had better still be a ref on it if we aren't - * it. - */ - hammer_flush_inode(ip, 1); - KKASSERT(ip->lock.refs > 1); - hammer_unref(&ip->lock); + hammer_flush_inode(ip, HAMMER_FLUSH_FORCE); } - } else if (flush && ip->flush_state == HAMMER_FST_IDLE && + /* retry */ + } + + /* + * Inode still has multiple refs + */ + if (flush && ip->flush_state == HAMMER_FST_IDLE && curthread != ip->hmp->flusher_td) { /* * Flush requested, make the inode visible to the flusher. 
@@ -589,32 +626,25 @@ hammer_rel_inode(struct hammer_inode *ip, int flush) static int hammer_unload_inode(struct hammer_inode *ip, void *data) { - int error; KASSERT(ip->lock.refs == 1, ("hammer_unload_inode: %d refs\n", ip->lock.refs)); KKASSERT(ip->vp == NULL); + KKASSERT(ip->flush_state == HAMMER_FST_IDLE); + KKASSERT(ip->cursor_ip_refs == 0); + KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0); - do { - hammer_flush_inode_copysync(ip); - error = hammer_sync_inode(ip, 1); - } while (error == 0 && (ip->flags & HAMMER_INODE_MODMASK)); + KKASSERT(RB_EMPTY(&ip->rec_tree)); + KKASSERT(TAILQ_EMPTY(&ip->bio_list)); + KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list)); + + RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip); + + hammer_uncache_node(&ip->cache[0]); + hammer_uncache_node(&ip->cache[1]); + --hammer_count_inodes; + kfree(ip, M_HAMMER); - if (error) - kprintf("hammer_sync_inode failed error %d\n", error); - if (ip->lock.refs == 1) { - KKASSERT(RB_EMPTY(&ip->rec_tree)); - KKASSERT(TAILQ_EMPTY(&ip->bio_list)); - KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list)); - RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip); - - hammer_uncache_node(&ip->cache[0]); - hammer_uncache_node(&ip->cache[1]); - --hammer_count_inodes; - kfree(ip, M_HAMMER); - } else { - hammer_flush_inode_done(ip); - } return(0); } @@ -650,7 +680,7 @@ hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags) * troublesome because some dirty buffers may not have been queued yet. */ void -hammer_flush_inode(hammer_inode_t ip, int forceit) +hammer_flush_inode(hammer_inode_t ip, int flags) { if (ip->flush_state != HAMMER_FST_IDLE && (ip->flags & HAMMER_INODE_MODMASK)) { @@ -658,17 +688,20 @@ hammer_flush_inode(hammer_inode_t ip, int forceit) return; } hammer_lock_ex(&ip->lock); - if (ip->flush_state == HAMMER_FST_IDLE && - ((ip->flags & HAMMER_INODE_MODMASK) || forceit)) { - hammer_ref(&ip->lock); + if (ip->flush_state == HAMMER_FST_IDLE) { + if ((ip->flags & HAMMER_INODE_MODMASK) || + (flags & HAMMER_FLUSH_FORCE)) { + hammer_ref(&ip->lock); - hammer_flush_inode_copysync(ip); - /* - * Move the inode to the flush list and add a ref to it - * representing it on the list. - */ - TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry); - hammer_flusher_async(ip->hmp); + hammer_flush_inode_copysync(ip); + /* + * Move the inode to the flush list and add a ref to + * it representing it on the list. + */ + TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry); + if (flags & HAMMER_FLUSH_SIGNAL) + hammer_flusher_async(ip->hmp); + } } hammer_unlock(&ip->lock); } @@ -780,6 +813,7 @@ hammer_flush_inode_done(hammer_inode_t ip) while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) { TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act); TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act); + ip->flags |= HAMMER_INODE_XDIRTY; ip->flags |= HAMMER_INODE_REFLUSH; kprintf("rebio %p ip %p @%016llx,%d\n", bio, ip, bio->bio_offset, bio->bio_buf->b_bufsize); } @@ -790,7 +824,6 @@ hammer_flush_inode_done(hammer_inode_t ip) */ if (ip->flags & HAMMER_INODE_REFLUSH) { ip->flags &= ~HAMMER_INODE_REFLUSH; - kprintf("reflush %p\n", ip); hammer_flush_inode(ip, 0); } else { if (ip->flags & HAMMER_INODE_FLUSHW) { diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 2384a56384..8f5c382853 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.26 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.27 2008/04/26 02:54:00 dillon Exp $ */ /* * IO Primitives and buffer cache management @@ -383,6 +383,7 @@ hammer_io_flush(struct hammer_io *io) */ io->released = 1; io->running = 1; + ++io->hmp->io_running_count; bawrite(bp); } @@ -562,12 +563,18 @@ hammer_io_complete(struct buf *bp) KKASSERT(iou->io.released == 1); + if (iou->io.running) { + if (--iou->io.hmp->io_running_count == 0) + wakeup(&iou->io.hmp->io_running_count); + KKASSERT(iou->io.hmp->io_running_count >= 0); + iou->io.running = 0; + } + /* * If no lock references remain and we can acquire the IO lock and * someone at some point wanted us to flush (B_LOCKED test), then * try to dispose of the IO. */ - iou->io.running = 0; if (iou->io.waiting) { iou->io.waiting = 0; wakeup(iou); @@ -617,7 +624,6 @@ hammer_io_deallocate(struct buf *bp) hammer_io_disassociate(iou, 0); if (iou->io.bp == NULL && iou->io.type != HAMMER_STRUCTURE_VOLUME) { - kprintf("ADD LOOSE %p\n", &iou->io); KKASSERT(iou->io.mod_list == NULL); iou->io.mod_list = &iou->io.hmp->lose_list; TAILQ_INSERT_TAIL(iou->io.mod_list, &iou->io, mod_entry); @@ -685,6 +691,13 @@ hammer_io_checkwrite(struct buf *bp) io->mod_list = NULL; io->modified = 0; } + + /* + * The kernel is going to start the IO, set io->running. + */ + KKASSERT(io->running == 0); + io->running = 1; + ++io->hmp->io_running_count; return(0); } diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 681231ad1d..e50cd0844a 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.42 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.43 2008/04/26 02:54:00 dillon Exp $ */ #include "hammer.h" @@ -723,10 +723,8 @@ retry: */ if (record->flags & HAMMER_RECF_DELETE_ONDISK) { error = hammer_btree_lookup(&cursor); - kprintf("DELETE MEM ENTRY1 %d\n", error); if (error == 0) error = hammer_ip_delete_record(&cursor, trans->tid); - kprintf("DELETE MEM ENTRY2 %d\n", error); if (error == 0) record->flags |= HAMMER_RECF_DELETED_FE; goto done; @@ -1154,7 +1152,6 @@ next_memory: if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) { cursor->flags |= HAMMER_CURSOR_ATEDISK; cursor->flags |= HAMMER_CURSOR_ATEMEM; - kprintf("SKIP MEM ENTRY\n"); goto next_btree; } } diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index 39ca421fc5..b5cc2a8c81 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.38 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.39 2008/04/26 02:54:00 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. These routines are primarily @@ -496,13 +496,25 @@ hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset, zoneX_offset = buf_offset; zone = HAMMER_ZONE_DECODE(buf_offset); - if (zone == HAMMER_ZONE_LARGE_DATA_INDEX || - zone == HAMMER_ZONE_SMALL_DATA_INDEX) { + /* + * What is the buffer class? 
+ */ + switch(zone) { + case HAMMER_ZONE_LARGE_DATA_INDEX: + case HAMMER_ZONE_SMALL_DATA_INDEX: iotype = HAMMER_STRUCTURE_DATA_BUFFER; - } else { + break; + case HAMMER_ZONE_UNDO_INDEX: + iotype = HAMMER_STRUCTURE_UNDO_BUFFER; + break; + default: iotype = HAMMER_STRUCTURE_META_BUFFER; + break; } + /* + * Handle blockmap offset translations + */ if (zone >= HAMMER_ZONE_BTREE_INDEX) { buf_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp); KKASSERT(*errorp == 0); @@ -510,6 +522,10 @@ hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset, buf_offset = hammer_undo_lookup(hmp, buf_offset, errorp); KKASSERT(*errorp == 0); } + + /* + * Locate the buffer given its zone-2 offset. + */ buf_offset &= ~HAMMER_BUFMASK64; KKASSERT((buf_offset & HAMMER_ZONE_RAW_BUFFER) == HAMMER_ZONE_RAW_BUFFER); @@ -715,6 +731,7 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush) hammer_unref(&buffer->io.lock); crit_exit(); if (freeme) { + KKASSERT(buffer->io.mod_list == NULL); --hammer_count_buffers; kfree(buffer, M_HAMMER); } diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c dissimilarity index 90% index 1a8cbcde62..a410416e7f 100644 --- a/sys/vfs/hammer/hammer_recover.c +++ b/sys/vfs/hammer/hammer_recover.c @@ -1,767 +1,327 @@ -/* - * Copyright (c) 2008 The DragonFly Project. All rights reserved. - * - * This code is derived from software contributed to The DragonFly Project - * by Matthew Dillon - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of The DragonFly Project nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific, prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.9 2008/03/18 05:19:16 dillon Exp $ - */ - -#include "hammer.h" - -#if 0 - -static int hammer_recover_buffer_stage2(hammer_cluster_t cluster, - int32_t buf_no); -static int hammer_recover_record(hammer_cluster_t cluster, - hammer_buffer_t buffer, int32_t rec_offset, - hammer_record_ondisk_t rec); -static int hammer_recover_btree(hammer_cluster_t cluster, - hammer_buffer_t buffer, int32_t rec_offset, - hammer_record_ondisk_t rec); - -/* - * Recover a cluster. The caller has referenced and locked the cluster. 
- * - * Generally returns 0 on success and EIO if the recovery was unsuccessful. - * - * WARNING! The cluster being recovered must not have any cached buffers - * (and hence no cached b-tree nodes). Any cached nodes will become seriously - * corrupted since we rip it all up and regenerate the B-Tree. - */ -int -hammer_recover(hammer_cluster_t cluster) -{ - int buf_no; - int rec_no; - int maxblk; - int nbuffers; - int buffer_count; - int record_count; - - kprintf("HAMMER_RECOVER %d:%d\n", - cluster->volume->vol_no, cluster->clu_no); - /*Debugger("RECOVER");*/ - KKASSERT(cluster->ondisk->synchronized_rec_id); - if (RB_ROOT(&cluster->rb_bufs_root)) { - panic("hammer_recover: cluster %d:%d has cached buffers!", - cluster->volume->vol_no, - cluster->clu_no); - } - - if (hammer_alist_find(&cluster->volume->alist, cluster->clu_no, - cluster->clu_no + 1, 0) != cluster->clu_no) { - Debugger("hammer_recover: cluster not allocated!"); - } - - nbuffers = cluster->ondisk->clu_limit / HAMMER_BUFSIZE; - hammer_modify_cluster(cluster); - - /* - * Clear statistics. - */ - cluster->ondisk->stat_inodes = 0; - cluster->ondisk->stat_records = 0; - cluster->ondisk->stat_data_bufs = 0; - cluster->ondisk->stat_rec_bufs = 0; - cluster->ondisk->stat_idx_bufs = 0; - - /* - * Reset allocation heuristics. - */ - cluster->ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS; - cluster->ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS; - cluster->ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS; - - /* - * Re-initialize the master, B-Tree, and mdata A-lists, and - * recover the record A-list. - */ - hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1, - HAMMER_ASTATE_FREE); - hammer_alist_init(&cluster->alist_btree, - HAMMER_FSBUF_MAXBLKS, - (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS, - HAMMER_ASTATE_ALLOC); - hammer_alist_init(&cluster->alist_mdata, - HAMMER_FSBUF_MAXBLKS, - (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS, - HAMMER_ASTATE_ALLOC); - hammer_alist_recover(&cluster->alist_record, - 0, - HAMMER_FSBUF_MAXBLKS, - (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS); - kprintf("\n"); - - kprintf("hammer_recover(1): cluster_free %d\n", - cluster->alist_master.meta->bm_alist_freeblks); - - /* - * The cluster is now in good enough shape that general allocations - * are possible. Construct an empty B-Tree root. - */ - { - hammer_node_t croot; - int error; - - croot = hammer_alloc_btree(cluster, &error); - if (error == 0) { - hammer_modify_node_noundo(croot); - bzero(croot->ondisk, sizeof(*croot->ondisk)); - croot->ondisk->count = 0; - croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF; - cluster->ondisk->clu_btree_root = croot->node_offset; - hammer_rel_node(croot); - } - KKASSERT(error == 0); - } - kprintf("hammer_recover(2): cluster_free %d\n", - cluster->alist_master.meta->bm_alist_freeblks); - - /* - * Scan the cluster's recovered record A-list. Just get the meta - * blocks and ignore all-allocated/uninitialized sections (which - * we use to indicate reserved areas not assigned to record buffers). - * - * The all-free sections are initialized and this is indicated by - * the alist config's bl_inverted flag being set. These sections - * will be returned for recovery purposes. 
- */ - buffer_count = 0; - record_count = 0; - - rec_no = HAMMER_FSBUF_MAXBLKS; - maxblk = nbuffers * HAMMER_FSBUF_MAXBLKS; - for (;;) { - rec_no = hammer_alist_find(&cluster->alist_record, - rec_no, - maxblk, - HAMMER_ALIST_FIND_NOSTACK | - HAMMER_ALIST_FIND_INITONLY); - if (rec_no == HAMMER_ALIST_BLOCK_NONE) - break; - buf_no = rec_no / HAMMER_FSBUF_MAXBLKS; - KKASSERT(buf_no > 0 && buf_no <= nbuffers); - ++buffer_count; - kprintf("(%d)", buf_no); - record_count += hammer_recover_buffer_stage2(cluster, buf_no); - rec_no += HAMMER_FSBUF_MAXBLKS; - } - kprintf("HAMMER_RECOVER DONE %d:%d buffers=%d records=%d\n", - cluster->volume->vol_no, cluster->clu_no, - buffer_count, record_count); - - /* - * Validate the parent cluster pointer. XXX - */ - - /* - * On successful recovery mark the cluster validated. - */ - cluster->io.validated = 1; - return(0); -} - -/* - * This is used in the alist callback and must return a negative error - * code or a positive free block count. - */ -int -buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count) -{ - hammer_cluster_t cluster; - hammer_record_ondisk_t rec; - hammer_buffer_t buffer; - int32_t buf_no; - int32_t rec_no; - int32_t rec_offset; - int32_t r; - int error; - int xcount; - - /* - * Extract cluster and buffer number to recover - */ - cluster = info; - buf_no = blk / HAMMER_FSBUF_MAXBLKS; - - kprintf("(%d)", buf_no); - buffer = hammer_get_buffer(cluster, buf_no, 0, &error); - if (error) { - /* - * If we are unable to access the buffer leave it in a - * reserved state on the master alist. - */ - kprintf("hammer_recover_buffer_stage1: error " - "recovering %d:%d:%d\n", - cluster->volume->vol_no, cluster->clu_no, buf_no); - r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no); - KKASSERT(r == buf_no); - return(-error); - } - KKASSERT(buffer->buf_type == HAMMER_FSBUF_RECORDS); - - /* - * If the buffer contains no allocated records tell our parent to - * mark it as all-allocated/uninitialized and do not reserve it - * in the master list. - */ - if (hammer_alist_find(&buffer->alist, 0, HAMMER_RECORD_NODES, 0) == - HAMMER_ALIST_BLOCK_NONE) { - kprintf("GENERAL RECOVERY BUFFER %d\n", - blk / HAMMER_FSBUF_MAXBLKS); - hammer_rel_buffer(buffer, 0); - return(-EDOM); - } - - - /* - * Mark the buffer as allocated in the cluster's master A-list. - */ - r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no); - KKASSERT(r == buf_no); - ++cluster->ondisk->stat_rec_bufs; - - kprintf("recover buffer1 %d:%d:%d cluster_free %d\n", - cluster->volume->vol_no, - cluster->clu_no, buf_no, - cluster->alist_master.meta->bm_alist_freeblks); - - /* - * Recover the buffer, scan and validate allocated records. Records - * which cannot be recovered are freed. - * - * The parent a-list must be properly adjusted so don't just call - * hammer_alist_recover() on the underlying buffer. Go through the - * parent. 
- */ - hammer_modify_buffer(buffer); - count = hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES); - xcount = 0; - kprintf("hammer_recover_buffer count1 %d/%d\n", - HAMMER_RECORD_NODES - count, HAMMER_RECORD_NODES); - rec_no = 0; - for (;;) { - rec_no = hammer_alist_find(&buffer->alist, rec_no, - HAMMER_RECORD_NODES, 0); - if (rec_no == HAMMER_ALIST_BLOCK_NONE) - break; -#if 0 - kprintf("recover record %d:%d:%d %d\n", - cluster->volume->vol_no, - cluster->clu_no, buf_no, rec_no); -#endif - rec_offset = offsetof(union hammer_fsbuf_ondisk, - record.recs[rec_no]); - rec_offset += buf_no * HAMMER_BUFSIZE; - rec = &buffer->ondisk->record.recs[rec_no]; - error = hammer_recover_record(cluster, buffer, rec_offset, rec); - if (error) { - kprintf("hammer_recover_record: failed %d:%d@%d\n", - cluster->clu_no, buffer->buf_no, rec_offset); - hammer_alist_free(&buffer->alist, rec_no, 1); - if (hammer_debug_recover_faults) - Debugger("FAILED"); - ++count; /* free count */ - --xcount; - } - ++rec_no; - ++xcount; - } - kprintf("hammer_recover_buffer count2 %d/%d/%d\n", - HAMMER_RECORD_NODES - count, xcount, HAMMER_RECORD_NODES); - KKASSERT(HAMMER_RECORD_NODES - count == xcount); - hammer_rel_buffer(buffer, 0); - return(count); -} - -/* - * Recover a record, at least into a state that doesn't blow up the - * filesystem. Returns 0 on success, non-zero if the record is - * unrecoverable. - */ -static int -hammer_recover_record(hammer_cluster_t cluster, hammer_buffer_t buffer, - int32_t rec_offset, hammer_record_ondisk_t rec) -{ - hammer_buffer_t dbuf; - u_int64_t syncid = cluster->ondisk->synchronized_rec_id; - int32_t data_offset; - int32_t data_len; - int32_t nblks; - int32_t dbuf_no; - int32_t dblk_no; - int32_t base_blk; - int32_t r; - int error = 0; - - /* - * We have to discard any records with rec_id's greater then the - * last sync of the cluster header (which guarenteed all related - * buffers had been synced). Otherwise the record may reference - * information that was never synced to disk. - */ - if (rec->base.rec_id >= syncid) { - kprintf("recover record: syncid too large %016llx/%016llx\n", - rec->base.rec_id, syncid); - if (hammer_debug_recover_faults) - Debugger("DebugSyncid"); - return(EINVAL); - } - -#if 0 - /* XXX undo incomplete deletions */ - if (rec->base.base.delete_tid > syncid) - rec->base.base.delete_tid = 0; -#endif - - /* - * Validate the record's B-Tree key - */ - KKASSERT(rec->base.base.rec_type != 0); - if (rec->base.base.rec_type != HAMMER_RECTYPE_CLUSTER) { - if (hammer_btree_cmp(&rec->base.base, - &cluster->ondisk->clu_btree_beg) < 0) { - kprintf("recover record: range low\n"); - Debugger("RANGE LOW"); - return(EINVAL); - } - if (hammer_btree_cmp(&rec->base.base, - &cluster->ondisk->clu_btree_end) >= 0) { - kprintf("recover record: range high\n"); - Debugger("RANGE HIGH"); - return(EINVAL); - } - } - - /* - * Validate the record's data. If the offset is 0 there is no data - * (or it is zero-fill) and we can return success immediately. - * Otherwise make sure everything is ok. 
- */ - data_offset = rec->base.data_offset; - data_len = rec->base.data_len; - - if (data_len == 0) - rec->base.data_offset = data_offset = 0; - if (data_offset == 0) - goto done; - - /* - * Non-zero data offset, recover the data - */ - if (data_offset < HAMMER_BUFSIZE || - data_offset >= cluster->ondisk->clu_limit || - data_len < 0 || data_len > HAMMER_MAXDATA || - data_offset + data_len > cluster->ondisk->clu_limit) { - kprintf("recover record: bad offset/len %d/%d\n", - data_offset, data_len); - Debugger("BAD OFFSET"); - return(EINVAL); - } - - /* - * Check data_offset relative to rec_offset - */ - if (data_offset < rec_offset && data_offset + data_len > rec_offset) { - kprintf("recover record: bad offset: overlapping1\n"); - Debugger("BAD OFFSET - OVERLAP1"); - return(EINVAL); - } - if (data_offset >= rec_offset && - data_offset < rec_offset + sizeof(struct hammer_base_record)) { - kprintf("recover record: bad offset: overlapping2\n"); - Debugger("BAD OFFSET - OVERLAP2"); - return(EINVAL); - } - - /* - * Check for data embedded in the record - */ - if (data_offset >= rec_offset && - data_offset < rec_offset + HAMMER_RECORD_SIZE) { - if (data_offset + data_len > rec_offset + HAMMER_RECORD_SIZE) { - kprintf("recover record: bad offset: overlapping3\n"); - Debugger("BAD OFFSET - OVERLAP3"); - return(EINVAL); - } - goto done; - } - - KKASSERT(cluster->io.modified); - /* - * Recover the allocated data either out of the cluster's master alist - * or as a buffer sub-allocation. - */ - if ((data_len & HAMMER_BUFMASK) == 0) { - if (data_offset & HAMMER_BUFMASK) { - kprintf("recover record: bad offset: unaligned\n"); - Debugger("BAD OFFSET - UNALIGNED"); - return(EINVAL); - } - nblks = data_len / HAMMER_BUFSIZE; - dbuf_no = data_offset / HAMMER_BUFSIZE; - /* XXX power-of-2 check data_len */ - - r = hammer_alist_alloc_fwd(&cluster->alist_master, - nblks, dbuf_no); - if (r == HAMMER_ALIST_BLOCK_NONE) { - kprintf("recover record: cannot recover offset1\n"); - Debugger("CANNOT ALLOC DATABUFFER"); - return(EINVAL); - } - if (r != dbuf_no) { - kprintf("recover record: cannot recover offset2\n"); - hammer_alist_free(&cluster->alist_master, r, nblks); - KKASSERT(0); - return(EINVAL); - } - ++cluster->ondisk->stat_data_bufs; - } else { - if ((data_offset & ~HAMMER_BUFMASK) != - ((data_offset + data_len - 1) & ~HAMMER_BUFMASK)) { - kprintf("recover record: overlaps multiple bufs\n"); - Debugger("OVERLAP MULT"); - return(EINVAL); - } - if ((data_offset & HAMMER_BUFMASK) < - sizeof(struct hammer_fsbuf_head)) { - kprintf("recover record: data in header area\n"); - Debugger("DATA IN HEADER AREA"); - return(EINVAL); - } - if (data_offset & HAMMER_DATA_BLKMASK) { - kprintf("recover record: data blk unaligned\n"); - Debugger("DATA BLK UNALIGNED"); - return(EINVAL); - } - - /* - * Ok, recover the space in the data buffer. - */ - dbuf_no = data_offset / HAMMER_BUFSIZE; - r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, dbuf_no); - if (r != dbuf_no && r != HAMMER_ALIST_BLOCK_NONE) - hammer_alist_free(&cluster->alist_master, r, 1); - if (r == dbuf_no) { - /* - * This is the first time we've tried to recover - * data in this data buffer, reinit it (but don't - * zero it out, obviously). - * - * Calling initbuffer marks the data blocks within - * the buffer as being all-allocated. We have to - * mark it free. 
- */ - dbuf = hammer_get_buffer(cluster, dbuf_no, - 0, &error); - if (error == 0) { - KKASSERT(dbuf->buf_type == HAMMER_FSBUF_DATA); - hammer_modify_buffer(dbuf); - hammer_initbuffer(&dbuf->alist, - &dbuf->ondisk->head, - HAMMER_FSBUF_DATA); - /*dbuf->buf_type = HAMMER_FSBUF_DATA;*/ - base_blk = dbuf_no * HAMMER_FSBUF_MAXBLKS; - hammer_alist_free(&cluster->alist_mdata, - base_blk, - HAMMER_DATA_NODES); - kprintf("FREE DATA %d/%d\n", base_blk, HAMMER_DATA_NODES); - ++cluster->ondisk->stat_data_bufs; - } - } else { - /* - * We've seen this data buffer before. - */ - dbuf = hammer_get_buffer(cluster, dbuf_no, - 0, &error); - } - if (error) { - kprintf("recover record: data: getbuf failed\n"); - KKASSERT(0); - return(EINVAL); - } - - if (dbuf->buf_type != HAMMER_FSBUF_DATA) { - hammer_rel_buffer(dbuf, 0); - kprintf("recover record: data: wrong buffer type\n"); - KKASSERT(0); - return(EINVAL); - } - - /* - * Figure out the data block number and number of blocks. - */ - nblks = (data_len + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK; - nblks /= HAMMER_DATA_BLKSIZE; - dblk_no = ((data_offset & HAMMER_BUFMASK) - offsetof(union hammer_fsbuf_ondisk, data.data)) / HAMMER_DATA_BLKSIZE; - if ((data_offset & HAMMER_BUFMASK) != offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no])) { - kprintf("dblk_no %d does not match data_offset %d/%d\n", - dblk_no, - offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no]), - (data_offset & HAMMER_BUFMASK)); - hammer_rel_buffer(dbuf, 0); - kprintf("recover record: data: not block aligned\n"); - Debugger("bad data"); - return(EINVAL); - } - hammer_modify_buffer(dbuf); - dblk_no += dbuf_no * HAMMER_FSBUF_MAXBLKS; - r = hammer_alist_alloc_fwd(&cluster->alist_mdata, nblks, dblk_no); - if (r != dblk_no) { - if (r != HAMMER_ALIST_BLOCK_NONE) - hammer_alist_free(&cluster->alist_mdata, r, nblks); - hammer_rel_buffer(dbuf, 0); - kprintf("recover record: data: unable to realloc dbuf %d dblk %d\n", dbuf_no, dblk_no % HAMMER_FSBUF_MAXBLKS); - KKASSERT(0); - return(EINVAL); - } - hammer_rel_buffer(dbuf, 0); - } -done: - return(0); -} - -/* - * Rebuild the B-Tree for the records residing in the specified buffer. - * - * Return the number of records recovered. - */ -static int -hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no) -{ - hammer_record_ondisk_t rec; - hammer_buffer_t buffer; - int32_t rec_no; - int32_t rec_offset; - int record_count = 0; - int error; - - buffer = hammer_get_buffer(cluster, buf_no, 0, &error); - if (error) { - /* - * If we are unable to access the buffer leave it in a - * reserved state on the master alist. - */ - kprintf("hammer_recover_buffer_stage2: error " - "recovering %d:%d:%d\n", - cluster->volume->vol_no, cluster->clu_no, buf_no); - Debugger("RECOVER BUFFER STAGE2 FAIL"); - return(0); - } - - /* - * Recover the buffer, scan and validate allocated records. Records - * which cannot be recovered are freed. 
- */ - rec_no = 0; - for (;;) { - rec_no = hammer_alist_find(&buffer->alist, rec_no, - HAMMER_RECORD_NODES, 0); - if (rec_no == HAMMER_ALIST_BLOCK_NONE) - break; - rec_offset = offsetof(union hammer_fsbuf_ondisk, - record.recs[rec_no]); - rec_offset += buf_no * HAMMER_BUFSIZE; - rec = &buffer->ondisk->record.recs[rec_no]; - error = hammer_recover_btree(cluster, buffer, rec_offset, rec); - if (error) { - kprintf("hammer_recover_btree: failed %d:%d@%08x " - "error %d buffer %p rec %p rec_no %d " - " cluster_free %d\n", - cluster->clu_no, buffer->buf_no, rec_offset, - error, buffer, rec, rec_no, - cluster->alist_master.meta->bm_alist_freeblks - ); - Debugger("recover_btree failed"); - /* XXX free the record and its data? */ - /*hammer_alist_free(&buffer->alist, rec_no, 1);*/ - } else { - ++record_count; - } - ++rec_no; - } - hammer_rel_buffer(buffer, 0); - return(record_count); -} - -/* - * Enter a single record into the B-Tree. - */ -static int -hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer, - int32_t rec_offset, hammer_record_ondisk_t rec) -{ - struct hammer_cursor cursor; - union hammer_btree_elm elm; - hammer_cluster_t ncluster; - int error = 0; - - /* - * Check for a spike record. When spiking into a new cluster do - * NOT allow a recursive recovery to occur. We use a lot of - * stack and the only thing we actually modify in the target - * cluster is its parent pointer. - */ - if (rec->base.base.rec_type == HAMMER_RECTYPE_CLUSTER) { - hammer_volume_t ovolume = cluster->volume; - hammer_volume_t nvolume; - - nvolume = hammer_get_volume(ovolume->hmp, rec->spike.vol_no, - &error); - if (error) { - Debugger("recover_btree1"); - return(error); - } - ncluster = hammer_get_cluster(nvolume, rec->spike.clu_no, - &error, GET_CLUSTER_NORECOVER); - hammer_rel_volume(nvolume, 0); - if (error) { - Debugger("recover_btree2"); - return(error); - } - - /* - * Validate the cluster. Allow the offset to be fixed up. - */ - if (ncluster->ondisk->clu_btree_parent_vol_no != ovolume->vol_no || - ncluster->ondisk->clu_btree_parent_clu_no != cluster->clu_no) { - kprintf("hammer_recover: Bad cluster spike hookup: " - "%d:%d != %d:%d\n", - ncluster->ondisk->clu_btree_parent_vol_no, - ncluster->ondisk->clu_btree_parent_clu_no, - ovolume->vol_no, - cluster->clu_no); - error = EINVAL; - hammer_rel_cluster(ncluster, 0); - Debugger("recover_btree3"); - return(error); - } - } else { - ncluster = NULL; - } - - /* - * Locate the insertion point. Note that we are using the cluster- - * localized cursor init so parent will start out NULL. - * - * The key(s) used for spike's are bounds and different from the - * key embedded in the spike record. A special B-Tree insertion - * call is made to deal with spikes. 
- */ - error = hammer_init_cursor_cluster(&cursor, cluster); - if (error) { - Debugger("recover_btree6"); - goto failed; - } - KKASSERT(cursor.node); - if (ncluster) - cursor.key_beg = ncluster->ondisk->clu_btree_beg; - else - cursor.key_beg = rec->base.base; - cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_RECOVER; - - error = hammer_btree_lookup(&cursor); - KKASSERT(error != EDEADLK); - KKASSERT(cursor.node); - if (error == 0) { - kprintf("hammer_recover_btree: Duplicate record cursor %p rec %p ncluster %p\n", - &cursor, rec, ncluster); - hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index], HAMMER_BTREE_TYPE_LEAF, cursor.index); - Debugger("duplicate record"); - } - if (error != ENOENT) { - Debugger("recover_btree5"); - goto failed; - } - - - if (ncluster) { - /* - * Spike record - */ - kprintf("recover spike clu %d %016llx-%016llx clusterfree %d\n", - ncluster->clu_no, - ncluster->ondisk->clu_btree_beg.obj_id, - ncluster->ondisk->clu_btree_end.obj_id, - cluster->alist_master.meta->bm_alist_freeblks); - error = hammer_btree_insert_cluster(&cursor, ncluster, - rec_offset); - kprintf("recover spike record error %d clusterfree %d\n", - error, - cluster->alist_master.meta->bm_alist_freeblks); - KKASSERT(error != EDEADLK); - if (error) - Debugger("spike recovery"); - } else { - /* - * Normal record - */ -#if 0 - kprintf("recover recrd clu %d %016llx\n", - cluster->clu_no, rec->base.base.obj_id); -#endif - elm.leaf.base = rec->base.base; - elm.leaf.rec_offset = rec_offset; - elm.leaf.data_offset = rec->base.data_offset; - elm.leaf.data_len = rec->base.data_len; - elm.leaf.data_crc = rec->base.data_crc; - - error = hammer_btree_insert(&cursor, &elm); - KKASSERT(error != EDEADLK); - } - - /* - * Success if error is 0! - */ - if (error == 0) { - /* - * Update the cluster header's statistics count. stat_records - * is very important for proper reservation of B-Tree space. - * Note that a spike record counts as 2. - */ - ++cluster->ondisk->stat_records; - if (rec->base.base.rec_type == HAMMER_RECTYPE_INODE) - ++cluster->ondisk->stat_inodes; - if (rec->base.base.rec_type == HAMMER_RECTYPE_CLUSTER) - ++cluster->ondisk->stat_records; - } - if (error) { - kprintf("hammer_recover_btree: insertion failed\n"); - } - -failed: - if (ncluster) - hammer_rel_cluster(ncluster, 0); - hammer_done_cursor(&cursor); - return(error); -} - -#endif +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.10 2008/04/26 02:54:00 dillon Exp $ + */ + +#include "hammer.h" + +static int hammer_check_tail_signature(hammer_fifo_tail_t tail, + hammer_off_t end_off); +static void hammer_recover_copy_undo(hammer_off_t undo_offset, + char *src, char *dst, int bytes); +static void hammer_recover_debug_dump(int w, char *buf, int bytes); +static int hammer_recover_undo(hammer_mount_t hmp, hammer_fifo_undo_t undo, + int bytes); + +/* + * Recover a filesystem on mount + */ +int +hammer_recover(hammer_mount_t hmp, hammer_volume_t root_volume) +{ + hammer_blockmap_t rootmap; + hammer_buffer_t buffer; + hammer_off_t scan_offset; + hammer_off_t bytes; + hammer_fifo_tail_t tail; + hammer_fifo_undo_t undo; + int error; + + /* + * Examine the UNDO FIFO. If it is empty the filesystem is clean + * and no action need be taken. + */ + rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; + if (rootmap->first_offset == rootmap->next_offset) + return(0); + + if (rootmap->next_offset < rootmap->first_offset) + bytes = rootmap->alloc_offset - rootmap->first_offset + + rootmap->next_offset; + bytes = (rootmap->next_offset - rootmap->first_offset); + kprintf("HAMMER(%s) Start Recovery (%lld bytes of UNDO)\n", + root_volume->ondisk->vol_name, bytes); + + /* + * Scan the UNDOs backwards. 
+ */ + scan_offset = rootmap->next_offset; + buffer = NULL; + if (scan_offset > rootmap->alloc_offset) { + kprintf("HAMMER(%s) UNDO record at %016llx FIFO overflow\n", + root_volume->ondisk->vol_name, + scan_offset); + error = EIO; + goto failed; + } + + while ((int64_t)bytes > 0) { + kprintf("scan_offset %016llx\n", scan_offset); + if (scan_offset - sizeof(*tail) < + HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) { + kprintf("HAMMER(%s) UNDO record at %016llx FIFO " + "underflow\n", + root_volume->ondisk->vol_name, + scan_offset); + error = EIO; + break; + } + if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) { + scan_offset = rootmap->alloc_offset; + continue; + } + tail = hammer_bread(hmp, scan_offset - sizeof(*tail), + &error, &buffer); + if (error) { + kprintf("HAMMER(%s) Unable to read UNDO TAIL " + "at %016llx\n", + root_volume->ondisk->vol_name, + scan_offset - sizeof(*tail)); + break; + } + + if (hammer_check_tail_signature(tail, scan_offset) != 0) { + kprintf("HAMMER(%s) Illegal UNDO TAIL signature " + "at %016llx\n", + root_volume->ondisk->vol_name, + scan_offset - sizeof(*tail)); + error = EIO; + break; + } + undo = (void *)((char *)tail + sizeof(*tail) - tail->tail_size); + + error = hammer_recover_undo(hmp, undo, + HAMMER_BUFSIZE - + (int)((char *)undo - (char *)buffer->ondisk)); + if (error) { + kprintf("HAMMER(%s) UNDO record at %016llx failed\n", + root_volume->ondisk->vol_name, + scan_offset - tail->tail_size); + break; + } + scan_offset -= tail->tail_size; + bytes -= tail->tail_size; + } +failed: + if (buffer) + hammer_rel_buffer(buffer, 0); + return (error); +} + +static int +hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off) +{ + int max_bytes; + + max_bytes = ((end_off - sizeof(*tail)) & HAMMER_BUFMASK); + max_bytes += sizeof(*tail); + + /* + * tail overlaps buffer boundary + */ + if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64) { + return(1); + } + + /* + * signature check, the tail signature is allowed to be the head + * signature only for 8-byte PADs. + */ + switch(tail->tail_signature) { + case HAMMER_TAIL_SIGNATURE: + break; + case HAMMER_HEAD_SIGNATURE: + if (tail->tail_type != HAMMER_HEAD_TYPE_PAD || + tail->tail_size != sizeof(*tail)) { + return(2); + } + break; + } + + /* + * The undo structure must not overlap a buffer boundary. + */ + if (tail->tail_size < 0 || tail->tail_size > max_bytes) { + return(3); + } + return(0); +} + +static int +hammer_recover_undo(hammer_mount_t hmp, hammer_fifo_undo_t undo, int bytes) +{ + hammer_fifo_tail_t tail; + hammer_volume_t volume; + hammer_buffer_t buffer; + int zone; + int error; + int vol_no; + int max_bytes; + u_int32_t offset; + + /* + * Basic sanity checks + */ + if (bytes < HAMMER_HEAD_ALIGN) { + kprintf("HAMMER: Undo alignment error (%d)\n", bytes); + return(EIO); + } + if (undo->head.hdr_signature != HAMMER_HEAD_SIGNATURE) { + kprintf("HAMMER: Bad head signature %04x\n", + undo->head.hdr_signature); + return(EIO); + } + if (undo->head.hdr_size < HAMMER_HEAD_ALIGN || + undo->head.hdr_size > bytes) { + kprintf("HAMMER: Bad size %d\n", bytes); + return(EIO); + } + + /* + * Skip PAD records. Note that PAD records also do not require + * a tail. 
+ */ + if (undo->head.hdr_type == HAMMER_HEAD_TYPE_PAD) + return(0); + + /* + * Check the tail + */ + bytes = undo->head.hdr_size; + tail = (void *)((char *)undo + bytes - sizeof(*tail)); + if (tail->tail_size != undo->head.hdr_size) { + kprintf("HAMMER: Bad tail size %d\n", tail->tail_size); + return(EIO); + } + if (tail->tail_type != undo->head.hdr_type) { + kprintf("HAMMER: Bad tail type %d\n", tail->tail_type); + return(EIO); + } + + /* + * Only process UNDO records + */ + if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO) + return(0); + + /* + * Validate the UNDO record. + */ + max_bytes = undo->head.hdr_size - sizeof(*undo) - sizeof(*tail); + if (undo->undo_data_bytes < 0 || undo->undo_data_bytes > max_bytes) { + kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n", + undo->undo_data_bytes, max_bytes); + return(EIO); + } + + /* + * The undo offset may only be a zone-1 or zone-2 offset. + * + * Currently we only support a zone-1 offset representing the + * volume header. + */ + zone = HAMMER_ZONE_DECODE(undo->undo_offset); + offset = undo->undo_offset & HAMMER_BUFMASK; + + if (offset + undo->undo_data_bytes > HAMMER_BUFSIZE) { + kprintf("HAMMER: Corrupt UNDO record, bad offset\n"); + return (EIO); + } + + switch(zone) { + case HAMMER_ZONE_RAW_VOLUME_INDEX: + vol_no = HAMMER_VOL_DECODE(undo->undo_offset); + volume = hammer_get_volume(hmp, vol_no, &error); + if (volume == NULL) { + kprintf("HAMMER: UNDO record, " + "cannot access volume %d\n", vol_no); + break; + } + hammer_modify_volume(NULL, volume, NULL, 0); + hammer_recover_copy_undo(undo->undo_offset, + (char *)(undo + 1), + (char *)volume->ondisk + offset, + undo->undo_data_bytes); + hammer_modify_volume_done(volume); + hammer_io_flush(&volume->io); + hammer_rel_volume(volume, 0); + break; + case HAMMER_ZONE_RAW_BUFFER_INDEX: + buffer = hammer_get_buffer(hmp, undo->undo_offset, 0, &error); + if (buffer == NULL) { + kprintf("HAMMER: UNDO record, " + "cannot access buffer %016llx\n", + undo->undo_offset); + break; + } + hammer_modify_buffer(NULL, buffer, NULL, 0); + hammer_recover_copy_undo(undo->undo_offset, + (char *)(undo + 1), + (char *)buffer->ondisk + offset, + undo->undo_data_bytes); + hammer_modify_buffer_done(buffer); + hammer_io_flush(&buffer->io); + hammer_rel_buffer(buffer, 0); + break; + default: + kprintf("HAMMER: Corrupt UNDO record\n"); + error = EIO; + } + return (error); +} + +static void +hammer_recover_copy_undo(hammer_off_t undo_offset, + char *src, char *dst, int bytes) +{ + kprintf("UNDO %016llx:", undo_offset); + hammer_recover_debug_dump(22, dst, bytes); + kprintf("%22s", "to:"); + hammer_recover_debug_dump(22, src, bytes); + bcopy(src, dst, bytes); +} + +static void +hammer_recover_debug_dump(int w, char *buf, int bytes) +{ + int i; + + for (i = 0; i < bytes; ++i) { + if (i && (i & 15) == 0) + kprintf("\n%*.*s", w, w, ""); + kprintf(" %02x", (unsigned char)buf[i]); + } + kprintf("\n"); +} + diff --git a/sys/vfs/hammer/hammer_undo.c b/sys/vfs/hammer/hammer_undo.c index c90a9fefae..f070fc036c 100644 --- a/sys/vfs/hammer/hammer_undo.c +++ b/sys/vfs/hammer/hammer_undo.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.5 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.6 2008/04/26 02:54:00 dillon Exp $ */ /* @@ -71,11 +71,11 @@ hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t zone3_off, int *errorp) /* * Generate an UNDO record for the block of data at the specified zone1 - * offset. + * or zone2 offset. */ int hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io, - hammer_off_t zone1_off, void *base, int len) + hammer_off_t zone_off, void *base, int len) { hammer_volume_t root_volume; hammer_volume_ondisk_t ondisk; @@ -90,9 +90,6 @@ hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io, int error; int bytes; - bytes = ((len + 7) & ~7) + sizeof(struct hammer_fifo_undo) + - sizeof(struct hammer_fifo_tail); - root_volume = trans->rootvol; ondisk = root_volume->ondisk; undomap = &ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; @@ -100,10 +97,14 @@ hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io, /* no undo recursion */ hammer_modify_volume(NULL, root_volume, NULL, 0); +again: /* * Allocate space in the FIFO */ -again: + bytes = ((len + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK) + + sizeof(struct hammer_fifo_undo) + + sizeof(struct hammer_fifo_tail); + next_offset = undomap->next_offset; /* @@ -159,13 +160,19 @@ again: * We're good, create the entry. */ undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE; - undo->head.hdr_type = HAMMER_HEAD_TYPE_PAD; + undo->head.hdr_type = HAMMER_HEAD_TYPE_UNDO; undo->head.hdr_size = bytes; undo->head.reserved01 = 0; undo->head.hdr_crc = 0; - undo->undo_offset = zone1_off; + undo->undo_offset = zone_off; undo->undo_data_bytes = len; bcopy(base, undo + 1, len); + + tail = (void *)((char *)undo + bytes - sizeof(*tail)); + tail->tail_signature = HAMMER_TAIL_SIGNATURE; + tail->tail_type = HAMMER_HEAD_TYPE_UNDO; + tail->tail_size = bytes; + undo->head.hdr_crc = crc32(undo, bytes); hammer_modify_buffer_done(buffer); diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index c5e96417c6..8c7bdd7aff 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.27 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.28 2008/04/26 02:54:00 dillon Exp $ */ #include @@ -289,7 +289,20 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, */ rootvol = hammer_get_root_volume(hmp, &error); if (error) + goto failed; + + /* + * Perform any necessary UNDO operations + */ + error = hammer_recover(hmp, rootvol); + if (error) { + kprintf("Failed to recover HAMMER filesystem on mount\n"); goto done; + } + + /* + * Finish setup now that we have a good root volume + */ ksnprintf(mp->mnt_stat.f_mntfromname, sizeof(mp->mnt_stat.f_mntfromname), "%s", rootvol->ondisk->vol_name); @@ -301,8 +314,6 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, hmp->next_tid = rootvol->ondisk->vol0_next_tid; kprintf("on-disk next_tid %016llx\n", hmp->next_tid); - hammer_rel_volume(rootvol, 0); - hammer_flusher_create(hmp); /* @@ -319,6 +330,8 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, /*vn_unlock(hmp->rootvp);*/ done: + hammer_rel_volume(rootvol, 0); +failed: /* * Cleanup and return. 
*/ diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index da92b606c2..fcdb8889a9 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.39 2008/04/25 21:49:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.40 2008/04/26 02:54:00 dillon Exp $ */ #include @@ -172,7 +172,7 @@ hammer_vop_fsync(struct vop_fsync_args *ap) { hammer_inode_t ip = VTOI(ap->a_vp); - hammer_flush_inode(ip, 0); + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); if (ap->a_waitfor == MNT_WAIT) hammer_wait_inode(ip); return (ip->error); @@ -1896,7 +1896,8 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) else TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act); hammer_modify_inode(NULL, ip, HAMMER_INODE_XDIRTY); - hammer_flush_inode(ip, 0); + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); + kprintf("a"); return(0); } -- 2.11.4.GIT
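
The new hammer_recover() above walks the UNDO FIFO backwards, from rootmap->next_offset toward first_offset, using the fixed-size tail that terminates every FIFO record to locate the matching head. A minimal sketch of that backwards walk, assuming a flat in-memory FIFO with no wraparound and simplified head/tail structs (field names mirror hammer_fifo_undo / hammer_fifo_tail from the patch; the sizes, the flat buffer, and scan_undo_backwards itself are illustration-only):

#include <stdint.h>
#include <stddef.h>

/*
 * Simplified UNDO FIFO record framing.  Every record begins with a
 * head and ends with a tail that duplicates the record size, which is
 * what makes a backwards walk possible without any forward index.
 */
struct fifo_head {
	uint16_t hdr_signature;		/* HAMMER_HEAD_SIGNATURE */
	uint16_t hdr_type;		/* UNDO or PAD */
	int32_t  hdr_size;		/* whole record, head through tail */
};

struct fifo_tail {
	uint16_t tail_signature;	/* HAMMER_TAIL_SIGNATURE */
	uint16_t tail_type;
	int32_t  tail_size;		/* duplicates hdr_size */
};

/*
 * Walk records most-recent-first, from next_offset back down to
 * first_offset, validating the framing as we go.
 */
static int
scan_undo_backwards(char *fifo, int64_t first_offset, int64_t next_offset)
{
	struct fifo_tail *tail;
	struct fifo_head *head;
	int64_t scan = next_offset;

	while (scan > first_offset) {
		tail = (struct fifo_tail *)(fifo + scan - sizeof(*tail));
		if (tail->tail_size < (int32_t)(sizeof(*head) + sizeof(*tail)) ||
		    tail->tail_size > scan - first_offset)
			return (-1);		/* framing corrupt, stop */
		scan -= tail->tail_size;
		head = (struct fifo_head *)(fifo + scan);
		if (head->hdr_size != tail->tail_size)
			return (-1);		/* head/tail disagree */
		/* apply_undo(head): copy saved bytes back over the target */
	}
	return (0);
}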
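
hammer_check_tail_signature() rejects a tail that straddles a buffer boundary with a single XOR: if the offsets of the tail's first and last byte differ in any bit above the buffer mask, the two bytes live in different buffers. A standalone demonstration of the idiom; the 16384-byte buffer size is assumed here for illustration:

#include <stdint.h>
#include <stdio.h>

#define BUFSIZE		16384ULL	/* assumed value of HAMMER_BUFSIZE */
#define BUFMASK64	(BUFSIZE - 1)

/*
 * Nonzero when [off, off + len) crosses a buffer boundary: XOR the
 * offsets of the first and last byte; any bit set above the buffer
 * mask means the two bytes fall in different buffers.  This is the
 * same test hammer_check_tail_signature() applies to the tail.
 */
static int
crosses_buffer(uint64_t off, uint64_t len)
{
	return (((off ^ (off + len - 1)) & ~BUFMASK64) != 0);
}

int
main(void)
{
	printf("%d\n", crosses_buffer(16368, 16));	/* 0: fits exactly */
	printf("%d\n", crosses_buffer(16376, 16));	/* 1: spills over */
	return (0);
}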
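
The flusher handshake guarded by the new hmp->flusher_td checks pairs a request sequence (flusher_seq) with a completion sequence (flusher_act) and compares them by signed subtraction, which stays correct even after the counters wrap. A sketch of the same idiom, with pthreads standing in for the kernel's tsleep()/wakeup() and the flusher thread body omitted:

#include <pthread.h>

/* Simplified flusher handshake: request a flush by bumping seq, then
 * wait until act catches up.  The (int)(seq - act) comparison is the
 * wraparound-safe test hammer_flusher_sync() uses. */
struct flusher {
	pthread_mutex_t mtx;
	pthread_cond_t  cv;
	int seq;	/* last requested flush */
	int act;	/* last completed flush */
};

static void
flusher_sync(struct flusher *f)
{
	int seq;

	pthread_mutex_lock(&f->mtx);
	seq = ++f->seq;
	pthread_cond_broadcast(&f->cv);		/* wake the flusher */
	while ((int)(seq - f->act) > 0)		/* wrap-safe compare */
		pthread_cond_wait(&f->cv, &f->mtx);
	pthread_mutex_unlock(&f->mtx);
}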