From: Matthew Dillon
Date: Mon, 15 Mar 2010 09:07:23 +0000 (-0700)
Subject: HAMMER VFS - Major retooling of the refcount mechanics, and fix a deadlock
X-Git-Tag: v2.7.0~84
X-Git-Url: https://repo.or.cz/w/dragonfly.git/commitdiff_plain/250aec18e447d7c9db30d4f905d21333e825a9d9

HAMMER VFS - Major retooling of the refcount mechanics, and fix a deadlock

* Add an interlock to hammer_lock->refs which is independent of
  hammer_lock->lockval.

* Retool the refcount mechanics to integrate the interlock on 0->1 and
  1->0 transitions.  In addition, implement a check bit to deal with
  serialization races between threads which forces all threads to
  interlock.

  This deals with the case where one thread bumps the refcount
  (lock->refs) but is unable to immediately acquire the interlock.
  Other threads doing the same thing will race and lock->refs will be
  bumped far higher than 1.  The check bit is set on the 0->1
  transition and forces all threads to serialize until one (usually
  the first to acquire the lock) is able to dispose of the check
  condition.

  The transition interlocks are used to do I/O loading and validation
  on the 0->N transition and I/O unloading and invalidation on the
  1->0 transition.

* The new integrated mechanics also simplify hammer_io's handling of
  B_LOCKED slightly and properly interlock changes in disposition for
  the related buffer cache buffer against refcount transitions.

* The new integrated refcount/interlock is also more efficient, as the
  combined operation can be done in a single atomic_cmpset_int() call,
  and it is fully MPSAFE.  Even though other major portions of HAMMER
  are not yet MPSAFE, getting this particular bit dealt with will ease
  MP work later on.

* Retool the volume, buffer, and node referencing code to use the new
  mechanics.  This fixes a deadlock.  These routines were previously
  acquiring an exclusive lock instead of an interlock.  The exclusive
  lock could combine with shared locks held by the same thread to
  create a deadlock situation against other threads.  Use of the
  interlock fixes this particular deadlock.

Reported-by: Francois Tigeot (deadlock report)
---

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 1662b81d6d..6a437a02a9 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -127,13 +127,22 @@ typedef struct hammer_transaction *hammer_transaction_t;
  * HAMMER locks
  */
 struct hammer_lock {
-	int		refs;		/* active references delay writes */
+	volatile u_int	refs;		/* active references */
 	volatile u_int	lockval;	/* lock count and control bits */
-	struct thread	*owner;		/* owner if exclusively held */
+	struct thread	*lowner;	/* owner if exclusively held */
+	struct thread	*rowner;	/* owner if exclusively held */
 };
 
+#define HAMMER_REFS_LOCKED	0x40000000	/* transition check */
+#define HAMMER_REFS_WANTED	0x20000000	/* transition check */
+#define HAMMER_REFS_CHECK	0x10000000	/* transition check */
+
+#define HAMMER_REFS_FLAGS	(HAMMER_REFS_LOCKED | \
+				 HAMMER_REFS_WANTED | \
+				 HAMMER_REFS_CHECK)
+
 #define HAMMER_LOCKF_EXCLUSIVE	0x40000000
-#define HAMMER_LOCKF_WANTED	0x80000000
+#define HAMMER_LOCKF_WANTED	0x20000000
 
 static __inline int
 hammer_notlocked(struct hammer_lock *lock)
@@ -147,16 +156,37 @@ hammer_islocked(struct hammer_lock *lock)
 	return(lock->lockval != 0);
 }
 
+/*
+ * Returns the number of refs on the object.
+ */ static __inline int hammer_isactive(struct hammer_lock *lock) { - return(lock->refs != 0); + return(lock->refs & ~HAMMER_REFS_FLAGS); +} + +static __inline int +hammer_oneref(struct hammer_lock *lock) +{ + return((lock->refs & ~HAMMER_REFS_FLAGS) == 1); } static __inline int -hammer_islastref(struct hammer_lock *lock) +hammer_norefs(struct hammer_lock *lock) { - return(lock->refs == 1); + return((lock->refs & ~HAMMER_REFS_FLAGS) == 0); +} + +static __inline int +hammer_norefsorlock(struct hammer_lock *lock) +{ + return(lock->refs == 0); +} + +static __inline int +hammer_refsorlock(struct hammer_lock *lock) +{ + return(lock->refs != 0); } /* @@ -166,7 +196,7 @@ static __inline int hammer_lock_excl_owned(struct hammer_lock *lock, thread_t td) { if ((lock->lockval & HAMMER_LOCKF_EXCLUSIVE) && - lock->owner == td) { + lock->lowner == td) { return(1); } return(0); @@ -576,7 +606,6 @@ struct hammer_io { struct buf *bp; int64_t offset; /* zone-2 offset */ int bytes; /* buffer cache buffer size */ - int loading; /* loading/unloading interlock */ int modify_refs; u_int modified : 1; /* bp's data was modified */ @@ -660,7 +689,6 @@ struct hammer_node { TAILQ_HEAD(, hammer_cursor) cursor_list; /* deadlock recovery */ struct hammer_node_cache_list cache_list; /* passive caches */ int flags; - int loading; /* load interlock */ }; #define HAMMER_NODE_DELETED 0x0001 @@ -1034,7 +1062,15 @@ void hammer_lock_downgrade(struct hammer_lock *lock); int hammer_lock_status(struct hammer_lock *lock); void hammer_unlock(struct hammer_lock *lock); void hammer_ref(struct hammer_lock *lock); -void hammer_unref(struct hammer_lock *lock); +int hammer_ref_interlock(struct hammer_lock *lock); +int hammer_ref_interlock_true(struct hammer_lock *lock); +void hammer_ref_interlock_done(struct hammer_lock *lock); +void hammer_rel(struct hammer_lock *lock); +int hammer_rel_interlock(struct hammer_lock *lock, int locked); +void hammer_rel_interlock_done(struct hammer_lock *lock, int orig_locked); +int hammer_get_interlock(struct hammer_lock *lock); +int hammer_try_interlock_norefs(struct hammer_lock *lock); +void hammer_put_interlock(struct hammer_lock *lock, int error); void hammer_sync_lock_ex(hammer_transaction_t trans); void hammer_sync_lock_sh(hammer_transaction_t trans); @@ -1148,8 +1184,8 @@ int hammer_ref_volume(hammer_volume_t volume); int hammer_ref_buffer(hammer_buffer_t buffer); void hammer_flush_buffer_nodes(hammer_buffer_t buffer); -void hammer_rel_volume(hammer_volume_t volume, int flush); -void hammer_rel_buffer(hammer_buffer_t buffer, int flush); +void hammer_rel_volume(hammer_volume_t volume, int locked); +void hammer_rel_buffer(hammer_buffer_t buffer, int locked); int hammer_vfs_export(struct mount *mp, int op, const struct export_args *export); @@ -1164,7 +1200,7 @@ void hammer_delete_node(hammer_transaction_t trans, void hammer_cache_node(hammer_node_cache_t cache, hammer_node_t node); void hammer_uncache_node(hammer_node_cache_t cache); -void hammer_flush_node(hammer_node_t node); +void hammer_flush_node(hammer_node_t node, int locked); void hammer_dup_buffer(struct hammer_buffer **bufferp, struct hammer_buffer *buffer); diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 3eaa1b34d3..8a6ce34e60 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -2346,7 +2346,7 @@ btree_remove(hammer_cursor_t cursor) ondisk->type = HAMMER_BTREE_TYPE_DELETED; ondisk->count = 0; hammer_modify_node_done(node); - hammer_flush_node(node); + hammer_flush_node(node, 
0); hammer_delete_node(cursor->trans, node); } else { /* @@ -2413,7 +2413,7 @@ btree_remove(hammer_cursor_t cursor) hammer_modify_node_done(parent); hammer_cursor_removed_node(node, parent, cursor->parent_index); hammer_cursor_deleted_element(parent, cursor->parent_index); - hammer_flush_node(node); + hammer_flush_node(node, 0); hammer_delete_node(cursor->trans, node); /* @@ -2804,7 +2804,7 @@ hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) *parent->node->ondisk = *parent->copy; hammer_modify_node_done(parent->node); if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) { - hammer_flush_node(parent->node); + hammer_flush_node(parent->node, 0); hammer_delete_node(cursor->trans, parent->node); } } diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index 6066a5d0c9..abf1ceb93b 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -479,8 +479,6 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp) KKASSERT(io->mod_list == &hmp->lose_list); TAILQ_REMOVE(&hmp->lose_list, io, mod_entry); io->mod_list = NULL; - if (io->lock.refs == 0) - ++hammer_count_refedbufs; hammer_ref(&io->lock); buffer = (void *)io; hammer_rel_buffer(buffer, 0); @@ -610,8 +608,6 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final) while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) { if (io->ioerror) break; - if (io->lock.refs == 0) - ++hammer_count_refedbufs; hammer_ref(&io->lock); hammer_io_write_interlock(io); KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME); @@ -753,8 +749,6 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final) if (io->ioerror) break; KKASSERT(io->modify_refs == 0); - if (io->lock.refs == 0) - ++hammer_count_refedbufs; hammer_ref(&io->lock); KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME); hammer_io_flush(io, 0); @@ -845,8 +839,6 @@ hammer_flusher_flush_undos(hammer_mount_t hmp, int mode) while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) { if (io->ioerror) break; - if (io->lock.refs == 0) - ++hammer_count_refedbufs; hammer_ref(&io->lock); KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME); hammer_io_write_interlock(io); diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index 0537d11d17..78bac00c0a 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -902,7 +902,7 @@ hammer_free_inode(hammer_inode_t ip) struct hammer_mount *hmp; hmp = ip->hmp; - KKASSERT(ip->lock.refs == 1); + KKASSERT(hammer_oneref(&ip->lock)); hammer_uncache_node(&ip->cache[0]); hammer_uncache_node(&ip->cache[1]); hammer_uncache_node(&ip->cache[2]); @@ -1118,9 +1118,9 @@ hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data) int res; hammer_ref(&ip->lock); - if (ip->lock.refs == 2 && ip->vp) + if (hammer_isactive(&ip->lock) == 2 && ip->vp) vclean_unlocked(ip->vp); - if (ip->lock.refs == 1 && ip->vp == NULL) + if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL) res = 0; else res = -1; /* stop, someone is using the inode */ @@ -1155,8 +1155,8 @@ hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization) void hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm) { - hammer_unref(&pfsm->lock); - if (pfsm->lock.refs == 0) { + hammer_rel(&pfsm->lock); + if (hammer_norefs(&pfsm->lock)) { RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm); kfree(pfsm, hmp->m_misc); } @@ -1426,7 +1426,7 @@ hammer_rel_inode(struct hammer_inode *ip, int flush) * Handle disposition when dropping the last ref. 
*/ for (;;) { - if (ip->lock.refs == 1) { + if (hammer_oneref(&ip->lock)) { /* * Determine whether on-disk action is needed for * the inode's final disposition. @@ -1435,7 +1435,7 @@ hammer_rel_inode(struct hammer_inode *ip, int flush) hammer_inode_unloadable_check(ip, 0); if (ip->flags & HAMMER_INODE_MODMASK) { hammer_flush_inode(ip, 0); - } else if (ip->lock.refs == 1) { + } else if (hammer_oneref(&ip->lock)) { hammer_unload_inode(ip); break; } @@ -1447,9 +1447,9 @@ hammer_rel_inode(struct hammer_inode *ip, int flush) * The inode still has multiple refs, try to drop * one ref. */ - KKASSERT(ip->lock.refs >= 1); - if (ip->lock.refs > 1) { - hammer_unref(&ip->lock); + KKASSERT(hammer_isactive(&ip->lock) >= 1); + if (hammer_isactive(&ip->lock) > 1) { + hammer_rel(&ip->lock); break; } } @@ -1467,8 +1467,8 @@ hammer_unload_inode(struct hammer_inode *ip) { hammer_mount_t hmp = ip->hmp; - KASSERT(ip->lock.refs == 1, - ("hammer_unload_inode: %d refs\n", ip->lock.refs)); + KASSERT(hammer_oneref(&ip->lock), + ("hammer_unload_inode: %d refs\n", hammer_isactive(&ip->lock))); KKASSERT(ip->vp == NULL); KKASSERT(ip->flush_state == HAMMER_FST_IDLE); KKASSERT(ip->cursor_ip_refs == 0); @@ -1515,7 +1515,7 @@ hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused) --rec->flush_group->refs; else hammer_ref(&rec->lock); - KKASSERT(rec->lock.refs == 1); + KKASSERT(hammer_oneref(&rec->lock)); rec->flush_state = HAMMER_FST_IDLE; rec->flush_group = NULL; rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */ @@ -1538,7 +1538,7 @@ hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused) ip->flush_group = NULL; /* fall through */ case HAMMER_FST_SETUP: - hammer_unref(&ip->lock); + hammer_rel(&ip->lock); ip->flush_state = HAMMER_FST_IDLE; /* fall through */ case HAMMER_FST_IDLE: @@ -2975,7 +2975,7 @@ defer_buffer_flush: while (RB_ROOT(&ip->rec_tree)) { hammer_record_t record = RB_ROOT(&ip->rec_tree); hammer_ref(&record->lock); - KKASSERT(record->lock.refs == 1); + KKASSERT(hammer_oneref(&record->lock)); record->flags |= HAMMER_RECF_DELETED_BE; ++record->ip->rec_generation; hammer_rel_mem_record(record); diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 5842b563ae..0a8539c72b 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -203,8 +203,8 @@ hammer_io_clear_error(struct hammer_io *io) { if (io->ioerror) { io->ioerror = 0; - hammer_unref(&io->lock); - KKASSERT(io->lock.refs > 0); + hammer_rel(&io->lock); + KKASSERT(hammer_isactive(&io->lock)); } } @@ -361,7 +361,7 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset) BUF_KERNPROC(bp); iou->io.reclaim = 1; iou->io.waitdep = 1; - KKASSERT(iou->io.lock.refs == 1); + KKASSERT(hammer_isactive(&iou->io.lock) == 1); hammer_rel_buffer(&iou->buffer, 0); /*hammer_io_deallocate(bp);*/ #endif @@ -380,12 +380,15 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset) /* * This routine is called on the last reference to a hammer structure. - * The io is usually interlocked with io.loading and io.refs must be 1. + * The io must be interlocked with a refcount of zero. The hammer structure + * will remain interlocked on return. * - * This routine may return a non-NULL bp to the caller for dispoal. Disposal - * simply means the caller finishes decrementing the ref-count on the - * IO structure then brelse()'s the bp. The bp may or may not still be - * passively associated with the IO. + * This routine may return a non-NULL bp to the caller for dispoal. 
+ * The caller typically brelse()'s the bp. + * + * The bp may or may not still be passively associated with the IO. It + * will remain passively associated if it is unreleasable (e.g. a modified + * meta-data buffer). * * The only requirement here is that modified meta-data and volume-header * buffer may NOT be disassociated from the IO structure, and consequently @@ -600,7 +603,7 @@ hammer_io_flush(struct hammer_io *io, int reclaim) */ hammer_ref(&io->lock); hammer_io_clear_modify(io, 0); - hammer_unref(&io->lock); + hammer_rel(&io->lock); if (hammer_debug_io & 0x0002) kprintf("hammer io_write %016jx\n", bp->b_bio1.bio_offset); @@ -648,7 +651,7 @@ hammer_io_modify(hammer_io_t io, int count) /* * Shortcut if nothing to do. */ - KKASSERT(io->lock.refs != 0 && io->bp != NULL); + KKASSERT(hammer_isactive(&io->lock) && io->bp != NULL); io->modify_refs += count; if (io->modified && io->released == 0) return; @@ -817,7 +820,7 @@ restart: } } /* caller must still have ref on io */ - KKASSERT(io->lock.refs > 0); + KKASSERT(hammer_isactive(&io->lock)); } /* @@ -924,8 +927,6 @@ hammer_io_complete(struct buf *bp) default: if (iou->io.ioerror == 0) { iou->io.ioerror = 1; - if (iou->io.lock.refs == 0) - ++hammer_count_refedbufs; hammer_ref(&iou->io.lock); } break; @@ -963,10 +964,11 @@ hammer_io_complete(struct buf *bp) /* * If B_LOCKED is set someone wanted to deallocate the bp at some - * point, do it now if refs has become zero. + * point, try to do it now. The operation will fail if there are + * refs or if hammer_io_deallocate() is unable to gain the + * interlock. */ - if ((bp->b_flags & B_LOCKED) && iou->io.lock.refs == 0) { - KKASSERT(iou->io.modified == 0); + if (bp->b_flags & B_LOCKED) { --hammer_count_io_locked; bp->b_flags &= ~B_LOCKED; hammer_io_deallocate(bp); @@ -995,13 +997,21 @@ hammer_io_deallocate(struct buf *bp) hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep); KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0); - if (iou->io.lock.refs > 0 || iou->io.modified) { + if (hammer_try_interlock_norefs(&iou->io.lock) == 0) { + /* + * We cannot safely disassociate a bp from a referenced + * or interlocked HAMMER structure. + */ + bp->b_flags |= B_LOCKED; + ++hammer_count_io_locked; + } else if (iou->io.modified) { /* * It is not legal to disassociate a modified buffer. This * case really shouldn't ever occur. */ bp->b_flags |= B_LOCKED; ++hammer_count_io_locked; + hammer_put_interlock(&iou->io.lock, 0); } else { /* * Disassociate the BP. If the io has no refs left we @@ -1016,6 +1026,7 @@ hammer_io_deallocate(struct buf *bp) TAILQ_INSERT_TAIL(iou->io.mod_list, &iou->io, mod_entry); crit_exit(); } + hammer_put_interlock(&iou->io.lock, 1); } } @@ -1090,7 +1101,7 @@ hammer_io_checkwrite(struct buf *bp) if (io->modify_refs == 0 && io->modified) { hammer_ref(&io->lock); hammer_io_clear_modify(io, 0); - hammer_unref(&io->lock); + hammer_rel(&io->lock); } else if (io->modified) { KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER); } diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index ac4d8e69d2..1d7f19fed7 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -369,9 +369,9 @@ hammer_rel_mem_record(struct hammer_record *record) hammer_inode_t target_ip; int diddrop; - hammer_unref(&record->lock); + hammer_rel(&record->lock); - if (record->lock.refs == 0) { + if (hammer_norefs(&record->lock)) { /* * Upon release of the last reference wakeup any waiters. 
* The record structure may get destroyed so callers will @@ -391,7 +391,7 @@ hammer_rel_mem_record(struct hammer_record *record) if (record->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) { - KKASSERT(ip->lock.refs > 0); + KKASSERT(hammer_isactive(&ip->lock) > 0); KKASSERT(record->flush_state != HAMMER_FST_FLUSH); /* diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index 83c61a2ce0..d71bdab1f3 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -50,6 +50,7 @@ static int hammer_load_volume(hammer_volume_t volume); static int hammer_load_buffer(hammer_buffer_t buffer, int isnew); static int hammer_load_node(hammer_transaction_t trans, hammer_node_t node, int isnew); +static void _hammer_rel_node(hammer_node_t node, int locked); static int hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2) @@ -290,14 +291,14 @@ hammer_unload_volume(hammer_volume_t volume, void *data __unused) */ if (volume->io.ioerror) { volume->io.ioerror = 0; - hammer_unref(&volume->io.lock); + hammer_rel(&volume->io.lock); } /* - * This should release the bp. + * This should release the bp. Releasing the volume with flush set + * implies the interlock is set. */ - KKASSERT(volume->io.lock.refs == 0); - hammer_ref(&volume->io.lock); + hammer_ref_interlock_true(&volume->io.lock); hammer_rel_volume(volume, 1); KKASSERT(volume->io.bp == NULL); @@ -305,7 +306,7 @@ hammer_unload_volume(hammer_volume_t volume, void *data __unused) * There should be no references on the volume, no clusters, and * no super-clusters. */ - KKASSERT(volume->io.lock.refs == 0); + KKASSERT(hammer_norefs(&volume->io.lock)); volume->ondisk = NULL; if (volume->devvp) { @@ -375,18 +376,18 @@ hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp) *errorp = ENOENT; return(NULL); } - hammer_ref(&volume->io.lock); /* - * Deal with on-disk info + * Reference the volume, load/check the data on the 0->1 transition. + * hammer_load_volume() will dispose of the interlock on return, + * and also clean up the ref count on error. */ - if (volume->ondisk == NULL || volume->io.loading) { + if (hammer_ref_interlock(&volume->io.lock)) { *errorp = hammer_load_volume(volume); - if (*errorp) { - hammer_rel_volume(volume, 1); + if (*errorp) volume = NULL; - } } else { + KKASSERT(volume->ondisk); *errorp = 0; } return(volume); @@ -397,16 +398,14 @@ hammer_ref_volume(hammer_volume_t volume) { int error; - hammer_ref(&volume->io.lock); - /* - * Deal with on-disk info + * Reference the volume and deal with the check condition used to + * load its ondisk info. */ - if (volume->ondisk == NULL || volume->io.loading) { + if (hammer_ref_interlock(&volume->io.lock)) { error = hammer_load_volume(volume); - if (error) - hammer_rel_volume(volume, 1); } else { + KKASSERT(volume->ondisk); error = 0; } return (error); @@ -419,18 +418,17 @@ hammer_get_root_volume(struct hammer_mount *hmp, int *errorp) volume = hmp->rootvol; KKASSERT(volume != NULL); - hammer_ref(&volume->io.lock); /* - * Deal with on-disk info + * Reference the volume and deal with the check condition used to + * load its ondisk info. 
*/ - if (volume->ondisk == NULL || volume->io.loading) { + if (hammer_ref_interlock(&volume->io.lock)) { *errorp = hammer_load_volume(volume); - if (*errorp) { - hammer_rel_volume(volume, 1); + if (*errorp) volume = NULL; - } } else { + KKASSERT(volume->ondisk); *errorp = 0; } return (volume); @@ -438,58 +436,46 @@ hammer_get_root_volume(struct hammer_mount *hmp, int *errorp) /* * Load a volume's on-disk information. The volume must be referenced and - * not locked. We temporarily acquire an exclusive lock to interlock - * against releases or multiple get's. + * the interlock is held on call. The interlock will be released on return. + * The reference will also be released on return if an error occurs. */ static int hammer_load_volume(hammer_volume_t volume) { int error; - ++volume->io.loading; - hammer_lock_ex(&volume->io.lock); - if (volume->ondisk == NULL) { error = hammer_io_read(volume->devvp, &volume->io, volume->maxraw_off); - if (error == 0) + if (error == 0) { volume->ondisk = (void *)volume->io.bp->b_data; + hammer_ref_interlock_done(&volume->io.lock); + } else { + hammer_rel_volume(volume, 1); + } } else { error = 0; } - --volume->io.loading; - hammer_unlock(&volume->io.lock); return(error); } /* - * Release a volume. Call hammer_io_release on the last reference. We have - * to acquire an exclusive lock to interlock against volume->ondisk tests - * in hammer_load_volume(), and hammer_io_release() also expects an exclusive - * lock to be held. + * Release a previously acquired reference on the volume. * * Volumes are not unloaded from memory during normal operation. */ void -hammer_rel_volume(hammer_volume_t volume, int flush) +hammer_rel_volume(hammer_volume_t volume, int locked) { - struct buf *bp = NULL; + struct buf *bp; - crit_enter(); - if (volume->io.lock.refs == 1) { - ++volume->io.loading; - hammer_lock_ex(&volume->io.lock); - if (volume->io.lock.refs == 1) { - volume->ondisk = NULL; - bp = hammer_io_release(&volume->io, flush); - } - --volume->io.loading; - hammer_unlock(&volume->io.lock); + if (hammer_rel_interlock(&volume->io.lock, locked)) { + volume->ondisk = NULL; + bp = hammer_io_release(&volume->io, locked); + hammer_rel_interlock_done(&volume->io.lock, locked); + if (bp) + brelse(bp); } - hammer_unref(&volume->io.lock); - if (bp) - brelse(bp); - crit_exit(); } int @@ -537,17 +523,28 @@ again: */ buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root, buf_offset); if (buffer) { - if (buffer->io.lock.refs == 0) - ++hammer_count_refedbufs; - hammer_ref(&buffer->io.lock); - /* * Once refed the ondisk field will not be cleared by - * any other action. + * any other action. Shortcut the operation if the + * ondisk structure is valid. */ - if (buffer->ondisk && buffer->io.loading == 0) { + if (hammer_ref_interlock(&buffer->io.lock) == 0) { + hammer_io_advance(&buffer->io); + KKASSERT(buffer->ondisk); *errorp = 0; + return(buffer); + } + + /* + * 0->1 transition or defered 0->1 transition (CHECK), + * interlock now held. Shortcut if ondisk is already + * assigned. + */ + ++hammer_count_refedbufs; + if (buffer->ondisk) { hammer_io_advance(&buffer->io); + hammer_ref_interlock_done(&buffer->io.lock); + *errorp = 0; return(buffer); } @@ -637,15 +634,16 @@ again: (zone2_offset & HAMMER_OFF_SHORT_MASK); buffer->io.bytes = bytes; TAILQ_INIT(&buffer->clist); - hammer_ref(&buffer->io.lock); + hammer_ref_interlock_true(&buffer->io.lock); /* * Insert the buffer into the RB tree and handle late collisions. 
*/ if (RB_INSERT(hammer_buf_rb_tree, &hmp->rb_bufs_root, buffer)) { hammer_rel_volume(volume, 0); - buffer->io.volume = NULL; /* safety */ - hammer_unref(&buffer->io.lock); /* safety */ + buffer->io.volume = NULL; /* safety */ + if (hammer_rel_interlock(&buffer->io.lock, 1)) /* safety */ + hammer_rel_interlock_done(&buffer->io.lock, 1); --hammer_count_buffers; kfree(buffer, hmp->m_misc); goto again; @@ -654,19 +652,18 @@ again: found: /* - * Deal with on-disk info and loading races. + * The buffer is referenced and interlocked. Load the buffer + * if necessary. hammer_load_buffer() deals with the interlock + * and, if an error is returned, also deals with the ref. */ - if (buffer->ondisk == NULL || buffer->io.loading) { + if (buffer->ondisk == NULL) { *errorp = hammer_load_buffer(buffer, isnew); - if (*errorp) { - hammer_rel_buffer(buffer, 1); + if (*errorp) buffer = NULL; - } else { - hammer_io_advance(&buffer->io); - } } else { - *errorp = 0; hammer_io_advance(&buffer->io); + hammer_ref_interlock_done(&buffer->io.lock); + *errorp = 0; } return(buffer); } @@ -745,7 +742,7 @@ hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset, base_offset); if (buffer) { error = hammer_ref_buffer(buffer); - if (error == 0 && buffer->io.lock.refs != 1) { + if (error == 0 && !hammer_oneref(&buffer->io.lock)) { error = EAGAIN; hammer_rel_buffer(buffer, 0); } @@ -778,6 +775,13 @@ hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset, return (ret_error); } +/* + * Given a referenced and interlocked buffer load/validate the data. + * + * The buffer interlock will be released on return. If an error is + * returned the buffer reference will also be released (and the buffer + * pointer will thus be stale). + */ static int hammer_load_buffer(hammer_buffer_t buffer, int isnew) { @@ -788,8 +792,6 @@ hammer_load_buffer(hammer_buffer_t buffer, int isnew) * Load the buffer's on-disk info */ volume = buffer->io.volume; - ++buffer->io.loading; - hammer_lock_ex(&buffer->io.lock); if (hammer_debug_io & 0x0001) { kprintf("load_buffer %016llx %016llx isnew=%d od=%p\n", @@ -812,8 +814,12 @@ hammer_load_buffer(hammer_buffer_t buffer, int isnew) } else { error = 0; } - --buffer->io.loading; - hammer_unlock(&buffer->io.lock); + if (error == 0) { + hammer_io_advance(&buffer->io); + hammer_ref_interlock_done(&buffer->io.lock); + } else { + hammer_rel_buffer(buffer, 1); + } return (error); } @@ -830,25 +836,24 @@ hammer_unload_buffer(hammer_buffer_t buffer, void *data) { struct hammer_volume *volume = (struct hammer_volume *) data; - if (volume != NULL && volume != buffer->io.volume) { - /* - * We are only interested in unloading buffers of volume, - * so skip it - */ + /* + * If volume != NULL we are only interested in unloading buffers + * associated with a particular volume. + */ + if (volume != NULL && volume != buffer->io.volume) return 0; - } /* * Clean up the persistent ref ioerror might have on the buffer - * and acquire a ref (steal ioerror's if we can). + * and acquire a ref. Expect a 0->1 transition. */ if (buffer->io.ioerror) { buffer->io.ioerror = 0; - } else { - if (buffer->io.lock.refs == 0) - ++hammer_count_refedbufs; - hammer_ref(&buffer->io.lock); + hammer_rel(&buffer->io.lock); + --hammer_count_refedbufs; } + hammer_ref_interlock_true(&buffer->io.lock); + ++hammer_count_refedbufs; /* * We must not flush a dirty buffer to disk on umount. 
It should @@ -860,9 +865,8 @@ hammer_unload_buffer(hammer_buffer_t buffer, void *data) */ hammer_io_clear_modify(&buffer->io, 1); hammer_flush_buffer_nodes(buffer); - KKASSERT(buffer->io.lock.refs == 1); buffer->io.waitdep = 1; - hammer_rel_buffer(buffer, 2); + hammer_rel_buffer(buffer, 1); return(0); } @@ -874,10 +878,13 @@ int hammer_ref_buffer(hammer_buffer_t buffer) { int error; + int locked; - if (buffer->io.lock.refs == 0) - ++hammer_count_refedbufs; - hammer_ref(&buffer->io.lock); + /* + * Acquire a ref, plus the buffer will be interlocked on the + * 0->1 transition. + */ + locked = hammer_ref_interlock(&buffer->io.lock); /* * At this point a biodone() will not touch the buffer other then @@ -888,20 +895,18 @@ hammer_ref_buffer(hammer_buffer_t buffer) */ if (buffer->io.mod_list == &buffer->io.hmp->lose_list) { crit_enter(); - TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, mod_entry); - buffer->io.mod_list = NULL; + if (buffer->io.mod_list == &buffer->io.hmp->lose_list) { + TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, + mod_entry); + buffer->io.mod_list = NULL; + } crit_exit(); } - if (buffer->ondisk == NULL || buffer->io.loading) { + if (locked) { + ++hammer_count_refedbufs; error = hammer_load_buffer(buffer, 0); - if (error) { - hammer_rel_buffer(buffer, 1); - /* - * NOTE: buffer pointer can become stale after - * the above release. - */ - } + /* NOTE: on error the buffer pointer is stale */ } else { error = 0; } @@ -909,8 +914,9 @@ hammer_ref_buffer(hammer_buffer_t buffer) } /* - * Release a buffer. We have to deal with several places where - * another thread can ref the buffer. + * Release a reference on the buffer. On the 1->0 transition the + * underlying IO will be released but the data reference is left + * cached. * * Only destroy the structure itself if the related buffer cache buffer * was disassociated from it. This ties the management of the structure @@ -918,7 +924,7 @@ hammer_ref_buffer(hammer_buffer_t buffer) * embedded io is referenced or not. */ void -hammer_rel_buffer(hammer_buffer_t buffer, int flush) +hammer_rel_buffer(hammer_buffer_t buffer, int locked) { hammer_volume_t volume; hammer_mount_t hmp; @@ -927,42 +933,52 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush) hmp = buffer->io.hmp; - crit_enter(); - if (buffer->io.lock.refs == 1) { - ++buffer->io.loading; /* force interlock check */ - hammer_lock_ex(&buffer->io.lock); - if (buffer->io.lock.refs == 1) { - bp = hammer_io_release(&buffer->io, flush); - - if (buffer->io.lock.refs == 1) - --hammer_count_refedbufs; - - if (buffer->io.bp == NULL && - buffer->io.lock.refs == 1) { - /* - * Final cleanup - * - * NOTE: It is impossible for any associated - * B-Tree nodes to have refs if the buffer - * has no additional refs. - */ - RB_REMOVE(hammer_buf_rb_tree, - &buffer->io.hmp->rb_bufs_root, - buffer); - volume = buffer->io.volume; - buffer->io.volume = NULL; /* sanity */ - hammer_rel_volume(volume, 0); - hammer_io_clear_modlist(&buffer->io); - hammer_flush_buffer_nodes(buffer); - KKASSERT(TAILQ_EMPTY(&buffer->clist)); - freeme = 1; - } - } - --buffer->io.loading; - hammer_unlock(&buffer->io.lock); + if (hammer_rel_interlock(&buffer->io.lock, locked) == 0) + return; + + /* + * hammer_count_refedbufs accounting. Decrement if we are in + * the error path or if CHECK is clear. + * + * If we are not in the error path and CHECK is set the caller + * probably just did a hammer_ref() and didn't account for it, + * so we don't account for the loss here. 
+ */ + if (locked || (buffer->io.lock.refs & HAMMER_REFS_CHECK) == 0) + --hammer_count_refedbufs; + + /* + * If the caller locked us or the normal released transitions + * from 1->0 (and acquired the lock) attempt to release the + * io. If the called locked us we tell hammer_io_release() + * to flush (which would be the unload or failure path). + */ + bp = hammer_io_release(&buffer->io, locked); + + /* + * If the buffer has no bp association and no refs we can destroy + * it. + * + * NOTE: It is impossible for any associated B-Tree nodes to have + * refs if the buffer has no additional refs. + */ + if (buffer->io.bp == NULL && hammer_norefs(&buffer->io.lock)) { + RB_REMOVE(hammer_buf_rb_tree, + &buffer->io.hmp->rb_bufs_root, + buffer); + volume = buffer->io.volume; + buffer->io.volume = NULL; /* sanity */ + hammer_rel_volume(volume, 0); + hammer_io_clear_modlist(&buffer->io); + hammer_flush_buffer_nodes(buffer); + KKASSERT(TAILQ_EMPTY(&buffer->clist)); + freeme = 1; } - hammer_unref(&buffer->io.lock); - crit_exit(); + + /* + * Cleanup + */ + hammer_rel_interlock_done(&buffer->io.lock, locked); if (bp) brelse(bp); if (freeme) { @@ -1115,6 +1131,7 @@ hammer_get_node(hammer_transaction_t trans, hammer_off_t node_offset, { hammer_mount_t hmp = trans->hmp; hammer_node_t node; + int doload; KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE); @@ -1135,34 +1152,41 @@ again: kfree(node, hmp->m_misc); goto again; } - } - hammer_ref(&node->lock); - if (node->ondisk) { - *errorp = 0; - hammer_io_advance(&node->buffer->io); + doload = hammer_ref_interlock_true(&node->lock); } else { + doload = hammer_ref_interlock(&node->lock); + } + if (doload) { *errorp = hammer_load_node(trans, node, isnew); trans->flags |= HAMMER_TRANSF_DIDIO; - } - if (*errorp) { - hammer_rel_node(node); - node = NULL; + if (*errorp) + node = NULL; + } else { + KKASSERT(node->ondisk); + *errorp = 0; + hammer_io_advance(&node->buffer->io); } return(node); } /* - * Reference an already-referenced node. + * Reference an already-referenced node. 0->1 transitions should assert + * so we do not have to deal with hammer_ref() setting CHECK. */ void hammer_ref_node(hammer_node_t node) { - KKASSERT(node->lock.refs > 0 && node->ondisk != NULL); + KKASSERT(hammer_isactive(&node->lock) && node->ondisk != NULL); hammer_ref(&node->lock); } /* - * Load a node's on-disk data reference. + * Load a node's on-disk data reference. Called with the node referenced + * and interlocked. + * + * On return the node interlock will be unlocked. If a non-zero error code + * is returned the node will also be dereferenced (and the caller's pointer + * will be stale). 
*/ static int hammer_load_node(hammer_transaction_t trans, hammer_node_t node, int isnew) @@ -1172,8 +1196,6 @@ hammer_load_node(hammer_transaction_t trans, hammer_node_t node, int isnew) int error; error = 0; - ++node->loading; - hammer_lock_ex(&node->lock); if (node->ondisk == NULL) { /* * This is a little confusing but the jist is that @@ -1230,8 +1252,11 @@ hammer_load_node(hammer_transaction_t trans, hammer_node_t node, int isnew) error = EIO; } failed: - --node->loading; - hammer_unlock(&node->lock); + if (error) { + _hammer_rel_node(node, 1); + } else { + hammer_ref_interlock_done(&node->lock); + } return (error); } @@ -1243,25 +1268,27 @@ hammer_ref_node_safe(hammer_transaction_t trans, hammer_node_cache_t cache, int *errorp) { hammer_node_t node; + int doload; node = cache->node; if (node != NULL) { - hammer_ref(&node->lock); - if (node->ondisk) { + doload = hammer_ref_interlock(&node->lock); + if (doload) { + *errorp = hammer_load_node(trans, node, 0); + if (*errorp) + node = NULL; + } else { + KKASSERT(node->ondisk); if (node->flags & HAMMER_NODE_CRCBAD) { if (trans->flags & HAMMER_TRANSF_CRCDOM) *errorp = EDOM; else *errorp = EIO; + _hammer_rel_node(node, 0); + node = NULL; } else { *errorp = 0; } - } else { - *errorp = hammer_load_node(trans, node, 0); - } - if (*errorp) { - hammer_rel_node(node); - node = NULL; } } else { *errorp = ENOENT; @@ -1272,28 +1299,42 @@ hammer_ref_node_safe(hammer_transaction_t trans, hammer_node_cache_t cache, /* * Release a hammer_node. On the last release the node dereferences * its underlying buffer and may or may not be destroyed. + * + * If locked is non-zero the passed node has been interlocked by the + * caller and we are in the failure/unload path, otherwise it has not and + * we are doing a normal release. + * + * This function will dispose of the interlock and the reference. + * On return the node pointer is stale. */ void -hammer_rel_node(hammer_node_t node) +_hammer_rel_node(hammer_node_t node, int locked) { hammer_buffer_t buffer; /* - * If this isn't the last ref just decrement the ref count and - * return. + * Deref the node. If this isn't the 1->0 transition we're basically + * done. If locked is non-zero this function will just deref the + * locked node and return TRUE, otherwise it will deref the locked + * node and either lock and return TRUE on the 1->0 transition or + * not lock and return FALSE. */ - if (node->lock.refs > 1) { - hammer_unref(&node->lock); + if (hammer_rel_interlock(&node->lock, locked) == 0) return; - } /* - * If there is no ondisk info or no buffer the node failed to load, - * remove the last reference and destroy the node. + * Either locked was non-zero and we are interlocked, or the + * hammer_rel_interlock() call returned non-zero and we are + * interlocked. + * + * The ref-count must still be decremented if locked != 0 so + * the cleanup required still varies a bit. + * + * hammer_flush_node() when called with 1 or 2 will dispose of + * the lock and possible ref-count. */ if (node->ondisk == NULL) { - hammer_unref(&node->lock); - hammer_flush_node(node); + hammer_flush_node(node, locked + 1); /* node is stale now */ return; } @@ -1302,8 +1343,10 @@ hammer_rel_node(hammer_node_t node) * Do not disassociate the node from the buffer if it represents * a modified B-Tree node that still needs its crc to be generated. 
*/ - if (node->flags & HAMMER_NODE_NEEDSCRC) + if (node->flags & HAMMER_NODE_NEEDSCRC) { + hammer_rel_interlock_done(&node->lock, locked); return; + } /* * Do final cleanups and then either destroy the node and leave it @@ -1313,20 +1356,27 @@ hammer_rel_node(hammer_node_t node) node->ondisk = NULL; if ((node->flags & HAMMER_NODE_FLUSH) == 0) { - hammer_unref(&node->lock); - hammer_rel_buffer(buffer, 0); - return; - } + /* + * Normal release. + */ + hammer_rel_interlock_done(&node->lock, locked); + } else { + /* + * Destroy the node. + */ + hammer_flush_node(node, locked + 1); + /* node is stale */ - /* - * Destroy the node. - */ - hammer_unref(&node->lock); - hammer_flush_node(node); - /* node is stale */ + } hammer_rel_buffer(buffer, 0); } +void +hammer_rel_node(hammer_node_t node) +{ + _hammer_rel_node(node, 0); +} + /* * Free space on-media associated with a B-Tree node. */ @@ -1371,26 +1421,40 @@ hammer_uncache_node(hammer_node_cache_t cache) TAILQ_REMOVE(&node->cache_list, cache, entry); cache->node = NULL; if (TAILQ_EMPTY(&node->cache_list)) - hammer_flush_node(node); + hammer_flush_node(node, 0); } } /* * Remove a node's cache references and destroy the node if it has no * other references or backing store. + * + * locked == 0 Normal unlocked operation + * locked == 1 Call hammer_rel_interlock_done(..., 0); + * locked == 2 Call hammer_rel_interlock_done(..., 1); + * + * XXX for now this isn't even close to being MPSAFE so the refs check + * is sufficient. */ void -hammer_flush_node(hammer_node_t node) +hammer_flush_node(hammer_node_t node, int locked) { hammer_node_cache_t cache; hammer_buffer_t buffer; hammer_mount_t hmp = node->hmp; + int dofree; while ((cache = TAILQ_FIRST(&node->cache_list)) != NULL) { TAILQ_REMOVE(&node->cache_list, cache, entry); cache->node = NULL; } - if (node->lock.refs == 0 && node->ondisk == NULL) { + + /* + * NOTE: refs is predisposed if another thread is blocking and + * will be larger than 0 in that case. We aren't MPSAFE + * here. + */ + if (node->ondisk == NULL && hammer_norefs(&node->lock)) { KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0); RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node); if ((buffer = node->buffer) != NULL) { @@ -1398,6 +1462,21 @@ hammer_flush_node(hammer_node_t node) TAILQ_REMOVE(&buffer->clist, node, entry); /* buffer is unreferenced because ondisk is NULL */ } + dofree = 1; + } else { + dofree = 0; + } + + /* + * Deal with the interlock if locked == 1 or locked == 2. 
+ */ + if (locked) + hammer_rel_interlock_done(&node->lock, locked - 1); + + /* + * Destroy if requested + */ + if (dofree) { --hammer_count_nodes; kfree(node, hmp->m_misc); } @@ -1419,12 +1498,11 @@ hammer_flush_buffer_nodes(hammer_buffer_t buffer) KKASSERT(node->ondisk == NULL); KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0); - if (node->lock.refs == 0) { + if (hammer_try_interlock_norefs(&node->lock)) { hammer_ref(&node->lock); node->flags |= HAMMER_NODE_FLUSH; - hammer_rel_node(node); + _hammer_rel_node(node, 1); } else { - KKASSERT(node->loading != 0); KKASSERT(node->buffer != NULL); buffer = node->buffer; node->buffer = NULL; diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c index cd7d082ebc..a5bd012681 100644 --- a/sys/vfs/hammer/hammer_recover.c +++ b/sys/vfs/hammer/hammer_recover.c @@ -963,6 +963,7 @@ int hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data) { int final = *(int *)data; + int flush; if (buffer->io.recovered) { buffer->io.recovered = 0; @@ -975,16 +976,17 @@ hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data) } hammer_rel_buffer(buffer, 0); } else { - if (buffer->io.lock.refs == 0) + flush = hammer_ref_interlock(&buffer->io.lock); + if (flush) ++hammer_count_refedbufs; - hammer_ref(&buffer->io.lock); + if (final < 0) { hammer_io_clear_error(&buffer->io); hammer_io_clear_modify(&buffer->io, 1); } - KKASSERT(buffer->io.lock.refs == 1); + KKASSERT(hammer_oneref(&buffer->io.lock)); buffer->io.reclaim = 1; - hammer_rel_buffer(buffer, 1); + hammer_rel_buffer(buffer, flush); } return(0); } diff --git a/sys/vfs/hammer/hammer_subs.c b/sys/vfs/hammer/hammer_subs.c index 267159a8fa..39f1a59c9c 100644 --- a/sys/vfs/hammer/hammer_subs.c +++ b/sys/vfs/hammer/hammer_subs.c @@ -47,30 +47,31 @@ hammer_lock_ex_ident(struct hammer_lock *lock, const char *ident) u_int lv; u_int nlv; - KKASSERT(lock->refs > 0); + KKASSERT(lock->refs); for (;;) { lv = lock->lockval; if (lv == 0) { nlv = 1 | HAMMER_LOCKF_EXCLUSIVE; if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { - lock->owner = td; + lock->lowner = td; break; } - } else if ((lv & HAMMER_LOCKF_EXCLUSIVE) && lock->owner == td) { + } else if ((lv & HAMMER_LOCKF_EXCLUSIVE) && + lock->lowner == td) { nlv = (lv + 1); if (atomic_cmpset_int(&lock->lockval, lv, nlv)) break; } else { if (hammer_debug_locks) { kprintf("hammer_lock_ex: held by %p\n", - lock->owner); + lock->lowner); } nlv = lv | HAMMER_LOCKF_WANTED; ++hammer_contention_count; - tsleep_interlock(lock, 0); + tsleep_interlock(&lock->lockval, 0); if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { - tsleep(lock, PINTERLOCKED, ident, 0); + tsleep(&lock->lockval, PINTERLOCKED, ident, 0); if (hammer_debug_locks) kprintf("hammer_lock_ex: try again\n"); } @@ -89,18 +90,19 @@ hammer_lock_ex_try(struct hammer_lock *lock) u_int lv; u_int nlv; - KKASSERT(lock->refs > 0); + KKASSERT(lock->refs); for (;;) { lv = lock->lockval; if (lv == 0) { nlv = 1 | HAMMER_LOCKF_EXCLUSIVE; if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { - lock->owner = td; + lock->lowner = td; error = 0; break; } - } else if ((lv & HAMMER_LOCKF_EXCLUSIVE) && lock->owner == td) { + } else if ((lv & HAMMER_LOCKF_EXCLUSIVE) && + lock->lowner == td) { nlv = (lv + 1); if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { error = 0; @@ -126,8 +128,9 @@ hammer_lock_sh(struct hammer_lock *lock) thread_t td = curthread; u_int lv; u_int nlv; + const char *ident = "hmrlck"; - KKASSERT(lock->refs > 0); + KKASSERT(lock->refs); for (;;) { lv = lock->lockval; @@ -135,7 
+138,7 @@ hammer_lock_sh(struct hammer_lock *lock) nlv = (lv + 1); if (atomic_cmpset_int(&lock->lockval, lv, nlv)) break; - } else if (lock->owner == td) { + } else if (lock->lowner == td) { /* * Disallowed case, drop into kernel debugger for * now. A cont continues w/ an exclusive lock. @@ -149,10 +152,9 @@ hammer_lock_sh(struct hammer_lock *lock) } else { nlv = lv | HAMMER_LOCKF_WANTED; ++hammer_contention_count; - tsleep_interlock(lock, 0); - if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { - tsleep(lock, PINTERLOCKED, "hmrlck", 0); - } + tsleep_interlock(&lock->lockval, 0); + if (atomic_cmpset_int(&lock->lockval, lv, nlv)) + tsleep(&lock->lockval, PINTERLOCKED, ident, 0); } } } @@ -165,7 +167,7 @@ hammer_lock_sh_try(struct hammer_lock *lock) u_int nlv; int error; - KKASSERT(lock->refs > 0); + KKASSERT(lock->refs); for (;;) { lv = lock->lockval; @@ -175,7 +177,7 @@ hammer_lock_sh_try(struct hammer_lock *lock) error = 0; break; } - } else if (lock->owner == td) { + } else if (lock->lowner == td) { /* * Disallowed case, drop into kernel debugger for * now. A cont continues w/ an exclusive lock. @@ -217,12 +219,12 @@ hammer_lock_upgrade(struct hammer_lock *lock) if ((lv & ~HAMMER_LOCKF_WANTED) == 1) { nlv = lv | HAMMER_LOCKF_EXCLUSIVE; if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { - lock->owner = td; + lock->lowner = td; error = 0; break; } } else if (lv & HAMMER_LOCKF_EXCLUSIVE) { - if (lock->owner != curthread) + if (lock->lowner != curthread) panic("hammer_lock_upgrade: illegal state"); error = 0; break; @@ -251,19 +253,19 @@ hammer_lock_downgrade(struct hammer_lock *lock) KKASSERT((lock->lockval & ~HAMMER_LOCKF_WANTED) == (HAMMER_LOCKF_EXCLUSIVE | 1)); - KKASSERT(lock->owner == td); + KKASSERT(lock->lowner == td); /* * NOTE: Must clear owner before releasing exclusivity */ - lock->owner = NULL; + lock->lowner = NULL; for (;;) { lv = lock->lockval; nlv = lv & ~(HAMMER_LOCKF_EXCLUSIVE | HAMMER_LOCKF_WANTED); if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { if (lv & HAMMER_LOCKF_WANTED) - wakeup(lock); + wakeup(&lock->lockval); break; } } @@ -279,7 +281,7 @@ hammer_unlock(struct hammer_lock *lock) lv = lock->lockval; KKASSERT(lv != 0); if (lv & HAMMER_LOCKF_EXCLUSIVE) - KKASSERT(lock->owner == td); + KKASSERT(lock->lowner == td); for (;;) { lv = lock->lockval; @@ -291,10 +293,10 @@ hammer_unlock(struct hammer_lock *lock) } else if (nlv == 1) { nlv = 0; if (lv & HAMMER_LOCKF_EXCLUSIVE) - lock->owner = NULL; + lock->lowner = NULL; if (atomic_cmpset_int(&lock->lockval, lv, nlv)) { if (lv & HAMMER_LOCKF_WANTED) - wakeup(lock); + wakeup(&lock->lockval); break; } } else { @@ -319,18 +321,418 @@ hammer_lock_status(struct hammer_lock *lock) panic("hammer_lock_status: lock must be held: %p", lock); } +/* + * Bump the ref count for a lock (not the excl/share count, but a separate + * structural reference count). The CHECK flag will be set on a 0->1 + * transition. + * + * This function does nothing to serialize races between multple threads. + * The caller can interlock it later on to deal with serialization. 
+ * + * MPSAFE + */ void hammer_ref(struct hammer_lock *lock) { - KKASSERT(lock->refs >= 0); - atomic_add_int(&lock->refs, 1); + u_int lv; + u_int nlv; + + for (;;) { + lv = lock->refs; + if ((lv & ~HAMMER_REFS_FLAGS) == 0) { + nlv = (lv + 1) | HAMMER_REFS_CHECK; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + return; + } else { + nlv = (lv + 1); + KKASSERT((int)nlv > 0); + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + return; + } + } + /* not reached */ +} + +/* + * Drop the ref count for a lock (not the excl/share count, but a separate + * structural reference count). The CHECK flag will be cleared on a 1->0 + * transition. + * + * This function does nothing to serialize races between multple threads. + * + * MPSAFE + */ +void +hammer_rel(struct hammer_lock *lock) +{ + u_int lv; + u_int nlv; + + for (;;) { + lv = lock->refs; + if ((lv & ~HAMMER_REFS_FLAGS) == 1) { + nlv = (lv - 1) & ~HAMMER_REFS_CHECK; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + return; + } else { + KKASSERT((int)lv > 0); + nlv = (lv - 1); + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + return; + } + } + /* not reached */ +} + +/* + * The hammer_*_interlock() and hammer_*_interlock_done() functions are + * more sophisticated versions which handle MP transition races and block + * when necessary. + * + * hammer_ref_interlock() bumps the ref-count and conditionally acquires + * the interlock for 0->1 transitions or if the CHECK is found to be set. + * + * This case will return TRUE, the interlock will be held, and the CHECK + * bit also set. Other threads attempting to ref will see the CHECK bit + * and block until we clean up. + * + * FALSE is returned for transitions other than 0->1 when the CHECK bit + * is not found to be set, or if the function loses the race with another + * thread. + * + * TRUE is only returned to one thread and the others will block. + * Effectively a TRUE indicator means 'someone transitioned 0->1 + * and you are the first guy to successfully lock it after that, so you + * need to check'. Due to races the ref-count may be greater than 1 upon + * return. + * + * MPSAFE + */ +int +hammer_ref_interlock(struct hammer_lock *lock) +{ + u_int lv; + u_int nlv; + + /* + * Integrated reference count bump, lock, and check, with hot-path. + * + * (a) Return 1 (+LOCKED, +CHECK) 0->1 transition + * (b) Return 0 (-LOCKED, -CHECK) N->N+1 transition + * (c) Break out (+CHECK) Check condition and Cannot lock + * (d) Return 1 (+LOCKED, +CHECK) Successfully locked + */ + for (;;) { + lv = lock->refs; + if (lv == 0) { + nlv = 1 | HAMMER_REFS_LOCKED | HAMMER_REFS_CHECK; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + lock->rowner = curthread; + return(1); + } + } else { + nlv = (lv + 1); + if ((lv & ~HAMMER_REFS_FLAGS) == 0) + nlv |= HAMMER_REFS_CHECK; + if ((nlv & HAMMER_REFS_CHECK) == 0) { + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + return(0); + } else if (lv & HAMMER_REFS_LOCKED) { + /* CHECK also set here */ + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + break; + } else { + /* CHECK also set here */ + nlv |= HAMMER_REFS_LOCKED; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + lock->rowner = curthread; + return(1); + } + } + } + } + + /* + * Defered check condition because we were unable to acquire the + * lock. We must block until the check condition is cleared due + * to a race with another thread, or we are able to acquire the + * lock. + * + * (a) Return 0 (-CHECK) Another thread handled it + * (b) Return 1 (+LOCKED, +CHECK) We handled it. 
+ */ + for (;;) { + lv = lock->refs; + if ((lv & HAMMER_REFS_CHECK) == 0) + return(0); + if (lv & HAMMER_REFS_LOCKED) { + tsleep_interlock(&lock->refs, 0); + nlv = (lv | HAMMER_REFS_WANTED); + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + tsleep(&lock->refs, PINTERLOCKED, "h1lk", 0); + } else { + /* CHECK also set here */ + nlv = lv | HAMMER_REFS_LOCKED; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + lock->rowner = curthread; + return(1); + } + } + } + /* not reached */ +} + +/* + * This is the same as hammer_ref_interlock() but asserts that the + * 0->1 transition is always true, thus the lock must have no references + * on entry or have CHECK set, and will have one reference with the + * interlock held on return. It must also not be interlocked on entry + * by anyone. + * + * NOTE that CHECK will never be found set when the ref-count is 0. + * + * TRUE is always returned to match the API for hammer_ref_interlock(). + * This function returns with one ref, the lock held, and the CHECK bit set. + */ +int +hammer_ref_interlock_true(struct hammer_lock *lock) +{ + u_int lv; + u_int nlv; + + for (;;) { + lv = lock->refs; + + if (lv) { + panic("hammer_ref_interlock_true: bad lock %p %08x\n", + lock, lock->refs); + } + nlv = 1 | HAMMER_REFS_LOCKED | HAMMER_REFS_CHECK; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + lock->rowner = curthread; + return (1); + } + } +} + +/* + * Unlock the interlock acquired by hammer_ref_interlock() and clear the + * CHECK flag. The ref-count remains unchanged. + * + * This routine is called in the load path when the load succeeds. + */ +void +hammer_ref_interlock_done(struct hammer_lock *lock) +{ + u_int lv; + u_int nlv; + + for (;;) { + lv = lock->refs; + nlv = lv & ~HAMMER_REFS_FLAGS; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + if (lv & HAMMER_REFS_WANTED) + wakeup(&lock->refs); + break; + } + } +} + +/* + * hammer_rel_interlock() works a bit differently in that it must + * acquire the lock in tandem with a 1->0 transition. CHECK is + * not used. + * + * TRUE is returned on 1->0 transitions with the lock held on return + * and FALSE is returned otherwise with the lock not held. + * + * It is important to note that the refs are not stable and may + * increase while we hold the lock, the TRUE indication only means + * that we transitioned 1->0, not necessarily that we stayed at 0. + * + * Another thread bumping refs while we hold the lock will set CHECK, + * causing one of the competing hammer_ref_interlock() calls to + * return TRUE after we release our lock. + * + * MPSAFE + */ +int +hammer_rel_interlock(struct hammer_lock *lock, int locked) +{ + u_int lv; + u_int nlv; + + /* + * In locked mode (failure/unload path) we release the + * ref-count but leave it locked. + */ + if (locked) { + hammer_rel(lock); + return(1); + } + + /* + * Integrated reference count drop with LOCKED, plus the hot-path + * returns. 
+ */ + for (;;) { + lv = lock->refs; + + if (lv == 1) { + nlv = 0 | HAMMER_REFS_LOCKED; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + lock->rowner = curthread; + return(1); + } + } else if ((lv & ~HAMMER_REFS_FLAGS) == 1) { + if ((lv & HAMMER_REFS_LOCKED) == 0) { + nlv = (lv - 1) | HAMMER_REFS_LOCKED; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + lock->rowner = curthread; + return(1); + } + } else { + nlv = lv | HAMMER_REFS_WANTED; + tsleep_interlock(&lock->refs, 0); + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + tsleep(&lock->refs, PINTERLOCKED, + "h0lk", 0); + } + } + } else { + nlv = (lv - 1); + KKASSERT((int)nlv >= 0); + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + return(0); + } + } + /* not reached */ } +/* + * Unlock the interlock acquired by hammer_rel_interlock(). + * + * If orig_locked is non-zero the interlock was originally held prior to + * the hammer_rel_interlock() call and passed through to us. In this + * case we want to retain the CHECK error state if not transitioning + * to 0. + * + * The code is the same either way so we do not have to conditionalize + * on orig_locked. + */ void -hammer_unref(struct hammer_lock *lock) +hammer_rel_interlock_done(struct hammer_lock *lock, int orig_locked __unused) { - KKASSERT(lock->refs > 0); - atomic_subtract_int(&lock->refs, 1); + u_int lv; + u_int nlv; + + for (;;) { + lv = lock->refs; + nlv = lv & ~(HAMMER_REFS_LOCKED | HAMMER_REFS_WANTED); + if ((lv & ~HAMMER_REFS_FLAGS) == 0) + nlv &= ~HAMMER_REFS_CHECK; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + if (lv & HAMMER_REFS_WANTED) + wakeup(&lock->refs); + break; + } + } +} + +/* + * Acquire the interlock on lock->refs. + * + * Return TRUE if CHECK is currently set. Note that CHECK will not + * be set if the reference count is 0, but can get set if this function + * is preceeded by, say, hammer_ref(), or through races with other + * threads. The return value allows the caller to use the same logic + * as hammer_ref_interlock(). + * + * MPSAFE + */ +int +hammer_get_interlock(struct hammer_lock *lock) +{ + u_int lv; + u_int nlv; + + for (;;) { + lv = lock->refs; + if (lv & HAMMER_REFS_LOCKED) { + nlv = lv | HAMMER_REFS_WANTED; + tsleep_interlock(&lock->refs, 0); + if (atomic_cmpset_int(&lock->refs, lv, nlv)) + tsleep(&lock->refs, PINTERLOCKED, "hilk", 0); + } else { + nlv = (lv | HAMMER_REFS_LOCKED); + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + lock->rowner = curthread; + return((lv & HAMMER_REFS_CHECK) ? 1 : 0); + } + } + } +} + +/* + * Attempt to acquire the interlock and expect 0 refs. Used by the buffer + * cache callback code to disassociate or lock the bufs related to HAMMER + * structures. + * + * During teardown the related bp will be acquired by hammer_io_release() + * which interocks our test. + * + * Returns non-zero on success, zero on failure. + */ +int +hammer_try_interlock_norefs(struct hammer_lock *lock) +{ + u_int lv; + u_int nlv; + + for (;;) { + lv = lock->refs; + if (lv == 0) { + nlv = lv | HAMMER_REFS_LOCKED; + if (atomic_cmpset_int(&lock->refs, lv, nlv)) { + lock->rowner = curthread; + return(1); + } + } else { + return(0); + } + } + /* not reached */ +} + +/* + * Release the interlock on lock->refs. This function will set + * CHECK if the refs is non-zero and error is non-zero, and clear + * CHECK otherwise. 
+ *
+ * MPSAFE
+ */
+void
+hammer_put_interlock(struct hammer_lock *lock, int error)
+{
+	u_int lv;
+	u_int nlv;
+
+	for (;;) {
+		lv = lock->refs;
+		KKASSERT(lv & HAMMER_REFS_LOCKED);
+		nlv = lv & ~(HAMMER_REFS_LOCKED | HAMMER_REFS_WANTED);
+
+		if ((nlv & ~HAMMER_REFS_FLAGS) == 0 || error == 0)
+			nlv &= ~HAMMER_REFS_CHECK;
+		else
+			nlv |= HAMMER_REFS_CHECK;
+
+		if (atomic_cmpset_int(&lock->refs, lv, nlv)) {
+			if (lv & HAMMER_REFS_WANTED)
+				wakeup(&lock->refs);
+			return;
+		}
+	}
 }
 
 /*