From bcac4bbbd5f3273e9992610b0eb080dac518985a Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 18 Jun 2008 01:13:30 +0000 Subject: [PATCH] HAMMER 56B/Many: Performance tuning - MEDIA STRUCTURES CHANGED! * MEDIA CHANGE: The atime has been moved back into the inode data proper. The nlinks field has also been moved. * PERFORMANCE: The CRC for cached B-Tree nodes was being run on every access instead of just the first time. This was the cause of HAMMER's poor directory scanning performance and CPU-intensive write flushes. Adjusted to only check the CRC on the initial load into the buffer cache. * PERFORMANCE: The CRC for modified B-Tree nodes was being regenerated every time the node was modified, so a large number of insertions or deletions modifying the same B-Tree node needlessly regenerated the CRC each time. Adjusted to delay generation of the CRC until just before the buffer is flushed to the physical media. Just for the record, B-Tree nodes are 4K and it takes ~25us to run a CRC on them. Needless to say, removing the unnecessary calls solved a lot of performance issues. * PERFORMANCE: Removed limitations in the node caching algorithms. Now more than one inode can cache pointers to the same B-Tree node. * PERFORMANCE: When calculating the parent B-Tree node we have to scan the element array to locate the index that points back to the child. Use a power-of-2 algorithm instead of a linear scan. * PERFORMANCE: Clean up the selection of ip->cache[0] or ip->cache[1] based on whether we are trying to cache the location of the inode or the location of the file object's data. 
--- sys/vfs/hammer/hammer.h | 67 +++++++++++++++++----- sys/vfs/hammer/hammer_btree.c | 7 +-- sys/vfs/hammer/hammer_btree.h | 4 +- sys/vfs/hammer/hammer_cursor.c | 38 ++++++------ sys/vfs/hammer/hammer_disk.h | 11 ++-- sys/vfs/hammer/hammer_inode.c | 57 ++++++++++++------ sys/vfs/hammer/hammer_io.c | 25 +++++++- sys/vfs/hammer/hammer_object.c | 22 +++---- sys/vfs/hammer/hammer_ondisk.c | 111 ++++++++++++++++-------------------- sys/vfs/hammer/hammer_transaction.c | 9 ++- sys/vfs/hammer/hammer_vfsops.c | 4 +- sys/vfs/hammer/hammer_vnops.c | 58 ++++++++++--------- 12 files changed, 246 insertions(+), 167 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 6406e5a47d..8cd6b55fc6 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.85 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.86 2008/06/18 01:13:30 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -66,6 +67,17 @@ MALLOC_DECLARE(M_HAMMER); +/* + * Kernel trace + */ +#if !defined(KTR_HAMMER) +#define KTR_HAMMER KTR_ALL +#endif +KTR_INFO_MASTER_EXTERN(hammer); + +/* + * Misc structures + */ struct hammer_mount; /* @@ -166,6 +178,17 @@ typedef struct hammer_objid_cache { } *hammer_objid_cache_t; /* + * Associate an inode with a B-Tree node to cache search start positions + */ +typedef struct hammer_node_cache { + TAILQ_ENTRY(hammer_node_cache) entry; + struct hammer_node *node; + struct hammer_inode *ip; +} *hammer_node_cache_t; + +TAILQ_HEAD(hammer_node_cache_list, hammer_node_cache); + +/* * Structure used to represent an inode in-memory. 
* * The record and data associated with an inode may be out of sync with @@ -217,7 +240,7 @@ struct hammer_inode { struct hammer_btree_leaf_elm ino_leaf; /* in-memory cache */ struct hammer_inode_data ino_data; /* in-memory cache */ struct hammer_rec_rb_tree rec_tree; /* in-memory cache */ - struct hammer_node *cache[2]; /* search initiate cache */ + struct hammer_node_cache cache[2]; /* search initiate cache */ /* * When a demark is created to synchronize an inode to @@ -412,6 +435,7 @@ struct hammer_io { u_int recovered : 1; /* has recovery ref */ u_int waitmod : 1; /* waiting for modify_refs */ u_int reclaim : 1; /* reclaim requested */ + u_int gencrc : 1; /* crc needs to be generated */ }; typedef struct hammer_io *hammer_io_t; @@ -480,14 +504,15 @@ struct hammer_node { struct hammer_mount *hmp; struct hammer_buffer *buffer; /* backing buffer */ hammer_node_ondisk_t ondisk; /* ptr to on-disk structure */ - struct hammer_node **cache1; /* passive cache(s) */ - struct hammer_node **cache2; + struct hammer_node_cache_list cache_list; /* passive caches */ int flags; int loading; /* load interlock */ }; #define HAMMER_NODE_DELETED 0x0001 #define HAMMER_NODE_FLUSH 0x0002 +#define HAMMER_NODE_CRCGOOD 0x0004 +#define HAMMER_NODE_NEEDSCRC 0x0008 typedef struct hammer_node *hammer_node_t; @@ -702,9 +727,8 @@ int hammer_vop_inactive(struct vop_inactive_args *); int hammer_vop_reclaim(struct vop_reclaim_args *); int hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp); struct hammer_inode *hammer_get_inode(hammer_transaction_t trans, - struct hammer_node **cache, - u_int64_t obj_id, hammer_tid_t asof, int flags, - int *errorp); + hammer_inode_t dip, u_int64_t obj_id, + hammer_tid_t asof, int flags, int *errorp); void hammer_put_inode(struct hammer_inode *ip); void hammer_put_inode_ref(struct hammer_inode *ip); @@ -776,7 +800,7 @@ int64_t hammer_directory_namekey(void *name, int len); int hammer_nohistory(hammer_inode_t ip); int 
hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor, - struct hammer_node **cache, hammer_inode_t ip); + hammer_node_cache_t cache, hammer_inode_t ip); int hammer_reinit_cursor(hammer_cursor_t cursor); void hammer_normalize_cursor(hammer_cursor_t cursor); void hammer_done_cursor(hammer_cursor_t cursor); @@ -801,7 +825,7 @@ int btree_set_parent(hammer_transaction_t trans, hammer_node_t node, int hammer_btree_lock_children(hammer_cursor_t cursor, struct hammer_node_locklist **locklistp); void hammer_btree_unlock_children(struct hammer_node_locklist **locklistp); - +int hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node); void hammer_print_btree_node(hammer_node_ondisk_t ondisk); void hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i); @@ -833,13 +857,13 @@ hammer_node_t hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset, int isnew, int *errorp); void hammer_ref_node(hammer_node_t node); hammer_node_t hammer_ref_node_safe(struct hammer_mount *hmp, - struct hammer_node **cache, int *errorp); + hammer_node_cache_t cache, int *errorp); void hammer_rel_node(hammer_node_t node); void hammer_delete_node(hammer_transaction_t trans, hammer_node_t node); -void hammer_cache_node(hammer_node_t node, - struct hammer_node **cache); -void hammer_uncache_node(struct hammer_node **cache); +void hammer_cache_node(hammer_node_cache_t cache, + hammer_node_t node); +void hammer_uncache_node(hammer_node_cache_t cache); void hammer_flush_node(hammer_node_t node); void hammer_dup_buffer(struct hammer_buffer **bufferp, @@ -992,6 +1016,9 @@ hammer_lock_ex(struct hammer_lock *lock) hammer_lock_ex_ident(lock, "hmrlck"); } +/* + * Indicate that a B-Tree node is being modified. 
+ */ static __inline void hammer_modify_node_noundo(hammer_transaction_t trans, hammer_node_t node) { @@ -1020,10 +1047,22 @@ hammer_modify_node(hammer_transaction_t trans, hammer_node_t node, --node->buffer->io.modify_refs; /* only want one ref */ } +/* + * Indicate that the specified modifications have been completed. + * + * Do not try to generate the crc here, it's very expensive to do and a + * sequence of insertions or deletions can result in many calls to this + * function on the same node. + */ static __inline void hammer_modify_node_done(hammer_node_t node) { - node->ondisk->crc = crc32(&node->ondisk->crc + 1, HAMMER_BTREE_CRCSIZE); + node->flags |= HAMMER_NODE_CRCGOOD; + if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0) { + node->flags |= HAMMER_NODE_NEEDSCRC; + node->buffer->io.gencrc = 1; + hammer_ref_node(node); + } hammer_modify_buffer_done(node->buffer); } diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 9fdd1ec7d4..22da7d344c 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.54 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.55 2008/06/18 01:13:30 dillon Exp $ */ /* @@ -83,8 +83,6 @@ #include static int btree_search(hammer_cursor_t cursor, int flags); -static int hammer_btree_search_node(hammer_base_elm_t elm, - hammer_node_ondisk_t node); static int btree_split_internal(hammer_cursor_t cursor); static int btree_split_leaf(hammer_cursor_t cursor); static int btree_remove(hammer_cursor_t cursor); @@ -1259,7 +1257,7 @@ done: * return an index whos compare result is > 1 but may only return an index * whos compare result is <= 1 if it is the first element with that result. 
*/ -static int +int hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node) { int b; @@ -2406,7 +2404,6 @@ hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) elm->internal.subtree_offset); break; case HAMMER_BTREE_TYPE_RECORD: - kprintf("\tatime = %016llx\n", elm->leaf.atime); kprintf("\tdata_offset = %016llx\n", elm->leaf.data_offset); kprintf("\tdata_len = %08x\n", elm->leaf.data_len); kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); diff --git a/sys/vfs/hammer/hammer_btree.h b/sys/vfs/hammer/hammer_btree.h index 72054b65ed..e0b4004660 100644 --- a/sys/vfs/hammer/hammer_btree.h +++ b/sys/vfs/hammer/hammer_btree.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.17 2008/06/10 22:30:21 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.18 2008/06/18 01:13:30 dillon Exp $ */ /* @@ -144,7 +144,7 @@ struct hammer_btree_internal_elm { */ struct hammer_btree_leaf_elm { struct hammer_base_elm base; - hammer_off_t atime; /* access time */ + hammer_off_t unused00; /* access time */ hammer_off_t data_offset; int32_t data_len; hammer_crc_t data_crc; diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c index fb152ec117..22889f72c0 100644 --- a/sys/vfs/hammer/hammer_cursor.c +++ b/sys/vfs/hammer/hammer_cursor.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.30 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.31 2008/06/18 01:13:30 dillon Exp $ */ /* @@ -47,7 +47,7 @@ static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive); */ int hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor, - struct hammer_node **cache, hammer_inode_t ip) + hammer_node_cache_t cache, hammer_inode_t ip) { hammer_volume_t volume; hammer_node_t node; @@ -72,7 +72,7 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor, /* * Step 1 - acquire a locked node from the cache if possible */ - if (cache && *cache) { + if (cache && cache->node) { node = hammer_ref_node_safe(trans->hmp, cache, &error); if (error == 0) { hammer_lock_sh(&node->lock); @@ -130,24 +130,6 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor, return(error); } -#if 0 -int -hammer_reinit_cursor(hammer_cursor_t cursor) -{ - hammer_transaction_t trans; - hammer_inode_t ip; - struct hammer_node **cache; - - trans = cursor->trans; - ip = cursor->ip; - hammer_done_cursor(cursor); - cache = ip ? &ip->cache[0] : NULL; - error = hammer_init_cursor(trans, cursor, cache, ip); - return (error); -} - -#endif - /* * Normalize a cursor. Sometimes cursors can be left in a state * where node is NULL. If the cursor is in this state, cursor up. @@ -345,12 +327,24 @@ hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive) } KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0); elm = NULL; - for (i = 0; i < parent->ondisk->count; ++i) { + + /* + * Locate the parent index to the child node as quickly + * as possible. 
+ */ + if (node->ondisk->count) { + i = hammer_btree_search_node( + &node->ondisk->elms[0].base, node->ondisk); + } else { + i = 0; + } + while (i < parent->ondisk->count) { elm = &parent->ondisk->elms[i]; if (parent->ondisk->elms[i].internal.subtree_offset == node->node_offset) { break; } + ++i; } if (i == parent->ondisk->count) { hammer_unlock(&parent->lock); diff --git a/sys/vfs/hammer/hammer_disk.h b/sys/vfs/hammer/hammer_disk.h index 802d0ba917..2b1c659e62 100644 --- a/sys/vfs/hammer/hammer_disk.h +++ b/sys/vfs/hammer/hammer_disk.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.37 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.38 2008/06/18 01:13:30 dillon Exp $ */ #ifndef VFS_HAMMER_DISK_H_ @@ -580,16 +580,19 @@ struct hammer_inode_data { u_int8_t reserved01; u_int16_t reserved02; u_int32_t reserved03; - u_int64_t mtime; - u_int64_t size; /* filesystem object size */ u_int64_t nlinks; /* hard links */ - u_int64_t reserved04; + u_int64_t size; /* filesystem object size */ + u_int64_t mtime; + u_int64_t atime; /* atime must be just after mtime */ union { char reserved06[24]; char symlink[24]; /* HAMMER_INODE_BASESYMLEN */ } ext; }; +#define HAMMER_ITIMES_BASE(ino_data) (&(ino_data)->mtime) +#define HAMMER_ITIMES_BYTES (sizeof(u_int64_t) * 2) + #define HAMMER_INODE_DATA_VERSION 1 #define HAMMER_OBJID_ROOT 1 #define HAMMER_INODE_BASESYMLEN 24 diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index ea6906399d..f9da33456b 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.76 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.77 2008/06/18 01:13:30 dillon Exp $ */ #include "hammer.h" @@ -259,7 +259,7 @@ hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp) * Called from the frontend. */ struct hammer_inode * -hammer_get_inode(hammer_transaction_t trans, struct hammer_node **cache, +hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip, u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp) { hammer_mount_t hmp = trans->hmp; @@ -291,6 +291,8 @@ loop: ip->obj_asof = iinfo.obj_asof; ip->hmp = hmp; ip->flags = flags & HAMMER_INODE_RO; + ip->cache[0].ip = ip; + ip->cache[1].ip = ip; if (hmp->ronly) ip->flags |= HAMMER_INODE_RO; ip->sync_trunc_off = ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; @@ -301,7 +303,7 @@ loop: * Locate the on-disk inode. */ retry: - hammer_init_cursor(trans, &cursor, cache, NULL); + hammer_init_cursor(trans, &cursor, (dip ? &dip->cache[0] : NULL), NULL); cursor.key_beg.localization = HAMMER_LOCALIZE_INODE; cursor.key_beg.obj_id = ip->obj_id; cursor.key_beg.key = 0; @@ -328,9 +330,17 @@ retry: if (*errorp == 0) { ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf; ip->ino_data = cursor.data->inode; - hammer_cache_node(cursor.node, &ip->cache[0]); - if (cache) - hammer_cache_node(cursor.node, cache); + + /* + * cache[0] tries to cache the location of the object inode. + * The assumption is that it is near the directory inode. + * + * cache[1] tries to cache the location of the object data. + * The assumption is that it is near the directory data. 
+ */ + hammer_cache_node(&ip->cache[0], cursor.node); + if (dip && dip->cache[1].node) + hammer_cache_node(&ip->cache[1], dip->cache[1].node); /* * The file should not contain any data past the file size @@ -412,12 +422,14 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap, ip->hmp = hmp; ip->flush_state = HAMMER_FST_IDLE; ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_ITIMES; + ip->cache[0].ip = ip; + ip->cache[1].ip = ip; ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; RB_INIT(&ip->rec_tree); TAILQ_INIT(&ip->target_list); - ip->ino_leaf.atime = trans->time; + ip->ino_data.atime = trans->time; ip->ino_data.mtime = trans->time; ip->ino_data.size = 0; ip->ino_data.nlinks = 0; @@ -541,7 +553,7 @@ retry: ip->flags |= HAMMER_INODE_DELONDISK; } if (cursor->node) - hammer_cache_node(cursor->node, &ip->cache[0]); + hammer_cache_node(&ip->cache[0], cursor->node); } if (error == EDEADLK) { hammer_done_cursor(cursor); @@ -665,7 +677,9 @@ retry: cursor->key_beg.obj_type = 0; cursor->asof = ip->obj_asof; cursor->flags &= ~HAMMER_CURSOR_INITMASK; - cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF; + cursor->flags |= HAMMER_CURSOR_ASOF; + cursor->flags |= HAMMER_CURSOR_GET_LEAF; + cursor->flags |= HAMMER_CURSOR_GET_DATA; cursor->flags |= HAMMER_CURSOR_BACKEND; error = hammer_btree_lookup(cursor); @@ -675,17 +689,28 @@ retry: } if (error == 0) { /* - * Do not generate UNDO records for atime updates. + * atime/mtime updates can be done in place, but + * they are nasty because we also have to update the + * data_crc in the B-Tree leaf, which means we + * ALSO have to generate UNDO records. 
*/ + hammer_modify_buffer(trans, cursor->data_buffer, + HAMMER_ITIMES_BASE(&cursor->data->inode), + HAMMER_ITIMES_BYTES); + cursor->data->inode.atime = ip->sync_ino_data.atime; + cursor->data->inode.mtime = ip->sync_ino_data.mtime; + hammer_modify_buffer_done(cursor->data_buffer); + leaf = cursor->leaf; - hammer_modify_node(trans, cursor->node, - &leaf->atime, sizeof(leaf->atime)); - leaf->atime = ip->sync_ino_leaf.atime; + hammer_modify_node(trans, cursor->node, + &leaf->data_crc, + sizeof(leaf->data_crc)); + leaf->data_crc = crc32(cursor->data, leaf->data_len); hammer_modify_node_done(cursor->node); - /*rec->ino_mtime = ip->sync_ino_rec.ino_mtime;*/ + ip->sync_flags &= ~HAMMER_INODE_ITIMES; /* XXX recalculate crc */ - hammer_cache_node(cursor->node, &ip->cache[0]); + hammer_cache_node(&ip->cache[0], cursor->node); } if (error == EDEADLK) { hammer_done_cursor(cursor); @@ -1765,7 +1790,7 @@ hammer_sync_inode(hammer_inode_t ip) if (tmp_error) error = tmp_error; } - hammer_cache_node(cursor.node, &ip->cache[1]); + hammer_cache_node(&ip->cache[1], cursor.node); /* * Re-seek for inode update. diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 69efc3fa25..e5f3da33cd 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.42 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.43 2008/06/18 01:13:30 dillon Exp $ */ /* * IO Primitives and buffer cache management @@ -455,6 +455,29 @@ hammer_io_flush(struct hammer_io *io) hammer_io_clear_modify(io); /* + * We delay generating the CRCs for B-Tree nodes until the very + * last minute. 
+ */ + if (io->gencrc) { + io->gencrc = 0; + if (io->type == HAMMER_STRUCTURE_META_BUFFER) { + hammer_buffer_t buffer = (void *)io; + hammer_node_t node; + +restart: + TAILQ_FOREACH(node, &buffer->clist, entry) { + if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0) + continue; + node->flags &= ~HAMMER_NODE_NEEDSCRC; + KKASSERT(node->ondisk); + node->ondisk->crc = crc32(&node->ondisk->crc + 1, HAMMER_BTREE_CRCSIZE); + hammer_rel_node(node); + goto restart; + } + } + } + + /* * Transfer ownership to the kernel and initiate I/O. */ io->running = 1; diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index b54d30523f..1adeea6129 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.69 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.70 2008/06/18 01:13:30 dillon Exp $ */ #include "hammer.h" @@ -776,8 +776,10 @@ hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes) /* * Reserve blockmap space placemarked with an in-memory record. * - * This routine is called by the front-end in order to be able to directly - * flush a buffer cache buffer. + * This routine is called by the frontend in order to be able to directly + * flush a buffer cache buffer. The frontend has locked the related buffer + * cache buffers and we should be able to manipulate any overlapping + * in-memory records. 
*/ hammer_record_t hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, @@ -1317,8 +1319,8 @@ next_btree: cursor->flags &= ~HAMMER_CURSOR_DELBTREE; if (error == 0) { cursor->flags &= ~HAMMER_CURSOR_ATEDISK; - hammer_cache_node(cursor->node, - &cursor->ip->cache[1]); + hammer_cache_node(&cursor->ip->cache[1], + cursor->node); } else { cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_ATEDISK; @@ -1654,11 +1656,11 @@ retry: error = hammer_ip_next(cursor); } if (cursor->node) - hammer_cache_node(cursor->node, &ip->cache[1]); + hammer_cache_node(&ip->cache[1], cursor->node); if (error == EDEADLK) { hammer_done_cursor(cursor); - error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip); + error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); if (error == 0) goto retry; } @@ -1732,10 +1734,10 @@ retry: error = hammer_ip_next(cursor); } if (cursor->node) - hammer_cache_node(cursor->node, &ip->cache[1]); + hammer_cache_node(&ip->cache[1], cursor->node); if (error == EDEADLK) { hammer_done_cursor(cursor); - error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip); + error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); if (error == 0) goto retry; } @@ -1884,7 +1886,7 @@ hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip) /* * Check directory empty */ - hammer_init_cursor(trans, &cursor, &ip->cache[0], ip); + hammer_init_cursor(trans, &cursor, &ip->cache[1], ip); cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; cursor.key_beg.obj_id = ip->obj_id; diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index 8bf3858406..68a3715b1a 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.58 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.59 2008/06/18 01:13:30 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. These routines are primarily @@ -504,12 +504,12 @@ hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset, int vol_no; int zone; + buf_offset &= ~HAMMER_BUFMASK64; again: /* * Shortcut if the buffer is already cached */ - buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root, - buf_offset & ~HAMMER_BUFMASK64); + buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root, buf_offset); if (buffer) { if (buffer->io.lock.refs == 0) ++hammer_count_refedbufs; @@ -578,12 +578,9 @@ again: return(NULL); /* - * Calculate the base zone2-offset and acquire the volume - * * NOTE: zone2_offset and maxbuf_off are both full zone-2 offset * specifications. */ - zone2_offset &= ~HAMMER_BUFMASK64; KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER); vol_no = HAMMER_VOL_DECODE(zone2_offset); @@ -792,6 +789,9 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush) if (buffer->io.lock.refs == 1) { hammer_io_release(&buffer->io, flush); + if (buffer->io.lock.refs == 1) + --hammer_count_refedbufs; + if (buffer->io.bp == NULL && buffer->io.lock.refs == 1) { /* @@ -810,8 +810,6 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush) hammer_io_clear_modlist(&buffer->io); hammer_flush_buffer_nodes(buffer); KKASSERT(TAILQ_EMPTY(&buffer->clist)); - if (buffer->io.lock.refs == 1) - --hammer_count_refedbufs; freeme = 1; } } @@ -947,6 +945,7 @@ again: node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO); node->node_offset = node_offset; node->hmp = hmp; + TAILQ_INIT(&node->cache_list); if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) { --hammer_count_nodes; kfree(node, M_HAMMER); @@ -1017,15 +1016,18 @@ hammer_load_node(hammer_node_t node, int isnew) node->buffer = buffer; } } - if (error == 0) { - node->ondisk = (void 
*)((char *)buffer->ondisk + - (node->node_offset & HAMMER_BUFMASK)); - if (isnew == 0 && - hammer_crc_test_btree(node->ondisk) == 0) { + if (error) + goto failed; + node->ondisk = (void *)((char *)buffer->ondisk + + (node->node_offset & HAMMER_BUFMASK)); + if (isnew == 0 && + (node->flags & HAMMER_NODE_CRCGOOD) == 0) { + if (hammer_crc_test_btree(node->ondisk) == 0) Debugger("CRC FAILED: B-TREE NODE"); - } + node->flags |= HAMMER_NODE_CRCGOOD; } } +failed: --node->loading; hammer_unlock(&node->lock); return (error); @@ -1035,12 +1037,12 @@ hammer_load_node(hammer_node_t node, int isnew) * Safely reference a node, interlock against flushes via the IO subsystem. */ hammer_node_t -hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache, +hammer_ref_node_safe(struct hammer_mount *hmp, hammer_node_cache_t cache, int *errorp) { hammer_node_t node; - node = *cache; + node = cache->node; if (node != NULL) { hammer_ref(&node->lock); if (node->ondisk) @@ -1087,6 +1089,13 @@ hammer_rel_node(hammer_node_t node) } /* + * Do not disassociate the node from the buffer if it represents + * a modified B-Tree node that still needs its crc to be generated. + */ + if (node->flags & HAMMER_NODE_NEEDSCRC) + return; + + /* * Do final cleanups and then either destroy the node and leave it * passively cached. The buffer reference is removed regardless. */ @@ -1120,64 +1129,37 @@ hammer_delete_node(hammer_transaction_t trans, hammer_node_t node) } /* - * Passively cache a referenced hammer_node in *cache. The caller may - * release the node on return. + * Passively cache a referenced hammer_node. The caller may release + * the node on return. */ void -hammer_cache_node(hammer_node_t node, struct hammer_node **cache) +hammer_cache_node(hammer_node_cache_t cache, hammer_node_t node) { - hammer_node_t old; - /* * If the node is being deleted, don't cache it! */ if (node->flags & HAMMER_NODE_DELETED) return; - - /* - * Cache the node. 
If we previously cached a different node we - * have to give HAMMER a chance to destroy it. - */ -again: - if (node->cache1 != cache) { - if (node->cache2 != cache) { - if ((old = *cache) != NULL) { - KKASSERT(node->lock.refs != 0); - hammer_uncache_node(cache); - goto again; - } - if (node->cache2) - *node->cache2 = NULL; - node->cache2 = node->cache1; - node->cache1 = cache; - *cache = node; - } else { - struct hammer_node **tmp; - tmp = node->cache1; - node->cache1 = node->cache2; - node->cache2 = tmp; - } - } + if (cache->node == node) + return; + while (cache->node) + hammer_uncache_node(cache); + if (node->flags & HAMMER_NODE_DELETED) + return; + cache->node = node; + TAILQ_INSERT_TAIL(&node->cache_list, cache, entry); } void -hammer_uncache_node(struct hammer_node **cache) +hammer_uncache_node(hammer_node_cache_t cache) { hammer_node_t node; - if ((node = *cache) != NULL) { - *cache = NULL; - if (node->cache1 == cache) { - node->cache1 = node->cache2; - node->cache2 = NULL; - } else if (node->cache2 == cache) { - node->cache2 = NULL; - } else { - panic("hammer_uncache_node: missing cache linkage"); - } - if (node->cache1 == NULL && node->cache2 == NULL) { + if ((node = cache->node) != NULL) { + TAILQ_REMOVE(&node->cache_list, cache, entry); + cache->node = NULL; + if (TAILQ_EMPTY(&node->cache_list)) hammer_flush_node(node); - } } } @@ -1188,13 +1170,15 @@ hammer_uncache_node(struct hammer_node **cache) void hammer_flush_node(hammer_node_t node) { + hammer_node_cache_t cache; hammer_buffer_t buffer; - if (node->cache1) - *node->cache1 = NULL; - if (node->cache2) - *node->cache2 = NULL; + while ((cache = TAILQ_FIRST(&node->cache_list)) != NULL) { + TAILQ_REMOVE(&node->cache_list, cache, entry); + cache->node = NULL; + } if (node->lock.refs == 0 && node->ondisk == NULL) { + KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0); RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node); if ((buffer = node->buffer) != NULL) { node->buffer = NULL; @@ -1220,6 
+1204,7 @@ hammer_flush_buffer_nodes(hammer_buffer_t buffer) while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) { KKASSERT(node->ondisk == NULL); + KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0); if (node->lock.refs == 0) { hammer_ref(&node->lock); diff --git a/sys/vfs/hammer/hammer_transaction.c b/sys/vfs/hammer/hammer_transaction.c index 7ea41cff1b..4f04ca649e 100644 --- a/sys/vfs/hammer/hammer_transaction.c +++ b/sys/vfs/hammer/hammer_transaction.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.17 2008/06/10 08:51:02 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.18 2008/06/18 01:13:30 dillon Exp $ */ #include "hammer.h" @@ -120,11 +120,18 @@ hammer_done_transaction(struct hammer_transaction *trans) static hammer_tid_t hammer_alloc_tid(hammer_transaction_t trans, int count) { +#if 0 struct timespec ts; +#endif hammer_tid_t tid; +#if 0 getnanotime(&ts); +#endif + tid = time_second * 1000000000LL; +#if 0 tid = ts.tv_sec * 1000000000LL + ts.tv_nsec; +#endif if (tid < trans->hmp->next_tid) tid = trans->hmp->next_tid; if (tid >= 0xFFFFFFFFFFFFF000ULL) diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index 2f5ab4e13e..3f90325f31 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.48 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.49 2008/06/18 01:13:30 dillon Exp $ */ #include @@ -173,6 +173,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, verify_zone, CTLFLAG_RW, SYSCTL_INT(_vfs_hammer, OID_AUTO, write_mode, CTLFLAG_RW, &hammer_write_mode, 0, ""); +KTR_INFO_MASTER(hammer); + /* * VFS ABI */ diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 842d710ffa..baa89093e0 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.71 2008/06/17 04:02:38 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.72 2008/06/18 01:13:30 dillon Exp $ */ #include @@ -247,7 +247,7 @@ hammer_vop_read(struct vop_read_args *ap) } if ((ip->flags & HAMMER_INODE_RO) == 0 && (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { - ip->ino_leaf.atime = trans.time; + ip->ino_data.atime = trans.time; hammer_modify_inode(ip, HAMMER_INODE_ITIMES); } hammer_done_transaction(&trans); @@ -452,12 +452,11 @@ hammer_vop_write(struct vop_write_args *ap) } else if (hammer_write_mode && (uio->uio_offset & HAMMER_BUFMASK) == 0) { #if 1 - /* strategy write cannot handled clustered writes */ bp->b_flags |= B_CLUSTEROK; cluster_write(bp, ip->ino_data.size, seqcount); #else -#endif bawrite(bp); +#endif } else if ((ap->a_ioflag >> 16) == IO_SEQMAX && (uio->uio_offset & HAMMER_BUFMASK) == 0) { /* @@ -622,11 +621,19 @@ hammer_vop_getattr(struct vop_getattr_args *ap) vap->va_rmajor = 0; vap->va_rminor = 0; vap->va_size = ip->ino_data.size; - if (ip->flags & HAMMER_INODE_RO) - hammer_to_timespec(ip->ino_data.mtime, &vap->va_atime); - else - hammer_to_timespec(ip->ino_leaf.atime, &vap->va_atime); - hammer_to_timespec(ip->ino_data.mtime, &vap->va_mtime); + + /* + * We must provide a consistent atime 
and mtime for snapshots + * so people can do a 'tar cf - ... | md5' on them and get + * consistent results. + */ + if (ip->flags & HAMMER_INODE_RO) { + hammer_to_timespec(ip->ino_data.ctime, &vap->va_atime); + hammer_to_timespec(ip->ino_data.ctime, &vap->va_mtime); + } else { + hammer_to_timespec(ip->ino_data.atime, &vap->va_atime); + hammer_to_timespec(ip->ino_data.mtime, &vap->va_mtime); + } hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime); vap->va_flags = ip->ino_data.uflags; vap->va_gen = 1; /* hammer inums are unique for all time */ @@ -705,7 +712,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) * dip. */ if (nlen == 0) { - ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id, + ip = hammer_get_inode(&trans, dip, dip->obj_id, asof, flags, &error); if (error == 0) { error = hammer_get_vnode(ip, &vp); @@ -730,7 +737,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) */ namekey = hammer_directory_namekey(ncp->nc_name, nlen); - error = hammer_init_cursor(&trans, &cursor, &dip->cache[0], dip); + error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; cursor.key_beg.obj_id = dip->obj_id; cursor.key_beg.key = namekey; @@ -769,8 +776,8 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) } hammer_done_cursor(&cursor); if (error == 0) { - ip = hammer_get_inode(&trans, &dip->cache[1], - obj_id, asof, flags, &error); + ip = hammer_get_inode(&trans, dip, obj_id, + asof, flags, &error); if (error == 0) { error = hammer_get_vnode(ip, &vp); hammer_rel_inode(ip, 0); @@ -837,7 +844,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) hammer_simple_transaction(&trans, dip->hmp); - ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id, + ip = hammer_get_inode(&trans, dip, parent_obj_id, asof, dip->flags, &error); if (ip) { error = hammer_get_vnode(ip, ap->a_vpp); @@ -1137,7 +1144,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap) * Key range (begin and end inclusive) to scan. 
Directory keys * directly translate to a 64 bit 'seek' position. */ - hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip); + hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; cursor.key_beg.obj_id = ip->obj_id; cursor.key_beg.create_tid = 0; @@ -1233,7 +1240,7 @@ hammer_vop_readlink(struct vop_readlink_args *ap) * Long version */ hammer_simple_transaction(&trans, ip->hmp); - hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip); + hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); /* * Key range (begin and end inclusive) to scan. Directory keys @@ -1355,7 +1362,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap) */ namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen); retry: - hammer_init_cursor(&trans, &cursor, &fdip->cache[0], fdip); + hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; cursor.key_beg.obj_id = fdip->obj_id; cursor.key_beg.key = namekey; @@ -1622,7 +1629,7 @@ hammer_vop_setattr(struct vop_setattr_args *ap) break; } if (vap->va_atime.tv_sec != VNOVAL) { - ip->ino_leaf.atime = + ip->ino_data.atime = hammer_timespec_to_transid(&vap->va_atime); modflags |= HAMMER_INODE_ITIMES; } @@ -2056,7 +2063,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) done: if (cursor.node) - hammer_cache_node(cursor.node, &ip->cache[1]); + hammer_cache_node(&ip->cache[1], cursor.node); hammer_done_cursor(&cursor); hammer_done_transaction(&trans); return(error); @@ -2220,7 +2227,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) #endif if (cursor.node) { - hammer_cache_node(cursor.node, &ip->cache[1]); + hammer_cache_node(&ip->cache[1], cursor.node); #if 0 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]); #endif @@ -2255,11 +2262,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) /* * Write to a regular file. Because this is a strategy call the OS is - * trying to actually sync data to the media. 
HAMMER can only flush - * the entire inode (so the TID remains properly synchronized). - * - * Basically all we do here is place the bio on the inode's flush queue - * and activate the flusher. + * trying to actually get data onto the media. */ static int @@ -2413,7 +2416,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen); retry: - hammer_init_cursor(trans, &cursor, &dip->cache[0], dip); + hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; cursor.key_beg.obj_id = dip->obj_id; cursor.key_beg.key = namekey; @@ -2460,8 +2463,7 @@ retry: */ if (error == 0) { hammer_unlock(&cursor.ip->lock); - ip = hammer_get_inode(trans, &dip->cache[1], - cursor.data->entry.obj_id, + ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, dip->hmp->asof, 0, &error); hammer_lock_sh(&cursor.ip->lock); if (error == ENOENT) { -- 2.11.4.GIT