From c847e8387ad749d611d395742d337213aefef3b9 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Tue, 23 Jun 2015 23:01:54 -0700
Subject: [PATCH] hammer2 - Refactor frontend part 9/many

* Create initial frontend/backend XOP infrastructure. A skeleton of the
  calling convention is sketched after the TODO diff below.

  frontend:
	hammer2_xop_alloc()
	hammer2_xop_start()
	... hammer2_xop_collect() loop ...
	hammer2_xop_retire(xop, HAMMER2_XOPMASK_VOP)

  backend:
	(the backend is called with the shared xop structure in a
	 separate backend thread for each node belonging to the cluster
	 appropriate for the operation)
	... issue chain calls as needed ...
	... hammer2_xop_feed() ... (feed chains back to frontend)
	hammer2_xop_feed(NULL)	(feed NULL chain)
	hammer2_xop_retire(xop, 1U << clindex)

  The XOP contains a FIFO, allowing the backend to pipeline results when
  appropriate (e.g. readdir). If a sequence of results is expected, the
  backend should finish with a NULL chain. If not, the backend can just
  feed back whatever is expected. Often this will just be the chain
  representing the inode.

  The frontend calls hammer2_xop_collect() to collect results from all
  the backend nodes. The collect function handles quorum validation and
  consolidates the results from a sufficient number of cluster nodes
  into a single result for the frontend.

* The frontend can disconnect from the operation at any time in order to
  be able to return a result, even if backend elements are still running.
  This typically occurs when a sufficient number of nodes in the cluster
  have responded to validate the quorum.

  This also allows backend nodes to stall indefinitely without stalling
  the frontend.

* Because frontend concurrency is lost due to the bulk of the work being
  done by the backend, the hammer2 mount code will allocate ~16 work
  threads per node to distribute potentially many frontend operations.

* Most frontend operations use existing cache layers to retain frontend
  concurrency: specifically, inode meta-data access and modification,
  logical buffer cache operations (when cached), and cached vnodes via
  the namecache. If the cache is not available, operations will wind up
  using the VOP/XOP infrastructure, including buffer-cache strategy
  routines (in an upcoming commit).

* Implement readdir() using the new infrastructure as an initial test.

* Fix an ip->meta synchronization bug related to hardlinks that was
  introduced by the ip->meta local copy work.
---
 sys/vfs/hammer2/TODO              |   3 +
 sys/vfs/hammer2/hammer2.h         | 134 ++++++++++--
 sys/vfs/hammer2/hammer2_chain.c   |   4 +-
 sys/vfs/hammer2/hammer2_cluster.c | 408 +++++++++++++++++++++++++++++++++-
 sys/vfs/hammer2/hammer2_inode.c   |  55 +++--
 sys/vfs/hammer2/hammer2_thread.c  | 448 ++++++++++++++++++++++++++++++++++++--
 sys/vfs/hammer2/hammer2_vfsops.c  |  56 +++--
 sys/vfs/hammer2/hammer2_vnops.c   | 124 ++++-------
 8 files changed, 1064 insertions(+), 168 deletions(-)

diff --git a/sys/vfs/hammer2/TODO b/sys/vfs/hammer2/TODO
index 9634e8cf8d..f6b15d4c7f 100644
--- a/sys/vfs/hammer2/TODO
+++ b/sys/vfs/hammer2/TODO
@@ -1,4 +1,7 @@
+* mtx locks should not track td_locks count? They can be acquired by one
+  thread and released by another. Need API function for exclusive locks.
+
 * Convert xops and hammer2_update_spans() from cluster back into chain calls
 
 * syncthr leaves inode locks for entire sync, which is wrong.
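NOTE (illustrative only, not part of the patch): a minimal skeleton of the
calling convention described above, using the functions this commit adds.
hammer2_vop_readdir() in hammer2_vnops.c below is the real example; error
handling and the actual readdir logic are elided here.

	/*
	 * Frontend: allocate, distribute, collect validated results,
	 * then retire the VOP's participation.
	 */
	hammer2_xop_readdir_t *xop;
	int error;

	xop = &hammer2_xop_alloc(ip, hammer2_xop_readdir)->xop_readdir;
	xop->head.lkey = lkey;
	hammer2_xop_start(&xop->head);
	for (;;) {
		error = hammer2_xop_collect(&xop->head);
		if (error)	/* ENOENT is the normal end of scan */
			break;
		/* consume xop->head.cluster, whose focus is quorum-validated */
	}
	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);

	/*
	 * Backend: invoked once per cluster node by the xops dispatcher
	 * thread, which performs the (1U << clidx) retire after the
	 * handler returns.
	 */
	void
	hammer2_xop_readdir(hammer2_xop_t *xop, int clidx)
	{
		/* ... iterate chains for this node ... */
		/* hammer2_xop_feed(&xop->head, chain, clidx, 0) per result */
		hammer2_xop_feed(&xop->head, NULL, clidx, 0);	/* end marker */
	}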
diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
index 201b17c59b..02af4956ee 100644
--- a/sys/vfs/hammer2/hammer2.h
+++ b/sys/vfs/hammer2/hammer2.h
@@ -105,7 +105,7 @@ struct hammer2_span;
 struct hammer2_state;
 struct hammer2_msg;
 struct hammer2_thread;
-struct hammer2_xop;
+union hammer2_xop;
 
 /*
  * Mutex and lock shims. Hammer2 requires support for asynchronous and
@@ -133,6 +133,11 @@ typedef struct spinlock hammer2_spin_t;
 #define hammer2_spin_unsh	spin_unlock_shared
 #define hammer2_spin_unex	spin_unlock
 
+TAILQ_HEAD(hammer2_xop_list, hammer2_xop_head);
+
+typedef struct hammer2_xop_list	hammer2_xop_list_t;
+
+
 /*
  * General lock support
  */
@@ -419,13 +424,40 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 /*
  * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
  *
- * NOTE: MATCHIND allows an indirect block / freemap node to be returned
- *	 when the passed key range matches the radix. Remember that key_end
- *	 is inclusive (e.g. {0x000,0xFFF}, not {0x000,0x1000}).
+ * NOTES:
+ *	NOLOCK	    - Input and output chains are referenced only and not
+ *		      locked. Output chain might be temporarily locked
+ *		      internally.
+ *
+ *	NODATA	    - Asks that the chain->data not be resolved in order
+ *		      to avoid I/O.
+ *
+ *	NODIRECT    - Prevents a lookup of offset 0 in an inode from returning
+ *		      the inode itself if the inode is in DIRECTDATA mode
+ *		      (i.e. file is <= 512 bytes). Used by the synchronization
+ *		      code to prevent confusion.
+ *
+ *	SHARED	    - The input chain is expected to be locked shared,
+ *		      and the output chain is locked shared.
+ *
+ *	MATCHIND    - Allows an indirect block / freemap node to be returned
+ *		      when the passed key range matches the radix. Remember
+ *		      that key_end is inclusive (e.g. {0x000,0xFFF},
+ *		      not {0x000,0x1000}).
+ *
+ *		      (Cannot be used for remote or cluster ops).
+ *
+ *	ALLNODES    - Allows NULL focus.
+ *
+ *	ALWAYS	    - Always resolve the data. If ALWAYS and NODATA are both
+ *		      missing, bulk file data is not resolved but inodes and
+ *		      other meta-data will be.
  *
- * NOTE: NODIRECT prevents a lookup of offset 0 in an inode from returning
- *	 the inode itself if the inode is in DIRECTDATA mode (i.e. file is
- *	 <= 512 bytes).
+ *	NOUNLOCK    - Used by hammer2_chain_next() to leave the lock on
+ *		      the input chain intact. The chain is still dropped.
+ *		      This allows the caller to add a reference to the chain
+ *		      and retain it in a locked state (used by the
+ *		      XOP/feed/collect code).
  */
 #define HAMMER2_LOOKUP_NOLOCK		0x00000001	/* ref only */
 #define HAMMER2_LOOKUP_NODATA		0x00000002	/* data left NULL */
@@ -434,6 +466,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_LOOKUP_MATCHIND		0x00000200	/* return all chains */
 #define HAMMER2_LOOKUP_ALLNODES		0x00000400	/* allow NULL focus */
 #define HAMMER2_LOOKUP_ALWAYS		0x00000800	/* resolve data */
+#define HAMMER2_LOOKUP_NOUNLOCK		0x00001000	/* leave lock intact */
 
 /*
  * Flags passed to hammer2_chain_modify() and hammer2_chain_resize()
@@ -533,7 +566,11 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
  * to a chain still part of the synchronized set.
*/ #define HAMMER2_MAXCLUSTER 8 +#define HAMMER2_XOPFIFO 16 +#define HAMMER2_XOPFIFO_MASK (HAMMER2_XOPFIFO - 1) #define HAMMER2_XOPGROUPS 16 +#define HAMMER2_XOPGROUPS_MASK (HAMMER2_XOPGROUPS - 1) +#define HAMMER2_XOPMASK_VOP 0x80000000U struct hammer2_cluster_item { #if 0 @@ -559,6 +596,7 @@ typedef struct hammer2_cluster_item hammer2_cluster_item_t; */ #define HAMMER2_CITEM_INVALID 0x00000001 #define HAMMER2_CITEM_FEMOD 0x00000002 +#define HAMMER2_CITEM_NULL 0x00000004 struct hammer2_cluster { int refs; /* track for deallocation */ @@ -788,6 +826,7 @@ struct hammer2_thread { int repidx; hammer2_trans_t trans; struct lock lk; /* thread control lock */ + hammer2_xop_list_t xopq; }; typedef struct hammer2_thread hammer2_thread_t; @@ -802,25 +841,53 @@ typedef struct hammer2_thread hammer2_thread_t; /* - * hammer2_xop - container for VOP/XOP operation. + * hammer2_xop - container for VOP/XOP operation (allocated, not on stack). * * This structure is used to distribute a VOP operation across multiple * nodes. It provides a rendezvous for concurrent node execution and * can be detached from the frontend operation to allow the frontend to * return early. */ -struct hammer2_xop { - struct hammer2_xop *next; - void (*func)(struct hammer2_thread *thr, - struct hammer2_xop *xop); - int refs; - hammer2_inode_t *dip; - hammer2_inode_t *ip; +typedef void (*hammer2_xop_func_t)(union hammer2_xop *xop, int clidx); + +typedef struct hammer2_xop_fifo { + TAILQ_ENTRY(hammer2_xop_head) entry; + hammer2_chain_t *array[HAMMER2_XOPFIFO]; + int errors[HAMMER2_XOPFIFO]; + int ri; + int wi; + int unused03; +} hammer2_xop_fifo_t; + +struct hammer2_xop_head { + hammer2_xop_func_t func; + struct hammer2_inode *ip; + struct hammer2_xop_group *xgrp; + uint32_t check_counter; + uint32_t run_mask; + uint32_t chk_mask; + int state; + int error; + hammer2_key_t lkey; + hammer2_key_t nkey; + hammer2_xop_fifo_t collect[HAMMER2_MAXCLUSTER]; + hammer2_cluster_t cluster; /* help collections */ }; -typedef struct hammer2_xop hammer2_xop_t; +typedef struct hammer2_xop_head hammer2_xop_head_t; + +struct hammer2_xop_readdir { + hammer2_xop_head_t head; +}; + +typedef struct hammer2_xop_readdir hammer2_xop_readdir_t; + +union hammer2_xop { + hammer2_xop_head_t head; + hammer2_xop_readdir_t xop_readdir; +}; -TAILQ_HEAD(hammer2_xop_list, hammer2_xop); +typedef union hammer2_xop hammer2_xop_t; /* * hammer2_xop_group - Manage XOP support threads. 
@@ -828,8 +895,6 @@ TAILQ_HEAD(hammer2_xop_list, hammer2_xop); struct hammer2_xop_group { hammer2_thread_t thrs[HAMMER2_MAXCLUSTER]; hammer2_mtx_t mtx; - hammer2_xop_t marker; - hammer2_xop_t **xop_tailp; }; typedef struct hammer2_xop_group hammer2_xop_group_t; @@ -949,7 +1014,7 @@ struct hammer2_pfs { uint8_t pfs_mode; /* operating mode PFSMODE */ uint8_t unused01; uint8_t unused02; - uint32_t unused03; + int xop_iterator; long inmem_inodes; uint32_t inmem_dirty_chains; int count_lwinprog; /* logical write in prog */ @@ -973,6 +1038,11 @@ typedef struct hammer2_pfs hammer2_pfs_t; #define HAMMER2_LWINPROG_MASK 0x7FFFFFFF /* + * hammer2_cluster_check + */ +#define HAMMER2_CHECK_NULL 0x00000001 + +/* * Bulkscan */ #define HAMMER2_BULK_ABORT 0x00000001 @@ -1091,6 +1161,7 @@ const char *hammer2_error_str(int error); void hammer2_inode_lock(hammer2_inode_t *ip, int how); void hammer2_inode_unlock(hammer2_inode_t *ip, hammer2_cluster_t *cluster); hammer2_cluster_t *hammer2_inode_cluster(hammer2_inode_t *ip, int how); +hammer2_chain_t *hammer2_inode_chain(hammer2_inode_t *ip, int clindex, int how); hammer2_mtx_state_t hammer2_inode_lock_temp_release(hammer2_inode_t *ip); void hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, hammer2_mtx_state_t ostate); @@ -1230,7 +1301,7 @@ void hammer2_chain_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, hammer2_chain_t **parentp, hammer2_chain_t *chain, int flags); int hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **chainp, - hammer2_ioc_pfs_t *pfs); + hammer2_ioc_pfs_t *pmp); void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent, hammer2_chain_t *chain, int flags); void hammer2_chain_delete_duplicate(hammer2_trans_t *trans, @@ -1302,8 +1373,19 @@ void hammer2_io_bqrelse(hammer2_io_t **diop); /* * hammer2_xops.c */ -void hammer2_xop_group_init(hammer2_pfs_t *pfs, hammer2_xop_group_t *xgrp); -int hammer2_xop_readdir(struct vop_readdir_args *ap); +void hammer2_xop_group_init(hammer2_pfs_t *pmp, hammer2_xop_group_t *xgrp); +hammer2_xop_t *hammer2_xop_alloc(hammer2_inode_t *ip, hammer2_xop_func_t func); +void hammer2_xop_helper_create(hammer2_pfs_t *pmp); +void hammer2_xop_helper_cleanup(hammer2_pfs_t *pmp); +void hammer2_xop_start(hammer2_xop_head_t *xop); +int hammer2_xop_collect(hammer2_xop_head_t *xop); +void hammer2_xop_retire(hammer2_xop_head_t *xop, uint32_t mask); +int hammer2_xop_active(hammer2_xop_head_t *xop); +int hammer2_xop_feed(hammer2_xop_head_t *xop, hammer2_chain_t *chain, + int clindex, int error); + + +void hammer2_xop_readdir(hammer2_xop_t *xop, int clidx); int hammer2_xop_readlink(struct vop_readlink_args *ap); int hammer2_xop_nresolve(struct vop_nresolve_args *ap); int hammer2_xop_nlookupdotdot(struct vop_nlookupdotdot_args *ap); @@ -1352,6 +1434,8 @@ void hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_dev_t *hmp, int hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes); uint8_t hammer2_cluster_type(hammer2_cluster_t *cluster); const hammer2_media_data_t *hammer2_cluster_rdata(hammer2_cluster_t *cluster); +const hammer2_media_data_t *hammer2_cluster_rdata_bytes( + hammer2_cluster_t *cluster, size_t *bytesp); hammer2_media_data_t *hammer2_cluster_wdata(hammer2_cluster_t *cluster); hammer2_cluster_t *hammer2_cluster_from_chain(hammer2_chain_t *chain); int hammer2_cluster_modified(hammer2_cluster_t *cluster); @@ -1369,6 +1453,8 @@ void hammer2_cluster_drop(hammer2_cluster_t *cluster); void hammer2_cluster_wait(hammer2_cluster_t *cluster); void 
hammer2_cluster_lock(hammer2_cluster_t *cluster, int how); void hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how); +int hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t lokey, + int flags); void hammer2_cluster_resolve(hammer2_cluster_t *cluster); void hammer2_cluster_forcegood(hammer2_cluster_t *cluster); hammer2_cluster_t *hammer2_cluster_copy(hammer2_cluster_t *ocluster); @@ -1413,7 +1499,7 @@ void hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, void hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *pcluster, hammer2_cluster_t *cluster, int flags); int hammer2_cluster_snapshot(hammer2_trans_t *trans, - hammer2_cluster_t *ocluster, hammer2_ioc_pfs_t *pfs); + hammer2_cluster_t *ocluster, hammer2_ioc_pfs_t *pmp); hammer2_cluster_t *hammer2_cluster_parent(hammer2_cluster_t *cluster); int hammer2_bulk_scan(hammer2_trans_t *trans, hammer2_chain_t *parent, diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c index 533a29d599..16d446a1ed 100644 --- a/sys/vfs/hammer2/hammer2_chain.c +++ b/sys/vfs/hammer2/hammer2_chain.c @@ -1925,8 +1925,10 @@ hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain, if (chain) { key_beg = chain->bref.key + ((hammer2_key_t)1 << chain->bref.keybits); - if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) + if ((flags & (HAMMER2_LOOKUP_NOLOCK | + HAMMER2_LOOKUP_NOUNLOCK)) == 0) { hammer2_chain_unlock(chain); + } hammer2_chain_drop(chain); /* diff --git a/sys/vfs/hammer2/hammer2_cluster.c b/sys/vfs/hammer2/hammer2_cluster.c index a5c241e330..3399f933da 100644 --- a/sys/vfs/hammer2/hammer2_cluster.c +++ b/sys/vfs/hammer2/hammer2_cluster.c @@ -149,9 +149,10 @@ hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes) uint8_t hammer2_cluster_type(hammer2_cluster_t *cluster) { - KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); - if (cluster->error == 0) + if (cluster->error == 0) { + KKASSERT(cluster->focus != NULL); return(cluster->focus->bref.type); + } return 0; } @@ -163,9 +164,10 @@ hammer2_cluster_type(hammer2_cluster_t *cluster) int hammer2_cluster_modified(hammer2_cluster_t *cluster) { - KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); - if (cluster->error == 0) + if (cluster->error == 0) { + KKASSERT(cluster->focus != NULL); return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); + } return 0; } @@ -182,8 +184,8 @@ hammer2_cluster_modified(hammer2_cluster_t *cluster) void hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) { - KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); if (cluster->error == 0) { + KKASSERT(cluster->focus != NULL); *bref = cluster->focus->bref; bref->data_off = 0; } else { @@ -773,6 +775,392 @@ skip4: } /* + * This is used by the XOPS subsystem to calculate the state of + * the collection and tell hammer2_xop_collect() what to do with it. + * The collection can be in various states of desynchronization, the + * caller specifically wants to resolve the passed-in key. + * + * Return values: + * 0 - Quorum agreement, key is valid + * + * ENOENT - Quorum agreement, end of scan + * + * ESRCH - Quorum agreement, key is INVALID (caller should + * skip key). + * + * EIO - Quorum agreement but all elements had errors. + * + * EDEADLK - No quorum agreement possible for key, a repair + * may be needed. Caller has to decide what to do, + * possibly iterating the key or generating an EIO. 
+ * + * EINPROGRESS - No quorum agreement yet, but agreement is still + * possible if caller waits for more responses. Caller + * should not iterate key. + * + * XXX needs to handle SOFT_MASTER and SOFT_SLAVE + */ +int +hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags) +{ + hammer2_chain_t *chain; + hammer2_chain_t *focus; + hammer2_pfs_t *pmp; + hammer2_tid_t quorum_tid; + hammer2_tid_t last_best_quorum_tid; + uint32_t nflags; + int ttlmasters; + int ttlslaves; + int nmasters; + int nmasters_keymatch; + int nslaves; + int nquorum; + int umasters; /* unknown masters (still in progress) */ + int smpresent; + int i; + + cluster->error = 0; + cluster->focus = NULL; + + nflags = 0; + ttlmasters = 0; + ttlslaves = 0; + nmasters = 0; + nmasters_keymatch = 0; + umasters = 0; + nslaves = 0; + + /* + * Calculate quorum + */ + pmp = cluster->pmp; + KKASSERT(pmp != NULL || cluster->nchains == 0); + nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; + smpresent = 0; + + /* + * Pass 1 + * + * NOTE: A NULL chain is not necessarily an error, it could be + * e.g. a lookup failure or the end of an iteration. + * Process normally. + */ + for (i = 0; i < cluster->nchains; ++i) { + cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; + cluster->array[i].flags |= HAMMER2_CITEM_INVALID; + + chain = cluster->array[i].chain; + if (chain && chain->error) { + if (cluster->focus == NULL || cluster->focus == chain) { + /* error will be overridden by valid focus */ + cluster->error = chain->error; + } + + /* + * Must count total masters and slaves whether the + * chain is errored or not. + */ + switch (cluster->pmp->pfs_types[i]) { + case HAMMER2_PFSTYPE_MASTER: + ++ttlmasters; + break; + case HAMMER2_PFSTYPE_SLAVE: + ++ttlslaves; + break; + } + continue; + } + switch (cluster->pmp->pfs_types[i]) { + case HAMMER2_PFSTYPE_MASTER: + ++ttlmasters; + break; + case HAMMER2_PFSTYPE_SLAVE: + ++ttlslaves; + break; + case HAMMER2_PFSTYPE_SOFT_MASTER: + nflags |= HAMMER2_CLUSTER_WRSOFT; + nflags |= HAMMER2_CLUSTER_RDSOFT; + smpresent = 1; + break; + case HAMMER2_PFSTYPE_SOFT_SLAVE: + nflags |= HAMMER2_CLUSTER_RDSOFT; + break; + case HAMMER2_PFSTYPE_SUPROOT: + /* + * Degenerate cluster representing the super-root + * topology on a single device. Fake stuff so + * cluster ops work as expected. + */ + nflags |= HAMMER2_CLUSTER_WRHARD; + nflags |= HAMMER2_CLUSTER_RDHARD; + cluster->focus_index = i; + cluster->focus = chain; + cluster->error = chain ? chain->error : 0; + break; + default: + break; + } + } + + /* + * Pass 2 + * + * Resolve nmasters - master nodes fully match + * + * Resolve umasters - master nodes operation still + * in progress + * + * Resolve nmasters_keymatch - master nodes match the passed-in + * key and may or may not match + * the quorum-agreed tid. + * + * The quorum-agreed TID is the highest matching TID. + */ + last_best_quorum_tid = HAMMER2_TID_MAX; + quorum_tid = 0; /* fix gcc warning */ + + while (nmasters < nquorum && last_best_quorum_tid != 0) { + nmasters = 0; + quorum_tid = 0; + + for (i = 0; i < cluster->nchains; ++i) { + /* XXX SOFT smpresent handling */ + if (cluster->pmp->pfs_types[i] != + HAMMER2_PFSTYPE_MASTER) { + continue; + } + + chain = cluster->array[i].chain; + + /* + * Skip elements still in progress. umasters keeps + * track of masters that might still be in-progress. + */ + if (chain == NULL && (cluster->array[i].flags & + HAMMER2_CITEM_NULL) == 0) { + ++umasters; + continue; + } + + /* + * Key match? 
+ */ + if (flags & HAMMER2_CHECK_NULL) { + if (chain == NULL) { + ++nmasters; + ++nmasters_keymatch; + } + } else if (chain && chain->bref.key == key) { + ++nmasters_keymatch; + if (quorum_tid < last_best_quorum_tid && + (quorum_tid < chain->bref.modify_tid || + nmasters == 0)) { + /* + * Better TID located, reset + * nmasters count. + */ + nmasters = 0; + quorum_tid = chain->bref.modify_tid; + } + if (quorum_tid == chain->bref.modify_tid) { + /* + * TID matches current collection. + */ + ++nmasters; + if (chain->error == 0) { + cluster->focus = chain; + cluster->focus_index = i; + } + } + } + } + if (nmasters >= nquorum) + break; + last_best_quorum_tid = quorum_tid; + } + + /* + kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n", + nmasters, nquorum, nmasters_keymatch, umasters); + */ + + /* + * Early return if we do not have enough masters. + */ + if (nmasters < nquorum) { + if (nmasters + umasters >= nquorum) + return EINPROGRESS; + if (nmasters_keymatch < nquorum) + return ESRCH; + return EDEADLK; + } + + /* + * Validated end of scan. + */ + if (flags & HAMMER2_CHECK_NULL) + return ENOENT; + + /* + * If we have a NULL focus at this point the agreeing quorum all + * had chain errors. + */ + if (cluster->focus == NULL) + return EIO; + + /* + * Pass 3 + * + * We have quorum agreement, validate elements, not end of scan. + */ + for (i = 0; i < cluster->nchains; ++i) { + chain = cluster->array[i].chain; + if (chain == NULL || + chain->bref.key != key || + chain->bref.modify_tid != quorum_tid) { + continue; + } + + switch (cluster->pmp->pfs_types[i]) { + case HAMMER2_PFSTYPE_MASTER: + cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; + cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; + nflags |= HAMMER2_CLUSTER_WRHARD; + nflags |= HAMMER2_CLUSTER_RDHARD; + break; + case HAMMER2_PFSTYPE_SLAVE: + /* + * We must have enough up-to-date masters to reach + * a quorum and the slave modify_tid must match the + * quorum's modify_tid. + * + * Do not select an errored slave. + */ + cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; + nflags |= HAMMER2_CLUSTER_RDHARD; + ++nslaves; + break; + case HAMMER2_PFSTYPE_SOFT_MASTER: + /* + * Directly mounted soft master always wins. There + * should be only one. + */ + cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; + cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; + break; + case HAMMER2_PFSTYPE_SOFT_SLAVE: + /* + * Directly mounted soft slave always wins. There + * should be only one. + * + * XXX + */ + cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; + break; + case HAMMER2_PFSTYPE_SUPROOT: + /* + * spmp (degenerate case) + */ + cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; + cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; + break; + default: + break; + } + } + + /* + * Focus now set, adjust ddflag. Skip this pass if the focus + * is bad or if we are at the PFS root (the bref won't match at + * the PFS root, obviously). + */ + focus = cluster->focus; + if (focus) { + cluster->ddflag = + (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE); + } else { + cluster->ddflag = 0; + goto skip4; + } + if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) + goto skip4; + + /* + * Pass 4 + * + * Validate the elements that were not marked invalid. They should + * match. 
+ */ + for (i = 0; i < cluster->nchains; ++i) { + int ddflag; + + chain = cluster->array[i].chain; + + if (chain == NULL) + continue; + if (chain == focus) + continue; + if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) + continue; + + ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); + if (chain->bref.type != focus->bref.type || + chain->bref.key != focus->bref.key || + chain->bref.keybits != focus->bref.keybits || + chain->bref.modify_tid != focus->bref.modify_tid || + chain->bytes != focus->bytes || + ddflag != cluster->ddflag) { + cluster->array[i].flags |= HAMMER2_CITEM_INVALID; + if (hammer2_debug & 1) + kprintf("cluster_resolve: matching modify_tid failed " + "bref test: idx=%d type=%02x/%02x " + "key=%016jx/%d-%016jx/%d " + "mod=%016jx/%016jx bytes=%u/%u\n", + i, + chain->bref.type, focus->bref.type, + chain->bref.key, chain->bref.keybits, + focus->bref.key, focus->bref.keybits, + chain->bref.modify_tid, focus->bref.modify_tid, + chain->bytes, focus->bytes); + if (hammer2_debug & 0x4000) + panic("cluster_resolve"); + /* flag issue and force resync? */ + } + } +skip4: + + if (ttlslaves == 0) + nflags |= HAMMER2_CLUSTER_NOSOFT; + if (ttlmasters == 0) + nflags |= HAMMER2_CLUSTER_NOHARD; + + /* + * Set SSYNCED or MSYNCED for slaves and masters respectively if + * all available nodes (even if 0 are available) are fully + * synchronized. This is used by the synchronization thread to + * determine if there is work it could potentially accomplish. + */ + if (nslaves == ttlslaves) + nflags |= HAMMER2_CLUSTER_SSYNCED; + if (nmasters == ttlmasters) + nflags |= HAMMER2_CLUSTER_MSYNCED; + + /* + * Determine if the cluster was successfully locked for the + * requested operation and generate an error code. The cluster + * will not be locked (or ref'd) if an error is returned. + * + * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok() + * to determine if reading or writing is possible. If writing, the + * cluster still requires a call to hammer2_cluster_modify() first. + */ + atomic_set_int(&cluster->flags, nflags); + atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); + + return 0; +} + +/* * This is used by the sync thread to force non-NULL elements of a copy * of the pmp->iroot cluster to be good which is required to prime the * sync. @@ -1693,12 +2081,22 @@ hammer2_cluster_parent(hammer2_cluster_t *cluster) const hammer2_media_data_t * hammer2_cluster_rdata(hammer2_cluster_t *cluster) { + KKASSERT(cluster->focus != NULL); + return(cluster->focus->data); +} + +const hammer2_media_data_t * +hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp) +{ + KKASSERT(cluster->focus != NULL); + *bytesp = cluster->focus->bytes; return(cluster->focus->data); } hammer2_media_data_t * hammer2_cluster_wdata(hammer2_cluster_t *cluster) { + KKASSERT(cluster->focus != NULL); KKASSERT(hammer2_cluster_modified(cluster)); return(cluster->focus->data); } diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c index 7ea198db53..104e4651c3 100644 --- a/sys/vfs/hammer2/hammer2_inode.c +++ b/sys/vfs/hammer2/hammer2_inode.c @@ -143,6 +143,25 @@ hammer2_inode_cluster(hammer2_inode_t *ip, int how) return cluster; } +/* + * Select a chain out of an inode's cluster and lock it. 
+ */ +hammer2_chain_t * +hammer2_inode_chain(hammer2_inode_t *ip, int clindex, int how) +{ + hammer2_chain_t *chain; + + if (clindex >= ip->cluster.nchains) + chain = NULL; + else + chain = ip->cluster.array[clindex].chain; + if (chain) { + hammer2_chain_ref(chain); + hammer2_chain_lock(chain, how); + } + return chain; +} + void hammer2_inode_unlock(hammer2_inode_t *ip, hammer2_cluster_t *cluster) { @@ -797,15 +816,17 @@ hammer2_hardlink_shiftup(hammer2_trans_t *trans, hammer2_cluster_t *cluster, hammer2_cluster_t *dcluster, int nlinks, int *errorp) { - const hammer2_inode_data_t *iptmp; hammer2_inode_data_t *nipdata; hammer2_cluster_t *xcluster; hammer2_key_t key_dummy; hammer2_key_t lhc; hammer2_blockref_t bref; + lhc = ip->meta.inum; +#if 0 iptmp = &hammer2_cluster_rdata(cluster)->ipdata; lhc = iptmp->meta.inum; +#endif KKASSERT((lhc & HAMMER2_DIRHASH_VISIBLE) == 0); /* @@ -866,21 +887,20 @@ hammer2_hardlink_shiftup(hammer2_trans_t *trans, hammer2_cluster_t *cluster, * target. The name isn't used but to ease debugging give it * a name after its inode number. */ + hammer2_inode_modify(trans, ip); hammer2_cluster_modify(trans, cluster, 0); + nipdata = &hammer2_cluster_wdata(cluster)->ipdata; ksnprintf(nipdata->filename, sizeof(nipdata->filename), "0x%016jx", (intmax_t)nipdata->meta.inum); - nipdata->meta.name_len = strlen(nipdata->filename); - nipdata->meta.name_key = lhc; - nipdata->meta.nlinks += nlinks; + ip->meta.name_len = strlen(nipdata->filename); + ip->meta.name_key = lhc; + ip->meta.nlinks += nlinks; /* - * Resync ip->meta. Some fields have to be retained. + * Resync nipdata->meta from the local copy. */ - nipdata->meta.size = ip->meta.size; - nipdata->meta.mtime = ip->meta.mtime; - ip->meta = nipdata->meta; - + nipdata->meta = ip->meta; hammer2_cluster_modsync(cluster); } @@ -1051,24 +1071,21 @@ hammer2_inode_connect(hammer2_trans_t *trans, * We must fixup the name stored in the inode data. * The bref key has already been adjusted by inode_connect(). */ + hammer2_inode_modify(trans, ip); hammer2_cluster_modify(trans, ncluster, 0); wipdata = &hammer2_cluster_wdata(ncluster)->ipdata; KKASSERT(name_len < HAMMER2_INODE_MAXNAME); bcopy(name, wipdata->filename, name_len); - wipdata->meta.name_key = lhc; - wipdata->meta.name_len = name_len; - wipdata->meta.nlinks = 1; - hammer2_cluster_modsync(ncluster); + ip->meta.name_key = lhc; + ip->meta.name_len = name_len; + ip->meta.nlinks = 1; /* - * Resync the in-memory inode, some fields must be retained. + * Resync wipdata->meta from the local copy. 
*/ - if (ip) { /* XXX move_to_hidden passes NULL */ - wipdata->meta.size = ip->meta.size; - wipdata->meta.mtime = ip->meta.mtime; - ip->meta = wipdata->meta; - } + wipdata->meta = ip->meta; + hammer2_cluster_modsync(ncluster); } /* diff --git a/sys/vfs/hammer2/hammer2_thread.c b/sys/vfs/hammer2/hammer2_thread.c index 79c4bac532..011df0bc34 100644 --- a/sys/vfs/hammer2/hammer2_thread.c +++ b/sys/vfs/hammer2/hammer2_thread.c @@ -70,8 +70,14 @@ hammer2_thr_create(hammer2_thread_t *thr, hammer2_pfs_t *pmp, thr->pmp = pmp; thr->clindex = clindex; thr->repidx = repidx; - lwkt_create(func, thr, &thr->td, NULL, 0, -1, - "%s-%s", id, pmp->pfs_names[clindex]); + TAILQ_INIT(&thr->xopq); + if (repidx >= 0) { + lwkt_create(func, thr, &thr->td, NULL, 0, -1, + "%s-%s.%02d", id, pmp->pfs_names[clindex], repidx); + } else { + lwkt_create(func, thr, &thr->td, NULL, 0, -1, + "%s-%s", id, pmp->pfs_names[clindex]); + } } /* @@ -941,8 +947,396 @@ void hammer2_xop_group_init(hammer2_pfs_t *pmp, hammer2_xop_group_t *xgrp) { hammer2_mtx_init(&xgrp->mtx, "h2xopq"); - xgrp->xop_tailp = &xgrp->marker.next; - xgrp->marker.refs = 0x7FFFFFFF; +} + +/* + * Allocate a XOP request. + * + * Once allocated a XOP request can be started, collected, and retired, + * and can be retired early if desired. + * + * NOTE: Fifo indices might not be zero but ri == wi on objcache_get(). + */ +hammer2_xop_t * +hammer2_xop_alloc(hammer2_inode_t *ip, hammer2_xop_func_t func) +{ + hammer2_xop_t *xop; + + xop = objcache_get(cache_xops, M_WAITOK); + xop->head.ip = ip; + xop->head.func = func; + xop->head.state = 0; + xop->head.error = 0; + xop->head.lkey = 0; + xop->head.nkey = 0; + + xop->head.cluster.nchains = ip->cluster.nchains; + xop->head.cluster.pmp = ip->pmp; + xop->head.cluster.flags = HAMMER2_CLUSTER_LOCKED; + + /* + * run_mask - Active thread (or frontend) associated with XOP + */ + xop->head.run_mask = HAMMER2_XOPMASK_VOP; + + hammer2_inode_ref(ip); + + return xop; +} + +/* + * A mounted PFS needs Xops threads to support frontend operations. + */ +void +hammer2_xop_helper_create(hammer2_pfs_t *pmp) +{ + int i; + int j; + + kprintf("XOP_HELPER_CREATE: %d\n", pmp->pfs_nmasters); + for (i = 0; i < pmp->pfs_nmasters; ++i) { + for (j = 0; j < HAMMER2_XOPGROUPS; ++j) { + if (pmp->xop_groups[j].thrs[i].td) + continue; + hammer2_thr_create(&pmp->xop_groups[j].thrs[i], pmp, + "h2xop", i, j, + hammer2_primary_xops_thread); + } + } +} + +void +hammer2_xop_helper_cleanup(hammer2_pfs_t *pmp) +{ + int i; + int j; + + for (i = 0; i < pmp->pfs_nmasters; ++i) { + for (j = 0; j < HAMMER2_XOPGROUPS; ++j) { + if (pmp->xop_groups[j].thrs[i].td) + hammer2_thr_delete(&pmp->xop_groups[j].thrs[i]); + } + } +} + + + + +/* + * Start a XOP request, queueing it to all nodes in the cluster to + * execute the cluster op. + * + * XXX optimize single-target case. + */ +void +hammer2_xop_start(hammer2_xop_head_t *xop) +{ + hammer2_xop_group_t *xgrp; + hammer2_thread_t *thr; + hammer2_pfs_t *pmp; + int g; + int i; + + pmp = xop->ip->pmp; + + g = pmp->xop_iterator++; + g = g & HAMMER2_XOPGROUPS_MASK; + xgrp = &pmp->xop_groups[g]; + xop->xgrp = xgrp; + + for (i = 0; i < xop->ip->cluster.nchains; ++i) { + thr = &xgrp->thrs[i]; + if (thr->td) { + lockmgr(&thr->lk, LK_EXCLUSIVE); + if (thr->td && + (thr->flags & HAMMER2_THREAD_STOP) == 0) { + atomic_set_int(&xop->run_mask, 1U << i); + TAILQ_INSERT_TAIL(&thr->xopq, xop, + collect[i].entry); + } + lockmgr(&thr->lk, LK_RELEASE); + wakeup(&thr->flags); + } + } +} + +/* + * Retire a XOP. 
Used by both the VOP frontend and by the XOP backend. + */ +void +hammer2_xop_retire(hammer2_xop_head_t *xop, uint32_t mask) +{ + hammer2_xop_group_t *xgrp; + hammer2_chain_t *chain; + int i; + + xgrp = xop->xgrp; + + /* + * Remove the frontend or remove a backend feeder. When removing + * the frontend we must wakeup any backend feeders who are waiting + * for FIFO space. + * + * XXX optimize wakeup. + */ + KKASSERT(xop->run_mask & mask); + if (atomic_fetchadd_int(&xop->run_mask, -mask) != mask) { + if (mask == HAMMER2_XOPMASK_VOP) + wakeup(xop); + return; + } + + /* + * Cleanup the collection cluster. + */ + for (i = 0; i < xop->cluster.nchains; ++i) { + xop->cluster.array[i].flags = 0; + chain = xop->cluster.array[i].chain; + if (chain) { + xop->cluster.array[i].chain = NULL; + hammer2_chain_unlock(chain); + hammer2_chain_drop(chain); + } + } + + /* + * Cleanup the fifos, use check_counter to optimize the loop. + */ + mask = xop->chk_mask; + for (i = 0; mask && i < HAMMER2_MAXCLUSTER; ++i) { + hammer2_xop_fifo_t *fifo = &xop->collect[i]; + while (fifo->ri != fifo->wi) { + chain = fifo->array[fifo->ri & HAMMER2_XOPFIFO_MASK]; + if (chain) { + hammer2_chain_unlock(chain); + hammer2_chain_drop(chain); + } + ++fifo->ri; + if (fifo->wi - fifo->ri < HAMMER2_XOPFIFO / 2) + wakeup(xop); /* XXX optimize */ + } + mask &= ~(1U << i); + } + + /* + * The inode is only held at this point, simply drop it. + */ + if (xop->ip) { + hammer2_inode_drop(xop->ip); + xop->ip = NULL; + } + + objcache_put(cache_xops, xop); +} + +/* + * (Backend) Returns non-zero if the frontend is still attached. + */ +int +hammer2_xop_active(hammer2_xop_head_t *xop) +{ + if (xop->run_mask & HAMMER2_XOPMASK_VOP) + return 1; + else + return 0; +} + +/* + * (Backend) Feed chain data through the cluster validator and back to + * the frontend. Chains are fed from multiple nodes concurrently + * and pipelined via per-node FIFOs in the XOP. + * + * No xop lock is needed because we are only manipulating fields under + * our direct control. + * + * Returns 0 on success and a hammer error code if sync is permanently + * lost. + */ +int +hammer2_xop_feed(hammer2_xop_head_t *xop, hammer2_chain_t *chain, + int clindex, int error) +{ + hammer2_xop_fifo_t *fifo; + + /* + * Multi-threaded entry into the XOP collector. We own the + * fifo->wi for our clindex. + */ + fifo = &xop->collect[clindex]; + + while (fifo->ri == fifo->wi - HAMMER2_XOPFIFO) { + tsleep_interlock(xop, 0); + if (hammer2_xop_active(xop) == 0) { + error = EINTR; + goto done; + } + if (fifo->ri == fifo->wi - HAMMER2_XOPFIFO) { + tsleep(xop, PINTERLOCKED, "h2feed", hz*60); + } + } + if (chain) + hammer2_chain_ref(chain); + fifo->errors[fifo->wi & HAMMER2_XOPFIFO_MASK] = error; + fifo->array[fifo->wi & HAMMER2_XOPFIFO_MASK] = chain; + cpu_sfence(); + ++fifo->wi; + atomic_set_int(&xop->chk_mask, 1U << clindex); + atomic_add_int(&xop->check_counter, 1); + wakeup(&xop->check_counter); /* XXX optimize */ + error = 0; +done: + return error; +} + +/* + * (Frontend) collect a response from a running cluster op. + * + * Responses are fed from all appropriate nodes concurrently + * and collected into a cohesive response >= nkey. lkey is + * then set to nkey and nkey is advanced prior to return. + * The caller may depend on xop->lkey reflecting the current + * key of the returned response. + * + * The collector will return the instant quorum or other requirements + * are met, even if some nodes get behind or become non-responsive. 
+ *
+ * HAMMER2_XOP_COLLECT_NOWAIT	- Used to 'poll' a completed collection,
+ *				  usually called synchronously from the
+ *				  node XOPs for the strategy code to
+ *				  fake the frontend collection and complete
+ *				  the BIO as soon as possible.
+ *
+ * HAMMER2_XOP_SYNCHRONIZER	- Request synchronization with a particular
+ *				  cluster index, prevents looping when that
+ *				  index is out of sync so caller can act on
+ *				  the out of sync element. ESRCH and EDEADLK
+ *				  can be returned if this flag is specified.
+ *
+ * Returns 0 on success plus a filled out xop->cluster structure.
+ * Return ENOENT on normal termination.
+ * Otherwise return an error.
+ */
+int
+hammer2_xop_collect(hammer2_xop_head_t *xop)
+{
+	hammer2_xop_fifo_t *fifo;
+	hammer2_chain_t *chain;
+	hammer2_key_t lokey;
+	int error;
+	int keynull;
+	int adv;		/* advance the element */
+	int i;
+	uint32_t check_counter;
+
+loop:
+	/*
+	 * First loop tries to advance pieces of the cluster which
+	 * are out of sync.
+	 */
+	lokey = HAMMER2_KEY_MAX;
+	keynull = HAMMER2_CHECK_NULL;
+	check_counter = xop->check_counter;
+	cpu_lfence();
+
+	for (i = 0; i < xop->cluster.nchains; ++i) {
+		chain = xop->cluster.array[i].chain;
+		if (chain == NULL) {
+			adv = 1;
+		} else if (chain->bref.key < xop->nkey) {
+			adv = 1;
+		} else {
+			keynull &= ~HAMMER2_CHECK_NULL;
+			if (lokey > chain->bref.key)
+				lokey = chain->bref.key;
+			adv = 0;
+		}
+		if (adv == 0)
+			continue;
+
+		/*
+		 * Advance element if possible, advanced element may be NULL.
+		 */
+		if (chain) {
+			hammer2_chain_unlock(chain);
+			hammer2_chain_drop(chain);
+		}
+		fifo = &xop->collect[i];
+		if (fifo->ri != fifo->wi) {
+			cpu_lfence();
+			chain = fifo->array[fifo->ri & HAMMER2_XOPFIFO_MASK];
+			++fifo->ri;
+			xop->cluster.array[i].chain = chain;
+			if (chain == NULL) {
+				xop->cluster.array[i].flags |=
+							HAMMER2_CITEM_NULL;
+			}
+			if (fifo->wi - fifo->ri < HAMMER2_XOPFIFO / 2)
+				wakeup(xop);	/* XXX optimize */
+			--i;	/* loop on same index */
+		} else {
+			/*
+			 * Retain CITEM_NULL flag. If set just repeat EOF.
+			 * If not, the NULL,0 combination indicates an
+			 * operation in-progress.
+			 */
+			xop->cluster.array[i].chain = NULL;
+			/* retain any CITEM_NULL setting */
+		}
+	}
+
+	/*
+	 * Determine whether the lowest collected key meets clustering
+	 * requirements. Returns:
+	 *
+	 * 0	       - key valid, cluster can be returned.
+	 *
+	 * ENOENT      - normal end of scan, return ENOENT.
+	 *
+	 * ESRCH       - sufficient elements collected, quorum agreement
+	 *		 that lokey is not a valid element and should be
+	 *		 skipped.
+	 *
+	 * EDEADLK     - sufficient elements collected, no quorum agreement
+	 *		 (and no agreement possible). In this situation a
+	 *		 repair is needed; for now we loop.
+	 *
+	 * EINPROGRESS - insufficient elements collected to resolve, wait
+	 *		 for event and loop.
+	 */
+	error = hammer2_cluster_check(&xop->cluster, lokey, keynull);
+	if (error == EINPROGRESS) {
+		if (xop->check_counter == check_counter) {
+			tsleep_interlock(&xop->check_counter, 0);
+			cpu_lfence();
+			if (xop->check_counter == check_counter) {
+				tsleep(&xop->check_counter, PINTERLOCKED,
+					"h2coll", hz*60);
+			}
+		}
+		goto loop;
+	}
+	if (error == ESRCH) {
+		if (lokey != HAMMER2_KEY_MAX) {
+			xop->nkey = lokey + 1;
+			goto loop;
+		}
+		error = ENOENT;
+	}
+	if (error == EDEADLK) {
+		kprintf("hammer2: no quorum possible lkey %016jx\n",
+			lokey);
+		if (lokey != HAMMER2_KEY_MAX) {
+			xop->nkey = lokey + 1;
+			goto loop;
+		}
+		error = ENOENT;
+	}
+	if (lokey == HAMMER2_KEY_MAX)
+		xop->nkey = lokey;
+	else
+		xop->nkey = lokey + 1;
+
+	return error;
 }
 
 /*
@@ -959,13 +1353,13 @@
 hammer2_primary_xops_thread(void *arg)
 {
 	hammer2_thread_t *thr = arg;
 	hammer2_pfs_t *pmp;
-	hammer2_xop_t *xop;
-	hammer2_xop_t *prev;
+	hammer2_xop_head_t *xop;
 	hammer2_xop_group_t *xgrp;
+	uint32_t mask;
 
 	pmp = thr->pmp;
 	xgrp = &pmp->xop_groups[thr->repidx];
-	prev = &xgrp->marker;
+	mask = 1U << thr->clindex;
 
 	lockmgr(&thr->lk, LK_EXCLUSIVE);
 	while ((thr->flags & HAMMER2_THREAD_STOP) == 0) {
@@ -994,19 +1388,26 @@
 		}
 
 		/*
-		 * Process requests. All requests are persistent until the
-		 * last thread has processed it.
+		 * Process requests. Each request can be multi-queued.
+		 *
+		 * If we get behind and the frontend VOP is no longer active,
+		 * we retire the request without processing it. The callback
+		 * may also abort processing if the frontend VOP becomes
+		 * inactive.
 		 */
-		kprintf("xops_slave clindex %d\n", thr->clindex);
-
-		while ((xop = prev->next) != NULL) {
-			if (atomic_fetchadd_int(&prev->refs, -1) == 1) {
-				KKASSERT(prev == xgrp->marker.next);
-				xgrp->marker.next = xop;
-				objcache_put(cache_xops, prev);
+		while ((xop = TAILQ_FIRST(&thr->xopq)) != NULL) {
+			TAILQ_REMOVE(&thr->xopq, xop,
+				     collect[thr->clindex].entry);
+			if (hammer2_xop_active(xop)) {
+				lockmgr(&thr->lk, LK_RELEASE);
+				xop->func((hammer2_xop_t *)xop, thr->clindex);
+				hammer2_xop_retire(xop, mask);
+				lockmgr(&thr->lk, LK_EXCLUSIVE);
+			} else {
+				hammer2_xop_feed(xop, NULL, thr->clindex,
+						 ECONNABORTED);
+				hammer2_xop_retire(xop, mask);
 			}
-			xop->func(thr, xop);
-			prev = xop;
 		}
 
 		/*
@@ -1014,6 +1415,17 @@
 		 */
 		lksleep(&thr->flags, &thr->lk, 0, "h2idle", 0);
 	}
+
+	/*
+	 * Cleanup / termination
+	 */
+	while ((xop = TAILQ_FIRST(&thr->xopq)) != NULL) {
+		kprintf("hammer2_thread: aborting xop %p\n", xop->func);
+		TAILQ_REMOVE(&thr->xopq, xop,
+			     collect[thr->clindex].entry);
+		hammer2_xop_retire(xop, mask);
+	}
+
 	thr->td = NULL;
 	wakeup(thr);
 	lockmgr(&thr->lk, LK_RELEASE);
diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c
index 18c4ed876b..c008392f87 100644
--- a/sys/vfs/hammer2/hammer2_vfsops.c
+++ b/sys/vfs/hammer2/hammer2_vfsops.c
@@ -259,15 +259,30 @@ hammer2_vfs_init(struct vfsconf *conf)
 	margs_vop.objsize = sizeof(hammer2_xop_t);
 	margs_vop.mtype = M_HAMMER2;
 
+	/*
+	 * Note that for the XOPS cache we want backing store allocations
+	 * to use M_ZERO. This is not allowed in objcache_get() (to avoid
+	 * confusion), so use the backing store function that does it. This
+	 * means that initial XOPS objects are zeroed but REUSED objects are
+	 * not. So we are responsible for cleaning the object up sufficiently
+	 * for our needs before objcache_put()ing it back (typically just the
+	 * FIFO indices).
+ */ cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc, - 0, 1, NULL, NULL, NULL, objcache_malloc_alloc, - objcache_malloc_free, &margs_read); + 0, 1, NULL, NULL, NULL, + objcache_malloc_alloc, + objcache_malloc_free, + &margs_read); cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc, - 0, 1, NULL, NULL, NULL, objcache_malloc_alloc, - objcache_malloc_free, &margs_write); + 0, 1, NULL, NULL, NULL, + objcache_malloc_alloc, + objcache_malloc_free, + &margs_write); cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc, - 0, 1, NULL, NULL, NULL, objcache_malloc_alloc, - objcache_malloc_free, &margs_vop); + 0, 1, NULL, NULL, NULL, + objcache_malloc_alloc_zero, + objcache_malloc_free, + &margs_vop); lockinit(&hammer2_mntlk, "mntlk", 0, 0); @@ -484,22 +499,17 @@ hammer2_pfsalloc(hammer2_cluster_t *cluster, */ if (pmp->sync_thrs[i].td == NULL) { hammer2_thr_create(&pmp->sync_thrs[i], pmp, - "h2nod", i, 0, + "h2nod", i, -1, hammer2_primary_sync_thread); } - - /* - * Xops support threads - */ - for (j = 0; j < HAMMER2_XOPGROUPS; ++j) { - if (pmp->xop_groups[j].thrs[i].td) - continue; - hammer2_thr_create(&pmp->xop_groups[j].thrs[i], pmp, - "h2xop", i, j, - hammer2_primary_xops_thread); - } } + /* + * Create missing Xop threads + */ + if (pmp->mp) + hammer2_xop_helper_create(pmp); + hammer2_mtx_unlock(&iroot->lock); hammer2_inode_drop(iroot); done: @@ -1283,6 +1293,11 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags) } /* + * Cleanup the frontend support XOPS threads + */ + hammer2_xop_helper_cleanup(pmp); + + /* * Cleanup our reference on ihidden. */ if (pmp->ihidden) { @@ -1329,6 +1344,11 @@ hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp) kprintf("hammer2_mount hmp=%p ++mount_count=%d\n", rchain->hmp, rchain->hmp->mount_count); } + + /* + * Create missing Xop threads + */ + hammer2_xop_helper_create(pmp); } /* diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c index 8c5bc8ec8b..080bcd5f65 100644 --- a/sys/vfs/hammer2/hammer2_vnops.c +++ b/sys/vfs/hammer2/hammer2_vnops.c @@ -467,15 +467,10 @@ static int hammer2_vop_readdir(struct vop_readdir_args *ap) { - const hammer2_inode_data_t *ripdata; - hammer2_inode_t *ip; - hammer2_inode_t *xip; - hammer2_cluster_t *cparent; - hammer2_cluster_t *cluster; - hammer2_cluster_t *xcluster; + hammer2_xop_readdir_t *xop; hammer2_blockref_t bref; + hammer2_inode_t *ip; hammer2_tid_t inum; - hammer2_key_t key_next; hammer2_key_t lkey; struct uio *uio; off_t *cookies; @@ -483,6 +478,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap) int cookie_index; int ncookies; int error; + int eofflag; int dtype; int r; @@ -490,6 +486,8 @@ hammer2_vop_readdir(struct vop_readdir_args *ap) ip = VTOI(ap->a_vp); uio = ap->a_uio; saveoff = uio->uio_offset; + eofflag = 0; + error = 0; /* * Setup cookies directory entry cookies if requested @@ -505,11 +503,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap) } cookie_index = 0; - hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS | HAMMER2_RESOLVE_SHARED); - cparent = hammer2_inode_cluster(ip, HAMMER2_RESOLVE_ALWAYS | - HAMMER2_RESOLVE_SHARED); - - ripdata = &hammer2_cluster_rdata(cparent)->ipdata; + hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); /* * Handle artificial entries. To ensure that only positive 64 bit @@ -520,11 +514,8 @@ hammer2_vop_readdir(struct vop_readdir_args *ap) * Entry 0 is used for '.' and entry 1 is used for '..'. Do not * allow '..' to cross the mount point into (e.g.) the super-root. 
*/ - error = 0; - cluster = (void *)(intptr_t)-1; /* non-NULL for early goto done case */ - if (saveoff == 0) { - inum = ripdata->meta.inum & HAMMER2_DIRHASH_USERMSK; + inum = ip->meta.inum & HAMMER2_DIRHASH_USERMSK; r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, "."); if (r) goto done; @@ -542,33 +533,9 @@ hammer2_vop_readdir(struct vop_readdir_args *ap) * * (ip is the current dir. xip is the parent dir). */ - inum = ripdata->meta.inum & HAMMER2_DIRHASH_USERMSK; - while (ip->pip != NULL && ip != ip->pmp->iroot) { - xip = ip->pip; - hammer2_inode_ref(xip); - hammer2_inode_unlock(ip, cparent); - hammer2_inode_lock(xip, HAMMER2_RESOLVE_ALWAYS | - HAMMER2_RESOLVE_SHARED); - xcluster = hammer2_inode_cluster(xip, - HAMMER2_RESOLVE_ALWAYS | - HAMMER2_RESOLVE_SHARED); - - hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS | - HAMMER2_RESOLVE_SHARED); - cparent = hammer2_inode_cluster(ip, - HAMMER2_RESOLVE_ALWAYS | - HAMMER2_RESOLVE_SHARED); - hammer2_inode_drop(xip); - ripdata = &hammer2_cluster_rdata(cparent)->ipdata; - if (xip == ip->pip) { - inum = hammer2_cluster_rdata(xcluster)-> - ipdata.meta.inum & - HAMMER2_DIRHASH_USERMSK; - hammer2_inode_unlock(xip, xcluster); - break; - } - hammer2_inode_unlock(xip, xcluster); - } + inum = ip->meta.inum & HAMMER2_DIRHASH_USERMSK; + if (ip->pip && ip != ip->pmp->iroot) + inum = ip->pip->meta.inum & HAMMER2_DIRHASH_USERMSK; r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, ".."); if (r) goto done; @@ -583,31 +550,35 @@ hammer2_vop_readdir(struct vop_readdir_args *ap) lkey = saveoff | HAMMER2_DIRHASH_VISIBLE; if (hammer2_debug & 0x0020) kprintf("readdir: lkey %016jx\n", lkey); + if (error) + goto done; /* + * Use XOP for cluster scan. + * * parent is the inode cluster, already locked for us. Don't * double lock shared locks as this will screw up upgrades. */ - if (error) { - goto done; - } - cluster = hammer2_cluster_lookup(cparent, &key_next, lkey, lkey, - HAMMER2_LOOKUP_SHARED); - if (cluster == NULL) { - cluster = hammer2_cluster_lookup(cparent, &key_next, - lkey, (hammer2_key_t)-1, - HAMMER2_LOOKUP_SHARED); - } - if (cluster) - hammer2_cluster_bref(cluster, &bref); - while (cluster) { - if (hammer2_debug & 0x0020) - kprintf("readdir: p=%p chain=%p %016jx (next %016jx)\n", - cparent->focus, cluster->focus, - bref.key, key_next); + xop = &hammer2_xop_alloc(ip, hammer2_xop_readdir)->xop_readdir; + xop->head.lkey = lkey; + hammer2_xop_start(&xop->head); + for (;;) { + const hammer2_inode_data_t *ripdata; + + error = hammer2_xop_collect(&xop->head); + if (error) + break; + if (cookie_index == ncookies) + break; + if (hammer2_debug & 0x0020) + kprintf("cluster chain %p %p\n", + xop->head.cluster.focus, + (xop->head.cluster.focus ? + xop->head.cluster.focus->data : (void *)-1)); + ripdata = &hammer2_cluster_rdata(&xop->head.cluster)->ipdata; + hammer2_cluster_bref(&xop->head.cluster, &bref); if (bref.type == HAMMER2_BREF_TYPE_INODE) { - ripdata = &hammer2_cluster_rdata(cluster)->ipdata; dtype = hammer2_get_dtype(ripdata); saveoff = bref.key & HAMMER2_DIRHASH_USERMSK; r = vop_write_dirent(&error, uio, @@ -625,32 +596,19 @@ hammer2_vop_readdir(struct vop_readdir_args *ap) /* XXX chain error */ kprintf("bad chain type readdir %d\n", bref.type); } - - /* - * Keys may not be returned in order so once we have a - * placemarker (cluster) the scan must allow the full range - * or some entries will be missed. 
- */ - cluster = hammer2_cluster_next(cparent, cluster, &key_next, - key_next, (hammer2_key_t)-1, - HAMMER2_LOOKUP_SHARED); - if (cluster) { - hammer2_cluster_bref(cluster, &bref); - saveoff = (bref.key & HAMMER2_DIRHASH_USERMSK) + 1; - } else { - saveoff = (hammer2_key_t)-1; - } - if (cookie_index == ncookies) - break; } - if (cluster) { - hammer2_cluster_unlock(cluster); - hammer2_cluster_drop(cluster); + hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); + if (error == ENOENT) { + error = 0; + eofflag = 1; + saveoff = (hammer2_key_t)-1; + } else { + saveoff = bref.key & HAMMER2_DIRHASH_USERMSK; } done: - hammer2_inode_unlock(ip, cparent); + hammer2_inode_unlock(ip, NULL); if (ap->a_eofflag) - *ap->a_eofflag = (cluster == NULL); + *ap->a_eofflag = eofflag; if (hammer2_debug & 0x0020) kprintf("readdir: done at %016jx\n", saveoff); uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE; -- 2.11.4.GIT
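NOTE (illustrative only, not part of the patch): the per-node FIFO added
above (hammer2_xop_fifo_t) uses free-running read/write indices masked into
a power-of-two array, so wi - ri is always the element count and no explicit
wraparound handling is needed. A standalone sketch of that convention
follows; the xfifo names are hypothetical.

	#define XFIFO		16	/* power of two, cf. HAMMER2_XOPFIFO */
	#define XFIFO_MASK	(XFIFO - 1)

	struct xfifo {
		void	*array[XFIFO];
		int	ri;		/* advanced only by the consumer */
		int	wi;		/* advanced only by the producer */
	};

	static int
	xfifo_push(struct xfifo *f, void *elm)
	{
		if (f->wi - f->ri == XFIFO)
			return 0;	/* full; hammer2_xop_feed() sleeps */
		f->array[f->wi & XFIFO_MASK] = elm;
		cpu_sfence();		/* publish element before index */
		++f->wi;
		return 1;
	}

	static void *
	xfifo_pop(struct xfifo *f)
	{
		void *elm;

		if (f->ri == f->wi)
			return NULL;	/* empty */
		elm = f->array[f->ri & XFIFO_MASK];
		++f->ri;
		return elm;
	}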