hammer2 - Add kernel-thread-based async bulk free
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
blob735f50a8d0776a7121e595a22991c25d5d30ee27
1 /*
2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
53 * CLUSTER OPERATIONS
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
59 * - Most complex functions, quorum management on transaction ids.
61 * - Locking and data accesses must be internally asynchronous.
63 * - Validate and manage cache coherency primitives (cache state
64 * is stored in chain topologies but must be validated by these
65 * functions).
67 * (2) Lookups and Scans
68 * hammer2_cluster_lookup()
69 * hammer2_cluster_next()
71 * - Depend on locking & data retrieval functions, but still complex.
73 * - Must do quorum management on transaction ids.
75 * - Lookup and Iteration ops Must be internally asynchronous.
77 * (3) Modifying Operations
78 * hammer2_cluster_create()
80 * - Can usually punt on failures, operation continues unless quorum
81 * is lost. If quorum is lost, must wait for resynchronization
82 * (depending on the management mode).
84 * - Must disconnect node on failures (also not flush), remount, and
85 * resynchronize.
87 * - Network links (via kdmsg) are relatively easy to issue as the
88 * complex underworkings of hammer2_chain.c don't have to be messed
89 * with (the protocol is at a higher level than block-level).
91 * - Multiple local disk nodes (i.e. block devices) are another matter.
92 * Chain operations have to be dispatched to per-node threads (xN)
93 * because we can't asynchronize potentially very complex chain
94 * operations in hammer2_chain.c (it would be a huge mess).
96 * (these threads are also used to terminate incoming kdmsg ops from
97 * other machines).
99 * - Single-node filesystems do not use threads and will simply call
100 * hammer2_chain.c functions directly. This short-cut is handled
101 * at the base of each cluster function.
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
110 #include "hammer2.h"
113 * Returns the bref type of the cluster's foucs.
115 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116 * The cluster must be locked.
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
121 if (cluster->error == 0) {
122 KKASSERT(cluster->focus != NULL);
123 return(cluster->focus->bref.type);
125 return 0;
129 * Returns non-zero if the cluster's focus is flagged as being modified.
131 * If the cluster is errored, returns 0.
133 static
135 hammer2_cluster_modified(hammer2_cluster_t *cluster)
137 if (cluster->error == 0) {
138 KKASSERT(cluster->focus != NULL);
139 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
141 return 0;
145 * Returns the bref of the cluster's focus, sans any data-offset information
146 * (since offset information is per-node and wouldn't be useful).
148 * Callers use this function to access modify_tid, mirror_tid, type,
149 * key, and keybits.
151 * If the cluster is errored, returns an empty bref.
152 * The cluster must be locked.
154 void
155 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
157 if (cluster->error == 0) {
158 KKASSERT(cluster->focus != NULL);
159 *bref = cluster->focus->bref;
160 bref->data_off = 0;
161 } else {
162 bzero(bref, sizeof(*bref));
167 * Create a degenerate cluster with one ref from a single locked chain.
168 * The returned cluster will be focused on the chain and inherit its
169 * error state.
171 * The chain's lock and reference are transfered to the new cluster, so
172 * the caller should not try to unlock the chain separately.
174 * We fake the flags.
176 hammer2_cluster_t *
177 hammer2_cluster_from_chain(hammer2_chain_t *chain)
179 hammer2_cluster_t *cluster;
181 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
182 cluster->array[0].chain = chain;
183 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
184 cluster->nchains = 1;
185 cluster->focus = chain;
186 cluster->focus_index = 0;
187 cluster->pmp = chain->pmp;
188 cluster->refs = 1;
189 cluster->error = chain->error;
190 cluster->flags = HAMMER2_CLUSTER_LOCKED |
191 HAMMER2_CLUSTER_WRHARD |
192 HAMMER2_CLUSTER_RDHARD |
193 HAMMER2_CLUSTER_MSYNCED |
194 HAMMER2_CLUSTER_SSYNCED;
196 return cluster;
200 * Add a reference to a cluster and its underlying chains.
202 * We must also ref the underlying chains in order to allow ref/unlock
203 * sequences to later re-lock.
205 void
206 hammer2_cluster_ref(hammer2_cluster_t *cluster)
208 atomic_add_int(&cluster->refs, 1);
212 * Drop the caller's reference to the cluster. When the ref count drops to
213 * zero this function frees the cluster and drops all underlying chains.
215 * In-progress read I/Os are typically detached from the cluster once the
216 * first one returns (the remaining stay attached to the DIOs but are then
217 * ignored and drop naturally).
219 void
220 hammer2_cluster_drop(hammer2_cluster_t *cluster)
222 hammer2_chain_t *chain;
223 int i;
225 KKASSERT(cluster->refs > 0);
226 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
227 cluster->focus = NULL; /* safety XXX chg to assert */
228 cluster->focus_index = 0;
230 for (i = 0; i < cluster->nchains; ++i) {
231 chain = cluster->array[i].chain;
232 if (chain) {
233 hammer2_chain_drop(chain);
234 cluster->array[i].chain = NULL; /* safety */
237 cluster->nchains = 0; /* safety */
239 kfree(cluster, M_HAMMER2);
240 /* cluster is invalid */
245 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
247 * WARNING! This function expects the caller to handle resolution of the
248 * cluster. We never re-resolve the cluster in this function,
249 * because it might be used to temporarily unlock/relock a cparent
250 * in an iteration or recursrion, and the cparents elements do not
251 * necessarily match.
253 void
254 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
256 hammer2_chain_t *chain;
257 int i;
259 /* cannot be on inode-embedded cluster template, must be on copy */
260 KKASSERT(cluster->refs > 0);
261 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
262 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
263 panic("hammer2_cluster_lock: cluster %p already locked!\n",
264 cluster);
266 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
269 * Lock chains and resolve state.
271 for (i = 0; i < cluster->nchains; ++i) {
272 chain = cluster->array[i].chain;
273 if (chain == NULL)
274 continue;
275 hammer2_chain_lock(chain, how);
280 * Calculate the clustering state for the cluster and set its focus.
281 * This routine must be called with care. For example, it should not
282 * normally be called after relocking a non-leaf cluster because parent
283 * clusters help iterations and each element might be at a slightly different
284 * indirect node (each node's topology is independently indexed).
286 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
287 * operations. Typically this is only set on a quorum of MASTERs or
288 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
289 * is present, this bit is *not* set on a quorum of MASTERs. The
290 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
291 * that create/modify/delete elements use it.
293 * The chains making up the cluster may be narrowed down based on quorum
294 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
295 * narrowed down to a single chain as long as the entire subtopology is known
296 * to be intact. So, for example, we can narrow a read-only op to a single
297 * fast SLAVE but if we focus a CACHE chain we must still retain at least
298 * a SLAVE to ensure that the subtopology can be accessed.
300 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
301 * to be maintained once the topology is validated as-of the top level of
302 * the operation.
304 * If a failure occurs the operation must be aborted by higher-level code and
305 * retried. XXX
307 void
308 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
310 hammer2_chain_t *chain;
311 hammer2_chain_t *focus;
312 hammer2_pfs_t *pmp;
313 hammer2_tid_t quorum_tid;
314 hammer2_tid_t last_best_quorum_tid;
315 int focus_pfs_type;
316 uint32_t nflags;
317 int ttlmasters;
318 int ttlslaves;
319 int nmasters;
320 int nslaves;
321 int nquorum;
322 int smpresent;
323 int i;
325 cluster->error = 0;
326 cluster->focus = NULL;
328 focus_pfs_type = 0;
329 nflags = 0;
330 ttlmasters = 0;
331 ttlslaves = 0;
332 nmasters = 0;
333 nslaves = 0;
336 * Calculate quorum
338 pmp = cluster->pmp;
339 KKASSERT(pmp != NULL || cluster->nchains == 0);
340 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
341 smpresent = 0;
344 * Pass 1
346 * NOTE: A NULL chain is not necessarily an error, it could be
347 * e.g. a lookup failure or the end of an iteration.
348 * Process normally.
350 for (i = 0; i < cluster->nchains; ++i) {
351 chain = cluster->array[i].chain;
352 if (chain && chain->error) {
353 if (cluster->focus == NULL || cluster->focus == chain) {
354 /* error will be overridden by valid focus */
355 cluster->error = chain->error;
359 * Must count total masters and slaves whether the
360 * chain is errored or not.
362 switch (cluster->pmp->pfs_types[i]) {
363 case HAMMER2_PFSTYPE_SUPROOT:
364 case HAMMER2_PFSTYPE_MASTER:
365 ++ttlmasters;
366 break;
367 case HAMMER2_PFSTYPE_SLAVE:
368 ++ttlslaves;
369 break;
371 continue;
373 switch (cluster->pmp->pfs_types[i]) {
374 case HAMMER2_PFSTYPE_MASTER:
375 ++ttlmasters;
376 break;
377 case HAMMER2_PFSTYPE_SLAVE:
378 ++ttlslaves;
379 break;
380 case HAMMER2_PFSTYPE_SOFT_MASTER:
381 nflags |= HAMMER2_CLUSTER_WRSOFT;
382 nflags |= HAMMER2_CLUSTER_RDSOFT;
383 smpresent = 1;
384 break;
385 case HAMMER2_PFSTYPE_SOFT_SLAVE:
386 nflags |= HAMMER2_CLUSTER_RDSOFT;
387 break;
388 case HAMMER2_PFSTYPE_SUPROOT:
390 * Degenerate cluster representing the super-root
391 * topology on a single device. Fake stuff so
392 * cluster ops work as expected.
394 nflags |= HAMMER2_CLUSTER_WRHARD;
395 nflags |= HAMMER2_CLUSTER_RDHARD;
396 cluster->focus_index = i;
397 cluster->focus = chain;
398 cluster->error = chain ? chain->error : 0;
399 ++ttlmasters;
400 break;
401 default:
402 break;
407 * Pass 2
409 * Resolve masters. Calculate nmasters for the highest matching
410 * TID, if a quorum cannot be attained try the next lower matching
411 * TID until we exhaust TIDs.
413 * NOTE: A NULL chain is not necessarily an error, it could be
414 * e.g. a lookup failure or the end of an iteration.
415 * Process normally.
417 last_best_quorum_tid = HAMMER2_TID_MAX;
418 quorum_tid = 0; /* fix gcc warning */
420 while (nmasters < nquorum && last_best_quorum_tid != 0) {
421 nmasters = 0;
422 quorum_tid = 0;
424 for (i = 0; i < cluster->nchains; ++i) {
425 switch (cluster->pmp->pfs_types[i]) {
426 case HAMMER2_PFSTYPE_SUPROOT:
427 case HAMMER2_PFSTYPE_MASTER:
428 break;
429 default:
430 continue;
432 chain = cluster->array[i].chain;
434 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
436 * Invalid as in unsynchronized, cannot be
437 * used to calculate the quorum.
439 } else if (chain == NULL && quorum_tid == 0) {
441 * NULL chain on master matches NULL chains
442 * on other masters.
444 ++nmasters;
445 } else if (quorum_tid < last_best_quorum_tid &&
446 chain != NULL &&
447 (quorum_tid < chain->bref.modify_tid ||
448 nmasters == 0)) {
450 * Better TID located, reset nmasters count.
452 nmasters = 1;
453 quorum_tid = chain->bref.modify_tid;
454 } else if (chain &&
455 quorum_tid == chain->bref.modify_tid) {
457 * TID matches current collection.
459 ++nmasters;
462 if (nmasters >= nquorum)
463 break;
464 last_best_quorum_tid = quorum_tid;
468 * Pass 3
470 * NOTE: A NULL chain is not necessarily an error, it could be
471 * e.g. a lookup failure or the end of an iteration.
472 * Process normally.
474 for (i = 0; i < cluster->nchains; ++i) {
475 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
476 chain = cluster->array[i].chain;
477 if (chain && chain->error) {
478 if (cluster->focus == NULL || cluster->focus == chain) {
479 /* error will be overridden by valid focus */
480 cluster->error = chain->error;
482 continue;
485 switch (cluster->pmp->pfs_types[i]) {
486 case HAMMER2_PFSTYPE_MASTER:
488 * We must have enough up-to-date masters to reach
489 * a quorum and the master modify_tid must match
490 * the quorum's modify_tid.
492 * Do not select an errored or out-of-sync master.
494 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
495 nflags |= HAMMER2_CLUSTER_UNHARD;
496 } else if (nmasters >= nquorum &&
497 (chain == NULL || chain->error == 0) &&
498 ((chain == NULL && quorum_tid == 0) ||
499 (chain != NULL && quorum_tid ==
500 chain->bref.modify_tid))) {
501 nflags |= HAMMER2_CLUSTER_WRHARD;
502 nflags |= HAMMER2_CLUSTER_RDHARD;
503 if (!smpresent) {
504 cluster->array[i].flags |=
505 HAMMER2_CITEM_FEMOD;
507 if (cluster->focus == NULL ||
508 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
509 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
510 cluster->focus_index = i;
511 cluster->focus = chain; /* NULL ok */
512 cluster->error = chain ? chain->error :
515 } else if (chain == NULL || chain->error == 0) {
516 nflags |= HAMMER2_CLUSTER_UNHARD;
518 break;
519 case HAMMER2_PFSTYPE_SLAVE:
521 * We must have enough up-to-date masters to reach
522 * a quorum and the slave modify_tid must match the
523 * quorum's modify_tid.
525 * Do not select an errored slave.
527 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
528 nflags |= HAMMER2_CLUSTER_UNHARD;
529 } else if (nmasters >= nquorum &&
530 (chain == NULL || chain->error == 0) &&
531 ((chain == NULL && quorum_tid == 0) ||
532 (chain && quorum_tid ==
533 chain->bref.modify_tid))) {
534 ++nslaves;
535 nflags |= HAMMER2_CLUSTER_RDHARD;
536 #if 0
537 /* XXX optimize for RESOLVE_RDONLY */
538 if (cluster->focus == NULL) {
539 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
540 cluster->focus_index = i;
541 cluster->focus = chain; /* NULL ok */
542 cluster->error = chain ? chain->error :
545 #endif
546 } else if (chain == NULL || chain->error == 0) {
547 nflags |= HAMMER2_CLUSTER_UNSOFT;
549 break;
550 case HAMMER2_PFSTYPE_SOFT_MASTER:
552 * Directly mounted soft master always wins. There
553 * should be only one.
555 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
556 cluster->focus_index = i;
557 cluster->focus = chain;
558 cluster->error = chain ? chain->error : 0;
559 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
560 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
561 break;
562 case HAMMER2_PFSTYPE_SOFT_SLAVE:
564 * Directly mounted soft slave always wins. There
565 * should be only one.
567 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
568 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
569 cluster->focus_index = i;
570 cluster->focus = chain;
571 cluster->error = chain ? chain->error : 0;
572 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
574 break;
575 case HAMMER2_PFSTYPE_SUPROOT:
577 * spmp (degenerate case)
579 KKASSERT(i == 0);
580 cluster->focus_index = i;
581 cluster->focus = chain;
582 cluster->error = chain ? chain->error : 0;
583 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
584 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
585 break;
586 default:
587 break;
592 * Focus now set, adjust ddflag. Skip this pass if the focus
593 * is bad or if we are at the PFS root (the bref won't match at
594 * the PFS root, obviously).
596 focus = cluster->focus;
597 if (focus) {
598 cluster->ddflag =
599 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
600 } else {
601 cluster->ddflag = 0;
602 goto skip4;
604 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
605 goto skip4;
608 * Pass 4
610 * Validate the elements that were not marked invalid. They should
611 * match.
613 for (i = 0; i < cluster->nchains; ++i) {
614 int ddflag;
616 chain = cluster->array[i].chain;
618 if (chain == NULL)
619 continue;
620 if (chain == focus)
621 continue;
622 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
623 continue;
625 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
626 if (chain->bref.type != focus->bref.type ||
627 chain->bref.key != focus->bref.key ||
628 chain->bref.keybits != focus->bref.keybits ||
629 chain->bref.modify_tid != focus->bref.modify_tid ||
630 chain->bytes != focus->bytes ||
631 ddflag != cluster->ddflag) {
632 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
633 if (hammer2_debug & 1)
634 kprintf("cluster_resolve: matching modify_tid failed "
635 "bref test: idx=%d type=%02x/%02x "
636 "key=%016jx/%d-%016jx/%d "
637 "mod=%016jx/%016jx bytes=%u/%u\n",
639 chain->bref.type, focus->bref.type,
640 chain->bref.key, chain->bref.keybits,
641 focus->bref.key, focus->bref.keybits,
642 chain->bref.modify_tid, focus->bref.modify_tid,
643 chain->bytes, focus->bytes);
644 if (hammer2_debug & 0x4000)
645 panic("cluster_resolve");
646 /* flag issue and force resync? */
649 skip4:
651 if (ttlslaves == 0)
652 nflags |= HAMMER2_CLUSTER_NOSOFT;
653 if (ttlmasters == 0)
654 nflags |= HAMMER2_CLUSTER_NOHARD;
657 * Set SSYNCED or MSYNCED for slaves and masters respectively if
658 * all available nodes (even if 0 are available) are fully
659 * synchronized. This is used by the synchronization thread to
660 * determine if there is work it could potentially accomplish.
662 if (nslaves == ttlslaves)
663 nflags |= HAMMER2_CLUSTER_SSYNCED;
664 if (nmasters == ttlmasters)
665 nflags |= HAMMER2_CLUSTER_MSYNCED;
668 * Determine if the cluster was successfully locked for the
669 * requested operation and generate an error code. The cluster
670 * will not be locked (or ref'd) if an error is returned.
672 atomic_set_int(&cluster->flags, nflags);
673 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
677 * This is used by the XOPS subsystem to calculate the state of
678 * the collection and tell hammer2_xop_collect() what to do with it.
679 * The collection can be in various states of desynchronization, the
680 * caller specifically wants to resolve the passed-in key.
682 * Return values:
683 * 0 - Quorum agreement, key is valid
685 * ENOENT - Quorum agreement, end of scan
687 * ESRCH - Quorum agreement, key is INVALID (caller should
688 * skip key).
690 * EIO - Quorum agreement but all elements had errors.
692 * EDEADLK - No quorum agreement possible for key, a repair
693 * may be needed. Caller has to decide what to do,
694 * possibly iterating the key or generating an EIO.
696 * EINPROGRESS - No quorum agreement yet, but agreement is still
697 * possible if caller waits for more responses. Caller
698 * should not iterate key.
700 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
702 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
705 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
707 hammer2_chain_t *chain;
708 hammer2_chain_t *focus;
709 hammer2_pfs_t *pmp;
710 hammer2_tid_t quorum_tid;
711 hammer2_tid_t last_best_quorum_tid;
712 uint32_t nflags;
713 int ttlmasters;
714 int ttlslaves;
715 int nmasters;
716 int nmasters_keymatch;
717 int nslaves;
718 int nquorum;
719 int umasters; /* unknown masters (still in progress) */
720 int smpresent;
721 int error;
722 int i;
724 cluster->error = 0;
725 cluster->focus = NULL;
727 pmp = cluster->pmp;
728 KKASSERT(pmp != NULL || cluster->nchains == 0);
731 * Calculate quorum
733 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
734 smpresent = 0;
735 nflags = 0;
736 ttlmasters = 0;
737 ttlslaves = 0;
740 * Pass 1
742 * NOTE: A NULL chain is not necessarily an error, it could be
743 * e.g. a lookup failure or the end of an iteration.
744 * Process normally.
746 for (i = 0; i < cluster->nchains; ++i) {
747 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
748 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
750 chain = cluster->array[i].chain;
751 error = cluster->array[i].error;
752 if (chain && error) {
753 if (cluster->focus == NULL || cluster->focus == chain) {
754 /* error will be overridden by valid focus */
755 /* XXX */
759 * Must count total masters and slaves whether the
760 * chain is errored or not.
762 switch (cluster->pmp->pfs_types[i]) {
763 case HAMMER2_PFSTYPE_SUPROOT:
764 case HAMMER2_PFSTYPE_MASTER:
765 ++ttlmasters;
766 break;
767 case HAMMER2_PFSTYPE_SLAVE:
768 ++ttlslaves;
769 break;
771 continue;
773 switch (cluster->pmp->pfs_types[i]) {
774 case HAMMER2_PFSTYPE_MASTER:
775 ++ttlmasters;
776 break;
777 case HAMMER2_PFSTYPE_SLAVE:
778 ++ttlslaves;
779 break;
780 case HAMMER2_PFSTYPE_SOFT_MASTER:
781 nflags |= HAMMER2_CLUSTER_WRSOFT;
782 nflags |= HAMMER2_CLUSTER_RDSOFT;
783 smpresent = 1;
784 break;
785 case HAMMER2_PFSTYPE_SOFT_SLAVE:
786 nflags |= HAMMER2_CLUSTER_RDSOFT;
787 break;
788 case HAMMER2_PFSTYPE_SUPROOT:
790 * Degenerate cluster representing the super-root
791 * topology on a single device. Fake stuff so
792 * cluster ops work as expected.
794 ++ttlmasters;
795 nflags |= HAMMER2_CLUSTER_WRHARD;
796 nflags |= HAMMER2_CLUSTER_RDHARD;
797 cluster->focus_index = i;
798 cluster->focus = chain;
799 cluster->error = error;
800 break;
801 default:
802 break;
807 * Pass 2
809 * Resolve nmasters - master nodes fully match
811 * Resolve umasters - master nodes operation still
812 * in progress
814 * Resolve nmasters_keymatch - master nodes match the passed-in
815 * key and may or may not match
816 * the quorum-agreed tid.
818 * The quorum-agreed TID is the highest matching TID.
820 last_best_quorum_tid = HAMMER2_TID_MAX;
821 umasters = 0;
822 nmasters = 0;
823 nmasters_keymatch = 0;
824 quorum_tid = 0; /* fix gcc warning */
826 while (nmasters < nquorum && last_best_quorum_tid != 0) {
827 umasters = 0;
828 nmasters = 0;
829 nmasters_keymatch = 0;
830 quorum_tid = 0;
832 for (i = 0; i < cluster->nchains; ++i) {
833 /* XXX SOFT smpresent handling */
834 switch(cluster->pmp->pfs_types[i]) {
835 case HAMMER2_PFSTYPE_MASTER:
836 case HAMMER2_PFSTYPE_SUPROOT:
837 break;
838 default:
839 continue;
842 chain = cluster->array[i].chain;
843 error = cluster->array[i].error;
846 * Skip elements still in progress. umasters keeps
847 * track of masters that might still be in-progress.
849 if (chain == NULL && (cluster->array[i].flags &
850 HAMMER2_CITEM_NULL) == 0) {
851 ++umasters;
852 continue;
856 * Key match?
858 if (flags & HAMMER2_CHECK_NULL) {
859 if (chain == NULL) {
860 ++nmasters;
861 ++nmasters_keymatch;
862 if (cluster->error == 0)
863 cluster->error = error;
865 } else if (chain &&
866 (key == (hammer2_key_t)-1 ||
867 chain->bref.key == key)) {
868 ++nmasters_keymatch;
870 if (chain->bref.modify_tid <
871 last_best_quorum_tid &&
872 quorum_tid < chain->bref.modify_tid) {
874 * Select new TID as master if better
875 * than any found so far in this loop,
876 * as long as it does not reach the
877 * best tid found in the previous loop.
879 nmasters = 0;
880 quorum_tid = chain->bref.modify_tid;
882 if (quorum_tid == chain->bref.modify_tid) {
884 * TID matches current collection.
886 * (error handled in next pass)
888 ++nmasters;
889 if (chain->error == 0) {
890 cluster->focus = chain;
891 cluster->focus_index = i;
896 if (nmasters >= nquorum)
897 break;
898 last_best_quorum_tid = quorum_tid;
902 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
903 nmasters, nquorum, nmasters_keymatch, umasters);
907 * Early return if we do not have enough masters.
909 if (nmasters < nquorum) {
910 if (nmasters + umasters >= nquorum)
911 return EINPROGRESS;
912 if (nmasters_keymatch < nquorum)
913 return ESRCH;
914 return EDEADLK;
918 * Validated end of scan.
920 if (flags & HAMMER2_CHECK_NULL) {
921 if (cluster->error == 0)
922 cluster->error = ENOENT;
923 return cluster->error;
927 * If we have a NULL focus at this point the agreeing quorum all
928 * had chain errors.
930 if (cluster->focus == NULL)
931 return EIO;
934 * Pass 3
936 * We have quorum agreement, validate elements, not end of scan.
938 nslaves = 0;
939 cluster->error = 0;
941 for (i = 0; i < cluster->nchains; ++i) {
942 chain = cluster->array[i].chain;
943 error = cluster->array[i].error;
944 if (chain == NULL ||
945 chain->bref.key != key ||
946 chain->bref.modify_tid != quorum_tid) {
947 continue;
951 * Quorum Match
953 * XXX for now, cumulative error.
955 if (cluster->error == 0)
956 cluster->error = error;
958 switch (cluster->pmp->pfs_types[i]) {
959 case HAMMER2_PFSTYPE_MASTER:
960 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
961 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
962 nflags |= HAMMER2_CLUSTER_WRHARD;
963 nflags |= HAMMER2_CLUSTER_RDHARD;
964 break;
965 case HAMMER2_PFSTYPE_SLAVE:
967 * We must have enough up-to-date masters to reach
968 * a quorum and the slave modify_tid must match the
969 * quorum's modify_tid.
971 * Do not select an errored slave.
973 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
974 nflags |= HAMMER2_CLUSTER_RDHARD;
975 ++nslaves;
976 break;
977 case HAMMER2_PFSTYPE_SOFT_MASTER:
979 * Directly mounted soft master always wins. There
980 * should be only one.
982 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
983 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
984 break;
985 case HAMMER2_PFSTYPE_SOFT_SLAVE:
987 * Directly mounted soft slave always wins. There
988 * should be only one.
990 * XXX
992 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
993 break;
994 case HAMMER2_PFSTYPE_SUPROOT:
996 * spmp (degenerate case)
998 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
999 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1000 nflags |= HAMMER2_CLUSTER_WRHARD;
1001 nflags |= HAMMER2_CLUSTER_RDHARD;
1002 break;
1003 default:
1004 break;
1009 * Focus now set, adjust ddflag. Skip this pass if the focus
1010 * is bad or if we are at the PFS root (the bref won't match at
1011 * the PFS root, obviously).
1013 focus = cluster->focus;
1014 if (focus) {
1015 cluster->ddflag =
1016 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1017 } else {
1018 cluster->ddflag = 0;
1019 goto skip4;
1021 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1022 goto skip4;
1025 * Pass 4
1027 * Validate the elements that were not marked invalid. They should
1028 * match.
1030 for (i = 0; i < cluster->nchains; ++i) {
1031 int ddflag;
1033 chain = cluster->array[i].chain;
1035 if (chain == NULL)
1036 continue;
1037 if (chain == focus)
1038 continue;
1039 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1040 continue;
1042 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1043 if (chain->bref.type != focus->bref.type ||
1044 chain->bref.key != focus->bref.key ||
1045 chain->bref.keybits != focus->bref.keybits ||
1046 chain->bref.modify_tid != focus->bref.modify_tid ||
1047 chain->bytes != focus->bytes ||
1048 ddflag != cluster->ddflag) {
1049 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1050 if (hammer2_debug & 1)
1051 kprintf("cluster_resolve: matching modify_tid failed "
1052 "bref test: idx=%d type=%02x/%02x "
1053 "key=%016jx/%d-%016jx/%d "
1054 "mod=%016jx/%016jx bytes=%u/%u\n",
1056 chain->bref.type, focus->bref.type,
1057 chain->bref.key, chain->bref.keybits,
1058 focus->bref.key, focus->bref.keybits,
1059 chain->bref.modify_tid, focus->bref.modify_tid,
1060 chain->bytes, focus->bytes);
1061 if (hammer2_debug & 0x4000)
1062 panic("cluster_resolve");
1063 /* flag issue and force resync? */
1066 skip4:
1068 if (ttlslaves == 0)
1069 nflags |= HAMMER2_CLUSTER_NOSOFT;
1070 if (ttlmasters == 0)
1071 nflags |= HAMMER2_CLUSTER_NOHARD;
1074 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1075 * all available nodes (even if 0 are available) are fully
1076 * synchronized. This is used by the synchronization thread to
1077 * determine if there is work it could potentially accomplish.
1079 if (nslaves == ttlslaves)
1080 nflags |= HAMMER2_CLUSTER_SSYNCED;
1081 if (nmasters == ttlmasters)
1082 nflags |= HAMMER2_CLUSTER_MSYNCED;
1085 * Determine if the cluster was successfully locked for the
1086 * requested operation and generate an error code. The cluster
1087 * will not be locked (or ref'd) if an error is returned.
1089 atomic_set_int(&cluster->flags, nflags);
1090 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1092 return cluster->error;
1096 * This is used by the sync thread to force non-NULL elements of a copy
1097 * of the pmp->iroot cluster to be good which is required to prime the
1098 * sync.
1100 void
1101 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1103 int i;
1105 for (i = 0; i < cluster->nchains; ++i) {
1106 if (cluster->array[i].chain)
1107 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1112 * Unlock a cluster. Refcount and focus is maintained.
1114 void
1115 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1117 hammer2_chain_t *chain;
1118 int i;
1120 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1121 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1122 cluster);
1124 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1125 KKASSERT(cluster->refs > 0);
1126 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1128 for (i = 0; i < cluster->nchains; ++i) {
1129 chain = cluster->array[i].chain;
1130 if (chain)
1131 hammer2_chain_unlock(chain);
1135 /************************************************************************
1136 * CLUSTER I/O *
1137 ************************************************************************
1140 * WARNING! blockref[] array data is not universal. These functions should
1141 * only be used to access universal data.
1143 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1144 * complete if necessary. The I/O's should have already been
1145 * initiated by the cluster_lock/chain_lock operation.
1147 * The cluster must already be in a modified state before wdata
1148 * is called. The data will already be available for this case.
1150 const hammer2_media_data_t *
1151 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1153 KKASSERT(cluster->focus != NULL);
1154 return(cluster->focus->data);
1157 hammer2_media_data_t *
1158 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1160 KKASSERT(cluster->focus != NULL);
1161 KKASSERT(hammer2_cluster_modified(cluster));
1162 return(cluster->focus->data);