2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
59 * - Most complex functions, quorum management on transaction ids.
61 * - Locking and data accesses must be internally asynchronous.
63 * - Validate and manage cache coherency primitives (cache state
64 * is stored in chain topologies but must be validated by these
67 * (2) Lookups and Scans
68 * hammer2_cluster_lookup()
69 * hammer2_cluster_next()
71 * - Depend on locking & data retrieval functions, but still complex.
73 * - Must do quorum management on transaction ids.
75 * - Lookup and iteration ops must be internally asynchronous.
77 * (3) Modifying Operations
78 * hammer2_cluster_create()
80 * - Can usually punt on failures, operation continues unless quorum
81 * is lost. If quorum is lost, must wait for resynchronization
82 * (depending on the management mode).
84 * - Must disconnect node on failures (also not flush), remount, and
87 * - Network links (via kdmsg) are relatively easy to issue as the
88 * complex underworkings of hammer2_chain.c don't have to be messed
89 * with (the protocol is at a higher level than block-level).
91 * - Multiple local disk nodes (i.e. block devices) are another matter.
92 * Chain operations have to be dispatched to per-node threads (xN)
93 * because we can't asynchronize potentially very complex chain
94 * operations in hammer2_chain.c (it would be a huge mess).
96 * (these threads are also used to terminate incoming kdmsg ops from
99 * - Single-node filesystems do not use threads and will simply call
100 * hammer2_chain.c functions directly. This short-cut is handled
101 * at the base of each cluster function.
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
113 * Returns the bref type of the cluster's focus.
115 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116 * The cluster must be locked.
119 hammer2_cluster_type(hammer2_cluster_t
*cluster
)
121 if (cluster
->error
== 0) {
122 KKASSERT(cluster
->focus
!= NULL
);
123 return(cluster
->focus
->bref
.type
);
130 * Returns non-zero if the cluster's focus is flagged as being modified.
132 * If the cluster is errored, returns 0.
136 hammer2_cluster_modified(hammer2_cluster_t
*cluster
)
138 if (cluster
->error
== 0) {
139 KKASSERT(cluster
->focus
!= NULL
);
140 return((cluster
->focus
->flags
& HAMMER2_CHAIN_MODIFIED
) != 0);
147 * Returns the bref of the cluster's focus, sans any data-offset information
148 * (since offset information is per-node and wouldn't be useful).
150 * Callers use this function to access modify_tid, mirror_tid, type,
153 * If the cluster is errored, returns an empty bref.
154 * The cluster must be locked.
157 hammer2_cluster_bref(hammer2_cluster_t
*cluster
, hammer2_blockref_t
*bref
)
159 if (cluster
->error
== 0) {
160 KKASSERT(cluster
->focus
!= NULL
);
161 *bref
= cluster
->focus
->bref
;
164 bzero(bref
, sizeof(*bref
));
169 * Create a degenerate cluster with one ref from a single locked chain.
170 * The returned cluster will be focused on the chain and inherit its
173 * The chain's lock and reference are transferred to the new cluster, so
174 * the caller should not try to unlock the chain separately.
179 hammer2_cluster_from_chain(hammer2_chain_t
*chain
)
181 hammer2_cluster_t
*cluster
;
183 cluster
= kmalloc(sizeof(*cluster
), M_HAMMER2
, M_WAITOK
| M_ZERO
);
184 cluster
->array
[0].chain
= chain
;
185 cluster
->array
[0].flags
= HAMMER2_CITEM_FEMOD
;
186 cluster
->nchains
= 1;
187 cluster
->focus
= chain
;
188 cluster
->focus_index
= 0;
189 cluster
->pmp
= chain
->pmp
;
191 cluster
->error
= chain
->error
;
192 cluster
->flags
= HAMMER2_CLUSTER_LOCKED
|
193 HAMMER2_CLUSTER_WRHARD
|
194 HAMMER2_CLUSTER_RDHARD
|
195 HAMMER2_CLUSTER_MSYNCED
|
196 HAMMER2_CLUSTER_SSYNCED
;
202 * Add a reference to a cluster and its underlying chains.
204 * We must also ref the underlying chains in order to allow ref/unlock
205 * sequences to later re-lock.
208 hammer2_cluster_ref(hammer2_cluster_t
*cluster
)
210 atomic_add_int(&cluster
->refs
, 1);
214 * Drop the caller's reference to the cluster. When the ref count drops to
215 * zero this function frees the cluster and drops all underlying chains.
217 * In-progress read I/Os are typically detached from the cluster once the
218 * first one returns (the remaining stay attached to the DIOs but are then
219 * ignored and drop naturally).
222 hammer2_cluster_drop(hammer2_cluster_t
*cluster
)
224 hammer2_chain_t
*chain
;
227 KKASSERT(cluster
->refs
> 0);
228 if (atomic_fetchadd_int(&cluster
->refs
, -1) == 1) {
229 cluster
->focus
= NULL
; /* safety XXX chg to assert */
230 cluster
->focus_index
= 0;
232 for (i
= 0; i
< cluster
->nchains
; ++i
) {
233 chain
= cluster
->array
[i
].chain
;
235 hammer2_chain_drop(chain
);
236 cluster
->array
[i
].chain
= NULL
; /* safety */
239 cluster
->nchains
= 0; /* safety */
241 kfree(cluster
, M_HAMMER2
);
242 /* cluster is invalid */
247 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
249 * WARNING! This function expects the caller to handle resolution of the
250 * cluster. We never re-resolve the cluster in this function,
251 * because it might be used to temporarily unlock/relock a cparent
252 * in an iteration or recursion, and the cparent's elements do not
256 hammer2_cluster_lock(hammer2_cluster_t
*cluster
, int how
)
258 hammer2_chain_t
*chain
;
261 /* cannot be on inode-embedded cluster template, must be on copy */
262 KKASSERT(cluster
->refs
> 0);
263 KKASSERT((cluster
->flags
& HAMMER2_CLUSTER_INODE
) == 0);
264 if (cluster
->flags
& HAMMER2_CLUSTER_LOCKED
) {
265 panic("hammer2_cluster_lock: cluster %p already locked!\n",
268 atomic_set_int(&cluster
->flags
, HAMMER2_CLUSTER_LOCKED
);
271 * Lock chains and resolve state.
273 for (i
= 0; i
< cluster
->nchains
; ++i
) {
274 chain
= cluster
->array
[i
].chain
;
277 hammer2_chain_lock(chain
, how
);
282 * Calculate the clustering state for the cluster and set its focus.
283 * This routine must be called with care. For example, it should not
284 * normally be called after relocking a non-leaf cluster because parent
285 * clusters help iterations and each element might be at a slightly different
286 * indirect node (each node's topology is independently indexed).
288 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
289 * operations. Typically this is only set on a quorum of MASTERs or
290 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
291 * is present, this bit is *not* set on a quorum of MASTERs. The
292 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
293 * that create/modify/delete elements use it.
295 * The chains making up the cluster may be narrowed down based on quorum
296 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
297 * narrowed down to a single chain as long as the entire subtopology is known
298 * to be intact. So, for example, we can narrow a read-only op to a single
299 * fast SLAVE but if we focus a CACHE chain we must still retain at least
300 * a SLAVE to ensure that the subtopology can be accessed.
302 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
303 * to be maintained once the topology is validated as-of the top level of
306 * If a failure occurs the operation must be aborted by higher-level code and
310 hammer2_cluster_resolve(hammer2_cluster_t
*cluster
)
312 hammer2_chain_t
*chain
;
313 hammer2_chain_t
*focus
;
315 hammer2_tid_t quorum_tid
;
316 hammer2_tid_t last_best_quorum_tid
;
328 cluster
->focus
= NULL
;
341 KKASSERT(pmp
!= NULL
|| cluster
->nchains
== 0);
342 nquorum
= pmp
? pmp
->pfs_nmasters
/ 2 + 1 : 0;
348 * NOTE: A NULL chain is not necessarily an error, it could be
349 * e.g. a lookup failure or the end of an iteration.
352 for (i
= 0; i
< cluster
->nchains
; ++i
) {
353 chain
= cluster
->array
[i
].chain
;
354 if (chain
&& chain
->error
) {
355 if (cluster
->focus
== NULL
|| cluster
->focus
== chain
) {
356 /* error will be overridden by valid focus */
357 cluster
->error
= chain
->error
;
361 * Must count total masters and slaves whether the
362 * chain is errored or not.
364 switch (cluster
->pmp
->pfs_types
[i
]) {
365 case HAMMER2_PFSTYPE_SUPROOT
:
366 case HAMMER2_PFSTYPE_MASTER
:
369 case HAMMER2_PFSTYPE_SLAVE
:
375 switch (cluster
->pmp
->pfs_types
[i
]) {
376 case HAMMER2_PFSTYPE_MASTER
:
379 case HAMMER2_PFSTYPE_SLAVE
:
382 case HAMMER2_PFSTYPE_SOFT_MASTER
:
383 nflags
|= HAMMER2_CLUSTER_WRSOFT
;
384 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
387 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
388 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
390 case HAMMER2_PFSTYPE_SUPROOT
:
392 * Degenerate cluster representing the super-root
393 * topology on a single device. Fake stuff so
394 * cluster ops work as expected.
396 nflags
|= HAMMER2_CLUSTER_WRHARD
;
397 nflags
|= HAMMER2_CLUSTER_RDHARD
;
398 cluster
->focus_index
= i
;
399 cluster
->focus
= chain
;
400 cluster
->error
= chain
? chain
->error
: 0;
411 * Resolve masters. Calculate nmasters for the highest matching
412 * TID, if a quorum cannot be attained try the next lower matching
413 * TID until we exhaust TIDs.
415 * NOTE: A NULL chain is not necessarily an error, it could be
416 * e.g. a lookup failure or the end of an iteration.
419 last_best_quorum_tid
= HAMMER2_TID_MAX
;
420 quorum_tid
= 0; /* fix gcc warning */
422 while (nmasters
< nquorum
&& last_best_quorum_tid
!= 0) {
426 for (i
= 0; i
< cluster
->nchains
; ++i
) {
427 switch (cluster
->pmp
->pfs_types
[i
]) {
428 case HAMMER2_PFSTYPE_SUPROOT
:
429 case HAMMER2_PFSTYPE_MASTER
:
434 chain
= cluster
->array
[i
].chain
;
436 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
) {
438 * Invalid as in unsynchronized, cannot be
439 * used to calculate the quorum.
441 } else if (chain
== NULL
&& quorum_tid
== 0) {
443 * NULL chain on master matches NULL chains
447 } else if (quorum_tid
< last_best_quorum_tid
&&
449 (quorum_tid
< chain
->bref
.modify_tid
||
452 * Better TID located, reset nmasters count.
455 quorum_tid
= chain
->bref
.modify_tid
;
457 quorum_tid
== chain
->bref
.modify_tid
) {
459 * TID matches current collection.
464 if (nmasters
>= nquorum
)
466 last_best_quorum_tid
= quorum_tid
;
472 * NOTE: A NULL chain is not necessarily an error, it could be
473 * e.g. a lookup failure or the end of an iteration.
476 for (i
= 0; i
< cluster
->nchains
; ++i
) {
477 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_FEMOD
;
478 chain
= cluster
->array
[i
].chain
;
479 if (chain
&& chain
->error
) {
480 if (cluster
->focus
== NULL
|| cluster
->focus
== chain
) {
481 /* error will be overridden by valid focus */
482 cluster
->error
= chain
->error
;
487 switch (cluster
->pmp
->pfs_types
[i
]) {
488 case HAMMER2_PFSTYPE_MASTER
:
490 * We must have enough up-to-date masters to reach
491 * a quorum and the master modify_tid must match
492 * the quorum's modify_tid.
494 * Do not select an errored or out-of-sync master.
496 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
) {
497 nflags
|= HAMMER2_CLUSTER_UNHARD
;
498 } else if (nmasters
>= nquorum
&&
499 (chain
== NULL
|| chain
->error
== 0) &&
500 ((chain
== NULL
&& quorum_tid
== 0) ||
501 (chain
!= NULL
&& quorum_tid
==
502 chain
->bref
.modify_tid
))) {
503 nflags
|= HAMMER2_CLUSTER_WRHARD
;
504 nflags
|= HAMMER2_CLUSTER_RDHARD
;
506 cluster
->array
[i
].flags
|=
509 if (cluster
->focus
== NULL
||
510 focus_pfs_type
== HAMMER2_PFSTYPE_SLAVE
) {
511 focus_pfs_type
= HAMMER2_PFSTYPE_MASTER
;
512 cluster
->focus_index
= i
;
513 cluster
->focus
= chain
; /* NULL ok */
514 cluster
->error
= chain
? chain
->error
:
517 } else if (chain
== NULL
|| chain
->error
== 0) {
518 nflags
|= HAMMER2_CLUSTER_UNHARD
;
521 case HAMMER2_PFSTYPE_SLAVE
:
523 * We must have enough up-to-date masters to reach
524 * a quorum and the slave modify_tid must match the
525 * quorum's modify_tid.
527 * Do not select an errored slave.
529 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
) {
530 nflags
|= HAMMER2_CLUSTER_UNHARD
;
531 } else if (nmasters
>= nquorum
&&
532 (chain
== NULL
|| chain
->error
== 0) &&
533 ((chain
== NULL
&& quorum_tid
== 0) ||
534 (chain
&& quorum_tid
==
535 chain
->bref
.modify_tid
))) {
537 nflags
|= HAMMER2_CLUSTER_RDHARD
;
539 /* XXX optimize for RESOLVE_RDONLY */
540 if (cluster
->focus
== NULL
) {
541 focus_pfs_type
= HAMMER2_PFSTYPE_SLAVE
;
542 cluster
->focus_index
= i
;
543 cluster
->focus
= chain
; /* NULL ok */
544 cluster
->error
= chain
? chain
->error
:
548 } else if (chain
== NULL
|| chain
->error
== 0) {
549 nflags
|= HAMMER2_CLUSTER_UNSOFT
;
552 case HAMMER2_PFSTYPE_SOFT_MASTER
:
554 * Directly mounted soft master always wins. There
555 * should be only one.
557 KKASSERT(focus_pfs_type
!= HAMMER2_PFSTYPE_SOFT_MASTER
);
558 cluster
->focus_index
= i
;
559 cluster
->focus
= chain
;
560 cluster
->error
= chain
? chain
->error
: 0;
561 focus_pfs_type
= HAMMER2_PFSTYPE_SOFT_MASTER
;
562 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
564 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
566 * Directly mounted soft slave always wins. There
567 * should be only one.
569 KKASSERT(focus_pfs_type
!= HAMMER2_PFSTYPE_SOFT_SLAVE
);
570 if (focus_pfs_type
!= HAMMER2_PFSTYPE_SOFT_MASTER
) {
571 cluster
->focus_index
= i
;
572 cluster
->focus
= chain
;
573 cluster
->error
= chain
? chain
->error
: 0;
574 focus_pfs_type
= HAMMER2_PFSTYPE_SOFT_SLAVE
;
577 case HAMMER2_PFSTYPE_SUPROOT
:
579 * spmp (degenerate case)
582 cluster
->focus_index
= i
;
583 cluster
->focus
= chain
;
584 cluster
->error
= chain
? chain
->error
: 0;
585 focus_pfs_type
= HAMMER2_PFSTYPE_SUPROOT
;
586 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
594 * Focus now set, adjust ddflag. Skip this pass if the focus
595 * is bad or if we are at the PFS root (the bref won't match at
596 * the PFS root, obviously).
598 focus
= cluster
->focus
;
601 (cluster
->focus
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
606 if (cluster
->focus
->flags
& HAMMER2_CHAIN_PFSBOUNDARY
)
612 * Validate the elements that were not marked invalid. They should
615 for (i
= 0; i
< cluster
->nchains
; ++i
) {
618 chain
= cluster
->array
[i
].chain
;
624 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
)
627 ddflag
= (chain
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
628 if (chain
->bref
.type
!= focus
->bref
.type
||
629 chain
->bref
.key
!= focus
->bref
.key
||
630 chain
->bref
.keybits
!= focus
->bref
.keybits
||
631 chain
->bref
.modify_tid
!= focus
->bref
.modify_tid
||
632 chain
->bytes
!= focus
->bytes
||
633 ddflag
!= cluster
->ddflag
) {
634 cluster
->array
[i
].flags
|= HAMMER2_CITEM_INVALID
;
635 if (hammer2_debug
& 1)
636 kprintf("cluster_resolve: matching modify_tid failed "
637 "bref test: idx=%d type=%02x/%02x "
638 "key=%016jx/%d-%016jx/%d "
639 "mod=%016jx/%016jx bytes=%u/%u\n",
641 chain
->bref
.type
, focus
->bref
.type
,
642 chain
->bref
.key
, chain
->bref
.keybits
,
643 focus
->bref
.key
, focus
->bref
.keybits
,
644 chain
->bref
.modify_tid
, focus
->bref
.modify_tid
,
645 chain
->bytes
, focus
->bytes
);
646 if (hammer2_debug
& 0x4000)
647 panic("cluster_resolve");
648 /* flag issue and force resync? */
654 nflags
|= HAMMER2_CLUSTER_NOSOFT
;
656 nflags
|= HAMMER2_CLUSTER_NOHARD
;
659 * Set SSYNCED or MSYNCED for slaves and masters respectively if
660 * all available nodes (even if 0 are available) are fully
661 * synchronized. This is used by the synchronization thread to
662 * determine if there is work it could potentially accomplish.
664 if (nslaves
== ttlslaves
)
665 nflags
|= HAMMER2_CLUSTER_SSYNCED
;
666 if (nmasters
== ttlmasters
)
667 nflags
|= HAMMER2_CLUSTER_MSYNCED
;
670 * Determine if the cluster was successfully locked for the
671 * requested operation and generate an error code. The cluster
672 * will not be locked (or ref'd) if an error is returned.
674 atomic_set_int(&cluster
->flags
, nflags
);
675 atomic_clear_int(&cluster
->flags
, HAMMER2_CLUSTER_ZFLAGS
& ~nflags
);
679 * This is used by the XOPS subsystem to calculate the state of
680 * the collection and tell hammer2_xop_collect() what to do with it.
681 * The collection can be in various states of desynchronization, the
682 * caller specifically wants to resolve the passed-in key.
685 * 0 - Quorum agreement, key is valid
687 * ENOENT - Quorum agreement, end of scan
689 * ESRCH - Quorum agreement, key is INVALID (caller should
692 * EIO - Quorum agreement but all elements had errors.
694 * EDEADLK - No quorum agreement possible for key, a repair
695 * may be needed. Caller has to decide what to do,
696 * possibly iterating the key or generating an EIO.
698 * EINPROGRESS - No quorum agreement yet, but agreement is still
699 * possible if caller waits for more responses. Caller
700 * should not iterate key.
702 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
704 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
707 hammer2_cluster_check(hammer2_cluster_t
*cluster
, hammer2_key_t key
, int flags
)
709 hammer2_chain_t
*chain
;
710 hammer2_chain_t
*focus
;
712 hammer2_tid_t quorum_tid
;
713 hammer2_tid_t last_best_quorum_tid
;
718 int nmasters_keymatch
;
721 int umasters
; /* unknown masters (still in progress) */
727 cluster
->focus
= NULL
;
730 KKASSERT(pmp
!= NULL
|| cluster
->nchains
== 0);
735 nquorum
= pmp
? pmp
->pfs_nmasters
/ 2 + 1 : 0;
744 * NOTE: A NULL chain is not necessarily an error, it could be
745 * e.g. a lookup failure or the end of an iteration.
748 for (i
= 0; i
< cluster
->nchains
; ++i
) {
749 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_FEMOD
;
750 cluster
->array
[i
].flags
|= HAMMER2_CITEM_INVALID
;
752 chain
= cluster
->array
[i
].chain
;
753 error
= cluster
->array
[i
].error
;
754 if (chain
&& error
) {
755 if (cluster
->focus
== NULL
|| cluster
->focus
== chain
) {
756 /* error will be overridden by valid focus */
761 * Must count total masters and slaves whether the
762 * chain is errored or not.
764 switch (cluster
->pmp
->pfs_types
[i
]) {
765 case HAMMER2_PFSTYPE_SUPROOT
:
766 case HAMMER2_PFSTYPE_MASTER
:
769 case HAMMER2_PFSTYPE_SLAVE
:
775 switch (cluster
->pmp
->pfs_types
[i
]) {
776 case HAMMER2_PFSTYPE_MASTER
:
779 case HAMMER2_PFSTYPE_SLAVE
:
782 case HAMMER2_PFSTYPE_SOFT_MASTER
:
783 nflags
|= HAMMER2_CLUSTER_WRSOFT
;
784 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
787 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
788 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
790 case HAMMER2_PFSTYPE_SUPROOT
:
792 * Degenerate cluster representing the super-root
793 * topology on a single device. Fake stuff so
794 * cluster ops work as expected.
797 nflags
|= HAMMER2_CLUSTER_WRHARD
;
798 nflags
|= HAMMER2_CLUSTER_RDHARD
;
799 cluster
->focus_index
= i
;
800 cluster
->focus
= chain
;
801 cluster
->error
= error
;
811 * Resolve nmasters - master nodes fully match
813 * Resolve umasters - master nodes operation still
816 * Resolve nmasters_keymatch - master nodes match the passed-in
817 * key and may or may not match
818 * the quorum-agreed tid.
820 * The quorum-agreed TID is the highest matching TID.
822 last_best_quorum_tid
= HAMMER2_TID_MAX
;
825 nmasters_keymatch
= 0;
826 quorum_tid
= 0; /* fix gcc warning */
828 while (nmasters
< nquorum
&& last_best_quorum_tid
!= 0) {
831 nmasters_keymatch
= 0;
834 for (i
= 0; i
< cluster
->nchains
; ++i
) {
835 /* XXX SOFT smpresent handling */
836 switch(cluster
->pmp
->pfs_types
[i
]) {
837 case HAMMER2_PFSTYPE_MASTER
:
838 case HAMMER2_PFSTYPE_SUPROOT
:
844 chain
= cluster
->array
[i
].chain
;
845 error
= cluster
->array
[i
].error
;
848 * Skip elements still in progress. umasters keeps
849 * track of masters that might still be in-progress.
851 if (chain
== NULL
&& (cluster
->array
[i
].flags
&
852 HAMMER2_CITEM_NULL
) == 0) {
860 if (flags
& HAMMER2_CHECK_NULL
) {
864 if (cluster
->error
== 0)
865 cluster
->error
= error
;
868 (key
== (hammer2_key_t
)-1 ||
869 chain
->bref
.key
== key
)) {
872 if (chain
->bref
.modify_tid
<
873 last_best_quorum_tid
&&
874 quorum_tid
< chain
->bref
.modify_tid
) {
876 * Select new TID as master if better
877 * than any found so far in this loop,
878 * as long as it does not reach the
879 * best tid found in the previous loop.
882 quorum_tid
= chain
->bref
.modify_tid
;
884 if (quorum_tid
== chain
->bref
.modify_tid
) {
886 * TID matches current collection.
888 * (error handled in next pass)
891 if (chain
->error
== 0) {
892 cluster
->focus
= chain
;
893 cluster
->focus_index
= i
;
898 if (nmasters
>= nquorum
)
900 last_best_quorum_tid
= quorum_tid
;
904 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
905 nmasters, nquorum, nmasters_keymatch, umasters);
909 * Early return if we do not have enough masters.
911 if (nmasters
< nquorum
) {
912 if (nmasters
+ umasters
>= nquorum
)
913 return HAMMER2_ERROR_EINPROGRESS
;
914 if (nmasters_keymatch
< nquorum
)
915 return HAMMER2_ERROR_ESRCH
;
916 return HAMMER2_ERROR_EDEADLK
;
920 * Validated end of scan.
922 if (flags
& HAMMER2_CHECK_NULL
) {
923 if (cluster
->error
== 0)
924 cluster
->error
= HAMMER2_ERROR_ENOENT
;
925 return cluster
->error
;
929 * If we have a NULL focus at this point the agreeing quorum all
932 if (cluster
->focus
== NULL
)
933 return HAMMER2_ERROR_EIO
;
938 * We have quorum agreement, validate elements, not end of scan.
943 for (i
= 0; i
< cluster
->nchains
; ++i
) {
944 chain
= cluster
->array
[i
].chain
;
945 error
= cluster
->array
[i
].error
;
947 chain
->bref
.key
!= key
||
948 chain
->bref
.modify_tid
!= quorum_tid
) {
955 * XXX for now, cumulative error.
957 if (cluster
->error
== 0)
958 cluster
->error
= error
;
960 switch (cluster
->pmp
->pfs_types
[i
]) {
961 case HAMMER2_PFSTYPE_MASTER
:
962 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
963 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
964 nflags
|= HAMMER2_CLUSTER_WRHARD
;
965 nflags
|= HAMMER2_CLUSTER_RDHARD
;
967 case HAMMER2_PFSTYPE_SLAVE
:
969 * We must have enough up-to-date masters to reach
970 * a quorum and the slave modify_tid must match the
971 * quorum's modify_tid.
973 * Do not select an errored slave.
975 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
976 nflags
|= HAMMER2_CLUSTER_RDHARD
;
979 case HAMMER2_PFSTYPE_SOFT_MASTER
:
981 * Directly mounted soft master always wins. There
982 * should be only one.
984 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
985 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
987 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
989 * Directly mounted soft slave always wins. There
990 * should be only one.
994 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
996 case HAMMER2_PFSTYPE_SUPROOT
:
998 * spmp (degenerate case)
1000 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
1001 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
1002 nflags
|= HAMMER2_CLUSTER_WRHARD
;
1003 nflags
|= HAMMER2_CLUSTER_RDHARD
;
1011 * Focus now set, adjust ddflag. Skip this pass if the focus
1012 * is bad or if we are at the PFS root (the bref won't match at
1013 * the PFS root, obviously).
1015 focus
= cluster
->focus
;
1018 (cluster
->focus
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
1020 cluster
->ddflag
= 0;
1023 if (cluster
->focus
->flags
& HAMMER2_CHAIN_PFSBOUNDARY
)
1029 * Validate the elements that were not marked invalid. They should
1032 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1035 chain
= cluster
->array
[i
].chain
;
1041 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
)
1044 ddflag
= (chain
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
1045 if (chain
->bref
.type
!= focus
->bref
.type
||
1046 chain
->bref
.key
!= focus
->bref
.key
||
1047 chain
->bref
.keybits
!= focus
->bref
.keybits
||
1048 chain
->bref
.modify_tid
!= focus
->bref
.modify_tid
||
1049 chain
->bytes
!= focus
->bytes
||
1050 ddflag
!= cluster
->ddflag
) {
1051 cluster
->array
[i
].flags
|= HAMMER2_CITEM_INVALID
;
1052 if (hammer2_debug
& 1)
1053 kprintf("cluster_resolve: matching modify_tid failed "
1054 "bref test: idx=%d type=%02x/%02x "
1055 "key=%016jx/%d-%016jx/%d "
1056 "mod=%016jx/%016jx bytes=%u/%u\n",
1058 chain
->bref
.type
, focus
->bref
.type
,
1059 chain
->bref
.key
, chain
->bref
.keybits
,
1060 focus
->bref
.key
, focus
->bref
.keybits
,
1061 chain
->bref
.modify_tid
, focus
->bref
.modify_tid
,
1062 chain
->bytes
, focus
->bytes
);
1063 if (hammer2_debug
& 0x4000)
1064 panic("cluster_resolve");
1065 /* flag issue and force resync? */
1071 nflags
|= HAMMER2_CLUSTER_NOSOFT
;
1072 if (ttlmasters
== 0)
1073 nflags
|= HAMMER2_CLUSTER_NOHARD
;
1076 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1077 * all available nodes (even if 0 are available) are fully
1078 * synchronized. This is used by the synchronization thread to
1079 * determine if there is work it could potentially accomplish.
1081 if (nslaves
== ttlslaves
)
1082 nflags
|= HAMMER2_CLUSTER_SSYNCED
;
1083 if (nmasters
== ttlmasters
)
1084 nflags
|= HAMMER2_CLUSTER_MSYNCED
;
1087 * Determine if the cluster was successfully locked for the
1088 * requested operation and generate an error code. The cluster
1089 * will not be locked (or ref'd) if an error is returned.
1091 atomic_set_int(&cluster
->flags
, nflags
);
1092 atomic_clear_int(&cluster
->flags
, HAMMER2_CLUSTER_ZFLAGS
& ~nflags
);
1094 return cluster
->error
;
1098 * This is used by the sync thread to force non-NULL elements of a copy
1099 * of the pmp->iroot cluster to be good which is required to prime the
1103 hammer2_cluster_forcegood(hammer2_cluster_t
*cluster
)
1107 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1108 if (cluster
->array
[i
].chain
)
1109 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
1114 * Unlock a cluster. Refcount and focus are maintained.
1117 hammer2_cluster_unlock(hammer2_cluster_t
*cluster
)
1119 hammer2_chain_t
*chain
;
1122 if ((cluster
->flags
& HAMMER2_CLUSTER_LOCKED
) == 0) {
1123 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1126 KKASSERT(cluster
->flags
& HAMMER2_CLUSTER_LOCKED
);
1127 KKASSERT(cluster
->refs
> 0);
1128 atomic_clear_int(&cluster
->flags
, HAMMER2_CLUSTER_LOCKED
);
1130 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1131 chain
= cluster
->array
[i
].chain
;
1133 hammer2_chain_unlock(chain
);
1137 /************************************************************************
1139 ************************************************************************
1142 * WARNING! blockref[] array data is not universal. These functions should
1143 * only be used to access universal data.
1145 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1146 * complete if necessary. The I/Os should have already been
1147 * initiated by the cluster_lock/chain_lock operation.
1149 * The cluster must already be in a modified state before wdata
1150 * is called. The data will already be available for this case.
1152 const hammer2_media_data_t
*
1153 hammer2_cluster_rdata(hammer2_cluster_t
*cluster
)
1155 hammer2_chain_t
*chain
;
1157 chain
= cluster
->focus
;
1158 KKASSERT(chain
!= NULL
&& chain
->lockcnt
);
1160 hammer2_io_bkvasync(chain
->dio
);
1161 return (chain
->data
);
1164 hammer2_media_data_t
*
1165 hammer2_cluster_wdata(hammer2_cluster_t
*cluster
)
1167 hammer2_chain_t
*chain
;
1169 chain
= cluster
->focus
;
1170 KKASSERT(chain
!= NULL
&& chain
->lockcnt
&&
1171 hammer2_cluster_modified(cluster
));
1173 hammer2_io_bkvasync(chain
->dio
);
1174 return(chain
->data
);