2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
59 * - Most complex functions, quorum management on transaction ids.
61 * - Locking and data accesses must be internally asynchronous.
63 * - Validate and manage cache coherency primitives (cache state
64 * is stored in chain topologies but must be validated by these
67 * (2) Lookups and Scans
68 * hammer2_cluster_lookup()
69 * hammer2_cluster_next()
71 * - Depend on locking & data retrieval functions, but still complex.
73 * - Must do quorum management on transaction ids.
75 * - Lookup and iteration ops must be internally asynchronous.
77 * (3) Modifying Operations
78 * hammer2_cluster_create()
80 * - Can usually punt on failures, operation continues unless quorum
81 * is lost. If quorum is lost, must wait for resynchronization
82 * (depending on the management mode).
84 * - Must disconnect node on failures (also not flush), remount, and
87 * - Network links (via kdmsg) are relatively easy to issue as the
88 * complex underworkings of hammer2_chain.c don't have to be messed
89 * with (the protocol is at a higher level than block-level).
91 * - Multiple local disk nodes (i.e. block devices) are another matter.
92 * Chain operations have to be dispatched to per-node threads (xN)
93 * because we can't asynchronize potentially very complex chain
94 * operations in hammer2_chain.c (it would be a huge mess).
96 * (these threads are also used to terminate incoming kdmsg ops from
99 * - Single-node filesystems do not use threads and will simply call
100 * hammer2_chain.c functions directly. This short-cut is handled
101 * at the base of each cluster function.
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
113 * Returns the bref type of the cluster's foucs.
115 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116 * The cluster must be locked.
119 hammer2_cluster_type(hammer2_cluster_t
*cluster
)
121 if (cluster
->error
== 0) {
122 KKASSERT(cluster
->focus
!= NULL
);
123 return(cluster
->focus
->bref
.type
);
129 * Returns non-zero if the cluster's focus is flagged as being modified.
131 * If the cluster is errored, returns 0.
135 hammer2_cluster_modified(hammer2_cluster_t
*cluster
)
137 if (cluster
->error
== 0) {
138 KKASSERT(cluster
->focus
!= NULL
);
139 return((cluster
->focus
->flags
& HAMMER2_CHAIN_MODIFIED
) != 0);
145 * Returns the bref of the cluster's focus, sans any data-offset information
146 * (since offset information is per-node and wouldn't be useful).
148 * Callers use this function to access modify_tid, mirror_tid, type,
151 * If the cluster is errored, returns an empty bref.
152 * The cluster must be locked.
155 hammer2_cluster_bref(hammer2_cluster_t
*cluster
, hammer2_blockref_t
*bref
)
157 if (cluster
->error
== 0) {
158 KKASSERT(cluster
->focus
!= NULL
);
159 *bref
= cluster
->focus
->bref
;
162 bzero(bref
, sizeof(*bref
));
167 * Create a degenerate cluster with one ref from a single locked chain.
168 * The returned cluster will be focused on the chain and inherit its
171 * The chain's lock and reference are transfered to the new cluster, so
172 * the caller should not try to unlock the chain separately.
177 hammer2_cluster_from_chain(hammer2_chain_t
*chain
)
179 hammer2_cluster_t
*cluster
;
181 cluster
= kmalloc(sizeof(*cluster
), M_HAMMER2
, M_WAITOK
| M_ZERO
);
182 cluster
->array
[0].chain
= chain
;
183 cluster
->array
[0].flags
= HAMMER2_CITEM_FEMOD
;
184 cluster
->nchains
= 1;
185 cluster
->focus
= chain
;
186 cluster
->focus_index
= 0;
187 cluster
->pmp
= chain
->pmp
;
189 cluster
->error
= chain
->error
;
190 cluster
->flags
= HAMMER2_CLUSTER_LOCKED
|
191 HAMMER2_CLUSTER_WRHARD
|
192 HAMMER2_CLUSTER_RDHARD
|
193 HAMMER2_CLUSTER_MSYNCED
|
194 HAMMER2_CLUSTER_SSYNCED
;
200 * Add a reference to a cluster and its underlying chains.
202 * We must also ref the underlying chains in order to allow ref/unlock
203 * sequences to later re-lock.
206 hammer2_cluster_ref(hammer2_cluster_t
*cluster
)
208 atomic_add_int(&cluster
->refs
, 1);
212 * Drop the caller's reference to the cluster. When the ref count drops to
213 * zero this function frees the cluster and drops all underlying chains.
215 * In-progress read I/Os are typically detached from the cluster once the
216 * first one returns (the remaining stay attached to the DIOs but are then
217 * ignored and drop naturally).
220 hammer2_cluster_drop(hammer2_cluster_t
*cluster
)
222 hammer2_chain_t
*chain
;
225 KKASSERT(cluster
->refs
> 0);
226 if (atomic_fetchadd_int(&cluster
->refs
, -1) == 1) {
227 cluster
->focus
= NULL
; /* safety XXX chg to assert */
228 cluster
->focus_index
= 0;
230 for (i
= 0; i
< cluster
->nchains
; ++i
) {
231 chain
= cluster
->array
[i
].chain
;
233 hammer2_chain_drop(chain
);
234 cluster
->array
[i
].chain
= NULL
; /* safety */
237 cluster
->nchains
= 0; /* safety */
239 kfree(cluster
, M_HAMMER2
);
240 /* cluster is invalid */
245 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
247 * WARNING! This function expects the caller to handle resolution of the
248 * cluster. We never re-resolve the cluster in this function,
249 * because it might be used to temporarily unlock/relock a cparent
250 * in an iteration or recursrion, and the cparents elements do not
254 hammer2_cluster_lock(hammer2_cluster_t
*cluster
, int how
)
256 hammer2_chain_t
*chain
;
259 /* cannot be on inode-embedded cluster template, must be on copy */
260 KKASSERT(cluster
->refs
> 0);
261 KKASSERT((cluster
->flags
& HAMMER2_CLUSTER_INODE
) == 0);
262 if (cluster
->flags
& HAMMER2_CLUSTER_LOCKED
) {
263 panic("hammer2_cluster_lock: cluster %p already locked!\n",
266 atomic_set_int(&cluster
->flags
, HAMMER2_CLUSTER_LOCKED
);
269 * Lock chains and resolve state.
271 for (i
= 0; i
< cluster
->nchains
; ++i
) {
272 chain
= cluster
->array
[i
].chain
;
275 hammer2_chain_lock(chain
, how
);
280 * Calculate the clustering state for the cluster and set its focus.
281 * This routine must be called with care. For example, it should not
282 * normally be called after relocking a non-leaf cluster because parent
283 * clusters help iterations and each element might be at a slightly different
284 * indirect node (each node's topology is independently indexed).
286 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
287 * operations. Typically this is only set on a quorum of MASTERs or
288 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
289 * is present, this bit is *not* set on a quorum of MASTERs. The
290 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
291 * that create/modify/delete elements use it.
293 * The chains making up the cluster may be narrowed down based on quorum
294 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
295 * narrowed down to a single chain as long as the entire subtopology is known
296 * to be intact. So, for example, we can narrow a read-only op to a single
297 * fast SLAVE but if we focus a CACHE chain we must still retain at least
298 * a SLAVE to ensure that the subtopology can be accessed.
300 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
301 * to be maintained once the topology is validated as-of the top level of
304 * If a failure occurs the operation must be aborted by higher-level code and
/*
 * hammer2_cluster_resolve() - recompute the cluster's focus, error and
 * status flags from the chains in cluster->array[].
 *
 * NOTE(review): this listing is extraction-damaged.  Every line still
 *		 carries its original line number as a prefix, expressions
 *		 are split across lines, and a number of original lines are
 *		 absent (for example 309, 312 and 315-325).  Restore the
 *		 function from the upstream file before building or
 *		 modifying it; the text below is kept exactly as found.
 *
 * What the surviving text shows:
 *
 *  - nquorum is pmp->pfs_nmasters / 2 + 1, or 0 when pmp is NULL.
 *
 *  - A first loop over the elements records chain errors into
 *    cluster->error and accumulates WRSOFT/RDSOFT/WRHARD/RDHARD bits in
 *    nflags according to pmp->pfs_types[i].  A SUPROOT element is made
 *    the focus directly.
 *
 *  - A while loop runs while nmasters < nquorum and
 *    last_best_quorum_tid != 0, looking at MASTER/SUPROOT elements only
 *    and tracking the modify_tid they agree on (quorum_tid).  Elements
 *    flagged HAMMER2_CITEM_INVALID are not counted.
 *
 *  - A third loop clears HAMMER2_CITEM_FEMOD on every element and then,
 *    by PFS type, sets nflags bits, HAMMER2_CITEM_FEMOD and the focus
 *    (focus_index, focus, error).
 *
 *  - With a focus selected, every other valid element is compared to it
 *    (bref type, key, keybits, modify_tid, bytes, ddflag); a mismatch
 *    flags the element HAMMER2_CITEM_INVALID.
 *
 *  - SSYNCED/MSYNCED are set when nslaves == ttlslaves and
 *    nmasters == ttlmasters.  nflags is then applied with
 *    atomic_set_int() and the remaining HAMMER2_CLUSTER_ZFLAGS bits are
 *    cleared with atomic_clear_int().
 */
308 hammer2_cluster_resolve(hammer2_cluster_t
*cluster
)
310 hammer2_chain_t
*chain
;
311 hammer2_chain_t
*focus
;
313 hammer2_tid_t quorum_tid
;
314 hammer2_tid_t last_best_quorum_tid
;
326 cluster
->focus
= NULL
;
339 KKASSERT(pmp
!= NULL
|| cluster
->nchains
== 0);
340 nquorum
= pmp
? pmp
->pfs_nmasters
/ 2 + 1 : 0;
346 * NOTE: A NULL chain is not necessarily an error, it could be
347 * e.g. a lookup failure or the end of an iteration.
350 for (i
= 0; i
< cluster
->nchains
; ++i
) {
351 chain
= cluster
->array
[i
].chain
;
352 if (chain
&& chain
->error
) {
353 if (cluster
->focus
== NULL
|| cluster
->focus
== chain
) {
354 /* error will be overridden by valid focus */
355 cluster
->error
= chain
->error
;
359 * Must count total masters and slaves whether the
360 * chain is errored or not.
362 switch (cluster
->pmp
->pfs_types
[i
]) {
363 case HAMMER2_PFSTYPE_SUPROOT
:
364 case HAMMER2_PFSTYPE_MASTER
:
367 case HAMMER2_PFSTYPE_SLAVE
:
373 switch (cluster
->pmp
->pfs_types
[i
]) {
374 case HAMMER2_PFSTYPE_MASTER
:
377 case HAMMER2_PFSTYPE_SLAVE
:
380 case HAMMER2_PFSTYPE_SOFT_MASTER
:
381 nflags
|= HAMMER2_CLUSTER_WRSOFT
;
382 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
385 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
386 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
388 case HAMMER2_PFSTYPE_SUPROOT
:
390 * Degenerate cluster representing the super-root
391 * topology on a single device. Fake stuff so
392 * cluster ops work as expected.
394 nflags
|= HAMMER2_CLUSTER_WRHARD
;
395 nflags
|= HAMMER2_CLUSTER_RDHARD
;
396 cluster
->focus_index
= i
;
397 cluster
->focus
= chain
;
398 cluster
->error
= chain
? chain
->error
: 0;
409 * Resolve masters. Calculate nmasters for the highest matching
410 * TID, if a quorum cannot be attained try the next lower matching
411 * TID until we exhaust TIDs.
413 * NOTE: A NULL chain is not necessarily an error, it could be
414 * e.g. a lookup failure or the end of an iteration.
417 last_best_quorum_tid
= HAMMER2_TID_MAX
;
418 quorum_tid
= 0; /* fix gcc warning */
420 while (nmasters
< nquorum
&& last_best_quorum_tid
!= 0) {
424 for (i
= 0; i
< cluster
->nchains
; ++i
) {
425 switch (cluster
->pmp
->pfs_types
[i
]) {
426 case HAMMER2_PFSTYPE_SUPROOT
:
427 case HAMMER2_PFSTYPE_MASTER
:
432 chain
= cluster
->array
[i
].chain
;
434 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
) {
436 * Invalid as in unsynchronized, cannot be
437 * used to calculate the quorum.
439 } else if (chain
== NULL
&& quorum_tid
== 0) {
441 * NULL chain on master matches NULL chains
445 } else if (quorum_tid
< last_best_quorum_tid
&&
447 (quorum_tid
< chain
->bref
.modify_tid
||
450 * Better TID located, reset nmasters count.
453 quorum_tid
= chain
->bref
.modify_tid
;
455 quorum_tid
== chain
->bref
.modify_tid
) {
457 * TID matches current collection.
462 if (nmasters
>= nquorum
)
464 last_best_quorum_tid
= quorum_tid
;
470 * NOTE: A NULL chain is not necessarily an error, it could be
471 * e.g. a lookup failure or the end of an iteration.
474 for (i
= 0; i
< cluster
->nchains
; ++i
) {
475 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_FEMOD
;
476 chain
= cluster
->array
[i
].chain
;
477 if (chain
&& chain
->error
) {
478 if (cluster
->focus
== NULL
|| cluster
->focus
== chain
) {
479 /* error will be overridden by valid focus */
480 cluster
->error
= chain
->error
;
485 switch (cluster
->pmp
->pfs_types
[i
]) {
486 case HAMMER2_PFSTYPE_MASTER
:
488 * We must have enough up-to-date masters to reach
489 * a quorum and the master modify_tid must match
490 * the quorum's modify_tid.
492 * Do not select an errored or out-of-sync master.
494 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
) {
495 nflags
|= HAMMER2_CLUSTER_UNHARD
;
496 } else if (nmasters
>= nquorum
&&
497 (chain
== NULL
|| chain
->error
== 0) &&
498 ((chain
== NULL
&& quorum_tid
== 0) ||
499 (chain
!= NULL
&& quorum_tid
==
500 chain
->bref
.modify_tid
))) {
501 nflags
|= HAMMER2_CLUSTER_WRHARD
;
502 nflags
|= HAMMER2_CLUSTER_RDHARD
;
504 cluster
->array
[i
].flags
|=
507 if (cluster
->focus
== NULL
||
508 focus_pfs_type
== HAMMER2_PFSTYPE_SLAVE
) {
509 focus_pfs_type
= HAMMER2_PFSTYPE_MASTER
;
510 cluster
->focus_index
= i
;
511 cluster
->focus
= chain
; /* NULL ok */
512 cluster
->error
= chain
? chain
->error
:
515 } else if (chain
== NULL
|| chain
->error
== 0) {
516 nflags
|= HAMMER2_CLUSTER_UNHARD
;
519 case HAMMER2_PFSTYPE_SLAVE
:
521 * We must have enough up-to-date masters to reach
522 * a quorum and the slave modify_tid must match the
523 * quorum's modify_tid.
525 * Do not select an errored slave.
527 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
) {
528 nflags
|= HAMMER2_CLUSTER_UNHARD
;
529 } else if (nmasters
>= nquorum
&&
530 (chain
== NULL
|| chain
->error
== 0) &&
531 ((chain
== NULL
&& quorum_tid
== 0) ||
532 (chain
&& quorum_tid
==
533 chain
->bref
.modify_tid
))) {
535 nflags
|= HAMMER2_CLUSTER_RDHARD
;
537 /* XXX optimize for RESOLVE_RDONLY */
538 if (cluster
->focus
== NULL
) {
539 focus_pfs_type
= HAMMER2_PFSTYPE_SLAVE
;
540 cluster
->focus_index
= i
;
541 cluster
->focus
= chain
; /* NULL ok */
542 cluster
->error
= chain
? chain
->error
:
546 } else if (chain
== NULL
|| chain
->error
== 0) {
547 nflags
|= HAMMER2_CLUSTER_UNSOFT
;
550 case HAMMER2_PFSTYPE_SOFT_MASTER
:
552 * Directly mounted soft master always wins. There
553 * should be only one.
555 KKASSERT(focus_pfs_type
!= HAMMER2_PFSTYPE_SOFT_MASTER
);
556 cluster
->focus_index
= i
;
557 cluster
->focus
= chain
;
558 cluster
->error
= chain
? chain
->error
: 0;
559 focus_pfs_type
= HAMMER2_PFSTYPE_SOFT_MASTER
;
560 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
562 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
564 * Directly mounted soft slave always wins. There
565 * should be only one.
567 KKASSERT(focus_pfs_type
!= HAMMER2_PFSTYPE_SOFT_SLAVE
);
568 if (focus_pfs_type
!= HAMMER2_PFSTYPE_SOFT_MASTER
) {
569 cluster
->focus_index
= i
;
570 cluster
->focus
= chain
;
571 cluster
->error
= chain
? chain
->error
: 0;
572 focus_pfs_type
= HAMMER2_PFSTYPE_SOFT_SLAVE
;
575 case HAMMER2_PFSTYPE_SUPROOT
:
577 * spmp (degenerate case)
580 cluster
->focus_index
= i
;
581 cluster
->focus
= chain
;
582 cluster
->error
= chain
? chain
->error
: 0;
583 focus_pfs_type
= HAMMER2_PFSTYPE_SUPROOT
;
584 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
592 * Focus now set, adjust ddflag. Skip this pass if the focus
593 * is bad or if we are at the PFS root (the bref won't match at
594 * the PFS root, obviously).
596 focus
= cluster
->focus
;
599 (cluster
->focus
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
604 if (cluster
->focus
->flags
& HAMMER2_CHAIN_PFSBOUNDARY
)
610 * Validate the elements that were not marked invalid. They should
613 for (i
= 0; i
< cluster
->nchains
; ++i
) {
616 chain
= cluster
->array
[i
].chain
;
622 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
)
625 ddflag
= (chain
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
626 if (chain
->bref
.type
!= focus
->bref
.type
||
627 chain
->bref
.key
!= focus
->bref
.key
||
628 chain
->bref
.keybits
!= focus
->bref
.keybits
||
629 chain
->bref
.modify_tid
!= focus
->bref
.modify_tid
||
630 chain
->bytes
!= focus
->bytes
||
631 ddflag
!= cluster
->ddflag
) {
632 cluster
->array
[i
].flags
|= HAMMER2_CITEM_INVALID
;
633 if (hammer2_debug
& 1)
634 kprintf("cluster_resolve: matching modify_tid failed "
635 "bref test: idx=%d type=%02x/%02x "
636 "key=%016jx/%d-%016jx/%d "
637 "mod=%016jx/%016jx bytes=%u/%u\n",
639 chain
->bref
.type
, focus
->bref
.type
,
640 chain
->bref
.key
, chain
->bref
.keybits
,
641 focus
->bref
.key
, focus
->bref
.keybits
,
642 chain
->bref
.modify_tid
, focus
->bref
.modify_tid
,
643 chain
->bytes
, focus
->bytes
);
644 if (hammer2_debug
& 0x4000)
645 panic("cluster_resolve");
646 /* flag issue and force resync? */
652 nflags
|= HAMMER2_CLUSTER_NOSOFT
;
654 nflags
|= HAMMER2_CLUSTER_NOHARD
;
657 * Set SSYNCED or MSYNCED for slaves and masters respectively if
658 * all available nodes (even if 0 are available) are fully
659 * synchronized. This is used by the synchronization thread to
660 * determine if there is work it could potentially accomplish.
662 if (nslaves
== ttlslaves
)
663 nflags
|= HAMMER2_CLUSTER_SSYNCED
;
664 if (nmasters
== ttlmasters
)
665 nflags
|= HAMMER2_CLUSTER_MSYNCED
;
668 * Determine if the cluster was successfully locked for the
669 * requested operation and generate an error code. The cluster
670 * will not be locked (or ref'd) if an error is returned.
672 atomic_set_int(&cluster
->flags
, nflags
);
673 atomic_clear_int(&cluster
->flags
, HAMMER2_CLUSTER_ZFLAGS
& ~nflags
);
677 * This is used by the XOPS subsystem to calculate the state of
678 * the collection and tell hammer2_xop_collect() what to do with it.
679 * The collection can be in various states of desynchronization, the
680 * caller specifically wants to resolve the passed-in key.
683 * 0 - Quorum agreement, key is valid
685 * ENOENT - Quorum agreement, end of scan
687 * ESRCH - Quorum agreement, key is INVALID (caller should
690 * EIO - Quorum agreement but all elements had errors.
692 * EDEADLK - No quorum agreement possible for key, a repair
693 * may be needed. Caller has to decide what to do,
694 * possibly iterating the key or generating an EIO.
696 * EINPROGRESS - No quorum agreement yet, but agreement is still
697 * possible if caller waits for more responses. Caller
698 * should not iterate key.
700 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
702 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
/*
 * hammer2_cluster_check() - evaluate the state of a collection for the
 * passed-in key and return cluster->error (or an early status code).
 *
 * NOTE(review): this listing is extraction-damaged.  Every line still
 *		 carries its original line number as a prefix, expressions
 *		 are split across lines, and a number of original lines are
 *		 absent (for example 706, 709 and 712-715, and the lines
 *		 holding the early-return values after 907 and 909).
 *		 Restore the function from the upstream file before
 *		 building or modifying it; the text below is kept exactly
 *		 as found.
 *
 * What the surviving text shows:
 *
 *  - nquorum is pmp->pfs_nmasters / 2 + 1, or 0 when pmp is NULL.
 *
 *  - The first loop clears HAMMER2_CITEM_FEMOD and sets
 *    HAMMER2_CITEM_INVALID on every element, takes the per-element
 *    error from cluster->array[i].error, and accumulates soft/hard
 *    read/write bits in nflags by PFS type.
 *
 *  - A while loop runs while nmasters < nquorum and
 *    last_best_quorum_tid != 0 over MASTER/SUPROOT elements.  An element
 *    with a NULL chain and without HAMMER2_CITEM_NULL is treated as
 *    still in progress.  A chain matches when key is (hammer2_key_t)-1
 *    or equals chain->bref.key; an error-free chain whose modify_tid
 *    equals quorum_tid becomes the focus.
 *
 *  - If nmasters < nquorum the function returns early; with
 *    HAMMER2_CHECK_NULL set it returns cluster->error, defaulting it to
 *    ENOENT.
 *
 *  - The validation loop skips elements whose key or modify_tid differ
 *    from key/quorum_tid, folds the first non-zero element error into
 *    cluster->error, and clears HAMMER2_CITEM_INVALID (setting
 *    HAMMER2_CITEM_FEMOD for MASTER, SOFT_MASTER and SUPROOT).
 *
 *  - Remaining valid elements are compared to the focus exactly as in
 *    hammer2_cluster_resolve(); nflags is applied with atomic_set_int()
 *    and atomic_clear_int(), and cluster->error is returned.
 */
705 hammer2_cluster_check(hammer2_cluster_t
*cluster
, hammer2_key_t key
, int flags
)
707 hammer2_chain_t
*chain
;
708 hammer2_chain_t
*focus
;
710 hammer2_tid_t quorum_tid
;
711 hammer2_tid_t last_best_quorum_tid
;
716 int nmasters_keymatch
;
719 int umasters
; /* unknown masters (still in progress) */
725 cluster
->focus
= NULL
;
728 KKASSERT(pmp
!= NULL
|| cluster
->nchains
== 0);
733 nquorum
= pmp
? pmp
->pfs_nmasters
/ 2 + 1 : 0;
739 nmasters_keymatch
= 0;
747 * NOTE: A NULL chain is not necessarily an error, it could be
748 * e.g. a lookup failure or the end of an iteration.
751 for (i
= 0; i
< cluster
->nchains
; ++i
) {
752 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_FEMOD
;
753 cluster
->array
[i
].flags
|= HAMMER2_CITEM_INVALID
;
755 chain
= cluster
->array
[i
].chain
;
756 error
= cluster
->array
[i
].error
;
757 if (chain
&& error
) {
758 if (cluster
->focus
== NULL
|| cluster
->focus
== chain
) {
759 /* error will be overridden by valid focus */
764 * Must count total masters and slaves whether the
765 * chain is errored or not.
767 switch (cluster
->pmp
->pfs_types
[i
]) {
768 case HAMMER2_PFSTYPE_SUPROOT
:
769 case HAMMER2_PFSTYPE_MASTER
:
772 case HAMMER2_PFSTYPE_SLAVE
:
778 switch (cluster
->pmp
->pfs_types
[i
]) {
779 case HAMMER2_PFSTYPE_MASTER
:
782 case HAMMER2_PFSTYPE_SLAVE
:
785 case HAMMER2_PFSTYPE_SOFT_MASTER
:
786 nflags
|= HAMMER2_CLUSTER_WRSOFT
;
787 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
790 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
791 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
793 case HAMMER2_PFSTYPE_SUPROOT
:
795 * Degenerate cluster representing the super-root
796 * topology on a single device. Fake stuff so
797 * cluster ops work as expected.
800 nflags
|= HAMMER2_CLUSTER_WRHARD
;
801 nflags
|= HAMMER2_CLUSTER_RDHARD
;
802 cluster
->focus_index
= i
;
803 cluster
->focus
= chain
;
804 cluster
->error
= error
;
814 * Resolve nmasters - master nodes fully match
816 * Resolve umasters - master nodes operation still
819 * Resolve nmasters_keymatch - master nodes match the passed-in
820 * key and may or may not match
821 * the quorum-agreed tid.
823 * The quorum-agreed TID is the highest matching TID.
825 last_best_quorum_tid
= HAMMER2_TID_MAX
;
826 quorum_tid
= 0; /* fix gcc warning */
828 while (nmasters
< nquorum
&& last_best_quorum_tid
!= 0) {
832 for (i
= 0; i
< cluster
->nchains
; ++i
) {
833 /* XXX SOFT smpresent handling */
834 switch(cluster
->pmp
->pfs_types
[i
]) {
835 case HAMMER2_PFSTYPE_MASTER
:
836 case HAMMER2_PFSTYPE_SUPROOT
:
842 chain
= cluster
->array
[i
].chain
;
843 error
= cluster
->array
[i
].error
;
846 * Skip elements still in progress. umasters keeps
847 * track of masters that might still be in-progress.
849 if (chain
== NULL
&& (cluster
->array
[i
].flags
&
850 HAMMER2_CITEM_NULL
) == 0) {
858 if (flags
& HAMMER2_CHECK_NULL
) {
862 if (cluster
->error
== 0)
863 cluster
->error
= error
;
866 (key
== (hammer2_key_t
)-1 ||
867 chain
->bref
.key
== key
)) {
869 if (quorum_tid
< last_best_quorum_tid
&&
870 (quorum_tid
< chain
->bref
.modify_tid
||
873 * Better TID located, reset
877 quorum_tid
= chain
->bref
.modify_tid
;
879 if (quorum_tid
== chain
->bref
.modify_tid
) {
881 * TID matches current collection.
883 * (error handled in next pass)
886 if (chain
->error
== 0) {
887 cluster
->focus
= chain
;
888 cluster
->focus_index
= i
;
893 if (nmasters
>= nquorum
)
895 last_best_quorum_tid
= quorum_tid
;
899 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
900 nmasters, nquorum, nmasters_keymatch, umasters);
904 * Early return if we do not have enough masters.
906 if (nmasters
< nquorum
) {
907 if (nmasters
+ umasters
>= nquorum
)
909 if (nmasters_keymatch
< nquorum
)
915 * Validated end of scan.
917 if (flags
& HAMMER2_CHECK_NULL
) {
918 if (cluster
->error
== 0)
919 cluster
->error
= ENOENT
;
920 return cluster
->error
;
924 * If we have a NULL focus at this point the agreeing quorum all
927 if (cluster
->focus
== NULL
)
933 * We have quorum agreement, validate elements, not end of scan.
936 for (i
= 0; i
< cluster
->nchains
; ++i
) {
937 chain
= cluster
->array
[i
].chain
;
938 error
= cluster
->array
[i
].error
;
940 chain
->bref
.key
!= key
||
941 chain
->bref
.modify_tid
!= quorum_tid
) {
948 * XXX for now, cumulative error.
950 if (cluster
->error
== 0)
951 cluster
->error
= error
;
953 switch (cluster
->pmp
->pfs_types
[i
]) {
954 case HAMMER2_PFSTYPE_MASTER
:
955 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
956 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
957 nflags
|= HAMMER2_CLUSTER_WRHARD
;
958 nflags
|= HAMMER2_CLUSTER_RDHARD
;
960 case HAMMER2_PFSTYPE_SLAVE
:
962 * We must have enough up-to-date masters to reach
963 * a quorum and the slave modify_tid must match the
964 * quorum's modify_tid.
966 * Do not select an errored slave.
968 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
969 nflags
|= HAMMER2_CLUSTER_RDHARD
;
972 case HAMMER2_PFSTYPE_SOFT_MASTER
:
974 * Directly mounted soft master always wins. There
975 * should be only one.
977 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
978 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
980 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
982 * Directly mounted soft slave always wins. There
983 * should be only one.
987 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
989 case HAMMER2_PFSTYPE_SUPROOT
:
991 * spmp (degenerate case)
993 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
994 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
995 nflags
|= HAMMER2_CLUSTER_WRHARD
;
996 nflags
|= HAMMER2_CLUSTER_RDHARD
;
1004 * Focus now set, adjust ddflag. Skip this pass if the focus
1005 * is bad or if we are at the PFS root (the bref won't match at
1006 * the PFS root, obviously).
1008 focus
= cluster
->focus
;
1011 (cluster
->focus
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
1013 cluster
->ddflag
= 0;
1016 if (cluster
->focus
->flags
& HAMMER2_CHAIN_PFSBOUNDARY
)
1022 * Validate the elements that were not marked invalid. They should
1025 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1028 chain
= cluster
->array
[i
].chain
;
1034 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
)
1037 ddflag
= (chain
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
1038 if (chain
->bref
.type
!= focus
->bref
.type
||
1039 chain
->bref
.key
!= focus
->bref
.key
||
1040 chain
->bref
.keybits
!= focus
->bref
.keybits
||
1041 chain
->bref
.modify_tid
!= focus
->bref
.modify_tid
||
1042 chain
->bytes
!= focus
->bytes
||
1043 ddflag
!= cluster
->ddflag
) {
1044 cluster
->array
[i
].flags
|= HAMMER2_CITEM_INVALID
;
1045 if (hammer2_debug
& 1)
1046 kprintf("cluster_resolve: matching modify_tid failed "
1047 "bref test: idx=%d type=%02x/%02x "
1048 "key=%016jx/%d-%016jx/%d "
1049 "mod=%016jx/%016jx bytes=%u/%u\n",
1051 chain
->bref
.type
, focus
->bref
.type
,
1052 chain
->bref
.key
, chain
->bref
.keybits
,
1053 focus
->bref
.key
, focus
->bref
.keybits
,
1054 chain
->bref
.modify_tid
, focus
->bref
.modify_tid
,
1055 chain
->bytes
, focus
->bytes
);
1056 if (hammer2_debug
& 0x4000)
1057 panic("cluster_resolve");
1058 /* flag issue and force resync? */
1064 nflags
|= HAMMER2_CLUSTER_NOSOFT
;
1065 if (ttlmasters
== 0)
1066 nflags
|= HAMMER2_CLUSTER_NOHARD
;
1069 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1070 * all available nodes (even if 0 are available) are fully
1071 * synchronized. This is used by the synchronization thread to
1072 * determine if there is work it could potentially accomplish.
1074 if (nslaves
== ttlslaves
)
1075 nflags
|= HAMMER2_CLUSTER_SSYNCED
;
1076 if (nmasters
== ttlmasters
)
1077 nflags
|= HAMMER2_CLUSTER_MSYNCED
;
1080 * Determine if the cluster was successfully locked for the
1081 * requested operation and generate an error code. The cluster
1082 * will not be locked (or ref'd) if an error is returned.
1084 atomic_set_int(&cluster
->flags
, nflags
);
1085 atomic_clear_int(&cluster
->flags
, HAMMER2_CLUSTER_ZFLAGS
& ~nflags
);
1087 return cluster
->error
;
1091 * This is used by the sync thread to force non-NULL elements of a copy
1092 * of the pmp->iroot cluster to be good which is required to prime the
1096 hammer2_cluster_forcegood(hammer2_cluster_t
*cluster
)
1100 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1101 if (cluster
->array
[i
].chain
)
1102 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
1107 * Copy a cluster, returned a ref'd cluster. All underlying chains
1108 * are also ref'd, but not locked. Focus state is also copied.
1110 * Original cluster does not have to be locked but usually is.
1111 * New cluster will not be flagged as locked.
1113 * Callers using this function to initialize a new cluster from an inode
1114 * generally lock and resolve the resulting cluster.
1116 * Callers which use this function to save/restore a cluster structure
1117 * generally retain the focus state and do not re-resolve it. Caller should
1118 * not try to re-resolve internal (cparent) node state during an iteration
1119 * as the individual tracking elements of cparent in an iteration may not
1120 * match even though they are correct.
1123 hammer2_cluster_copy(hammer2_cluster_t
*ocluster
)
1125 hammer2_pfs_t
*pmp
= ocluster
->pmp
;
1126 hammer2_cluster_t
*ncluster
;
1127 hammer2_chain_t
*chain
;
1130 ncluster
= kmalloc(sizeof(*ncluster
), M_HAMMER2
, M_WAITOK
| M_ZERO
);
1131 ncluster
->pmp
= pmp
;
1132 ncluster
->nchains
= ocluster
->nchains
;
1135 for (i
= 0; i
< ocluster
->nchains
; ++i
) {
1136 chain
= ocluster
->array
[i
].chain
;
1137 ncluster
->array
[i
].chain
= chain
;
1138 ncluster
->array
[i
].flags
= ocluster
->array
[i
].flags
;
1140 hammer2_chain_ref(chain
);
1142 ncluster
->focus_index
= ocluster
->focus_index
;
1143 ncluster
->focus
= ocluster
->focus
;
1144 ncluster
->flags
= ocluster
->flags
& ~(HAMMER2_CLUSTER_LOCKED
|
1145 HAMMER2_CLUSTER_INODE
);
1151 * Unlock a cluster. Refcount and focus is maintained.
1154 hammer2_cluster_unlock(hammer2_cluster_t
*cluster
)
1156 hammer2_chain_t
*chain
;
1159 if ((cluster
->flags
& HAMMER2_CLUSTER_LOCKED
) == 0) {
1160 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1163 KKASSERT(cluster
->flags
& HAMMER2_CLUSTER_LOCKED
);
1164 KKASSERT(cluster
->refs
> 0);
1165 atomic_clear_int(&cluster
->flags
, HAMMER2_CLUSTER_LOCKED
);
1167 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1168 chain
= cluster
->array
[i
].chain
;
1170 hammer2_chain_unlock(chain
);
1174 /************************************************************************
1176 ************************************************************************
1179 * WARNING! blockref[] array data is not universal. These functions should
1180 * only be used to access universal data.
1182 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1183 * complete if necessary. The I/O's should have already been
1184 * initiated by the cluster_lock/chain_lock operation.
1186 * The cluster must already be in a modified state before wdata
1187 * is called. The data will already be available for this case.
1189 const hammer2_media_data_t
*
1190 hammer2_cluster_rdata(hammer2_cluster_t
*cluster
)
1192 KKASSERT(cluster
->focus
!= NULL
);
1193 return(cluster
->focus
->data
);
1196 hammer2_media_data_t
*
1197 hammer2_cluster_wdata(hammer2_cluster_t
*cluster
)
1199 KKASSERT(cluster
->focus
!= NULL
);
1200 KKASSERT(hammer2_cluster_modified(cluster
));
1201 return(cluster
->focus
->data
);