2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
59 * - Most complex functions, quorum management on transaction ids.
61 * - Locking and data accesses must be internally asynchronous.
63 * - Validate and manage cache coherency primitives (cache state
64 * is stored in chain topologies but must be validated by these
67 * (2) Lookups and Scans
68 * hammer2_cluster_lookup()
69 * hammer2_cluster_next()
71 * - Depend on locking & data retrieval functions, but still complex.
73 * - Must do quorum management on transaction ids.
75 * - Lookup and Iteration ops Must be internally asynchronous.
77 * (3) Modifying Operations
78 * hammer2_cluster_create()
80 * - Can usually punt on failures, operation continues unless quorum
81 * is lost. If quorum is lost, must wait for resynchronization
82 * (depending on the management mode).
84 * - Must disconnect node on failures (also not flush), remount, and
87 * - Network links (via kdmsg) are relatively easy to issue as the
88 * complex underworkings of hammer2_chain.c don't have to messed
89 * with (the protocol is at a higher level than block-level).
91 * - Multiple local disk nodes (i.e. block devices) are another matter.
92 * Chain operations have to be dispatched to per-node threads (xN)
93 * because we can't asynchronize potentially very complex chain
94 * operations in hammer2_chain.c (it would be a huge mess).
96 * (these threads are also used to terminate incoming kdmsg ops from other nodes)
99 * - Single-node filesystems do not use threads and will simply call
100 * hammer2_chain.c functions directly. This short-cut is handled
101 * at the base of each cluster function.
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
111 * Returns the bref type of the cluster's foucs.
113 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
114 * The cluster must be locked.
117 hammer2_cluster_type(hammer2_cluster_t
*cluster
)
119 if (cluster
->error
== 0) {
120 KKASSERT(cluster
->focus
!= NULL
);
121 return(cluster
->focus
->bref
.type
);
127 * Returns the bref of the cluster's focus, sans any data-offset information
128 * (since offset information is per-node and wouldn't be useful).
130 * Callers use this function to access modify_tid, mirror_tid, type,
133 * If the cluster is errored, returns an empty bref.
134 * The cluster must be locked.
137 hammer2_cluster_bref(hammer2_cluster_t
*cluster
, hammer2_blockref_t
*bref
)
139 if (cluster
->error
== 0) {
140 KKASSERT(cluster
->focus
!= NULL
);
141 *bref
= cluster
->focus
->bref
;
144 bzero(bref
, sizeof(*bref
));
149 * Create a degenerate cluster with one ref from a single locked chain.
150 * The returned cluster will be focused on the chain and inherit its
153 * The chain's lock and reference are transfered to the new cluster, so
154 * the caller should not try to unlock the chain separately.
159 hammer2_dummy_xop_from_chain(hammer2_xop_head_t
*xop
, hammer2_chain_t
*chain
)
161 hammer2_cluster_t
*cluster
;
163 bzero(xop
, sizeof(*xop
));
165 cluster
= &xop
->cluster
;
166 cluster
->array
[0].chain
= chain
;
167 cluster
->array
[0].flags
= HAMMER2_CITEM_FEMOD
;
168 cluster
->nchains
= 1;
169 cluster
->focus
= chain
;
170 cluster
->focus_index
= 0;
171 cluster
->pmp
= chain
->pmp
;
173 cluster
->error
= chain
->error
;
174 cluster
->flags
= HAMMER2_CLUSTER_LOCKED
|
175 HAMMER2_CLUSTER_WRHARD
|
176 HAMMER2_CLUSTER_RDHARD
|
177 HAMMER2_CLUSTER_MSYNCED
|
178 HAMMER2_CLUSTER_SSYNCED
;
182 * Add a reference to a cluster and its underlying chains.
184 * We must also ref the underlying chains in order to allow ref/unlock
185 * sequences to later re-lock.
188 hammer2_cluster_ref(hammer2_cluster_t
*cluster
)
190 atomic_add_int(&cluster
->refs
, 1);
194 * Drop the caller's reference to the cluster. When the ref count drops to
195 * zero this function frees the cluster and drops all underlying chains.
197 * In-progress read I/Os are typically detached from the cluster once the
198 * first one returns (the remaining stay attached to the DIOs but are then
199 * ignored and drop naturally).
202 hammer2_cluster_drop(hammer2_cluster_t
*cluster
)
204 hammer2_chain_t
*chain
;
207 KKASSERT(cluster
->refs
> 0);
208 if (atomic_fetchadd_int(&cluster
->refs
, -1) == 1) {
209 cluster
->focus
= NULL
; /* safety XXX chg to assert */
210 cluster
->focus_index
= 0;
212 for (i
= 0; i
< cluster
->nchains
; ++i
) {
213 chain
= cluster
->array
[i
].chain
;
215 hammer2_chain_drop(chain
);
216 cluster
->array
[i
].chain
= NULL
; /* safety */
219 cluster
->nchains
= 0; /* safety */
221 kfree(cluster
, M_HAMMER2
);
222 /* cluster is invalid */
227 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
229 * WARNING! This function expects the caller to handle resolution of the
230 * cluster. We never re-resolve the cluster in this function,
231 * because it might be used to temporarily unlock/relock a cparent
232 * in an iteration or recursrion, and the cparents elements do not
236 hammer2_cluster_lock(hammer2_cluster_t
*cluster
, int how
)
238 hammer2_chain_t
*chain
;
241 /* cannot be on inode-embedded cluster template, must be on copy */
242 KKASSERT(cluster
->refs
> 0);
243 KKASSERT((cluster
->flags
& HAMMER2_CLUSTER_INODE
) == 0);
244 if (cluster
->flags
& HAMMER2_CLUSTER_LOCKED
) {
245 panic("hammer2_cluster_lock: cluster %p already locked!\n",
248 atomic_set_int(&cluster
->flags
, HAMMER2_CLUSTER_LOCKED
);
251 * Lock chains and resolve state.
253 for (i
= 0; i
< cluster
->nchains
; ++i
) {
254 chain
= cluster
->array
[i
].chain
;
257 hammer2_chain_lock(chain
, how
);
262 hammer2_cluster_unhold(hammer2_cluster_t
*cluster
)
264 hammer2_chain_t
*chain
;
267 for (i
= 0; i
< cluster
->nchains
; ++i
) {
268 chain
= cluster
->array
[i
].chain
;
271 hammer2_chain_unhold(chain
);
276 hammer2_cluster_rehold(hammer2_cluster_t
*cluster
)
278 hammer2_chain_t
*chain
;
281 for (i
= 0; i
< cluster
->nchains
; ++i
) {
282 chain
= cluster
->array
[i
].chain
;
285 hammer2_chain_rehold(chain
);
290 * This is used by the XOPS subsystem to calculate the state of
291 * the collection and tell hammer2_xop_collect() what to do with it.
292 * The collection can be in various states of desynchronization, the
293 * caller specifically wants to resolve the passed-in key.
295 * Return values (HAMMER2_ERROR_*):
297 * 0 - Quorum agreement, key is valid
299 * ENOENT - Quorum agreement, end of scan
301 * ESRCH - Quorum agreement, key is INVALID (caller should
304 * EIO - Quorum agreement but all elements had errors.
306 * EDEADLK - No quorum agreement possible for key, a repair
307 * may be needed. Caller has to decide what to do,
308 * possibly iterating the key or generating an EIO.
310 * EINPROGRESS - No quorum agreement yet, but agreement is still
311 * possible if caller waits for more responses. Caller
312 * should not iterate key.
314 * CHECK - CRC check error
316 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
318 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
321 hammer2_cluster_check(hammer2_cluster_t
*cluster
, hammer2_key_t key
, int flags
)
323 hammer2_chain_t
*chain
;
324 hammer2_chain_t
*focus
;
326 hammer2_tid_t quorum_tid
;
327 hammer2_tid_t last_best_quorum_tid
;
332 int nmasters_keymatch
;
335 int umasters
; /* unknown masters (still in progress) */
340 cluster
->focus
= NULL
;
343 KKASSERT(pmp
!= NULL
|| cluster
->nchains
== 0);
348 nquorum
= pmp
? pmp
->pfs_nmasters
/ 2 + 1 : 0;
356 * NOTE: A NULL chain is not necessarily an error, it could be
357 * e.g. a lookup failure or the end of an iteration.
360 for (i
= 0; i
< cluster
->nchains
; ++i
) {
361 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_FEMOD
;
362 cluster
->array
[i
].flags
|= HAMMER2_CITEM_INVALID
;
364 chain
= cluster
->array
[i
].chain
;
365 error
= cluster
->array
[i
].error
;
366 if (chain
&& error
) {
367 if (cluster
->focus
== NULL
|| cluster
->focus
== chain
) {
368 /* error will be overridden by valid focus */
373 * Must count total masters and slaves whether the
374 * chain is errored or not.
376 switch (cluster
->pmp
->pfs_types
[i
]) {
377 case HAMMER2_PFSTYPE_SUPROOT
:
378 case HAMMER2_PFSTYPE_MASTER
:
381 case HAMMER2_PFSTYPE_SLAVE
:
387 switch (cluster
->pmp
->pfs_types
[i
]) {
388 case HAMMER2_PFSTYPE_MASTER
:
391 case HAMMER2_PFSTYPE_SLAVE
:
394 case HAMMER2_PFSTYPE_SOFT_MASTER
:
395 nflags
|= HAMMER2_CLUSTER_WRSOFT
;
396 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
398 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
399 nflags
|= HAMMER2_CLUSTER_RDSOFT
;
401 case HAMMER2_PFSTYPE_SUPROOT
:
403 * Degenerate cluster representing the super-root
404 * topology on a single device. Fake stuff so
405 * cluster ops work as expected.
408 nflags
|= HAMMER2_CLUSTER_WRHARD
;
409 nflags
|= HAMMER2_CLUSTER_RDHARD
;
410 cluster
->focus_index
= i
;
411 cluster
->focus
= chain
;
412 cluster
->error
= error
;
422 * Resolve nmasters - master nodes fully match
424 * Resolve umasters - master nodes operation still
427 * Resolve nmasters_keymatch - master nodes match the passed-in
428 * key and may or may not match
429 * the quorum-agreed tid.
431 * The quorum-agreed TID is the highest matching TID.
433 last_best_quorum_tid
= HAMMER2_TID_MAX
;
436 nmasters_keymatch
= 0;
437 quorum_tid
= 0; /* fix gcc warning */
439 while (nmasters
< nquorum
&& last_best_quorum_tid
!= 0) {
442 nmasters_keymatch
= 0;
445 for (i
= 0; i
< cluster
->nchains
; ++i
) {
446 /* XXX SOFT smpresent handling */
447 switch(cluster
->pmp
->pfs_types
[i
]) {
448 case HAMMER2_PFSTYPE_MASTER
:
449 case HAMMER2_PFSTYPE_SUPROOT
:
455 chain
= cluster
->array
[i
].chain
;
456 error
= cluster
->array
[i
].error
;
459 * Skip elements still in progress. umasters keeps
460 * track of masters that might still be in-progress.
462 if (chain
== NULL
&& (cluster
->array
[i
].flags
&
463 HAMMER2_CITEM_NULL
) == 0) {
471 if (flags
& HAMMER2_CHECK_NULL
) {
475 if (cluster
->error
== 0)
476 cluster
->error
= error
;
479 (key
== (hammer2_key_t
)-1 ||
480 chain
->bref
.key
== key
)) {
483 if (chain
->bref
.modify_tid
<
484 last_best_quorum_tid
&&
485 quorum_tid
< chain
->bref
.modify_tid
) {
487 * Select new TID as master if better
488 * than any found so far in this loop,
489 * as long as it does not reach the
490 * best tid found in the previous loop.
493 quorum_tid
= chain
->bref
.modify_tid
;
495 if (quorum_tid
== chain
->bref
.modify_tid
) {
497 * TID matches current collection.
499 * (error handled in next pass)
502 if (chain
->error
== 0) {
503 cluster
->focus
= chain
;
504 cluster
->focus_index
= i
;
509 if (nmasters
>= nquorum
)
511 last_best_quorum_tid
= quorum_tid
;
515 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
516 nmasters, nquorum, nmasters_keymatch, umasters);
520 * Early return if we do not have enough masters.
522 if (nmasters
< nquorum
) {
523 if (nmasters
+ umasters
>= nquorum
)
524 return HAMMER2_ERROR_EINPROGRESS
;
525 if (nmasters_keymatch
< nquorum
)
526 return HAMMER2_ERROR_ESRCH
;
527 return HAMMER2_ERROR_EDEADLK
;
531 * Validated end of scan.
533 if (flags
& HAMMER2_CHECK_NULL
) {
534 if (cluster
->error
== 0)
535 cluster
->error
= HAMMER2_ERROR_ENOENT
;
536 return cluster
->error
;
540 * If we have a NULL focus at this point the agreeing quorum all
543 if (cluster
->focus
== NULL
)
544 return HAMMER2_ERROR_EIO
;
549 * We have quorum agreement, validate elements, not end of scan.
554 for (i
= 0; i
< cluster
->nchains
; ++i
) {
555 chain
= cluster
->array
[i
].chain
;
556 error
= cluster
->array
[i
].error
;
558 chain
->bref
.key
!= key
||
559 chain
->bref
.modify_tid
!= quorum_tid
) {
566 * XXX for now, cumulative error.
568 if (cluster
->error
== 0)
569 cluster
->error
= error
;
571 switch (cluster
->pmp
->pfs_types
[i
]) {
572 case HAMMER2_PFSTYPE_MASTER
:
573 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
574 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
575 nflags
|= HAMMER2_CLUSTER_WRHARD
;
576 nflags
|= HAMMER2_CLUSTER_RDHARD
;
578 case HAMMER2_PFSTYPE_SLAVE
:
580 * We must have enough up-to-date masters to reach
581 * a quorum and the slave modify_tid must match the
582 * quorum's modify_tid.
584 * Do not select an errored slave.
586 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
587 nflags
|= HAMMER2_CLUSTER_RDHARD
;
590 case HAMMER2_PFSTYPE_SOFT_MASTER
:
592 * Directly mounted soft master always wins. There
593 * should be only one.
595 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
596 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
598 case HAMMER2_PFSTYPE_SOFT_SLAVE
:
600 * Directly mounted soft slave always wins. There
601 * should be only one.
605 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
607 case HAMMER2_PFSTYPE_SUPROOT
:
609 * spmp (degenerate case)
611 cluster
->array
[i
].flags
|= HAMMER2_CITEM_FEMOD
;
612 cluster
->array
[i
].flags
&= ~HAMMER2_CITEM_INVALID
;
613 nflags
|= HAMMER2_CLUSTER_WRHARD
;
614 nflags
|= HAMMER2_CLUSTER_RDHARD
;
622 * Focus now set, adjust ddflag. Skip this pass if the focus
623 * is bad or if we are at the PFS root (the bref won't match at
624 * the PFS root, obviously).
626 * focus is probably not locked and it isn't safe to test its
627 * content (e.g. focus->data, focus->dio, other content). We
628 * do not synchronize the dio to the cpu here. In fact, in numerous
629 * situations the frontend doesn't even need to access its dio/data,
630 * so synchronizing it here would be wasteful.
632 focus
= cluster
->focus
;
635 (cluster
->focus
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
640 if (cluster
->focus
->flags
& HAMMER2_CHAIN_PFSBOUNDARY
)
646 * Validate the elements that were not marked invalid. They should
649 for (i
= 0; i
< cluster
->nchains
; ++i
) {
652 chain
= cluster
->array
[i
].chain
;
658 if (cluster
->array
[i
].flags
& HAMMER2_CITEM_INVALID
)
661 ddflag
= (chain
->bref
.type
== HAMMER2_BREF_TYPE_INODE
);
662 if (chain
->bref
.type
!= focus
->bref
.type
||
663 chain
->bref
.key
!= focus
->bref
.key
||
664 chain
->bref
.keybits
!= focus
->bref
.keybits
||
665 chain
->bref
.modify_tid
!= focus
->bref
.modify_tid
||
666 chain
->bytes
!= focus
->bytes
||
667 ddflag
!= cluster
->ddflag
) {
668 cluster
->array
[i
].flags
|= HAMMER2_CITEM_INVALID
;
669 if (hammer2_debug
& 1)
670 kprintf("cluster_check: matching modify_tid failed "
671 "bref test: idx=%d type=%02x/%02x "
672 "key=%016jx/%d-%016jx/%d "
673 "mod=%016jx/%016jx bytes=%u/%u\n",
675 chain
->bref
.type
, focus
->bref
.type
,
676 chain
->bref
.key
, chain
->bref
.keybits
,
677 focus
->bref
.key
, focus
->bref
.keybits
,
678 chain
->bref
.modify_tid
, focus
->bref
.modify_tid
,
679 chain
->bytes
, focus
->bytes
);
680 if (hammer2_debug
& 0x4000)
681 panic("cluster_check");
682 /* flag issue and force resync? */
688 nflags
|= HAMMER2_CLUSTER_NOSOFT
;
690 nflags
|= HAMMER2_CLUSTER_NOHARD
;
693 * Set SSYNCED or MSYNCED for slaves and masters respectively if
694 * all available nodes (even if 0 are available) are fully
695 * synchronized. This is used by the synchronization thread to
696 * determine if there is work it could potentially accomplish.
698 if (nslaves
== ttlslaves
)
699 nflags
|= HAMMER2_CLUSTER_SSYNCED
;
700 if (nmasters
== ttlmasters
)
701 nflags
|= HAMMER2_CLUSTER_MSYNCED
;
704 * Determine if the cluster was successfully locked for the
705 * requested operation and generate an error code. The cluster
706 * will not be locked (or ref'd) if an error is returned.
708 atomic_set_int(&cluster
->flags
, nflags
);
709 atomic_clear_int(&cluster
->flags
, HAMMER2_CLUSTER_ZFLAGS
& ~nflags
);
711 return cluster
->error
;
715 * Unlock a cluster. Refcount and focus is maintained.
718 hammer2_cluster_unlock(hammer2_cluster_t
*cluster
)
720 hammer2_chain_t
*chain
;
723 if ((cluster
->flags
& HAMMER2_CLUSTER_LOCKED
) == 0) {
724 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
727 KKASSERT(cluster
->flags
& HAMMER2_CLUSTER_LOCKED
);
728 KKASSERT(cluster
->refs
> 0);
729 atomic_clear_int(&cluster
->flags
, HAMMER2_CLUSTER_LOCKED
);
731 for (i
= 0; i
< cluster
->nchains
; ++i
) {
732 chain
= cluster
->array
[i
].chain
;
734 hammer2_chain_unlock(chain
);