2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
61 * - Most complex functions, quorum management on transaction ids.
63 * - Locking and data accesses must be internally asynchronous.
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
73 * - Depend on locking & data retrieval functions, but still complex.
75 * - Must do quorum management on transaction ids.
77 * - Lookup and Iteration ops Must be internally asynchronous.
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
90 * - Must disconnect node on failures (also not flush), remount, and
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to messed
95 * with (the protocol is at a higher level than block-level).
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
102 * (these threads are also used to terminate incoming kdmsg ops from
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
119 * Returns TRUE if any chain in the cluster needs to be resized.
122 hammer2_cluster_need_resize(hammer2_cluster_t
*cluster
, int bytes
)
124 hammer2_chain_t
*chain
;
127 for (i
= 0; i
< cluster
->nchains
; ++i
) {
128 chain
= cluster
->array
[i
].chain
;
129 if (chain
&& chain
->bytes
!= bytes
)
136 hammer2_cluster_type(hammer2_cluster_t
*cluster
)
138 return(cluster
->focus
->bref
.type
);
142 hammer2_cluster_modified(hammer2_cluster_t
*cluster
)
144 return((cluster
->focus
->flags
& HAMMER2_CHAIN_MODIFIED
) != 0);
148 * Return a bref representative of the cluster. Any data offset is removed
149 * (since it would only be applicable to a particular chain in the cluster).
151 * However, the radix portion of data_off is used for many purposes and will
155 hammer2_cluster_bref(hammer2_cluster_t
*cluster
, hammer2_blockref_t
*bref
)
157 *bref
= cluster
->focus
->bref
;
158 bref
->data_off
&= HAMMER2_OFF_MASK_RADIX
;
162 * Return non-zero if the chain representing an inode has been flagged
163 * as having been unlinked. Allows the vnode reclaim to avoid loading
164 * the inode data from disk e.g. when unmount or recycling old, clean
168 hammer2_cluster_isunlinked(hammer2_cluster_t
*cluster
)
170 hammer2_chain_t
*chain
;
175 for (i
= 0; i
< cluster
->nchains
; ++i
) {
176 chain
= cluster
->array
[i
].chain
;
178 flags
|= chain
->flags
;
180 return (flags
& HAMMER2_CHAIN_UNLINKED
);
184 hammer2_cluster_set_chainflags(hammer2_cluster_t
*cluster
, uint32_t flags
)
186 hammer2_chain_t
*chain
;
189 for (i
= 0; i
< cluster
->nchains
; ++i
) {
190 chain
= cluster
->array
[i
].chain
;
192 atomic_set_int(&chain
->flags
, flags
);
197 hammer2_cluster_clr_chainflags(hammer2_cluster_t
*cluster
, uint32_t flags
)
199 hammer2_chain_t
*chain
;
202 for (i
= 0; i
< cluster
->nchains
; ++i
) {
203 chain
= cluster
->array
[i
].chain
;
205 atomic_clear_int(&chain
->flags
, flags
);
210 hammer2_cluster_setflush(hammer2_trans_t
*trans
, hammer2_cluster_t
*cluster
)
212 hammer2_chain_t
*chain
;
215 for (i
= 0; i
< cluster
->nchains
; ++i
) {
216 chain
= cluster
->array
[i
].chain
;
218 hammer2_chain_setflush(trans
, chain
);
223 hammer2_cluster_setmethod_check(hammer2_trans_t
*trans
,
224 hammer2_cluster_t
*cluster
,
227 hammer2_chain_t
*chain
;
230 for (i
= 0; i
< cluster
->nchains
; ++i
) {
231 chain
= cluster
->array
[i
].chain
;
233 KKASSERT(chain
->flags
& HAMMER2_CHAIN_MODIFIED
);
234 chain
->bref
.methods
&= ~HAMMER2_ENC_CHECK(-1);
235 chain
->bref
.methods
|= HAMMER2_ENC_CHECK(check_algo
);
241 * Create a cluster with one ref from the specified chain. The chain
242 * is not further referenced. The caller typically supplies a locked
243 * chain and transfers ownership to the cluster.
245 * The returned cluster will be focused on the chain (strictly speaking,
246 * the focus should be NULL if the chain is not locked but we do not check
247 * for this condition).
250 hammer2_cluster_from_chain(hammer2_chain_t
*chain
)
252 hammer2_cluster_t
*cluster
;
254 cluster
= kmalloc(sizeof(*cluster
), M_HAMMER2
, M_WAITOK
| M_ZERO
);
255 cluster
->array
[0].chain
= chain
;
256 cluster
->nchains
= 1;
257 cluster
->focus
= chain
;
258 cluster
->pmp
= chain
->pmp
;
265 * Allocates a cluster and its underlying chain structures. The underlying
266 * chains will be locked. The cluster and underlying chains will have one
267 * ref and will be focused on the first chain.
269 * XXX focus on first chain.
272 hammer2_cluster_alloc(hammer2_pfsmount_t
*pmp
,
273 hammer2_trans_t
*trans
, hammer2_blockref_t
*bref
)
275 hammer2_cluster_t
*cluster
;
276 hammer2_cluster_t
*rcluster
;
277 hammer2_chain_t
*chain
;
278 hammer2_chain_t
*rchain
;
280 u_int bytes
= 1U << (int)(bref
->data_off
& HAMMER2_OFF_MASK_RADIX
);
284 KKASSERT(pmp
!= NULL
);
287 * Construct the appropriate system structure.
290 case HAMMER2_BREF_TYPE_INODE
:
291 case HAMMER2_BREF_TYPE_INDIRECT
:
292 case HAMMER2_BREF_TYPE_FREEMAP_NODE
:
293 case HAMMER2_BREF_TYPE_DATA
:
294 case HAMMER2_BREF_TYPE_FREEMAP_LEAF
:
296 * Chain's are really only associated with the hmp but we
297 * maintain a pmp association for per-mount memory tracking
298 * purposes. The pmp can be NULL.
301 case HAMMER2_BREF_TYPE_VOLUME
:
302 case HAMMER2_BREF_TYPE_FREEMAP
:
304 panic("hammer2_cluster_alloc volume type illegal for op");
307 panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
311 cluster
= kmalloc(sizeof(*cluster
), M_HAMMER2
, M_WAITOK
| M_ZERO
);
314 rcluster
= &pmp
->iroot
->cluster
;
315 for (i
= 0; i
< rcluster
->nchains
; ++i
) {
316 rchain
= rcluster
->array
[i
].chain
;
317 chain
= hammer2_chain_alloc(rchain
->hmp
, pmp
, trans
, bref
);
319 chain
->hmp
= rchain
->hmp
;
321 chain
->bytes
= bytes
;
323 chain
->flags
= HAMMER2_CHAIN_ALLOCATED
;
327 * NOTE: When loading a chain from backing store or creating a
328 * snapshot, trans will be NULL and the caller is
329 * responsible for setting these fields.
331 cluster
->array
[i
].chain
= chain
;
333 cluster
->nchains
= i
;
335 cluster
->focus
= cluster
->array
[0].chain
;
341 * Add a reference to a cluster.
343 * We must also ref the underlying chains in order to allow ref/unlock
344 * sequences to later re-lock.
347 hammer2_cluster_ref(hammer2_cluster_t
*cluster
)
349 hammer2_chain_t
*chain
;
352 atomic_add_int(&cluster
->refs
, 1);
353 for (i
= 0; i
< cluster
->nchains
; ++i
) {
354 chain
= cluster
->array
[i
].chain
;
356 hammer2_chain_ref(chain
);
361 * Drop the caller's reference to the cluster. When the ref count drops to
362 * zero this function frees the cluster and drops all underlying chains.
364 * In-progress read I/Os are typically detached from the cluster once the
365 * first one returns (the remaining stay attached to the DIOs but are then
366 * ignored and drop naturally).
369 hammer2_cluster_drop(hammer2_cluster_t
*cluster
)
371 hammer2_chain_t
*chain
;
374 KKASSERT(cluster
->refs
> 0);
375 for (i
= 0; i
< cluster
->nchains
; ++i
) {
376 chain
= cluster
->array
[i
].chain
;
378 hammer2_chain_drop(chain
);
379 if (cluster
->refs
== 1)
380 cluster
->array
[i
].chain
= NULL
;
383 if (atomic_fetchadd_int(&cluster
->refs
, -1) == 1) {
384 cluster
->focus
= NULL
; /* safety */
385 kfree(cluster
, M_HAMMER2
);
386 /* cluster is invalid */
391 hammer2_cluster_wait(hammer2_cluster_t
*cluster
)
393 tsleep(cluster
->focus
, 0, "h2clcw", 1);
397 * Lock and ref a cluster. This adds a ref to the cluster and its chains
398 * and then locks them.
400 * The act of locking a cluster sets its focus if not already set.
403 hammer2_cluster_lock(hammer2_cluster_t
*cluster
, int how
)
405 hammer2_chain_t
*chain
;
406 hammer2_chain_t
*tmp
;
410 if ((how
& HAMMER2_RESOLVE_NOREF
) == 0)
411 atomic_add_int(&cluster
->refs
, 1);
415 for (i
= 0; i
< cluster
->nchains
; ++i
) {
416 chain
= cluster
->array
[i
].chain
;
418 error
= hammer2_chain_lock(chain
, how
);
421 tmp
= cluster
->array
[i
].chain
;
422 hammer2_chain_unlock(tmp
);
424 atomic_add_int(&cluster
->refs
, -1);
427 if (cluster
->focus
== NULL
)
428 cluster
->focus
= chain
;
435 * Replace the contents of dst with src, adding a reference to src's chains.
436 * dst is assumed to already have a ref and any chains present in dst are
437 * assumed to be locked and will be unlocked.
439 * If the chains in src are locked, only one of (src) or (dst) should be
440 * considered locked by the caller after return, not both.
443 hammer2_cluster_replace(hammer2_cluster_t
*dst
, hammer2_cluster_t
*src
)
445 hammer2_chain_t
*chain
;
446 hammer2_chain_t
*tmp
;
449 KKASSERT(dst
->refs
== 1);
452 for (i
= 0; i
< src
->nchains
; ++i
) {
453 chain
= src
->array
[i
].chain
;
455 hammer2_chain_ref(chain
);
456 if (i
< dst
->nchains
&&
457 (tmp
= dst
->array
[i
].chain
) != NULL
) {
458 hammer2_chain_unlock(tmp
);
460 dst
->array
[i
].chain
= chain
;
461 if (dst
->focus
== NULL
)
465 while (i
< dst
->nchains
) {
466 chain
= dst
->array
[i
].chain
;
468 hammer2_chain_unlock(chain
);
469 dst
->array
[i
].chain
= NULL
;
473 dst
->nchains
= src
->nchains
;
477 * Replace the contents of the locked destination with the contents of the
478 * locked source. Destination must have one ref.
480 * Returns with the destination still with one ref and the copied chains
481 * with an additional lock (representing their state on the destination).
482 * The original chains associated with the destination are unlocked.
485 hammer2_cluster_replace_locked(hammer2_cluster_t
*dst
, hammer2_cluster_t
*src
)
487 hammer2_chain_t
*chain
;
488 hammer2_chain_t
*tmp
;
491 KKASSERT(dst
->refs
== 1);
494 for (i
= 0; i
< src
->nchains
; ++i
) {
495 chain
= src
->array
[i
].chain
;
497 hammer2_chain_lock(chain
, 0);
498 if (i
< dst
->nchains
&&
499 (tmp
= dst
->array
[i
].chain
) != NULL
) {
500 hammer2_chain_unlock(tmp
);
502 dst
->array
[i
].chain
= chain
;
503 if (dst
->focus
== NULL
)
507 while (i
< dst
->nchains
) {
508 chain
= dst
->array
[i
].chain
;
510 hammer2_chain_unlock(chain
);
511 dst
->array
[i
].chain
= NULL
;
515 dst
->nchains
= src
->nchains
;
519 * Copy a cluster, returned a ref'd cluster. All underlying chains
520 * are also ref'd, but not locked. The cluster focus is not set because
521 * the cluster is not yet locked (and the originating cluster does not
522 * have to be locked either).
525 hammer2_cluster_copy(hammer2_cluster_t
*ocluster
)
527 hammer2_pfsmount_t
*pmp
= ocluster
->pmp
;
528 hammer2_cluster_t
*ncluster
;
529 hammer2_chain_t
*chain
;
532 ncluster
= kmalloc(sizeof(*ncluster
), M_HAMMER2
, M_WAITOK
| M_ZERO
);
534 ncluster
->nchains
= ocluster
->nchains
;
537 for (i
= 0; i
< ocluster
->nchains
; ++i
) {
538 chain
= ocluster
->array
[i
].chain
;
539 ncluster
->array
[i
].chain
= chain
;
541 hammer2_chain_ref(chain
);
547 * Unlock and deref a cluster. The cluster is destroyed if this is the
551 hammer2_cluster_unlock(hammer2_cluster_t
*cluster
)
553 hammer2_chain_t
*chain
;
556 KKASSERT(cluster
->refs
> 0);
557 for (i
= 0; i
< cluster
->nchains
; ++i
) {
558 chain
= cluster
->array
[i
].chain
;
560 hammer2_chain_unlock(chain
);
561 if (cluster
->refs
== 1)
562 cluster
->array
[i
].chain
= NULL
; /* safety */
565 if (atomic_fetchadd_int(&cluster
->refs
, -1) == 1) {
566 cluster
->focus
= NULL
;
567 kfree(cluster
, M_HAMMER2
);
568 /* cluster = NULL; safety */
573 * Resize the cluster's physical storage allocation in-place. This may
574 * replace the cluster's chains.
577 hammer2_cluster_resize(hammer2_trans_t
*trans
, hammer2_inode_t
*ip
,
578 hammer2_cluster_t
*cparent
, hammer2_cluster_t
*cluster
,
579 int nradix
, int flags
)
581 hammer2_chain_t
*chain
;
584 KKASSERT(cparent
->pmp
== cluster
->pmp
); /* can be NULL */
585 KKASSERT(cparent
->nchains
== cluster
->nchains
);
587 cluster
->focus
= NULL
;
588 for (i
= 0; i
< cluster
->nchains
; ++i
) {
589 chain
= cluster
->array
[i
].chain
;
591 KKASSERT(cparent
->array
[i
].chain
);
592 hammer2_chain_resize(trans
, ip
,
593 cparent
->array
[i
].chain
, chain
,
595 if (cluster
->focus
== NULL
)
596 cluster
->focus
= chain
;
602 * Set an inode's cluster modified, marking the related chains RW and
603 * duplicating them if necessary.
605 * The passed-in chain is a localized copy of the chain previously acquired
606 * when the inode was locked (and possilby replaced in the mean time), and
607 * must also be updated. In fact, we update it first and then synchronize
608 * the inode's cluster cache.
610 hammer2_inode_data_t
*
611 hammer2_cluster_modify_ip(hammer2_trans_t
*trans
, hammer2_inode_t
*ip
,
612 hammer2_cluster_t
*cluster
, int flags
)
614 atomic_set_int(&ip
->flags
, HAMMER2_INODE_MODIFIED
);
615 hammer2_cluster_modify(trans
, cluster
, flags
);
617 hammer2_inode_repoint(ip
, NULL
, cluster
);
620 return (&hammer2_cluster_wdata(cluster
)->ipdata
);
624 * Adjust the cluster's chains to allow modification and adjust the
625 * focus. Data will be accessible on return.
628 hammer2_cluster_modify(hammer2_trans_t
*trans
, hammer2_cluster_t
*cluster
,
631 hammer2_chain_t
*chain
;
634 cluster
->focus
= NULL
;
635 for (i
= 0; i
< cluster
->nchains
; ++i
) {
636 chain
= cluster
->array
[i
].chain
;
638 hammer2_chain_modify(trans
, chain
, flags
);
639 if (cluster
->focus
== NULL
)
640 cluster
->focus
= chain
;
646 * Synchronize modifications from the focus to other chains in a cluster.
647 * Convenient because nominal API users can just modify the contents of the
648 * focus (at least for non-blockref data).
650 * Nominal front-end operations only edit non-block-table data in a single
651 * chain. This code copies such modifications to the other chains in the
652 * cluster. Blocktable modifications are handled on a chain-by-chain basis
653 * by both the frontend and the backend and will explode in fireworks if
657 hammer2_cluster_modsync(hammer2_cluster_t
*cluster
)
659 hammer2_chain_t
*focus
;
660 hammer2_chain_t
*scan
;
661 const hammer2_inode_data_t
*ripdata
;
662 hammer2_inode_data_t
*wipdata
;
665 focus
= cluster
->focus
;
666 KKASSERT(focus
->flags
& HAMMER2_CHAIN_MODIFIED
);
668 for (i
= 0; i
< cluster
->nchains
; ++i
) {
669 scan
= cluster
->array
[i
].chain
;
670 if (scan
== NULL
|| scan
== focus
)
672 KKASSERT(scan
->flags
& HAMMER2_CHAIN_MODIFIED
);
673 KKASSERT(focus
->bytes
== scan
->bytes
&&
674 focus
->bref
.type
== scan
->bref
.type
);
675 switch(focus
->bref
.type
) {
676 case HAMMER2_BREF_TYPE_INODE
:
677 ripdata
= &focus
->data
->ipdata
;
678 wipdata
= &scan
->data
->ipdata
;
679 if ((ripdata
->op_flags
&
680 HAMMER2_OPFLAG_DIRECTDATA
) == 0) {
681 bcopy(ripdata
, wipdata
,
682 offsetof(hammer2_inode_data_t
, u
));
686 case HAMMER2_BREF_TYPE_DATA
:
687 bcopy(focus
->data
, scan
->data
, focus
->bytes
);
689 case HAMMER2_BREF_TYPE_FREEMAP_NODE
:
690 case HAMMER2_BREF_TYPE_FREEMAP_LEAF
:
691 case HAMMER2_BREF_TYPE_FREEMAP
:
692 case HAMMER2_BREF_TYPE_VOLUME
:
693 panic("hammer2_cluster_modsync: illegal node type");
697 panic("hammer2_cluster_modsync: unknown node type");
704 * Lookup initialization/completion API
707 hammer2_cluster_lookup_init(hammer2_cluster_t
*cparent
, int flags
)
709 hammer2_cluster_t
*cluster
;
712 cluster
= kmalloc(sizeof(*cluster
), M_HAMMER2
, M_WAITOK
| M_ZERO
);
713 cluster
->pmp
= cparent
->pmp
; /* can be NULL */
714 /* cluster->focus = NULL; already null */
716 for (i
= 0; i
< cparent
->nchains
; ++i
) {
717 cluster
->array
[i
].chain
= cparent
->array
[i
].chain
;
718 if (cluster
->focus
== NULL
)
719 cluster
->focus
= cluster
->array
[i
].chain
;
721 cluster
->nchains
= cparent
->nchains
;
724 * Independently lock (this will also give cluster 1 ref)
726 if (flags
& HAMMER2_LOOKUP_SHARED
) {
727 hammer2_cluster_lock(cluster
, HAMMER2_RESOLVE_ALWAYS
|
728 HAMMER2_RESOLVE_SHARED
);
730 hammer2_cluster_lock(cluster
, HAMMER2_RESOLVE_ALWAYS
);
736 hammer2_cluster_lookup_done(hammer2_cluster_t
*cparent
)
739 hammer2_cluster_unlock(cparent
);
743 * Locate first match or overlap under parent, return a new cluster
746 hammer2_cluster_lookup(hammer2_cluster_t
*cparent
, hammer2_key_t
*key_nextp
,
747 hammer2_key_t key_beg
, hammer2_key_t key_end
,
748 int flags
, int *ddflagp
)
750 hammer2_pfsmount_t
*pmp
;
751 hammer2_cluster_t
*cluster
;
752 hammer2_chain_t
*chain
;
753 hammer2_key_t key_accum
;
754 hammer2_key_t key_next
;
755 hammer2_key_t bref_key
;
763 pmp
= cparent
->pmp
; /* can be NULL */
764 key_accum
= *key_nextp
;
771 cluster
= kmalloc(sizeof(*cluster
), M_HAMMER2
, M_WAITOK
| M_ZERO
);
772 cluster
->pmp
= pmp
; /* can be NULL */
774 /* cluster->focus = NULL; already null */
775 cparent
->focus
= NULL
;
778 for (i
= 0; i
< cparent
->nchains
; ++i
) {
779 key_next
= *key_nextp
;
780 if (cparent
->array
[i
].chain
== NULL
) {
784 chain
= hammer2_chain_lookup(&cparent
->array
[i
].chain
,
787 &cparent
->array
[i
].cache_index
,
789 if (cparent
->focus
== NULL
)
790 cparent
->focus
= cparent
->array
[i
].chain
;
791 cluster
->array
[i
].chain
= chain
;
795 if (cluster
->focus
== NULL
) {
796 bref_type
= chain
->bref
.type
;
797 bref_key
= chain
->bref
.key
;
798 bref_keybits
= chain
->bref
.keybits
;
799 bytes
= chain
->bytes
;
801 cluster
->focus
= chain
;
803 KKASSERT(bref_type
== chain
->bref
.type
);
804 KKASSERT(bref_key
== chain
->bref
.key
);
805 KKASSERT(bref_keybits
== chain
->bref
.keybits
);
806 KKASSERT(bytes
== chain
->bytes
);
807 KKASSERT(*ddflagp
== ddflag
);
809 if (key_accum
> key_next
)
810 key_accum
= key_next
;
812 *key_nextp
= key_accum
;
813 cluster
->nchains
= i
;
815 if (null_count
== i
) {
816 hammer2_cluster_drop(cluster
);
824 * Locate next match or overlap under parent, replace cluster
827 hammer2_cluster_next(hammer2_cluster_t
*cparent
, hammer2_cluster_t
*cluster
,
828 hammer2_key_t
*key_nextp
,
829 hammer2_key_t key_beg
, hammer2_key_t key_end
, int flags
)
831 hammer2_chain_t
*chain
;
832 hammer2_key_t key_accum
;
833 hammer2_key_t key_next
;
837 key_accum
= *key_nextp
;
839 cluster
->focus
= NULL
;
840 cparent
->focus
= NULL
;
842 for (i
= 0; i
< cparent
->nchains
; ++i
) {
843 key_next
= *key_nextp
;
844 chain
= cluster
->array
[i
].chain
;
846 if (cparent
->focus
== NULL
)
847 cparent
->focus
= cparent
->array
[i
].chain
;
851 if (cparent
->array
[i
].chain
== NULL
) {
852 if (flags
& HAMMER2_LOOKUP_NOLOCK
)
853 hammer2_chain_drop(chain
);
855 hammer2_chain_unlock(chain
);
859 chain
= hammer2_chain_next(&cparent
->array
[i
].chain
, chain
,
860 &key_next
, key_beg
, key_end
,
861 &cparent
->array
[i
].cache_index
,
863 if (cparent
->focus
== NULL
)
864 cparent
->focus
= cparent
->array
[i
].chain
;
865 cluster
->array
[i
].chain
= chain
;
868 } else if (cluster
->focus
== NULL
) {
869 cluster
->focus
= chain
;
871 if (key_accum
> key_next
)
872 key_accum
= key_next
;
875 if (null_count
== i
) {
876 hammer2_cluster_drop(cluster
);
884 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
886 * The raw scan function is similar to lookup/next but does not seek to a key.
887 * Blockrefs are iterated via first_chain = (parent, NULL) and
888 * next_chain = (parent, chain).
890 * The passed-in parent must be locked and its data resolved. The returned
891 * chain will be locked. Pass chain == NULL to acquire the first sub-chain
892 * under parent and then iterate with the passed-in chain (which this
893 * function will unlock).
896 hammer2_cluster_scan(hammer2_cluster_t
*cparent
, hammer2_cluster_t
*cluster
,
899 hammer2_chain_t
*chain
;
905 for (i
= 0; i
< cparent
->nchains
; ++i
) {
906 chain
= cluster
->array
[i
].chain
;
911 if (cparent
->array
[i
].chain
== NULL
) {
912 if (flags
& HAMMER2_LOOKUP_NOLOCK
)
913 hammer2_chain_drop(chain
);
915 hammer2_chain_unlock(chain
);
920 chain
= hammer2_chain_scan(cparent
->array
[i
].chain
, chain
,
921 &cparent
->array
[i
].cache_index
,
923 cluster
->array
[i
].chain
= chain
;
928 if (null_count
== i
) {
929 hammer2_cluster_drop(cluster
);
938 * Create a new cluster using the specified key
941 hammer2_cluster_create(hammer2_trans_t
*trans
, hammer2_cluster_t
*cparent
,
942 hammer2_cluster_t
**clusterp
,
943 hammer2_key_t key
, int keybits
,
944 int type
, size_t bytes
, int flags
)
946 hammer2_cluster_t
*cluster
;
947 hammer2_pfsmount_t
*pmp
;
951 pmp
= trans
->pmp
; /* can be NULL */
953 if ((cluster
= *clusterp
) == NULL
) {
954 cluster
= kmalloc(sizeof(*cluster
), M_HAMMER2
,
956 cluster
->pmp
= pmp
; /* can be NULL */
959 cluster
->focus
= NULL
;
960 cparent
->focus
= NULL
;
963 * NOTE: cluster->array[] entries can initially be NULL. If
964 * *clusterp is supplied, skip NULL entries, otherwise
967 for (i
= 0; i
< cparent
->nchains
; ++i
) {
968 if (*clusterp
&& cluster
->array
[i
].chain
== NULL
) {
969 if (cparent
->focus
== NULL
)
970 cparent
->focus
= cparent
->array
[i
].chain
;
973 error
= hammer2_chain_create(trans
, &cparent
->array
[i
].chain
,
974 &cluster
->array
[i
].chain
, pmp
,
977 KKASSERT(error
== 0);
978 if (cparent
->focus
== NULL
)
979 cparent
->focus
= cparent
->array
[i
].chain
;
980 if (cluster
->focus
== NULL
)
981 cluster
->focus
= cluster
->array
[i
].chain
;
983 cluster
->nchains
= i
;
990 * Rename a cluster to a new parent.
992 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
993 * are used from a passed-in non-NULL bref pointer. All other fields
994 * are extracted from the original chain for each chain in the
998 hammer2_cluster_rename(hammer2_trans_t
*trans
, hammer2_blockref_t
*bref
,
999 hammer2_cluster_t
*cparent
, hammer2_cluster_t
*cluster
,
1002 hammer2_chain_t
*chain
;
1003 hammer2_blockref_t xbref
;
1006 cluster
->focus
= NULL
;
1007 cparent
->focus
= NULL
;
1009 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1010 chain
= cluster
->array
[i
].chain
;
1013 xbref
= chain
->bref
;
1014 xbref
.key
= bref
->key
;
1015 xbref
.keybits
= bref
->keybits
;
1016 hammer2_chain_rename(trans
, &xbref
,
1017 &cparent
->array
[i
].chain
,
1020 hammer2_chain_rename(trans
, NULL
,
1021 &cparent
->array
[i
].chain
,
1024 cluster
->array
[i
].chain
= chain
;
1025 if (cluster
->focus
== NULL
)
1026 cluster
->focus
= chain
;
1027 if (cparent
->focus
== NULL
)
1028 cparent
->focus
= cparent
->array
[i
].chain
;
1030 if (cparent
->focus
== NULL
)
1031 cparent
->focus
= cparent
->array
[i
].chain
;
1037 * Mark a cluster deleted
1040 hammer2_cluster_delete(hammer2_trans_t
*trans
, hammer2_cluster_t
*cparent
,
1041 hammer2_cluster_t
*cluster
, int flags
)
1043 hammer2_chain_t
*chain
;
1044 hammer2_chain_t
*parent
;
1047 if (cparent
== NULL
) {
1048 kprintf("cparent is NULL\n");
1052 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1053 parent
= (i
< cparent
->nchains
) ?
1054 cparent
->array
[i
].chain
: NULL
;
1055 chain
= cluster
->array
[i
].chain
;
1058 if (chain
->parent
!= parent
) {
1059 kprintf("hammer2_cluster_delete: parent "
1060 "mismatch chain=%p parent=%p against=%p\n",
1061 chain
, chain
->parent
, parent
);
1063 hammer2_chain_delete(trans
, parent
, chain
, flags
);
1069 * Create a snapshot of the specified {parent, ochain} with the specified
1070 * label. The originating hammer2_inode must be exclusively locked for
1073 * The ioctl code has already synced the filesystem.
1076 hammer2_cluster_snapshot(hammer2_trans_t
*trans
, hammer2_cluster_t
*ocluster
,
1077 hammer2_ioc_pfs_t
*pfs
)
1079 hammer2_mount_t
*hmp
;
1080 hammer2_cluster_t
*ncluster
;
1081 const hammer2_inode_data_t
*ripdata
;
1082 hammer2_inode_data_t
*wipdata
;
1083 hammer2_chain_t
*nchain
;
1084 hammer2_inode_t
*nip
;
1092 kprintf("snapshot %s\n", pfs
->name
);
1094 name_len
= strlen(pfs
->name
);
1095 lhc
= hammer2_dirhash(pfs
->name
, name_len
);
1100 ripdata
= &hammer2_cluster_rdata(ocluster
)->ipdata
;
1101 opfs_clid
= ripdata
->pfs_clid
;
1102 hmp
= ocluster
->focus
->hmp
;
1105 * Create the snapshot directory under the super-root
1107 * Set PFS type, generate a unique filesystem id, and generate
1108 * a cluster id. Use the same clid when snapshotting a PFS root,
1109 * which theoretically allows the snapshot to be used as part of
1110 * the same cluster (perhaps as a cache).
1112 * Copy the (flushed) blockref array. Theoretically we could use
1113 * chain_duplicate() but it becomes difficult to disentangle
1114 * the shared core so for now just brute-force it.
1120 nip
= hammer2_inode_create(trans
, hmp
->spmp
->iroot
, &vat
,
1121 proc0
.p_ucred
, pfs
->name
, name_len
,
1125 wipdata
= hammer2_cluster_modify_ip(trans
, nip
, ncluster
, 0);
1126 wipdata
->pfs_type
= HAMMER2_PFSTYPE_SNAPSHOT
;
1127 kern_uuidgen(&wipdata
->pfs_fsid
, 1);
1128 if (ocluster
->focus
->flags
& HAMMER2_CHAIN_PFSBOUNDARY
)
1129 wipdata
->pfs_clid
= opfs_clid
;
1131 kern_uuidgen(&wipdata
->pfs_clid
, 1);
1133 for (i
= 0; i
< ncluster
->nchains
; ++i
) {
1134 nchain
= ncluster
->array
[i
].chain
;
1136 nchain
->bref
.flags
|= HAMMER2_BREF_FLAG_PFSROOT
;
1139 /* XXX can't set this unless we do an explicit flush, which
1140 we also need a pmp assigned to do, else the flush code
1141 won't flush ncluster because it thinks it is crossing a
1143 hammer2_cluster_set_chainflags(ncluster
,
1144 HAMMER2_CHAIN_PFSBOUNDARY
);
1147 /* XXX hack blockset copy */
1148 /* XXX doesn't work with real cluster */
1149 KKASSERT(ocluster
->nchains
== 1);
1150 wipdata
->u
.blockset
= ripdata
->u
.blockset
;
1151 hammer2_cluster_modsync(ncluster
);
1152 for (i
= 0; i
< ncluster
->nchains
; ++i
) {
1153 nchain
= ncluster
->array
[i
].chain
;
1155 hammer2_flush(trans
, nchain
);
1157 hammer2_inode_unlock_ex(nip
, ncluster
);
1163 * Return locked parent cluster given a locked child. The child remains
1164 * locked on return. The new parent's focus follows the child's focus
1165 * and the parent is always resolved.
1168 hammer2_cluster_parent(hammer2_cluster_t
*cluster
)
1170 hammer2_cluster_t
*cparent
;
1173 cparent
= hammer2_cluster_copy(cluster
);
1174 for (i
= 0; i
< cparent
->nchains
; ++i
) {
1175 hammer2_chain_t
*chain
;
1176 hammer2_chain_t
*rchain
;
1179 * Calculate parent for each element. Old chain has an extra
1180 * ref for cparent but the lock remains with cluster.
1182 chain
= cparent
->array
[i
].chain
;
1185 while ((rchain
= chain
->parent
) != NULL
) {
1186 hammer2_chain_ref(rchain
);
1187 hammer2_chain_unlock(chain
);
1188 hammer2_chain_lock(rchain
, HAMMER2_RESOLVE_ALWAYS
);
1189 hammer2_chain_lock(chain
, HAMMER2_RESOLVE_ALWAYS
);
1190 hammer2_chain_drop(rchain
);
1191 if (chain
->parent
== rchain
)
1193 hammer2_chain_unlock(rchain
);
1195 if (cluster
->focus
== chain
)
1196 cparent
->focus
= rchain
;
1197 cparent
->array
[i
].chain
= rchain
;
1198 hammer2_chain_drop(chain
);
1203 /************************************************************************
1205 ************************************************************************
1208 * WARNING! blockref[] array data is not universal. These functions should
1209 * only be used to access universal data.
1211 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1212 * complete if necessary. The I/O's should have already been
1213 * initiated by the cluster_lock/chain_lock operation.
1215 * The cluster must already be in a modified state before wdata
1216 * is called. The data will already be available for this case.
1218 const hammer2_media_data_t
*
1219 hammer2_cluster_rdata(hammer2_cluster_t
*cluster
)
1221 return(cluster
->focus
->data
);
1224 hammer2_media_data_t
*
1225 hammer2_cluster_wdata(hammer2_cluster_t
*cluster
)
1227 KKASSERT(hammer2_cluster_modified(cluster
));
1228 return(cluster
->focus
->data
);
1232 * Load async into independent buffer - used to load logical buffers from
1233 * underlying device data. The callback is made for the first validated
1234 * data found, or NULL if no valid data is available.
1236 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1237 * in the inode with an exclusive lock held), the chain structure may be
1241 hammer2_cluster_load_async(hammer2_cluster_t
*cluster
,
1242 void (*callback
)(hammer2_iocb_t
*iocb
), void *ptr
)
1244 hammer2_chain_t
*chain
;
1245 hammer2_iocb_t
*iocb
;
1246 hammer2_mount_t
*hmp
;
1247 hammer2_blockref_t
*bref
;
1251 * Try to find a chain whos data is already resolved. If none can
1252 * be found, start with the first chain.
1255 for (i
= 0; i
< cluster
->nchains
; ++i
) {
1256 chain
= cluster
->array
[i
].chain
;
1257 if (chain
&& chain
->data
)
1260 if (i
== cluster
->nchains
) {
1261 chain
= cluster
->array
[0].chain
;
1265 iocb
= &cluster
->iocb
;
1266 iocb
->callback
= callback
;
1267 iocb
->dio
= NULL
; /* for already-validated case */
1268 iocb
->cluster
= cluster
;
1269 iocb
->chain
= chain
;
1271 iocb
->lbase
= (off_t
)i
;
1276 * Data already validated
1284 * We must resolve to a device buffer, either by issuing I/O or
1285 * by creating a zero-fill element. We do not mark the buffer
1286 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1287 * API must still be used to do that).
1289 * The device buffer is variable-sized in powers of 2 down
1290 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
1291 * chunk always contains buffers of the same size. (XXX)
1293 * The minimum physical IO size may be larger than the variable
1296 bref
= &chain
->bref
;
1300 /* handled by callback? <- TODO XXX even needed for loads? */
1302 * The getblk() optimization for a 100% overwrite can only be used
1303 * if the physical block size matches the request.
1305 if ((chain
->flags
& HAMMER2_CHAIN_INITIAL
) &&
1306 chain
->bytes
== hammer2_devblksize(chain
->bytes
)) {
1307 error
= hammer2_io_new(hmp
, bref
->data_off
, chain
->bytes
, &dio
);
1308 KKASSERT(error
== 0);
1316 * Otherwise issue a read
1318 hammer2_adjreadcounter(&chain
->bref
, chain
->bytes
);
1319 hammer2_io_getblk(hmp
, bref
->data_off
, chain
->bytes
, iocb
);
1322 /************************************************************************
1324 ************************************************************************
1326 * A node failure can occur for numerous reasons.
1328 * - A read I/O may fail
1329 * - A write I/O may fail
1330 * - An unexpected chain might be found (or be missing)
1331 * - A node might disconnect temporarily and reconnect later
1332 * (for example, a USB stick could get pulled, or a node might
1333 * be programmatically disconnected).
1334 * - A node might run out of space during a modifying operation.
1336 * When a read failure or an unexpected chain state is found, the chain and
1337 * parent chain at the failure point for the nodes involved (the nodes
1338 * which we determine to be in error) are flagged as failed and removed
1339 * from the cluster. The node itself is allowed to remain active. The
1340 * highest common point (usually a parent chain) is queued to the
1341 * resynchronization thread for action.
1343 * When a write I/O fails or a node runs out of space, we first adjust
1344 * as if a read failure occurs but we further disable flushes on the
1345 * ENTIRE node. Concurrent modifying transactions are allowed to complete
1346 * but any new modifying transactions will automatically remove the node
1347 * from consideration in all related cluster structures and not generate
1348 * any new modified chains. The ROOT chain for the failed node(s) is queued
1349 * to the resynchronization thread for action.
1351 * A temporary disconnect is handled as if a write failure occurred.
1353 * Any of these failures might or might not stall related high level VNOPS,
1354 * depending on what has failed, what nodes remain, the type of cluster,
1355 * and the operating state of the cluster.
1357 * FLUSH ON WRITE-DISABLED NODES
1359 * A flush on a write-disabled node is not allowed to write anything because
1360 * we cannot safely update the mirror_tid anywhere on the failed node. The
1361 * synchronization thread uses mirror_tid to calculate incremental resyncs.
1362 * Dirty meta-data related to the failed node is thrown away.
1364 * Dirty buffer cache buffers and inodes are only thrown away if they can be
1365 * retired... that is, if the filesystem still has enough nodes to complete
1369 /************************************************************************
1370 * SYNCHRONIZATION THREAD *
1371 ************************************************************************
1373 * This thread is responsible for [re]synchronizing the cluster representing
1374 * a PFS. Any out-of-sync or failed node starts this thread on a
1375 * node-by-node basis when the failure is detected.
1377 * Clusters needing resynchronization are queued at the highest point
1378 * where the parent on the failed node is still valid, or a special
1379 * incremental scan from the ROOT is queued if no parent exists. This
1380 * thread is also responsible for waiting for reconnections of the failed
1381 * node if the cause was due to a disconnect, and waiting for space to be
1382 * freed up if the cause was due to running out of space.
1384 * If the cause is due to a node running out of space, this thread will also
1385 * remove older (unlocked) snapshots to make new space, recover space, and
1386 * then start resynchronization.
1388 * Each resynchronization pass virtually snapshots the PFS on the good nodes
1389 * and synchronizes using that snapshot against the target node. This
1390 * ensures a consistent chain topology and also avoids interference between
1391 * the resynchronization thread and frontend operations.
1393 * Since these are per-node threads it is possible to resynchronize several