hammer2 - Stabilization pass
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
blob62f99fbda3072ab9d1d2ee8b0405999b9dcd28b8
1 /*
2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
53 * CLUSTER OPERATIONS
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
58 * hammer2_cluster_lock()
59 * hammer2_cluster_parent()
61 * - Most complex functions, quorum management on transaction ids.
63 * - Locking and data accesses must be internally asynchronous.
65 * - Validate and manage cache coherency primitives (cache state
66 * is stored in chain topologies but must be validated by these
67 * functions).
69 * (2) Lookups and Scans
70 * hammer2_cluster_lookup()
71 * hammer2_cluster_next()
73 * - Depend on locking & data retrieval functions, but still complex.
75 * - Must do quorum management on transaction ids.
77 * - Lookup and Iteration ops Must be internally asynchronous.
79 * (3) Modifying Operations
80 * hammer2_cluster_create()
81 * hammer2_cluster_rename()
82 * hammer2_cluster_delete()
83 * hammer2_cluster_modify()
84 * hammer2_cluster_modsync()
86 * - Can usually punt on failures, operation continues unless quorum
87 * is lost. If quorum is lost, must wait for resynchronization
88 * (depending on the management mode).
90 * - Must disconnect node on failures (also not flush), remount, and
91 * resynchronize.
93 * - Network links (via kdmsg) are relatively easy to issue as the
94 * complex underworkings of hammer2_chain.c don't have to be messed
95 * with (the protocol is at a higher level than block-level).
97 * - Multiple local disk nodes (i.e. block devices) are another matter.
98 * Chain operations have to be dispatched to per-node threads (xN)
99 * because we can't asynchronize potentially very complex chain
100 * operations in hammer2_chain.c (it would be a huge mess).
102 * (these threads are also used to terminate incoming kdmsg ops from
103 * other machines).
105 * - Single-node filesystems do not use threads and will simply call
106 * hammer2_chain.c functions directly. This short-cut is handled
107 * at the base of each cluster function.
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
116 #include "hammer2.h"
119 * Returns non-zero if any chain in the cluster needs to be resized.
120 * Errored elements are not used in the calculation.
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
125 hammer2_chain_t *chain;
126 int i;
128 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129 for (i = 0; i < cluster->nchains; ++i) {
130 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131 continue;
132 chain = cluster->array[i].chain;
133 if (chain == NULL)
134 continue;
135 if (chain->error)
136 continue;
137 if (chain->bytes != bytes)
138 return 1;
140 return 0;
144 * Returns the bref type of the cluster's foucs.
146 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147 * The cluster must be locked.
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
152 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153 if (cluster->error == 0)
154 return(cluster->focus->bref.type);
155 return 0;
159 * Returns non-zero if the cluster's focus is flagged as being modified.
161 * If the cluster is errored, returns 0.
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
166 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167 if (cluster->error == 0)
168 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169 return 0;
173 * Returns the bref of the cluster's focus, sans any data-offset information
174 * (since offset information is per-node and wouldn't be useful).
176 * Callers use this function to access modify_tid, mirror_tid, type,
177 * key, and keybits.
179 * If the cluster is errored, returns an empty bref.
180 * The cluster must be locked.
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
185 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186 if (cluster->error == 0) {
187 *bref = cluster->focus->bref;
188 bref->data_off = 0;
189 } else {
190 bzero(bref, sizeof(*bref));
195 * Return non-zero if the chain representing an inode has been flagged
196 * as having been unlinked. Allows the vnode reclaim to avoid loading
197 * the inode data from disk e.g. when unmount or recycling old, clean
198 * vnodes.
200 * The cluster does not need to be locked.
201 * The focus cannot be used since the cluster might not be locked.
204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
206 hammer2_chain_t *chain;
207 int flags;
208 int i;
210 flags = 0;
211 for (i = 0; i < cluster->nchains; ++i) {
212 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213 continue;
214 chain = cluster->array[i].chain;
215 if (chain)
216 flags |= chain->flags;
218 return (flags & HAMMER2_CHAIN_UNLINKED);
222 * Set a bitmask of flags in all chains related to a cluster.
223 * The cluster should probably be locked.
225 * XXX Only operate on FEMOD elements?
227 void
228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
230 hammer2_chain_t *chain;
231 int i;
233 for (i = 0; i < cluster->nchains; ++i) {
234 chain = cluster->array[i].chain;
235 if (chain)
236 atomic_set_int(&chain->flags, flags);
241 * Set a bitmask of flags in all chains related to a cluster.
242 * The cluster should probably be locked.
244 * XXX Only operate on FEMOD elements?
246 void
247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
249 hammer2_chain_t *chain;
250 int i;
252 for (i = 0; i < cluster->nchains; ++i) {
253 chain = cluster->array[i].chain;
254 if (chain)
255 atomic_clear_int(&chain->flags, flags);
260 * Flag the cluster for flushing recursively up to the root. Despite the
261 * work it does, this is relatively benign. It just makes sure that the
262 * flusher has top-down visibility to this cluster.
264 * Errored chains are not flagged for flushing.
266 * The cluster should probably be locked.
268 void
269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
271 hammer2_chain_t *chain;
272 int i;
274 for (i = 0; i < cluster->nchains; ++i) {
275 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
276 continue;
277 chain = cluster->array[i].chain;
278 if (chain == NULL)
279 continue;
280 if (chain->error)
281 continue;
282 hammer2_chain_setflush(trans, chain);
287 * Set the check mode for the cluster.
288 * Errored elements of the cluster are ignored.
290 * The cluster must be locked and modified.
292 void
293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
294 hammer2_cluster_t *cluster,
295 int check_algo)
297 hammer2_chain_t *chain;
298 int i;
300 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
301 for (i = 0; i < cluster->nchains; ++i) {
302 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
303 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
304 continue;
306 chain = cluster->array[i].chain;
307 if (chain == NULL)
308 continue;
309 if (chain->error)
310 continue;
311 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
312 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
313 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
318 * Create a degenerate cluster with one ref from a single locked chain.
319 * The returned cluster will be focused on the chain and inherit its
320 * error state.
322 * The chain's lock and reference are transfered to the new cluster, so
323 * the caller should not try to unlock the chain separately.
325 * We fake the flags.
327 hammer2_cluster_t *
328 hammer2_cluster_from_chain(hammer2_chain_t *chain)
330 hammer2_cluster_t *cluster;
332 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
333 cluster->array[0].chain = chain;
334 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
335 cluster->nchains = 1;
336 cluster->focus = chain;
337 cluster->focus_index = 0;
338 cluster->pmp = chain->pmp;
339 cluster->refs = 1;
340 cluster->error = chain->error;
341 cluster->flags = HAMMER2_CLUSTER_LOCKED |
342 HAMMER2_CLUSTER_WRHARD |
343 HAMMER2_CLUSTER_RDHARD |
344 HAMMER2_CLUSTER_MSYNCED |
345 HAMMER2_CLUSTER_SSYNCED;
347 return cluster;
351 * Add a reference to a cluster and its underlying chains.
353 * We must also ref the underlying chains in order to allow ref/unlock
354 * sequences to later re-lock.
356 void
357 hammer2_cluster_ref(hammer2_cluster_t *cluster)
359 atomic_add_int(&cluster->refs, 1);
363 * Drop the caller's reference to the cluster. When the ref count drops to
364 * zero this function frees the cluster and drops all underlying chains.
366 * In-progress read I/Os are typically detached from the cluster once the
367 * first one returns (the remaining stay attached to the DIOs but are then
368 * ignored and drop naturally).
370 void
371 hammer2_cluster_drop(hammer2_cluster_t *cluster)
373 hammer2_chain_t *chain;
374 int i;
376 KKASSERT(cluster->refs > 0);
377 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
378 cluster->focus = NULL; /* safety XXX chg to assert */
379 cluster->focus_index = 0;
381 for (i = 0; i < cluster->nchains; ++i) {
382 chain = cluster->array[i].chain;
383 if (chain) {
384 hammer2_chain_drop(chain);
385 cluster->array[i].chain = NULL; /* safety */
388 cluster->nchains = 0; /* safety */
390 kfree(cluster, M_HAMMER2);
391 /* cluster is invalid */
395 void
396 hammer2_cluster_wait(hammer2_cluster_t *cluster)
398 tsleep(cluster->focus, 0, "h2clcw", 1);
402 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
404 * WARNING! This function expects the caller to handle resolution of the
405 * cluster. We never re-resolve the cluster in this function,
406 * because it might be used to temporarily unlock/relock a cparent
407 * in an iteration or recursrion, and the cparents elements do not
408 * necessarily match.
410 void
411 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
413 hammer2_chain_t *chain;
414 int i;
416 /* cannot be on inode-embedded cluster template, must be on copy */
417 KKASSERT(cluster->refs > 0);
418 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
419 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
420 panic("hammer2_cluster_lock: cluster %p already locked!\n",
421 cluster);
423 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
426 * Lock chains and resolve state.
428 for (i = 0; i < cluster->nchains; ++i) {
429 if (i == idx)
430 continue;
431 chain = cluster->array[i].chain;
432 if (chain == NULL)
433 continue;
434 hammer2_chain_lock(chain, how);
438 void
439 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
441 hammer2_cluster_lock_except(cluster, -1, how);
445 * Calculate the clustering state for the cluster and set its focus.
446 * This routine must be called with care. For example, it should not
447 * normally be called after relocking a non-leaf cluster because parent
448 * clusters help iterations and each element might be at a slightly different
449 * indirect node (each node's topology is independently indexed).
451 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
452 * operations. Typically this is only set on a quorum of MASTERs or
453 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
454 * is present, this bit is *not* set on a quorum of MASTERs. The
455 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
456 * that create/modify/delete elements use it.
458 * The chains making up the cluster may be narrowed down based on quorum
459 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
460 * narrowed down to a single chain as long as the entire subtopology is known
461 * to be intact. So, for example, we can narrow a read-only op to a single
462 * fast SLAVE but if we focus a CACHE chain we must still retain at least
463 * a SLAVE to ensure that the subtopology can be accessed.
465 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
466 * to be maintained once the topology is validated as-of the top level of
467 * the operation.
469 * If a failure occurs the operation must be aborted by higher-level code and
470 * retried. XXX
472 void
473 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
475 hammer2_chain_t *chain;
476 hammer2_chain_t *focus;
477 hammer2_pfs_t *pmp;
478 hammer2_tid_t quorum_tid;
479 hammer2_tid_t last_best_quorum_tid;
480 int focus_pfs_type;
481 uint32_t nflags;
482 int ttlmasters;
483 int ttlslaves;
484 int nmasters;
485 int nslaves;
486 int nquorum;
487 int smpresent;
488 int i;
490 cluster->error = 0;
491 cluster->focus = NULL;
493 focus_pfs_type = 0;
494 nflags = 0;
495 ttlmasters = 0;
496 ttlslaves = 0;
497 nmasters = 0;
498 nslaves = 0;
501 * Calculate quorum
503 pmp = cluster->pmp;
504 KKASSERT(pmp != NULL || cluster->nchains == 0);
505 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
506 smpresent = 0;
509 * Pass 1
511 * NOTE: A NULL chain is not necessarily an error, it could be
512 * e.g. a lookup failure or the end of an iteration.
513 * Process normally.
515 for (i = 0; i < cluster->nchains; ++i) {
516 chain = cluster->array[i].chain;
517 if (chain && chain->error) {
518 if (cluster->focus == NULL || cluster->focus == chain) {
519 /* error will be overridden by valid focus */
520 cluster->error = chain->error;
524 * Must count total masters and slaves whether the
525 * chain is errored or not.
527 switch (cluster->pmp->pfs_types[i]) {
528 case HAMMER2_PFSTYPE_MASTER:
529 ++ttlmasters;
530 break;
531 case HAMMER2_PFSTYPE_SLAVE:
532 ++ttlslaves;
533 break;
535 continue;
537 switch (cluster->pmp->pfs_types[i]) {
538 case HAMMER2_PFSTYPE_MASTER:
539 ++ttlmasters;
540 break;
541 case HAMMER2_PFSTYPE_SLAVE:
542 ++ttlslaves;
543 break;
544 case HAMMER2_PFSTYPE_SOFT_MASTER:
545 nflags |= HAMMER2_CLUSTER_WRSOFT;
546 nflags |= HAMMER2_CLUSTER_RDSOFT;
547 smpresent = 1;
548 break;
549 case HAMMER2_PFSTYPE_SOFT_SLAVE:
550 nflags |= HAMMER2_CLUSTER_RDSOFT;
551 break;
552 case HAMMER2_PFSTYPE_SUPROOT:
554 * Degenerate cluster representing the super-root
555 * topology on a single device. Fake stuff so
556 * cluster ops work as expected.
558 nflags |= HAMMER2_CLUSTER_WRHARD;
559 nflags |= HAMMER2_CLUSTER_RDHARD;
560 cluster->focus_index = i;
561 cluster->focus = chain;
562 cluster->error = chain ? chain->error : 0;
563 break;
564 default:
565 break;
570 * Pass 2
572 * Resolve masters. Calculate nmasters for the highest matching
573 * TID, if a quorum cannot be attained try the next lower matching
574 * TID until we exhaust TIDs.
576 * NOTE: A NULL chain is not necessarily an error, it could be
577 * e.g. a lookup failure or the end of an iteration.
578 * Process normally.
580 last_best_quorum_tid = HAMMER2_TID_MAX;
581 quorum_tid = 0; /* fix gcc warning */
583 while (nmasters < nquorum && last_best_quorum_tid != 0) {
584 nmasters = 0;
585 quorum_tid = 0;
587 for (i = 0; i < cluster->nchains; ++i) {
588 if (cluster->pmp->pfs_types[i] !=
589 HAMMER2_PFSTYPE_MASTER) {
590 continue;
592 chain = cluster->array[i].chain;
594 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
596 * Invalid as in unsynchronized, cannot be
597 * used to calculate the quorum.
599 } else if (chain == NULL && quorum_tid == 0) {
601 * NULL chain on master matches NULL chains
602 * on other masters.
604 ++nmasters;
605 } else if (quorum_tid < last_best_quorum_tid &&
606 chain != NULL &&
607 (quorum_tid < chain->bref.modify_tid ||
608 nmasters == 0)) {
610 * Better TID located, reset nmasters count.
612 nmasters = 1;
613 quorum_tid = chain->bref.modify_tid;
614 } else if (chain &&
615 quorum_tid == chain->bref.modify_tid) {
617 * TID matches current collection.
619 ++nmasters;
622 if (nmasters >= nquorum)
623 break;
624 last_best_quorum_tid = quorum_tid;
628 * Pass 3
630 * NOTE: A NULL chain is not necessarily an error, it could be
631 * e.g. a lookup failure or the end of an iteration.
632 * Process normally.
634 for (i = 0; i < cluster->nchains; ++i) {
635 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
636 chain = cluster->array[i].chain;
637 if (chain && chain->error) {
638 if (cluster->focus == NULL || cluster->focus == chain) {
639 /* error will be overridden by valid focus */
640 cluster->error = chain->error;
642 continue;
645 switch (cluster->pmp->pfs_types[i]) {
646 case HAMMER2_PFSTYPE_MASTER:
648 * We must have enough up-to-date masters to reach
649 * a quorum and the master modify_tid must match
650 * the quorum's modify_tid.
652 * Do not select an errored or out-of-sync master.
654 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
655 nflags |= HAMMER2_CLUSTER_UNHARD;
656 } else if (nmasters >= nquorum &&
657 (chain == NULL || chain->error == 0) &&
658 ((chain == NULL && quorum_tid == 0) ||
659 (chain != NULL && quorum_tid ==
660 chain->bref.modify_tid))) {
661 nflags |= HAMMER2_CLUSTER_WRHARD;
662 nflags |= HAMMER2_CLUSTER_RDHARD;
663 if (!smpresent) {
664 cluster->array[i].flags |=
665 HAMMER2_CITEM_FEMOD;
667 if (cluster->focus == NULL ||
668 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
669 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
670 cluster->focus_index = i;
671 cluster->focus = chain; /* NULL ok */
672 cluster->error = chain ? chain->error :
675 } else if (chain == NULL || chain->error == 0) {
676 nflags |= HAMMER2_CLUSTER_UNHARD;
678 break;
679 case HAMMER2_PFSTYPE_SLAVE:
681 * We must have enough up-to-date masters to reach
682 * a quorum and the slave modify_tid must match the
683 * quorum's modify_tid.
685 * Do not select an errored slave.
687 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
688 nflags |= HAMMER2_CLUSTER_UNHARD;
689 } else if (nmasters >= nquorum &&
690 (chain == NULL || chain->error == 0) &&
691 ((chain == NULL && quorum_tid == 0) ||
692 (chain && quorum_tid ==
693 chain->bref.modify_tid))) {
694 ++nslaves;
695 nflags |= HAMMER2_CLUSTER_RDHARD;
696 #if 0
697 /* XXX optimize for RESOLVE_RDONLY */
698 if (cluster->focus == NULL) {
699 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
700 cluster->focus_index = i;
701 cluster->focus = chain; /* NULL ok */
702 cluster->error = chain ? chain->error :
705 #endif
706 } else if (chain == NULL || chain->error == 0) {
707 nflags |= HAMMER2_CLUSTER_UNSOFT;
709 break;
710 case HAMMER2_PFSTYPE_SOFT_MASTER:
712 * Directly mounted soft master always wins. There
713 * should be only one.
715 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
716 cluster->focus_index = i;
717 cluster->focus = chain;
718 cluster->error = chain ? chain->error : 0;
719 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
720 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
721 break;
722 case HAMMER2_PFSTYPE_SOFT_SLAVE:
724 * Directly mounted soft slave always wins. There
725 * should be only one.
727 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
728 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
729 cluster->focus_index = i;
730 cluster->focus = chain;
731 cluster->error = chain ? chain->error : 0;
732 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
734 break;
735 case HAMMER2_PFSTYPE_SUPROOT:
737 * spmp (degenerate case)
739 KKASSERT(i == 0);
740 cluster->focus_index = i;
741 cluster->focus = chain;
742 cluster->error = chain ? chain->error : 0;
743 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
744 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
745 break;
746 default:
747 break;
752 * Focus now set, adjust ddflag. Skip this pass if the focus
753 * is bad or if we are at the PFS root (the bref won't match at
754 * the PFS root, obviously).
756 focus = cluster->focus;
757 if (focus) {
758 cluster->ddflag =
759 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
760 } else {
761 cluster->ddflag = 0;
762 goto skip4;
764 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
765 goto skip4;
768 * Pass 4
770 * Validate the elements that were not marked invalid. They should
771 * match.
773 for (i = 0; i < cluster->nchains; ++i) {
774 int ddflag;
776 chain = cluster->array[i].chain;
778 if (chain == NULL)
779 continue;
780 if (chain == focus)
781 continue;
782 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
783 continue;
785 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
786 if (chain->bref.type != focus->bref.type ||
787 chain->bref.key != focus->bref.key ||
788 chain->bref.keybits != focus->bref.keybits ||
789 chain->bref.modify_tid != focus->bref.modify_tid ||
790 chain->bytes != focus->bytes ||
791 ddflag != cluster->ddflag) {
792 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
793 if (hammer2_debug & 1)
794 kprintf("cluster_resolve: matching modify_tid failed "
795 "bref test: idx=%d type=%02x/%02x "
796 "key=%016jx/%d-%016jx/%d "
797 "mod=%016jx/%016jx bytes=%u/%u\n",
799 chain->bref.type, focus->bref.type,
800 chain->bref.key, chain->bref.keybits,
801 focus->bref.key, focus->bref.keybits,
802 chain->bref.modify_tid, focus->bref.modify_tid,
803 chain->bytes, focus->bytes);
804 if (hammer2_debug & 0x4000)
805 panic("cluster_resolve");
806 /* flag issue and force resync? */
809 skip4:
811 if (ttlslaves == 0)
812 nflags |= HAMMER2_CLUSTER_NOSOFT;
813 if (ttlmasters == 0)
814 nflags |= HAMMER2_CLUSTER_NOHARD;
817 * Set SSYNCED or MSYNCED for slaves and masters respectively if
818 * all available nodes (even if 0 are available) are fully
819 * synchronized. This is used by the synchronization thread to
820 * determine if there is work it could potentially accomplish.
822 if (nslaves == ttlslaves)
823 nflags |= HAMMER2_CLUSTER_SSYNCED;
824 if (nmasters == ttlmasters)
825 nflags |= HAMMER2_CLUSTER_MSYNCED;
828 * Determine if the cluster was successfully locked for the
829 * requested operation and generate an error code. The cluster
830 * will not be locked (or ref'd) if an error is returned.
832 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
833 * to determine if reading or writing is possible. If writing, the
834 * cluster still requires a call to hammer2_cluster_modify() first.
836 atomic_set_int(&cluster->flags, nflags);
837 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
841 * This is used by the sync thread to force non-NULL elements of a copy
842 * of the pmp->iroot cluster to be good which is required to prime the
843 * sync.
845 void
846 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
848 int i;
850 for (i = 0; i < cluster->nchains; ++i) {
851 if (cluster->array[i].chain)
852 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
857 * Copy a cluster, returned a ref'd cluster. All underlying chains
858 * are also ref'd, but not locked. Focus state is also copied.
860 * Original cluster does not have to be locked but usually is.
861 * New cluster will not be flagged as locked.
863 * Callers using this function to initialize a new cluster from an inode
864 * generally lock and resolve the resulting cluster.
866 * Callers which use this function to save/restore a cluster structure
867 * generally retain the focus state and do not re-resolve it. Caller should
868 * not try to re-resolve internal (cparent) node state during an iteration
869 * as the individual tracking elements of cparent in an iteration may not
870 * match even though they are correct.
872 hammer2_cluster_t *
873 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
875 hammer2_pfs_t *pmp = ocluster->pmp;
876 hammer2_cluster_t *ncluster;
877 hammer2_chain_t *chain;
878 int i;
880 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
881 ncluster->pmp = pmp;
882 ncluster->nchains = ocluster->nchains;
883 ncluster->refs = 1;
885 for (i = 0; i < ocluster->nchains; ++i) {
886 chain = ocluster->array[i].chain;
887 ncluster->array[i].chain = chain;
888 ncluster->array[i].flags = ocluster->array[i].flags;
889 if (chain)
890 hammer2_chain_ref(chain);
892 ncluster->focus_index = ocluster->focus_index;
893 ncluster->focus = ocluster->focus;
894 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
895 HAMMER2_CLUSTER_INODE);
897 return (ncluster);
901 * Unlock a cluster. Refcount and focus is maintained.
903 void
904 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
906 hammer2_chain_t *chain;
907 int i;
909 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
910 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
911 cluster);
913 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
914 KKASSERT(cluster->refs > 0);
915 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
917 for (i = 0; i < cluster->nchains; ++i) {
918 if (i == idx)
919 continue;
920 chain = cluster->array[i].chain;
921 if (chain)
922 hammer2_chain_unlock(chain);
926 void
927 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
929 hammer2_cluster_unlock_except(cluster, -1);
933 * Resize the cluster's physical storage allocation in-place. This may
934 * replace the cluster's chains.
936 void
937 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
938 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
939 int nradix, int flags)
941 hammer2_chain_t *chain;
942 int i;
944 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */
945 KKASSERT(cparent->nchains == cluster->nchains);
947 for (i = 0; i < cluster->nchains; ++i) {
948 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
949 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
950 continue;
952 chain = cluster->array[i].chain;
953 if (chain) {
954 KKASSERT(cparent->array[i].chain);
955 hammer2_chain_resize(trans, ip,
956 cparent->array[i].chain, chain,
957 nradix, flags);
963 * Set an inode's cluster modified, marking the related chains RW and
964 * duplicating them if necessary.
966 * The passed-in chain is a localized copy of the chain previously acquired
967 * when the inode was locked (and possilby replaced in the mean time), and
968 * must also be updated. In fact, we update it first and then synchronize
969 * the inode's cluster cache.
971 hammer2_inode_data_t *
972 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
973 hammer2_cluster_t *cluster, int flags)
975 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
976 hammer2_cluster_modify(trans, cluster, flags);
978 hammer2_inode_repoint(ip, NULL, cluster);
979 if (ip->vp)
980 vsetisdirty(ip->vp);
981 return (&hammer2_cluster_wdata(cluster)->ipdata);
985 * Adjust the cluster's chains to allow modification and adjust the
986 * focus. Data will be accessible on return.
988 * If our focused master errors on modify, re-resolve the cluster to
989 * try to select a different master.
991 void
992 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
993 int flags)
995 hammer2_chain_t *chain;
996 int resolve_again;
997 int i;
999 resolve_again = 0;
1000 for (i = 0; i < cluster->nchains; ++i) {
1001 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1002 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1003 continue;
1005 chain = cluster->array[i].chain;
1006 if (chain == NULL)
1007 continue;
1008 if (chain->error)
1009 continue;
1010 hammer2_chain_modify(trans, chain, flags);
1011 if (cluster->focus == chain && chain->error) {
1012 cluster->error = chain->error;
1013 resolve_again = 1;
1016 if (resolve_again)
1017 hammer2_cluster_resolve(cluster);
1021 * Synchronize modifications from the focus to other chains in a cluster.
1022 * Convenient because nominal API users can just modify the contents of the
1023 * focus (at least for non-blockref data).
1025 * Nominal front-end operations only edit non-block-table data in a single
1026 * chain. This code copies such modifications to the other chains in the
1027 * cluster. Blocktable modifications are handled on a chain-by-chain basis
1028 * by both the frontend and the backend and will explode in fireworks if
1029 * blindly copied.
1031 void
1032 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
1034 hammer2_chain_t *focus;
1035 hammer2_chain_t *scan;
1036 const hammer2_inode_data_t *ripdata;
1037 hammer2_inode_data_t *wipdata;
1038 int i;
1040 focus = cluster->focus;
1041 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
1043 for (i = 0; i < cluster->nchains; ++i) {
1044 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1045 continue;
1046 scan = cluster->array[i].chain;
1047 if (scan == NULL || scan == focus)
1048 continue;
1049 if (scan->error)
1050 continue;
1051 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
1052 KKASSERT(focus->bytes == scan->bytes &&
1053 focus->bref.type == scan->bref.type);
1054 switch(focus->bref.type) {
1055 case HAMMER2_BREF_TYPE_INODE:
1056 ripdata = &focus->data->ipdata;
1057 wipdata = &scan->data->ipdata;
1058 if ((ripdata->op_flags &
1059 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1060 bcopy(ripdata, wipdata,
1061 offsetof(hammer2_inode_data_t, u));
1062 break;
1064 /* fall through to full copy */
1065 case HAMMER2_BREF_TYPE_DATA:
1066 bcopy(focus->data, scan->data, focus->bytes);
1067 break;
1068 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1069 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1070 case HAMMER2_BREF_TYPE_FREEMAP:
1071 case HAMMER2_BREF_TYPE_VOLUME:
1072 panic("hammer2_cluster_modsync: illegal node type");
1073 /* NOT REACHED */
1074 break;
1075 default:
1076 panic("hammer2_cluster_modsync: unknown node type");
1077 break;
1083 * Lookup initialization/completion API. Returns a locked, fully resolved
1084 * cluster with one ref.
1086 hammer2_cluster_t *
1087 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
1089 hammer2_cluster_t *cluster;
1091 cluster = hammer2_cluster_copy(cparent);
1092 if (flags & HAMMER2_LOOKUP_SHARED) {
1093 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
1094 HAMMER2_RESOLVE_SHARED);
1095 } else {
1096 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1098 hammer2_cluster_resolve(cluster);
1100 return (cluster);
1103 void
1104 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
1106 if (cparent) {
1107 hammer2_cluster_unlock(cparent);
1108 hammer2_cluster_drop(cparent);
1113 * Locate first match or overlap under parent, return a new, locked, resolved
1114 * cluster with one ref.
1116 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1118 hammer2_cluster_t *
1119 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
1120 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1122 hammer2_pfs_t *pmp;
1123 hammer2_cluster_t *cluster;
1124 hammer2_chain_t *chain;
1125 hammer2_key_t key_accum;
1126 hammer2_key_t key_next;
1127 int null_count;
1128 int rflags;
1129 int i;
1131 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1133 pmp = cparent->pmp; /* can be NULL */
1134 key_accum = *key_nextp;
1135 null_count = 0;
1136 if (flags & HAMMER2_LOOKUP_SHARED)
1137 rflags = HAMMER2_RESOLVE_SHARED;
1138 else
1139 rflags = 0;
1141 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
1142 cluster->pmp = pmp; /* can be NULL */
1143 cluster->refs = 1;
1144 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1145 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1148 * Iterating earlier cluster elements with later elements still
1149 * locked is a problem, so we have to unlock the parent and then
1150 * re-lock as we go.
1152 hammer2_cluster_unlock(cparent);
1153 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1156 * Pass-1, issue lookups.
1158 for (i = 0; i < cparent->nchains; ++i) {
1159 cluster->array[i].flags = cparent->array[i].flags;
1160 key_next = *key_nextp;
1163 * Always relock the parent as we go.
1165 if (cparent->array[i].chain) {
1166 hammer2_chain_lock(cparent->array[i].chain, rflags);
1170 * Nothing to base the lookup, or parent was not synchronized.
1172 if (cparent->array[i].chain == NULL ||
1173 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
1174 ++null_count;
1175 continue;
1178 chain = hammer2_chain_lookup(&cparent->array[i].chain,
1179 &key_next,
1180 key_beg, key_end,
1181 &cparent->array[i].cache_index,
1182 flags);
1183 cluster->array[i].chain = chain;
1184 if (chain == NULL) {
1185 ++null_count;
1187 if (key_accum > key_next)
1188 key_accum = key_next;
1192 * Cleanup
1194 cluster->nchains = i;
1195 *key_nextp = key_accum;
1198 * The cluster must be resolved, out of sync elements may be present.
1200 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1202 if (null_count != i)
1203 hammer2_cluster_resolve(cluster);
1204 if (null_count == i ||
1205 (cluster->focus == NULL &&
1206 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1207 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1208 hammer2_cluster_unlock(cluster);
1209 hammer2_cluster_drop(cluster);
1210 cluster = NULL;
1213 return (cluster);
1217 * Locate next match or overlap under parent, replace the passed-in cluster.
1218 * The returned cluster is a new, locked, resolved cluster with one ref.
1220 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
1222 hammer2_cluster_t *
1223 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1224 hammer2_key_t *key_nextp,
1225 hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1227 hammer2_chain_t *ochain;
1228 hammer2_chain_t *nchain;
1229 hammer2_key_t key_accum;
1230 hammer2_key_t key_next;
1231 int parent_index;
1232 int cluster_index;
1233 int null_count;
1234 int rflags;
1235 int i;
1237 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
1239 key_accum = *key_nextp;
1240 null_count = 0;
1241 parent_index = cparent->focus_index; /* save prior focus */
1242 cluster_index = cluster->focus_index;
1243 if (flags & HAMMER2_LOOKUP_SHARED)
1244 rflags = HAMMER2_RESOLVE_SHARED;
1245 else
1246 rflags = 0;
1248 cluster->focus = NULL; /* XXX needed any more? */
1249 /*cparent->focus = NULL;*/
1250 cluster->focus_index = 0; /* XXX needed any more? */
1251 /*cparent->focus_index = 0;*/
1253 cluster->ddflag = 0;
1256 * The parent is always locked on entry, the iterator may be locked
1257 * depending on flags.
1259 * We must temporarily unlock the passed-in clusters to avoid a
1260 * deadlock between elements of the cluster with other threads.
1261 * We will fixup the lock in the loop.
1263 * Note that this will clear the focus.
1265 * Reflag the clusters as locked, because we will relock them
1266 * as we go.
1268 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1269 hammer2_cluster_unlock(cluster);
1270 cluster->flags |= HAMMER2_CLUSTER_LOCKED;
1272 hammer2_cluster_unlock(cparent);
1273 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1275 for (i = 0; i < cparent->nchains; ++i) {
1276 key_next = *key_nextp;
1277 ochain = cluster->array[i].chain;
1280 * Always relock the parent as we go.
1282 if (cparent->array[i].chain)
1283 hammer2_chain_lock(cparent->array[i].chain, rflags);
1286 * Nothing to iterate from. These cases can occur under
1287 * normal operations. For example, during synchronization
1288 * a slave might reach the end of its scan while records
1289 * are still left on the master(s).
1291 if (ochain == NULL) {
1292 ++null_count;
1293 continue;
1295 if (cparent->array[i].chain == NULL ||
1296 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1297 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1298 /* ochain has not yet been relocked */
1299 hammer2_chain_drop(ochain);
1300 cluster->array[i].chain = NULL;
1301 ++null_count;
1302 continue;
1306 * Relock the child if necessary. Parent and child will then
1307 * be locked as expected by hammer2_chain_next() and flags.
1309 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1310 hammer2_chain_lock(ochain, rflags);
1311 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1312 &key_next, key_beg, key_end,
1313 &cparent->array[i].cache_index,
1314 flags);
1315 /* ochain now invalid but can still be used for focus check */
1316 if (parent_index == i) {
1317 cparent->focus_index = i;
1318 cparent->focus = cparent->array[i].chain;
1321 cluster->array[i].chain = nchain;
1322 if (nchain == NULL) {
1323 ++null_count;
1325 if (key_accum > key_next)
1326 key_accum = key_next;
1330 * Cleanup
1332 cluster->nchains = i;
1333 *key_nextp = key_accum;
1336 * The cluster must be resolved, out of sync elements may be present.
1338 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
1340 if (null_count != i)
1341 hammer2_cluster_resolve(cluster);
1342 if (null_count == i ||
1343 (cluster->focus == NULL &&
1344 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
1345 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1346 hammer2_cluster_unlock(cluster);
1347 hammer2_cluster_drop(cluster);
1348 cluster = NULL;
1350 return(cluster);
1354 * Advance just one chain in the cluster and recalculate the invalid bit.
1355 * The cluster index is allowed to be flagged invalid on input and is
1356 * recalculated on return.
1358 * (used during synchronization to advance past a chain being deleted).
1360 * The chain being advanced must not be the focus and the clusters in
1361 * question must have already passed normal cluster_lookup/cluster_next
1362 * checks.
1364 * The cluster always remains intact on return, so void function.
1366 void
1367 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1368 hammer2_cluster_t *cluster,
1369 hammer2_key_t *key_nextp,
1370 hammer2_key_t key_beg,
1371 hammer2_key_t key_end,
1372 int i, int flags)
1374 hammer2_chain_t *ochain;
1375 hammer2_chain_t *nchain;
1376 hammer2_chain_t *focus;
1377 hammer2_key_t key_accum;
1378 hammer2_key_t key_next;
1379 int ddflag;
1381 key_accum = *key_nextp;
1382 key_next = *key_nextp;
1383 ochain = cluster->array[i].chain;
1384 if (ochain == NULL)
1385 goto done;
1386 KKASSERT(ochain != cluster->focus);
1388 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1389 &key_next, key_beg, key_end,
1390 &cparent->array[i].cache_index,
1391 flags);
1392 /* ochain now invalid */
1393 if (cparent->focus_index == i)
1394 cparent->focus = cparent->array[i].chain;
1397 * Install nchain. Note that nchain can be NULL, and can also
1398 * be in an unlocked state depending on flags.
1400 cluster->array[i].chain = nchain;
1401 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1403 if (key_accum > key_next)
1404 key_accum = key_next;
1406 focus = cluster->focus;
1407 if (focus == NULL)
1408 goto done;
1409 if (nchain == NULL)
1410 goto done;
1411 #if 0
1412 if (nchain == focus) /* ASSERTED NOT TRUE */
1414 #endif
1415 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1416 if (nchain->bref.type != focus->bref.type ||
1417 nchain->bref.key != focus->bref.key ||
1418 nchain->bref.keybits != focus->bref.keybits ||
1419 nchain->bref.modify_tid != focus->bref.modify_tid ||
1420 nchain->bytes != focus->bytes ||
1421 ddflag != cluster->ddflag) {
1422 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1425 done:
1426 *key_nextp = key_accum;
1427 #if 0
1429 * For now don't re-resolve cluster->flags.
1431 hammer2_cluster_resolve(cluster);
1432 #endif
1436 * Create a new cluster using the specified key
1439 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1440 hammer2_cluster_t **clusterp,
1441 hammer2_key_t key, int keybits,
1442 int type, size_t bytes, int flags)
1444 hammer2_cluster_t *cluster;
1445 hammer2_pfs_t *pmp;
1446 int error;
1447 int i;
1449 pmp = trans->pmp; /* can be NULL */
1451 if ((cluster = *clusterp) == NULL) {
1452 cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1453 M_WAITOK | M_ZERO);
1454 cluster->pmp = pmp; /* can be NULL */
1455 cluster->refs = 1;
1456 cluster->flags = HAMMER2_CLUSTER_LOCKED;
1458 cluster->focus_index = 0;
1459 cluster->focus = NULL;
1462 * NOTE: cluster->array[] entries can initially be NULL. If
1463 * *clusterp is supplied, skip NULL entries, otherwise
1464 * create new chains.
1466 for (i = 0; i < cparent->nchains; ++i) {
1467 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1468 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1469 continue;
1471 if (*clusterp) {
1472 if ((cluster->array[i].flags &
1473 HAMMER2_CITEM_FEMOD) == 0) {
1474 cluster->array[i].flags |=
1475 HAMMER2_CITEM_INVALID;
1476 continue;
1478 if (cluster->array[i].chain == NULL)
1479 continue;
1481 error = hammer2_chain_create(trans, &cparent->array[i].chain,
1482 &cluster->array[i].chain, pmp,
1483 key, keybits,
1484 type, bytes, flags);
1485 if (cparent->focus_index == i)
1486 cparent->focus = cparent->array[i].chain;
1487 KKASSERT(error == 0);
1488 if (cluster->focus == NULL) {
1489 cluster->focus_index = i;
1490 cluster->focus = cluster->array[i].chain;
1492 if (cparent->focus == cparent->array[i].chain) {
1493 cluster->focus_index = i;
1494 cluster->focus = cluster->array[i].chain;
1497 cluster->nchains = i;
1498 *clusterp = cluster;
1499 hammer2_cluster_resolve(cluster);
1501 return error;
1505 * Rename a cluster to a new parent.
1507 * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1508 * So the data_off field is not relevant. Only the key and
1509 * keybits are used.
1511 void
1512 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1513 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1514 int flags)
1516 hammer2_chain_t *chain;
1517 hammer2_blockref_t xbref;
1518 int i;
1520 #if 0
1521 cluster->focus = NULL;
1522 cparent->focus = NULL;
1523 cluster->focus_index = 0;
1524 cparent->focus_index = 0;
1525 #endif
1527 for (i = 0; i < cluster->nchains; ++i) {
1528 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1529 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1530 continue;
1532 chain = cluster->array[i].chain;
1533 if (chain) {
1534 if (bref) {
1535 xbref = chain->bref;
1536 xbref.key = bref->key;
1537 xbref.keybits = bref->keybits;
1538 hammer2_chain_rename(trans, &xbref,
1539 &cparent->array[i].chain,
1540 chain, flags);
1541 } else {
1542 hammer2_chain_rename(trans, NULL,
1543 &cparent->array[i].chain,
1544 chain, flags);
1546 if (cparent->focus_index == i)
1547 cparent->focus = cparent->array[i].chain;
1548 KKASSERT(cluster->array[i].chain == chain); /*remove*/
1554 * Mark a cluster deleted
1556 void
1557 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1558 hammer2_cluster_t *cluster, int flags)
1560 hammer2_chain_t *chain;
1561 hammer2_chain_t *parent;
1562 int i;
1564 if (cparent == NULL) {
1565 kprintf("cparent is NULL\n");
1566 return;
1569 for (i = 0; i < cluster->nchains; ++i) {
1570 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1571 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1572 continue;
1574 parent = cparent->array[i].chain;
1575 chain = cluster->array[i].chain;
1576 if (chain == NULL)
1577 continue;
1578 if (chain->parent != parent) {
1579 kprintf("hammer2_cluster_delete: parent "
1580 "mismatch chain=%p parent=%p against=%p\n",
1581 chain, chain->parent, parent);
1582 } else {
1583 hammer2_chain_delete(trans, parent, chain, flags);
1589 * Create a snapshot of the specified {parent, ochain} with the specified
1590 * label. The originating hammer2_inode must be exclusively locked for
1591 * safety.
1593 * The ioctl code has already synced the filesystem.
1596 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1597 hammer2_ioc_pfs_t *pfs)
1599 hammer2_dev_t *hmp;
1600 hammer2_cluster_t *ncluster;
1601 const hammer2_inode_data_t *ripdata;
1602 hammer2_inode_data_t *wipdata;
1603 hammer2_chain_t *nchain;
1604 hammer2_inode_t *nip;
1605 size_t name_len;
1606 hammer2_key_t lhc;
1607 struct vattr vat;
1608 #if 0
1609 uuid_t opfs_clid;
1610 #endif
1611 int error;
1612 int i;
1614 kprintf("snapshot %s\n", pfs->name);
1616 name_len = strlen(pfs->name);
1617 lhc = hammer2_dirhash(pfs->name, name_len);
1620 * Get the clid
1622 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1623 #if 0
1624 opfs_clid = ripdata->pfs_clid;
1625 #endif
1626 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */
1629 * Create the snapshot directory under the super-root
1631 * Set PFS type, generate a unique filesystem id, and generate
1632 * a cluster id. Use the same clid when snapshotting a PFS root,
1633 * which theoretically allows the snapshot to be used as part of
1634 * the same cluster (perhaps as a cache).
1636 * Copy the (flushed) blockref array. Theoretically we could use
1637 * chain_duplicate() but it becomes difficult to disentangle
1638 * the shared core so for now just brute-force it.
1640 VATTR_NULL(&vat);
1641 vat.va_type = VDIR;
1642 vat.va_mode = 0755;
1643 ncluster = NULL;
1644 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1645 proc0.p_ucred, pfs->name, name_len,
1646 &ncluster,
1647 HAMMER2_INSERT_PFSROOT, &error);
1649 if (nip) {
1650 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1651 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1652 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1653 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1654 kern_uuidgen(&wipdata->pfs_fsid, 1);
1657 * Give the snapshot its own private cluster. As a snapshot
1658 * no further synchronization with the original cluster will
1659 * be done.
1661 #if 0
1662 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1663 wipdata->pfs_clid = opfs_clid;
1664 else
1665 kern_uuidgen(&wipdata->pfs_clid, 1);
1666 #endif
1667 kern_uuidgen(&wipdata->pfs_clid, 1);
1669 for (i = 0; i < ncluster->nchains; ++i) {
1670 if ((ncluster->array[i].flags &
1671 HAMMER2_CITEM_FEMOD) == 0) {
1672 ncluster->array[i].flags |=
1673 HAMMER2_CITEM_INVALID;
1674 continue;
1676 nchain = ncluster->array[i].chain;
1677 if (nchain)
1678 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1680 #if 0
1681 /* XXX can't set this unless we do an explicit flush, which
1682 we also need a pmp assigned to do, else the flush code
1683 won't flush ncluster because it thinks it is crossing a
1684 flush boundary */
1685 hammer2_cluster_set_chainflags(ncluster,
1686 HAMMER2_CHAIN_PFSBOUNDARY);
1687 #endif
1689 /* XXX hack blockset copy */
1690 /* XXX doesn't work with real cluster */
1691 KKASSERT(ocluster->nchains == 1);
1692 wipdata->u.blockset = ripdata->u.blockset;
1693 hammer2_cluster_modsync(ncluster);
1694 for (i = 0; i < ncluster->nchains; ++i) {
1695 nchain = ncluster->array[i].chain;
1696 if (nchain)
1697 hammer2_flush(trans, nchain, 1);
1699 hammer2_inode_unlock(nip, ncluster);
1701 return (error);
1705 * Return locked parent cluster given a locked child. The child remains
1706 * locked on return. The new parent's focus follows the child's focus
1707 * and the parent is always resolved.
1709 * We must temporarily unlock the passed-in cluster to avoid a deadlock
1710 * between elements of the cluster.
1712 * We must not try to hammer2_cluster_resolve() cparent. The individual
1713 * parent chains for the nodes are the correct parents for the cluster but
1714 * do not necessarily match, so resolve would likely implode.
1716 hammer2_cluster_t *
1717 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1719 hammer2_cluster_t *cparent;
1720 int i;
1722 cparent = hammer2_cluster_copy(cluster);
1723 hammer2_cluster_unlock(cluster);
1725 for (i = 0; i < cparent->nchains; ++i) {
1726 hammer2_chain_t *chain;
1727 hammer2_chain_t *rchain;
1730 * Calculate parent for each element. Old chain has an extra
1731 * ref for cparent but the lock remains with cluster.
1733 chain = cparent->array[i].chain;
1734 if (chain == NULL)
1735 continue;
1736 while ((rchain = chain->parent) != NULL) {
1737 hammer2_chain_ref(rchain);
1738 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1739 if (chain->parent == rchain)
1740 break;
1741 hammer2_chain_unlock(rchain);
1742 hammer2_chain_drop(rchain);
1744 cparent->array[i].chain = rchain;
1745 hammer2_chain_drop(chain);
1747 cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1748 /* hammer2_cluster_resolve(cparent); */
1749 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
1751 return cparent;
1754 /************************************************************************
1755 * CLUSTER I/O *
1756 ************************************************************************
1759 * WARNING! blockref[] array data is not universal. These functions should
1760 * only be used to access universal data.
1762 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1763 * complete if necessary. The I/O's should have already been
1764 * initiated by the cluster_lock/chain_lock operation.
1766 * The cluster must already be in a modified state before wdata
1767 * is called. The data will already be available for this case.
1769 const hammer2_media_data_t *
1770 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1772 return(cluster->focus->data);
1775 hammer2_media_data_t *
1776 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1778 KKASSERT(hammer2_cluster_modified(cluster));
1779 return(cluster->focus->data);
1783 * Load cluster data asynchronously with callback.
1785 * The callback is made for the first validated data found, or NULL
1786 * if no valid data is available.
1788 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1789 * in the inode with an exclusive lock held), the chain structure may be
1790 * shared.
1792 void
1793 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1794 void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1796 hammer2_chain_t *chain;
1797 hammer2_iocb_t *iocb;
1798 hammer2_dev_t *hmp;
1799 hammer2_blockref_t *bref;
1800 int i;
1802 i = cluster->focus_index;
1803 chain = cluster->focus;
1805 iocb = &cluster->iocb;
1806 iocb->callback = callback;
1807 iocb->dio = NULL; /* for already-validated case */
1808 iocb->cluster = cluster;
1809 iocb->chain = chain;
1810 iocb->ptr = ptr;
1811 iocb->lbase = (off_t)i;
1812 iocb->flags = 0;
1813 iocb->error = 0;
1816 * Data already validated
1818 if (chain->data) {
1819 callback(iocb);
1820 return;
1824 * We must resolve to a device buffer, either by issuing I/O or
1825 * by creating a zero-fill element. We do not mark the buffer
1826 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1827 * API must still be used to do that).
1829 * The device buffer is variable-sized in powers of 2 down
1830 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage
1831 * chunk always contains buffers of the same size. (XXX)
1833 * The minimum physical IO size may be larger than the variable
1834 * block size.
1836 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1837 * matches hammer2_devblksize()? Or does the freemap's
1838 * pre-zeroing handle the case for us?
1840 bref = &chain->bref;
1841 hmp = chain->hmp;
1843 #if 0
1844 /* handled by callback? <- TODO XXX even needed for loads? */
1846 * The getblk() optimization for a 100% overwrite can only be used
1847 * if the physical block size matches the request.
1849 if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1850 chain->bytes == hammer2_devblksize(chain->bytes)) {
1851 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1852 KKASSERT(error == 0);
1853 iocb->dio = dio;
1854 callback(iocb);
1855 return;
1857 #endif
1860 * Otherwise issue a read
1862 hammer2_adjreadcounter(&chain->bref, chain->bytes);
1863 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);