kernel/hammer2: Fix compilation without INVARIANTS.
[dragonfly.git] / sys / vfs / hammer2 / hammer2_cluster.c
blob138a96c9ea024f21f0a6f3c826cb9cf004bba0ab
1 /*
2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 * The cluster module collects multiple chains representing the same
36 * information from different nodes into a single entity. It allows direct
37 * access to media data as long as it is not blockref array data (which
38 * will obviously have to be different at each node).
40 * This module also handles I/O dispatch, status rollup, and various
41 * mastership arrangements including quorum operations. It effectively
42 * presents one topology to the vnops layer.
44 * Many of the API calls mimic chain API calls but operate on clusters
45 * instead of chains. Please see hammer2_chain.c for more complete code
46 * documentation of the API functions.
48 * WARNING! This module is *extremely* complex. It must issue asynchronous
49 * locks and I/O, do quorum and/or master-slave processing, and
50 * it must operate properly even if some nodes are broken (which
51 * can also mean indefinite locks).
53 * CLUSTER OPERATIONS
55 * Cluster operations can be broken down into three pieces:
57 * (1) Chain locking and data retrieval.
59 * - Most complex functions, quorum management on transaction ids.
61 * - Locking and data accesses must be internally asynchronous.
63 * - Validate and manage cache coherency primitives (cache state
64 * is stored in chain topologies but must be validated by these
65 * functions).
67 * (2) Lookups and Scans
68 * hammer2_cluster_lookup()
69 * hammer2_cluster_next()
71 * - Depend on locking & data retrieval functions, but still complex.
73 * - Must do quorum management on transaction ids.
75 * - Lookup and iteration ops must be internally asynchronous.
77 * (3) Modifying Operations
78 * hammer2_cluster_create()
80 * - Can usually punt on failures, operation continues unless quorum
81 * is lost. If quorum is lost, must wait for resynchronization
82 * (depending on the management mode).
84 * - Must disconnect node on failures (also not flush), remount, and
85 * resynchronize.
87 * - Network links (via kdmsg) are relatively easy to issue as the
88 * complex underworkings of hammer2_chain.c don't have to be messed
89 * with (the protocol is at a higher level than block-level).
91 * - Multiple local disk nodes (i.e. block devices) are another matter.
92 * Chain operations have to be dispatched to per-node threads (xN)
93 * because we can't asynchronize potentially very complex chain
94 * operations in hammer2_chain.c (it would be a huge mess).
96 * (these threads are also used to terminate incoming kdmsg ops from
97 * other machines).
99 * - Single-node filesystems do not use threads and will simply call
100 * hammer2_chain.c functions directly. This short-cut is handled
101 * at the base of each cluster function.
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
110 #include "hammer2.h"
113 * Returns the bref type of the cluster's foucs.
115 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116 * The cluster must be locked.
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
121 if (cluster->error == 0) {
122 KKASSERT(cluster->focus != NULL);
123 return(cluster->focus->bref.type);
125 return 0;
#ifdef INVARIANTS
/*
 * Return non-zero if the cluster's focus chain has HAMMER2_CHAIN_MODIFIED
 * set in its flags, 0 otherwise.  An errored cluster always yields 0.
 *
 * The only visible caller is the KKASSERT() in hammer2_cluster_wdata().
 * NOTE(review): presumably the INVARIANTS guard exists because KKASSERT()
 * compiles away without it, which would leave this static function unused.
 */
static
int
hammer2_cluster_modified(hammer2_cluster_t *cluster)
{
	if (cluster->error != 0)
		return 0;

	KKASSERT(cluster->focus != NULL);
	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
}
#endif
147 * Returns the bref of the cluster's focus, sans any data-offset information
148 * (since offset information is per-node and wouldn't be useful).
150 * Callers use this function to access modify_tid, mirror_tid, type,
151 * key, and keybits.
153 * If the cluster is errored, returns an empty bref.
154 * The cluster must be locked.
156 void
157 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
159 if (cluster->error == 0) {
160 KKASSERT(cluster->focus != NULL);
161 *bref = cluster->focus->bref;
162 bref->data_off = 0;
163 } else {
164 bzero(bref, sizeof(*bref));
169 * Create a degenerate cluster with one ref from a single locked chain.
170 * The returned cluster will be focused on the chain and inherit its
171 * error state.
173 * The chain's lock and reference are transfered to the new cluster, so
174 * the caller should not try to unlock the chain separately.
176 * We fake the flags.
178 hammer2_cluster_t *
179 hammer2_cluster_from_chain(hammer2_chain_t *chain)
181 hammer2_cluster_t *cluster;
183 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
184 cluster->array[0].chain = chain;
185 cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
186 cluster->nchains = 1;
187 cluster->focus = chain;
188 cluster->focus_index = 0;
189 cluster->pmp = chain->pmp;
190 cluster->refs = 1;
191 cluster->error = chain->error;
192 cluster->flags = HAMMER2_CLUSTER_LOCKED |
193 HAMMER2_CLUSTER_WRHARD |
194 HAMMER2_CLUSTER_RDHARD |
195 HAMMER2_CLUSTER_MSYNCED |
196 HAMMER2_CLUSTER_SSYNCED;
198 return cluster;
202 * Add a reference to a cluster and its underlying chains.
204 * We must also ref the underlying chains in order to allow ref/unlock
205 * sequences to later re-lock.
207 void
208 hammer2_cluster_ref(hammer2_cluster_t *cluster)
210 atomic_add_int(&cluster->refs, 1);
214 * Drop the caller's reference to the cluster. When the ref count drops to
215 * zero this function frees the cluster and drops all underlying chains.
217 * In-progress read I/Os are typically detached from the cluster once the
218 * first one returns (the remaining stay attached to the DIOs but are then
219 * ignored and drop naturally).
221 void
222 hammer2_cluster_drop(hammer2_cluster_t *cluster)
224 hammer2_chain_t *chain;
225 int i;
227 KKASSERT(cluster->refs > 0);
228 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
229 cluster->focus = NULL; /* safety XXX chg to assert */
230 cluster->focus_index = 0;
232 for (i = 0; i < cluster->nchains; ++i) {
233 chain = cluster->array[i].chain;
234 if (chain) {
235 hammer2_chain_drop(chain);
236 cluster->array[i].chain = NULL; /* safety */
239 cluster->nchains = 0; /* safety */
241 kfree(cluster, M_HAMMER2);
242 /* cluster is invalid */
247 * Lock a cluster. Cluster must already be referenced. Focus is maintained.
249 * WARNING! This function expects the caller to handle resolution of the
250 * cluster. We never re-resolve the cluster in this function,
251 * because it might be used to temporarily unlock/relock a cparent
252 * in an iteration or recursrion, and the cparents elements do not
253 * necessarily match.
255 void
256 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
258 hammer2_chain_t *chain;
259 int i;
261 /* cannot be on inode-embedded cluster template, must be on copy */
262 KKASSERT(cluster->refs > 0);
263 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
264 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
265 panic("hammer2_cluster_lock: cluster %p already locked!\n",
266 cluster);
268 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
271 * Lock chains and resolve state.
273 for (i = 0; i < cluster->nchains; ++i) {
274 chain = cluster->array[i].chain;
275 if (chain == NULL)
276 continue;
277 hammer2_chain_lock(chain, how);
282 * Calculate the clustering state for the cluster and set its focus.
283 * This routine must be called with care. For example, it should not
284 * normally be called after relocking a non-leaf cluster because parent
285 * clusters help iterations and each element might be at a slightly different
286 * indirect node (each node's topology is independently indexed).
288 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
289 * operations. Typically this is only set on a quorum of MASTERs or
290 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER
291 * is present, this bit is *not* set on a quorum of MASTERs. The
292 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
293 * that create/modify/delete elements use it.
295 * The chains making up the cluster may be narrowed down based on quorum
296 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
297 * narrowed down to a single chain as long as the entire subtopology is known
298 * to be intact. So, for example, we can narrow a read-only op to a single
299 * fast SLAVE but if we focus a CACHE chain we must still retain at least
300 * a SLAVE to ensure that the subtopology can be accessed.
302 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
303 * to be maintained once the topology is validated as-of the top level of
304 * the operation.
306 * If a failure occurs the operation must be aborted by higher-level code and
307 * retried. XXX
309 void
310 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
312 hammer2_chain_t *chain;
313 hammer2_chain_t *focus;
314 hammer2_pfs_t *pmp;
315 hammer2_tid_t quorum_tid;
316 hammer2_tid_t last_best_quorum_tid;
317 int focus_pfs_type;
318 uint32_t nflags;
319 int ttlmasters;
320 int ttlslaves;
321 int nmasters;
322 int nslaves;
323 int nquorum;
324 int smpresent;
325 int i;
327 cluster->error = 0;
328 cluster->focus = NULL;
330 focus_pfs_type = 0;
331 nflags = 0;
332 ttlmasters = 0;
333 ttlslaves = 0;
334 nmasters = 0;
335 nslaves = 0;
338 * Calculate quorum
340 pmp = cluster->pmp;
341 KKASSERT(pmp != NULL || cluster->nchains == 0);
342 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
343 smpresent = 0;
346 * Pass 1
348 * NOTE: A NULL chain is not necessarily an error, it could be
349 * e.g. a lookup failure or the end of an iteration.
350 * Process normally.
352 for (i = 0; i < cluster->nchains; ++i) {
353 chain = cluster->array[i].chain;
354 if (chain && chain->error) {
355 if (cluster->focus == NULL || cluster->focus == chain) {
356 /* error will be overridden by valid focus */
357 cluster->error = chain->error;
361 * Must count total masters and slaves whether the
362 * chain is errored or not.
364 switch (cluster->pmp->pfs_types[i]) {
365 case HAMMER2_PFSTYPE_SUPROOT:
366 case HAMMER2_PFSTYPE_MASTER:
367 ++ttlmasters;
368 break;
369 case HAMMER2_PFSTYPE_SLAVE:
370 ++ttlslaves;
371 break;
373 continue;
375 switch (cluster->pmp->pfs_types[i]) {
376 case HAMMER2_PFSTYPE_MASTER:
377 ++ttlmasters;
378 break;
379 case HAMMER2_PFSTYPE_SLAVE:
380 ++ttlslaves;
381 break;
382 case HAMMER2_PFSTYPE_SOFT_MASTER:
383 nflags |= HAMMER2_CLUSTER_WRSOFT;
384 nflags |= HAMMER2_CLUSTER_RDSOFT;
385 smpresent = 1;
386 break;
387 case HAMMER2_PFSTYPE_SOFT_SLAVE:
388 nflags |= HAMMER2_CLUSTER_RDSOFT;
389 break;
390 case HAMMER2_PFSTYPE_SUPROOT:
392 * Degenerate cluster representing the super-root
393 * topology on a single device. Fake stuff so
394 * cluster ops work as expected.
396 nflags |= HAMMER2_CLUSTER_WRHARD;
397 nflags |= HAMMER2_CLUSTER_RDHARD;
398 cluster->focus_index = i;
399 cluster->focus = chain;
400 cluster->error = chain ? chain->error : 0;
401 ++ttlmasters;
402 break;
403 default:
404 break;
409 * Pass 2
411 * Resolve masters. Calculate nmasters for the highest matching
412 * TID, if a quorum cannot be attained try the next lower matching
413 * TID until we exhaust TIDs.
415 * NOTE: A NULL chain is not necessarily an error, it could be
416 * e.g. a lookup failure or the end of an iteration.
417 * Process normally.
419 last_best_quorum_tid = HAMMER2_TID_MAX;
420 quorum_tid = 0; /* fix gcc warning */
422 while (nmasters < nquorum && last_best_quorum_tid != 0) {
423 nmasters = 0;
424 quorum_tid = 0;
426 for (i = 0; i < cluster->nchains; ++i) {
427 switch (cluster->pmp->pfs_types[i]) {
428 case HAMMER2_PFSTYPE_SUPROOT:
429 case HAMMER2_PFSTYPE_MASTER:
430 break;
431 default:
432 continue;
434 chain = cluster->array[i].chain;
436 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
438 * Invalid as in unsynchronized, cannot be
439 * used to calculate the quorum.
441 } else if (chain == NULL && quorum_tid == 0) {
443 * NULL chain on master matches NULL chains
444 * on other masters.
446 ++nmasters;
447 } else if (quorum_tid < last_best_quorum_tid &&
448 chain != NULL &&
449 (quorum_tid < chain->bref.modify_tid ||
450 nmasters == 0)) {
452 * Better TID located, reset nmasters count.
454 nmasters = 1;
455 quorum_tid = chain->bref.modify_tid;
456 } else if (chain &&
457 quorum_tid == chain->bref.modify_tid) {
459 * TID matches current collection.
461 ++nmasters;
464 if (nmasters >= nquorum)
465 break;
466 last_best_quorum_tid = quorum_tid;
470 * Pass 3
472 * NOTE: A NULL chain is not necessarily an error, it could be
473 * e.g. a lookup failure or the end of an iteration.
474 * Process normally.
476 for (i = 0; i < cluster->nchains; ++i) {
477 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
478 chain = cluster->array[i].chain;
479 if (chain && chain->error) {
480 if (cluster->focus == NULL || cluster->focus == chain) {
481 /* error will be overridden by valid focus */
482 cluster->error = chain->error;
484 continue;
487 switch (cluster->pmp->pfs_types[i]) {
488 case HAMMER2_PFSTYPE_MASTER:
490 * We must have enough up-to-date masters to reach
491 * a quorum and the master modify_tid must match
492 * the quorum's modify_tid.
494 * Do not select an errored or out-of-sync master.
496 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
497 nflags |= HAMMER2_CLUSTER_UNHARD;
498 } else if (nmasters >= nquorum &&
499 (chain == NULL || chain->error == 0) &&
500 ((chain == NULL && quorum_tid == 0) ||
501 (chain != NULL && quorum_tid ==
502 chain->bref.modify_tid))) {
503 nflags |= HAMMER2_CLUSTER_WRHARD;
504 nflags |= HAMMER2_CLUSTER_RDHARD;
505 if (!smpresent) {
506 cluster->array[i].flags |=
507 HAMMER2_CITEM_FEMOD;
509 if (cluster->focus == NULL ||
510 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
511 focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
512 cluster->focus_index = i;
513 cluster->focus = chain; /* NULL ok */
514 cluster->error = chain ? chain->error :
517 } else if (chain == NULL || chain->error == 0) {
518 nflags |= HAMMER2_CLUSTER_UNHARD;
520 break;
521 case HAMMER2_PFSTYPE_SLAVE:
523 * We must have enough up-to-date masters to reach
524 * a quorum and the slave modify_tid must match the
525 * quorum's modify_tid.
527 * Do not select an errored slave.
529 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
530 nflags |= HAMMER2_CLUSTER_UNHARD;
531 } else if (nmasters >= nquorum &&
532 (chain == NULL || chain->error == 0) &&
533 ((chain == NULL && quorum_tid == 0) ||
534 (chain && quorum_tid ==
535 chain->bref.modify_tid))) {
536 ++nslaves;
537 nflags |= HAMMER2_CLUSTER_RDHARD;
538 #if 0
539 /* XXX optimize for RESOLVE_RDONLY */
540 if (cluster->focus == NULL) {
541 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
542 cluster->focus_index = i;
543 cluster->focus = chain; /* NULL ok */
544 cluster->error = chain ? chain->error :
547 #endif
548 } else if (chain == NULL || chain->error == 0) {
549 nflags |= HAMMER2_CLUSTER_UNSOFT;
551 break;
552 case HAMMER2_PFSTYPE_SOFT_MASTER:
554 * Directly mounted soft master always wins. There
555 * should be only one.
557 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
558 cluster->focus_index = i;
559 cluster->focus = chain;
560 cluster->error = chain ? chain->error : 0;
561 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
562 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
563 break;
564 case HAMMER2_PFSTYPE_SOFT_SLAVE:
566 * Directly mounted soft slave always wins. There
567 * should be only one.
569 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
570 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
571 cluster->focus_index = i;
572 cluster->focus = chain;
573 cluster->error = chain ? chain->error : 0;
574 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
576 break;
577 case HAMMER2_PFSTYPE_SUPROOT:
579 * spmp (degenerate case)
581 KKASSERT(i == 0);
582 cluster->focus_index = i;
583 cluster->focus = chain;
584 cluster->error = chain ? chain->error : 0;
585 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
586 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
587 break;
588 default:
589 break;
594 * Focus now set, adjust ddflag. Skip this pass if the focus
595 * is bad or if we are at the PFS root (the bref won't match at
596 * the PFS root, obviously).
598 focus = cluster->focus;
599 if (focus) {
600 cluster->ddflag =
601 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
602 } else {
603 cluster->ddflag = 0;
604 goto skip4;
606 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
607 goto skip4;
610 * Pass 4
612 * Validate the elements that were not marked invalid. They should
613 * match.
615 for (i = 0; i < cluster->nchains; ++i) {
616 int ddflag;
618 chain = cluster->array[i].chain;
620 if (chain == NULL)
621 continue;
622 if (chain == focus)
623 continue;
624 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
625 continue;
627 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
628 if (chain->bref.type != focus->bref.type ||
629 chain->bref.key != focus->bref.key ||
630 chain->bref.keybits != focus->bref.keybits ||
631 chain->bref.modify_tid != focus->bref.modify_tid ||
632 chain->bytes != focus->bytes ||
633 ddflag != cluster->ddflag) {
634 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
635 if (hammer2_debug & 1)
636 kprintf("cluster_resolve: matching modify_tid failed "
637 "bref test: idx=%d type=%02x/%02x "
638 "key=%016jx/%d-%016jx/%d "
639 "mod=%016jx/%016jx bytes=%u/%u\n",
641 chain->bref.type, focus->bref.type,
642 chain->bref.key, chain->bref.keybits,
643 focus->bref.key, focus->bref.keybits,
644 chain->bref.modify_tid, focus->bref.modify_tid,
645 chain->bytes, focus->bytes);
646 if (hammer2_debug & 0x4000)
647 panic("cluster_resolve");
648 /* flag issue and force resync? */
651 skip4:
653 if (ttlslaves == 0)
654 nflags |= HAMMER2_CLUSTER_NOSOFT;
655 if (ttlmasters == 0)
656 nflags |= HAMMER2_CLUSTER_NOHARD;
659 * Set SSYNCED or MSYNCED for slaves and masters respectively if
660 * all available nodes (even if 0 are available) are fully
661 * synchronized. This is used by the synchronization thread to
662 * determine if there is work it could potentially accomplish.
664 if (nslaves == ttlslaves)
665 nflags |= HAMMER2_CLUSTER_SSYNCED;
666 if (nmasters == ttlmasters)
667 nflags |= HAMMER2_CLUSTER_MSYNCED;
670 * Determine if the cluster was successfully locked for the
671 * requested operation and generate an error code. The cluster
672 * will not be locked (or ref'd) if an error is returned.
674 atomic_set_int(&cluster->flags, nflags);
675 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
679 * This is used by the XOPS subsystem to calculate the state of
680 * the collection and tell hammer2_xop_collect() what to do with it.
681 * The collection can be in various states of desynchronization, the
682 * caller specifically wants to resolve the passed-in key.
684 * Return values:
685 * 0 - Quorum agreement, key is valid
687 * ENOENT - Quorum agreement, end of scan
689 * ESRCH - Quorum agreement, key is INVALID (caller should
690 * skip key).
692 * EIO - Quorum agreement but all elements had errors.
694 * EDEADLK - No quorum agreement possible for key, a repair
695 * may be needed. Caller has to decide what to do,
696 * possibly iterating the key or generating an EIO.
698 * EINPROGRESS - No quorum agreement yet, but agreement is still
699 * possible if caller waits for more responses. Caller
700 * should not iterate key.
702 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
704 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
707 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
709 hammer2_chain_t *chain;
710 hammer2_chain_t *focus;
711 hammer2_pfs_t *pmp;
712 hammer2_tid_t quorum_tid;
713 hammer2_tid_t last_best_quorum_tid;
714 uint32_t nflags;
715 int ttlmasters;
716 int ttlslaves;
717 int nmasters;
718 int nmasters_keymatch;
719 int nslaves;
720 int nquorum;
721 int umasters; /* unknown masters (still in progress) */
722 int smpresent;
723 int error;
724 int i;
726 cluster->error = 0;
727 cluster->focus = NULL;
729 pmp = cluster->pmp;
730 KKASSERT(pmp != NULL || cluster->nchains == 0);
733 * Calculate quorum
735 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
736 smpresent = 0;
737 nflags = 0;
738 ttlmasters = 0;
739 ttlslaves = 0;
742 * Pass 1
744 * NOTE: A NULL chain is not necessarily an error, it could be
745 * e.g. a lookup failure or the end of an iteration.
746 * Process normally.
748 for (i = 0; i < cluster->nchains; ++i) {
749 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
750 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
752 chain = cluster->array[i].chain;
753 error = cluster->array[i].error;
754 if (chain && error) {
755 if (cluster->focus == NULL || cluster->focus == chain) {
756 /* error will be overridden by valid focus */
757 /* XXX */
761 * Must count total masters and slaves whether the
762 * chain is errored or not.
764 switch (cluster->pmp->pfs_types[i]) {
765 case HAMMER2_PFSTYPE_SUPROOT:
766 case HAMMER2_PFSTYPE_MASTER:
767 ++ttlmasters;
768 break;
769 case HAMMER2_PFSTYPE_SLAVE:
770 ++ttlslaves;
771 break;
773 continue;
775 switch (cluster->pmp->pfs_types[i]) {
776 case HAMMER2_PFSTYPE_MASTER:
777 ++ttlmasters;
778 break;
779 case HAMMER2_PFSTYPE_SLAVE:
780 ++ttlslaves;
781 break;
782 case HAMMER2_PFSTYPE_SOFT_MASTER:
783 nflags |= HAMMER2_CLUSTER_WRSOFT;
784 nflags |= HAMMER2_CLUSTER_RDSOFT;
785 smpresent = 1;
786 break;
787 case HAMMER2_PFSTYPE_SOFT_SLAVE:
788 nflags |= HAMMER2_CLUSTER_RDSOFT;
789 break;
790 case HAMMER2_PFSTYPE_SUPROOT:
792 * Degenerate cluster representing the super-root
793 * topology on a single device. Fake stuff so
794 * cluster ops work as expected.
796 ++ttlmasters;
797 nflags |= HAMMER2_CLUSTER_WRHARD;
798 nflags |= HAMMER2_CLUSTER_RDHARD;
799 cluster->focus_index = i;
800 cluster->focus = chain;
801 cluster->error = error;
802 break;
803 default:
804 break;
809 * Pass 2
811 * Resolve nmasters - master nodes fully match
813 * Resolve umasters - master nodes operation still
814 * in progress
816 * Resolve nmasters_keymatch - master nodes match the passed-in
817 * key and may or may not match
818 * the quorum-agreed tid.
820 * The quorum-agreed TID is the highest matching TID.
822 last_best_quorum_tid = HAMMER2_TID_MAX;
823 umasters = 0;
824 nmasters = 0;
825 nmasters_keymatch = 0;
826 quorum_tid = 0; /* fix gcc warning */
828 while (nmasters < nquorum && last_best_quorum_tid != 0) {
829 umasters = 0;
830 nmasters = 0;
831 nmasters_keymatch = 0;
832 quorum_tid = 0;
834 for (i = 0; i < cluster->nchains; ++i) {
835 /* XXX SOFT smpresent handling */
836 switch(cluster->pmp->pfs_types[i]) {
837 case HAMMER2_PFSTYPE_MASTER:
838 case HAMMER2_PFSTYPE_SUPROOT:
839 break;
840 default:
841 continue;
844 chain = cluster->array[i].chain;
845 error = cluster->array[i].error;
848 * Skip elements still in progress. umasters keeps
849 * track of masters that might still be in-progress.
851 if (chain == NULL && (cluster->array[i].flags &
852 HAMMER2_CITEM_NULL) == 0) {
853 ++umasters;
854 continue;
858 * Key match?
860 if (flags & HAMMER2_CHECK_NULL) {
861 if (chain == NULL) {
862 ++nmasters;
863 ++nmasters_keymatch;
864 if (cluster->error == 0)
865 cluster->error = error;
867 } else if (chain &&
868 (key == (hammer2_key_t)-1 ||
869 chain->bref.key == key)) {
870 ++nmasters_keymatch;
872 if (chain->bref.modify_tid <
873 last_best_quorum_tid &&
874 quorum_tid < chain->bref.modify_tid) {
876 * Select new TID as master if better
877 * than any found so far in this loop,
878 * as long as it does not reach the
879 * best tid found in the previous loop.
881 nmasters = 0;
882 quorum_tid = chain->bref.modify_tid;
884 if (quorum_tid == chain->bref.modify_tid) {
886 * TID matches current collection.
888 * (error handled in next pass)
890 ++nmasters;
891 if (chain->error == 0) {
892 cluster->focus = chain;
893 cluster->focus_index = i;
898 if (nmasters >= nquorum)
899 break;
900 last_best_quorum_tid = quorum_tid;
904 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
905 nmasters, nquorum, nmasters_keymatch, umasters);
909 * Early return if we do not have enough masters.
911 if (nmasters < nquorum) {
912 if (nmasters + umasters >= nquorum)
913 return HAMMER2_ERROR_EINPROGRESS;
914 if (nmasters_keymatch < nquorum)
915 return HAMMER2_ERROR_ESRCH;
916 return HAMMER2_ERROR_EDEADLK;
920 * Validated end of scan.
922 if (flags & HAMMER2_CHECK_NULL) {
923 if (cluster->error == 0)
924 cluster->error = HAMMER2_ERROR_ENOENT;
925 return cluster->error;
929 * If we have a NULL focus at this point the agreeing quorum all
930 * had chain errors.
932 if (cluster->focus == NULL)
933 return HAMMER2_ERROR_EIO;
936 * Pass 3
938 * We have quorum agreement, validate elements, not end of scan.
940 nslaves = 0;
941 cluster->error = 0;
943 for (i = 0; i < cluster->nchains; ++i) {
944 chain = cluster->array[i].chain;
945 error = cluster->array[i].error;
946 if (chain == NULL ||
947 chain->bref.key != key ||
948 chain->bref.modify_tid != quorum_tid) {
949 continue;
953 * Quorum Match
955 * XXX for now, cumulative error.
957 if (cluster->error == 0)
958 cluster->error = error;
960 switch (cluster->pmp->pfs_types[i]) {
961 case HAMMER2_PFSTYPE_MASTER:
962 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
963 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
964 nflags |= HAMMER2_CLUSTER_WRHARD;
965 nflags |= HAMMER2_CLUSTER_RDHARD;
966 break;
967 case HAMMER2_PFSTYPE_SLAVE:
969 * We must have enough up-to-date masters to reach
970 * a quorum and the slave modify_tid must match the
971 * quorum's modify_tid.
973 * Do not select an errored slave.
975 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
976 nflags |= HAMMER2_CLUSTER_RDHARD;
977 ++nslaves;
978 break;
979 case HAMMER2_PFSTYPE_SOFT_MASTER:
981 * Directly mounted soft master always wins. There
982 * should be only one.
984 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
985 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
986 break;
987 case HAMMER2_PFSTYPE_SOFT_SLAVE:
989 * Directly mounted soft slave always wins. There
990 * should be only one.
992 * XXX
994 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
995 break;
996 case HAMMER2_PFSTYPE_SUPROOT:
998 * spmp (degenerate case)
1000 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1001 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1002 nflags |= HAMMER2_CLUSTER_WRHARD;
1003 nflags |= HAMMER2_CLUSTER_RDHARD;
1004 break;
1005 default:
1006 break;
1011 * Focus now set, adjust ddflag. Skip this pass if the focus
1012 * is bad or if we are at the PFS root (the bref won't match at
1013 * the PFS root, obviously).
1015 focus = cluster->focus;
1016 if (focus) {
1017 cluster->ddflag =
1018 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1019 } else {
1020 cluster->ddflag = 0;
1021 goto skip4;
1023 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1024 goto skip4;
1027 * Pass 4
1029 * Validate the elements that were not marked invalid. They should
1030 * match.
1032 for (i = 0; i < cluster->nchains; ++i) {
1033 int ddflag;
1035 chain = cluster->array[i].chain;
1037 if (chain == NULL)
1038 continue;
1039 if (chain == focus)
1040 continue;
1041 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1042 continue;
1044 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1045 if (chain->bref.type != focus->bref.type ||
1046 chain->bref.key != focus->bref.key ||
1047 chain->bref.keybits != focus->bref.keybits ||
1048 chain->bref.modify_tid != focus->bref.modify_tid ||
1049 chain->bytes != focus->bytes ||
1050 ddflag != cluster->ddflag) {
1051 cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1052 if (hammer2_debug & 1)
1053 kprintf("cluster_resolve: matching modify_tid failed "
1054 "bref test: idx=%d type=%02x/%02x "
1055 "key=%016jx/%d-%016jx/%d "
1056 "mod=%016jx/%016jx bytes=%u/%u\n",
1058 chain->bref.type, focus->bref.type,
1059 chain->bref.key, chain->bref.keybits,
1060 focus->bref.key, focus->bref.keybits,
1061 chain->bref.modify_tid, focus->bref.modify_tid,
1062 chain->bytes, focus->bytes);
1063 if (hammer2_debug & 0x4000)
1064 panic("cluster_resolve");
1065 /* flag issue and force resync? */
1068 skip4:
1070 if (ttlslaves == 0)
1071 nflags |= HAMMER2_CLUSTER_NOSOFT;
1072 if (ttlmasters == 0)
1073 nflags |= HAMMER2_CLUSTER_NOHARD;
1076 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1077 * all available nodes (even if 0 are available) are fully
1078 * synchronized. This is used by the synchronization thread to
1079 * determine if there is work it could potentially accomplish.
1081 if (nslaves == ttlslaves)
1082 nflags |= HAMMER2_CLUSTER_SSYNCED;
1083 if (nmasters == ttlmasters)
1084 nflags |= HAMMER2_CLUSTER_MSYNCED;
1087 * Determine if the cluster was successfully locked for the
1088 * requested operation and generate an error code. The cluster
1089 * will not be locked (or ref'd) if an error is returned.
1091 atomic_set_int(&cluster->flags, nflags);
1092 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1094 return cluster->error;
1098 * This is used by the sync thread to force non-NULL elements of a copy
1099 * of the pmp->iroot cluster to be good which is required to prime the
1100 * sync.
1102 void
1103 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1105 int i;
1107 for (i = 0; i < cluster->nchains; ++i) {
1108 if (cluster->array[i].chain)
1109 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1114 * Unlock a cluster. Refcount and focus is maintained.
1116 void
1117 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1119 hammer2_chain_t *chain;
1120 int i;
1122 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1123 kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1124 cluster);
1126 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1127 KKASSERT(cluster->refs > 0);
1128 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1130 for (i = 0; i < cluster->nchains; ++i) {
1131 chain = cluster->array[i].chain;
1132 if (chain)
1133 hammer2_chain_unlock(chain);
1137 /************************************************************************
1138 * CLUSTER I/O *
1139 ************************************************************************
1142 * WARNING! blockref[] array data is not universal. These functions should
1143 * only be used to access universal data.
1145 * NOTE! The rdata call will wait for at least one of the chain I/Os to
1146 * complete if necessary. The I/O's should have already been
1147 * initiated by the cluster_lock/chain_lock operation.
1149 * The cluster must already be in a modified state before wdata
1150 * is called. The data will already be available for this case.
1152 const hammer2_media_data_t *
1153 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1155 hammer2_chain_t *chain;
1157 chain = cluster->focus;
1158 KKASSERT(chain != NULL && chain->lockcnt);
1159 if (chain->dio)
1160 hammer2_io_bkvasync(chain->dio);
1161 return (chain->data);
1164 hammer2_media_data_t *
1165 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1167 hammer2_chain_t *chain;
1169 chain = cluster->focus;
1170 KKASSERT(chain != NULL && chain->lockcnt &&
1171 hammer2_cluster_modified(cluster));
1172 if (chain->dio)
1173 hammer2_io_bkvasync(chain->dio);
1174 return(chain->data);