hammer2 - Revamp flush and xopq mechanism, stabilization
[dragonfly.git] / sys / vfs / hammer2 / hammer2_synchro.c
blob5d5811d110d149c1946f203da8b749239ed55845
/*
 * Copyright (c) 2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This module implements the cluster synchronizer.  Basically the way
 * it works is that a thread is created for each cluster node in a PFS.
 * This thread is responsible for synchronizing the current node using
 * data from other nodes.
 *
 * Any out of sync master or slave can get back into synchronization as
 * long as a quorum of masters agree on the update_tid.  If a quorum is
 * not available it may still be possible to synchronize to the highest
 * available update_tid as a way of trying to catch up as much as possible
 * until a quorum is available.
 *
 * If no quorum is possible (which can happen even if all masters are
 * available, if the update_tid does not match), then manual intervention
 * may be required to resolve discrepancies.
 */
50 #include "hammer2.h"
52 typedef struct hammer2_deferred_ip {
53 struct hammer2_deferred_ip *next;
54 hammer2_inode_t *ip;
55 } hammer2_deferred_ip_t;
57 typedef struct hammer2_deferred_list {
58 hammer2_deferred_ip_t *base;
59 int count;
60 } hammer2_deferred_list_t;
63 #define HAMMER2_SYNCHRO_DEBUG 1
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66 hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69 nerror = hammer2_sync_insert(
70 thr, &parent, &chain,
71 focus->bref.modify_tid,
72 idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76 hammer2_tid_t modify_tid, int idx,
77 hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80 hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82 hammer2_chain_t *parent, hammer2_chain_t *chain,
83 hammer2_tid_t mtid, int idx,
84 hammer2_chain_t *focus, int isroot);
/****************************************************************************
 *			    HAMMER2 SYNC THREADS			    *
 ****************************************************************************/
90 * Primary management thread for an element of a node. A thread will exist
91 * for each element requiring management.
93 * No management threads are needed for the SPMP or for any PMP with only
94 * a single MASTER.
96 * On the SPMP - handles bulkfree and dedup operations
97 * On a PFS - handles remastering and synchronization
99 void
100 hammer2_primary_sync_thread(void *arg)
102 hammer2_thread_t *thr = arg;
103 hammer2_pfs_t *pmp;
104 hammer2_deferred_list_t list;
105 hammer2_deferred_ip_t *defer;
106 int error;
107 uint32_t flags;
108 uint32_t nflags;
110 pmp = thr->pmp;
111 bzero(&list, sizeof(list));
113 for (;;) {
114 flags = thr->flags;
115 cpu_ccfence();
118 * Handle stop request
120 if (flags & HAMMER2_THREAD_STOP)
121 break;
124 * Handle freeze request
126 if (flags & HAMMER2_THREAD_FREEZE) {
127 nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
128 HAMMER2_THREAD_CLIENTWAIT)) |
129 HAMMER2_THREAD_FROZEN;
130 if (!atomic_cmpset_int(&thr->flags, flags, nflags))
131 continue;
132 if (flags & HAMMER2_THREAD_CLIENTWAIT)
133 wakeup(&thr->flags);
134 flags = nflags;
135 /* fall through */
138 if (flags & HAMMER2_THREAD_UNFREEZE) {
139 nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
140 HAMMER2_THREAD_FROZEN |
141 HAMMER2_THREAD_CLIENTWAIT);
142 if (!atomic_cmpset_int(&thr->flags, flags, nflags))
143 continue;
144 if (flags & HAMMER2_THREAD_CLIENTWAIT)
145 wakeup(&thr->flags);
146 flags = nflags;
147 /* fall through */
151 * Force idle if frozen until unfrozen or stopped.
153 if (flags & HAMMER2_THREAD_FROZEN) {
154 nflags = flags | HAMMER2_THREAD_WAITING;
155 tsleep_interlock(&thr->flags, 0);
156 if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
157 tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
158 atomic_clear_int(&thr->flags,
159 HAMMER2_THREAD_WAITING);
161 continue;
165 * Reset state on REMASTER request
167 if (thr->flags & HAMMER2_THREAD_REMASTER) {
168 nflags = flags & ~HAMMER2_THREAD_REMASTER;
169 if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
170 /* reset state here */
172 continue;
176 * Synchronization scan.
178 if (hammer2_debug & 0x8000)
179 kprintf("sync_slaves pfs %s clindex %d\n",
180 pmp->pfs_names[thr->clindex], thr->clindex);
181 hammer2_trans_init(pmp, 0);
183 hammer2_inode_ref(pmp->iroot);
185 for (;;) {
186 int didbreak = 0;
187 /* XXX lock synchronize pmp->modify_tid */
188 error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
189 if (hammer2_debug & 0x8000) {
190 kprintf("sync_slaves error %d defer %p\n",
191 error, list.base);
193 if (error != EAGAIN)
194 break;
195 while ((defer = list.base) != NULL) {
196 hammer2_inode_t *nip;
198 nip = defer->ip;
199 error = hammer2_sync_slaves(thr, nip, &list, 0);
200 if (error && error != EAGAIN && error != ENOENT)
201 break;
202 if (hammer2_thr_break(thr)) {
203 didbreak = 1;
204 break;
208 * If no additional defers occurred we can
209 * remove this one, otherwise keep it on
210 * the list and retry once the additional
211 * defers have completed.
213 if (defer == list.base) {
214 --list.count;
215 list.base = defer->next;
216 kfree(defer, M_HAMMER2);
217 defer = NULL; /* safety */
218 hammer2_inode_drop(nip);
223 * If the thread is being remastered, frozen, or
224 * stopped, clean up any left-over deferals.
226 if (didbreak || (error && error != EAGAIN)) {
227 kprintf("didbreak\n");
228 while ((defer = list.base) != NULL) {
229 --list.count;
230 hammer2_inode_drop(defer->ip);
231 list.base = defer->next;
232 kfree(defer, M_HAMMER2);
234 if (error == 0 || error == EAGAIN)
235 error = EINPROGRESS;
236 break;
240 hammer2_inode_drop(pmp->iroot);
241 hammer2_trans_done(pmp);
243 if (error && error != EINPROGRESS)
244 kprintf("hammer2_sync_slaves: error %d\n", error);
247 * Wait for event, or 5-second poll.
249 nflags = flags | HAMMER2_THREAD_WAITING;
250 tsleep_interlock(&thr->flags, 0);
251 if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
252 tsleep(&thr->flags, 0, "h2idle", hz * 5);
253 atomic_clear_int(&thr->flags, HAMMER2_THREAD_WAITING);
256 thr->td = NULL;
257 hammer2_thr_return(thr, HAMMER2_THREAD_STOPPED);
258 /* thr structure can go invalid after this point */
259 wakeup(thr);
#if 0
/*
 * Given a locked cluster created from pmp->iroot, update the PFS's
 * reporting status.
 *
 * flags is masked with HAMMER2_CLUSTER_ZFLAGS.  Nothing is done when the
 * masked value equals pmp->cluster_flags; otherwise it is stored there and
 * a one-line summary of the new state is printed.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp = thr->pmp;

	flags &= HAMMER2_CLUSTER_ZFLAGS;
	if (pmp->cluster_flags == flags)
		return;
	pmp->cluster_flags = flags;

	kprintf("pfs %p", pmp);
	if (flags & HAMMER2_CLUSTER_MSYNCED)
		kprintf(" masters-all-good");
	if (flags & HAMMER2_CLUSTER_SSYNCED)
		kprintf(" slaves-all-good");

	if (flags & HAMMER2_CLUSTER_WRHARD)
		kprintf(" quorum/rw");
	else if (flags & HAMMER2_CLUSTER_RDHARD)
		kprintf(" quorum/ro");

	if (flags & HAMMER2_CLUSTER_UNHARD)
		kprintf(" out-of-sync-masters");
	else if (flags & HAMMER2_CLUSTER_NOHARD)
		kprintf(" no-masters-visible");

	if (flags & HAMMER2_CLUSTER_WRSOFT)
		kprintf(" soft/rw");
	else if (flags & HAMMER2_CLUSTER_RDSOFT)
		kprintf(" soft/ro");

	if (flags & HAMMER2_CLUSTER_UNSOFT)
		kprintf(" out-of-sync-slaves");
	else if (flags & HAMMER2_CLUSTER_NOSOFT)
		kprintf(" no-slaves-visible");
	kprintf("\n");
}
#endif
#if 0
/*
 * Debugging aid: print, for every index of the two clusters, the key of
 * the chain held by cparent and by cluster (or NULL), tagging entries
 * whose HAMMER2_CITEM_INVALID flag is set with "(I)".
 *
 * Does nothing unless bit 0 of hammer2_debug is set.  Both clusters must
 * have the same nchains.
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	if ((hammer2_debug & 1) == 0)
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);
	for (i = 0; i < cparent->nchains; ++i) {
		if (i)
			kprintf("\t");
		kprintf("%d ", i);
		if ((chain = cparent->array[i].chain) != NULL) {
			kprintf("%016jx%s ",
				chain->bref.key,
				((cparent->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : " ")
			);
		} else {
			kprintf(" NULL %s ", " ");
		}
		if ((chain = cluster->array[i].chain) != NULL) {
			kprintf("%016jx%s ",
				chain->bref.key,
				((cluster->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : " ")
			);
		} else {
			kprintf(" NULL %s ", " ");
		}
		kprintf("\n");
	}
}
#endif
349 * Each out of sync node sync-thread must issue an all-nodes XOP scan of
350 * the inode. This creates a multiplication effect since the XOP scan itself
351 * issues to all nodes. However, this is the only way we can safely
352 * synchronize nodes which might have disparate I/O bandwidths and the only
353 * way we can safely deal with stalled nodes.
355 static
357 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
358 hammer2_deferred_list_t *list, int isroot)
360 hammer2_xop_scanall_t *xop;
361 hammer2_chain_t *parent;
362 hammer2_chain_t *chain;
363 hammer2_pfs_t *pmp;
364 hammer2_key_t key_next;
365 hammer2_tid_t sync_tid;
366 int cache_index = -1;
367 int needrescan;
368 int want_update;
369 int error;
370 int nerror;
371 int idx;
372 int n;
374 pmp = ip->pmp;
375 idx = thr->clindex; /* cluster node we are responsible for */
376 needrescan = 0;
377 want_update = 0;
378 sync_tid = 0;
379 chain = NULL;
380 parent = NULL;
382 #if 0
384 * Nothing to do if all slaves are synchronized.
385 * Nothing to do if cluster not authoritatively readable.
387 if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
388 return(0);
389 if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
390 return(HAMMER2_ERROR_INCOMPLETE);
391 #endif
393 error = 0;
396 * Resolve the root inode of the PFS and determine if synchronization
397 * is needed by checking modify_tid.
400 hammer2_xop_ipcluster_t *xop2;
401 hammer2_chain_t *focus;
403 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
404 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
405 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
406 idx);
407 hammer2_inode_unlock(ip);
408 error = hammer2_xop_collect(&xop2->head, 0);
409 if (error == 0 && (focus = xop2->head.cluster.focus) != NULL) {
410 sync_tid = focus->bref.modify_tid; /* XXX */
411 chain = hammer2_inode_chain_and_parent(ip, idx,
412 &parent,
413 HAMMER2_RESOLVE_ALWAYS |
414 HAMMER2_RESOLVE_SHARED);
415 want_update = (chain->bref.modify_tid != sync_tid);
416 if (chain) {
417 hammer2_chain_unlock(chain);
418 hammer2_chain_drop(chain);
419 chain = NULL;
421 if (parent) {
422 hammer2_chain_unlock(parent);
423 hammer2_chain_drop(parent);
424 parent = NULL;
427 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
430 if (want_update == 0)
431 return(0);
434 * The inode is left unlocked during the scan. Issue a XOP
435 * that does *not* include our cluster index to iterate
436 * properly synchronized elements and resolve our cluster index
437 * against it.
439 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
440 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
441 xop->key_beg = HAMMER2_KEY_MIN;
442 xop->key_end = HAMMER2_KEY_MAX;
443 xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
444 HAMMER2_RESOLVE_ALWAYS;
445 xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
446 HAMMER2_LOOKUP_NODIRECT |
447 HAMMER2_LOOKUP_ALWAYS;
448 hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
449 parent = hammer2_inode_chain(ip, idx,
450 HAMMER2_RESOLVE_ALWAYS |
451 HAMMER2_RESOLVE_SHARED);
452 hammer2_inode_unlock(ip);
454 chain = hammer2_chain_lookup(&parent, &key_next,
455 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
456 &cache_index,
457 HAMMER2_LOOKUP_SHARED |
458 HAMMER2_LOOKUP_NODIRECT |
459 HAMMER2_LOOKUP_NODATA);
460 error = hammer2_xop_collect(&xop->head, 0);
461 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
462 ip->meta.name_key, chain,
463 (chain ? chain->bref.key : -1));
465 for (;;) {
467 * We are done if our scan is done and the XOP scan is done.
468 * We are done if the XOP scan failed (that is, we don't
469 * have authoritative data to synchronize with).
471 int advance_local = 0;
472 int advance_xop = 0;
473 int dodefer = 0;
474 hammer2_chain_t *focus;
476 if (chain == NULL && error == ENOENT)
477 break;
478 if (error && error != ENOENT)
479 break;
482 * Compare
484 if (chain && error == ENOENT) {
486 * If we have local chains but the XOP scan is done,
487 * the chains need to be deleted.
489 n = -1;
490 focus = NULL;
491 } else if (chain == NULL) {
493 * If our local scan is done but the XOP scan is not,
494 * we need to create the missing chain(s).
496 n = 1;
497 focus = xop->head.cluster.focus;
498 } else {
500 * Otherwise compare to determine the action
501 * needed.
503 focus = xop->head.cluster.focus;
504 n = hammer2_chain_cmp(chain, focus);
508 * Take action based on comparison results.
510 if (n < 0) {
512 * Delete extranious local data. This will
513 * automatically advance the chain.
515 nerror = hammer2_sync_destroy(thr, &parent, &chain,
516 0, idx);
517 } else if (n == 0 && chain->bref.modify_tid !=
518 focus->bref.modify_tid) {
520 * Matching key but local data or meta-data requires
521 * updating. If we will recurse, we still need to
522 * update to compatible content first but we do not
523 * synchronize modify_tid until the entire recursion
524 * has completed successfully.
526 * NOTE: Do not try to access hardlink pointers as if
527 * they were normal inodes, the inode cache will
528 * get seriously confused.
530 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE &&
531 focus->data->ipdata.meta.type !=
532 HAMMER2_OBJTYPE_HARDLINK) {
533 nerror = hammer2_sync_replace(
534 thr, parent, chain,
536 idx, focus, 0);
537 dodefer = 1;
538 } else {
539 nerror = hammer2_sync_replace(
540 thr, parent, chain,
541 focus->bref.modify_tid,
542 idx, focus, 0);
544 advance_local = 1;
545 advance_xop = 1;
546 } else if (n == 0) {
548 * 100% match, advance both
550 advance_local = 1;
551 advance_xop = 1;
552 nerror = 0;
553 } else if (n > 0) {
555 * Insert missing local data.
557 * If we will recurse, we still need to update to
558 * compatible content first but we do not synchronize
559 * modify_tid until the entire recursion has
560 * completed successfully.
562 * NOTE: Do not try to access hardlink pointers as if
563 * they were normal inodes, the inode cache will
564 * get seriously confused.
566 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE &&
567 focus->data->ipdata.meta.type !=
568 HAMMER2_OBJTYPE_HARDLINK) {
569 nerror = hammer2_sync_insert(
570 thr, &parent, &chain,
572 idx, focus);
573 dodefer = 2;
574 } else {
575 nerror = hammer2_sync_insert(
576 thr, &parent, &chain,
577 focus->bref.modify_tid,
578 idx, focus);
580 advance_local = 1;
581 advance_xop = 1;
585 * We cannot recurse depth-first because the XOP is still
586 * running in node threads for this scan. Create a placemarker
587 * by obtaining and record the hammer2_inode.
589 * We excluded our node from the XOP so we must temporarily
590 * add it to xop->head.cluster so it is properly incorporated
591 * into the inode.
593 * The deferral is pushed onto a LIFO list for bottom-up
594 * synchronization.
596 if (error == 0 && dodefer) {
597 hammer2_inode_t *nip;
598 hammer2_deferred_ip_t *defer;
600 KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
602 defer = kmalloc(sizeof(*defer), M_HAMMER2,
603 M_WAITOK | M_ZERO);
604 KKASSERT(xop->head.cluster.array[idx].chain == NULL);
605 xop->head.cluster.array[idx].flags =
606 HAMMER2_CITEM_INVALID;
607 xop->head.cluster.array[idx].chain = chain;
608 nip = hammer2_inode_get(pmp, ip,
609 &xop->head.cluster, idx);
610 xop->head.cluster.array[idx].chain = NULL;
612 hammer2_inode_ref(nip);
613 hammer2_inode_unlock(nip);
615 defer->next = list->base;
616 defer->ip = nip;
617 list->base = defer;
618 ++list->count;
619 needrescan = 1;
623 * If at least one deferral was added and the deferral
624 * list has grown too large, stop adding more. This
625 * will trigger an EAGAIN return.
627 if (needrescan && list->count > 1000)
628 break;
631 * Advancements for iteration.
633 if (advance_xop) {
634 error = hammer2_xop_collect(&xop->head, 0);
636 if (advance_local) {
637 chain = hammer2_chain_next(&parent, chain, &key_next,
638 key_next, HAMMER2_KEY_MAX,
639 &cache_index,
640 HAMMER2_LOOKUP_SHARED |
641 HAMMER2_LOOKUP_NODIRECT |
642 HAMMER2_LOOKUP_NODATA);
645 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
646 if (chain) {
647 hammer2_chain_unlock(chain);
648 hammer2_chain_drop(chain);
650 if (parent) {
651 hammer2_chain_unlock(parent);
652 hammer2_chain_drop(parent);
656 * If we added deferrals we want the caller to synchronize them
657 * and then call us again.
659 * NOTE: In this situation we do not yet want to synchronize our
660 * inode, setting the error code also has that effect.
662 if ((error == 0 || error == ENOENT) && needrescan)
663 error = EAGAIN;
666 * If no error occurred we can synchronize the inode meta-data
667 * and modify_tid. Only limited changes are made to PFSROOTs.
669 * XXX inode lock was lost
671 if (error == 0 || error == ENOENT) {
672 hammer2_xop_ipcluster_t *xop2;
673 hammer2_chain_t *focus;
675 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
676 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
677 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
678 idx);
679 hammer2_inode_unlock(ip);
680 error = hammer2_xop_collect(&xop2->head, 0);
681 if (error == 0) {
682 focus = xop2->head.cluster.focus;
683 kprintf("syncthr: update inode %p (%s)\n",
684 focus,
685 (focus ?
686 (char *)focus->data->ipdata.filename : "?"));
687 chain = hammer2_inode_chain_and_parent(ip, idx,
688 &parent,
689 HAMMER2_RESOLVE_ALWAYS |
690 HAMMER2_RESOLVE_SHARED);
692 KKASSERT(parent != NULL);
693 nerror = hammer2_sync_replace(
694 thr, parent, chain,
695 sync_tid,
696 idx, focus, isroot);
697 hammer2_chain_unlock(chain);
698 hammer2_chain_drop(chain);
699 hammer2_chain_unlock(parent);
700 hammer2_chain_drop(parent);
701 /* XXX */
703 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
706 return error;
710 * Create a missing chain by copying the focus from another device.
712 * On entry *parentp and focus are both locked shared. The chain will be
713 * created and returned in *chainp also locked shared.
715 static
717 hammer2_sync_insert(hammer2_thread_t *thr,
718 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
719 hammer2_tid_t mtid, int idx, hammer2_chain_t *focus)
721 hammer2_chain_t *chain;
722 hammer2_key_t dummy;
723 int cache_index = -1;
725 #if HAMMER2_SYNCHRO_DEBUG
726 if (hammer2_debug & 1)
727 kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
728 *parentp,
729 (*parentp)->bref.type,
730 (*parentp)->bref.key,
731 idx,
732 focus->bref.type, focus->bref.key, mtid);
733 #endif
736 * Parent requires an exclusive lock for the insertion.
737 * We must unlock the child to avoid deadlocks while
738 * relocking the parent.
740 if (*chainp) {
741 hammer2_chain_unlock(*chainp);
742 hammer2_chain_drop(*chainp);
743 *chainp = NULL;
745 hammer2_chain_unlock(*parentp);
746 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
749 * We must reissue the lookup to properly position (*parentp)
750 * for the insertion.
752 chain = hammer2_chain_lookup(parentp, &dummy,
753 focus->bref.key, focus->bref.key,
754 &cache_index,
755 HAMMER2_LOOKUP_NODIRECT |
756 HAMMER2_LOOKUP_ALWAYS);
757 KKASSERT(chain == NULL);
759 chain = NULL;
760 hammer2_chain_create(parentp, &chain,
761 thr->pmp, focus->bref.methods,
762 focus->bref.key, focus->bref.keybits,
763 focus->bref.type, focus->bytes,
764 mtid, 0, 0);
765 hammer2_chain_modify(chain, mtid, 0, 0);
768 * Copy focus to new chain
771 /* type already set */
772 chain->bref.methods = focus->bref.methods;
773 /* keybits already set */
774 chain->bref.vradix = focus->bref.vradix;
775 /* mirror_tid set by flush */
776 KKASSERT(chain->bref.modify_tid == mtid);
777 chain->bref.flags = focus->bref.flags;
778 /* key already present */
779 /* check code will be recalculated */
782 * Copy data body.
784 switch(chain->bref.type) {
785 case HAMMER2_BREF_TYPE_INODE:
786 if ((focus->data->ipdata.meta.op_flags &
787 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
788 /* do not copy block table */
789 bcopy(focus->data, chain->data,
790 offsetof(hammer2_inode_data_t, u));
791 break;
793 /* fall through copy whole thing */
794 case HAMMER2_BREF_TYPE_DATA:
795 bcopy(focus->data, chain->data, chain->bytes);
796 hammer2_chain_setcheck(chain, chain->data);
797 break;
798 default:
799 KKASSERT(0);
800 break;
803 hammer2_chain_unlock(chain); /* unlock, leave ref */
804 *chainp = chain; /* will be returned locked */
807 * Avoid ordering deadlock when relocking shared.
809 hammer2_chain_unlock(*parentp);
810 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
811 HAMMER2_RESOLVE_ALWAYS);
812 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
813 HAMMER2_RESOLVE_ALWAYS);
815 return 0;
819 * Destroy an extranious chain.
821 * Both *parentp and *chainp are locked shared.
823 * On return, *chainp will be adjusted to point to the next element in the
824 * iteration and locked shared.
826 static
828 hammer2_sync_destroy(hammer2_thread_t *thr,
829 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
830 hammer2_tid_t mtid, int idx)
832 hammer2_chain_t *chain;
833 hammer2_chain_t *parent;
834 hammer2_key_t key_next;
835 hammer2_key_t save_key;
836 int cache_index = -1;
838 chain = *chainp;
840 #if HAMMER2_SYNCHRO_DEBUG
841 if (hammer2_debug & 1)
842 kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
843 *parentp, chain,
844 idx, chain->bref.type, chain->bref.key);
845 #endif
847 save_key = chain->bref.key;
848 if (save_key != HAMMER2_KEY_MAX)
849 ++save_key;
852 * Try to avoid unnecessary I/O.
854 * XXX accounting not propagated up properly. We might have to do
855 * a RESOLVE_MAYBE here and pass 0 for the flags.
857 hammer2_chain_unlock(chain); /* relock exclusive */
858 hammer2_chain_unlock(*parentp);
859 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
860 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
862 hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
863 hammer2_chain_unlock(chain);
864 hammer2_chain_drop(chain);
865 chain = NULL; /* safety */
867 hammer2_chain_unlock(*parentp); /* relock shared */
868 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
869 HAMMER2_RESOLVE_ALWAYS);
870 *chainp = hammer2_chain_lookup(&parent, &key_next,
871 save_key, HAMMER2_KEY_MAX,
872 &cache_index,
873 HAMMER2_LOOKUP_SHARED |
874 HAMMER2_LOOKUP_NODIRECT |
875 HAMMER2_LOOKUP_NODATA);
876 return 0;
880 * cparent is locked exclusively, with an extra ref, cluster is not locked.
881 * Replace element [i] in the cluster.
883 static
885 hammer2_sync_replace(hammer2_thread_t *thr,
886 hammer2_chain_t *parent, hammer2_chain_t *chain,
887 hammer2_tid_t mtid, int idx,
888 hammer2_chain_t *focus, int isroot)
890 int nradix;
891 uint8_t otype;
893 #if HAMMER2_SYNCHRO_DEBUG
894 if (hammer2_debug & 1)
895 kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
896 chain,
897 idx,
898 focus->bref.type, focus->bref.key, mtid);
899 #endif
900 hammer2_chain_unlock(chain);
901 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
902 if (chain->bytes != focus->bytes) {
903 /* XXX what if compressed? */
904 nradix = hammer2_getradix(chain->bytes);
905 hammer2_chain_resize(NULL, parent, chain,
906 mtid, 0,
907 nradix, 0);
909 hammer2_chain_modify(chain, mtid, 0, 0);
910 otype = chain->bref.type;
911 chain->bref.type = focus->bref.type;
912 chain->bref.methods = focus->bref.methods;
913 chain->bref.keybits = focus->bref.keybits;
914 chain->bref.vradix = focus->bref.vradix;
915 /* mirror_tid updated by flush */
916 KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
917 chain->bref.flags = focus->bref.flags;
918 /* key already present */
919 /* check code will be recalculated */
920 chain->error = 0;
923 * Copy data body.
925 switch(chain->bref.type) {
926 case HAMMER2_BREF_TYPE_INODE:
928 * Special case PFSROOTs, only limited changes can be made
929 * since the meta-data contains miscellanious distinguishing
930 * fields.
932 if (isroot) {
933 chain->data->ipdata.meta.uflags =
934 focus->data->ipdata.meta.uflags;
935 chain->data->ipdata.meta.rmajor =
936 focus->data->ipdata.meta.rmajor;
937 chain->data->ipdata.meta.rminor =
938 focus->data->ipdata.meta.rminor;
939 chain->data->ipdata.meta.ctime =
940 focus->data->ipdata.meta.ctime;
941 chain->data->ipdata.meta.mtime =
942 focus->data->ipdata.meta.mtime;
943 chain->data->ipdata.meta.atime =
944 focus->data->ipdata.meta.atime;
945 /* not btime */
946 chain->data->ipdata.meta.uid =
947 focus->data->ipdata.meta.uid;
948 chain->data->ipdata.meta.gid =
949 focus->data->ipdata.meta.gid;
950 chain->data->ipdata.meta.mode =
951 focus->data->ipdata.meta.mode;
952 chain->data->ipdata.meta.ncopies =
953 focus->data->ipdata.meta.ncopies;
954 chain->data->ipdata.meta.comp_algo =
955 focus->data->ipdata.meta.comp_algo;
956 chain->data->ipdata.meta.check_algo =
957 focus->data->ipdata.meta.check_algo;
958 chain->data->ipdata.meta.data_quota =
959 focus->data->ipdata.meta.data_quota;
960 chain->data->ipdata.meta.inode_quota =
961 focus->data->ipdata.meta.inode_quota;
964 * last snapshot tid controls overwrite
966 if (chain->data->ipdata.meta.pfs_lsnap_tid <
967 focus->data->ipdata.meta.pfs_lsnap_tid) {
968 chain->data->ipdata.meta.pfs_lsnap_tid =
969 focus->data->ipdata.meta.pfs_lsnap_tid;
972 hammer2_chain_setcheck(chain, chain->data);
973 break;
977 * Normal replacement.
979 if ((focus->data->ipdata.meta.op_flags &
980 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
982 * If DIRECTDATA is transitioning to 0 or the old
983 * chain is not an inode we have to initialize
984 * the block table.
986 if (otype != HAMMER2_BREF_TYPE_INODE ||
987 (chain->data->ipdata.meta.op_flags &
988 HAMMER2_OPFLAG_DIRECTDATA)) {
989 kprintf("chain inode trans away from dd\n");
990 bzero(&chain->data->ipdata.u,
991 sizeof(chain->data->ipdata.u));
993 bcopy(focus->data, chain->data,
994 offsetof(hammer2_inode_data_t, u));
995 /* XXX setcheck on inode should not be needed */
996 hammer2_chain_setcheck(chain, chain->data);
997 break;
999 /* fall through */
1000 case HAMMER2_BREF_TYPE_DATA:
1001 bcopy(focus->data, chain->data, chain->bytes);
1002 hammer2_chain_setcheck(chain, chain->data);
1003 break;
1004 default:
1005 KKASSERT(0);
1006 break;
1009 hammer2_chain_unlock(chain);
1010 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
1011 HAMMER2_RESOLVE_MAYBE);
1013 return 0;