hammer2 - Remote xop implementation part 1
[dragonfly.git] / sys / vfs / hammer2 / hammer2_synchro.c
blobc79480c4d3fb14802df8b907695729394f5b9dcc
1 /*
2 * Copyright (c) 2015-2018 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 * This module implements the cluster synchronizer. Basically the way
36 * it works is that a thread is created for each cluster node in a PFS.
37 * This thread is responsible for synchronizing the current node using
38 * data from other nodes.
40 * Any out of sync master or slave can get back into synchronization as
41 * long as a quorum of masters agree on the update_tid. If a quorum is
42 * not available it may still be possible to synchronize to the highest
43 * available update_tid as a way of trying to catch up as much as possible
44 * until a quorum is available.
46 * If no quorum is possible (which can happen even if all masters are
47 * available, if the update_tid does not match), then manual intervention
48 * may be required to resolve discrepancies.
50 #include "hammer2.h"
52 typedef struct hammer2_deferred_ip {
53 struct hammer2_deferred_ip *next;
54 hammer2_inode_t *ip;
55 } hammer2_deferred_ip_t;
57 typedef struct hammer2_deferred_list {
58 hammer2_deferred_ip_t *base;
59 int count;
60 } hammer2_deferred_list_t;
63 #define HAMMER2_SYNCHRO_DEBUG 1
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66 hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69 nerror = hammer2_sync_insert(
70 thr, &parent, &chain,
71 focus->bref.modify_tid,
72 idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76 hammer2_tid_t modify_tid, int idx,
77 hammer2_xop_head_t *xop, hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80 hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82 hammer2_chain_t *parent, hammer2_chain_t *chain,
83 hammer2_tid_t mtid, int idx,
84 hammer2_xop_head_t *xop, hammer2_chain_t *focus,
85 int isroot);
87 /****************************************************************************
88 * HAMMER2 SYNC THREADS *
89 ****************************************************************************/
91 * Primary management thread for an element of a node. A thread will exist
92 * for each element requiring management.
94 * No management threads are needed for the SPMP or for any PMP with only
95 * a single MASTER.
97 * On the SPMP - handles bulkfree and dedup operations
98 * On a PFS - handles remastering and synchronization
100 void
101 hammer2_primary_sync_thread(void *arg)
103 hammer2_thread_t *thr = arg;
104 hammer2_pfs_t *pmp;
105 hammer2_deferred_list_t list;
106 hammer2_deferred_ip_t *defer;
107 int error;
108 uint32_t flags;
109 uint32_t nflags;
111 pmp = thr->pmp;
112 bzero(&list, sizeof(list));
114 for (;;) {
115 flags = thr->flags;
116 cpu_ccfence();
119 * Handle stop request
121 if (flags & HAMMER2_THREAD_STOP)
122 break;
125 * Handle freeze request
127 if (flags & HAMMER2_THREAD_FREEZE) {
128 nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
129 HAMMER2_THREAD_WAITING)) |
130 HAMMER2_THREAD_FROZEN;
131 if (!atomic_cmpset_int(&thr->flags, flags, nflags))
132 continue;
133 if (flags & HAMMER2_THREAD_WAITING)
134 wakeup(&thr->flags);
135 continue;
138 if (flags & HAMMER2_THREAD_UNFREEZE) {
139 nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
140 HAMMER2_THREAD_FROZEN |
141 HAMMER2_THREAD_WAITING);
142 if (!atomic_cmpset_int(&thr->flags, flags, nflags))
143 continue;
144 if (flags & HAMMER2_THREAD_WAITING)
145 wakeup(&thr->flags);
146 continue;
150 * Force idle if frozen until unfrozen or stopped.
152 if (flags & HAMMER2_THREAD_FROZEN) {
153 nflags = flags | HAMMER2_THREAD_WAITING;
155 tsleep_interlock(&thr->flags, 0);
156 if (atomic_cmpset_int(&thr->flags, flags, nflags))
157 tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
158 continue;
162 * Reset state on REMASTER request
164 if (thr->flags & HAMMER2_THREAD_REMASTER) {
165 nflags = flags & ~HAMMER2_THREAD_REMASTER;
166 if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
167 /* reset state here */
169 continue;
173 * Synchronization scan.
175 if (hammer2_debug & 0x8000)
176 kprintf("sync_slaves pfs %s clindex %d\n",
177 pmp->pfs_names[thr->clindex], thr->clindex);
178 hammer2_trans_init(pmp, 0);
180 hammer2_inode_ref(pmp->iroot);
182 for (;;) {
183 int didbreak = 0;
184 /* XXX lock synchronize pmp->modify_tid */
185 error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
186 if (hammer2_debug & 0x8000) {
187 kprintf("sync_slaves error %d defer %p\n",
188 error, list.base);
190 if (error != HAMMER2_ERROR_EAGAIN)
191 break;
192 while ((defer = list.base) != NULL) {
193 hammer2_inode_t *nip;
195 nip = defer->ip;
196 error = hammer2_sync_slaves(thr, nip, &list,
197 (nip == pmp->iroot));
198 if (error &&
199 error != HAMMER2_ERROR_EAGAIN &&
200 error != HAMMER2_ERROR_ENOENT) {
201 break;
203 if (hammer2_thr_break(thr)) {
204 didbreak = 1;
205 break;
209 * If no additional defers occurred we can
210 * remove this one, otherwise keep it on
211 * the list and retry once the additional
212 * defers have completed.
214 if (defer == list.base) {
215 --list.count;
216 list.base = defer->next;
217 kfree(defer, M_HAMMER2);
218 defer = NULL; /* safety */
219 hammer2_inode_drop(nip);
224 * If the thread is being remastered, frozen, or
225 * stopped, clean up any left-over deferals.
227 if (didbreak ||
228 (error && error != HAMMER2_ERROR_EAGAIN)) {
229 kprintf("didbreak\n");
230 while ((defer = list.base) != NULL) {
231 --list.count;
232 hammer2_inode_drop(defer->ip);
233 list.base = defer->next;
234 kfree(defer, M_HAMMER2);
236 if (error == 0 || error == HAMMER2_ERROR_EAGAIN)
237 error = HAMMER2_ERROR_EINPROGRESS;
238 break;
242 hammer2_inode_drop(pmp->iroot);
243 hammer2_trans_done(pmp, 0);
245 if (error && error != HAMMER2_ERROR_EINPROGRESS)
246 kprintf("hammer2_sync_slaves: error %d\n", error);
249 * Wait for event, or 5-second poll.
251 nflags = flags | HAMMER2_THREAD_WAITING;
252 tsleep_interlock(&thr->flags, 0);
253 if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
254 tsleep(&thr->flags, 0, "h2idle", hz * 5);
257 thr->td = NULL;
258 hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED);
259 /* thr structure can go invalid after this point */
#if 0
/*
 * Record the PFS cluster status and log it (currently compiled out).
 *
 * flags is masked with HAMMER2_CLUSTER_ZFLAGS.  If the result equals
 * pmp->cluster_flags nothing happens; otherwise it is stored there and
 * a one-line summary of the set status bits is printed.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp = thr->pmp;
	uint32_t status = flags & HAMMER2_CLUSTER_ZFLAGS;

	/* No change since the last report, stay quiet */
	if (status == pmp->cluster_flags)
		return;
	pmp->cluster_flags = status;

	kprintf("pfs %p", pmp);

	/* Fully-synchronized indicators (independent of each other) */
	if ((status & HAMMER2_CLUSTER_MSYNCED) != 0) {
		kprintf(" masters-all-good");
	}
	if ((status & HAMMER2_CLUSTER_SSYNCED) != 0) {
		kprintf(" slaves-all-good");
	}

	/* Hard access state: rw takes precedence over ro */
	if ((status & HAMMER2_CLUSTER_WRHARD) != 0) {
		kprintf(" quorum/rw");
	} else if ((status & HAMMER2_CLUSTER_RDHARD) != 0) {
		kprintf(" quorum/ro");
	}

	if ((status & HAMMER2_CLUSTER_UNHARD) != 0) {
		kprintf(" out-of-sync-masters");
	} else if ((status & HAMMER2_CLUSTER_NOHARD) != 0) {
		kprintf(" no-masters-visible");
	}

	/* Soft access state: rw takes precedence over ro */
	if ((status & HAMMER2_CLUSTER_WRSOFT) != 0) {
		kprintf(" soft/rw");
	} else if ((status & HAMMER2_CLUSTER_RDSOFT) != 0) {
		kprintf(" soft/ro");
	}

	if ((status & HAMMER2_CLUSTER_UNSOFT) != 0) {
		kprintf(" out-of-sync-slaves");
	} else if ((status & HAMMER2_CLUSTER_NOSOFT) != 0) {
		kprintf(" no-slaves-visible");
	}

	kprintf("\n");
}
#endif
#if 0
/*
 * Debug dump of two clusters side by side (currently compiled out).
 *
 * Does nothing unless bit 0 of hammer2_debug is set.  Asserts that both
 * clusters have the same nchains, then prints one line per index: the
 * index followed by the key and an "(I)" marker (when
 * HAMMER2_CITEM_INVALID is set) for the cparent element and then for
 * the cluster element, or a NULL placeholder when the slot is empty.
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *side[2];
	hammer2_chain_t *chain;
	int i;
	int s;

	if (!(hammer2_debug & 1))
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);

	/* cparent column first, then the cluster column */
	side[0] = cparent;
	side[1] = cluster;

	for (i = 0; i < cparent->nchains; ++i) {
		if (i != 0)
			kprintf("\t");
		kprintf("%d ", i);

		for (s = 0; s < 2; ++s) {
			chain = side[s]->array[i].chain;
			if (chain == NULL) {
				kprintf(" NULL %s ", " ");
				continue;
			}
			kprintf("%016jx%s ",
				chain->bref.key,
				((side[s]->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : " ")
			);
		}
		kprintf("\n");
	}
}
#endif
349 * Each out of sync node sync-thread must issue an all-nodes XOP scan of
350 * the inode. This creates a multiplication effect since the XOP scan itself
351 * issues to all nodes. However, this is the only way we can safely
352 * synchronize nodes which might have disparate I/O bandwidths and the only
353 * way we can safely deal with stalled nodes.
355 * XXX serror / merror rollup and handling.
357 static
359 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
360 hammer2_deferred_list_t *list, int isroot)
362 hammer2_xop_scanall_t *xop;
363 hammer2_chain_t *parent;
364 hammer2_chain_t *chain;
365 hammer2_pfs_t *pmp;
366 hammer2_key_t key_next;
367 hammer2_tid_t sync_tid;
368 int needrescan;
369 int want_update;
370 int serror; /* slave error */
371 int merror; /* master error (from xop_collect) */
372 int nerror; /* temporary error */
373 int idx;
374 int n;
376 pmp = ip->pmp;
377 idx = thr->clindex; /* cluster node we are responsible for */
378 needrescan = 0;
379 want_update = 0;
380 sync_tid = 0;
381 chain = NULL;
382 parent = NULL;
384 #if 0
386 * Nothing to do if all slaves are synchronized.
387 * Nothing to do if cluster not authoritatively readable.
389 if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
390 return(0);
391 if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
392 return(HAMMER2_ERROR_INCOMPLETE);
393 #endif
395 merror = 0;
398 * Resolve the root inode of the PFS and determine if synchronization
399 * is needed by checking modify_tid.
401 * Retain the synchronization TID from the focus inode and use it
402 * later to synchronize the focus inode if/when the recursion
403 * succeeds.
406 hammer2_xop_ipcluster_t *xop2;
407 hammer2_chain_t *focus;
409 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
410 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
411 hammer2_xop_start_except(&xop2->head, &hammer2_ipcluster_desc,
412 idx);
413 hammer2_inode_unlock(ip);
414 merror = hammer2_xop_collect(&xop2->head, 0);
415 if (merror == 0 && (focus = xop2->head.cluster.focus) != NULL) {
416 sync_tid = focus->bref.modify_tid;
417 chain = hammer2_inode_chain_and_parent(ip, idx,
418 &parent,
419 HAMMER2_RESOLVE_ALWAYS |
420 HAMMER2_RESOLVE_SHARED);
421 want_update = (chain->bref.modify_tid != sync_tid);
422 if (chain) {
423 hammer2_chain_unlock(chain);
424 hammer2_chain_drop(chain);
425 chain = NULL;
427 if (parent) {
428 hammer2_chain_unlock(parent);
429 hammer2_chain_drop(parent);
430 parent = NULL;
433 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
436 if (want_update == 0)
437 return(0);
440 * The inode is left unlocked during the scan. Issue a XOP
441 * that does *not* include our cluster index to iterate
442 * properly synchronized elements and resolve our cluster index
443 * against it.
445 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
446 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
447 xop->key_beg = HAMMER2_KEY_MIN;
448 xop->key_end = HAMMER2_KEY_MAX;
449 xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
450 HAMMER2_RESOLVE_ALWAYS;
451 xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
452 HAMMER2_LOOKUP_NODIRECT |
453 HAMMER2_LOOKUP_ALWAYS;
454 hammer2_xop_start_except(&xop->head, &hammer2_scanall_desc, idx);
455 parent = hammer2_inode_chain(ip, idx,
456 HAMMER2_RESOLVE_ALWAYS |
457 HAMMER2_RESOLVE_SHARED);
458 hammer2_inode_unlock(ip);
460 chain = hammer2_chain_lookup(&parent, &key_next,
461 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
462 &serror,
463 HAMMER2_LOOKUP_SHARED |
464 HAMMER2_LOOKUP_NODIRECT |
465 HAMMER2_LOOKUP_NODATA);
466 merror = hammer2_xop_collect(&xop->head, 0);
467 if (hammer2_debug & 0x8000) {
468 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
469 ip->meta.name_key, chain,
470 (chain ? chain->bref.key : -1));
473 for (;;) {
475 * We are done if our scan is done and the XOP scan is done.
476 * We are done if the XOP scan failed (that is, we don't
477 * have authoritative data to synchronize with).
479 int advance_local = 0;
480 int advance_xop = 0;
481 int dodefer = 0;
482 hammer2_chain_t *focus;
484 if (chain == NULL && merror == HAMMER2_ERROR_ENOENT)
485 break;
486 if (merror && merror != HAMMER2_ERROR_ENOENT)
487 break;
490 * Compare
492 if (chain && merror == HAMMER2_ERROR_ENOENT) {
494 * If we have local chains but the XOP scan is done,
495 * the chains need to be deleted.
497 n = -1;
498 focus = NULL;
499 } else if (chain == NULL) {
501 * If our local scan is done but the XOP scan is not,
502 * we need to create the missing chain(s).
504 n = 1;
505 focus = xop->head.cluster.focus;
506 } else {
508 * Otherwise compare to determine the action
509 * needed.
511 focus = xop->head.cluster.focus;
512 n = hammer2_chain_cmp(chain, focus);
516 * Take action based on comparison results.
518 if (n < 0) {
520 * Delete extranious local data. This will
521 * automatically advance the chain.
523 nerror = hammer2_sync_destroy(thr, &parent, &chain,
524 0, idx);
525 } else if (n == 0 && chain->bref.modify_tid !=
526 focus->bref.modify_tid) {
528 * Matching key but local data or meta-data requires
529 * updating. If we will recurse, we still need to
530 * update to compatible content first but we do not
531 * synchronize modify_tid until the entire recursion
532 * has completed successfully.
534 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
535 nerror = hammer2_sync_replace(
536 thr, parent, chain,
538 idx, &xop->head, focus, 0);
539 dodefer = 1;
540 } else {
541 nerror = hammer2_sync_replace(
542 thr, parent, chain,
543 focus->bref.modify_tid,
544 idx, &xop->head, focus, 0);
546 advance_local = 1;
547 advance_xop = 1;
548 } else if (n == 0) {
550 * 100% match, advance both
552 advance_local = 1;
553 advance_xop = 1;
554 nerror = 0;
555 } else if (n > 0) {
557 * Insert missing local data.
559 * If we will recurse, we still need to update to
560 * compatible content first but we do not synchronize
561 * modify_tid until the entire recursion has
562 * completed successfully.
564 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
565 nerror = hammer2_sync_insert(
566 thr, &parent, &chain,
568 idx, &xop->head, focus);
569 dodefer = 2;
570 } else {
571 nerror = hammer2_sync_insert(
572 thr, &parent, &chain,
573 focus->bref.modify_tid,
574 idx, &xop->head, focus);
576 advance_local = 1;
577 advance_xop = 1;
581 * We cannot recurse depth-first because the XOP is still
582 * running in node threads for this scan. Create a placemarker
583 * by obtaining and record the hammer2_inode.
585 * We excluded our node from the XOP so we must temporarily
586 * add it to xop->head.cluster so it is properly incorporated
587 * into the inode.
589 * The deferral is pushed onto a LIFO list for bottom-up
590 * synchronization.
592 if (merror == 0 && dodefer) {
593 hammer2_inode_t *nip;
594 hammer2_deferred_ip_t *defer;
596 KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
598 defer = kmalloc(sizeof(*defer), M_HAMMER2,
599 M_WAITOK | M_ZERO);
600 KKASSERT(xop->head.cluster.array[idx].chain == NULL);
601 xop->head.cluster.array[idx].flags =
602 HAMMER2_CITEM_INVALID;
603 xop->head.cluster.array[idx].chain = chain;
604 nip = hammer2_inode_get(pmp, ip, &xop->head, idx);
605 xop->head.cluster.array[idx].chain = NULL;
607 hammer2_inode_ref(nip);
608 hammer2_inode_unlock(nip);
610 defer->next = list->base;
611 defer->ip = nip;
612 list->base = defer;
613 ++list->count;
614 needrescan = 1;
618 * If at least one deferral was added and the deferral
619 * list has grown too large, stop adding more. This
620 * will trigger an HAMMER2_ERROR_EAGAIN return.
622 if (needrescan && list->count > 1000)
623 break;
626 * Advancements for iteration.
628 if (advance_xop) {
629 merror = hammer2_xop_collect(&xop->head, 0);
631 if (advance_local) {
632 chain = hammer2_chain_next(&parent, chain, &key_next,
633 key_next, HAMMER2_KEY_MAX,
634 &serror,
635 HAMMER2_LOOKUP_SHARED |
636 HAMMER2_LOOKUP_NODIRECT |
637 HAMMER2_LOOKUP_NODATA);
640 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
641 if (chain) {
642 hammer2_chain_unlock(chain);
643 hammer2_chain_drop(chain);
645 if (parent) {
646 hammer2_chain_unlock(parent);
647 hammer2_chain_drop(parent);
651 * If we added deferrals we want the caller to synchronize them
652 * and then call us again.
654 * NOTE: In this situation we do not yet want to synchronize our
655 * inode, setting the error code also has that effect.
657 if ((merror == 0 || merror == HAMMER2_ERROR_ENOENT) && needrescan)
658 merror = HAMMER2_ERROR_EAGAIN;
661 * If no error occurred we can synchronize the inode meta-data
662 * and modify_tid. Only limited changes are made to PFSROOTs.
664 * XXX inode lock was lost
666 if (merror == 0 || merror == HAMMER2_ERROR_ENOENT) {
667 hammer2_xop_ipcluster_t *xop2;
668 hammer2_chain_t *focus;
670 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
671 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
672 hammer2_xop_start_except(&xop2->head, &hammer2_ipcluster_desc,
673 idx);
674 hammer2_inode_unlock(ip);
675 merror = hammer2_xop_collect(&xop2->head, 0);
676 if (merror == 0) {
677 focus = xop2->head.cluster.focus;
678 if ((hammer2_debug & 0x8000) && focus) {
679 const char *filename;
681 filename = hammer2_xop_gdata(&xop2->head)->
682 ipdata.filename;
683 kprintf("syncthr: update inode %p (%s)\n",
684 focus, filename);
685 hammer2_xop_pdata(&xop2->head);
687 chain = hammer2_inode_chain_and_parent(ip, idx,
688 &parent,
689 HAMMER2_RESOLVE_ALWAYS |
690 HAMMER2_RESOLVE_SHARED);
692 KKASSERT(parent != NULL);
693 nerror = hammer2_sync_replace(
694 thr, parent, chain,
695 sync_tid,
696 idx, &xop2->head, focus, isroot);
697 hammer2_chain_unlock(chain);
698 hammer2_chain_drop(chain);
699 hammer2_chain_unlock(parent);
700 hammer2_chain_drop(parent);
701 /* XXX */
703 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
706 return merror;
710 * Create a missing chain by copying the focus from another device.
712 * On entry *parentp and focus are both locked shared. The chain will be
713 * created and returned in *chainp also locked shared.
715 static
717 hammer2_sync_insert(hammer2_thread_t *thr,
718 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
719 hammer2_tid_t mtid, int idx, hammer2_xop_head_t *xop,
720 hammer2_chain_t *focus)
722 hammer2_chain_t *chain;
723 hammer2_key_t dummy;
724 int error;
726 #if HAMMER2_SYNCHRO_DEBUG
727 if (hammer2_debug & 1)
728 kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
729 *parentp,
730 (*parentp)->bref.type,
731 (*parentp)->bref.key,
732 idx,
733 focus->bref.type, focus->bref.key, mtid);
734 #endif
737 * Parent requires an exclusive lock for the insertion.
738 * We must unlock the child to avoid deadlocks while
739 * relocking the parent.
741 if (*chainp) {
742 hammer2_chain_unlock(*chainp);
743 hammer2_chain_drop(*chainp);
744 *chainp = NULL;
746 hammer2_chain_unlock(*parentp);
747 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
750 * We must reissue the lookup to properly position (*parentp)
751 * for the insertion.
753 chain = hammer2_chain_lookup(parentp, &dummy,
754 focus->bref.key, focus->bref.key,
755 &error,
756 HAMMER2_LOOKUP_NODIRECT |
757 HAMMER2_LOOKUP_ALWAYS);
758 KKASSERT(chain == NULL);
760 chain = NULL;
761 error = hammer2_chain_create(parentp, &chain,
762 thr->pmp, focus->bref.methods,
763 focus->bref.key, focus->bref.keybits,
764 focus->bref.type, focus->bytes,
765 mtid, 0, 0);
766 if (error == 0) {
767 const hammer2_media_data_t *data;
769 error = hammer2_chain_modify(chain, mtid, 0, 0);
770 if (error)
771 goto failed;
774 * Copy focus to new chain
777 /* type already set */
778 chain->bref.methods = focus->bref.methods;
779 /* keybits already set */
780 chain->bref.vradix = focus->bref.vradix;
781 /* mirror_tid set by flush */
782 KKASSERT(chain->bref.modify_tid == mtid);
783 chain->bref.flags = focus->bref.flags;
784 /* key already present */
785 /* check code will be recalculated */
788 * Copy data body.
790 switch(chain->bref.type) {
791 case HAMMER2_BREF_TYPE_INODE:
792 data = hammer2_xop_gdata(xop);
794 if ((data->ipdata.meta.op_flags &
795 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
796 /* do not copy block table */
797 bcopy(data, chain->data,
798 offsetof(hammer2_inode_data_t, u));
799 hammer2_xop_pdata(xop);
800 break;
802 hammer2_xop_pdata(xop);
803 /* fall through copy whole thing */
804 case HAMMER2_BREF_TYPE_DATA:
805 data = hammer2_xop_gdata(xop);
806 bcopy(data, chain->data, chain->bytes);
807 hammer2_chain_setcheck(chain, chain->data);
808 hammer2_xop_pdata(xop);
809 break;
810 case HAMMER2_BREF_TYPE_DIRENT:
812 * Directory entries embed data in the blockref.
814 if (chain->bytes) {
815 data = hammer2_xop_gdata(xop);
816 bcopy(data, chain->data, chain->bytes);
817 hammer2_chain_setcheck(chain, chain->data);
818 hammer2_xop_pdata(xop);
819 } else {
820 chain->bref.check = focus->bref.check;
822 chain->bref.embed = focus->bref.embed;
823 break;
824 default:
825 KKASSERT(0);
826 break;
830 failed:
831 if (chain)
832 hammer2_chain_unlock(chain); /* unlock, leave ref */
833 *chainp = chain; /* will be returned locked */
836 * Avoid an ordering deadlock when relocking shared.
838 hammer2_chain_unlock(*parentp);
839 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
840 HAMMER2_RESOLVE_ALWAYS);
841 if (chain) {
842 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
843 HAMMER2_RESOLVE_ALWAYS);
844 error = chain->error;
847 return error;
851 * Destroy an extranious chain.
853 * Both *parentp and *chainp are locked shared.
855 * On return, *chainp will be adjusted to point to the next element in the
856 * iteration and locked shared.
858 static
860 hammer2_sync_destroy(hammer2_thread_t *thr,
861 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
862 hammer2_tid_t mtid, int idx)
864 hammer2_chain_t *chain;
865 hammer2_key_t key_next;
866 hammer2_key_t save_key;
867 int error;
869 chain = *chainp;
871 #if HAMMER2_SYNCHRO_DEBUG
872 if (hammer2_debug & 1)
873 kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
874 *parentp, chain,
875 idx, chain->bref.type, chain->bref.key);
876 #endif
878 save_key = chain->bref.key;
879 if (save_key != HAMMER2_KEY_MAX)
880 ++save_key;
883 * Try to avoid unnecessary I/O.
885 * XXX accounting not propagated up properly. We might have to do
886 * a RESOLVE_MAYBE here and pass 0 for the flags.
888 hammer2_chain_unlock(chain); /* relock exclusive */
889 hammer2_chain_unlock(*parentp);
890 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
891 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
893 hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
894 hammer2_chain_unlock(chain);
895 hammer2_chain_drop(chain);
896 chain = NULL; /* safety */
898 hammer2_chain_unlock(*parentp); /* relock shared */
899 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
900 HAMMER2_RESOLVE_ALWAYS);
901 *chainp = hammer2_chain_lookup(parentp, &key_next,
902 save_key, HAMMER2_KEY_MAX,
903 &error,
904 HAMMER2_LOOKUP_SHARED |
905 HAMMER2_LOOKUP_NODIRECT |
906 HAMMER2_LOOKUP_NODATA);
907 return error;
911 * cparent is locked exclusively, with an extra ref, cluster is not locked.
912 * Replace element [i] in the cluster.
914 static
916 hammer2_sync_replace(hammer2_thread_t *thr,
917 hammer2_chain_t *parent, hammer2_chain_t *chain,
918 hammer2_tid_t mtid, int idx,
919 hammer2_xop_head_t *xop, hammer2_chain_t *focus,
920 int isroot)
922 uint8_t otype;
923 int nradix;
924 int error;
926 #if HAMMER2_SYNCHRO_DEBUG
927 if (hammer2_debug & 1)
928 kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
929 chain,
930 idx,
931 focus->bref.type, focus->bref.key, mtid);
932 #endif
933 hammer2_chain_unlock(chain);
934 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
935 error = chain->error;
936 if (error == 0) {
937 const hammer2_media_data_t *data;
939 if (chain->bytes != focus->bytes) {
940 /* XXX what if compressed? */
941 nradix = hammer2_getradix(chain->bytes);
942 error = hammer2_chain_resize(chain, mtid, 0, nradix, 0);
943 if (error)
944 goto failed;
946 error = hammer2_chain_modify(chain, mtid, 0, 0);
947 if (error)
948 goto failed;
949 otype = chain->bref.type;
950 data = hammer2_xop_gdata(xop);
951 chain->bref.type = focus->bref.type;
952 chain->bref.methods = focus->bref.methods;
953 chain->bref.keybits = focus->bref.keybits;
954 chain->bref.vradix = focus->bref.vradix;
955 /* mirror_tid updated by flush */
956 KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
957 chain->bref.flags = focus->bref.flags;
958 /* key already present */
959 /* check code will be recalculated */
962 * Copy data body.
964 switch(chain->bref.type) {
965 case HAMMER2_BREF_TYPE_INODE:
967 * Special case PFSROOTs, only limited changes can
968 * be made since the meta-data contains miscellanious
969 * distinguishing fields.
971 if (isroot) {
972 chain->data->ipdata.meta.uflags =
973 data->ipdata.meta.uflags;
974 chain->data->ipdata.meta.rmajor =
975 data->ipdata.meta.rmajor;
976 chain->data->ipdata.meta.rminor =
977 data->ipdata.meta.rminor;
978 chain->data->ipdata.meta.ctime =
979 data->ipdata.meta.ctime;
980 chain->data->ipdata.meta.mtime =
981 data->ipdata.meta.mtime;
982 chain->data->ipdata.meta.atime =
983 data->ipdata.meta.atime;
984 /* not btime */
985 chain->data->ipdata.meta.uid =
986 data->ipdata.meta.uid;
987 chain->data->ipdata.meta.gid =
988 data->ipdata.meta.gid;
989 chain->data->ipdata.meta.mode =
990 data->ipdata.meta.mode;
991 chain->data->ipdata.meta.ncopies =
992 data->ipdata.meta.ncopies;
993 chain->data->ipdata.meta.comp_algo =
994 data->ipdata.meta.comp_algo;
995 chain->data->ipdata.meta.check_algo =
996 data->ipdata.meta.check_algo;
997 chain->data->ipdata.meta.data_quota =
998 data->ipdata.meta.data_quota;
999 chain->data->ipdata.meta.inode_quota =
1000 data->ipdata.meta.inode_quota;
1003 * last snapshot tid controls overwrite
1005 if (chain->data->ipdata.meta.pfs_lsnap_tid <
1006 data->ipdata.meta.pfs_lsnap_tid) {
1007 chain->data->ipdata.meta.pfs_lsnap_tid =
1008 data->ipdata.meta.pfs_lsnap_tid;
1011 hammer2_chain_setcheck(chain, chain->data);
1012 break;
1016 * Normal replacement.
1018 if ((data->ipdata.meta.op_flags &
1019 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1021 * If DIRECTDATA is transitioning to 0 or the
1022 * old chain is not an inode we have to
1023 * initialize the block table.
1025 if (otype != HAMMER2_BREF_TYPE_INODE ||
1026 (chain->data->ipdata.meta.op_flags &
1027 HAMMER2_OPFLAG_DIRECTDATA)) {
1028 kprintf("chain inode trans "
1029 "away from dd\n");
1030 bzero(&chain->data->ipdata.u,
1031 sizeof(chain->data->ipdata.u));
1033 bcopy(data, chain->data,
1034 offsetof(hammer2_inode_data_t, u));
1035 /* XXX setcheck on inode should not be needed */
1036 hammer2_chain_setcheck(chain, chain->data);
1037 break;
1039 /* fall through */
1040 case HAMMER2_BREF_TYPE_DATA:
1041 bcopy(data, chain->data, chain->bytes);
1042 hammer2_chain_setcheck(chain, chain->data);
1043 break;
1044 case HAMMER2_BREF_TYPE_DIRENT:
1046 * Directory entries embed data in the blockref.
1048 if (chain->bytes) {
1049 bcopy(data, chain->data, chain->bytes);
1050 hammer2_chain_setcheck(chain, chain->data);
1051 } else {
1052 chain->bref.check = focus->bref.check;
1054 chain->bref.embed = focus->bref.embed;
1055 break;
1056 default:
1057 KKASSERT(0);
1058 break;
1060 hammer2_xop_pdata(xop);
1063 failed:
1064 hammer2_chain_unlock(chain);
1065 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
1066 HAMMER2_RESOLVE_MAYBE);
1068 return error;