/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells how to recurse downward to find these chains.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT	10	/* stack recursion limit */

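/*
 * Chains deeper than HAMMER2_FLUSH_DEPTH_LIMIT are not flushed in-line.
 * They are placed on a deferral list and re-flushed after the recursion
 * stack unwinds; see the deferral loop in hammer2_flush() below.
 */
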
/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	hammer2_chain_t	*parent;
	hammer2_trans_t	*trans;
	int		depth;
	int		diddeferral;
	int		cache_index;
	struct h2_flush_list flushq;
	hammer2_xid_t	sync_xid;	/* memory synchronization point */
	hammer2_chain_t	*debug;
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int istop);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_dev or
 * hammer2_pfs.
 */
void
hammer2_trans_manage_init(hammer2_trans_manage_t *tman)
{
	lockinit(&tman->translk, "h2trans", 0, 0);
	TAILQ_INIT(&tman->transq);
	tman->flush_xid = 1;
	tman->alloc_xid = tman->flush_xid + 1;
}

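/*
 * Allocate a new transaction id.  The retry loop skips a result of 0
 * when the 32-bit allocation counter wraps, presumably reserving xid 0
 * as an "unallocated" sentinel.
 */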
hammer2_xid_t
hammer2_trans_newxid(hammer2_pfs_t *pmp)
{
	hammer2_xid_t xid;

	for (;;) {
		xid = atomic_fetchadd_int(&pmp->tmanage.alloc_xid, 1);
		if (xid)
			break;
	}
	return xid;
}

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfs_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &pmp->tmanage;

	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 */
		++tman->flushcnt;
		++pmp->modify_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		trans->modify_tid = pmp->modify_tid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->modify_tid = head->modify_tid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/*
			 * Normal transaction
			 */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}

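/*
 * Illustrative sketch (hammer2_flush_example is a hypothetical helper,
 * not part of HAMMER2): the basic lifecycle of a flush transaction as
 * driven from, e.g., the VFS_SYNC path.  The chain is assumed to be
 * locked and referenced by the caller, per hammer2_flush()'s contract.
 */
#if 0
static void
hammer2_flush_example(hammer2_pfs_t *pmp, hammer2_chain_t *chain)
{
	hammer2_trans_t trans;

	/* serialize against other flushes, allocate a flush TID/XID */
	hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);

	/* istop != 0: treat chain as the top of this flush */
	hammer2_flush(&trans, chain, 1);

	/* dequeue ourselves and unblock any waiting transactions */
	hammer2_trans_done(&trans);
}
#endif
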
void
hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &pmp->tmanage;
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	if (tman->flushcnt) {
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
	}
	lockmgr(&tman->translk, LK_RELEASE);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &trans->pmp->tmanage;

	/*
	 * Remove.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Chains undergoing destruction are removed from the in-memory topology.
 * To avoid getting lost these chains are placed on the delayed flush
 * queue which will properly dispose of them.
 *
 * We do this instead of issuing an immediate flush in order to give
 * recursive deletions (rm -rf, etc) a chance to remove more of the
 * hierarchy, potentially allowing an enormous amount of write I/O to
 * be avoided.
 */
void
hammer2_delayed_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) {
		hammer2_spin_ex(&chain->hmp->list_spin);
		if ((chain->flags & (HAMMER2_CHAIN_DELAYED |
				     HAMMER2_CHAIN_DEFERRED)) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELAYED |
						      HAMMER2_CHAIN_DEFERRED);
			TAILQ_INSERT_TAIL(&chain->hmp->flushq,
					  chain, flush_node);
			hammer2_chain_ref(chain);
		}
		hammer2_spin_unex(&chain->hmp->list_spin);
	}
}

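/*
 * (Chains queued above are not lost: hammer2_flush() concatenates
 * hmp->flushq into its local flushq at the top of its loop and disposes
 * of each entry as a deferred flush.)
 */
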
/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications, modify_tid,
 * and mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain, int istop)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;
	hammer2_dev_t *hmp;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flushq);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	/*
	 * Calculate parent (can be NULL), if not NULL the flush core
	 * expects the parent to be referenced so it can easily lock/unlock
	 * it without it getting ripped up.
	 */
	if ((info.parent = chain->parent) != NULL)
		hammer2_chain_ref(info.parent);

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	hmp = chain->hmp;
	loops = 0;

	for (;;) {
		/*
		 * Move hmp->flushq to info.flushq if non-empty so it can
		 * be processed.
		 */
		if (TAILQ_FIRST(&hmp->flushq) != NULL) {
			hammer2_spin_ex(&chain->hmp->list_spin);
			TAILQ_CONCAT(&info.flushq, &hmp->flushq, flush_node);
			hammer2_spin_unex(&chain->hmp->list_spin);
		}

		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flushq, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED |
						       HAMMER2_CHAIN_DELAYED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_flush(trans, scan, 0);
			hammer2_chain_unlock(scan);
			hammer2_chain_drop(scan);	/* ref from deferral */
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, chain, istop);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flushq))
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);
	if (info.parent)
		hammer2_chain_drop(info.parent);
}

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 * WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid is consistent only within a PFS, and will not be
 * consistent during synchronization.  mirror_tid is consistent across the
 * block device regardless of the PFS.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
		   int istop)
{
	hammer2_chain_t *parent;
	hammer2_dev_t *hmp;
	int diddeferral;

	/*
	 * (1) Optimize downward recursion to locate nodes needing action.
	 *     Nothing to do if none of these flags are set.
	 */
	if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
		if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = chain;
		} else {
			return;
		}
	}

	hmp = chain->hmp;
	diddeferral = info->diddeferral;
	parent = info->parent;		/* can be NULL */

	/*
	 * Downward search recursion
	 */
	if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) {
		/*
		 * Already deferred.
		 */
		++info->diddeferral;
	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
		/*
		 * Recursion depth reached.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELAYED) == 0);
		hammer2_chain_ref(chain);
		TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
		++info->diddeferral;
	} else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) && istop == 0) {
		/*
		 * We do not recurse through PFSROOTs.  PFSROOT flushes are
		 * handled by the related pmp's (whether mounted or not,
		 * including during recovery).
		 *
		 * But we must still process the PFSROOT chains for block
		 * table updates in their parent (which IS part of our flush).
		 *
		 * Note that the volume root, vchain, does not set this flag.
		 */
		;
	} else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
		/*
		 * Downward recursion search (actual flush occurs bottom-up).
		 * Pre-clear ONFLUSH.  It can get set again due to races,
		 * which we want so the scan finds us again in the next flush.
		 *
		 * Flush recursions stop at PFSROOT boundaries.  Each PFS
		 * must be individually flushed and then the root must
		 * be flushed.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
		info->parent = chain;
		hammer2_spin_ex(&chain->core.spin);
		RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
			NULL, hammer2_flush_recurse, info);
		hammer2_spin_unex(&chain->core.spin);
		info->parent = parent;
		if (info->diddeferral)
			hammer2_chain_setflush(info->trans, chain);
	}

	/*
	 * Now we are in the bottom-up part of the recursion.
	 *
	 * Do not update chain if lower layers were deferred.
	 */
	if (info->diddeferral)
		goto done;

	/*
	 * Propagate the DESTROY flag downwards.  This dummies up the flush
	 * code and tries to invalidate related buffer cache buffers to
	 * avoid the disk write.
	 */
	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

	/*
	 * Chain was already modified or has become modified, flush it out.
	 */
again:
	if ((hammer2_debug & 0x200) &&
	    info->debug &&
	    (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
		hammer2_chain_t *scan = chain;

		kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
		while (scan) {
			kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
				scan, scan->flags,
				scan->bref.key, scan->bref.type);
			if (scan == info->debug)
				break;
			scan = scan->parent;
		}
	}

	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * Dispose of the modified bit.
		 *
		 * UPDATE should already be set.
		 * bref.mirror_tid should already be set.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
			 chain == &hmp->vchain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);

		/*
		 * Manage threads waiting for excessive dirty memory to
		 * be retired.
		 */
		if (chain->pmp)
			hammer2_pfs_memory_wakeup(chain->pmp);

		if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
		    chain == &hmp->vchain ||
		    chain == &hmp->fchain) {
			/*
			 * Drop the ref from the MODIFIED bit we cleared,
			 * net -1 ref.
			 */
			hammer2_chain_drop(chain);
		} else {
			/*
			 * Drop the ref from the MODIFIED bit we cleared and
			 * set a ref for the UPDATE bit we are setting.  Net
			 * 0 refs.
			 */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}

		/*
		 * Issue the flush.  This is indirect via the DIO.
		 *
		 * NOTE: A DELETED node that reaches this point must be
		 *	 flushed for synchronization point consistency.
		 *
		 * NOTE: Even though MODIFIED was already set, the related DIO
		 *	 might not be dirty due to a system buffer cache
		 *	 flush and must be set dirty if we are going to make
		 *	 further modifications to the buffer.  Chains with
		 *	 embedded data don't need this.
		 */
		if (hammer2_debug & 0x1000) {
			kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
				"data=%016jx\n",
				chain, chain->bref.type,
				chain->bref.key, chain->bref.keybits,
				info->sync_xid,
				chain->bref.data_off);
		}
		if (hammer2_debug & 0x2000) {
			Debugger("Flush hell");
		}

		/*
		 * Update chain CRCs for flush.
		 *
		 * NOTE: Volume headers are NOT flushed here as they require
		 *	 special processing.
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Update the volume header's freemap_tid to the
			 * freemap's flushing mirror_tid.
			 *
			 * (note: embedded data, do not call setdirty)
			 */
			KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
			KKASSERT(chain == &hmp->fchain);
			hmp->voldata.freemap_tid = chain->bref.mirror_tid;
			kprintf("sync freemap mirror_tid %08jx\n",
				(intmax_t)chain->bref.mirror_tid);

			/*
			 * The freemap can be flushed independently of the
			 * main topology, but for the case where it is
			 * flushed in the same transaction, and flushed
			 * before vchain (a case we want to allow for
			 * performance reasons), make sure modifications
			 * made during the flush under vchain use a new
			 * transaction id.
			 *
			 * Otherwise the mount recovery code will get confused.
			 */
			++hmp->voldata.mirror_tid;
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			/*
			 * The free block table is flushed by
			 * hammer2_vfs_sync() before it flushes vchain.
			 * We must still hold fchain locked while copying
			 * voldata to volsync, however.
			 *
			 * (note: embedded data, do not call setdirty)
			 */
			hammer2_chain_lock(&hmp->fchain,
					   HAMMER2_RESOLVE_ALWAYS);
			hammer2_voldata_lock(hmp);
			kprintf("sync volume mirror_tid %08jx\n",
				(intmax_t)chain->bref.mirror_tid);

			/*
			 * Update the volume header's mirror_tid to the
			 * main topology's flushing mirror_tid.  It is
			 * possible that voldata.mirror_tid is already
			 * beyond bref.mirror_tid due to the bump we made
			 * above in BREF_TYPE_FREEMAP.
			 */
			if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
				hmp->voldata.mirror_tid =
					chain->bref.mirror_tid;
			}

			/*
			 * The volume header is flushed manually by the
			 * syncer, not here.  All we do here is adjust the
			 * crc's.
			 */
			KKASSERT(chain->data != NULL);
			KKASSERT(chain->dio == NULL);

			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC1_OFF,
					HAMMER2_VOLUME_ICRC1_SIZE);
			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC0_OFF,
					HAMMER2_VOLUME_ICRC0_SIZE);
			hmp->voldata.icrc_volheader =
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRCVH_OFF,
					HAMMER2_VOLUME_ICRCVH_SIZE);

			kprintf("syncvolhdr %016jx %016jx\n",
				hmp->voldata.mirror_tid,
				hmp->vchain.bref.mirror_tid);
			hmp->volsync = hmp->voldata;
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
			hammer2_voldata_unlock(hmp);
			hammer2_chain_unlock(&hmp->fchain);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Data elements have already been flushed via the
			 * logical file buffer cache.  Their hash was set in
			 * the bref by the vop_write code.  Do not re-dirty.
			 *
			 * Make sure any device buffer(s) have been flushed
			 * out here (there aren't usually any to flush) XXX.
			 */
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			/*
			 * Buffer I/O will be cleaned up when the volume is
			 * flushed (but the kernel is free to flush it before
			 * then, as well).
			 */
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * NOTE: We must call io_setdirty() to make any late
			 *	 changes to the inode data, the system might
			 *	 have already flushed the buffer.
			 */
			if (chain->data->ipdata.op_flags &
			    HAMMER2_OPFLAG_PFSROOT) {
				/*
				 * non-NULL pmp if mounted as a PFS.  We must
				 * sync fields cached in the pmp? XXX
				 */
				hammer2_inode_data_t *ipdata;

				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				if (chain->pmp) {
					ipdata->pfs_inum =
						chain->pmp->inode_tid;
				}
			} else {
				/* can't be mounted as a PFS */
			}

			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		default:
			KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
			panic("hammer2_flush_core: unsupported "
			      "embedded bref %d",
			      chain->bref.type);
			/* NOT REACHED */
		}

		/*
		 * If the chain was destroyed try to avoid unnecessary I/O.
		 * (this only really works if the DIO system buffer is the
		 * same size as chain->bytes).
		 */
		if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
			hammer2_io_setinval(chain->dio, chain->bytes);
		}
	}

	/*
	 * If UPDATE is set the parent block table may need to be updated.
	 *
	 * NOTE: UPDATE may be set on vchain or fchain in which case
	 *	 parent could be NULL.  It's easiest to allow the case
	 *	 and test for NULL.  parent can also wind up being NULL
	 *	 due to a deletion so we need to handle the case anyway.
	 *
	 * If no parent exists we can just clear the UPDATE bit.  If the
	 * chain gets reattached later on the bit will simply get set
	 * again.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		hammer2_chain_drop(chain);
	}

	/*
	 * The chain may need its blockrefs updated in the parent.  This
	 * requires some fancy footwork.
	 */
	if (chain->flags & HAMMER2_CHAIN_UPDATE) {
		hammer2_blockref_t *base;
		int count;

		/*
		 * Both parent and chain must be locked.  This requires
		 * temporarily unlocking the chain.  We have to deal with
		 * the case where the chain might be reparented or modified
		 * while it was unlocked.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
		if (chain->parent != parent) {
			kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
				chain, chain->parent, parent);
			hammer2_chain_unlock(parent);
			goto done;
		}

		/*
		 * Check race condition.  If someone got in and modified
		 * it again while it was unlocked, we have to loop up.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			hammer2_chain_unlock(parent);
			kprintf("hammer2_flush: chain %p flush-mod race\n",
				chain);
			goto again;
		}

		/*
		 * Clear UPDATE flag, mark parent modified, update its
		 * modify_tid if necessary, and adjust the parent blockmap.
		 */
		if (chain->flags & HAMMER2_CHAIN_UPDATE) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
			hammer2_chain_drop(chain);
		}

		/*
		 * (optional code)
		 *
		 * Avoid actually modifying and updating the parent if it
		 * was flagged for destruction.  This can greatly reduce
		 * disk I/O in large tree removals because the
		 * hammer2_io_setinval() call in the upward recursion
		 * (see MODIFIED code above) can only handle a few cases.
		 */
		if (parent->flags & HAMMER2_CHAIN_DESTROY) {
			if (parent->bref.modify_tid < chain->bref.modify_tid) {
				parent->bref.modify_tid =
					chain->bref.modify_tid;
			}
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
							HAMMER2_CHAIN_BMAPUPD);
			hammer2_chain_unlock(parent);
			goto skipupdate;
		}

		/*
		 * We are updating the parent's blockmap, the parent must
		 * be set modified.
		 */
		hammer2_chain_modify(info->trans, parent, 0);
		if (parent->bref.modify_tid < chain->bref.modify_tid)
			parent->bref.modify_tid = chain->bref.modify_tid;

		/*
		 * Calculate blockmap pointer
		 */
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there is
			 * no block array if the inode is flagged DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base = &parent->data->
					ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &chain->hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->npdata[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("hammer2_flush_core: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * Blocktable updates
		 *
		 * We synchronize pending statistics at this time.  Delta
		 * adjustments designated for the current and upper level
		 * are synchronized.
		 */
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
			if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
				hammer2_spin_ex(&parent->core.spin);
				hammer2_base_delete(info->trans, parent,
						    base, count,
						    &info->cache_index, chain);
				hammer2_spin_unex(&parent->core.spin);
				/* base_delete clears both bits */
			} else {
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_BMAPUPD);
			}
		}
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
			hammer2_spin_ex(&parent->core.spin);
			hammer2_base_insert(info->trans, parent,
					    base, count,
					    &info->cache_index, chain);
			hammer2_spin_unex(&parent->core.spin);
			/* base_insert sets BMAPPED */
		}
		hammer2_chain_unlock(parent);
	}
skipupdate:
	;

	/*
	 * Final cleanup after flush
	 */
done:
	KKASSERT(chain->refs > 0);
	if (hammer2_debug & 0x200) {
		if (info->debug == chain)
			info->debug = NULL;
	}
}

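/*
 * Reference accounting note (inferred from the code above): the MODIFIED
 * and UPDATE bits each hold an implicit reference on the chain, which is
 * why clearing either bit is paired with hammer2_chain_drop() and why
 * clearing MODIFIED while setting UPDATE nets zero references.
 */
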
/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * processed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *	    bref.mirror_tid ourselves to indicate that the flush has
 *	    processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	/*hammer2_trans_t *trans = info->trans;*/
	hammer2_chain_t *parent = info->parent;

	/*
	 * (child can never be fchain or vchain so a special check isn't
	 * needed).
	 *
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.
	 */
	hammer2_chain_ref(child);
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	/*
	 * Recurse and collect deferral data.  We're in the media flush,
	 * this can cross PFS boundaries.
	 */
	if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
		++info->depth;
		hammer2_flush_core(info, child, 0);
		--info->depth;
	} else if (hammer2_debug & 0x200) {
		if (info->debug == NULL)
			info->debug = child;
		++info->depth;
		hammer2_flush_core(info, child, 0);
		--info->depth;
		if (info->debug == child)
			info->debug = NULL;
	}

	/*
	 * Relock to continue the loop
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);
	hammer2_spin_ex(&parent->core.spin);

	return (0);
}