/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * The biggest issue is that each PFS may belong to a cluster so its media
 * modify_tid and mirror_tid fields are in a completely different domain
 * than the topology related to the super-root.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells how to recurse downward to find these chains.
 */
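
/*
 * Illustrative sketch (comment only, not part of the code): a modified
 * chain sets ONFLUSH on each of its ancestors, so the top-down scan can
 * skip clean subtrees entirely while the real work happens bottom-up:
 *
 *	flush(chain):
 *		if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0)
 *			return;			(whole subtree is clean)
 *		for each child on chain->core.rbtree:
 *			flush(child);		(locate MODIFIED/UPDATE)
 *		write chain, update the parent's block table on the way up
 */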
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"
/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	hammer2_chain_t	*parent;
	hammer2_trans_t	*trans;
	int		depth;
	int		diddeferral;
	int		cache_index;
	long		inode_count;	/* rollup for hammer2_updatestats() */
	hammer2_off_t	data_count;	/* rollup for hammer2_updatestats() */
	struct h2_flush_list flushq;
	hammer2_xid_t	sync_xid;	/* memory synchronization point */
	hammer2_chain_t	*debug;
};

typedef struct hammer2_flush_info hammer2_flush_info_t;
static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int deleting);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
				hammer2_chain_t *child, int how);
/*
 * Accumulate blockref statistics into the flush info's rollup counters.
 * how is positive to add the blockref's contribution, negative to
 * subtract it.
 */
static void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
		    int how)
{
	hammer2_off_t bytes;

	if (bref->type != 0) {
		bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
		if (bref->type == HAMMER2_BREF_TYPE_INODE)
			info->inode_count += how;
		if (how < 0)
			info->data_count -= bytes;
		else
			info->data_count += bytes;
	}
}
/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_mount or
 * hammer2_pfsmount.
 */
static hammer2_trans_manage_t	tmanage;
void
hammer2_trans_manage_init(void)
{
	lockinit(&tmanage.translk, "h2trans", 0, 0);
	TAILQ_INIT(&tmanage.transq);
	tmanage.flush_xid = 1;
	tmanage.alloc_xid = tmanage.flush_xid + 1;
}
hammer2_xid_t
hammer2_trans_newxid(hammer2_pfsmount_t *pmp __unused)
{
	hammer2_xid_t xid;

	for (;;) {
		xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);
		if (xid)	/* xid 0 is reserved, skip it on wrap */
			break;
	}
	return xid;
}
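
/*
 * Example (illustrative): atomic_fetchadd_int() returns the value prior
 * to the add, so concurrent callers always obtain distinct XIDs without
 * any further locking:
 *
 *	cpu0: xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);	-> N
 *	cpu1: xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);	-> N + 1
 */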
/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
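
/*
 * Sketch of the queue states described above (illustrative only).  A
 * flush arriving while fs_ops are running queues behind them, and new
 * fs_ops block until those prior fs_ops complete:
 *
 *	transq: [fs_op A][fs_op B]		<running_fs_ops>
 *	transq: [fs_op A][fs_op B][FLUSH]	flush queued, waiting
 *	transq: [fs_op A][fs_op B][FLUSH][fs_op C]	(C blocked)
 *	transq: [FLUSH][fs_op C]		A/B done, flush runs, C may
 *						run concurrent with the flush
 */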
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &tmanage;

	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 */
		++tman->flushcnt;
		++pmp->alloc_tid;
		pmp->flush_tid = pmp->alloc_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid,
					&tman->translk, 0,
					"h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/* normal transaction */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}
/*
 * This may only be called while in a flush transaction.  It's a bit of a
 * hack but after flushing a PFS we need to flush each volume root as part
 * of the same transaction.
 */
void
hammer2_trans_spmp(hammer2_trans_t *trans, hammer2_pfsmount_t *spmp)
{
	++spmp->alloc_tid;
	spmp->flush_tid = spmp->alloc_tid;
}
void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &tmanage;

	/*
	 * Remove ourselves from the queue.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}
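
/*
 * The block/unblock handshake used above, in miniature (illustrative):
 * a blocked transaction sleeps on the address of its own sync_xid while
 * holding the manager lock, and the unblocker clears the flag before
 * issuing the wakeup on the same address:
 *
 *	waiter:	while (trans->blocked)
 *			lksleep(&trans->sync_xid, &tman->translk, 0,
 *				"h2multf", hz);
 *	waker:	scan->blocked = 0;
 *		wakeup(&scan->sync_xid);
 */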
/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications and
 * mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flushq);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	/*
	 * Calculate parent (can be NULL), if not NULL the flush core
	 * expects the parent to be referenced so it can easily lock/unlock
	 * it without it getting ripped up.
	 */
	if ((info.parent = chain->parent) != NULL)
		hammer2_chain_ref(info.parent);

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flushq, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(scan);	/* ref from deferral */
			hammer2_flush(trans, scan);
			hammer2_chain_unlock(scan);
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, chain, 0);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flushq))
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);
	if (info.parent)
		hammer2_chain_drop(info.parent);
}
/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 *			WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid and blockref.mirror_tid are consistent only within a
 * PFS.  This is why we cannot cache sync_tid in the transaction structure.
 * Instead we access it from the pmp.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
		   int deleting)
{
	hammer2_chain_t *parent;
	hammer2_mount_t *hmp;
	hammer2_pfsmount_t *pmp;
	int diddeferral;

	/*
	 * (1) Optimize downward recursion to locate nodes needing action.
	 *     Nothing to do if none of these flags are set.
	 */
	if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
		if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = chain;
		} else {
			return;
		}
	}

	hmp = chain->hmp;
	pmp = chain->pmp;
	diddeferral = info->diddeferral;
	parent = info->parent;		/* can be NULL */

	/*
	 * mirror_tid should not be forward-indexed
	 */
	KKASSERT(chain->bref.mirror_tid <= pmp->flush_tid);

	/*
	 * Downward search recursion
	 */
	if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
		/*
		 * Already deferred.
		 */
		++info->diddeferral;
	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
		/*
		 * Recursion depth reached.
		 */
		hammer2_chain_ref(chain);
		TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
		++info->diddeferral;
	} else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
		/*
		 * Downward recursion search (actual flush occurs bottom-up).
		 * pre-clear ONFLUSH.  It can get set again due to races,
		 * which we want so the scan finds us again in the next flush.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
		info->parent = chain;
		spin_lock(&chain->core.cst.spin);
		RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
			NULL, hammer2_flush_recurse, info);
		spin_unlock(&chain->core.cst.spin);
		info->parent = parent;
		if (info->diddeferral)
			hammer2_chain_setflush(info->trans, chain);
	}

	/*
	 * Now we are in the bottom-up part of the recursion.
	 *
	 * Do not update chain if lower layers were deferred.
	 */
	if (info->diddeferral)
		goto done;

	/*
	 * Propagate the DESTROY flag downwards.  This dummies up the flush
	 * code and tries to invalidate related buffer cache buffers to
	 * avoid the disk write.
	 */
	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

	/*
	 * Chain was already modified or has become modified, flush it out.
	 */
again:
	if ((hammer2_debug & 0x200) &&
	    info->debug &&
	    (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
		hammer2_chain_t *scan = chain;

		kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
		while (scan) {
			kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
				scan, scan->flags,
				scan->bref.key, scan->bref.type);
			if (scan == info->debug)
				break;
			scan = scan->parent;
		}
	}

	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * Dispose of the modified bit.  UPDATE should already be
		 * set except for the volume root (vchain), which is
		 * asserted below.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
			 chain == &hmp->vchain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_wakeup(pmp);
		chain->bref.mirror_tid = pmp->flush_tid;

		if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
		    chain == &hmp->vchain ||
		    chain == &hmp->fchain) {
			/*
			 * Drop the ref from the MODIFIED bit we cleared,
			 * net -1 ref.
			 */
			hammer2_chain_drop(chain);
		} else {
			/*
			 * Drop the ref from the MODIFIED bit we cleared and
			 * set a ref for the UPDATE bit we are setting.  Net
			 * 0 refs.
			 */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}

		/*
		 * A DELETED node that reaches this point must be flushed for
		 * synchronization point consistency.
		 *
		 * Update bref.mirror_tid, clear MODIFIED, and set UPDATE.
		 */
		if (hammer2_debug & 0x1000) {
			kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
				"data=%016jx\n",
				chain, chain->bref.type,
				chain->bref.key, chain->bref.keybits,
				info->sync_xid,
				chain->bref.data_off);
		}
		if (hammer2_debug & 0x2000) {
			Debugger("Flush hell");
		}

		/*
		 * Update chain CRCs for flush.
		 *
		 * NOTE: Volume headers are NOT flushed here as they require
		 *	 special processing.
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_FREEMAP:
			KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
			hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			/*
			 * The free block table is flushed by hammer2_vfs_sync()
			 * before it flushes vchain.  We must still hold fchain
			 * locked while copying voldata to volsync, however.
			 */
			hammer2_voldata_lock(hmp);
			hammer2_chain_lock(&hmp->fchain,
					   HAMMER2_RESOLVE_ALWAYS);

			/*
			 * There is no parent to our root vchain and fchain to
			 * synchronize the bref to, their updated mirror_tid's
			 * must be synchronized to the volume header.
			 */
			hmp->voldata.mirror_tid = chain->bref.mirror_tid;
			hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
			kprintf("mirror_tid %08jx\n",
				(intmax_t)chain->bref.mirror_tid);

			/*
			 * The volume header is flushed manually by the
			 * syncer, not here.  All we do here is adjust the
			 * crc's.
			 */
			KKASSERT(chain->data != NULL);
			KKASSERT(chain->dio == NULL);

			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC1_OFF,
					HAMMER2_VOLUME_ICRC1_SIZE);
			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC0_OFF,
					HAMMER2_VOLUME_ICRC0_SIZE);
			hmp->voldata.icrc_volheader =
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRCVH_OFF,
					HAMMER2_VOLUME_ICRCVH_SIZE);
			hmp->volsync = hmp->voldata;
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
			hammer2_chain_unlock(&hmp->fchain);
			hammer2_voldata_unlock(hmp);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Data elements have already been flushed via the
			 * logical file buffer cache.  Their hash was set in
			 * the bref by the vop_write code.
			 *
			 * Make sure any device buffer(s) have been flushed
			 * out here (there aren't usually any to flush) XXX.
			 */
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			/*
			 * Buffer I/O will be cleaned up when the volume is
			 * flushed (but the kernel is free to flush it before
			 * then, as well).
			 */
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			break;
		case HAMMER2_BREF_TYPE_INODE:
			if (chain->data->ipdata.op_flags &
			    HAMMER2_OPFLAG_PFSROOT) {
				/*
				 * non-NULL pmp if mounted as a PFS.  We must
				 * sync fields cached in the pmp.
				 */
				hammer2_inode_data_t *ipdata;

				ipdata = &chain->data->ipdata;
				ipdata->pfs_inum = pmp->inode_tid;
			} else {
				/* can't be mounted as a PFS */
				KKASSERT((chain->flags &
					  HAMMER2_CHAIN_PFSROOT) == 0);
			}

			/*
			 * Update inode statistics.  Pending stats in chain
			 * are cleared out on UPDATE so expect that bit to
			 * be set here too or the statistics will not be
			 * rolled-up properly.
			 */
			{
				hammer2_inode_data_t *ipdata;

				KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
				ipdata = &chain->data->ipdata;
				ipdata->data_count += chain->data_count;
				ipdata->inode_count += chain->inode_count;
			}
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			break;
		default:
			KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
			panic("hammer2_flush_core: unsupported "
			      "embedded bref %d",
			      chain->bref.type);
			/* NOT REACHED */
		}

		/*
		 * If the chain was destroyed try to avoid unnecessary I/O.
		 * (this only really works if the DIO system buffer is the
		 * same size as chain->bytes).
		 */
		if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
			hammer2_io_setinval(chain->dio, chain->bytes);
		}
	}

	/*
	 * If UPDATE is set the parent block table may need to be updated.
	 *
	 * NOTE: UPDATE may be set on vchain or fchain in which case
	 *	 parent could be NULL.  It's easiest to allow the case
	 *	 and test for NULL.  parent can also wind up being NULL
	 *	 due to a deletion so we need to handle the case anyway.
	 *
	 * If no parent exists we can just clear the UPDATE bit.  If the
	 * chain gets reattached later on the bit will simply get set
	 * again.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		hammer2_chain_drop(chain);
	}

	/*
	 * The chain may need its blockrefs updated in the parent.  This
	 * requires some fancy footwork.
	 */
	if (chain->flags & HAMMER2_CHAIN_UPDATE) {
		hammer2_blockref_t *base;
		int count;

		/*
		 * Both parent and chain must be locked.  This requires
		 * temporarily unlocking the chain.  We have to deal with
		 * the case where the chain might be reparented or modified
		 * while it was unlocked.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
		if (chain->parent != parent) {
			kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
				chain, chain->parent, parent);
			hammer2_chain_unlock(parent);
			goto done;
		}

		/*
		 * Check race condition.  If someone got in and modified
		 * it again while it was unlocked, we have to loop up.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			hammer2_chain_unlock(parent);
			kprintf("hammer2_flush: chain %p flush-mod race\n",
				chain);
			goto again;
		}

		/*
		 * Clear the UPDATE flag and mark the parent modified.
		 */
		if (chain->flags & HAMMER2_CHAIN_UPDATE) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_UPDATE);
			hammer2_chain_drop(chain);
		}
		hammer2_chain_modify(info->trans, parent, 0);

		/*
		 * Calculate blockmap pointer
		 */
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there is
			 * no block array if the inode is flagged DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base = &parent->data->
					ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &chain->hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->npdata[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("hammer2_flush_core: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * We synchronize pending statistics at this time.  Delta
		 * adjustments designated for the current and upper level
		 * are synchronized.
		 */
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
			if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
				hammer2_base_delete(info->trans, parent,
						    base, count,
						    &info->cache_index, chain);
				/* base_delete clears both bits */
			} else {
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_BMAPUPD);
			}
		}
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
			parent->data_count += chain->data_count +
					      chain->data_count_up;
			parent->inode_count += chain->inode_count +
					       chain->inode_count_up;
			chain->data_count = 0;
			chain->inode_count = 0;
			chain->data_count_up = 0;
			chain->inode_count_up = 0;
			hammer2_base_insert(info->trans, parent,
					    base, count,
					    &info->cache_index, chain);
			/* base_insert sets BMAPPED */
		}
		hammer2_chain_unlock(parent);
	}

	/*
	 * Final cleanup after flush
	 */
done:
	KKASSERT(chain->refs > 1);
	KKASSERT(chain->bref.mirror_tid <= chain->pmp->flush_tid);
	if (hammer2_debug & 0x200) {
		if (info->debug == chain)
			info->debug = NULL;
	}
}
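
/*
 * Example of the MODIFIED/UPDATE bit flow implemented above (illustrative
 * only): flushing a child typically dirties its parent, which is then
 * picked up on the way back up the recursion:
 *
 *	flush_core(child):
 *		clear MODIFIED, set bref.mirror_tid = pmp->flush_tid
 *		if UPDATE: hammer2_chain_modify(parent), then
 *			   hammer2_base_delete()/hammer2_base_insert()
 *			   update the parent's block table
 *	parent is now MODIFIED and is flushed by its own caller
 */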
/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *	    bref.mirror_tid ourselves to indicate that the flush has
 *	    processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 *
 * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
 *	    not cross a pfs-root boundary.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	/*hammer2_trans_t *trans = info->trans;*/
	hammer2_chain_t *parent = info->parent;

	/*
	 * (child can never be fchain or vchain so a special check isn't
	 *  needed).
	 *
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&parent->core.cst.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	/*
	 * Never recurse across a mounted PFS boundary.
	 *
	 * Recurse and collect deferral data.
	 */
	if ((child->flags & HAMMER2_CHAIN_PFSBOUNDARY) == 0 ||
	    child->pmp == NULL) {
		if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
			++info->depth;
			hammer2_flush_core(info, child, 0); /* XXX deleting */
			--info->depth;
		} else if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = child;
			++info->depth;
			hammer2_flush_core(info, child, 0); /* XXX deleting */
			--info->depth;
			if (info->debug == child)
				info->debug = NULL;
		}
	}

	/*
	 * Relock to continue the loop
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);
	spin_lock(&parent->core.cst.spin);

	return (0);
}
/*
 * Roll statistics up from the child into the parent, and from the parent
 * into the parent's parents.  how is positive when the child's blockref
 * is being added, negative when it is being removed.
 */
static void
hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
{
	hammer2_chain_t *grandp;

	parent->data_count += child->data_count;
	parent->inode_count += child->inode_count;
	child->data_count = 0;
	child->inode_count = 0;
	if (how < 0) {
		parent->data_count -= child->bytes;
		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
			parent->inode_count -= 1;
			/* XXX child->data may be NULL atm */
			parent->data_count -= child->data->ipdata.data_count;
			parent->inode_count -= child->data->ipdata.inode_count;
		}
	} else if (how > 0) {
		parent->data_count += child->bytes;
		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
			parent->inode_count += 1;
			/* XXX child->data may be NULL atm */
			parent->data_count += child->data->ipdata.data_count;
			parent->inode_count += child->data->ipdata.inode_count;
		}
	}
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		parent->data->ipdata.data_count += parent->data_count;
		parent->data->ipdata.inode_count += parent->inode_count;
		for (grandp = parent->above->first_parent;
		     grandp;
		     grandp = grandp->next_parent) {
			grandp->data_count += parent->data_count;
			grandp->inode_count += parent->inode_count;
		}
		parent->data_count = 0;
		parent->inode_count = 0;
	}
}
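
/*
 * Worked example for hammer2_rollup_stats() (illustrative figures): an
 * inode-type child of 1024 bytes with ipdata.data_count == 4096 and
 * ipdata.inode_count == 2, removed with how == -1:
 *
 *	parent->data_count  -= 1024 + 4096;
 *	parent->inode_count -= 1 + 2;
 *
 * If the parent is itself an inode the accumulated deltas are then folded
 * into its ipdata and pushed up to every grandparent before being zeroed.
 */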