sys/vfs/hammer/hammer_inode.c

   1 /*
   2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34
  35 #include <vm/vm_page2.h>
  36
  37 #include "hammer.h"
  38
  39 static int      hammer_unload_inode(struct hammer_inode *ip);
  40 static void     hammer_free_inode(hammer_inode_t ip);
  41 static void     hammer_flush_inode_core(hammer_inode_t ip,
  42                                         hammer_flush_group_t flg, int flags);
  43 static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
  44 #if 0
  45 static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
  46 #endif
  47 static int      hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
  48                                         hammer_flush_group_t flg);
  49 static int      hammer_setup_parent_inodes_helper(hammer_record_t record,
  50                                         int depth, hammer_flush_group_t flg);
  51 static void     hammer_inode_wakereclaims(hammer_inode_t ip);
  52 static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp,
  53                                         pid_t pid);
  54 static struct hammer_inode *__hammer_find_inode(hammer_transaction_t trans,
  55                                         int64_t obj_id, hammer_tid_t asof,
  56                                         uint32_t localization);
  57
  58 struct krate hammer_gen_krate = { 1 };
  59
  60 /*
  61  * RB-Tree support for inode structures
  62  */
  63 int
  64 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
  65 {
  66         if (ip1->obj_localization < ip2->obj_localization)
  67                 return(-1);
  68         if (ip1->obj_localization > ip2->obj_localization)
  69                 return(1);
  70         if (ip1->obj_id < ip2->obj_id)
  71                 return(-1);
  72         if (ip1->obj_id > ip2->obj_id)
  73                 return(1);
  74         if (ip1->obj_asof < ip2->obj_asof)
  75                 return(-1);
  76         if (ip1->obj_asof > ip2->obj_asof)
  77                 return(1);
  78         return(0);
  79 }
  80
  81 int
  82 hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
  83 {
  84         if (ip1->redo_fifo_start < ip2->redo_fifo_start)
  85                 return(-1);
  86         if (ip1->redo_fifo_start > ip2->redo_fifo_start)
  87                 return(1);
  88         return(0);
  89 }
  90
  91 /*
  92  * RB-Tree support for inode structures / special LOOKUP_INFO
  93  */
  94 static int
  95 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
  96 {
  97         if (info->obj_localization < ip->obj_localization)
  98                 return(-1);
  99         if (info->obj_localization > ip->obj_localization)
 100                 return(1);
 101         if (info->obj_id < ip->obj_id)
 102                 return(-1);
 103         if (info->obj_id > ip->obj_id)
 104                 return(1);
 105         if (info->obj_asof < ip->obj_asof)
 106                 return(-1);
 107         if (info->obj_asof > ip->obj_asof)
 108                 return(1);
 109         return(0);
 110 }
 111
 112 /*
 113  * Used by hammer_scan_inode_snapshots() to locate all of an object's
 114  * snapshots.  Note that the asof field is not tested, which we can get
 115  * away with because it is the lowest-priority field.
 116  */
 117 static int
 118 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
 119 {
 120         hammer_inode_info_t info = data;
 121
 122         if (ip->obj_localization > info->obj_localization)
 123                 return(1);
 124         if (ip->obj_localization < info->obj_localization)
 125                 return(-1);
 126         if (ip->obj_id > info->obj_id)
 127                 return(1);
 128         if (ip->obj_id < info->obj_id)
 129                 return(-1);
 130         return(0);
 131 }
 132
 133 /*
 134  * Used by hammer_unload_pseudofs() to locate all inodes associated with
 135  * a particular PFS.
 136  */
 137 static int
 138 hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
 139 {
 140         uint32_t localization = *(uint32_t *)data;
 141         if (ip->obj_localization > localization)
 142                 return(1);
 143         if (ip->obj_localization < localization)
 144                 return(-1);
 145         return(0);
 146 }
 147
 148 /*
 149  * RB-Tree support for pseudofs structures
 150  */
 151 static int
 152 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
 153 {
 154         if (p1->localization < p2->localization)
 155                 return(-1);
 156         if (p1->localization > p2->localization)
 157                 return(1);
 158         return(0);
 159 }
 160
 161
 162 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
 163 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
 164                 hammer_inode_info_cmp, hammer_inode_info_t);
 165 RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
 166              hammer_pfs_rb_compare, uint32_t, localization);
 167
 168 /*
 169  * The kernel is not actively referencing this vnode but is still holding
 170  * it cached.
 171  *
 172  * This is called from the frontend.
 173  *
 174  * MPALMOSTSAFE
 175  */
 176 int
 177 hammer_vop_inactive(struct vop_inactive_args *ap)
 178 {
 179         struct hammer_inode *ip = VTOI(ap->a_vp);
 180         hammer_mount_t hmp;
 181
 182         /*
 183          * Degenerate case
 184          */
 185         if (ip == NULL) {
 186                 vrecycle(ap->a_vp);
 187                 return(0);
 188         }
 189
 190         /*
 191          * If the inode no longer has visibility in the filesystem try to
 192          * recycle it immediately, even if the inode is dirty.  Recycling
 193          * it quickly allows the system to reclaim buffer cache and VM
 194          * resources which can matter a lot in a heavily loaded system.
 195          *
 196          * This can deadlock in vfsync() if we aren't careful.
 197          *
 198          * Do not queue the inode to the flusher if we still have visibility,
 199          * otherwise namespace calls such as chmod will unnecessarily generate
 200          * multiple inode updates.
 201          */
 202         if (ip->ino_data.nlinks == 0) {
 203                 hmp = ip->hmp;
 204                 lwkt_gettoken(&hmp->fs_token);
 205                 hammer_inode_unloadable_check(ip, 0);
 206                 if (ip->flags & HAMMER_INODE_MODMASK)
 207                         hammer_flush_inode(ip, 0);
 208                 lwkt_reltoken(&hmp->fs_token);
 209                 vrecycle(ap->a_vp);
 210         }
 211         return(0);
 212 }
 213
 214 /*
 215  * Release the vnode association.  This is typically (but not always)
 216  * the last reference on the inode.
 217  *
 218  * Once the association is lost we are on our own with regards to
 219  * flushing the inode.
 220  *
 221  * We must interlock ip->vp so hammer_get_vnode() can avoid races.
 222  */
 223 int
 224 hammer_vop_reclaim(struct vop_reclaim_args *ap)
 225 {
 226         struct hammer_inode *ip;
 227         hammer_mount_t hmp;
 228         struct vnode *vp;
 229
 230         vp = ap->a_vp;
 231
 232         if ((ip = vp->v_data) != NULL) {
 233                 hmp = ip->hmp;
 234                 lwkt_gettoken(&hmp->fs_token);
 235                 hammer_lock_ex(&ip->lock);
 236                 vp->v_data = NULL;
 237                 ip->vp = NULL;
 238
 239                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
 240                         ++hammer_count_reclaims;
 241                         ++hmp->count_reclaims;
 242                         ip->flags |= HAMMER_INODE_RECLAIM;
 243                 }
 244                 hammer_unlock(&ip->lock);
 245                 vclrisdirty(vp);
 246                 hammer_rel_inode(ip, 1);
 247                 lwkt_reltoken(&hmp->fs_token);
 248         }
 249         return(0);
 250 }
 251
 252 /*
 253  * Inform the kernel that the inode is dirty.  This will be checked
 254  * by vn_unlock().
 255  *
 256  * Theoretically in order to reclaim a vnode the hammer_vop_reclaim()
 257  * must be called which will interlock against our inode lock, so
 258  * if VRECLAIMED is not set vp->v_mount (as used by vsetisdirty())
 259  * should be stable without having to acquire any new locks.
 260  */
 261 void
 262 hammer_inode_dirty(struct hammer_inode *ip)
 263 {
 264         struct vnode *vp;
 265
 266         if ((ip->flags & HAMMER_INODE_MODMASK) &&
 267             (vp = ip->vp) != NULL &&
 268             (vp->v_flag & (VRECLAIMED | VISDIRTY)) == 0) {
 269                 vsetisdirty(vp);
 270         }
 271 }
 272
 273 /*
 274  * Return a locked vnode for the specified inode.  The inode must be
 275  * referenced but NOT LOCKED on entry and will remain referenced on
 276  * return.
 277  *
 278  * Called from the frontend.
 279  */
 280 int
 281 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
 282 {
 283         hammer_mount_t hmp;
 284         struct vnode *vp;
 285         int error = 0;
 286         uint8_t obj_type;
 287
 288         hmp = ip->hmp;
 289
 290         for (;;) {
 291                 if ((vp = ip->vp) == NULL) {
 292                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
 293                         if (error)
 294                                 break;
 295                         hammer_lock_ex(&ip->lock);
 296                         if (ip->vp != NULL) {
 297                                 hammer_unlock(&ip->lock);
 298                                 vp = *vpp;
 299                                 vp->v_type = VBAD;
 300                                 vx_put(vp);
 301                                 continue;
 302                         }
 303                         hammer_ref(&ip->lock);
 304                         vp = *vpp;
 305                         ip->vp = vp;
 306
 307                         obj_type = ip->ino_data.obj_type;
 308                         vp->v_type = hammer_get_vnode_type(obj_type);
 309
 310                         hammer_inode_wakereclaims(ip);
 311
 312                         switch(ip->ino_data.obj_type) {
 313                         case HAMMER_OBJTYPE_CDEV:
 314                         case HAMMER_OBJTYPE_BDEV:
 315                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
 316                                 addaliasu(vp, ip->ino_data.rmajor,
 317                                           ip->ino_data.rminor);
 318                                 break;
 319                         case HAMMER_OBJTYPE_FIFO:
 320                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
 321                                 break;
 322                         case HAMMER_OBJTYPE_REGFILE:
 323                                 break;
 324                         default:
 325                                 break;
 326                         }
 327
 328                         /*
 329                          * Only mark as the root vnode if the ip is not
 330                          * historical, otherwise the VFS cache will get
 331                          * confused.  The other half of the special handling
 332                          * is in hammer_vop_nlookupdotdot().
 333                          *
 334                          * Pseudo-filesystem roots can be accessed via
 335                          * non-root filesystem paths and setting VROOT may
 336                          * confuse the namecache.  Set VPFSROOT instead.
 337                          */
 338                         if (ip->obj_id == HAMMER_OBJID_ROOT) {
 339                                 if (ip->obj_asof == hmp->asof) {
 340                                         if (ip->obj_localization == 0)
 341                                                 vsetflags(vp, VROOT);
 342                                         else
 343                                                 vsetflags(vp, VPFSROOT);
 344                                 } else {
 345                                         vsetflags(vp, VPFSROOT);
 346                                 }
 347                         }
 348
 349                         vp->v_data = (void *)ip;
 350                         /* vnode locked by getnewvnode() */
 351                         /* make related vnode dirty if inode dirty? */
 352                         hammer_unlock(&ip->lock);
 353                         if (vp->v_type == VREG) {
 354                                 vinitvmio(vp, ip->ino_data.size,
 355                                           hammer_blocksize(ip->ino_data.size),
 356                                           hammer_blockoff(ip->ino_data.size));
 357                         }
 358                         break;
 359                 }
 360
 361                 /*
 362                  * Interlock vnode clearing.  This does not prevent the
 363                  * vnode from going into a reclaimed state but it does
 364                  * prevent it from being destroyed or reused so the vget()
 365                  * will properly fail.
 366                  */
 367                 hammer_lock_ex(&ip->lock);
 368                 if ((vp = ip->vp) == NULL) {
 369                         hammer_unlock(&ip->lock);
 370                         continue;
 371                 }
 372                 vhold(vp);
 373                 hammer_unlock(&ip->lock);
 374
 375                 /*
 376                  * loop if the vget fails (aka races), or if the vp
 377                  * no longer matches ip->vp.
 378                  */
 379                 if (vget(vp, LK_EXCLUSIVE) == 0) {
 380                         if (vp == ip->vp) {
 381                                 vdrop(vp);
 382                                 break;
 383                         }
 384                         vput(vp);
 385                 }
 386                 vdrop(vp);
 387         }
 388         *vpp = vp;
 389         return(error);
 390 }
 391
 392 /*
 393  * Locate all copies of the inode for obj_id compatible with the specified
 394  * asof, reference, and issue the related call-back.  This routine is used
 395  * for direct-io invalidation and does not create any new inodes.
 396  */
 397 void
 398 hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
 399                             int (*callback)(hammer_inode_t ip, void *data),
 400                             void *data)
 401 {
 402         hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
 403                                    hammer_inode_info_cmp_all_history,
 404                                    callback, iinfo);
 405 }
 406
 407 /*
 408  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
 409  * do not attach or detach the related vnode (use hammer_get_vnode() for
 410  * that).
 411  *
 412  * The flags argument is only applied for newly created inodes, and only
 413  * certain flags are inherited.
 414  *
 415  * Called from the frontend.
 416  */
 417 struct hammer_inode *
 418 hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
 419                  int64_t obj_id, hammer_tid_t asof, uint32_t localization,
 420                  int flags, int *errorp)
 421 {
 422         hammer_mount_t hmp = trans->hmp;
 423         struct hammer_node_cache *cachep;
 424         struct hammer_cursor cursor;
 425         struct hammer_inode *ip;
 426
 427
 428         /*
 429          * Determine if we already have an inode cached.  If we do then
 430          * we are golden.
 431          *
 432          * If we find an inode with no vnode we have to mark the
 433          * transaction such that hammer_inode_waitreclaims() is
 434          * called later on to avoid building up an infinite number
 435          * of inodes.  Otherwise we can continue to * add new inodes
 436          * faster then they can be disposed of, even with the tsleep
 437          * delay.
 438          *
 439          * If we find a dummy inode we return a failure so dounlink
 440          * (which does another lookup) doesn't try to mess with the
 441          * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
 442          * to ref dummy inodes.
 443          */
 444 loop:
 445         *errorp = 0;
 446         ip = __hammer_find_inode(trans, obj_id, asof, localization);
 447         if (ip) {
 448                 if (ip->flags & HAMMER_INODE_DUMMY) {
 449                         *errorp = ENOENT;
 450                         return(NULL);
 451                 }
 452                 hammer_ref(&ip->lock);
 453                 return(ip);
 454         }
 455
 456         /*
 457          * Allocate a new inode structure and deal with races later.
 458          */
 459         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
 460         ++hammer_count_inodes;
 461         ++hmp->count_inodes;
 462         ip->obj_id = obj_id;
 463         ip->obj_asof = asof;
 464         ip->obj_localization = localization;
 465         ip->hmp = hmp;
 466         ip->flags = flags & HAMMER_INODE_RO;
 467         ip->cache[0].ip = ip;
 468         ip->cache[1].ip = ip;
 469         ip->cache[2].ip = ip;
 470         ip->cache[3].ip = ip;
 471         if (hmp->ronly)
 472                 ip->flags |= HAMMER_INODE_RO;
 473         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
 474                 0x7FFFFFFFFFFFFFFFLL;
 475         RB_INIT(&ip->rec_tree);
 476         TAILQ_INIT(&ip->target_list);
 477         hammer_ref(&ip->lock);
 478
 479         /*
 480          * Locate the on-disk inode.  If this is a PFS root we always
 481          * access the current version of the root inode and (if it is not
 482          * a master) always access information under it with a snapshot
 483          * TID.
 484          *
 485          * We cache recent inode lookups in this directory in dip->cache[2].
 486          * If we can't find it we assume the inode we are looking for is
 487          * close to the directory inode.
 488          */
 489 retry:
 490         cachep = NULL;
 491         if (dip) {
 492                 if (dip->cache[2].node)
 493                         cachep = &dip->cache[2];
 494                 else
 495                         cachep = &dip->cache[0];
 496         }
 497         hammer_init_cursor(trans, &cursor, cachep, NULL);
 498         cursor.key_beg.localization = localization | HAMMER_LOCALIZE_INODE;
 499         cursor.key_beg.obj_id = ip->obj_id;
 500         cursor.key_beg.key = 0;
 501         cursor.key_beg.create_tid = 0;
 502         cursor.key_beg.delete_tid = 0;
 503         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
 504         cursor.key_beg.obj_type = 0;
 505
 506         cursor.asof = asof;
 507         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
 508                        HAMMER_CURSOR_ASOF;
 509
 510         *errorp = hammer_btree_lookup(&cursor);
 511         if (*errorp == EDEADLK) {
 512                 hammer_done_cursor(&cursor);
 513                 goto retry;
 514         }
 515
 516         /*
 517          * On success the B-Tree lookup will hold the appropriate
 518          * buffer cache buffers and provide a pointer to the requested
 519          * information.  Copy the information to the in-memory inode
 520          * and cache the B-Tree node to improve future operations.
 521          */
 522         if (*errorp == 0) {
 523                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
 524                 ip->ino_data = cursor.data->inode;
 525
 526                 /*
 527                  * cache[0] tries to cache the location of the object inode.
 528                  * The assumption is that it is near the directory inode.
 529                  *
 530                  * cache[1] tries to cache the location of the object data.
 531                  * We might have something in the governing directory from
 532                  * scan optimizations (see the strategy code in
 533                  * hammer_vnops.c).
 534                  *
 535                  * We update dip->cache[2], if possible, with the location
 536                  * of the object inode for future directory shortcuts.
 537                  */
 538                 hammer_cache_node(&ip->cache[0], cursor.node);
 539                 if (dip) {
 540                         if (dip->cache[3].node) {
 541                                 hammer_cache_node(&ip->cache[1],
 542                                                   dip->cache[3].node);
 543                         }
 544                         hammer_cache_node(&dip->cache[2], cursor.node);
 545                 }
 546
 547                 /*
 548                  * The file should not contain any data past the file size
 549                  * stored in the inode.  Setting save_trunc_off to the
 550                  * file size instead of max reduces B-Tree lookup overheads
 551                  * on append by allowing the flusher to avoid checking for
 552                  * record overwrites.
 553                  */
 554                 ip->save_trunc_off = ip->ino_data.size;
 555
 556                 /*
 557                  * Locate and assign the pseudofs management structure to
 558                  * the inode.
 559                  */
 560                 if (dip && dip->obj_localization == ip->obj_localization) {
 561                         ip->pfsm = dip->pfsm;
 562                         hammer_ref(&ip->pfsm->lock);
 563                 } else {
 564                         ip->pfsm = hammer_load_pseudofs(trans,
 565                                                         ip->obj_localization,
 566                                                         errorp);
 567                         *errorp = 0;    /* ignore ENOENT */
 568                 }
 569         }
 570
 571         /*
 572          * The inode is placed on the red-black tree and will be synced to
 573          * the media when flushed or by the filesystem sync.  If this races
 574          * another instantiation/lookup the insertion will fail.
 575          */
 576         if (*errorp == 0) {
 577                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
 578                         hammer_free_inode(ip);
 579                         hammer_done_cursor(&cursor);
 580                         goto loop;
 581                 }
 582                 ip->flags |= HAMMER_INODE_ONDISK;
 583         } else {
 584                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
 585                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
 586                         --hmp->rsv_inodes;
 587                 }
 588
 589                 hammer_free_inode(ip);
 590                 ip = NULL;
 591         }
 592         hammer_done_cursor(&cursor);
 593
 594         /*
 595          * NEWINODE is only set if the inode becomes dirty later,
 596          * setting it here just leads to unnecessary stalls.
 597          *
 598          * trans->flags |= HAMMER_TRANSF_NEWINODE;
 599          */
 600         return (ip);
 601 }
 602
 603 /*
 604  * Get a dummy inode to placemark a broken directory entry.
 605  */
 606 struct hammer_inode *
 607 hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
 608                  int64_t obj_id, hammer_tid_t asof, uint32_t localization,
 609                  int flags, int *errorp)
 610 {
 611         hammer_mount_t hmp = trans->hmp;
 612         struct hammer_inode *ip;
 613
 614         /*
 615          * Determine if we already have an inode cached.  If we do then
 616          * we are golden.
 617          *
 618          * If we find an inode with no vnode we have to mark the
 619          * transaction such that hammer_inode_waitreclaims() is
 620          * called later on to avoid building up an infinite number
 621          * of inodes.  Otherwise we can continue to * add new inodes
 622          * faster then they can be disposed of, even with the tsleep
 623          * delay.
 624          *
 625          * If we find a non-fake inode we return an error.  Only fake
 626          * inodes can be returned by this routine.
 627          */
 628 loop:
 629         *errorp = 0;
 630         ip = __hammer_find_inode(trans, obj_id, asof, localization);
 631         if (ip) {
 632                 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
 633                         *errorp = ENOENT;
 634                         return(NULL);
 635                 }
 636                 hammer_ref(&ip->lock);
 637                 return(ip);
 638         }
 639
 640         /*
 641          * Allocate a new inode structure and deal with races later.
 642          */
 643         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
 644         ++hammer_count_inodes;
 645         ++hmp->count_inodes;
 646         ip->obj_id = obj_id;
 647         ip->obj_asof = asof;
 648         ip->obj_localization = localization;
 649         ip->hmp = hmp;
 650         ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
 651         ip->cache[0].ip = ip;
 652         ip->cache[1].ip = ip;
 653         ip->cache[2].ip = ip;
 654         ip->cache[3].ip = ip;
 655         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
 656                 0x7FFFFFFFFFFFFFFFLL;
 657         RB_INIT(&ip->rec_tree);
 658         TAILQ_INIT(&ip->target_list);
 659         hammer_ref(&ip->lock);
 660
 661         /*
 662          * Populate the dummy inode.  Leave everything zero'd out.
 663          *
 664          * (ip->ino_leaf and ip->ino_data)
 665          *
 666          * Make the dummy inode a FIFO object which most copy programs
 667          * will properly ignore.
 668          */
 669         ip->save_trunc_off = ip->ino_data.size;
 670         ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
 671
 672         /*
 673          * Locate and assign the pseudofs management structure to
 674          * the inode.
 675          */
 676         if (dip && dip->obj_localization == ip->obj_localization) {
 677                 ip->pfsm = dip->pfsm;
 678                 hammer_ref(&ip->pfsm->lock);
 679         } else {
 680                 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
 681                                                 errorp);
 682                 *errorp = 0;    /* ignore ENOENT */
 683         }
 684
 685         /*
 686          * The inode is placed on the red-black tree and will be synced to
 687          * the media when flushed or by the filesystem sync.  If this races
 688          * another instantiation/lookup the insertion will fail.
 689          *
 690          * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
 691          */
 692         if (*errorp == 0) {
 693                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
 694                         hammer_free_inode(ip);
 695                         goto loop;
 696                 }
 697         } else {
 698                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
 699                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
 700                         --hmp->rsv_inodes;
 701                 }
 702                 hammer_free_inode(ip);
 703                 ip = NULL;
 704         }
 705         trans->flags |= HAMMER_TRANSF_NEWINODE;
 706         return (ip);
 707 }
 708
 709 /*
 710  * Return a referenced inode only if it is in our inode cache.
 711  * Dummy inodes do not count.
 712  */
 713 struct hammer_inode *
 714 hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
 715                   hammer_tid_t asof, uint32_t localization)
 716 {
 717         struct hammer_inode *ip;
 718
 719         ip = __hammer_find_inode(trans, obj_id, asof, localization);
 720         if (ip) {
 721                 if (ip->flags & HAMMER_INODE_DUMMY)
 722                         ip = NULL;
 723                 else
 724                         hammer_ref(&ip->lock);
 725         }
 726         return(ip);
 727 }
 728
 729 /*
 730  * Return a referenced inode only if it is in our inode cache.
 731  * This function does not reference inode.
 732  */
 733 static struct hammer_inode *
 734 __hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
 735                   hammer_tid_t asof, uint32_t localization)
 736 {
 737         hammer_mount_t hmp = trans->hmp;
 738         struct hammer_inode_info iinfo;
 739         struct hammer_inode *ip;
 740
 741         iinfo.obj_id = obj_id;
 742         iinfo.obj_asof = asof;
 743         iinfo.obj_localization = localization;
 744
 745         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
 746
 747         return(ip);
 748 }
 749
 750 /*
 751  * Create a new filesystem object, returning the inode in *ipp.  The
 752  * returned inode will be referenced.  The inode is created in-memory.
 753  *
 754  * If pfsm is non-NULL the caller wishes to create the root inode for
 755  * a master PFS.
 756  */
 757 int
 758 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
 759                     struct ucred *cred,
 760                     hammer_inode_t dip, const char *name, int namelen,
 761                     hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
 762 {
 763         hammer_mount_t hmp;
 764         hammer_inode_t ip;
 765         uid_t xuid;
 766         int error;
 767         int64_t namekey;
 768         uint32_t dummy;
 769
 770         hmp = trans->hmp;
 771
 772         /*
 773          * Disallow the creation of new inodes in directories which
 774          * have been deleted.  In HAMMER, this will cause a record
 775          * syncing assertion later on in the flush code.
 776          */
 777         if (dip && dip->ino_data.nlinks == 0) {
 778                 *ipp = NULL;
 779                 return (EINVAL);
 780         }
 781
 782         /*
 783          * Allocate inode
 784          */
 785         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
 786         ++hammer_count_inodes;
 787         ++hmp->count_inodes;
 788         trans->flags |= HAMMER_TRANSF_NEWINODE;
 789
 790         if (pfsm) {
 791                 KKASSERT(pfsm->localization != 0);
 792                 ip->obj_id = HAMMER_OBJID_ROOT;
 793                 ip->obj_localization = pfsm->localization;
 794         } else {
 795                 KKASSERT(dip != NULL);
 796                 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
 797                 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
 798                 ip->obj_localization = dip->obj_localization;
 799         }
 800
 801         KKASSERT(ip->obj_id != 0);
 802         ip->obj_asof = hmp->asof;
 803         ip->hmp = hmp;
 804         ip->flush_state = HAMMER_FST_IDLE;
 805         ip->flags = HAMMER_INODE_DDIRTY |
 806                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
 807         ip->cache[0].ip = ip;
 808         ip->cache[1].ip = ip;
 809         ip->cache[2].ip = ip;
 810         ip->cache[3].ip = ip;
 811
 812         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
 813         /* ip->save_trunc_off = 0; (already zero) */
 814         RB_INIT(&ip->rec_tree);
 815         TAILQ_INIT(&ip->target_list);
 816
 817         ip->ino_data.atime = trans->time;
 818         ip->ino_data.mtime = trans->time;
 819         ip->ino_data.size = 0;
 820         ip->ino_data.nlinks = 0;
 821
 822         /*
 823          * A nohistory designator on the parent directory is inherited by
 824          * the child.  We will do this even for pseudo-fs creation... the
 825          * sysad can turn it off.
 826          */
 827         if (dip) {
 828                 ip->ino_data.uflags = dip->ino_data.uflags &
 829                                       (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
 830         }
 831
 832         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
 833         ip->ino_leaf.base.localization = ip->obj_localization |
 834                                          HAMMER_LOCALIZE_INODE;
 835         ip->ino_leaf.base.obj_id = ip->obj_id;
 836         ip->ino_leaf.base.key = 0;
 837         ip->ino_leaf.base.create_tid = 0;
 838         ip->ino_leaf.base.delete_tid = 0;
 839         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
 840         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
 841
 842         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
 843         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
 844         ip->ino_data.mode = vap->va_mode;
 845         ip->ino_data.ctime = trans->time;
 846
 847         /*
 848          * If we are running version 2 or greater directory entries are
 849          * inode-localized instead of data-localized.
 850          */
 851         if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
 852                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
 853                         ip->ino_data.cap_flags |=
 854                                 HAMMER_INODE_CAP_DIR_LOCAL_INO;
 855                 }
 856         }
 857         if (trans->hmp->version >= HAMMER_VOL_VERSION_SIX) {
 858                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
 859                         ip->ino_data.cap_flags |=
 860                                 HAMMER_INODE_CAP_DIRHASH_ALG1;
 861                 }
 862         }
 863
 864         /*
 865          * Setup the ".." pointer.  This only needs to be done for directories
 866          * but we do it for all objects as a recovery aid if dip exists.
 867          * The inode is probably a PFS root if dip is NULL.
 868          */
 869         if (dip)
 870                 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
 871
 872         switch(ip->ino_leaf.base.obj_type) {
 873         case HAMMER_OBJTYPE_CDEV:
 874         case HAMMER_OBJTYPE_BDEV:
 875                 ip->ino_data.rmajor = vap->va_rmajor;
 876                 ip->ino_data.rminor = vap->va_rminor;
 877                 break;
 878         default:
 879                 break;
 880         }
 881
 882         /*
 883          * Calculate default uid/gid and overwrite with information from
 884          * the vap.
 885          */
 886         if (dip) {
 887                 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
 888                 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
 889                                              xuid, cred, &vap->va_mode);
 890         } else {
 891                 xuid = 0;
 892         }
 893         ip->ino_data.mode = vap->va_mode;
 894
 895         if (vap->va_vaflags & VA_UID_UUID_VALID)
 896                 ip->ino_data.uid = vap->va_uid_uuid;
 897         else if (vap->va_uid != (uid_t)VNOVAL)
 898                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
 899         else
 900                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
 901
 902         if (vap->va_vaflags & VA_GID_UUID_VALID)
 903                 ip->ino_data.gid = vap->va_gid_uuid;
 904         else if (vap->va_gid != (gid_t)VNOVAL)
 905                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
 906         else if (dip)
 907                 ip->ino_data.gid = dip->ino_data.gid;
 908
 909         hammer_ref(&ip->lock);
 910
 911         if (pfsm) {
 912                 ip->pfsm = pfsm;
 913                 hammer_ref(&pfsm->lock);
 914                 error = 0;
 915         } else if (dip->obj_localization == ip->obj_localization) {
 916                 ip->pfsm = dip->pfsm;
 917                 hammer_ref(&ip->pfsm->lock);
 918                 error = 0;
 919         } else {
 920                 ip->pfsm = hammer_load_pseudofs(trans,
 921                                                 ip->obj_localization,
 922                                                 &error);
 923                 error = 0;      /* ignore ENOENT */
 924         }
 925
 926         if (error) {
 927                 hammer_free_inode(ip);
 928                 ip = NULL;
 929         } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
 930                 hpanic("duplicate obj_id %llx", (long long)ip->obj_id);
 931                 /* not reached */
 932                 hammer_free_inode(ip);
 933         }
 934         *ipp = ip;
 935         return(error);
 936 }
 937
 938 /*
 939  * Final cleanup / freeing of an inode structure
 940  */
 941 static void
 942 hammer_free_inode(hammer_inode_t ip)
 943 {
 944         struct hammer_mount *hmp;
 945
 946         hmp = ip->hmp;
 947         KKASSERT(hammer_oneref(&ip->lock));
 948         hammer_uncache_node(&ip->cache[0]);
 949         hammer_uncache_node(&ip->cache[1]);
 950         hammer_uncache_node(&ip->cache[2]);
 951         hammer_uncache_node(&ip->cache[3]);
 952         hammer_inode_wakereclaims(ip);
 953         if (ip->objid_cache)
 954                 hammer_clear_objid(ip);
 955         --hammer_count_inodes;
 956         --hmp->count_inodes;
 957         if (ip->pfsm) {
 958                 hammer_rel_pseudofs(hmp, ip->pfsm);
 959                 ip->pfsm = NULL;
 960         }
 961         kfree(ip, hmp->m_inodes);
 962 }
 963
 964 /*
 965  * Retrieve pseudo-fs data.  NULL will never be returned.
 966  *
 967  * If an error occurs *errorp will be set and a default template is returned,
 968  * otherwise *errorp is set to 0.  Typically when an error occurs it will
 969  * be ENOENT.
 970  */
 971 hammer_pseudofs_inmem_t
 972 hammer_load_pseudofs(hammer_transaction_t trans,
 973                      uint32_t localization, int *errorp)
 974 {
 975         hammer_mount_t hmp = trans->hmp;
 976         hammer_inode_t ip;
 977         hammer_pseudofs_inmem_t pfsm;
 978         struct hammer_cursor cursor;
 979         int bytes;
 980
 981 retry:
 982         pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
 983         if (pfsm) {
 984                 hammer_ref(&pfsm->lock);
 985                 *errorp = 0;
 986                 return(pfsm);
 987         }
 988
 989         /*
 990          * PFS records are associated with the root inode (not the PFS root
 991          * inode, but the real root).  Avoid an infinite recursion if loading
 992          * the PFS for the real root.
 993          */
 994         if (localization) {
 995                 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
 996                                       HAMMER_MAX_TID,
 997                                       HAMMER_DEF_LOCALIZATION, 0, errorp);
 998         } else {
 999                 ip = NULL;
1000         }
1001
1002         pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
1003         pfsm->localization = localization;
1004         pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
1005         pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
1006
1007         hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
1008         cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION |
1009                                       HAMMER_LOCALIZE_MISC;
1010         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
1011         cursor.key_beg.create_tid = 0;
1012         cursor.key_beg.delete_tid = 0;
1013         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
1014         cursor.key_beg.obj_type = 0;
1015         cursor.key_beg.key = localization;
1016         cursor.asof = HAMMER_MAX_TID;
1017         cursor.flags |= HAMMER_CURSOR_ASOF;
1018
1019         if (ip)
1020                 *errorp = hammer_ip_lookup(&cursor);
1021         else
1022                 *errorp = hammer_btree_lookup(&cursor);
1023         if (*errorp == 0) {
1024                 *errorp = hammer_ip_resolve_data(&cursor);
1025                 if (*errorp == 0) {
1026                         if (cursor.data->pfsd.mirror_flags &
1027                             HAMMER_PFSD_DELETED) {
1028                                 *errorp = ENOENT;
1029                         } else {
1030                                 bytes = cursor.leaf->data_len;
1031                                 if (bytes > sizeof(pfsm->pfsd))
1032                                         bytes = sizeof(pfsm->pfsd);
1033                                 bcopy(cursor.data, &pfsm->pfsd, bytes);
1034                         }
1035                 }
1036         }
1037         hammer_done_cursor(&cursor);
1038
1039         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1040         hammer_ref(&pfsm->lock);
1041         if (ip)
1042                 hammer_rel_inode(ip, 0);
1043         if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
1044                 kfree(pfsm, hmp->m_misc);
1045                 goto retry;
1046         }
1047         return(pfsm);
1048 }
1049
1050 /*
1051  * Store pseudo-fs data.  The backend will automatically delete any prior
1052  * on-disk pseudo-fs data but we have to delete in-memory versions.
1053  */
1054 int
1055 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
1056 {
1057         struct hammer_cursor cursor;
1058         hammer_record_t record;
1059         hammer_inode_t ip;
1060         int error;
1061
1062         /*
1063          * PFS records are associated with the root inode (not the PFS root
1064          * inode, but the real root).
1065          */
1066         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1067                               HAMMER_DEF_LOCALIZATION, 0, &error);
1068 retry:
1069         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1070         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
1071         cursor.key_beg.localization = ip->obj_localization |
1072                                       HAMMER_LOCALIZE_MISC;
1073         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
1074         cursor.key_beg.create_tid = 0;
1075         cursor.key_beg.delete_tid = 0;
1076         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
1077         cursor.key_beg.obj_type = 0;
1078         cursor.key_beg.key = pfsm->localization;
1079         cursor.asof = HAMMER_MAX_TID;
1080         cursor.flags |= HAMMER_CURSOR_ASOF;
1081
1082         /*
1083          * Replace any in-memory version of the record.
1084          */
1085         error = hammer_ip_lookup(&cursor);
1086         if (error == 0 && hammer_cursor_inmem(&cursor)) {
1087                 record = cursor.iprec;
1088                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1089                         KKASSERT(cursor.deadlk_rec == NULL);
1090                         hammer_ref(&record->lock);
1091                         cursor.deadlk_rec = record;
1092                         error = EDEADLK;
1093                 } else {
1094                         record->flags |= HAMMER_RECF_DELETED_FE;
1095                         error = 0;
1096                 }
1097         }
1098
1099         /*
1100          * Allocate replacement general record.  The backend flush will
1101          * delete any on-disk version of the record.
1102          */
1103         if (error == 0 || error == ENOENT) {
1104                 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1105                 record->type = HAMMER_MEM_RECORD_GENERAL;
1106
1107                 record->leaf.base.localization = ip->obj_localization |
1108                                                  HAMMER_LOCALIZE_MISC;
1109                 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1110                 record->leaf.base.key = pfsm->localization;
1111                 record->leaf.data_len = sizeof(pfsm->pfsd);
1112                 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1113                 error = hammer_ip_add_record(trans, record);
1114         }
1115         hammer_done_cursor(&cursor);
1116         if (error == EDEADLK)
1117                 goto retry;
1118         hammer_rel_inode(ip, 0);
1119         return(error);
1120 }
1121
1122 /*
1123  * Create a root directory for a PFS if one does not alredy exist.
1124  *
1125  * The PFS root stands alone so we must also bump the nlinks count
1126  * to prevent it from being destroyed on release.
1127  */
1128 int
1129 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1130                        hammer_pseudofs_inmem_t pfsm)
1131 {
1132         hammer_inode_t ip;
1133         struct vattr vap;
1134         int error;
1135
1136         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1137                               pfsm->localization, 0, &error);
1138         if (ip == NULL) {
1139                 vattr_null(&vap);
1140                 vap.va_mode = 0755;
1141                 vap.va_type = VDIR;
1142                 error = hammer_create_inode(trans, &vap, cred,
1143                                             NULL, NULL, 0,
1144                                             pfsm, &ip);
1145                 if (error == 0) {
1146                         ++ip->ino_data.nlinks;
1147                         hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
1148                 }
1149         }
1150         if (ip)
1151                 hammer_rel_inode(ip, 0);
1152         return(error);
1153 }
1154
1155 /*
1156  * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1157  * if we are unable to disassociate all the inodes.
1158  */
1159 static
1160 int
1161 hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1162 {
1163         int res;
1164
1165         hammer_ref(&ip->lock);
1166         if (ip->vp && (ip->vp->v_flag & VPFSROOT)) {
1167                 /*
1168                  * The hammer pfs-upgrade directive itself might have the
1169                  * root of the pfs open.  Just allow it.
1170                  */
1171                 res = 0;
1172         } else {
1173                 /*
1174                  * Don't allow any subdirectories or files to be open.
1175                  */
1176                 if (hammer_isactive(&ip->lock) == 2 && ip->vp)
1177                         vclean_unlocked(ip->vp);
1178                 if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL)
1179                         res = 0;
1180                 else
1181                         res = -1;       /* stop, someone is using the inode */
1182         }
1183         hammer_rel_inode(ip, 0);
1184         return(res);
1185 }
1186
1187 int
1188 hammer_unload_pseudofs(hammer_transaction_t trans, uint32_t localization)
1189 {
1190         int res;
1191         int try;
1192
1193         for (try = res = 0; try < 4; ++try) {
1194                 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1195                                            hammer_inode_pfs_cmp,
1196                                            hammer_unload_pseudofs_callback,
1197                                            &localization);
1198                 if (res == 0 && try > 1)
1199                         break;
1200                 hammer_flusher_sync(trans->hmp);
1201         }
1202         if (res != 0)
1203                 res = ENOTEMPTY;
1204         return(res);
1205 }
1206
1207
1208 /*
1209  * Release a reference on a PFS
1210  */
1211 void
1212 hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1213 {
1214         hammer_rel(&pfsm->lock);
1215         if (hammer_norefs(&pfsm->lock)) {
1216                 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1217                 kfree(pfsm, hmp->m_misc);
1218         }
1219 }
1220
1221 /*
1222  * Called by hammer_sync_inode().
1223  */
1224 static int
1225 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1226 {
1227         hammer_transaction_t trans = cursor->trans;
1228         hammer_record_t record;
1229         int error;
1230         int redirty;
1231
1232 retry:
1233         error = 0;
1234
1235         /*
1236          * If the inode has a presence on-disk then locate it and mark
1237          * it deleted, setting DELONDISK.
1238          *
1239          * The record may or may not be physically deleted, depending on
1240          * the retention policy.
1241          */
1242         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1243             HAMMER_INODE_ONDISK) {
1244                 hammer_normalize_cursor(cursor);
1245                 cursor->key_beg.localization = ip->obj_localization |
1246                                                HAMMER_LOCALIZE_INODE;
1247                 cursor->key_beg.obj_id = ip->obj_id;
1248                 cursor->key_beg.key = 0;
1249                 cursor->key_beg.create_tid = 0;
1250                 cursor->key_beg.delete_tid = 0;
1251                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1252                 cursor->key_beg.obj_type = 0;
1253                 cursor->asof = ip->obj_asof;
1254                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1255                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1256                 cursor->flags |= HAMMER_CURSOR_BACKEND;
1257
1258                 error = hammer_btree_lookup(cursor);
1259                 if (hammer_debug_inode)
1260                         hdkprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1261
1262                 if (error == 0) {
1263                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
1264                         if (hammer_debug_inode)
1265                                 hdkprintf("error %d\n", error);
1266                         if (error == 0) {
1267                                 ip->flags |= HAMMER_INODE_DELONDISK;
1268                         }
1269                         if (cursor->node)
1270                                 hammer_cache_node(&ip->cache[0], cursor->node);
1271                 }
1272                 if (error == EDEADLK) {
1273                         hammer_done_cursor(cursor);
1274                         error = hammer_init_cursor(trans, cursor,
1275                                                    &ip->cache[0], ip);
1276                         if (hammer_debug_inode)
1277                                 hdkprintf("IPDED %p %d\n", ip, error);
1278                         if (error == 0)
1279                                 goto retry;
1280                 }
1281         }
1282
1283         /*
1284          * Ok, write out the initial record or a new record (after deleting
1285          * the old one), unless the DELETED flag is set.  This routine will
1286          * clear DELONDISK if it writes out a record.
1287          *
1288          * Update our inode statistics if this is the first application of
1289          * the inode on-disk.
1290          */
1291         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1292                 /*
1293                  * Generate a record and write it to the media.  We clean-up
1294                  * the state before releasing so we do not have to set-up
1295                  * a flush_group.
1296                  */
1297                 record = hammer_alloc_mem_record(ip, 0);
1298                 record->type = HAMMER_MEM_RECORD_INODE;
1299                 record->flush_state = HAMMER_FST_FLUSH;
1300                 record->leaf = ip->sync_ino_leaf;
1301                 record->leaf.base.create_tid = trans->tid;
1302                 record->leaf.data_len = sizeof(ip->sync_ino_data);
1303                 record->leaf.create_ts = trans->time32;
1304                 record->data = (void *)&ip->sync_ino_data;
1305                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1306
1307                 /*
1308                  * If this flag is set we cannot sync the new file size
1309                  * because we haven't finished related truncations.  The
1310                  * inode will be flushed in another flush group to finish
1311                  * the job.
1312                  */
1313                 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1314                     ip->sync_ino_data.size != ip->ino_data.size) {
1315                         redirty = 1;
1316                         ip->sync_ino_data.size = ip->ino_data.size;
1317                 } else {
1318                         redirty = 0;
1319                 }
1320
1321                 for (;;) {
1322                         error = hammer_ip_sync_record_cursor(cursor, record);
1323                         if (hammer_debug_inode)
1324                                 hdkprintf("GENREC %p rec %08x %d\n",
1325                                         ip, record->flags, error);
1326                         if (error != EDEADLK)
1327                                 break;
1328                         hammer_done_cursor(cursor);
1329                         error = hammer_init_cursor(trans, cursor,
1330                                                    &ip->cache[0], ip);
1331                         if (hammer_debug_inode)
1332                                 hdkprintf("GENREC reinit %d\n", error);
1333                         if (error)
1334                                 break;
1335                 }
1336
1337                 /*
1338                  * Note:  The record was never on the inode's record tree
1339                  * so just wave our hands importantly and destroy it.
1340                  */
1341                 record->flags |= HAMMER_RECF_COMMITTED;
1342                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1343                 record->flush_state = HAMMER_FST_IDLE;
1344                 ++ip->rec_generation;
1345                 hammer_rel_mem_record(record);
1346
1347                 /*
1348                  * Finish up.
1349                  */
1350                 if (error == 0) {
1351                         if (hammer_debug_inode)
1352                                 hdkprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1353                         ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1354                                             HAMMER_INODE_SDIRTY |
1355                                             HAMMER_INODE_ATIME |
1356                                             HAMMER_INODE_MTIME);
1357                         ip->flags &= ~HAMMER_INODE_DELONDISK;
1358                         if (redirty)
1359                                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1360
1361                         /*
1362                          * Root volume count of inodes
1363                          */
1364                         hammer_sync_lock_sh(trans);
1365                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1366                                 hammer_modify_volume_field(trans,
1367                                                            trans->rootvol,
1368                                                            vol0_stat_inodes);
1369                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1370                                 hammer_modify_volume_done(trans->rootvol);
1371                                 ip->flags |= HAMMER_INODE_ONDISK;
1372                                 if (hammer_debug_inode)
1373                                         hdkprintf("NOWONDISK %p\n", ip);
1374                         }
1375                         hammer_sync_unlock(trans);
1376                 }
1377         }
1378
1379         /*
1380          * If the inode has been destroyed, clean out any left-over flags
1381          * that may have been set by the frontend.
1382          */
1383         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
1384                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1385                                     HAMMER_INODE_SDIRTY |
1386                                     HAMMER_INODE_ATIME |
1387                                     HAMMER_INODE_MTIME);
1388         }
1389         return(error);
1390 }
1391
1392 /*
1393  * Update only the itimes fields.
1394  *
1395  * ATIME can be updated without generating any UNDO.  MTIME is updated
1396  * with UNDO so it is guaranteed to be synchronized properly in case of
1397  * a crash.
1398  *
1399  * Neither field is included in the B-Tree leaf element's CRC, which is how
1400  * we can get away with updating ATIME the way we do.
1401  */
1402 static int
1403 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1404 {
1405         hammer_transaction_t trans = cursor->trans;
1406         int error;
1407
1408 retry:
1409         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1410             HAMMER_INODE_ONDISK) {
1411                 return(0);
1412         }
1413
1414         hammer_normalize_cursor(cursor);
1415         cursor->key_beg.localization = ip->obj_localization |
1416                                        HAMMER_LOCALIZE_INODE;
1417         cursor->key_beg.obj_id = ip->obj_id;
1418         cursor->key_beg.key = 0;
1419         cursor->key_beg.create_tid = 0;
1420         cursor->key_beg.delete_tid = 0;
1421         cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1422         cursor->key_beg.obj_type = 0;
1423         cursor->asof = ip->obj_asof;
1424         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1425         cursor->flags |= HAMMER_CURSOR_ASOF;
1426         cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1427         cursor->flags |= HAMMER_CURSOR_GET_DATA;
1428         cursor->flags |= HAMMER_CURSOR_BACKEND;
1429
1430         error = hammer_btree_lookup(cursor);
1431         if (error == 0) {
1432                 hammer_cache_node(&ip->cache[0], cursor->node);
1433                 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1434                         /*
1435                          * Updating MTIME requires an UNDO.  Just cover
1436                          * both atime and mtime.
1437                          */
1438                         hammer_sync_lock_sh(trans);
1439                         hammer_modify_buffer(trans, cursor->data_buffer,
1440                                 &cursor->data->inode.mtime,
1441                                 sizeof(cursor->data->inode.atime) +
1442                                 sizeof(cursor->data->inode.mtime));
1443                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1444                         cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1445                         hammer_modify_buffer_done(cursor->data_buffer);
1446                         hammer_sync_unlock(trans);
1447                 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1448                         /*
1449                          * Updating atime only can be done in-place with
1450                          * no UNDO.
1451                          */
1452                         hammer_sync_lock_sh(trans);
1453                         hammer_modify_buffer_noundo(trans, cursor->data_buffer);
1454                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1455                         hammer_modify_buffer_done(cursor->data_buffer);
1456                         hammer_sync_unlock(trans);
1457                 }
1458                 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1459         }
1460         if (error == EDEADLK) {
1461                 hammer_done_cursor(cursor);
1462                 error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip);
1463                 if (error == 0)
1464                         goto retry;
1465         }
1466         return(error);
1467 }
1468
1469 /*
1470  * Release a reference on an inode, flush as requested.
1471  *
1472  * On the last reference we queue the inode to the flusher for its final
1473  * disposition.
1474  */
1475 void
1476 hammer_rel_inode(struct hammer_inode *ip, int flush)
1477 {
1478         /*
1479          * Handle disposition when dropping the last ref.
1480          */
1481         for (;;) {
1482                 if (hammer_oneref(&ip->lock)) {
1483                         /*
1484                          * Determine whether on-disk action is needed for
1485                          * the inode's final disposition.
1486                          */
1487                         KKASSERT(ip->vp == NULL);
1488                         hammer_inode_unloadable_check(ip, 0);
1489                         if (ip->flags & HAMMER_INODE_MODMASK) {
1490                                 hammer_flush_inode(ip, 0);
1491                         } else if (hammer_oneref(&ip->lock)) {
1492                                 hammer_unload_inode(ip);
1493                                 break;
1494                         }
1495                 } else {
1496                         if (flush)
1497                                 hammer_flush_inode(ip, 0);
1498
1499                         /*
1500                          * The inode still has multiple refs, try to drop
1501                          * one ref.
1502                          */
1503                         KKASSERT(hammer_isactive(&ip->lock) >= 1);
1504                         if (hammer_isactive(&ip->lock) > 1) {
1505                                 hammer_rel(&ip->lock);
1506                                 break;
1507                         }
1508                 }
1509         }
1510 }
1511
1512 /*
1513  * Unload and destroy the specified inode.  Must be called with one remaining
1514  * reference.  The reference is disposed of.
1515  *
1516  * The inode must be completely clean.
1517  */
1518 static int
1519 hammer_unload_inode(struct hammer_inode *ip)
1520 {
1521         hammer_mount_t hmp = ip->hmp;
1522
1523         KASSERT(hammer_oneref(&ip->lock),
1524                 ("hammer_unload_inode: %d refs", hammer_isactive(&ip->lock)));
1525         KKASSERT(ip->vp == NULL);
1526         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1527         KKASSERT(ip->cursor_ip_refs == 0);
1528         KKASSERT(hammer_notlocked(&ip->lock));
1529         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1530
1531         KKASSERT(RB_EMPTY(&ip->rec_tree));
1532         KKASSERT(TAILQ_EMPTY(&ip->target_list));
1533
1534         if (ip->flags & HAMMER_INODE_RDIRTY) {
1535                 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
1536                 ip->flags &= ~HAMMER_INODE_RDIRTY;
1537         }
1538         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1539
1540         hammer_free_inode(ip);
1541         return(0);
1542 }
1543
1544 /*
1545  * Called during unmounting if a critical error occured.  The in-memory
1546  * inode and all related structures are destroyed.
1547  *
1548  * If a critical error did not occur the unmount code calls the standard
1549  * release and asserts that the inode is gone.
1550  */
1551 int
1552 hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1553 {
1554         hammer_record_t rec;
1555
1556         /*
1557          * Get rid of the inodes in-memory records, regardless of their
1558          * state, and clear the mod-mask.
1559          */
1560         while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1561                 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1562                 rec->target_ip = NULL;
1563                 if (rec->flush_state == HAMMER_FST_SETUP)
1564                         rec->flush_state = HAMMER_FST_IDLE;
1565         }
1566         while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1567                 if (rec->flush_state == HAMMER_FST_FLUSH)
1568                         --rec->flush_group->refs;
1569                 else
1570                         hammer_ref(&rec->lock);
1571                 KKASSERT(hammer_oneref(&rec->lock));
1572                 rec->flush_state = HAMMER_FST_IDLE;
1573                 rec->flush_group = NULL;
1574                 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1575                 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1576                 ++ip->rec_generation;
1577                 hammer_rel_mem_record(rec);
1578         }
1579         ip->flags &= ~HAMMER_INODE_MODMASK;
1580         ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1581         KKASSERT(ip->vp == NULL);
1582
1583         /*
1584          * Remove the inode from any flush group, force it idle.  FLUSH
1585          * and SETUP states have an inode ref.
1586          */
1587         switch(ip->flush_state) {
1588         case HAMMER_FST_FLUSH:
1589                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
1590                 --ip->flush_group->refs;
1591                 ip->flush_group = NULL;
1592                 /* fall through */
1593         case HAMMER_FST_SETUP:
1594                 hammer_rel(&ip->lock);
1595                 ip->flush_state = HAMMER_FST_IDLE;
1596                 /* fall through */
1597         case HAMMER_FST_IDLE:
1598                 break;
1599         }
1600
1601         /*
1602          * There shouldn't be any associated vnode.  The unload needs at
1603          * least one ref, if we do have a vp steal its ip ref.
1604          */
1605         if (ip->vp) {
1606                 hdkprintf("Unexpected vnode association ip %p vp %p\n",
1607                         ip, ip->vp);
1608                 ip->vp->v_data = NULL;
1609                 ip->vp = NULL;
1610         } else {
1611                 hammer_ref(&ip->lock);
1612         }
1613         hammer_unload_inode(ip);
1614         return(0);
1615 }
1616
1617 /*
1618  * Called on mount -u when switching from RW to RO or vise-versa.  Adjust
1619  * the read-only flag for cached inodes.
1620  *
1621  * This routine is called from a RB_SCAN().
1622  */
1623 int
1624 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1625 {
1626         hammer_mount_t hmp = ip->hmp;
1627
1628         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1629                 ip->flags |= HAMMER_INODE_RO;
1630         else
1631                 ip->flags &= ~HAMMER_INODE_RO;
1632         return(0);
1633 }
1634
1635 /*
1636  * A transaction has modified an inode, requiring updates as specified by
1637  * the passed flags.
1638  *
1639  * HAMMER_INODE_DDIRTY: Inode data has been updated, not incl mtime/atime,
1640  *                      and not including size changes due to write-append
1641  *                      (but other size changes are included).
1642  * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
1643  *                      write-append.
1644  * HAMMER_INODE_XDIRTY: Dirty in-memory records
1645  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
1646  * HAMMER_INODE_DELETED: Inode record/data must be deleted
1647  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1648  */
1649 void
1650 hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
1651 {
1652         /*
1653          * ronly of 0 or 2 does not trigger assertion.
1654          * 2 is a special error state
1655          */
1656         KKASSERT(ip->hmp->ronly != 1 ||
1657                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
1658                             HAMMER_INODE_SDIRTY |
1659                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1660                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1661         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1662                 ip->flags |= HAMMER_INODE_RSV_INODES;
1663                 ++ip->hmp->rsv_inodes;
1664         }
1665
1666         /*
1667          * Set the NEWINODE flag in the transaction if the inode
1668          * transitions to a dirty state.  This is used to track
1669          * the load on the inode cache.
1670          */
1671         if (trans &&
1672             (ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1673             (flags & HAMMER_INODE_MODMASK)) {
1674                 trans->flags |= HAMMER_TRANSF_NEWINODE;
1675         }
1676         if (flags & HAMMER_INODE_MODMASK)
1677                 hammer_inode_dirty(ip);
1678         ip->flags |= flags;
1679 }
1680
1681 /*
1682  * Attempt to quickly update the atime for a hammer inode.  Return 0 on
1683  * success, -1 on failure.
1684  *
1685  * We attempt to update the atime with only the ip lock and not the
1686  * whole filesystem lock in order to improve concurrency.  We can only
1687  * do this safely if the ATIME flag is already pending on the inode.
1688  *
1689  * This function is called via a vnops path (ip pointer is stable) without
1690  * fs_token held.
1691  */
1692 int
1693 hammer_update_atime_quick(hammer_inode_t ip)
1694 {
1695         struct timeval tv;
1696         int res = -1;
1697
1698         if ((ip->flags & HAMMER_INODE_RO) ||
1699             (ip->hmp->mp->mnt_flag & MNT_NOATIME)) {
1700                 /*
1701                  * Silently indicate success on read-only mount/snap
1702                  */
1703                 res = 0;
1704         } else if (ip->flags & HAMMER_INODE_ATIME) {
1705                 /*
1706                  * Double check with inode lock held against backend.  This
1707                  * is only safe if all we need to do is update
1708                  * ino_data.atime.
1709                  */
1710                 getmicrotime(&tv);
1711                 hammer_lock_ex(&ip->lock);
1712                 if (ip->flags & HAMMER_INODE_ATIME) {
1713                         ip->ino_data.atime =
1714                             (unsigned long)tv.tv_sec * 1000000ULL + tv.tv_usec;
1715                         res = 0;
1716                 }
1717                 hammer_unlock(&ip->lock);
1718         }
1719         return res;
1720 }
1721
1722 /*
1723  * Request that an inode be flushed.  This whole mess cannot block and may
1724  * recurse (if not synchronous).  Once requested HAMMER will attempt to
1725  * actively flush the inode until the flush can be done.
1726  *
1727  * The inode may already be flushing, or may be in a setup state.  We can
1728  * place the inode in a flushing state if it is currently idle and flag it
1729  * to reflush if it is currently flushing.
1730  *
1731  * Upon return if the inode could not be flushed due to a setup
1732  * dependancy, then it will be automatically flushed when the dependancy
1733  * is satisfied.
1734  */
1735 void
1736 hammer_flush_inode(hammer_inode_t ip, int flags)
1737 {
1738         hammer_mount_t hmp;
1739         hammer_flush_group_t flg;
1740         int good;
1741
1742         /*
1743          * fill_flush_group is the first flush group we may be able to
1744          * continue filling, it may be open or closed but it will always
1745          * be past the currently flushing (running) flg.
1746          *
1747          * next_flush_group is the next open flush group.
1748          */
1749         hmp = ip->hmp;
1750         while ((flg = hmp->fill_flush_group) != NULL) {
1751                 KKASSERT(flg->running == 0);
1752                 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
1753                     flg->total_count <= hammer_autoflush) {
1754                         break;
1755                 }
1756                 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
1757                 hammer_flusher_async(ip->hmp, flg);
1758         }
1759         if (flg == NULL) {
1760                 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1761                 flg->seq = hmp->flusher.next++;
1762                 if (hmp->next_flush_group == NULL)
1763                         hmp->next_flush_group = flg;
1764                 if (hmp->fill_flush_group == NULL)
1765                         hmp->fill_flush_group = flg;
1766                 RB_INIT(&flg->flush_tree);
1767                 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1768         }
1769
1770         /*
1771          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
1772          * state we have to put it back into an IDLE state so we can
1773          * drop the extra ref.
1774          *
1775          * If we have a parent dependancy we must still fall through
1776          * so we can run it.
1777          */
1778         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1779                 if (ip->flush_state == HAMMER_FST_SETUP &&
1780                     TAILQ_EMPTY(&ip->target_list)) {
1781                         ip->flush_state = HAMMER_FST_IDLE;
1782                         hammer_rel_inode(ip, 0);
1783                 }
1784                 if (ip->flush_state == HAMMER_FST_IDLE)
1785                         return;
1786         }
1787
1788         /*
1789          * Our flush action will depend on the current state.
1790          */
1791         switch(ip->flush_state) {
1792         case HAMMER_FST_IDLE:
1793                 /*
1794                  * We have no dependancies and can flush immediately.  Some
1795                  * our children may not be flushable so we have to re-test
1796                  * with that additional knowledge.
1797                  */
1798                 hammer_flush_inode_core(ip, flg, flags);
1799                 break;
1800         case HAMMER_FST_SETUP:
1801                 /*
1802                  * Recurse upwards through dependancies via target_list
1803                  * and start their flusher actions going if possible.
1804                  *
1805                  * 'good' is our connectivity.  -1 means we have none and
1806                  * can't flush, 0 means there weren't any dependancies, and
1807                  * 1 means we have good connectivity.
1808                  */
1809                 good = hammer_setup_parent_inodes(ip, 0, flg);
1810
1811                 if (good >= 0) {
1812                         /*
1813                          * We can continue if good >= 0.  Determine how
1814                          * many records under our inode can be flushed (and
1815                          * mark them).
1816                          */
1817                         hammer_flush_inode_core(ip, flg, flags);
1818                 } else {
1819                         /*
1820                          * Parent has no connectivity, tell it to flush
1821                          * us as soon as it does.
1822                          *
1823                          * The REFLUSH flag is also needed to trigger
1824                          * dependancy wakeups.
1825                          */
1826                         ip->flags |= HAMMER_INODE_CONN_DOWN |
1827                                      HAMMER_INODE_REFLUSH;
1828                         if (flags & HAMMER_FLUSH_SIGNAL) {
1829                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1830                                 hammer_flusher_async(ip->hmp, flg);
1831                         }
1832                 }
1833                 break;
1834         case HAMMER_FST_FLUSH:
1835                 /*
1836                  * We are already flushing, flag the inode to reflush
1837                  * if needed after it completes its current flush.
1838                  *
1839                  * The REFLUSH flag is also needed to trigger
1840                  * dependancy wakeups.
1841                  */
1842                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1843                         ip->flags |= HAMMER_INODE_REFLUSH;
1844                 if (flags & HAMMER_FLUSH_SIGNAL) {
1845                         ip->flags |= HAMMER_INODE_RESIGNAL;
1846                         hammer_flusher_async(ip->hmp, flg);
1847                 }
1848                 break;
1849         }
1850 }
1851
1852 /*
1853  * Scan ip->target_list, which is a list of records owned by PARENTS to our
1854  * ip which reference our ip.
1855  *
1856  * XXX This is a huge mess of recursive code, but not one bit of it blocks
1857  *     so for now do not ref/deref the structures.  Note that if we use the
1858  *     ref/rel code later, the rel CAN block.
1859  */
1860 static int
1861 hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1862                            hammer_flush_group_t flg)
1863 {
1864         hammer_record_t depend;
1865         int good;
1866         int r;
1867
1868         /*
1869          * If we hit our recursion limit and we have parent dependencies
1870          * We cannot continue.  Returning < 0 will cause us to be flagged
1871          * for reflush.  Returning -2 cuts off additional dependency checks
1872          * because they are likely to also hit the depth limit.
1873          *
1874          * We cannot return < 0 if there are no dependencies or there might
1875          * not be anything to wakeup (ip).
1876          */
1877         if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1878                 if (hammer_debug_general & 0x10000)
1879                         hkrateprintf(&hammer_gen_krate,
1880                             "Warning: depth limit reached on "
1881                             "setup recursion, inode %p %016llx\n",
1882                             ip, (long long)ip->obj_id);
1883                 return(-2);
1884         }
1885
1886         /*
1887          * Scan dependencies
1888          */
1889         good = 0;
1890         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1891                 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1892                 KKASSERT(depend->target_ip == ip);
1893                 if (r < 0 && good == 0)
1894                         good = -1;
1895                 if (r > 0)
1896                         good = 1;
1897
1898                 /*
1899                  * If we failed due to the recursion depth limit then stop
1900                  * now.
1901                  */
1902                 if (r == -2)
1903                         break;
1904         }
1905         return(good);
1906 }
1907
1908 /*
1909  * This helper function takes a record representing the dependancy between
1910  * the parent inode and child inode.
1911  *
1912  * record               = record in question (*rec in below)
1913  * record->ip           = parent inode (*pip in below)
1914  * record->target_ip    = child inode (*ip in below)
1915  *
1916  * *pip--------------\
1917  *    ^               \rec_tree
1918  *     \               \
1919  *      \ip            /\\\\\ rbtree of recs from parent inode's view
1920  *       \            //\\\\\\
1921  *        \          / ........
1922  *         \        /
1923  *          \------*rec------target_ip------>*ip
1924  *               ...target_entry<----...----->target_list<---...
1925  *                                            list of recs from inode's view
1926  *
1927  * We are asked to recurse upwards and convert the record from SETUP
1928  * to FLUSH if possible.
1929  *
1930  * Return 1 if the record gives us connectivity
1931  *
1932  * Return 0 if the record is not relevant
1933  *
1934  * Return -1 if we can't resolve the dependancy and there is no connectivity.
1935  */
1936 static int
1937 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1938                                   hammer_flush_group_t flg)
1939 {
1940         hammer_inode_t pip;
1941         int good;
1942
1943         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1944         pip = record->ip;
1945
1946         /*
1947          * If the record is already flushing, is it in our flush group?
1948          *
1949          * If it is in our flush group but it is a general record or a
1950          * delete-on-disk, it does not improve our connectivity (return 0),
1951          * and if the target inode is not trying to destroy itself we can't
1952          * allow the operation yet anyway (the second return -1).
1953          */
1954         if (record->flush_state == HAMMER_FST_FLUSH) {
1955                 /*
1956                  * If not in our flush group ask the parent to reflush
1957                  * us as soon as possible.
1958                  */
1959                 if (record->flush_group != flg) {
1960                         pip->flags |= HAMMER_INODE_REFLUSH;
1961                         record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1962                         return(-1);
1963                 }
1964
1965                 /*
1966                  * If in our flush group everything is already set up,
1967                  * just return whether the record will improve our
1968                  * visibility or not.
1969                  */
1970                 if (record->type == HAMMER_MEM_RECORD_ADD)
1971                         return(1);
1972                 return(0);
1973         }
1974
1975         /*
1976          * It must be a setup record.  Try to resolve the setup dependancies
1977          * by recursing upwards so we can place ip on the flush list.
1978          *
1979          * Limit ourselves to 20 levels of recursion to avoid blowing out
1980          * the kernel stack.  If we hit the recursion limit we can't flush
1981          * until the parent flushes.  The parent will flush independantly
1982          * on its own and ultimately a deep recursion will be resolved.
1983          */
1984         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1985
1986         good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1987
1988         /*
1989          * If good < 0 the parent has no connectivity and we cannot safely
1990          * flush the directory entry, which also means we can't flush our
1991          * ip.  Flag us for downward recursion once the parent's
1992          * connectivity is resolved.  Flag the parent for [re]flush or it
1993          * may not check for downward recursions.
1994          */
1995         if (good < 0) {
1996                 pip->flags |= HAMMER_INODE_REFLUSH;
1997                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1998                 return(good);
1999         }
2000
2001         /*
2002          * We are go, place the parent inode in a flushing state so we can
2003          * place its record in a flushing state.  Note that the parent
2004          * may already be flushing.  The record must be in the same flush
2005          * group as the parent.
2006          */
2007         if (pip->flush_state != HAMMER_FST_FLUSH)
2008                 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
2009         KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
2010
2011         /*
2012          * It is possible for a rename to create a loop in the recursion
2013          * and revisit a record.  This will result in the record being
2014          * placed in a flush state unexpectedly.  This check deals with
2015          * the case.
2016          */
2017         if (record->flush_state == HAMMER_FST_FLUSH) {
2018                 if (record->type == HAMMER_MEM_RECORD_ADD)
2019                         return(1);
2020                 return(0);
2021         }
2022
2023         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
2024
2025 #if 0
2026         if (record->type == HAMMER_MEM_RECORD_DEL &&
2027             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
2028                 /*
2029                  * Regardless of flushing state we cannot sync this path if the
2030                  * record represents a delete-on-disk but the target inode
2031                  * is not ready to sync its own deletion.
2032                  *
2033                  * XXX need to count effective nlinks to determine whether
2034                  * the flush is ok, otherwise removing a hardlink will
2035                  * just leave the DEL record to rot.
2036                  */
2037                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
2038                 return(-1);
2039         } else
2040 #endif
2041         if (pip->flush_group == flg) {
2042                 /*
2043                  * Because we have not calculated nlinks yet we can just
2044                  * set records to the flush state if the parent is in
2045                  * the same flush group as we are.
2046                  */
2047                 record->flush_state = HAMMER_FST_FLUSH;
2048                 record->flush_group = flg;
2049                 ++record->flush_group->refs;
2050                 hammer_ref(&record->lock);
2051
2052                 /*
2053                  * A general directory-add contributes to our visibility.
2054                  *
2055                  * Otherwise it is probably a directory-delete or
2056                  * delete-on-disk record and does not contribute to our
2057                  * visbility (but we can still flush it).
2058                  */
2059                 if (record->type == HAMMER_MEM_RECORD_ADD)
2060                         return(1);
2061                 return(0);
2062         } else {
2063                 /*
2064                  * If the parent is not in our flush group we cannot
2065                  * flush this record yet, there is no visibility.
2066                  * We tell the parent to reflush and mark ourselves
2067                  * so the parent knows it should flush us too.
2068                  */
2069                 pip->flags |= HAMMER_INODE_REFLUSH;
2070                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
2071                 return(-1);
2072         }
2073 }
2074
2075 /*
2076  * This is the core routine placing an inode into the FST_FLUSH state.
2077  */
2078 static void
2079 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
2080 {
2081         hammer_mount_t hmp = ip->hmp;
2082         int go_count;
2083
2084         /*
2085          * Set flush state and prevent the flusher from cycling into
2086          * the next flush group.  Do not place the ip on the list yet.
2087          * Inodes not in the idle state get an extra reference.
2088          */
2089         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
2090         if (ip->flush_state == HAMMER_FST_IDLE)
2091                 hammer_ref(&ip->lock);
2092         ip->flush_state = HAMMER_FST_FLUSH;
2093         ip->flush_group = flg;
2094         ++hmp->flusher.group_lock;
2095         ++hmp->count_iqueued;
2096         ++hammer_count_iqueued;
2097         ++flg->total_count;
2098         hammer_redo_fifo_start_flush(ip);
2099
2100 #if 0
2101         /*
2102          * We need to be able to vfsync/truncate from the backend.
2103          *
2104          * XXX Any truncation from the backend will acquire the vnode
2105          *     independently.
2106          */
2107         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
2108         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
2109                 ip->flags |= HAMMER_INODE_VHELD;
2110                 vref(ip->vp);
2111         }
2112 #endif
2113
2114         /*
2115          * Figure out how many in-memory records we can actually flush
2116          * (not including inode meta-data, buffers, etc).
2117          */
2118         KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
2119         if (flags & HAMMER_FLUSH_RECURSION) {
2120                 /*
2121                  * If this is a upwards recursion we do not want to
2122                  * recurse down again!
2123                  */
2124                 go_count = 1;
2125 #if 0
2126         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2127                 /*
2128                  * No new records are added if we must complete a flush
2129                  * from a previous cycle, but we do have to move the records
2130                  * from the previous cycle to the current one.
2131                  */
2132 #if 0
2133                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2134                                    hammer_syncgrp_child_callback, NULL);
2135 #endif
2136                 go_count = 1;
2137 #endif
2138         } else {
2139                 /*
2140                  * Normal flush, scan records and bring them into the flush.
2141                  * Directory adds and deletes are usually skipped (they are
2142                  * grouped with the related inode rather then with the
2143                  * directory).
2144                  *
2145                  * go_count can be negative, which means the scan aborted
2146                  * due to the flush group being over-full and we should
2147                  * flush what we have.
2148                  */
2149                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2150                                    hammer_setup_child_callback, NULL);
2151         }
2152
2153         /*
2154          * This is a more involved test that includes go_count.  If we
2155          * can't flush, flag the inode and return.  If go_count is 0 we
2156          * were are unable to flush any records in our rec_tree and
2157          * must ignore the XDIRTY flag.
2158          */
2159         if (go_count == 0) {
2160                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
2161                         --hmp->count_iqueued;
2162                         --hammer_count_iqueued;
2163
2164                         --flg->total_count;
2165                         ip->flush_state = HAMMER_FST_SETUP;
2166                         ip->flush_group = NULL;
2167                         if (flags & HAMMER_FLUSH_SIGNAL) {
2168                                 ip->flags |= HAMMER_INODE_REFLUSH |
2169                                              HAMMER_INODE_RESIGNAL;
2170                         } else {
2171                                 ip->flags |= HAMMER_INODE_REFLUSH;
2172                         }
2173 #if 0
2174                         if (ip->flags & HAMMER_INODE_VHELD) {
2175                                 ip->flags &= ~HAMMER_INODE_VHELD;
2176                                 vrele(ip->vp);
2177                         }
2178 #endif
2179
2180                         /*
2181                          * REFLUSH is needed to trigger dependancy wakeups
2182                          * when an inode is in SETUP.
2183                          */
2184                         ip->flags |= HAMMER_INODE_REFLUSH;
2185                         if (--hmp->flusher.group_lock == 0)
2186                                 wakeup(&hmp->flusher.group_lock);
2187                         return;
2188                 }
2189         }
2190
2191         /*
2192          * Snapshot the state of the inode for the backend flusher.
2193          *
2194          * We continue to retain save_trunc_off even when all truncations
2195          * have been resolved as an optimization to determine if we can
2196          * skip the B-Tree lookup for overwrite deletions.
2197          *
2198          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2199          * and stays in ip->flags.  Once set, it stays set until the
2200          * inode is destroyed.
2201          */
2202         if (ip->flags & HAMMER_INODE_TRUNCATED) {
2203                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2204                 ip->sync_trunc_off = ip->trunc_off;
2205                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2206                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2207                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2208
2209                 /*
2210                  * The save_trunc_off used to cache whether the B-Tree
2211                  * holds any records past that point is not used until
2212                  * after the truncation has succeeded, so we can safely
2213                  * set it now.
2214                  */
2215                 if (ip->save_trunc_off > ip->sync_trunc_off)
2216                         ip->save_trunc_off = ip->sync_trunc_off;
2217         }
2218         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2219                            ~HAMMER_INODE_TRUNCATED);
2220         ip->sync_ino_leaf = ip->ino_leaf;
2221         ip->sync_ino_data = ip->ino_data;
2222         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2223
2224         /*
2225          * The flusher list inherits our inode and reference.
2226          */
2227         KKASSERT(flg->running == 0);
2228         RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
2229         if (--hmp->flusher.group_lock == 0)
2230                 wakeup(&hmp->flusher.group_lock);
2231
2232         /*
2233          * Auto-flush the group if it grows too large.  Make sure the
2234          * inode reclaim wait pipeline continues to work.
2235          */
2236         if (flg->total_count >= hammer_autoflush ||
2237             flg->total_count >= hammer_limit_reclaims / 4) {
2238                 if (hmp->fill_flush_group == flg)
2239                         hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
2240                 hammer_flusher_async(hmp, flg);
2241         }
2242 }
2243
2244 /*
2245  * Callback for scan of ip->rec_tree.  Try to include each record in our
2246  * flush.  ip->flush_group has been set but the inode has not yet been
2247  * moved into a flushing state.
2248  *
2249  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2250  * both inodes.
2251  *
2252  * We return 1 for any record placed or found in FST_FLUSH, which prevents
2253  * the caller from shortcutting the flush.
2254  */
2255 static int
2256 hammer_setup_child_callback(hammer_record_t rec, void *data)
2257 {
2258         hammer_flush_group_t flg;
2259         hammer_inode_t target_ip;
2260         hammer_inode_t ip;
2261         int r;
2262
2263         /*
2264          * Records deleted or committed by the backend are ignored.
2265          * Note that the flush detects deleted frontend records at
2266          * multiple points to deal with races.  This is just the first
2267          * line of defense.  The only time HAMMER_RECF_DELETED_FE cannot
2268          * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2269          * messes up link-count calculations.
2270          *
2271          * NOTE: Don't get confused between record deletion and, say,
2272          * directory entry deletion.  The deletion of a directory entry
2273          * which is on-media has nothing to do with the record deletion
2274          * flags.
2275          */
2276         if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2277                           HAMMER_RECF_COMMITTED)) {
2278                 if (rec->flush_state == HAMMER_FST_FLUSH) {
2279                         KKASSERT(rec->flush_group == rec->ip->flush_group);
2280                         r = 1;
2281                 } else {
2282                         r = 0;
2283                 }
2284                 return(r);
2285         }
2286
2287         /*
2288          * If the record is in an idle state it has no dependancies and
2289          * can be flushed.
2290          */
2291         ip = rec->ip;
2292         flg = ip->flush_group;
2293         r = 0;
2294
2295         switch(rec->flush_state) {
2296         case HAMMER_FST_IDLE:
2297                 /*
2298                  * The record has no setup dependancy, we can flush it.
2299                  */
2300                 KKASSERT(rec->target_ip == NULL);
2301                 rec->flush_state = HAMMER_FST_FLUSH;
2302                 rec->flush_group = flg;
2303                 ++flg->refs;
2304                 hammer_ref(&rec->lock);
2305                 r = 1;
2306                 break;
2307         case HAMMER_FST_SETUP:
2308                 /*
2309                  * The record has a setup dependancy.  These are typically
2310                  * directory entry adds and deletes.  Such entries will be
2311                  * flushed when their inodes are flushed so we do not
2312                  * usually have to add them to the flush here.  However,
2313                  * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2314                  * it is asking us to flush this record (and it).
2315                  */
2316                 target_ip = rec->target_ip;
2317                 KKASSERT(target_ip != NULL);
2318                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2319
2320                 /*
2321                  * If the target IP is already flushing in our group
2322                  * we could associate the record, but target_ip has
2323                  * already synced ino_data to sync_ino_data and we
2324                  * would also have to adjust nlinks.   Plus there are
2325                  * ordering issues for adds and deletes.
2326                  *
2327                  * Reflush downward if this is an ADD, and upward if
2328                  * this is a DEL.
2329                  */
2330                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2331                         if (rec->type == HAMMER_MEM_RECORD_ADD)
2332                                 ip->flags |= HAMMER_INODE_REFLUSH;
2333                         else
2334                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
2335                         break;
2336                 }
2337
2338                 /*
2339                  * Target IP is not yet flushing.  This can get complex
2340                  * because we have to be careful about the recursion.
2341                  *
2342                  * Directories create an issue for us in that if a flush
2343                  * of a directory is requested the expectation is to flush
2344                  * any pending directory entries, but this will cause the
2345                  * related inodes to recursively flush as well.  We can't
2346                  * really defer the operation so just get as many as we
2347                  * can and
2348                  */
2349 #if 0
2350                 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2351                     (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2352                         /*
2353                          * We aren't reclaiming and the target ip was not
2354                          * previously prevented from flushing due to this
2355                          * record dependancy.  Do not flush this record.
2356                          */
2357                         /*r = 0;*/
2358                 } else
2359 #endif
2360                 if (flg->total_count + flg->refs >
2361                            ip->hmp->undo_rec_limit) {
2362                         /*
2363                          * Our flush group is over-full and we risk blowing
2364                          * out the UNDO FIFO.  Stop the scan, flush what we
2365                          * have, then reflush the directory.
2366                          *
2367                          * The directory may be forced through multiple
2368                          * flush groups before it can be completely
2369                          * flushed.
2370                          */
2371                         ip->flags |= HAMMER_INODE_RESIGNAL |
2372                                      HAMMER_INODE_REFLUSH;
2373                         r = -1;
2374                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2375                         /*
2376                          * If the target IP is not flushing we can force
2377                          * it to flush, even if it is unable to write out
2378                          * any of its own records we have at least one in
2379                          * hand that we CAN deal with.
2380                          */
2381                         rec->flush_state = HAMMER_FST_FLUSH;
2382                         rec->flush_group = flg;
2383                         ++flg->refs;
2384                         hammer_ref(&rec->lock);
2385                         hammer_flush_inode_core(target_ip, flg,
2386                                                 HAMMER_FLUSH_RECURSION);
2387                         r = 1;
2388                 } else {
2389                         /*
2390                          * General or delete-on-disk record.
2391                          *
2392                          * XXX this needs help.  If a delete-on-disk we could
2393                          * disconnect the target.  If the target has its own
2394                          * dependancies they really need to be flushed.
2395                          *
2396                          * XXX
2397                          */
2398                         rec->flush_state = HAMMER_FST_FLUSH;
2399                         rec->flush_group = flg;
2400                         ++flg->refs;
2401                         hammer_ref(&rec->lock);
2402                         hammer_flush_inode_core(target_ip, flg,
2403                                                 HAMMER_FLUSH_RECURSION);
2404                         r = 1;
2405                 }
2406                 break;
2407         case HAMMER_FST_FLUSH:
2408                 /*
2409                  * The record could be part of a previous flush group if the
2410                  * inode is a directory (the record being a directory entry).
2411                  * Once the flush group was closed a hammer_test_inode()
2412                  * function can cause a new flush group to be setup, placing
2413                  * the directory inode itself in a new flush group.
2414                  *
2415                  * When associated with a previous flush group we count it
2416                  * as if it were in our current flush group, since it will
2417                  * effectively be flushed by the time we flush our current
2418                  * flush group.
2419                  */
2420                 KKASSERT(
2421                     rec->ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY ||
2422                     rec->flush_group == flg);
2423                 r = 1;
2424                 break;
2425         }
2426         return(r);
2427 }
2428
2429 #if 0
2430 /*
2431  * This version just moves records already in a flush state to the new
2432  * flush group and that is it.
2433  */
2434 static int
2435 hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2436 {
2437         hammer_inode_t ip = rec->ip;
2438
2439         switch(rec->flush_state) {
2440         case HAMMER_FST_FLUSH:
2441                 KKASSERT(rec->flush_group == ip->flush_group);
2442                 break;
2443         default:
2444                 break;
2445         }
2446         return(0);
2447 }
2448 #endif
2449
2450 /*
2451  * Wait for a previously queued flush to complete.
2452  *
2453  * If a critical error occured we don't try to wait.
2454  */
2455 void
2456 hammer_wait_inode(hammer_inode_t ip)
2457 {
2458         /*
2459          * The inode can be in a SETUP state in which case RESIGNAL
2460          * should be set.  If RESIGNAL is not set then the previous
2461          * flush completed and a later operation placed the inode
2462          * in a passive setup state again, so we're done.
2463          *
2464          * The inode can be in a FLUSH state in which case we
2465          * can just wait for completion.
2466          */
2467         while (ip->flush_state == HAMMER_FST_FLUSH ||
2468             (ip->flush_state == HAMMER_FST_SETUP &&
2469              (ip->flags & HAMMER_INODE_RESIGNAL))) {
2470                 /*
2471                  * Don't try to flush on a critical error
2472                  */
2473                 if (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
2474                         break;
2475
2476                 /*
2477                  * If the inode was already being flushed its flg
2478                  * may not have been queued to the backend.  We
2479                  * have to make sure it gets queued or we can wind
2480                  * up blocked or deadlocked (particularly if we are
2481                  * the vnlru thread).
2482                  */
2483                 if (ip->flush_state == HAMMER_FST_FLUSH) {
2484                         KKASSERT(ip->flush_group);
2485                         if (ip->flush_group->closed == 0) {
2486                                 if (hammer_debug_inode) {
2487                                         hkprintf("debug: forcing "
2488                                                 "async flush ip %016jx\n",
2489                                                 (intmax_t)ip->obj_id);
2490                                 }
2491                                 hammer_flusher_async(ip->hmp, ip->flush_group);
2492                                 continue; /* retest */
2493                         }
2494                 }
2495
2496                 /*
2497                  * In a flush state with the flg queued to the backend
2498                  * or in a setup state with RESIGNAL set, we can safely
2499                  * wait.
2500                  */
2501                 ip->flags |= HAMMER_INODE_FLUSHW;
2502                 tsleep(&ip->flags, 0, "hmrwin", 0);
2503         }
2504
2505 #if 0
2506         /*
2507          * The inode may have been in a passive setup state,
2508          * call flush to make sure we get signaled.
2509          */
2510         if (ip->flush_state == HAMMER_FST_SETUP)
2511                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2512 #endif
2513
2514 }
2515
2516 /*
2517  * Called by the backend code when a flush has been completed.
2518  * The inode has already been removed from the flush list.
2519  *
2520  * A pipelined flush can occur, in which case we must re-enter the
2521  * inode on the list and re-copy its fields.
2522  */
2523 void
2524 hammer_flush_inode_done(hammer_inode_t ip, int error)
2525 {
2526         hammer_mount_t hmp;
2527         int dorel;
2528
2529         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2530
2531         hmp = ip->hmp;
2532
2533         /*
2534          * Auto-reflush if the backend could not completely flush
2535          * the inode.  This fixes a case where a deferred buffer flush
2536          * could cause fsync to return early.
2537          */
2538         if (ip->sync_flags & HAMMER_INODE_MODMASK)
2539                 ip->flags |= HAMMER_INODE_REFLUSH;
2540
2541         /*
2542          * Merge left-over flags back into the frontend and fix the state.
2543          * Incomplete truncations are retained by the backend.
2544          */
2545         ip->error = error;
2546         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2547         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2548
2549         /*
2550          * The backend may have adjusted nlinks, so if the adjusted nlinks
2551          * does not match the fronttend set the frontend's DDIRTY flag again.
2552          */
2553         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2554                 ip->flags |= HAMMER_INODE_DDIRTY;
2555
2556         /*
2557          * Fix up the dirty buffer status.
2558          */
2559         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2560                 ip->flags |= HAMMER_INODE_BUFS;
2561         }
2562         hammer_redo_fifo_end_flush(ip);
2563
2564         /*
2565          * Re-set the XDIRTY flag if some of the inode's in-memory records
2566          * could not be flushed.
2567          */
2568         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2569                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2570                  (!RB_EMPTY(&ip->rec_tree) &&
2571                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2572
2573         /*
2574          * Do not lose track of inodes which no longer have vnode
2575          * assocations, otherwise they may never get flushed again.
2576          *
2577          * The reflush flag can be set superfluously, causing extra pain
2578          * for no reason.  If the inode is no longer modified it no longer
2579          * needs to be flushed.
2580          */
2581         if (ip->flags & HAMMER_INODE_MODMASK) {
2582                 if (ip->vp == NULL)
2583                         ip->flags |= HAMMER_INODE_REFLUSH;
2584         } else {
2585                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2586         }
2587
2588         /*
2589          * The fs token is held but the inode lock is not held.  Because this
2590          * is a backend flush it is possible that the vnode has no references
2591          * and cause a reclaim race inside vsetisdirty() if/when it blocks.
2592          *
2593          * Therefore, we must lock the inode around this particular dirtying
2594          * operation.  We don't have to around other dirtying operations
2595          * where the vnode is implicitly or explicitly held.
2596          */
2597         if (ip->flags & HAMMER_INODE_MODMASK) {
2598                 hammer_lock_ex(&ip->lock);
2599                 hammer_inode_dirty(ip);
2600                 hammer_unlock(&ip->lock);
2601         }
2602
2603         /*
2604          * Adjust the flush state.
2605          */
2606         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2607                 /*
2608                  * We were unable to flush out all our records, leave the
2609                  * inode in a flush state and in the current flush group.
2610                  * The flush group will be re-run.
2611                  *
2612                  * This occurs if the UNDO block gets too full or there is
2613                  * too much dirty meta-data and allows the flusher to
2614                  * finalize the UNDO block and then re-flush.
2615                  */
2616                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2617                 dorel = 0;
2618         } else {
2619                 /*
2620                  * Remove from the flush_group
2621                  */
2622                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
2623                 ip->flush_group = NULL;
2624
2625 #if 0
2626                 /*
2627                  * Clean up the vnode ref and tracking counts.
2628                  */
2629                 if (ip->flags & HAMMER_INODE_VHELD) {
2630                         ip->flags &= ~HAMMER_INODE_VHELD;
2631                         vrele(ip->vp);
2632                 }
2633 #endif
2634                 --hmp->count_iqueued;
2635                 --hammer_count_iqueued;
2636
2637                 /*
2638                  * And adjust the state.
2639                  */
2640                 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2641                         ip->flush_state = HAMMER_FST_IDLE;
2642                         dorel = 1;
2643                 } else {
2644                         ip->flush_state = HAMMER_FST_SETUP;
2645                         dorel = 0;
2646                 }
2647
2648                 /*
2649                  * If the frontend is waiting for a flush to complete,
2650                  * wake it up.
2651                  */
2652                 if (ip->flags & HAMMER_INODE_FLUSHW) {
2653                         ip->flags &= ~HAMMER_INODE_FLUSHW;
2654                         wakeup(&ip->flags);
2655                 }
2656
2657                 /*
2658                  * If the frontend made more changes and requested another
2659                  * flush, then try to get it running.
2660                  *
2661                  * Reflushes are aborted when the inode is errored out.
2662                  */
2663                 if (ip->flags & HAMMER_INODE_REFLUSH) {
2664                         ip->flags &= ~HAMMER_INODE_REFLUSH;
2665                         if (ip->flags & HAMMER_INODE_RESIGNAL) {
2666                                 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2667                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2668                         } else {
2669                                 hammer_flush_inode(ip, 0);
2670                         }
2671                 }
2672         }
2673
2674         /*
2675          * If we have no parent dependancies we can clear CONN_DOWN
2676          */
2677         if (TAILQ_EMPTY(&ip->target_list))
2678                 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2679
2680         /*
2681          * If the inode is now clean drop the space reservation.
2682          */
2683         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2684             (ip->flags & HAMMER_INODE_RSV_INODES)) {
2685                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2686                 --hmp->rsv_inodes;
2687         }
2688
2689         ip->flags &= ~HAMMER_INODE_SLAVEFLUSH;
2690
2691         if (dorel)
2692                 hammer_rel_inode(ip, 0);
2693 }
2694
2695 /*
2696  * Called from hammer_sync_inode() to synchronize in-memory records
2697  * to the media.
2698  */
2699 static int
2700 hammer_sync_record_callback(hammer_record_t record, void *data)
2701 {
2702         hammer_cursor_t cursor = data;
2703         hammer_transaction_t trans = cursor->trans;
2704         hammer_mount_t hmp = trans->hmp;
2705         int error;
2706
2707         /*
2708          * Skip records that do not belong to the current flush.
2709          */
2710         ++hammer_stats_record_iterations;
2711         if (record->flush_state != HAMMER_FST_FLUSH)
2712                 return(0);
2713
2714         if (record->flush_group != record->ip->flush_group) {
2715                 hdkprintf("rec %p ip %p bad flush group %p %p\n",
2716                         record,
2717                         record->ip,
2718                         record->flush_group,
2719                         record->ip->flush_group);
2720                 if (hammer_debug_critical)
2721                         Debugger("blah2");
2722                 return(0);
2723         }
2724         KKASSERT(record->flush_group == record->ip->flush_group);
2725
2726         /*
2727          * Interlock the record using the BE flag.  Once BE is set the
2728          * frontend cannot change the state of FE.
2729          *
2730          * NOTE: If FE is set prior to us setting BE we still sync the
2731          * record out, but the flush completion code converts it to
2732          * a delete-on-disk record instead of destroying it.
2733          */
2734         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2735         record->flags |= HAMMER_RECF_INTERLOCK_BE;
2736
2737         /*
2738          * The backend has already disposed of the record.
2739          */
2740         if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2741                 error = 0;
2742                 goto done;
2743         }
2744
2745         /*
2746          * If the whole inode is being deleted and all on-disk records will
2747          * be deleted very soon, we can't sync any new records to disk
2748          * because they will be deleted in the same transaction they were
2749          * created in (delete_tid == create_tid), which will assert.
2750          *
2751          * XXX There may be a case with RECORD_ADD with DELETED_FE set
2752          * that we currently panic on.
2753          */
2754         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2755                 switch(record->type) {
2756                 case HAMMER_MEM_RECORD_DATA:
2757                         /*
2758                          * We don't have to do anything, if the record was
2759                          * committed the space will have been accounted for
2760                          * in the blockmap.
2761                          */
2762                         /* fall through */
2763                 case HAMMER_MEM_RECORD_GENERAL:
2764                         /*
2765                          * Set deleted-by-backend flag.  Do not set the
2766                          * backend committed flag, because we are throwing
2767                          * the record away.
2768                          */
2769                         record->flags |= HAMMER_RECF_DELETED_BE;
2770                         ++record->ip->rec_generation;
2771                         error = 0;
2772                         goto done;
2773                 case HAMMER_MEM_RECORD_ADD:
2774                         hpanic("illegal add during inode deletion record %p",
2775                                 record);
2776                         break; /* NOT REACHED */
2777                 case HAMMER_MEM_RECORD_INODE:
2778                         hpanic("attempt to sync inode record %p?", record);
2779                         break; /* NOT REACHED */
2780                 case HAMMER_MEM_RECORD_DEL:
2781                         /*
2782                          * Follow through and issue the on-disk deletion
2783                          */
2784                         break;
2785                 }
2786         }
2787
2788         /*
2789          * If DELETED_FE is set special handling is needed for directory
2790          * entries.  Dependant pieces related to the directory entry may
2791          * have already been synced to disk.  If this occurs we have to
2792          * sync the directory entry and then change the in-memory record
2793          * from an ADD to a DELETE to cover the fact that it's been
2794          * deleted by the frontend.
2795          *
2796          * A directory delete covering record (MEM_RECORD_DEL) can never
2797          * be deleted by the frontend.
2798          *
2799          * Any other record type (aka DATA) can be deleted by the frontend.
2800          * XXX At the moment the flusher must skip it because there may
2801          * be another data record in the flush group for the same block,
2802          * meaning that some frontend data changes can leak into the backend's
2803          * synchronization point.
2804          */
2805         if (record->flags & HAMMER_RECF_DELETED_FE) {
2806                 if (record->type == HAMMER_MEM_RECORD_ADD) {
2807                         /*
2808                          * Convert a front-end deleted directory-add to
2809                          * a directory-delete entry later.
2810                          */
2811                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
2812                 } else {
2813                         /*
2814                          * Dispose of the record (race case).  Mark as
2815                          * deleted by backend (and not committed).
2816                          */
2817                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2818                         record->flags |= HAMMER_RECF_DELETED_BE;
2819                         ++record->ip->rec_generation;
2820                         error = 0;
2821                         goto done;
2822                 }
2823         }
2824
2825         /*
2826          * Assign the create_tid for new records.  Deletions already
2827          * have the record's entire key properly set up.
2828          */
2829         if (record->type != HAMMER_MEM_RECORD_DEL) {
2830                 record->leaf.base.create_tid = trans->tid;
2831                 record->leaf.create_ts = trans->time32;
2832         }
2833
2834         /*
2835          * This actually moves the record to the on-media B-Tree.  We
2836          * must also generate REDO_TERM entries in the UNDO/REDO FIFO
2837          * indicating that the related REDO_WRITE(s) have been committed.
2838          *
2839          * During recovery any REDO_TERM's within the nominal recovery span
2840          * are ignored since the related meta-data is being undone, causing
2841          * any matching REDO_WRITEs to execute.  The REDO_TERMs outside
2842          * the nominal recovery span will match against REDO_WRITEs and
2843          * prevent them from being executed (because the meta-data has
2844          * already been synchronized).
2845          */
2846         if (record->flags & HAMMER_RECF_REDO) {
2847                 KKASSERT(record->type == HAMMER_MEM_RECORD_DATA);
2848                 hammer_generate_redo(trans, record->ip,
2849                                      record->leaf.base.key -
2850                                          record->leaf.data_len,
2851                                      HAMMER_REDO_TERM_WRITE,
2852                                      NULL,
2853                                      record->leaf.data_len);
2854         }
2855
2856         for (;;) {
2857                 error = hammer_ip_sync_record_cursor(cursor, record);
2858                 if (error != EDEADLK)
2859                         break;
2860                 hammer_done_cursor(cursor);
2861                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2862                                            record->ip);
2863                 if (error)
2864                         break;
2865         }
2866         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2867
2868         if (error)
2869                 error = -error;
2870 done:
2871         hammer_flush_record_done(record, error);
2872
2873         /*
2874          * Do partial finalization if we have built up too many dirty
2875          * buffers.  Otherwise a buffer cache deadlock can occur when
2876          * doing things like creating tens of thousands of tiny files.
2877          *
2878          * We must release our cursor lock to avoid a 3-way deadlock
2879          * due to the exclusive sync lock the finalizer must get.
2880          *
2881          * WARNING: See warnings in hammer_unlock_cursor() function.
2882          */
2883         if (hammer_flusher_meta_limit(hmp) ||
2884             vm_page_count_severe()) {
2885                 hammer_unlock_cursor(cursor);
2886                 hammer_flusher_finalize(trans, 0);
2887                 hammer_lock_cursor(cursor);
2888         }
2889         return(error);
2890 }
2891
2892 /*
2893  * Backend function called by the flusher to sync an inode to media.
2894  */
2895 int
2896 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2897 {
2898         struct hammer_cursor cursor;
2899         hammer_node_t tmp_node;
2900         hammer_record_t depend;
2901         hammer_record_t next;
2902         int error, tmp_error;
2903         uint64_t nlinks;
2904
2905         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2906                 return(0);
2907
2908         error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2909         if (error)
2910                 goto done;
2911
2912         /*
2913          * Any directory records referencing this inode which are not in
2914          * our current flush group must adjust our nlink count for the
2915          * purposes of synchronizating to disk.
2916          *
2917          * Records which are in our flush group can be unlinked from our
2918          * inode now, potentially allowing the inode to be physically
2919          * deleted.
2920          *
2921          * This cannot block.
2922          */
2923         nlinks = ip->ino_data.nlinks;
2924         next = TAILQ_FIRST(&ip->target_list);
2925         while ((depend = next) != NULL) {
2926                 next = TAILQ_NEXT(depend, target_entry);
2927                 if (depend->flush_state == HAMMER_FST_FLUSH &&
2928                     depend->flush_group == ip->flush_group) {
2929                         /*
2930                          * If this is an ADD that was deleted by the frontend
2931                          * the frontend nlinks count will have already been
2932                          * decremented, but the backend is going to sync its
2933                          * directory entry and must account for it.  The
2934                          * record will be converted to a delete-on-disk when
2935                          * it gets synced.
2936                          *
2937                          * If the ADD was not deleted by the frontend we
2938                          * can remove the dependancy from our target_list.
2939                          */
2940                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
2941                                 ++nlinks;
2942                         } else {
2943                                 TAILQ_REMOVE(&ip->target_list, depend,
2944                                              target_entry);
2945                                 depend->target_ip = NULL;
2946                         }
2947                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2948                         /*
2949                          * Not part of our flush group and not deleted by
2950                          * the front-end, adjust the link count synced to
2951                          * the media (undo what the frontend did when it
2952                          * queued the record).
2953                          */
2954                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2955                         switch(depend->type) {
2956                         case HAMMER_MEM_RECORD_ADD:
2957                                 --nlinks;
2958                                 break;
2959                         case HAMMER_MEM_RECORD_DEL:
2960                                 ++nlinks;
2961                                 break;
2962                         default:
2963                                 break;
2964                         }
2965                 }
2966         }
2967
2968         /*
2969          * Set dirty if we had to modify the link count.
2970          */
2971         if (ip->sync_ino_data.nlinks != nlinks) {
2972                 KKASSERT((int64_t)nlinks >= 0);
2973                 ip->sync_ino_data.nlinks = nlinks;
2974                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2975         }
2976
2977         /*
2978          * If there is a trunction queued destroy any data past the (aligned)
2979          * truncation point.  Userland will have dealt with the buffer
2980          * containing the truncation point for us.
2981          *
2982          * We don't flush pending frontend data buffers until after we've
2983          * dealt with the truncation.
2984          */
2985         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2986                 /*
2987                  * Interlock trunc_off.  The VOP front-end may continue to
2988                  * make adjustments to it while we are blocked.
2989                  */
2990                 off_t trunc_off;
2991                 off_t aligned_trunc_off;
2992                 int blkmask;
2993
2994                 trunc_off = ip->sync_trunc_off;
2995                 blkmask = hammer_blocksize(trunc_off) - 1;
2996                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
2997
2998                 /*
2999                  * Delete any whole blocks on-media.  The front-end has
3000                  * already cleaned out any partial block and made it
3001                  * pending.  The front-end may have updated trunc_off
3002                  * while we were blocked so we only use sync_trunc_off.
3003                  *
3004                  * This operation can blow out the buffer cache, EWOULDBLOCK
3005                  * means we were unable to complete the deletion.  The
3006                  * deletion will update sync_trunc_off in that case.
3007                  */
3008                 error = hammer_ip_delete_range(&cursor, ip,
3009                                                 aligned_trunc_off,
3010                                                 0x7FFFFFFFFFFFFFFFLL, 2);
3011                 if (error == EWOULDBLOCK) {
3012                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
3013                         error = 0;
3014                         goto defer_buffer_flush;
3015                 }
3016
3017                 if (error)
3018                         goto done;
3019
3020                 /*
3021                  * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO.
3022                  *
3023                  * XXX we do this even if we did not previously generate
3024                  * a REDO_TRUNC record.  This operation may enclosed the
3025                  * range for multiple prior truncation entries in the REDO
3026                  * log.
3027                  */
3028                 if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR &&
3029                     (ip->flags & HAMMER_INODE_RDIRTY)) {
3030                         hammer_generate_redo(trans, ip, aligned_trunc_off,
3031                                              HAMMER_REDO_TERM_TRUNC,
3032                                              NULL, 0);
3033                 }
3034
3035                 /*
3036                  * Clear the truncation flag on the backend after we have
3037                  * completed the deletions.  Backend data is now good again
3038                  * (including new records we are about to sync, below).
3039                  *
3040                  * Leave sync_trunc_off intact.  As we write additional
3041                  * records the backend will update sync_trunc_off.  This
3042                  * tells the backend whether it can skip the overwrite
3043                  * test.  This should work properly even when the backend
3044                  * writes full blocks where the truncation point straddles
3045                  * the block because the comparison is against the base
3046                  * offset of the record.
3047                  */
3048                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3049                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
3050         } else {
3051                 error = 0;
3052         }
3053
3054         /*
3055          * Now sync related records.  These will typically be directory
3056          * entries, records tracking direct-writes, or delete-on-disk records.
3057          */
3058         if (error == 0) {
3059                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
3060                                     hammer_sync_record_callback, &cursor);
3061                 if (tmp_error < 0)
3062                         tmp_error = -error;
3063                 if (tmp_error)
3064                         error = tmp_error;
3065         }
3066         hammer_cache_node(&ip->cache[1], cursor.node);
3067
3068         /*
3069          * Re-seek for inode update, assuming our cache hasn't been ripped
3070          * out from under us.
3071          */
3072         if (error == 0) {
3073                 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
3074                 if (tmp_node) {
3075                         hammer_cursor_downgrade(&cursor);
3076                         hammer_lock_sh(&tmp_node->lock);
3077                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
3078                                 hammer_cursor_seek(&cursor, tmp_node, 0);
3079                         hammer_unlock(&tmp_node->lock);
3080                         hammer_rel_node(tmp_node);
3081                 }
3082                 error = 0;
3083         }
3084
3085         /*
3086          * If we are deleting the inode the frontend had better not have
3087          * any active references on elements making up the inode.
3088          *
3089          * The call to hammer_ip_delete_clean() cleans up auxillary records
3090          * but not DB or DATA records.  Those must have already been deleted
3091          * by the normal truncation mechanic.
3092          */
3093         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
3094                 RB_EMPTY(&ip->rec_tree)  &&
3095             (ip->sync_flags & HAMMER_INODE_DELETING) &&
3096             (ip->flags & HAMMER_INODE_DELETED) == 0) {
3097                 int count1 = 0;
3098
3099                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
3100                 if (error == 0) {
3101                         ip->flags |= HAMMER_INODE_DELETED;
3102                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
3103                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3104                         KKASSERT(RB_EMPTY(&ip->rec_tree));
3105
3106                         /*
3107                          * Set delete_tid in both the frontend and backend
3108                          * copy of the inode record.  The DELETED flag handles
3109                          * this, do not set DDIRTY.
3110                          */
3111                         ip->ino_leaf.base.delete_tid = trans->tid;
3112                         ip->sync_ino_leaf.base.delete_tid = trans->tid;
3113                         ip->ino_leaf.delete_ts = trans->time32;
3114                         ip->sync_ino_leaf.delete_ts = trans->time32;
3115
3116
3117                         /*
3118                          * Adjust the inode count in the volume header
3119                          */
3120                         hammer_sync_lock_sh(trans);
3121                         if (ip->flags & HAMMER_INODE_ONDISK) {
3122                                 hammer_modify_volume_field(trans,
3123                                                            trans->rootvol,
3124                                                            vol0_stat_inodes);
3125                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
3126                                 hammer_modify_volume_done(trans->rootvol);
3127                         }
3128                         hammer_sync_unlock(trans);
3129                 }
3130         }
3131
3132         if (error)
3133                 goto done;
3134         ip->sync_flags &= ~HAMMER_INODE_BUFS;
3135
3136 defer_buffer_flush:
3137         /*
3138          * Now update the inode's on-disk inode-data and/or on-disk record.
3139          * DELETED and ONDISK are managed only in ip->flags.
3140          *
3141          * In the case of a defered buffer flush we still update the on-disk
3142          * inode to satisfy visibility requirements if there happen to be
3143          * directory dependancies.
3144          */
3145         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
3146         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
3147                 /*
3148                  * If deleted and on-disk, don't set any additional flags.
3149                  * the delete flag takes care of things.
3150                  *
3151                  * Clear flags which may have been set by the frontend.
3152                  */
3153                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3154                                     HAMMER_INODE_SDIRTY |
3155                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3156                                     HAMMER_INODE_DELETING);
3157                 break;
3158         case HAMMER_INODE_DELETED:
3159                 /*
3160                  * Take care of the case where a deleted inode was never
3161                  * flushed to the disk in the first place.
3162                  *
3163                  * Clear flags which may have been set by the frontend.
3164                  */
3165                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3166                                     HAMMER_INODE_SDIRTY |
3167                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3168                                     HAMMER_INODE_DELETING);
3169                 while (RB_ROOT(&ip->rec_tree)) {
3170                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
3171                         hammer_ref(&record->lock);
3172                         KKASSERT(hammer_oneref(&record->lock));
3173                         record->flags |= HAMMER_RECF_DELETED_BE;
3174                         ++record->ip->rec_generation;
3175                         hammer_rel_mem_record(record);
3176                 }
3177                 break;
3178         case HAMMER_INODE_ONDISK:
3179                 /*
3180                  * If already on-disk, do not set any additional flags.
3181                  */
3182                 break;
3183         default:
3184                 /*
3185                  * If not on-disk and not deleted, set DDIRTY to force
3186                  * an initial record to be written.
3187                  *
3188                  * Also set the create_tid in both the frontend and backend
3189                  * copy of the inode record.
3190                  */
3191                 ip->ino_leaf.base.create_tid = trans->tid;
3192                 ip->ino_leaf.create_ts = trans->time32;
3193                 ip->sync_ino_leaf.base.create_tid = trans->tid;
3194                 ip->sync_ino_leaf.create_ts = trans->time32;
3195                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
3196                 break;
3197         }
3198
3199         /*
3200          * If DDIRTY or SDIRTY is set, write out a new record.
3201          * If the inode is already on-disk the old record is marked as
3202          * deleted.
3203          *
3204          * If DELETED is set hammer_update_inode() will delete the existing
3205          * record without writing out a new one.
3206          */
3207         if (ip->flags & HAMMER_INODE_DELETED) {
3208                 error = hammer_update_inode(&cursor, ip);
3209         } else
3210         if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
3211             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
3212                 error = hammer_update_itimes(&cursor, ip);
3213         } else
3214         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
3215                               HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
3216                 error = hammer_update_inode(&cursor, ip);
3217         }
3218 done:
3219         if (ip->flags & HAMMER_INODE_MODMASK)
3220                 hammer_inode_dirty(ip);
3221         if (error) {
3222                 hammer_critical_error(ip->hmp, ip, error,
3223                                       "while syncing inode");
3224         }
3225         hammer_done_cursor(&cursor);
3226         return(error);
3227 }
3228
3229 /*
3230  * This routine is called when the OS is no longer actively referencing
3231  * the inode (but might still be keeping it cached), or when releasing
3232  * the last reference to an inode.
3233  *
3234  * At this point if the inode's nlinks count is zero we want to destroy
3235  * it, which may mean destroying it on-media too.
3236  */
3237 void
3238 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
3239 {
3240         struct vnode *vp;
3241
3242         /*
3243          * Set the DELETING flag when the link count drops to 0 and the
3244          * OS no longer has any opens on the inode.
3245          *
3246          * The backend will clear DELETING (a mod flag) and set DELETED
3247          * (a state flag) when it is actually able to perform the
3248          * operation.
3249          *
3250          * Don't reflag the deletion if the flusher is currently syncing
3251          * one that was already flagged.  A previously set DELETING flag
3252          * may bounce around flags and sync_flags until the operation is
3253          * completely done.
3254          *
3255          * Do not attempt to modify a snapshot inode (one set to read-only).
3256          */
3257         if (ip->ino_data.nlinks == 0 &&
3258             ((ip->flags | ip->sync_flags) & (HAMMER_INODE_RO|HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
3259                 ip->flags |= HAMMER_INODE_DELETING;
3260                 ip->flags |= HAMMER_INODE_TRUNCATED;
3261                 ip->trunc_off = 0;
3262                 vp = NULL;
3263                 if (getvp) {
3264                         if (hammer_get_vnode(ip, &vp) != 0)
3265                                 return;
3266                 }
3267
3268                 /*
3269                  * Final cleanup
3270                  */
3271                 if (ip->vp)
3272                         nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0, 0);
3273                 if (ip->flags & HAMMER_INODE_MODMASK)
3274                         hammer_inode_dirty(ip);
3275                 if (getvp)
3276                         vput(vp);
3277         }
3278 }
3279
3280 /*
3281  * After potentially resolving a dependancy the inode is tested
3282  * to determine whether it needs to be reflushed.
3283  */
3284 void
3285 hammer_test_inode(hammer_inode_t ip)
3286 {
3287         if (ip->flags & HAMMER_INODE_REFLUSH) {
3288                 ip->flags &= ~HAMMER_INODE_REFLUSH;
3289                 hammer_ref(&ip->lock);
3290                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
3291                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
3292                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
3293                 } else {
3294                         hammer_flush_inode(ip, 0);
3295                 }
3296                 hammer_rel_inode(ip, 0);
3297         }
3298 }
3299
3300 /*
3301  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
3302  * reassociated with a vp or just before it gets freed.
3303  *
3304  * Pipeline wakeups to threads blocked due to an excessive number of
3305  * detached inodes.  This typically occurs when atime updates accumulate
3306  * while scanning a directory tree.
3307  */
3308 static void
3309 hammer_inode_wakereclaims(hammer_inode_t ip)
3310 {
3311         struct hammer_reclaim *reclaim;
3312         hammer_mount_t hmp = ip->hmp;
3313
3314         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
3315                 return;
3316
3317         --hammer_count_reclaims;
3318         --hmp->count_reclaims;
3319         ip->flags &= ~HAMMER_INODE_RECLAIM;
3320
3321         if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3322                 KKASSERT(reclaim->count > 0);
3323                 if (--reclaim->count == 0) {
3324                         TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3325                         wakeup(reclaim);
3326                 }
3327         }
3328 }
3329
3330 /*
3331  * Setup our reclaim pipeline.  We only let so many detached (and dirty)
3332  * inodes build up before we start blocking.  This routine is called
3333  * if a new inode is created or an inode is loaded from media.
3334  *
3335  * When we block we don't care *which* inode has finished reclaiming,
3336  * as long as one does.
3337  *
3338  * The reclaim pipeline is primarily governed by the auto-flush which is
3339  * 1/4 hammer_limit_reclaims.  We don't want to block if the count is
3340  * less than 1/2 hammer_limit_reclaims.  From 1/2 to full count is
3341  * dynamically governed.
3342  */
3343 void
3344 hammer_inode_waitreclaims(hammer_transaction_t trans)
3345 {
3346         hammer_mount_t hmp = trans->hmp;
3347         struct hammer_reclaim reclaim;
3348         int lower_limit;
3349
3350         /*
3351          * Track inode load, delay if the number of reclaiming inodes is
3352          * between 2/4 and 4/4 hammer_limit_reclaims, depending.
3353          */
3354         if (curthread->td_proc) {
3355                 struct hammer_inostats *stats;
3356
3357                 stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
3358                 ++stats->count;
3359
3360                 if (stats->count > hammer_limit_reclaims / 2)
3361                         stats->count = hammer_limit_reclaims / 2;
3362                 lower_limit = hammer_limit_reclaims - stats->count;
3363                 if (hammer_debug_general & 0x10000) {
3364                         hdkprintf("pid %5d limit %d\n",
3365                                 (int)curthread->td_proc->p_pid, lower_limit);
3366                 }
3367         } else {
3368                 lower_limit = hammer_limit_reclaims * 3 / 4;
3369         }
3370         if (hmp->count_reclaims >= lower_limit) {
3371                 reclaim.count = 1;
3372                 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3373                 tsleep(&reclaim, 0, "hmrrcm", hz);
3374                 if (reclaim.count > 0)
3375                         TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3376         }
3377 }
3378
3379 /*
3380  * Keep track of reclaim statistics on a per-pid basis using a loose
3381  * 4-way set associative hash table.  Collisions inherit the count of
3382  * the previous entry.
3383  *
3384  * NOTE: We want to be careful here to limit the chain size.  If the chain
3385  *       size is too large a pid will spread its stats out over too many
3386  *       entries under certain types of heavy filesystem activity and
3387  *       wind up not delaying long enough.
3388  */
3389 static
3390 struct hammer_inostats *
3391 hammer_inode_inostats(hammer_mount_t hmp, pid_t pid)
3392 {
3393         struct hammer_inostats *stats;
3394         int delta;
3395         int chain;
3396         static volatile int iterator;   /* we don't care about MP races */
3397
3398         /*
3399          * Chain up to 4 times to find our entry.
3400          */
3401         for (chain = 0; chain < 4; ++chain) {
3402                 stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK];
3403                 if (stats->pid == pid)
3404                         break;
3405         }
3406
3407         /*
3408          * Replace one of the four chaining entries with our new entry.
3409          */
3410         if (chain == 4) {
3411                 stats = &hmp->inostats[(pid + (iterator++ & 3)) &
3412                                        HAMMER_INOSTATS_HMASK];
3413                 stats->pid = pid;
3414         }
3415
3416         /*
3417          * Decay the entry
3418          */
3419         if (stats->count && stats->ltick != ticks) {
3420                 delta = ticks - stats->ltick;
3421                 stats->ltick = ticks;
3422                 if (delta <= 0 || delta > hz * 60)
3423                         stats->count = 0;
3424                 else
3425                         stats->count = stats->count * hz / (hz + delta);
3426         }
3427         if (hammer_debug_general & 0x10000)
3428                 hdkprintf("pid %5d stats %d\n", (int)pid, stats->count);
3429         return (stats);
3430 }
3431
3432 #if 0
3433
3434 /*
3435  * XXX not used, doesn't work very well due to the large batching nature
3436  * of flushes.
3437  *
3438  * A larger then normal backlog of inodes is sitting in the flusher,
3439  * enforce a general slowdown to let it catch up.  This routine is only
3440  * called on completion of a non-flusher-related transaction which
3441  * performed B-Tree node I/O.
3442  *
3443  * It is possible for the flusher to stall in a continuous load.
3444  * blogbench -i1000 -o seems to do a good job generating this sort of load.
3445  * If the flusher is unable to catch up the inode count can bloat until
3446  * we run out of kvm.
3447  *
3448  * This is a bit of a hack.
3449  */
3450 void
3451 hammer_inode_waithard(hammer_mount_t hmp)
3452 {
3453         /*
3454          * Hysteresis.
3455          */
3456         if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3457                 if (hmp->count_reclaims < hammer_limit_reclaims / 2 &&
3458                     hmp->count_iqueued < hmp->count_inodes / 20) {
3459                         hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3460                         return;
3461                 }
3462         } else {
3463                 if (hmp->count_reclaims < hammer_limit_reclaims ||
3464                     hmp->count_iqueued < hmp->count_inodes / 10) {
3465                         return;
3466                 }
3467                 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3468         }
3469
3470         /*
3471          * Block for one flush cycle.
3472          */
3473         hammer_flusher_wait_next(hmp);
3474 }
3475
3476 #endif