sys/vfs/hammer/hammer_inode.c

   1 /*
   2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.99 2008/07/11 01:22:29 dillon Exp $
  35  */
  36
  37 #include "hammer.h"
  38 #include <vm/vm_extern.h>
  39 #include <sys/buf.h>
  40 #include <sys/buf2.h>
  41
  42 static int      hammer_unload_inode(struct hammer_inode *ip);
  43 static void     hammer_free_inode(hammer_inode_t ip);
  44 static void     hammer_flush_inode_core(hammer_inode_t ip, int flags);
  45 static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
  46 static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
  47 static int      hammer_setup_parent_inodes(hammer_inode_t ip);
  48 static int      hammer_setup_parent_inodes_helper(hammer_record_t record);
  49 static void     hammer_inode_wakereclaims(hammer_inode_t ip);
  50
  51 #ifdef DEBUG_TRUNCATE
  52 extern struct hammer_inode *HammerTruncIp;
  53 #endif
  54
  55 /*
  56  * RB-Tree support for inode structures
  57  */
  58 int
  59 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
  60 {
  61         if (ip1->obj_localization < ip2->obj_localization)
  62                 return(-1);
  63         if (ip1->obj_localization > ip2->obj_localization)
  64                 return(1);
  65         if (ip1->obj_id < ip2->obj_id)
  66                 return(-1);
  67         if (ip1->obj_id > ip2->obj_id)
  68                 return(1);
  69         if (ip1->obj_asof < ip2->obj_asof)
  70                 return(-1);
  71         if (ip1->obj_asof > ip2->obj_asof)
  72                 return(1);
  73         return(0);
  74 }
  75
  76 /*
  77  * RB-Tree support for inode structures / special LOOKUP_INFO
  78  */
  79 static int
  80 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
  81 {
  82         if (info->obj_localization < ip->obj_localization)
  83                 return(-1);
  84         if (info->obj_localization > ip->obj_localization)
  85                 return(1);
  86         if (info->obj_id < ip->obj_id)
  87                 return(-1);
  88         if (info->obj_id > ip->obj_id)
  89                 return(1);
  90         if (info->obj_asof < ip->obj_asof)
  91                 return(-1);
  92         if (info->obj_asof > ip->obj_asof)
  93                 return(1);
  94         return(0);
  95 }
  96
  97 /*
  98  * Used by hammer_scan_inode_snapshots() to locate all of an object's
  99  * snapshots.  Note that the asof field is not tested, which we can get
 100  * away with because it is the lowest-priority field.
 101  */
 102 static int
 103 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
 104 {
 105         hammer_inode_info_t info = data;
 106
 107         if (ip->obj_localization > info->obj_localization)
 108                 return(1);
 109         if (ip->obj_localization < info->obj_localization)
 110                 return(-1);
 111         if (ip->obj_id > info->obj_id)
 112                 return(1);
 113         if (ip->obj_id < info->obj_id)
 114                 return(-1);
 115         return(0);
 116 }
 117
 118 /*
 119  * RB-Tree support for pseudofs structures
 120  */
 121 static int
 122 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
 123 {
 124         if (p1->localization < p2->localization)
 125                 return(-1);
 126         if (p1->localization > p2->localization)
 127                 return(1);
 128         return(0);
 129 }
 130
 131
 132 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
 133 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
 134                 hammer_inode_info_cmp, hammer_inode_info_t);
 135 RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
 136              hammer_pfs_rb_compare, u_int32_t, localization);
 137
 138 /*
 139  * The kernel is not actively referencing this vnode but is still holding
 140  * it cached.
 141  *
 142  * This is called from the frontend.
 143  */
 144 int
 145 hammer_vop_inactive(struct vop_inactive_args *ap)
 146 {
 147         struct hammer_inode *ip = VTOI(ap->a_vp);
 148
 149         /*
 150          * Degenerate case
 151          */
 152         if (ip == NULL) {
 153                 vrecycle(ap->a_vp);
 154                 return(0);
 155         }
 156
 157         /*
 158          * If the inode no longer has visibility in the filesystem try to
 159          * recycle it immediately, even if the inode is dirty.  Recycling
 160          * it quickly allows the system to reclaim buffer cache and VM
 161          * resources which can matter a lot in a heavily loaded system.
 162          *
 163          * This can deadlock in vfsync() if we aren't careful.
 164          *
 165          * Do not queue the inode to the flusher if we still have visibility,
 166          * otherwise namespace calls such as chmod will unnecessarily generate
 167          * multiple inode updates.
 168          */
 169         hammer_inode_unloadable_check(ip, 0);
 170         if (ip->ino_data.nlinks == 0) {
 171                 if (ip->flags & HAMMER_INODE_MODMASK)
 172                         hammer_flush_inode(ip, 0);
 173                 vrecycle(ap->a_vp);
 174         }
 175         return(0);
 176 }
 177
 178 /*
 179  * Release the vnode association.  This is typically (but not always)
 180  * the last reference on the inode.
 181  *
 182  * Once the association is lost we are on our own with regards to
 183  * flushing the inode.
 184  */
 185 int
 186 hammer_vop_reclaim(struct vop_reclaim_args *ap)
 187 {
 188         struct hammer_inode *ip;
 189         hammer_mount_t hmp;
 190         struct vnode *vp;
 191
 192         vp = ap->a_vp;
 193
 194         if ((ip = vp->v_data) != NULL) {
 195                 hmp = ip->hmp;
 196                 vp->v_data = NULL;
 197                 ip->vp = NULL;
 198
 199                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
 200                         ++hammer_count_reclaiming;
 201                         ++hmp->inode_reclaims;
 202                         ip->flags |= HAMMER_INODE_RECLAIM;
 203                         if (hmp->inode_reclaims > HAMMER_RECLAIM_FLUSH &&
 204                             (hmp->inode_reclaims & 255) == 0) {
 205                                 hammer_flusher_async(hmp);
 206                         }
 207                 }
 208                 hammer_rel_inode(ip, 1);
 209         }
 210         return(0);
 211 }
 212
 213 /*
 214  * Return a locked vnode for the specified inode.  The inode must be
 215  * referenced but NOT LOCKED on entry and will remain referenced on
 216  * return.
 217  *
 218  * Called from the frontend.
 219  */
 220 int
 221 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
 222 {
 223         hammer_mount_t hmp;
 224         struct vnode *vp;
 225         int error = 0;
 226         u_int8_t obj_type;
 227
 228         hmp = ip->hmp;
 229
 230         for (;;) {
 231                 if ((vp = ip->vp) == NULL) {
 232                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
 233                         if (error)
 234                                 break;
 235                         hammer_lock_ex(&ip->lock);
 236                         if (ip->vp != NULL) {
 237                                 hammer_unlock(&ip->lock);
 238                                 vp->v_type = VBAD;
 239                                 vx_put(vp);
 240                                 continue;
 241                         }
 242                         hammer_ref(&ip->lock);
 243                         vp = *vpp;
 244                         ip->vp = vp;
 245
 246                         obj_type = ip->ino_data.obj_type;
 247                         vp->v_type = hammer_get_vnode_type(obj_type);
 248
 249                         hammer_inode_wakereclaims(ip);
 250
 251                         switch(ip->ino_data.obj_type) {
 252                         case HAMMER_OBJTYPE_CDEV:
 253                         case HAMMER_OBJTYPE_BDEV:
 254                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
 255                                 addaliasu(vp, ip->ino_data.rmajor,
 256                                           ip->ino_data.rminor);
 257                                 break;
 258                         case HAMMER_OBJTYPE_FIFO:
 259                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
 260                                 break;
 261                         default:
 262                                 break;
 263                         }
 264
 265                         /*
 266                          * Only mark as the root vnode if the ip is not
 267                          * historical, otherwise the VFS cache will get
 268                          * confused.  The other half of the special handling
 269                          * is in hammer_vop_nlookupdotdot().
 270                          *
 271                          * Pseudo-filesystem roots also do not count.
 272                          */
 273                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
 274                             ip->obj_asof == hmp->asof &&
 275                             ip->obj_localization == 0) {
 276                                 vp->v_flag |= VROOT;
 277                         }
 278
 279                         vp->v_data = (void *)ip;
 280                         /* vnode locked by getnewvnode() */
 281                         /* make related vnode dirty if inode dirty? */
 282                         hammer_unlock(&ip->lock);
 283                         if (vp->v_type == VREG)
 284                                 vinitvmio(vp, ip->ino_data.size);
 285                         break;
 286                 }
 287
 288                 /*
 289                  * loop if the vget fails (aka races), or if the vp
 290                  * no longer matches ip->vp.
 291                  */
 292                 if (vget(vp, LK_EXCLUSIVE) == 0) {
 293                         if (vp == ip->vp)
 294                                 break;
 295                         vput(vp);
 296                 }
 297         }
 298         *vpp = vp;
 299         return(error);
 300 }
 301
 302 /*
 303  * Locate all copies of the inode for obj_id compatible with the specified
 304  * asof, reference, and issue the related call-back.  This routine is used
 305  * for direct-io invalidation and does not create any new inodes.
 306  */
 307 void
 308 hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
 309                             int (*callback)(hammer_inode_t ip, void *data),
 310                             void *data)
 311 {
 312         hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
 313                                    hammer_inode_info_cmp_all_history,
 314                                    callback, iinfo);
 315 }
 316
 317 /*
 318  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
 319  * do not attach or detach the related vnode (use hammer_get_vnode() for
 320  * that).
 321  *
 322  * The flags argument is only applied for newly created inodes, and only
 323  * certain flags are inherited.
 324  *
 325  * Called from the frontend.
 326  */
 327 struct hammer_inode *
 328 hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
 329                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
 330                  int flags, int *errorp)
 331 {
 332         hammer_mount_t hmp = trans->hmp;
 333         struct hammer_inode_info iinfo;
 334         struct hammer_cursor cursor;
 335         struct hammer_inode *ip;
 336
 337
 338         /*
 339          * Determine if we already have an inode cached.  If we do then
 340          * we are golden.
 341          */
 342         iinfo.obj_id = obj_id;
 343         iinfo.obj_asof = asof;
 344         iinfo.obj_localization = localization;
 345 loop:
 346         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
 347         if (ip) {
 348                 hammer_ref(&ip->lock);
 349                 *errorp = 0;
 350                 return(ip);
 351         }
 352
 353         /*
 354          * Allocate a new inode structure and deal with races later.
 355          */
 356         ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
 357         ++hammer_count_inodes;
 358         ++hmp->count_inodes;
 359         ip->obj_id = obj_id;
 360         ip->obj_asof = iinfo.obj_asof;
 361         ip->obj_localization = localization;
 362         ip->hmp = hmp;
 363         ip->flags = flags & HAMMER_INODE_RO;
 364         ip->cache[0].ip = ip;
 365         ip->cache[1].ip = ip;
 366         if (hmp->ronly)
 367                 ip->flags |= HAMMER_INODE_RO;
 368         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
 369                 0x7FFFFFFFFFFFFFFFLL;
 370         RB_INIT(&ip->rec_tree);
 371         TAILQ_INIT(&ip->target_list);
 372         hammer_ref(&ip->lock);
 373
 374         /*
 375          * Locate the on-disk inode.  If this is a PFS root we always
 376          * access the current version of the root inode and (if it is not
 377          * a master) always access information under it with a snapshot
 378          * TID.
 379          */
 380 retry:
 381         hammer_init_cursor(trans, &cursor, (dip ? &dip->cache[0] : NULL), NULL);
 382         cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
 383         cursor.key_beg.obj_id = ip->obj_id;
 384         cursor.key_beg.key = 0;
 385         cursor.key_beg.create_tid = 0;
 386         cursor.key_beg.delete_tid = 0;
 387         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
 388         cursor.key_beg.obj_type = 0;
 389
 390         cursor.asof = iinfo.obj_asof;
 391         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
 392                        HAMMER_CURSOR_ASOF;
 393
 394         *errorp = hammer_btree_lookup(&cursor);
 395         if (*errorp == EDEADLK) {
 396                 hammer_done_cursor(&cursor);
 397                 goto retry;
 398         }
 399
 400         /*
 401          * On success the B-Tree lookup will hold the appropriate
 402          * buffer cache buffers and provide a pointer to the requested
 403          * information.  Copy the information to the in-memory inode
 404          * and cache the B-Tree node to improve future operations.
 405          */
 406         if (*errorp == 0) {
 407                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
 408                 ip->ino_data = cursor.data->inode;
 409
 410                 /*
 411                  * cache[0] tries to cache the location of the object inode.
 412                  * The assumption is that it is near the directory inode.
 413                  *
 414                  * cache[1] tries to cache the location of the object data.
 415                  * The assumption is that it is near the directory data.
 416                  */
 417                 hammer_cache_node(&ip->cache[0], cursor.node);
 418                 if (dip && dip->cache[1].node)
 419                         hammer_cache_node(&ip->cache[1], dip->cache[1].node);
 420
 421                 /*
 422                  * The file should not contain any data past the file size
 423                  * stored in the inode.  Setting save_trunc_off to the
 424                  * file size instead of max reduces B-Tree lookup overheads
 425                  * on append by allowing the flusher to avoid checking for
 426                  * record overwrites.
 427                  */
 428                 ip->save_trunc_off = ip->ino_data.size;
 429
 430                 /*
 431                  * Locate and assign the pseudofs management structure to
 432                  * the inode.
 433                  */
 434                 if (dip && dip->obj_localization == ip->obj_localization) {
 435                         ip->pfsm = dip->pfsm;
 436                         hammer_ref(&ip->pfsm->lock);
 437                 } else {
 438                         ip->pfsm = hammer_load_pseudofs(trans,
 439                                                         ip->obj_localization,
 440                                                         errorp);
 441                         *errorp = 0;    /* ignore ENOENT */
 442                 }
 443         }
 444
 445         /*
 446          * The inode is placed on the red-black tree and will be synced to
 447          * the media when flushed or by the filesystem sync.  If this races
 448          * another instantiation/lookup the insertion will fail.
 449          */
 450         if (*errorp == 0) {
 451                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
 452                         hammer_free_inode(ip);
 453                         hammer_done_cursor(&cursor);
 454                         goto loop;
 455                 }
 456                 ip->flags |= HAMMER_INODE_ONDISK;
 457         } else {
 458                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
 459                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
 460                         --hmp->rsv_inodes;
 461                 }
 462
 463                 hammer_free_inode(ip);
 464                 ip = NULL;
 465         }
 466         hammer_done_cursor(&cursor);
 467         return (ip);
 468 }
 469
 470 /*
 471  * Create a new filesystem object, returning the inode in *ipp.  The
 472  * returned inode will be referenced.  The inode is created in-memory.
 473  *
 474  * If pfsm is non-NULL the caller wishes to create the root inode for
 475  * a master PFS.
 476  */
 477 int
 478 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
 479                     struct ucred *cred, hammer_inode_t dip,
 480                     hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
 481 {
 482         hammer_mount_t hmp;
 483         hammer_inode_t ip;
 484         uid_t xuid;
 485         int error;
 486
 487         hmp = trans->hmp;
 488
 489         ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
 490         ++hammer_count_inodes;
 491         ++hmp->count_inodes;
 492
 493         if (pfsm) {
 494                 KKASSERT(pfsm->localization != 0);
 495                 ip->obj_id = HAMMER_OBJID_ROOT;
 496                 ip->obj_localization = pfsm->localization;
 497         } else {
 498                 KKASSERT(dip != NULL);
 499                 ip->obj_id = hammer_alloc_objid(hmp, dip);
 500                 ip->obj_localization = dip->obj_localization;
 501         }
 502
 503         KKASSERT(ip->obj_id != 0);
 504         ip->obj_asof = hmp->asof;
 505         ip->hmp = hmp;
 506         ip->flush_state = HAMMER_FST_IDLE;
 507         ip->flags = HAMMER_INODE_DDIRTY |
 508                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
 509         ip->cache[0].ip = ip;
 510         ip->cache[1].ip = ip;
 511
 512         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
 513         /* ip->save_trunc_off = 0; (already zero) */
 514         RB_INIT(&ip->rec_tree);
 515         TAILQ_INIT(&ip->target_list);
 516
 517         ip->ino_data.atime = trans->time;
 518         ip->ino_data.mtime = trans->time;
 519         ip->ino_data.size = 0;
 520         ip->ino_data.nlinks = 0;
 521
 522         /*
 523          * A nohistory designator on the parent directory is inherited by
 524          * the child.  We will do this even for pseudo-fs creation... the
 525          * sysad can turn it off.
 526          */
 527         if (dip) {
 528                 ip->ino_data.uflags = dip->ino_data.uflags &
 529                                       (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
 530         }
 531
 532         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
 533         ip->ino_leaf.base.localization = ip->obj_localization +
 534                                          HAMMER_LOCALIZE_INODE;
 535         ip->ino_leaf.base.obj_id = ip->obj_id;
 536         ip->ino_leaf.base.key = 0;
 537         ip->ino_leaf.base.create_tid = 0;
 538         ip->ino_leaf.base.delete_tid = 0;
 539         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
 540         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
 541
 542         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
 543         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
 544         ip->ino_data.mode = vap->va_mode;
 545         ip->ino_data.ctime = trans->time;
 546
 547         /*
 548          * Setup the ".." pointer.  This only needs to be done for directories
 549          * but we do it for all objects as a recovery aid.
 550          */
 551         if (dip)
 552                 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
 553 #if 0
 554         /*
 555          * The parent_obj_localization field only applies to pseudo-fs roots.
 556          * XXX this is no longer applicable, PFSs are no longer directly
 557          * tied into the parent's directory structure.
 558          */
 559         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
 560             ip->obj_id == HAMMER_OBJID_ROOT) {
 561                 ip->ino_data.ext.obj.parent_obj_localization =
 562                                                 dip->obj_localization;
 563         }
 564 #endif
 565
 566         switch(ip->ino_leaf.base.obj_type) {
 567         case HAMMER_OBJTYPE_CDEV:
 568         case HAMMER_OBJTYPE_BDEV:
 569                 ip->ino_data.rmajor = vap->va_rmajor;
 570                 ip->ino_data.rminor = vap->va_rminor;
 571                 break;
 572         default:
 573                 break;
 574         }
 575
 576         /*
 577          * Calculate default uid/gid and overwrite with information from
 578          * the vap.
 579          */
 580         if (dip) {
 581                 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
 582                 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
 583                                              xuid, cred, &vap->va_mode);
 584         } else {
 585                 xuid = 0;
 586         }
 587         ip->ino_data.mode = vap->va_mode;
 588
 589         if (vap->va_vaflags & VA_UID_UUID_VALID)
 590                 ip->ino_data.uid = vap->va_uid_uuid;
 591         else if (vap->va_uid != (uid_t)VNOVAL)
 592                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
 593         else
 594                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
 595
 596         if (vap->va_vaflags & VA_GID_UUID_VALID)
 597                 ip->ino_data.gid = vap->va_gid_uuid;
 598         else if (vap->va_gid != (gid_t)VNOVAL)
 599                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
 600         else if (dip)
 601                 ip->ino_data.gid = dip->ino_data.gid;
 602
 603         hammer_ref(&ip->lock);
 604
 605         if (pfsm) {
 606                 ip->pfsm = pfsm;
 607                 hammer_ref(&pfsm->lock);
 608                 error = 0;
 609         } else if (dip->obj_localization == ip->obj_localization) {
 610                 ip->pfsm = dip->pfsm;
 611                 hammer_ref(&ip->pfsm->lock);
 612                 error = 0;
 613         } else {
 614                 ip->pfsm = hammer_load_pseudofs(trans,
 615                                                 ip->obj_localization,
 616                                                 &error);
 617                 error = 0;      /* ignore ENOENT */
 618         }
 619
 620         if (error) {
 621                 hammer_free_inode(ip);
 622                 ip = NULL;
 623         } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
 624                 panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
 625                 /* not reached */
 626                 hammer_free_inode(ip);
 627         }
 628         *ipp = ip;
 629         return(error);
 630 }
 631
 632 /*
 633  * Final cleanup / freeing of an inode structure
 634  */
 635 static void
 636 hammer_free_inode(hammer_inode_t ip)
 637 {
 638         KKASSERT(ip->lock.refs == 1);
 639         hammer_uncache_node(&ip->cache[0]);
 640         hammer_uncache_node(&ip->cache[1]);
 641         hammer_inode_wakereclaims(ip);
 642         if (ip->objid_cache)
 643                 hammer_clear_objid(ip);
 644         --hammer_count_inodes;
 645         --ip->hmp->count_inodes;
 646         if (ip->pfsm) {
 647                 hammer_rel_pseudofs(ip->hmp, ip->pfsm);
 648                 ip->pfsm = NULL;
 649         }
 650         kfree(ip, M_HAMMER);
 651         ip = NULL;
 652 }
 653
 654 /*
 655  * Retrieve pseudo-fs data.  NULL will never be returned.
 656  *
 657  * If an error occurs *errorp will be set and a default template is returned,
 658  * otherwise *errorp is set to 0.  Typically when an error occurs it will
 659  * be ENOENT.
 660  */
 661 hammer_pseudofs_inmem_t
 662 hammer_load_pseudofs(hammer_transaction_t trans,
 663                      u_int32_t localization, int *errorp)
 664 {
 665         hammer_mount_t hmp = trans->hmp;
 666         hammer_inode_t ip;
 667         hammer_pseudofs_inmem_t pfsm;
 668         struct hammer_cursor cursor;
 669         int bytes;
 670
 671 retry:
 672         pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
 673         if (pfsm) {
 674                 hammer_ref(&pfsm->lock);
 675                 *errorp = 0;
 676                 return(pfsm);
 677         }
 678
 679         /*
 680          * PFS records are stored in the root inode (not the PFS root inode,
 681          * but the real root).  Avoid an infinite recursion if loading
 682          * the PFS for the real root.
 683          */
 684         if (localization) {
 685                 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
 686                                       HAMMER_MAX_TID,
 687                                       HAMMER_DEF_LOCALIZATION, 0, errorp);
 688         } else {
 689                 ip = NULL;
 690         }
 691
 692         pfsm = kmalloc(sizeof(*pfsm), M_HAMMER, M_WAITOK | M_ZERO);
 693         pfsm->localization = localization;
 694         pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
 695         pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
 696
 697         hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
 698         cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
 699                                       HAMMER_LOCALIZE_MISC;
 700         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
 701         cursor.key_beg.create_tid = 0;
 702         cursor.key_beg.delete_tid = 0;
 703         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
 704         cursor.key_beg.obj_type = 0;
 705         cursor.key_beg.key = localization;
 706         cursor.asof = HAMMER_MAX_TID;
 707         cursor.flags |= HAMMER_CURSOR_ASOF;
 708
 709         if (ip)
 710                 *errorp = hammer_ip_lookup(&cursor);
 711         else
 712                 *errorp = hammer_btree_lookup(&cursor);
 713         if (*errorp == 0) {
 714                 *errorp = hammer_ip_resolve_data(&cursor);
 715                 if (*errorp == 0) {
 716                         bytes = cursor.leaf->data_len;
 717                         if (bytes > sizeof(pfsm->pfsd))
 718                                 bytes = sizeof(pfsm->pfsd);
 719                         bcopy(cursor.data, &pfsm->pfsd, bytes);
 720                 }
 721         }
 722         hammer_done_cursor(&cursor);
 723
 724         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
 725         hammer_ref(&pfsm->lock);
 726         if (ip)
 727                 hammer_rel_inode(ip, 0);
 728         if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
 729                 kfree(pfsm, M_HAMMER);
 730                 goto retry;
 731         }
 732         return(pfsm);
 733 }
 734
 735 /*
 736  * Store pseudo-fs data.  The backend will automatically delete any prior
 737  * on-disk pseudo-fs data but we have to delete in-memory versions.
 738  */
 739 int
 740 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
 741 {
 742         struct hammer_cursor cursor;
 743         hammer_record_t record;
 744         hammer_inode_t ip;
 745         int error;
 746
 747         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
 748                               HAMMER_DEF_LOCALIZATION, 0, &error);
 749 retry:
 750         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
 751         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
 752         cursor.key_beg.localization = ip->obj_localization +
 753                                       HAMMER_LOCALIZE_MISC;
 754         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
 755         cursor.key_beg.create_tid = 0;
 756         cursor.key_beg.delete_tid = 0;
 757         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
 758         cursor.key_beg.obj_type = 0;
 759         cursor.key_beg.key = pfsm->localization;
 760         cursor.asof = HAMMER_MAX_TID;
 761         cursor.flags |= HAMMER_CURSOR_ASOF;
 762
 763         error = hammer_ip_lookup(&cursor);
 764         if (error == 0 && hammer_cursor_inmem(&cursor)) {
 765                 record = cursor.iprec;
 766                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
 767                         KKASSERT(cursor.deadlk_rec == NULL);
 768                         hammer_ref(&record->lock);
 769                         cursor.deadlk_rec = record;
 770                         error = EDEADLK;
 771                 } else {
 772                         record->flags |= HAMMER_RECF_DELETED_FE;
 773                         error = 0;
 774                 }
 775         }
 776         if (error == 0 || error == ENOENT) {
 777                 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
 778                 record->type = HAMMER_MEM_RECORD_GENERAL;
 779
 780                 record->leaf.base.localization = ip->obj_localization +
 781                                                  HAMMER_LOCALIZE_MISC;
 782                 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
 783                 record->leaf.base.key = pfsm->localization;
 784                 record->leaf.data_len = sizeof(pfsm->pfsd);
 785                 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
 786                 error = hammer_ip_add_record(trans, record);
 787         }
 788         hammer_done_cursor(&cursor);
 789         if (error == EDEADLK)
 790                 goto retry;
 791         hammer_rel_inode(ip, 0);
 792         return(error);
 793 }
 794
 795 /*
 796  * Create a root directory for a PFS if one does not alredy exist.
 797  *
 798  * The PFS root stands alone so we must also bump the nlinks count
 799  * to prevent it from being destroyed on release.
 800  */
 801 int
 802 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
 803                        hammer_pseudofs_inmem_t pfsm)
 804 {
 805         hammer_inode_t ip;
 806         struct vattr vap;
 807         int error;
 808
 809         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
 810                               pfsm->localization, 0, &error);
 811         if (ip == NULL) {
 812                 vattr_null(&vap);
 813                 vap.va_mode = 0755;
 814                 vap.va_type = VDIR;
 815                 error = hammer_create_inode(trans, &vap, cred, NULL, pfsm, &ip);
 816                 if (error == 0) {
 817                         ++ip->ino_data.nlinks;
 818                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
 819                 }
 820         }
 821         if (ip)
 822                 hammer_rel_inode(ip, 0);
 823         return(error);
 824 }
 825
 826 /*
 827  * Release a reference on a PFS
 828  */
 829 void
 830 hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
 831 {
 832         hammer_unref(&pfsm->lock);
 833         if (pfsm->lock.refs == 0) {
 834                 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
 835                 kfree(pfsm, M_HAMMER);
 836         }
 837 }
 838
/*
 * Called by hammer_sync_inode().
 *
 * Write the inode's synchronized meta-data (ip->sync_ino_leaf and
 * ip->sync_ino_data) to the media using the supplied cursor:
 *
 *  1. If the inode is ONDISK and not already DELONDISK, look up the
 *     current on-disk inode record and delete it as-of trans->tid.
 *  2. Unless the inode is flagged DELETED, write a new inode record
 *     stamped with trans->tid and, on the first write, bump the root
 *     volume's inode count.
 *
 * EDEADLK from the B-Tree operations is handled by tearing down and
 * re-initializing the cursor and retrying.  Returns 0 on success or the
 * first error that could not be retried.
 */
static int
hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
{
        hammer_transaction_t trans = cursor->trans;
        hammer_record_t record;
        int error;
        int redirty;

retry:
        error = 0;

        /*
         * If the inode has a presence on-disk then locate it and mark
         * it deleted, setting DELONDISK.
         *
         * The record may or may not be physically deleted, depending on
         * the retention policy.
         */
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
            HAMMER_INODE_ONDISK) {
                hammer_normalize_cursor(cursor);
                cursor->key_beg.localization = ip->obj_localization +
                                               HAMMER_LOCALIZE_INODE;
                cursor->key_beg.obj_id = ip->obj_id;
                cursor->key_beg.key = 0;
                cursor->key_beg.create_tid = 0;
                cursor->key_beg.delete_tid = 0;
                cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
                cursor->key_beg.obj_type = 0;
                cursor->asof = ip->obj_asof;
                cursor->flags &= ~HAMMER_CURSOR_INITMASK;
                cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
                cursor->flags |= HAMMER_CURSOR_BACKEND;

                /*
                 * The IPDEL debug line is intentionally left without a
                 * newline; it is completed by the " error %d\n" print
                 * after the delete attempt below.
                 */
                error = hammer_btree_lookup(cursor);
                if (hammer_debug_inode)
                        kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
                if (error) {
                        /* Any lookup failure drops into the debugger. */
                        kprintf("error %d\n", error);
                        Debugger("hammer_update_inode");
                }

                if (error == 0) {
                        error = hammer_ip_delete_record(cursor, ip, trans->tid);
                        if (hammer_debug_inode)
                                kprintf(" error %d\n", error);
                        /* EDEADLK is retried below, anything else is fatal */
                        if (error && error != EDEADLK) {
                                kprintf("error %d\n", error);
                                Debugger("hammer_update_inode2");
                        }
                        if (error == 0) {
                                ip->flags |= HAMMER_INODE_DELONDISK;
                        }
                        /* Remember the node for later cursor initialization */
                        if (cursor->node)
                                hammer_cache_node(&ip->cache[0], cursor->node);
                }
                if (error == EDEADLK) {
                        /*
                         * Deadlock: rebuild the cursor from scratch and
                         * start over.  If the re-init itself fails we fall
                         * through with that error.
                         */
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (hammer_debug_inode)
                                kprintf("IPDED %p %d\n", ip, error);
                        if (error == 0)
                                goto retry;
                }
        }

        /*
         * Ok, write out the initial record or a new record (after deleting
         * the old one), unless the DELETED flag is set.  This routine will
         * clear DELONDISK if it writes out a record.
         *
         * Update our inode statistics if this is the first application of
         * the inode on-disk.
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
                /*
                 * Generate a record and write it to the media.
                 *
                 * The record is allocated with no data space of its own;
                 * record->data is pointed directly at ip->sync_ino_data.
                 * It is flagged as interlocked by the backend for the
                 * duration of the sync.
                 */
                record = hammer_alloc_mem_record(ip, 0);
                record->type = HAMMER_MEM_RECORD_INODE;
                record->flush_state = HAMMER_FST_FLUSH;
                record->leaf = ip->sync_ino_leaf;
                record->leaf.base.create_tid = trans->tid;
                record->leaf.data_len = sizeof(ip->sync_ino_data);
                record->leaf.create_ts = trans->time32;
                record->data = (void *)&ip->sync_ino_data;
                record->flags |= HAMMER_RECF_INTERLOCK_BE;

                /*
                 * If this flag is set we cannot sync the new file size
                 * because we haven't finished related truncations.  The
                 * inode will be flushed in another flush group to finish
                 * the job.
                 *
                 * redirty causes DDIRTY to be set again in sync_flags
                 * after a successful write below.
                 */
                if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
                    ip->sync_ino_data.size != ip->ino_data.size) {
                        redirty = 1;
                        ip->sync_ino_data.size = ip->ino_data.size;
                } else {
                        redirty = 0;
                }

                /*
                 * Sync the record, re-initializing the cursor and trying
                 * again for as long as we get EDEADLK.  The loop ends on
                 * success, on any other error, or if the cursor cannot be
                 * re-initialized.
                 */
                for (;;) {
                        error = hammer_ip_sync_record_cursor(cursor, record);
                        if (hammer_debug_inode)
                                kprintf("GENREC %p rec %08x %d\n",
                                        ip, record->flags, error);
                        if (error != EDEADLK)
                                break;
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (hammer_debug_inode)
                                kprintf("GENREC reinit %d\n", error);
                        if (error)
                                break;
                }
                if (error) {
                        kprintf("error %d\n", error);
                        Debugger("hammer_update_inode3");
                }

                /*
                 * The record isn't managed by the inode's record tree,
                 * destroy it whether we succeed or fail.
                 */
                record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
                record->flags |= HAMMER_RECF_DELETED_FE;
                record->flush_state = HAMMER_FST_IDLE;
                hammer_rel_mem_record(record);

                /*
                 * Finish up.  The data/time dirty bits in sync_flags are
                 * now clean and the inode no longer has a deleted-on-disk
                 * record outstanding.
                 */
                if (error == 0) {
                        if (hammer_debug_inode)
                                kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
                        ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
                                            HAMMER_INODE_ATIME |
                                            HAMMER_INODE_MTIME);
                        ip->flags &= ~HAMMER_INODE_DELONDISK;
                        if (redirty)
                                ip->sync_flags |= HAMMER_INODE_DDIRTY;

                        /*
                         * Root volume count of inodes.  Only bumped the
                         * first time the inode reaches the media, which is
                         * also when ONDISK gets set.
                         */
                        hammer_sync_lock_sh(trans);
                        if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
                                hammer_modify_volume_field(trans,
                                                           trans->rootvol,
                                                           vol0_stat_inodes);
                                ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
                                hammer_modify_volume_done(trans->rootvol);
                                ip->flags |= HAMMER_INODE_ONDISK;
                                if (hammer_debug_inode)
                                        kprintf("NOWONDISK %p\n", ip);
                        }
                        hammer_sync_unlock(trans);
                }
        }

        /*
         * If the inode has been destroyed, clean out any left-over flags
         * that may have been set by the frontend.
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
                ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
                                    HAMMER_INODE_ATIME |
                                    HAMMER_INODE_MTIME);
        }
        return(error);
}
1016
1017 /*
1018  * Update only the itimes fields.
1019  *
1020  * ATIME can be updated without generating any UNDO.  MTIME is updated
1021  * with UNDO so it is guaranteed to be synchronized properly in case of
1022  * a crash.
1023  *
1024  * Neither field is included in the B-Tree leaf element's CRC, which is how
1025  * we can get away with updating ATIME the way we do.
1026  */
1027 static int
1028 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1029 {
1030         hammer_transaction_t trans = cursor->trans;
1031         int error;
1032
1033 retry:
1034         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1035             HAMMER_INODE_ONDISK) {
1036                 return(0);
1037         }
1038
1039         hammer_normalize_cursor(cursor);
1040         cursor->key_beg.localization = ip->obj_localization +
1041                                        HAMMER_LOCALIZE_INODE;
1042         cursor->key_beg.obj_id = ip->obj_id;
1043         cursor->key_beg.key = 0;
1044         cursor->key_beg.create_tid = 0;
1045         cursor->key_beg.delete_tid = 0;
1046         cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1047         cursor->key_beg.obj_type = 0;
1048         cursor->asof = ip->obj_asof;
1049         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1050         cursor->flags |= HAMMER_CURSOR_ASOF;
1051         cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1052         cursor->flags |= HAMMER_CURSOR_GET_DATA;
1053         cursor->flags |= HAMMER_CURSOR_BACKEND;
1054
1055         error = hammer_btree_lookup(cursor);
1056         if (error) {
1057                 kprintf("error %d\n", error);
1058                 Debugger("hammer_update_itimes1");
1059         }
1060         if (error == 0) {
1061                 hammer_cache_node(&ip->cache[0], cursor->node);
1062                 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1063                         /*
1064                          * Updating MTIME requires an UNDO.  Just cover
1065                          * both atime and mtime.
1066                          */
1067                         hammer_sync_lock_sh(trans);
1068                         hammer_modify_buffer(trans, cursor->data_buffer,
1069                                      HAMMER_ITIMES_BASE(&cursor->data->inode),
1070                                      HAMMER_ITIMES_BYTES);
1071                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1072                         cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1073                         hammer_modify_buffer_done(cursor->data_buffer);
1074                         hammer_sync_unlock(trans);
1075                 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1076                         /*
1077                          * Updating atime only can be done in-place with
1078                          * no UNDO.
1079                          */
1080                         hammer_sync_lock_sh(trans);
1081                         hammer_modify_buffer(trans, cursor->data_buffer,
1082                                              NULL, 0);
1083                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1084                         hammer_modify_buffer_done(cursor->data_buffer);
1085                         hammer_sync_unlock(trans);
1086                 }
1087                 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1088         }
1089         if (error == EDEADLK) {
1090                 hammer_done_cursor(cursor);
1091                 error = hammer_init_cursor(trans, cursor,
1092                                            &ip->cache[0], ip);
1093                 if (error == 0)
1094                         goto retry;
1095         }
1096         return(error);
1097 }
1098
1099 /*
1100  * Release a reference on an inode, flush as requested.
1101  *
1102  * On the last reference we queue the inode to the flusher for its final
1103  * disposition.
1104  */
1105 void
1106 hammer_rel_inode(struct hammer_inode *ip, int flush)
1107 {
1108         hammer_mount_t hmp = ip->hmp;
1109
1110         /*
1111          * Handle disposition when dropping the last ref.
1112          */
1113         for (;;) {
1114                 if (ip->lock.refs == 1) {
1115                         /*
1116                          * Determine whether on-disk action is needed for
1117                          * the inode's final disposition.
1118                          */
1119                         KKASSERT(ip->vp == NULL);
1120                         hammer_inode_unloadable_check(ip, 0);
1121                         if (ip->flags & HAMMER_INODE_MODMASK) {
1122                                 if (hmp->rsv_inodes > desiredvnodes) {
1123                                         hammer_flush_inode(ip,
1124                                                            HAMMER_FLUSH_SIGNAL);
1125                                 } else {
1126                                         hammer_flush_inode(ip, 0);
1127                                 }
1128                         } else if (ip->lock.refs == 1) {
1129                                 hammer_unload_inode(ip);
1130                                 break;
1131                         }
1132                 } else {
1133                         if (flush)
1134                                 hammer_flush_inode(ip, 0);
1135
1136                         /*
1137                          * The inode still has multiple refs, try to drop
1138                          * one ref.
1139                          */
1140                         KKASSERT(ip->lock.refs >= 1);
1141                         if (ip->lock.refs > 1) {
1142                                 hammer_unref(&ip->lock);
1143                                 break;
1144                         }
1145                 }
1146         }
1147 }
1148
1149 /*
1150  * Unload and destroy the specified inode.  Must be called with one remaining
1151  * reference.  The reference is disposed of.
1152  *
1153  * This can only be called in the context of the flusher.
1154  */
1155 static int
1156 hammer_unload_inode(struct hammer_inode *ip)
1157 {
1158         hammer_mount_t hmp = ip->hmp;
1159
1160         KASSERT(ip->lock.refs == 1,
1161                 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
1162         KKASSERT(ip->vp == NULL);
1163         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1164         KKASSERT(ip->cursor_ip_refs == 0);
1165         KKASSERT(ip->lock.lockcount == 0);
1166         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1167
1168         KKASSERT(RB_EMPTY(&ip->rec_tree));
1169         KKASSERT(TAILQ_EMPTY(&ip->target_list));
1170
1171         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1172
1173         hammer_free_inode(ip);
1174         return(0);
1175 }
1176
1177 /*
1178  * Called on mount -u when switching from RW to RO or vise-versa.  Adjust
1179  * the read-only flag for cached inodes.
1180  *
1181  * This routine is called from a RB_SCAN().
1182  */
1183 int
1184 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1185 {
1186         hammer_mount_t hmp = ip->hmp;
1187
1188         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1189                 ip->flags |= HAMMER_INODE_RO;
1190         else
1191                 ip->flags &= ~HAMMER_INODE_RO;
1192         return(0);
1193 }
1194
1195 /*
1196  * A transaction has modified an inode, requiring updates as specified by
1197  * the passed flags.
1198  *
1199  * HAMMER_INODE_DDIRTY: Inode data has been updated
1200  * HAMMER_INODE_XDIRTY: Dirty in-memory records
1201  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
1202  * HAMMER_INODE_DELETED: Inode record/data must be deleted
1203  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1204  */
1205 void
1206 hammer_modify_inode(hammer_inode_t ip, int flags)
1207 {
1208         KKASSERT(ip->hmp->ronly == 0 ||
1209                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
1210                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1211                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1212         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1213                 ip->flags |= HAMMER_INODE_RSV_INODES;
1214                 ++ip->hmp->rsv_inodes;
1215         }
1216
1217         ip->flags |= flags;
1218 }
1219
1220 /*
1221  * Request that an inode be flushed.  This whole mess cannot block and may
1222  * recurse (if not synchronous).  Once requested HAMMER will attempt to
1223  * actively flush the inode until the flush can be done.
1224  *
1225  * The inode may already be flushing, or may be in a setup state.  We can
1226  * place the inode in a flushing state if it is currently idle and flag it
1227  * to reflush if it is currently flushing.
1228  *
1229  * If the HAMMER_FLUSH_SYNCHRONOUS flag is specified we will attempt to
1230  * flush the indoe synchronously using the caller's context.
1231  */
1232 void
1233 hammer_flush_inode(hammer_inode_t ip, int flags)
1234 {
1235         int good;
1236
1237         /*
1238          * Trivial 'nothing to flush' case.  If the inode is ina SETUP
1239          * state we have to put it back into an IDLE state so we can
1240          * drop the extra ref.
1241          */
1242         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1243                 if (ip->flush_state == HAMMER_FST_SETUP) {
1244                         ip->flush_state = HAMMER_FST_IDLE;
1245                         hammer_rel_inode(ip, 0);
1246                 }
1247                 return;
1248         }
1249
1250         /*
1251          * Our flush action will depend on the current state.
1252          */
1253         switch(ip->flush_state) {
1254         case HAMMER_FST_IDLE:
1255                 /*
1256                  * We have no dependancies and can flush immediately.  Some
1257                  * our children may not be flushable so we have to re-test
1258                  * with that additional knowledge.
1259                  */
1260                 hammer_flush_inode_core(ip, flags);
1261                 break;
1262         case HAMMER_FST_SETUP:
1263                 /*
1264                  * Recurse upwards through dependancies via target_list
1265                  * and start their flusher actions going if possible.
1266                  *
1267                  * 'good' is our connectivity.  -1 means we have none and
1268                  * can't flush, 0 means there weren't any dependancies, and
1269                  * 1 means we have good connectivity.
1270                  */
1271                 good = hammer_setup_parent_inodes(ip);
1272
1273                 /*
1274                  * We can continue if good >= 0.  Determine how many records
1275                  * under our inode can be flushed (and mark them).
1276                  */
1277                 if (good >= 0) {
1278                         hammer_flush_inode_core(ip, flags);
1279                 } else {
1280                         ip->flags |= HAMMER_INODE_REFLUSH;
1281                         if (flags & HAMMER_FLUSH_SIGNAL) {
1282                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1283                                 hammer_flusher_async(ip->hmp);
1284                         }
1285                 }
1286                 break;
1287         default:
1288                 /*
1289                  * We are already flushing, flag the inode to reflush
1290                  * if needed after it completes its current flush.
1291                  */
1292                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1293                         ip->flags |= HAMMER_INODE_REFLUSH;
1294                 if (flags & HAMMER_FLUSH_SIGNAL) {
1295                         ip->flags |= HAMMER_INODE_RESIGNAL;
1296                         hammer_flusher_async(ip->hmp);
1297                 }
1298                 break;
1299         }
1300 }
1301
1302 /*
1303  * Scan ip->target_list, which is a list of records owned by PARENTS to our
1304  * ip which reference our ip.
1305  *
1306  * XXX This is a huge mess of recursive code, but not one bit of it blocks
1307  *     so for now do not ref/deref the structures.  Note that if we use the
1308  *     ref/rel code later, the rel CAN block.
1309  */
1310 static int
1311 hammer_setup_parent_inodes(hammer_inode_t ip)
1312 {
1313         hammer_record_t depend;
1314 #if 0
1315         hammer_record_t next;
1316         hammer_inode_t  pip;
1317 #endif
1318         int good;
1319         int r;
1320
1321         good = 0;
1322         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1323                 r = hammer_setup_parent_inodes_helper(depend);
1324                 KKASSERT(depend->target_ip == ip);
1325                 if (r < 0 && good == 0)
1326                         good = -1;
1327                 if (r > 0)
1328                         good = 1;
1329         }
1330         return(good);
1331
1332 #if 0
1333 retry:
1334         good = 0;
1335         next = TAILQ_FIRST(&ip->target_list);
1336         if (next) {
1337                 hammer_ref(&next->lock);
1338                 hammer_ref(&next->ip->lock);
1339         }
1340         while ((depend = next) != NULL) {
1341                 if (depend->target_ip == NULL) {
1342                         pip = depend->ip;
1343                         hammer_rel_mem_record(depend);
1344                         hammer_rel_inode(pip, 0);
1345                         goto retry;
1346                 }
1347                 KKASSERT(depend->target_ip == ip);
1348                 next = TAILQ_NEXT(depend, target_entry);
1349                 if (next) {
1350                         hammer_ref(&next->lock);
1351                         hammer_ref(&next->ip->lock);
1352                 }
1353                 r = hammer_setup_parent_inodes_helper(depend);
1354                 if (r < 0 && good == 0)
1355                         good = -1;
1356                 if (r > 0)
1357                         good = 1;
1358                 pip = depend->ip;
1359                 hammer_rel_mem_record(depend);
1360                 hammer_rel_inode(pip, 0);
1361         }
1362         return(good);
1363 #endif
1364 }
1365
/*
 * This helper function takes a record representing the dependency between
 * the parent inode and child inode.
 *
 * record->ip           = parent inode
 * record->target_ip    = child inode
 *
 * We are asked to recurse upwards and convert the record from SETUP
 * to FLUSH if possible.  The record must not be IDLE.
 *
 * Return 1 if the record gives us connectivity (it is an ADD record in
 * the current flush group).
 *
 * Return 0 if the record is not relevant (it is flushable in the current
 * flush group but is not an ADD record).
 *
 * Return -1 if we can't resolve the dependency and there is no
 * connectivity.  In every such case the parent inode is flagged REFLUSH.
 */
static int
hammer_setup_parent_inodes_helper(hammer_record_t record)
{
        hammer_mount_t hmp;
        hammer_inode_t pip;
        int good;

        KKASSERT(record->flush_state != HAMMER_FST_IDLE);
        pip = record->ip;
        hmp = pip->hmp;

        /*
         * If the record is already flushing, is it in our flush group?
         *
         * If it is not, we cannot use it; flag the parent for a reflush
         * and report no connectivity.
         *
         * If it is in our flush group but it is a general record or a
         * delete-on-disk, it does not improve our connectivity (return 0).
         * The additional target-inode test for delete-on-disk records is
         * currently compiled out (see the #if 0 section further down).
         */
        if (record->flush_state == HAMMER_FST_FLUSH) {
                if (record->flush_group != hmp->flusher.next) {
                        pip->flags |= HAMMER_INODE_REFLUSH;
                        return(-1);
                }
                if (record->type == HAMMER_MEM_RECORD_ADD)
                        return(1);
                /* GENERAL or DEL */
                return(0);
        }

        /*
         * It must be a setup record.  Try to resolve the setup dependencies
         * by recursing upwards so we can place ip on the flush list.
         */
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);

        good = hammer_setup_parent_inodes(pip);

        /*
         * We can't flush ip because it has no connectivity (XXX also check
         * nlinks for pre-existing connectivity!).  Flag it so any resolution
         * recurses back down.
         */
        if (good < 0) {
                pip->flags |= HAMMER_INODE_REFLUSH;
                return(good);
        }

        /*
         * We are go, place the parent inode in a flushing state so we can
         * place its record in a flushing state.  Note that the parent
         * may already be flushing.  The record must be in the same flush
         * group as the parent.
         *
         * The asserts verify that the parent really is flushing now and
         * that the record was left in SETUP by the call above.
         */
        if (pip->flush_state != HAMMER_FST_FLUSH)
                hammer_flush_inode_core(pip, HAMMER_FLUSH_RECURSION);
        KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);

#if 0
        if (record->type == HAMMER_MEM_RECORD_DEL &&
            (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
                /*
                 * Regardless of flushing state we cannot sync this path if the
                 * record represents a delete-on-disk but the target inode
                 * is not ready to sync its own deletion.
                 *
                 * XXX need to count effective nlinks to determine whether
                 * the flush is ok, otherwise removing a hardlink will
                 * just leave the DEL record to rot.
                 */
                record->target_ip->flags |= HAMMER_INODE_REFLUSH;
                return(-1);
        } else
#endif
        /*
         * The parent is flushing; the record can only join it if the
         * parent is in the flush group currently being assembled.
         */
        if (pip->flush_group == pip->hmp->flusher.next) {
                /*
                 * This is the record we wanted to synchronize.  If the
                 * record went into a flush state while we blocked it
                 * had better be in the correct flush group.
                 *
                 * A record moved to FLUSH here picks up the parent's
                 * flush group and an additional reference.
                 */
                if (record->flush_state != HAMMER_FST_FLUSH) {
                        record->flush_state = HAMMER_FST_FLUSH;
                        record->flush_group = pip->flush_group;
                        hammer_ref(&record->lock);
                } else {
                        KKASSERT(record->flush_group == pip->flush_group);
                }
                if (record->type == HAMMER_MEM_RECORD_ADD)
                        return(1);

                /*
                 * A general or delete-on-disk record does not contribute
                 * to our visibility.  We can still flush it, however.
                 */
                return(0);
        } else {
                /*
                 * We couldn't resolve the dependencies, request that the
                 * inode be flushed when the dependencies can be resolved.
                 */
                pip->flags |= HAMMER_INODE_REFLUSH;
                return(-1);
        }
}
1487
1488 /*
1489  * This is the core routine placing an inode into the FST_FLUSH state.
1490  */
1491 static void
1492 hammer_flush_inode_core(hammer_inode_t ip, int flags)
1493 {
1494         int go_count;
1495
1496         /*
1497          * Set flush state and prevent the flusher from cycling into
1498          * the next flush group.  Do not place the ip on the list yet.
1499          * Inodes not in the idle state get an extra reference.
1500          */
1501         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1502         if (ip->flush_state == HAMMER_FST_IDLE)
1503                 hammer_ref(&ip->lock);
1504         ip->flush_state = HAMMER_FST_FLUSH;
1505         ip->flush_group = ip->hmp->flusher.next;
1506         ++ip->hmp->flusher.group_lock;
1507         ++ip->hmp->count_iqueued;
1508         ++hammer_count_iqueued;
1509
1510         /*
1511          * We need to be able to vfsync/truncate from the backend.
1512          */
1513         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
1514         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
1515                 ip->flags |= HAMMER_INODE_VHELD;
1516                 vref(ip->vp);
1517         }
1518
1519         /*
1520          * Figure out how many in-memory records we can actually flush
1521          * (not including inode meta-data, buffers, etc).
1522          *
1523          * Do not add new records to the flush if this is a recursion or
1524          * if we must still complete a flush from the previous flush cycle.
1525          */
1526         if (flags & HAMMER_FLUSH_RECURSION) {
1527                 go_count = 1;
1528         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1529                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1530                                    hammer_syncgrp_child_callback, NULL);
1531                 go_count = 1;
1532         } else {
1533                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1534                                    hammer_setup_child_callback, NULL);
1535         }
1536
1537         /*
1538          * This is a more involved test that includes go_count.  If we
1539          * can't flush, flag the inode and return.  If go_count is 0 we
1540          * were are unable to flush any records in our rec_tree and
1541          * must ignore the XDIRTY flag.
1542          */
1543         if (go_count == 0) {
1544                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
1545                         ip->flags |= HAMMER_INODE_REFLUSH;
1546
1547                         --ip->hmp->count_iqueued;
1548                         --hammer_count_iqueued;
1549
1550                         ip->flush_state = HAMMER_FST_SETUP;
1551                         if (ip->flags & HAMMER_INODE_VHELD) {
1552                                 ip->flags &= ~HAMMER_INODE_VHELD;
1553                                 vrele(ip->vp);
1554                         }
1555                         if (flags & HAMMER_FLUSH_SIGNAL) {
1556                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1557                                 hammer_flusher_async(ip->hmp);
1558                         }
1559                         if (--ip->hmp->flusher.group_lock == 0)
1560                                 wakeup(&ip->hmp->flusher.group_lock);
1561                         return;
1562                 }
1563         }
1564
1565         /*
1566          * Snapshot the state of the inode for the backend flusher.
1567          *
1568          * We continue to retain save_trunc_off even when all truncations
1569          * have been resolved as an optimization to determine if we can
1570          * skip the B-Tree lookup for overwrite deletions.
1571          *
1572          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
1573          * and stays in ip->flags.  Once set, it stays set until the
1574          * inode is destroyed.
1575          *
1576          * NOTE: If a truncation from a previous flush cycle had to be
1577          * continued into this one, the TRUNCATED flag will still be
1578          * set in sync_flags as will WOULDBLOCK.  When this occurs
1579          * we CANNOT safely integrate a new truncation from the front-end
1580          * because there may be data records in-memory assigned a flush
1581          * state from the previous cycle that are supposed to be flushed
1582          * before the next frontend truncation.
1583          */
1584         if ((ip->flags & (HAMMER_INODE_TRUNCATED | HAMMER_INODE_WOULDBLOCK)) ==
1585             HAMMER_INODE_TRUNCATED) {
1586                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
1587                 ip->sync_trunc_off = ip->trunc_off;
1588                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
1589                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
1590                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
1591
1592                 /*
1593                  * The save_trunc_off used to cache whether the B-Tree
1594                  * holds any records past that point is not used until
1595                  * after the truncation has succeeded, so we can safely
1596                  * set it now.
1597                  */
1598                 if (ip->save_trunc_off > ip->sync_trunc_off)
1599                         ip->save_trunc_off = ip->sync_trunc_off;
1600         }
1601         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
1602                            ~HAMMER_INODE_TRUNCATED);
1603         ip->sync_ino_leaf = ip->ino_leaf;
1604         ip->sync_ino_data = ip->ino_data;
1605         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
1606 #ifdef DEBUG_TRUNCATE
1607         if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
1608                 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
1609 #endif
1610
1611         /*
1612          * The flusher list inherits our inode and reference.
1613          */
1614         TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
1615         if (--ip->hmp->flusher.group_lock == 0)
1616                 wakeup(&ip->hmp->flusher.group_lock);
1617
1618         if (flags & HAMMER_FLUSH_SIGNAL) {
1619                 hammer_flusher_async(ip->hmp);
1620         }
1621 }
1622
1623 /*
1624  * Callback for scan of ip->rec_tree.  Try to include each record in our
1625  * flush.  ip->flush_group has been set but the inode has not yet been
1626  * moved into a flushing state.
1627  *
1628  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
1629  * both inodes.
1630  *
1631  * We return 1 for any record placed or found in FST_FLUSH, which prevents
1632  * the caller from shortcutting the flush.
1633  */
1634 static int
1635 hammer_setup_child_callback(hammer_record_t rec, void *data)
1636 {
1637         hammer_inode_t target_ip;
1638         hammer_inode_t ip;
1639         int r;
1640
1641         /*
1642          * Deleted records are ignored.  Note that the flush detects deleted
1643          * front-end records at multiple points to deal with races.  This is
1644          * just the first line of defense.  The only time DELETED_FE cannot
1645          * be set is when HAMMER_RECF_INTERLOCK_BE is set.
1646          *
1647          * Don't get confused between record deletion and, say, directory
1648          * entry deletion.  The deletion of a directory entry that is on
1649          * the media has nothing to do with the record deletion flags.
1650          *
1651          * The flush_group for a record already in a flush state must
1652          * be updated.  This case can only occur if the inode deleting
1653          * too many records had to be moved to the next flush group.
1654          */
1655         if (rec->flags & (HAMMER_RECF_DELETED_FE|HAMMER_RECF_DELETED_BE)) {
1656                 if (rec->flush_state == HAMMER_FST_FLUSH) {
1657                         KKASSERT(rec->ip->flags & HAMMER_INODE_WOULDBLOCK);
1658                         rec->flush_group = rec->ip->flush_group;
1659                         r = 1;
1660                 } else {
1661                         r = 0;
1662                 }
1663                 return(r);
1664         }
1665
1666         /*
1667          * If the record is in an idle state it has no dependancies and
1668          * can be flushed.
1669          */
1670         ip = rec->ip;
1671         r = 0;
1672
1673         switch(rec->flush_state) {
1674         case HAMMER_FST_IDLE:
1675                 /*
1676                  * Record has no setup dependancy, we can flush it.
1677                  */
1678                 KKASSERT(rec->target_ip == NULL);
1679                 rec->flush_state = HAMMER_FST_FLUSH;
1680                 rec->flush_group = ip->flush_group;
1681                 hammer_ref(&rec->lock);
1682                 r = 1;
1683                 break;
1684         case HAMMER_FST_SETUP:
1685                 /*
1686                  * Record has a setup dependancy.  Try to include the
1687                  * target ip in the flush.
1688                  *
1689                  * We have to be careful here, if we do not do the right
1690                  * thing we can lose track of dirty inodes and the system
1691                  * will lockup trying to allocate buffers.
1692                  */
1693                 target_ip = rec->target_ip;
1694                 KKASSERT(target_ip != NULL);
1695                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
1696                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
1697                         /*
1698                          * If the target IP is already flushing in our group
1699                          * we are golden, otherwise make sure the target
1700                          * reflushes.
1701                          */
1702                         if (target_ip->flush_group == ip->flush_group) {
1703                                 rec->flush_state = HAMMER_FST_FLUSH;
1704                                 rec->flush_group = ip->flush_group;
1705                                 hammer_ref(&rec->lock);
1706                                 r = 1;
1707                         } else {
1708                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
1709                         }
1710                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
1711                         /*
1712                          * If the target IP is not flushing we can force
1713                          * it to flush, even if it is unable to write out
1714                          * any of its own records we have at least one in
1715                          * hand that we CAN deal with.
1716                          */
1717                         rec->flush_state = HAMMER_FST_FLUSH;
1718                         rec->flush_group = ip->flush_group;
1719                         hammer_ref(&rec->lock);
1720                         hammer_flush_inode_core(target_ip,
1721                                                 HAMMER_FLUSH_RECURSION);
1722                         r = 1;
1723                 } else {
1724                         /*
1725                          * General or delete-on-disk record.
1726                          *
1727                          * XXX this needs help.  If a delete-on-disk we could
1728                          * disconnect the target.  If the target has its own
1729                          * dependancies they really need to be flushed.
1730                          *
1731                          * XXX
1732                          */
1733                         rec->flush_state = HAMMER_FST_FLUSH;
1734                         rec->flush_group = ip->flush_group;
1735                         hammer_ref(&rec->lock);
1736                         hammer_flush_inode_core(target_ip,
1737                                                 HAMMER_FLUSH_RECURSION);
1738                         r = 1;
1739                 }
1740                 break;
1741         case HAMMER_FST_FLUSH:
1742                 /*
1743                  * If the WOULDBLOCK flag is set records may have been left
1744                  * over from a previous flush attempt and should be moved
1745                  * to the current flush group.  If it is not set then all
1746                  * such records had better have been flushed already or
1747                  * already associated with the current flush group.
1748                  */
1749                 if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1750                         rec->flush_group = ip->flush_group;
1751                 } else {
1752                         KKASSERT(rec->flush_group == ip->flush_group);
1753                 }
1754                 r = 1;
1755                 break;
1756         }
1757         return(r);
1758 }
1759
1760 /*
1761  * This version just moves records already in a flush state to the new
1762  * flush group and that is it.
1763  */
1764 static int
1765 hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
1766 {
1767         hammer_inode_t ip = rec->ip;
1768
1769         switch(rec->flush_state) {
1770         case HAMMER_FST_FLUSH:
1771                 if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1772                         rec->flush_group = ip->flush_group;
1773                 } else {
1774                         KKASSERT(rec->flush_group == ip->flush_group);
1775                 }
1776                 break;
1777         default:
1778                 break;
1779         }
1780         return(0);
1781 }
1782
1783 /*
1784  * Wait for a previously queued flush to complete.  Not only do we need to
1785  * wait for the inode to sync out, we also may have to run the flusher again
1786  * to get it past the UNDO position pertaining to the flush so a crash does
1787  * not 'undo' our flush.
1788  */
1789 void
1790 hammer_wait_inode(hammer_inode_t ip)
1791 {
1792         hammer_mount_t hmp = ip->hmp;
1793         int sync_group;
1794         int waitcount;
1795
1796         sync_group = ip->flush_group;
1797         waitcount = (ip->flags & HAMMER_INODE_REFLUSH) ? 2 : 1;
1798
1799         if (ip->flush_state == HAMMER_FST_SETUP) {
1800                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1801         }
1802         /* XXX can we make this != FST_IDLE ? check SETUP depends */
1803         while (ip->flush_state == HAMMER_FST_FLUSH &&
1804                (ip->flush_group - sync_group) < waitcount) {
1805                 ip->flags |= HAMMER_INODE_FLUSHW;
1806                 tsleep(&ip->flags, 0, "hmrwin", 0);
1807         }
1808         while (hmp->flusher.done - sync_group < waitcount) {
1809                 kprintf("Y");
1810                 hammer_flusher_sync(hmp);
1811         }
1812 }
1813
1814 /*
1815  * Called by the backend code when a flush has been completed.
1816  * The inode has already been removed from the flush list.
1817  *
1818  * A pipelined flush can occur, in which case we must re-enter the
1819  * inode on the list and re-copy its fields.
1820  */
1821 void
1822 hammer_flush_inode_done(hammer_inode_t ip)
1823 {
1824         hammer_mount_t hmp;
1825         int dorel;
1826
1827         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
1828
1829         hmp = ip->hmp;
1830
1831         /*
1832          * Merge left-over flags back into the frontend and fix the state.
1833          * Incomplete truncations are retained by the backend.
1834          */
1835         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
1836         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
1837
1838         /*
1839          * The backend may have adjusted nlinks, so if the adjusted nlinks
1840          * does not match the fronttend set the frontend's RDIRTY flag again.
1841          */
1842         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
1843                 ip->flags |= HAMMER_INODE_DDIRTY;
1844
1845         /*
1846          * Fix up the dirty buffer status.
1847          */
1848         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
1849                 ip->flags |= HAMMER_INODE_BUFS;
1850         }
1851
1852         /*
1853          * Re-set the XDIRTY flag if some of the inode's in-memory records
1854          * could not be flushed.
1855          */
1856         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
1857                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
1858                  (!RB_EMPTY(&ip->rec_tree) &&
1859                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
1860
1861         /*
1862          * Do not lose track of inodes which no longer have vnode
1863          * assocations, otherwise they may never get flushed again.
1864          */
1865         if ((ip->flags & HAMMER_INODE_MODMASK) && ip->vp == NULL)
1866                 ip->flags |= HAMMER_INODE_REFLUSH;
1867
1868         /*
1869          * Clean up the vnode ref
1870          */
1871         if (ip->flags & HAMMER_INODE_VHELD) {
1872                 ip->flags &= ~HAMMER_INODE_VHELD;
1873                 vrele(ip->vp);
1874         }
1875
1876         /*
1877          * Adjust flush_state.  The target state (idle or setup) shouldn't
1878          * be terribly important since we will reflush if we really need
1879          * to do anything.
1880          *
1881          * If the WOULDBLOCK flag is set we must re-flush immediately
1882          * to continue a potentially large deletion.  The flag also causes
1883          * the hammer_setup_child_callback() to move records in the old
1884          * flush group to the new one.
1885          */
1886         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1887                 ip->flush_state = HAMMER_FST_IDLE;
1888                 hammer_flush_inode_core(ip, HAMMER_FLUSH_SIGNAL);
1889                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
1890                 dorel = 1;
1891         } else if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
1892                 ip->flush_state = HAMMER_FST_IDLE;
1893                 dorel = 1;
1894         } else {
1895                 ip->flush_state = HAMMER_FST_SETUP;
1896                 dorel = 0;
1897         }
1898
1899         --hmp->count_iqueued;
1900         --hammer_count_iqueued;
1901
1902         /*
1903          * If the frontend made more changes and requested another flush,
1904          * then try to get it running.
1905          */
1906         if (ip->flags & HAMMER_INODE_REFLUSH) {
1907                 ip->flags &= ~HAMMER_INODE_REFLUSH;
1908                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
1909                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
1910                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1911                 } else {
1912                         hammer_flush_inode(ip, 0);
1913                 }
1914         }
1915
1916         /*
1917          * If the inode is now clean drop the space reservation.
1918          */
1919         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1920             (ip->flags & HAMMER_INODE_RSV_INODES)) {
1921                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
1922                 --hmp->rsv_inodes;
1923         }
1924
1925         /*
1926          * Finally, if the frontend is waiting for a flush to complete,
1927          * wake it up.
1928          */
1929         if (ip->flush_state != HAMMER_FST_FLUSH) {
1930                 if (ip->flags & HAMMER_INODE_FLUSHW) {
1931                         ip->flags &= ~HAMMER_INODE_FLUSHW;
1932                         wakeup(&ip->flags);
1933                 }
1934         }
1935         if (dorel)
1936                 hammer_rel_inode(ip, 0);
1937 }
1938
1939 /*
1940  * Called from hammer_sync_inode() to synchronize in-memory records
1941  * to the media.
1942  */
1943 static int
1944 hammer_sync_record_callback(hammer_record_t record, void *data)
1945 {
1946         hammer_cursor_t cursor = data;
1947         hammer_transaction_t trans = cursor->trans;
1948         hammer_mount_t hmp = trans->hmp;
1949         int error;
1950
1951         /*
1952          * Skip records that do not belong to the current flush.
1953          */
1954         ++hammer_stats_record_iterations;
1955         if (record->flush_state != HAMMER_FST_FLUSH)
1956                 return(0);
1957
1958 #if 1
1959         if (record->flush_group != record->ip->flush_group) {
1960                 kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group ,record->ip->flush_group);
1961                 Debugger("blah2");
1962                 return(0);
1963         }
1964 #endif
1965         KKASSERT(record->flush_group == record->ip->flush_group);
1966
1967         /*
1968          * Interlock the record using the BE flag.  Once BE is set the
1969          * frontend cannot change the state of FE.
1970          *
1971          * NOTE: If FE is set prior to us setting BE we still sync the
1972          * record out, but the flush completion code converts it to
1973          * a delete-on-disk record instead of destroying it.
1974          */
1975         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
1976         record->flags |= HAMMER_RECF_INTERLOCK_BE;
1977
1978         /*
1979          * The backend may have already disposed of the record.
1980          */
1981         if (record->flags & HAMMER_RECF_DELETED_BE) {
1982                 error = 0;
1983                 goto done;
1984         }
1985
1986         /*
1987          * If the whole inode is being deleting all on-disk records will
1988          * be deleted very soon, we can't sync any new records to disk
1989          * because they will be deleted in the same transaction they were
1990          * created in (delete_tid == create_tid), which will assert.
1991          *
1992          * XXX There may be a case with RECORD_ADD with DELETED_FE set
1993          * that we currently panic on.
1994          */
1995         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
1996                 switch(record->type) {
1997                 case HAMMER_MEM_RECORD_DATA:
1998                         /*
1999                          * We don't have to do anything, if the record was
2000                          * committed the space will have been accounted for
2001                          * in the blockmap.
2002                          */
2003                         /* fall through */
2004                 case HAMMER_MEM_RECORD_GENERAL:
2005                         record->flags |= HAMMER_RECF_DELETED_FE;
2006                         record->flags |= HAMMER_RECF_DELETED_BE;
2007                         error = 0;
2008                         goto done;
2009                 case HAMMER_MEM_RECORD_ADD:
2010                         panic("hammer_sync_record_callback: illegal add "
2011                               "during inode deletion record %p", record);
2012                         break; /* NOT REACHED */
2013                 case HAMMER_MEM_RECORD_INODE:
2014                         panic("hammer_sync_record_callback: attempt to "
2015                               "sync inode record %p?", record);
2016                         break; /* NOT REACHED */
2017                 case HAMMER_MEM_RECORD_DEL:
2018                         /*
2019                          * Follow through and issue the on-disk deletion
2020                          */
2021                         break;
2022                 }
2023         }
2024
2025         /*
2026          * If DELETED_FE is set special handling is needed for directory
2027          * entries.  Dependant pieces related to the directory entry may
2028          * have already been synced to disk.  If this occurs we have to
2029          * sync the directory entry and then change the in-memory record
2030          * from an ADD to a DELETE to cover the fact that it's been
2031          * deleted by the frontend.
2032          *
2033          * A directory delete covering record (MEM_RECORD_DEL) can never
2034          * be deleted by the frontend.
2035          *
2036          * Any other record type (aka DATA) can be deleted by the frontend.
2037          * XXX At the moment the flusher must skip it because there may
2038          * be another data record in the flush group for the same block,
2039          * meaning that some frontend data changes can leak into the backend's
2040          * synchronization point.
2041          */
2042         if (record->flags & HAMMER_RECF_DELETED_FE) {
2043                 if (record->type == HAMMER_MEM_RECORD_ADD) {
2044                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
2045                 } else {
2046                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2047                         record->flags |= HAMMER_RECF_DELETED_BE;
2048                         error = 0;
2049                         goto done;
2050                 }
2051         }
2052
2053         /*
2054          * Assign the create_tid for new records.  Deletions already
2055          * have the record's entire key properly set up.
2056          */
2057         if (record->type != HAMMER_MEM_RECORD_DEL)
2058                 record->leaf.base.create_tid = trans->tid;
2059                 record->leaf.create_ts = trans->time32;
2060         for (;;) {
2061                 error = hammer_ip_sync_record_cursor(cursor, record);
2062                 if (error != EDEADLK)
2063                         break;
2064                 hammer_done_cursor(cursor);
2065                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2066                                            record->ip);
2067                 if (error)
2068                         break;
2069         }
2070         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2071
2072         if (error) {
2073                 error = -error;
2074                 if (error != -ENOSPC) {
2075                         kprintf("hammer_sync_record_callback: sync failed rec "
2076                                 "%p, error %d\n", record, error);
2077                         Debugger("sync failed rec");
2078                 }
2079         }
2080 done:
2081         hammer_flush_record_done(record, error);
2082
2083         /*
2084          * Do partial finalization if we have built up too many dirty
2085          * buffers.  Otherwise a buffer cache deadlock can occur when
2086          * doing things like creating tens of thousands of tiny files.
2087          *
2088          * The finalization lock is already being held by virtue of the
2089          * flusher calling us.
2090          */
2091         if (hammer_flusher_meta_limit(hmp))
2092                 hammer_flusher_finalize(trans, 0);
2093
2094         return(error);
2095 }
2096
2097 /*
2098  * XXX error handling
2099  */
2100 int
2101 hammer_sync_inode(hammer_inode_t ip)
2102 {
2103         struct hammer_transaction trans;
2104         struct hammer_cursor cursor;
2105         hammer_node_t tmp_node;
2106         hammer_record_t depend;
2107         hammer_record_t next;
2108         int error, tmp_error;
2109         u_int64_t nlinks;
2110
2111         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2112                 return(0);
2113
2114         hammer_start_transaction_fls(&trans, ip->hmp);
2115         error = hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2116         if (error)
2117                 goto done;
2118
2119         /*
2120          * Any directory records referencing this inode which are not in
2121          * our current flush group must adjust our nlink count for the
2122          * purposes of synchronization to disk.
2123          *
2124          * Records which are in our flush group can be unlinked from our
2125          * inode now, potentially allowing the inode to be physically
2126          * deleted.
2127          *
2128          * This cannot block.
2129          */
2130         nlinks = ip->ino_data.nlinks;
2131         next = TAILQ_FIRST(&ip->target_list);
2132         while ((depend = next) != NULL) {
2133                 next = TAILQ_NEXT(depend, target_entry);
2134                 if (depend->flush_state == HAMMER_FST_FLUSH &&
2135                     depend->flush_group == ip->hmp->flusher.act) {
2136                         /*
2137                          * If this is an ADD that was deleted by the frontend
2138                          * the frontend nlinks count will have already been
2139                          * decremented, but the backend is going to sync its
2140                          * directory entry and must account for it.  The
2141                          * record will be converted to a delete-on-disk when
2142                          * it gets synced.
2143                          *
2144                          * If the ADD was not deleted by the frontend we
2145                          * can remove the dependancy from our target_list.
2146                          */
2147                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
2148                                 ++nlinks;
2149                         } else {
2150                                 TAILQ_REMOVE(&ip->target_list, depend,
2151                                              target_entry);
2152                                 depend->target_ip = NULL;
2153                         }
2154                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2155                         /*
2156                          * Not part of our flush group
2157                          */
2158                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2159                         switch(depend->type) {
2160                         case HAMMER_MEM_RECORD_ADD:
2161                                 --nlinks;
2162                                 break;
2163                         case HAMMER_MEM_RECORD_DEL:
2164                                 ++nlinks;
2165                                 break;
2166                         default:
2167                                 break;
2168                         }
2169                 }
2170         }
2171
2172         /*
2173          * Set dirty if we had to modify the link count.
2174          */
2175         if (ip->sync_ino_data.nlinks != nlinks) {
2176                 KKASSERT((int64_t)nlinks >= 0);
2177                 ip->sync_ino_data.nlinks = nlinks;
2178                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2179         }
2180
2181         /*
2182          * If there is a trunction queued destroy any data past the (aligned)
2183          * truncation point.  Userland will have dealt with the buffer
2184          * containing the truncation point for us.
2185          *
2186          * We don't flush pending frontend data buffers until after we've
2187          * dealt with the truncation.
2188          */
2189         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2190                 /*
2191                  * Interlock trunc_off.  The VOP front-end may continue to
2192                  * make adjustments to it while we are blocked.
2193                  */
2194                 off_t trunc_off;
2195                 off_t aligned_trunc_off;
2196                 int blkmask;
2197
2198                 trunc_off = ip->sync_trunc_off;
2199                 blkmask = hammer_blocksize(trunc_off) - 1;
2200                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
2201
2202                 /*
2203                  * Delete any whole blocks on-media.  The front-end has
2204                  * already cleaned out any partial block and made it
2205                  * pending.  The front-end may have updated trunc_off
2206                  * while we were blocked so we only use sync_trunc_off.
2207                  *
2208                  * This operation can blow out the buffer cache, EWOULDBLOCK
2209                  * means we were unable to complete the deletion.  The
2210                  * deletion will update sync_trunc_off in that case.
2211                  */
2212                 error = hammer_ip_delete_range(&cursor, ip,
2213                                                 aligned_trunc_off,
2214                                                 0x7FFFFFFFFFFFFFFFLL, 2);
2215                 if (error == EWOULDBLOCK) {
2216                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
2217                         error = 0;
2218                         goto defer_buffer_flush;
2219                 }
2220
2221                 if (error)
2222                         Debugger("hammer_ip_delete_range errored");
2223
2224                 /*
2225                  * Clear the truncation flag on the backend after we have
2226                  * complete the deletions.  Backend data is now good again
2227                  * (including new records we are about to sync, below).
2228                  *
2229                  * Leave sync_trunc_off intact.  As we write additional
2230                  * records the backend will update sync_trunc_off.  This
2231                  * tells the backend whether it can skip the overwrite
2232                  * test.  This should work properly even when the backend
2233                  * writes full blocks where the truncation point straddles
2234                  * the block because the comparison is against the base
2235                  * offset of the record.
2236                  */
2237                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2238                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
2239         } else {
2240                 error = 0;
2241         }
2242
2243         /*
2244          * Now sync related records.  These will typically be directory
2245          * entries, records tracking direct-writes, or delete-on-disk records.
2246          */
2247         if (error == 0) {
2248                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2249                                     hammer_sync_record_callback, &cursor);
2250                 if (tmp_error < 0)
2251                         tmp_error = -error;
2252                 if (tmp_error)
2253                         error = tmp_error;
2254         }
2255         hammer_cache_node(&ip->cache[1], cursor.node);
2256
2257         /*
2258          * Re-seek for inode update, assuming our cache hasn't been ripped
2259          * out from under us.
2260          */
2261         if (error == 0) {
2262                 tmp_node = hammer_ref_node_safe(ip->hmp, &ip->cache[0], &error);
2263                 if (tmp_node) {
2264                         hammer_cursor_downgrade(&cursor);
2265                         hammer_lock_sh(&tmp_node->lock);
2266                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
2267                                 hammer_cursor_seek(&cursor, tmp_node, 0);
2268                         hammer_unlock(&tmp_node->lock);
2269                         hammer_rel_node(tmp_node);
2270                 }
2271                 error = 0;
2272         }
2273
2274         /*
2275          * If we are deleting the inode the frontend had better not have
2276          * any active references on elements making up the inode.
2277          *
2278          * The call to hammer_ip_delete_clean() cleans up auxiliary records
2279          * but not DB or DATA records.  Those must have already been deleted
2280          * by the normal truncation mechanic.
2281          */
2282         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
2283                 RB_EMPTY(&ip->rec_tree)  &&
2284             (ip->sync_flags & HAMMER_INODE_DELETING) &&
2285             (ip->flags & HAMMER_INODE_DELETED) == 0) {
2286                 int count1 = 0;
2287
2288                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
2289                 if (error == 0) {
2290                         ip->flags |= HAMMER_INODE_DELETED;
2291                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
2292                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2293                         KKASSERT(RB_EMPTY(&ip->rec_tree));
2294
2295                         /*
2296                          * Set delete_tid in both the frontend and backend
2297                          * copy of the inode record.  The DELETED flag handles
2298                          * this, do not set RDIRTY.
2299                          */
2300                         ip->ino_leaf.base.delete_tid = trans.tid;
2301                         ip->sync_ino_leaf.base.delete_tid = trans.tid;
2302                         ip->ino_leaf.delete_ts = trans.time32;
2303                         ip->sync_ino_leaf.delete_ts = trans.time32;
2304
2305
2306                         /*
2307                          * Adjust the inode count in the volume header
2308                          */
2309                         hammer_sync_lock_sh(&trans);
2310                         if (ip->flags & HAMMER_INODE_ONDISK) {
2311                                 hammer_modify_volume_field(&trans,
2312                                                            trans.rootvol,
2313                                                            vol0_stat_inodes);
2314                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
2315                                 hammer_modify_volume_done(trans.rootvol);
2316                         }
2317                         hammer_sync_unlock(&trans);
2318                 } else {
2319                         Debugger("hammer_ip_delete_clean errored");
2320                 }
2321         }
2322
2323         ip->sync_flags &= ~HAMMER_INODE_BUFS;
2324
2325         if (error)
2326                 Debugger("RB_SCAN errored");
2327
2328 defer_buffer_flush:
2329         /*
2330          * Now update the inode's on-disk inode-data and/or on-disk record.
2331          * DELETED and ONDISK are managed only in ip->flags.
2332          *
2333          * In the case of a deferred buffer flush we still update the on-disk
2334          * inode to satisfy visibility requirements if there happen to be
2335          * directory dependencies.
2336          */
2337         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
2338         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
2339                 /*
2340                  * If deleted and on-disk, don't set any additional flags.
2341                  * the delete flag takes care of things.
2342                  *
2343                  * Clear flags which may have been set by the frontend.
2344                  */
2345                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2346                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2347                                     HAMMER_INODE_DELETING);
2348                 break;
2349         case HAMMER_INODE_DELETED:
2350                 /*
2351                  * Take care of the case where a deleted inode was never
2352                  * flushed to the disk in the first place.
2353                  *
2354                  * Clear flags which may have been set by the frontend.
2355                  */
2356                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2357                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2358                                     HAMMER_INODE_DELETING);
2359                 while (RB_ROOT(&ip->rec_tree)) {
2360                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
2361                         hammer_ref(&record->lock);
2362                         KKASSERT(record->lock.refs == 1);
2363                         record->flags |= HAMMER_RECF_DELETED_FE;
2364                         record->flags |= HAMMER_RECF_DELETED_BE;
2365                         hammer_rel_mem_record(record);
2366                 }
2367                 break;
2368         case HAMMER_INODE_ONDISK:
2369                 /*
2370                  * If already on-disk, do not set any additional flags.
2371                  */
2372                 break;
2373         default:
2374                 /*
2375                  * If not on-disk and not deleted, set DDIRTY to force
2376                  * an initial record to be written.
2377                  *
2378                  * Also set the create_tid in both the frontend and backend
2379                  * copy of the inode record.
2380                  */
2381                 ip->ino_leaf.base.create_tid = trans.tid;
2382                 ip->ino_leaf.create_ts = trans.time32;
2383                 ip->sync_ino_leaf.base.create_tid = trans.tid;
2384                 ip->sync_ino_leaf.create_ts = trans.time32;
2385                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2386                 break;
2387         }
2388
2389         /*
2390          * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
2391          * is already on-disk the old record is marked as deleted.
2392          *
2393          * If DELETED is set hammer_update_inode() will delete the existing
2394          * record without writing out a new one.
2395          *
2396          * If *ONLY* the ITIMES flag is set we can update the record in-place.
2397          */
2398         if (ip->flags & HAMMER_INODE_DELETED) {
2399                 error = hammer_update_inode(&cursor, ip);
2400         } else
2401         if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
2402             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
2403                 error = hammer_update_itimes(&cursor, ip);
2404         } else
2405         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
2406                 error = hammer_update_inode(&cursor, ip);
2407         }
2408         if (error)
2409                 Debugger("hammer_update_itimes/inode errored");
2410 done:
2411         /*
2412          * Save the TID we used to sync the inode with to make sure we
2413          * do not improperly reuse it.
2414          */
2415         hammer_done_cursor(&cursor);
2416         hammer_done_transaction(&trans);
2417         return(error);
2418 }
2419
2420 /*
2421  * This routine is called when the OS is no longer actively referencing
2422  * the inode (but might still be keeping it cached), or when releasing
2423  * the last reference to an inode.
2424  *
2425  * At this point if the inode's nlinks count is zero we want to destroy
2426  * it, which may mean destroying it on-media too.
2427  */
2428 void
2429 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
2430 {
2431         struct vnode *vp;
2432
2433         /*
2434          * Set the DELETING flag when the link count drops to 0 and the
2435          * OS no longer has any opens on the inode.
2436          *
2437          * The backend will clear DELETING (a mod flag) and set DELETED
2438          * (a state flag) when it is actually able to perform the
2439          * operation.
2440          */
2441         if (ip->ino_data.nlinks == 0 &&
2442             (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
2443                 ip->flags |= HAMMER_INODE_DELETING;
2444                 ip->flags |= HAMMER_INODE_TRUNCATED;
2445                 ip->trunc_off = 0;
2446                 vp = NULL;
2447                 if (getvp) {
2448                         if (hammer_get_vnode(ip, &vp) != 0)
2449                                 return;
2450                 }
2451
2452                 /*
2453                  * Final cleanup
2454                  */
2455                 if (ip->vp) {
2456                         vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
2457                         vnode_pager_setsize(ip->vp, 0);
2458                 }
2459                 if (getvp) {
2460                         vput(vp);
2461                 }
2462         }
2463 }
2464
2465 /*
2466  * Re-test an inode when a dependency has gone away to see if we
2467  * can chain flush it.
2468  */
2469 void
2470 hammer_test_inode(hammer_inode_t ip)
2471 {
2472         if (ip->flags & HAMMER_INODE_REFLUSH) {
2473                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2474                 hammer_ref(&ip->lock);
2475                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2476                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
2477                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2478                 } else {
2479                         hammer_flush_inode(ip, 0);
2480                 }
2481                 hammer_rel_inode(ip, 0);
2482         }
2483 }
2484
2485 /*
2486  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
2487  * reassociated with a vp or just before it gets freed.
2488  *
2489  * Wake up one thread blocked waiting on reclaims to complete.  Note that
2490  * the inode the thread is waiting on behalf of is a different inode than
2491  * the inode we are called with.  This is to create a pipeline.
2492  */
2493 static void
2494 hammer_inode_wakereclaims(hammer_inode_t ip)
2495 {
2496         struct hammer_reclaim *reclaim;
2497         hammer_mount_t hmp = ip->hmp;
2498
2499         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
2500                 return;
2501
2502         --hammer_count_reclaiming;
2503         --hmp->inode_reclaims;
2504         ip->flags &= ~HAMMER_INODE_RECLAIM;
2505
2506         if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
2507                 TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
2508                 reclaim->okydoky = 1;
2509                 wakeup(reclaim);
2510         }
2511 }
2512
2513 /*
2514  * Setup our reclaim pipeline.  We only let so many detached (and dirty)
2515  * inodes build up before we start blocking.
2516  *
2517  * When we block we don't care *which* inode has finished reclaiming,
2518  * as long as one does.  This is somewhat heuristic... we also put a
2519  * cap on how long we are willing to wait.
2520  */
2521 void
2522 hammer_inode_waitreclaims(hammer_mount_t hmp)
2523 {
2524         struct hammer_reclaim reclaim;
2525         int delay;
2526
2527         if (hmp->inode_reclaims > HAMMER_RECLAIM_WAIT) {
2528                 reclaim.okydoky = 0;
2529                 TAILQ_INSERT_TAIL(&hmp->reclaim_list,
2530                                   &reclaim, entry);
2531         } else {
2532                 reclaim.okydoky = 1;
2533         }
2534
2535         if (reclaim.okydoky == 0) {
2536                 delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
2537                         HAMMER_RECLAIM_WAIT;
2538                 if (delay >= 0)
2539                         tsleep(&reclaim, 0, "hmrrcm", delay + 1);
2540                 if (reclaim.okydoky == 0)
2541                         TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
2542         }
2543 }
2544