kernel/fs/ufs/ufs_subr.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38
  39 #include <sys/types.h>
  40 #include <sys/t_lock.h>
  41 #include <sys/param.h>
  42 #include <sys/time.h>
  43 #include <sys/fs/ufs_fs.h>
  44 #include <sys/cmn_err.h>
  45
  46 #ifdef _KERNEL
  47
  48 #include <sys/systm.h>
  49 #include <sys/sysmacros.h>
  50 #include <sys/buf.h>
  51 #include <sys/conf.h>
  52 #include <sys/user.h>
  53 #include <sys/var.h>
  54 #include <sys/vfs.h>
  55 #include <sys/vnode.h>
  56 #include <sys/proc.h>
  57 #include <sys/debug.h>
  58 #include <sys/fssnap_if.h>
  59 #include <sys/fs/ufs_inode.h>
  60 #include <sys/fs/ufs_trans.h>
  61 #include <sys/fs/ufs_panic.h>
  62 #include <sys/fs/ufs_bio.h>
  63 #include <sys/fs/ufs_log.h>
  64 #include <sys/kmem.h>
  65 #include <sys/policy.h>
  66 #include <vm/hat.h>
  67 #include <vm/as.h>
  68 #include <vm/seg.h>
  69 #include <vm/pvn.h>
  70 #include <vm/seg_map.h>
  71 #include <sys/swap.h>
  72 #include <vm/seg_kmem.h>
  73
  74 #else  /* _KERNEL */
  75
  76 #define ASSERT(x)               /* don't use asserts for fsck et al */
  77
  78 #endif  /* _KERNEL */
  79
  80 #ifdef _KERNEL
  81
  82 /*
  83  * Used to verify that a given entry on the ufs_instances list (see below)
  84  * still refers to a mounted file system.
  85  *
  86  * XXX: This is a crock that substitutes for proper locking to coordinate
  87  *      updates to and uses of the entries in ufs_instances.
  88  */
  89 struct check_node {
  90         struct vfs *vfsp;		/* vfs recorded when the entry was built */
  91         struct ufsvfs *ufsvfs;	/* ufsvfs recorded alongside vfsp */
  92         dev_t vfs_dev;		/* vfsp->vfs_dev at the time of recording */
  93 };
  94
  95 static vfs_t *still_mounted(struct check_node *);
  96
  97 /*
  98  * All ufs file system instances are linked together into a list starting at
  99  * ufs_instances.  The list is updated as part of mount and unmount.  It's
 100  * consulted in ufs_update, to allow syncing out all ufs file system instances
 101  * in a batch.
 102  *
 103  * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 104  * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 105  * exactly one of these lists except while it's being allocated or
 106  * deallocated.)
 107  */
 108 struct ufsvfs   *ufs_instances;	/* list head; entries chained via vfs_next */
 109 extern kmutex_t         ufsvfs_mutex;   /* XXX: move this to ufs_inode.h? */
 110
 111 /*
 112  * ufsvfs list manipulation routines
 113  */
 114
 115 /*
 116  * Link ufsp in at the head of the list of ufs_instances.
 117  */
 118 void
 119 ufs_vfs_add(struct ufsvfs *ufsp)
 120 {
 121         mutex_enter(&ufsvfs_mutex);
 122         ufsp->vfs_next = ufs_instances;
 123         ufs_instances = ufsp;
 124         mutex_exit(&ufsvfs_mutex);
 125 }
 126
 127 /*
 128  * Remove ufsp from the list of ufs_instances.
 129  *
 130  * Does no error checking; ufsp is assumed to actually be on the list.
 131  */
 132 void
 133 ufs_vfs_remove(struct ufsvfs *ufsp)
 134 {
 135         struct ufsvfs   **delpt = &ufs_instances;
 136
 137         mutex_enter(&ufsvfs_mutex);
 138         for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
 139                 if (*delpt == ufsp) {
 140                         *delpt = ufsp->vfs_next;
 141                         ufsp->vfs_next = NULL;
 142                         break;
 143                 }
 144         }
 145         mutex_exit(&ufsvfs_mutex);
 146 }
 147
 148 /*
 149  * Clean up state resulting from a forcible unmount that couldn't be handled
 150  * directly during the unmount.  (See commentary in the unmount code for more
 151  * info.)
 152  */
 153 static void
 154 ufs_funmount_cleanup()
 155 {
 156         struct ufsvfs           *ufsvfsp;
 157         extern struct ufsvfs    *oldufsvfslist, *ufsvfslist;
 158
 159         /*
 160          * Assumption: it's now safe to blow away the entries on
 161          * oldufsvfslist.
 162          */
 163         mutex_enter(&ufsvfs_mutex);
 164         while ((ufsvfsp = oldufsvfslist) != NULL) {
 165                 oldufsvfslist = ufsvfsp->vfs_next;
 166
 167                 mutex_destroy(&ufsvfsp->vfs_lock);
 168                 kmem_free(ufsvfsp, sizeof (struct ufsvfs));
 169         }
 170         /*
 171          * Rotate more recent unmount entries into place in preparation for
 172          * the next time around.
 173          */
 174         oldufsvfslist = ufsvfslist;
 175         ufsvfslist = NULL;
 176         mutex_exit(&ufsvfs_mutex);
 177 }
 178
 179
 180 /*
 181  * ufs_update performs the ufs part of `sync'.  It goes through the disk
 182  * queues to initiate sandbagged IO; goes through the inodes to write
 183  * modified nodes; and it goes through the mount table to initiate
 184  * the writing of the modified super blocks.
 185  */
 186 extern time_t   time;
 187 time_t          ufs_sync_time;	/* set from `time' by ufs_update() */
 188 time_t          ufs_sync_time_secs = 1;	/* added to ufs_sync_time to bound atime-only pushes */
 189
 190 extern kmutex_t ufs_scan_lock;	/* held by ufs_update() around ufs_scan_inodes() */
 191
 192 void
 193 ufs_update(int flag)
 194 {
 195         struct vfs *vfsp;
 196         struct fs *fs;
 197         struct ufsvfs *ufsp;
 198         struct ufsvfs *ufsnext;
 199         struct ufsvfs *update_list = NULL;
 200         int check_cnt = 0;
 201         size_t check_size;
 202         struct check_node *check_list, *ptr;
 203         int cheap = flag & SYNC_ATTR;
 204
 205         /*
 206          * This is a hack.  A design flaw in the forced unmount protocol
 207          * could allow a thread to attempt to use a kmem_freed ufsvfs
 208          * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
 209          * is difficult to hit, even during the lockfs stress tests.
 210          * So the hacky fix is to wait awhile before kmem_free'ing the
 211          * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
 212          * is defined as every other call from fsflush (~60 seconds).
 213          */
 214         if (cheap)
 215                 ufs_funmount_cleanup();
 216
 217         /*
 218          * Examine all ufsvfs structures and add those that we can lock to the
 219          * update list.  This is so that we don't hold the list lock for a
 220          * long time.  If vfs_lock fails for a file system instance, then skip
 221          * it because somebody is doing a unmount on it.
 222          */
 223         mutex_enter(&ufsvfs_mutex);
 224         for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
 225                 vfsp = ufsp->vfs_vfs;
 226                 if (vfs_lock(vfsp) != 0)
 227                         continue;
 228                 ufsp->vfs_wnext = update_list;
 229                 update_list = ufsp;
 230                 check_cnt++;
 231         }
 232         mutex_exit(&ufsvfs_mutex);
 233
 234         if (update_list == NULL)
 235                 return;
 236
 237         check_size = sizeof (struct check_node) * check_cnt;
 238         check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);
 239
 240         /*
 241          * Write back modified superblocks.
 242          * Consistency check that the superblock of
 243          * each file system is still in the buffer cache.
 244          *
 245          * Note that the update_list traversal is done without the protection
 246          * of an overall list lock, so it's necessary to rely on the fact that
 247          * each entry of the list is vfs_locked when moving from one entry to
 248          * the next.  This works because a concurrent attempt to add an entry
 249          * to another thread's update_list won't find it, since it'll already
 250          * be locked.
 251          */
 252         check_cnt = 0;
 253         for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
 254                 /*
 255                  * Need to grab the next ptr before we unlock this one so
 256                  * another thread doesn't grab it and change it before we move
 257                  * on to the next vfs.  (Once we unlock it, it's ok if another
 258                  * thread finds it to add it to its own update_list; we don't
 259                  * attempt to refer to it through our list any more.)
 260                  */
 261                 ufsnext = ufsp->vfs_wnext;
 262                 vfsp = ufsp->vfs_vfs;
 263
 264                 /*
 265                  * Seems like this can't happen, so perhaps it should become
 266                  * an ASSERT(vfsp->vfs_data != NULL).
 267                  */
 268                 if (!vfsp->vfs_data) {
 269                         vfs_unlock(vfsp);
 270                         continue;
 271                 }
 272
 273                 fs = ufsp->vfs_fs;
 274
 275                 /*
 276                  * don't update a locked superblock during a panic; it
 277                  * may be in an inconsistent state
 278                  */
 279                 if (panicstr) {
 280                         if (!mutex_tryenter(&ufsp->vfs_lock)) {
 281                                 vfs_unlock(vfsp);
 282                                 continue;
 283                         }
 284                 } else
 285                         mutex_enter(&ufsp->vfs_lock);
 286                 /*
 287                  * Build up the STABLE check list, so we can unlock the vfs
 288                  * until we do the actual checking.
 289                  */
 290                 if (check_list != NULL) {
 291                         if ((fs->fs_ronly == 0) &&
 292                             (fs->fs_clean != FSBAD) &&
 293                             (fs->fs_clean != FSSUSPEND)) {
 294                                 ptr->vfsp = vfsp;
 295                                 ptr->ufsvfs = ufsp;
 296                                 ptr->vfs_dev = vfsp->vfs_dev;
 297                                 ptr++;
 298                                 check_cnt++;
 299                         }
 300                 }
 301
 302                 /*
 303                  * superblock is not modified
 304                  */
 305                 if (fs->fs_fmod == 0) {
 306                         mutex_exit(&ufsp->vfs_lock);
 307                         vfs_unlock(vfsp);
 308                         continue;
 309                 }
 310                 if (fs->fs_ronly != 0) {
 311                         mutex_exit(&ufsp->vfs_lock);
 312                         vfs_unlock(vfsp);
 313                         (void) ufs_fault(ufsp->vfs_root,
 314                             "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
 315                         /*
 316                          * XXX: Why is this a return instead of a continue?
 317                          *      This may be an attempt to replace a panic with
 318                          *      something less drastic, but there's cleanup we
 319                          *      should be doing that's not being done (e.g.,
 320                          *      unlocking the remaining entries on the list).
 321                          */
 322                         return;
 323                 }
 324                 fs->fs_fmod = 0;
 325                 mutex_exit(&ufsp->vfs_lock);
 326                 TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
 327                 vfs_unlock(vfsp);
 328         }
 329
 330         ufs_sync_time = time;
 331
 332         /*
 333          * Avoid racing with ufs_unmount() and ufs_sync().
 334          */
 335         mutex_enter(&ufs_scan_lock);
 336
 337         (void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
 338             NULL);
 339
 340         mutex_exit(&ufs_scan_lock);
 341
 342         /*
 343          * Force stale buffer cache information to be flushed,
 344          * for all devices.  This should cause any remaining control
 345          * information (e.g., cg and inode info) to be flushed back.
 346          */
 347         bflush((dev_t)NODEV);
 348
 349         if (check_list == NULL)
 350                 return;
 351
 352         /*
 353          * For each UFS filesystem in the STABLE check_list, update
 354          * the clean flag if warranted.
 355          */
 356         for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
 357                 int     error;
 358
 359                 /*
 360                  * still_mounted() returns with vfsp and the vfs_reflock
 361                  * held if ptr refers to a vfs that is still mounted.
 362                  */
 363                 if ((vfsp = still_mounted(ptr)) == NULL)
 364                         continue;
 365                 ufs_checkclean(vfsp);
 366                 /*
 367                  * commit any outstanding async transactions
 368                  */
 369                 ufsp = (struct ufsvfs *)vfsp->vfs_data;
 370                 curthread->t_flag |= T_DONTBLOCK;
 371                 TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
 372                                  &error);
 373                 if (!error) {
 374                         TRANS_END_SYNC(ufsp, &error, TOP_COMMIT_UPDATE,
 375                                        TOP_COMMIT_SIZE);
 376                 }
 377                 curthread->t_flag &= ~T_DONTBLOCK;
 378
 379                 vfs_unlock(vfsp);
 380         }
 381
 382         kmem_free(check_list, check_size);
 383 }
 384
 385 int
 386 ufs_sync_inode(struct inode *ip, void *arg)
 387 {
 388         int cheap = (int)(uintptr_t)arg;
 389         struct ufsvfs *ufsvfsp;
 390         uint_t flag = ip->i_flag;
 391
 392         if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
 393                 return (0);
 394
 395         /*
 396          * if we are panic'ing; then don't update the inode if this
 397          * file system is FSSTABLE.  Otherwise, we would have to
 398          * force the superblock to FSACTIVE and the superblock
 399          * may not be in a good state.  Also, if the inode is
 400          * IREF'ed then it may be in an inconsistent state.  Don't
 401          * push it.  Finally, don't push the inode if the fs is
 402          * logging; the transaction will be discarded at boot.
 403          */
 404         if (panicstr) {
 405
 406                 if (flag & IREF)
 407                         return (0);
 408
 409                 if (ip->i_ufsvfs == NULL ||
 410                     (ip->i_fs->fs_clean == FSSTABLE ||
 411                     ip->i_fs->fs_clean == FSLOG))
 412                                 return (0);
 413         }
 414
 415         ufsvfsp = ip->i_ufsvfs;
 416
 417         /*
 418          * Limit access time only updates
 419          */
 420         if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
 421                 /*
 422                  * if file system has deferred access time turned on and there
 423                  * was no IO recently, don't bother flushing it. It will be
 424                  * flushed when I/Os start again.
 425                  */
 426                 if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
 427                     (ufsvfsp->vfs_iotstamp + ufs_iowait < ddi_get_lbolt()))
 428                         return (0);
 429                 /*
 430                  * an app issueing a sync() can take forever on a trans device
 431                  * when NetWorker or find is running because all of the
 432                  * directorys' access times have to be updated. So, we limit
 433                  * the time we spend updating access times per sync.
 434                  */
 435                 if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
 436                     ufs_sync_time_secs) < time))
 437                         return (0);
 438         }
 439
 440         /*
 441          * if we are running on behalf of the flush thread or this is
 442          * a swap file, then simply do a delay update of the inode.
 443          * Otherwise, push the pages and then do a delayed inode update.
 444          */
 445         if (cheap || IS_SWAPVP(ITOV(ip))) {
 446                 TRANS_IUPDAT(ip, 0);
 447         } else {
 448                 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
 449         }
 450         return (0);
 451 }
 452
 453 /*
 454  * Flush all the pages associated with an inode using the given 'flags',
 455  * then force inode information to be written back using the given 'waitfor'.
 456  */
 457 int
 458 ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
 459 {
 460         int     error;
 461         struct vnode *vp = ITOV(ip);
 462         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 463         int dotrans = 0;
 464
 465         /*
 466          * Return if file system has been forcibly umounted.
 467          */
 468         if (ufsvfsp == NULL)
 469                 return (EIO);
 470         /*
 471          * don't need to fop_putpage if there are no pages
 472          */
 473         if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
 474                 error = 0;
 475         } else {
 476                 /*
 477                  * if the inode we're working on is a shadow inode
 478                  * or quota inode we need to make sure that the
 479                  * ufs_putpage call is inside a transaction as this
 480                  * could include meta data changes.
 481                  */
 482                 if ((ip->i_mode & IFMT) == IFSHAD ||
 483                     ufsvfsp->vfs_qinod == ip) {
 484                         dotrans = 1;
 485                         curthread->t_flag |= T_DONTBLOCK;
 486                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
 487                             TOP_PUTPAGE_SIZE(ip));
 488                 }
 489                 error = fop_putpage(vp, 0, (size_t)0,
 490                     flags, CRED(), NULL);
 491                 if (dotrans) {
 492                         TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
 493                             TOP_PUTPAGE_SIZE(ip));
 494                         curthread->t_flag &= ~T_DONTBLOCK;
 495                         dotrans = 0;
 496                 }
 497         }
 498         if (panicstr && TRANS_ISTRANS(ufsvfsp))
 499                 goto out;
 500         /*
 501          * waitfor represents two things -
 502          * 1. whether data sync or file sync.
 503          * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
 504          */
 505         if (waitfor == I_DSYNC) {
 506                 /*
 507                  * If data sync, only IATTCHG (size/block change) requires
 508                  * inode update, fdatasync()/FDSYNC implementation.
 509                  */
 510                 if (ip->i_flag & (IBDWRITE|IATTCHG)) {
 511                         /*
 512                          * Enter a transaction to provide mutual exclusion
 513                          * with deltamap_push and avoid a race where
 514                          * the inode flush could get dropped.
 515                          */
 516                         if ((curthread->t_flag & T_DONTBLOCK) == 0) {
 517                                 dotrans = 1;
 518                                 curthread->t_flag |= T_DONTBLOCK;
 519                                 TRANS_BEGIN_ASYNC(ufsvfsp, topid,
 520                                     TOP_SYNCIP_SIZE);
 521                         }
 522                         rw_enter(&ip->i_contents, RW_READER);
 523                         mutex_enter(&ip->i_tlock);
 524                         ip->i_flag &= ~IMODTIME;
 525                         mutex_exit(&ip->i_tlock);
 526                         ufs_iupdat(ip, 1);
 527                         rw_exit(&ip->i_contents);
 528                         if (dotrans) {
 529                                 TRANS_END_ASYNC(ufsvfsp, topid,
 530                                     TOP_SYNCIP_SIZE);
 531                                 curthread->t_flag &= ~T_DONTBLOCK;
 532                         }
 533                 }
 534         } else {
 535                 /* For file sync, any inode change requires inode update */
 536                 if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
 537                         /*
 538                          * Enter a transaction to provide mutual exclusion
 539                          * with deltamap_push and avoid a race where
 540                          * the inode flush could get dropped.
 541                          */
 542                         if ((curthread->t_flag & T_DONTBLOCK) == 0) {
 543                                 dotrans = 1;
 544                                 curthread->t_flag |= T_DONTBLOCK;
 545                                 TRANS_BEGIN_ASYNC(ufsvfsp, topid,
 546                                     TOP_SYNCIP_SIZE);
 547                         }
 548                         rw_enter(&ip->i_contents, RW_READER);
 549                         mutex_enter(&ip->i_tlock);
 550                         ip->i_flag &= ~IMODTIME;
 551                         mutex_exit(&ip->i_tlock);
 552                         ufs_iupdat(ip, waitfor);
 553                         rw_exit(&ip->i_contents);
 554                         if (dotrans) {
 555                                 TRANS_END_ASYNC(ufsvfsp, topid,
 556                                     TOP_SYNCIP_SIZE);
 557                                 curthread->t_flag &= ~T_DONTBLOCK;
 558                         }
 559                 }
 560         }
 561
 562 out:
 563         return (error);
 564 }
 565 /*
 566  * Flush all indirect blocks related to an inode.
 567  * Supports triple indirect blocks also.
 568  */
 569 int
 570 ufs_sync_indir(struct inode *ip)
 571 {
 572         int i;
 573         daddr_t blkno;
 574         daddr_t lbn;    /* logical blkno of last blk in file */
 575         daddr_t clbn;   /* current logical blk */
 576         daddr32_t *bap;
 577         struct fs *fs;
 578         struct buf *bp;
 579         int bsize;
 580         struct ufsvfs *ufsvfsp;
 581         int j;
 582         daddr_t indirect_blkno;
 583         daddr32_t *indirect_bap;
 584         struct buf *indirect_bp;
 585
 586         ufsvfsp = ip->i_ufsvfs;
 587         /*
 588          * unnecessary when logging; allocation blocks are kept up-to-date
 589          */
 590         if (TRANS_ISTRANS(ufsvfsp))
 591                 return (0);
 592
 593         fs = ufsvfsp->vfs_fs;
 594         bsize = fs->fs_bsize;
 595         lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
 596         if (lbn < NDADDR)
 597                 return (0);     /* No indirect blocks used */
 598         if (lbn < NDADDR + NINDIR(fs)) {
 599                 /* File has one indirect block. */
 600                 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
 601                 return (0);
 602         }
 603
 604         /* Write out all the first level indirect blocks */
 605         for (i = 0; i < NIADDR; i++) {
 606                 if ((blkno = ip->i_ib[i]) == 0)
 607                         continue;
 608                 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
 609         }
 610         /* Write out second level of indirect blocks */
 611         if ((blkno = ip->i_ib[1]) == 0)
 612                 return (0);
 613         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
 614         if (bp->b_flags & B_ERROR) {
 615                 brelse(bp);
 616                 return (EIO);
 617         }
 618         bap = bp->b_un.b_daddr;
 619         clbn = NDADDR + NINDIR(fs);
 620         for (i = 0; i < NINDIR(fs); i++) {
 621                 if (clbn > lbn)
 622                         break;
 623                 clbn += NINDIR(fs);
 624                 if ((blkno = bap[i]) == 0)
 625                         continue;
 626                 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
 627         }
 628
 629         brelse(bp);
 630         /* write out third level indirect blocks */
 631
 632         if ((blkno = ip->i_ib[2]) == 0)
 633                 return (0);
 634
 635         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
 636         if (bp->b_flags & B_ERROR) {
 637                 brelse(bp);
 638                 return (EIO);
 639         }
 640         bap = bp->b_un.b_daddr;
 641         clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));
 642
 643         for (i = 0; i < NINDIR(fs); i++) {
 644                 if (clbn > lbn)
 645                         break;
 646                 if ((indirect_blkno = bap[i]) == 0)
 647                         continue;
 648                 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
 649                 indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
 650                     (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
 651                 if (indirect_bp->b_flags & B_ERROR) {
 652                         brelse(indirect_bp);
 653                         brelse(bp);
 654                         return (EIO);
 655                 }
 656                 indirect_bap = indirect_bp->b_un.b_daddr;
 657                 for (j = 0; j < NINDIR(fs); j++) {
 658                         if (clbn > lbn)
 659                                 break;
 660                         clbn += NINDIR(fs);
 661                         if ((blkno = indirect_bap[j]) == 0)
 662                                 continue;
 663                         blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
 664                 }
 665                 brelse(indirect_bp);
 666         }
 667         brelse(bp);
 668
 669         return (0);
 670 }
 671
 672 /*
 673  * Flush all indirect blocks related to an offset of a file.
 674  * read/write in sync mode may have to flush indirect blocks.
 675  */
 676 int
 677 ufs_indirblk_sync(struct inode *ip, offset_t off)
 678 {
 679         daddr_t lbn;
 680         struct  fs *fs;
 681         struct  buf *bp;
 682         int     i, j, shft;
 683         daddr_t ob, nb, tbn;
 684         daddr32_t *bap;
 685         int     nindirshift, nindiroffset;
 686         struct ufsvfs *ufsvfsp;
 687
 688         ufsvfsp = ip->i_ufsvfs;
 689         /*
 690          * unnecessary when logging; allocation blocks are kept up-to-date
 691          */
 692         if (TRANS_ISTRANS(ufsvfsp))
 693                 return (0);
 694
 695         fs = ufsvfsp->vfs_fs;
 696
 697         lbn = (daddr_t)lblkno(fs, off);
 698         if (lbn < 0)
 699                 return (EFBIG);
 700
 701         /* The first NDADDR are direct so nothing to do */
 702         if (lbn < NDADDR)
 703                 return (0);
 704
 705         nindirshift = ip->i_ufsvfs->vfs_nindirshift;
 706         nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
 707
 708         /* Determine level of indirect blocks */
 709         shft = 0;
 710         tbn = lbn - NDADDR;
 711         for (j = NIADDR; j > 0; j--) {
 712                 longlong_t      sh;
 713
 714                 shft += nindirshift;
 715                 sh = 1LL << shft;
 716                 if (tbn < sh)
 717                         break;
 718                 tbn -= (daddr_t)sh;
 719         }
 720
 721         if (j == 0)
 722                 return (EFBIG);
 723
 724         if ((nb = ip->i_ib[NIADDR - j]) == 0)
 725                         return (0);             /* UFS Hole */
 726
 727         /* Flush first level indirect block */
 728         blkflush(ip->i_dev, fsbtodb(fs, nb));
 729
 730         /* Fetch through next levels */
 731         for (; j < NIADDR; j++) {
 732                 ob = nb;
 733                 bp = UFS_BREAD(ufsvfsp,
 734                     ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
 735                 if (bp->b_flags & B_ERROR) {
 736                         brelse(bp);
 737                         return (EIO);
 738                 }
 739                 bap = bp->b_un.b_daddr;
 740                 shft -= nindirshift;            /* sh / nindir */
 741                 i = (tbn >> shft) & nindiroffset; /* (tbn /sh) & nindir */
 742                 nb = bap[i];
 743                 brelse(bp);
 744                 if (nb == 0) {
 745                         return (0);             /* UFS hole */
 746                 }
 747                 blkflush(ip->i_dev, fsbtodb(fs, nb));
 748         }
 749         return (0);
 750 }
 751
 752 #ifdef DEBUG
 753
 754 /*
 755  * The bad block checking routines: ufs_indir_badblock() and ufs_badblock()
 756  * are very expensive. It's been found from profiling that we're
 757  * spending 6-7% of our time in ufs_badblock, and another 1-2% in
 758  * ufs_indir_badblock. They are only called via ASSERTs (from debug kernels).
 759  * In addition from experience no failures have been found in recent
 760  * years. So the following tunable can be set to enable checking.
 761  */
 762 int ufs_badblock_checks = 0;	/* nonzero enables the checks; off by default */
 763
 764 /*
 765  * Check that a given indirect block contains blocks in range
 766  */
 767 int
 768 ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
 769 {
 770         int i;
 771         int err = 0;
 772
 773         if (ufs_badblock_checks) {
 774                 for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
 775                         if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
 776                                 break;
 777         }
 778         return (err);
 779 }
 780
 781 /*
 782  * Check that a specified block number is in range.
 783  */
 784 int
 785 ufs_badblock(struct inode *ip, daddr_t bn)
 786 {
 787         long    c;
 788         daddr_t sum;
 789
 790         if (!ufs_badblock_checks)
 791                 return (0);
 792         ASSERT(bn);
 793         if (bn <= 0 || bn > ip->i_fs->fs_size)
 794                 return (bn);
 795
 796         sum = 0;
 797         c = dtog(ip->i_fs, bn);
 798         if (c == 0) {
 799                 sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
 800         }
 801         /*
 802          * if block no. is below this cylinder group,
 803          * within the space reserved for superblock, inodes, (summary data)
 804          * or if it is above this cylinder group
 805          * then its invalid
 806          * It's hard to see how we'd be outside this cyl, but let's be careful.
 807          */
 808         if ((bn < cgbase(ip->i_fs, c)) ||
 809             (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c)+sum) ||
 810             (bn >= (unsigned)cgbase(ip->i_fs, c+1)))
 811                 return (bn);
 812
 813         return (0);     /* not a bad block */
 814 }
 815
 816 #endif /* DEBUG */
 817
/*
 * When i_rwlock is write-locked or has a writer pending, the inode
 * is going to change in a way that will mark the filesystem as
 * active, so there is no need to let the filesystem be marked as
 * stable now.  Also, to ensure filesystem consistency during directory
 * operations, the filesystem cannot be marked as stable while the
 * i_rwlock of the directory inode is write-locked.
 */
 825  */
 826
 827 /*
 828  * Check for busy inodes for this filesystem.
 829  * NOTE: Needs better way to do this expensive operation in the future.
 830  */
 831 static void
 832 ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
 833 {
 834         union  ihead    *ih;
 835         struct inode    *ip;
 836         int             i;
 837         int             isnottrans      = !TRANS_ISTRANS(ufsvfsp);
 838         int             isbusy          = *isbusyp;
 839         int             isreclaim       = *isreclaimp;
 840
 841         for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
 842                 mutex_enter(&ih_lock[i]);
 843                 for (ip = ih->ih_chain[0];
 844                     ip != (struct inode *)ih;
 845                     ip = ip->i_forw) {
 846                         /*
 847                          * if inode is busy/modified/deleted, filesystem is busy
 848                          */
 849                         if (ip->i_ufsvfs != ufsvfsp)
 850                                 continue;
 851                         if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
 852                             (RW_ISWRITER(&ip->i_rwlock)))
 853                                 isbusy = 1;
 854                         if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
 855                                 isreclaim = 1;
 856                         if (isbusy && (isreclaim || isnottrans))
 857                                 break;
 858                 }
 859                 mutex_exit(&ih_lock[i]);
 860                 if (isbusy && (isreclaim || isnottrans))
 861                         break;
 862         }
 863         *isbusyp = isbusy;
 864         *isreclaimp = isreclaim;
 865 }
 866
 867 /*
 868  * As part of the ufs 'sync' operation, this routine is called to mark
 869  * the filesystem as STABLE if there is no modified metadata in memory.
 870  */
 871 void
 872 ufs_checkclean(struct vfs *vfsp)
 873 {
 874         struct ufsvfs   *ufsvfsp        = (struct ufsvfs *)vfsp->vfs_data;
 875         struct fs       *fs             = ufsvfsp->vfs_fs;
 876         int             isbusy;
 877         int             isreclaim;
 878         int             updatesb;
 879
 880         ASSERT(vfs_lock_held(vfsp));
 881
 882         /*
 883          * filesystem is stable or cleanflag processing is disabled; do nothing
 884          *      no transitions when panic'ing
 885          */
 886         if (fs->fs_ronly ||
 887             fs->fs_clean == FSBAD ||
 888             fs->fs_clean == FSSUSPEND ||
 889             fs->fs_clean == FSSTABLE ||
 890             panicstr)
 891                 return;
 892
 893         /*
 894          * if logging and nothing to reclaim; do nothing
 895          */
 896         if ((fs->fs_clean == FSLOG) &&
 897             (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
 898             (fs->fs_reclaim & FS_RECLAIMING)))
 899                 return;
 900
 901         /*
 902          * FS_CHECKCLEAN is reset if the file system goes dirty
 903          * FS_CHECKRECLAIM is reset if a file gets deleted
 904          */
 905         mutex_enter(&ufsvfsp->vfs_lock);
 906         fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
 907         mutex_exit(&ufsvfsp->vfs_lock);
 908
 909         updatesb = 0;
 910
 911         /*
 912          * if logging or buffers are busy; do nothing
 913          */
 914         isbusy = isreclaim = 0;
 915         if ((fs->fs_clean == FSLOG) ||
 916             (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
 917                 isbusy = 1;
 918
 919         /*
 920          * isreclaim == TRUE means can't change the state of fs_reclaim
 921          */
 922         isreclaim =
 923             ((fs->fs_clean == FSLOG) &&
 924             (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
 925             (fs->fs_reclaim & FS_RECLAIMING)));
 926
 927         /*
 928          * if fs is busy or can't change the state of fs_reclaim; do nothing
 929          */
 930         if (isbusy && isreclaim)
 931                 return;
 932
 933         /*
 934          * look for busy or deleted inodes; (deleted == needs reclaim)
 935          */
 936         ufs_icheck(ufsvfsp, &isbusy, &isreclaim);
 937
 938         mutex_enter(&ufsvfsp->vfs_lock);
 939
 940         /*
 941          * IF POSSIBLE, RESET RECLAIM
 942          */
 943         /*
 944          * the reclaim thread is not running
 945          */
 946         if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
 947                 /*
 948                  * no files were deleted during the scan
 949                  */
 950                 if (fs->fs_reclaim & FS_CHECKRECLAIM)
 951                         /*
 952                          * no deleted files were found in the inode cache
 953                          */
 954                         if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
 955                                 fs->fs_reclaim &= ~FS_RECLAIM;
 956                                 updatesb = 1;
 957                         }
 958         /*
 959          * IF POSSIBLE, SET STABLE
 960          */
 961         /*
 962          * not logging
 963          */
 964         if (fs->fs_clean != FSLOG)
 965                 /*
 966                  * file system has not gone dirty since the scan began
 967                  */
 968                 if (fs->fs_reclaim & FS_CHECKCLEAN)
 969                         /*
 970                          * nothing dirty was found in the buffer or inode cache
 971                          */
 972                         if ((isbusy == 0) && (isreclaim == 0) &&
 973                             (fs->fs_clean != FSSTABLE)) {
 974                                 fs->fs_clean = FSSTABLE;
 975                                 updatesb = 1;
 976                         }
 977
 978         mutex_exit(&ufsvfsp->vfs_lock);
 979         if (updatesb) {
 980                 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
 981         }
 982 }
 983
 984 /*
 985  * called whenever an unlink occurs
 986  */
 987 void
 988 ufs_setreclaim(struct inode *ip)
 989 {
 990         struct ufsvfs   *ufsvfsp        = ip->i_ufsvfs;
 991         struct fs       *fs             = ufsvfsp->vfs_fs;
 992
 993         if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
 994                 return;
 995
 996         /*
 997          * reclaim-needed bit is already set or we need to tell
 998          * ufs_checkclean that a file has been deleted
 999          */
1000         if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
1001                 return;
1002
1003         mutex_enter(&ufsvfsp->vfs_lock);
1004         /*
1005          * inform ufs_checkclean that the file system has gone dirty
1006          */
1007         fs->fs_reclaim &= ~FS_CHECKRECLAIM;
1008
1009         /*
1010          * set the reclaim-needed bit
1011          */
1012         if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
1013                 fs->fs_reclaim |= FS_RECLAIM;
1014                 ufs_sbwrite(ufsvfsp);
1015         }
1016         mutex_exit(&ufsvfsp->vfs_lock);
1017 }
1018
1019 /*
1020  * Before any modified metadata written back to the disk, this routine
1021  * is called to mark the filesystem as ACTIVE.
1022  */
1023 void
1024 ufs_notclean(struct ufsvfs *ufsvfsp)
1025 {
1026         struct fs *fs = ufsvfsp->vfs_fs;
1027
1028         ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1029         ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
1030
1031         /*
1032          * inform ufs_checkclean that the file system has gone dirty
1033          */
1034         fs->fs_reclaim &= ~FS_CHECKCLEAN;
1035
1036         /*
1037          * ignore if active or bad or suspended or readonly or logging
1038          */
1039         if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
1040             (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
1041             (fs->fs_ronly)) {
1042                 mutex_exit(&ufsvfsp->vfs_lock);
1043                 return;
1044         }
1045         fs->fs_clean = FSACTIVE;
1046         /*
1047          * write superblock synchronously
1048          */
1049         ufs_sbwrite(ufsvfsp);
1050         mutex_exit(&ufsvfsp->vfs_lock);
1051 }
1052
1053 /*
1054  * ufs specific fbwrite()
1055  */
1056 int
1057 ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
1058 {
1059         struct ufsvfs   *ufsvfsp        = ip->i_ufsvfs;
1060
1061         if (TRANS_ISTRANS(ufsvfsp))
1062                 return (fbwrite(fbp));
1063         mutex_enter(&ufsvfsp->vfs_lock);
1064         ufs_notclean(ufsvfsp);
1065         return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
1066 }
1067
1068 /*
1069  * ufs specific fbiwrite()
1070  */
1071 int
1072 ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
1073 {
1074         struct ufsvfs   *ufsvfsp        = ip->i_ufsvfs;
1075         o_mode_t        ifmt            = ip->i_mode & IFMT;
1076         buf_t           *bp;
1077         int             error;
1078
1079         mutex_enter(&ufsvfsp->vfs_lock);
1080         ufs_notclean(ufsvfsp);
1081         if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
1082             (ip->i_ufsvfs->vfs_qinod == ip)) {
1083                 TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
1084                     fbp->fb_count, DT_FBI, 0, 0);
1085         }
1086         /*
1087          * Inlined version of fbiwrite()
1088          */
1089         bp = pageio_setup(NULL, fbp->fb_count, ip->i_devvp, B_WRITE);
1090         bp->b_flags &= ~B_PAGEIO;
1091         bp->b_un.b_addr = fbp->fb_addr;
1092
1093         bp->b_blkno = bn * btod(bsize);
1094         bp->b_dev = cmpdev(ip->i_dev);  /* store in old dev format */
1095         bp->b_edev = ip->i_dev;
1096         bp->b_proc = NULL;                      /* i.e. the kernel */
1097         bp->b_file = ip->i_vnode;
1098         bp->b_offset = -1;
1099
1100         if (ufsvfsp->vfs_log) {
1101                 lufs_write_strategy(ufsvfsp->vfs_log, bp);
1102         } else if (ufsvfsp->vfs_snapshot) {
1103                 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
1104         } else {
1105                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
1106                 ub.ub_fbiwrites.value.ul++;
1107                 (void) bdev_strategy(bp);
1108                 lwp_stat_update(LWP_STAT_OUBLK, 1);
1109         }
1110         error = biowait(bp);
1111         pageio_done(bp);
1112         fbrelse(fbp, S_OTHER);
1113         return (error);
1114 }
1115
1116 /*
1117  * Write the ufs superblock only.
1118  */
1119 void
1120 ufs_sbwrite(struct ufsvfs *ufsvfsp)
1121 {
1122         char sav_fs_fmod;
1123         struct fs *fs = ufsvfsp->vfs_fs;
1124         struct buf *bp = ufsvfsp->vfs_bufp;
1125
1126         ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1127
1128         /*
1129          * for ulockfs processing, limit the superblock writes
1130          */
1131         if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
1132             (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
1133                 /* try again later */
1134                 fs->fs_fmod = 1;
1135                 return;
1136         }
1137
1138         ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
1139         /*
1140          * update superblock timestamp and fs_clean checksum
1141          * if marked FSBAD, we always want an erroneous
1142          * checksum to force repair
1143          */
1144         fs->fs_time = gethrestime_sec();
1145         fs->fs_state = (fs->fs_clean != FSBAD) ?
1146             FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
1147         switch (fs->fs_clean) {
1148         case FSCLEAN:
1149         case FSSTABLE:
1150                 fs->fs_reclaim &= ~FS_RECLAIM;
1151                 break;
1152         case FSACTIVE:
1153         case FSSUSPEND:
1154         case FSBAD:
1155         case FSLOG:
1156                 break;
1157         default:
1158                 fs->fs_clean = FSACTIVE;
1159                 break;
1160         }
1161         /*
1162          * reset incore only bits
1163          */
1164         fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);
1165
1166         /*
1167          * delta the whole superblock
1168          */
1169         TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
1170             DT_SB, NULL, 0);
1171         /*
1172          * retain the incore state of fs_fmod; set the ondisk state to 0
1173          */
1174         sav_fs_fmod = fs->fs_fmod;
1175         fs->fs_fmod = 0;
1176
1177         /*
1178          * Don't release the buffer after written to the disk
1179          */
1180         UFS_BWRITE2(ufsvfsp, bp);
1181         fs->fs_fmod = sav_fs_fmod;      /* reset fs_fmod's incore state */
1182 }
1183
1184 /*
1185  * Returns vfs pointer if vfs still being mounted. vfs lock is held.
1186  * Otherwise, returns NULL.
1187  *
1188  * For our purposes, "still mounted" means that the file system still appears
1189  * on the list of UFS file system instances.
1190  */
1191 static vfs_t *
1192 still_mounted(struct check_node *checkp)
1193 {
1194         struct vfs      *vfsp;
1195         struct ufsvfs   *ufsp;
1196
1197         mutex_enter(&ufsvfs_mutex);
1198         for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
1199                 if (ufsp != checkp->ufsvfs)
1200                         continue;
1201                 /*
1202                  * Tentative match:  verify it and try to lock.  (It's not at
1203                  * all clear how the verification could fail, given that we've
1204                  * gotten this far.  We would have had to reallocate the
1205                  * ufsvfs struct at hand for a new incarnation; is that really
1206                  * possible in the interval from constructing the check_node
1207                  * to here?)
1208                  */
1209                 vfsp = ufsp->vfs_vfs;
1210                 if (vfsp != checkp->vfsp)
1211                         continue;
1212                 if (vfsp->vfs_dev != checkp->vfs_dev)
1213                         continue;
1214                 if (vfs_lock(vfsp) != 0)
1215                         continue;
1216
1217                 mutex_exit(&ufsvfs_mutex);
1218                 return (vfsp);
1219         }
1220         mutex_exit(&ufsvfs_mutex);
1221         return (NULL);
1222 }
1223
1224 int
1225 ufs_si_io_done(struct buf *bp)
1226 {
1227         sema_v(&bp->b_io);
1228         return (0);
1229 }
1230
1231 #define SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
1232 #define NSIBUF 32
1233
1234 /*
1235  * ufs_construct_si()
1236  * Read each cylinder group in turn and construct the summary information
1237  */
1238 static int
1239 ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
1240 {
1241         buf_t *bps, *bp;
1242         char *bufs;
1243         struct csum *sip = fs->fs_u.fs_csp;
1244         struct cg *cgp;
1245         int i, ncg;
1246         int error = 0, cg = 0;
1247
1248         bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
1249         bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);
1250
1251         /*
1252          * Initialise the buffer headers
1253          */
1254         for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
1255                 bioinit(bp);
1256                 bp->b_iodone = ufs_si_io_done;
1257                 bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
1258                 bp->b_flags = B_READ;
1259                 bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
1260                 bp->b_edev = dev;
1261         }
1262
1263         /*
1264          * Repeat while there are cylinder groups left to read.
1265          */
1266         do {
1267                 /*
1268                  * Issue upto NSIBUF asynchronous reads
1269                  */
1270                 ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
1271                 for (bp = bps, i = 0; i < ncg; i++, bp++) {
1272                         bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
1273                         if (ufsvfsp->vfs_log) {
1274                                 lufs_read_strategy(ufsvfsp->vfs_log, bp);
1275                         } else {
1276                                 (void) bdev_strategy(bp);
1277                         }
1278                 }
1279
1280                 /*
1281                  * wait for each read to finish;
1282                  * check for errors and copy the csum info
1283                  */
1284                 for (bp = bps, i = 0; i < ncg; i++, bp++) {
1285                         sema_p(&bp->b_io);
1286                         if (!error) {
1287                                 cgp = bp->b_un.b_cg;
1288                                 sip[cg + i] = cgp->cg_cs;
1289                                 error = geterror(bp);
1290                         }
1291                 }
1292                 if (error) {
1293                         goto err;
1294                 }
1295                 cg += ncg;
1296         } while (cg < fs->fs_ncg);
1297
1298 err:
1299         kmem_free(bps, NSIBUF * sizeof (buf_t));
1300         kmem_free(bufs, NSIBUF * SI_BUFSZ);
1301         return (error);
1302 }
1303
1304 /*
1305  * ufs_getsummaryinfo
1306  */
1307 int
1308 ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
1309 {
1310         int             i;              /* `for' loop counter */
1311         ssize_t         size;           /* bytes of summary info to read */
1312         daddr_t         frags;          /* frags of summary info to read */
1313         caddr_t         sip;            /* summary info */
1314         struct buf      *tp;            /* tmp buf */
1315
1316         /*
1317          * maintain metadata map for trans device (debug only)
1318          */
1319         TRANS_MATA_SI(ufsvfsp, fs);
1320
1321         /*
1322          * Compute #frags and allocate space for summary info
1323          */
1324         frags = howmany(fs->fs_cssize, fs->fs_fsize);
1325         sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
1326         fs->fs_u.fs_csp = (struct csum *)sip;
1327
1328         if (fs->fs_si == FS_SI_BAD) {
1329                 /*
1330                  * The summary information is unknown, read it in from
1331                  * the cylinder groups.
1332                  */
1333                 if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
1334                     ufsvfsp->vfs_log->un_logmap) {
1335                         logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
1336                 }
1337                 bzero(sip, (size_t)fs->fs_cssize);
1338                 if (ufs_construct_si(dev, fs, ufsvfsp)) {
1339                         kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
1340                         fs->fs_u.fs_csp = NULL;
1341                         return (EIO);
1342                 }
1343         } else {
1344                 /* Read summary info a fs block at a time */
1345                 size = fs->fs_bsize;
1346                 for (i = 0; i < frags; i += fs->fs_frag) {
1347                         if (i + fs->fs_frag > frags)
1348                                 /*
1349                                  * This happens only the last iteration, so
1350                                  * don't worry about size being reset
1351                                  */
1352                                 size = (frags - i) * fs->fs_fsize;
1353                         tp = UFS_BREAD(ufsvfsp, dev,
1354                             (daddr_t)fsbtodb(fs, fs->fs_csaddr+i), size);
1355                         tp->b_flags |= B_STALE | B_AGE;
1356                         if (tp->b_flags & B_ERROR) {
1357                                 kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
1358                                 fs->fs_u.fs_csp = NULL;
1359                                 brelse(tp);
1360                                 return (EIO);
1361                         }
1362                         bcopy(tp->b_un.b_addr, sip, size);
1363                         sip += size;
1364                         brelse(tp);
1365                 }
1366         }
1367         bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
1368         for (i = 0; i < fs->fs_ncg; ++i) {
1369                 fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
1370                 fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
1371                 fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
1372                 fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
1373         }
1374         return (0);
1375 }
1376
1377 /*
1378  * ufs_putsummaryinfo() stores all the cylinder group summary information
1379  * This is only used when logging, but the file system may not
1380  * be logging at the time, eg a read-only mount to flush the log
1381  * may push the summary info out.
1382  */
1383 int
1384 ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
1385 {
1386         struct buf      b, *bp;         /* tmp buf */
1387         caddr_t         sip;            /* summary info */
1388         ssize_t         size;           /* bytes of summary info to write */
1389         daddr_t         frags;          /* frags of summary info to write */
1390         int             i;              /* `for' loop counter */
1391         int             error;          /* error */
1392
1393         if (TRANS_ISERROR(ufsvfsp)) {
1394                 return (EIO);
1395         }
1396
1397         if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
1398                 return (0);
1399         }
1400
1401         bp = &b;
1402         bioinit(bp);
1403         bp->b_iodone = ufs_si_io_done;
1404         bp->b_bufsize = size = fs->fs_bsize;
1405         bp->b_flags = B_WRITE;
1406         bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
1407         bp->b_edev = dev;
1408         frags = howmany(fs->fs_cssize, fs->fs_fsize);
1409         sip = (caddr_t)fs->fs_u.fs_csp;
1410
1411         /* Write summary info one fs block at a time */
1412         for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
1413                 if (i + fs->fs_frag > frags) {
1414                         /*
1415                          * This happens only the last iteration, so
1416                          * don't worry about size being reset
1417                          */
1418                         size = (frags - i) * fs->fs_fsize;
1419                 }
1420                 bcopy(sip, bp->b_un.b_addr, size);
1421                 bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr+i);
1422                 bp->b_bcount = size;
1423                 (void) bdev_strategy(bp);
1424                 sema_p(&bp->b_io); /* wait for write to complete */
1425                 error = geterror(bp);
1426                 sip += size;
1427         }
1428         kmem_free(bp->b_un.b_addr, fs->fs_bsize);
1429         if (!error) {
1430                 fs->fs_si = FS_SI_OK;
1431         }
1432         return (error);
1433 }
1434
1435 /*
1436  * Decide whether it is okay to remove within a sticky directory.
1437  * Two conditions need to be met:  write access to the directory
1438  * is needed.  In sticky directories, write access is not sufficient;
1439  * you can remove entries from a directory only if you own the directory,
1440  * if you are privileged, if you own the entry or if the entry is
1441  * a plain file and you have write access to that file.
1442  * Function returns 0 if remove access is granted.
1443  * Note, the caller is responsible for holding the i_contents lock
1444  * at least as reader on the inquired inode 'ip'.
1445  */
1446 int
1447 ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
1448 {
1449         uid_t uid;
1450
1451         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1452
1453         if ((dp->i_mode & ISVTX) &&
1454             (uid = crgetuid(cr)) != dp->i_uid &&
1455             uid != ip->i_uid &&
1456             ((ip->i_mode & IFMT) != IFREG ||
1457             ufs_iaccess(ip, IWRITE, cr, 0) != 0))
1458                 return (secpolicy_vnode_remove(cr));
1459
1460         return (0);
1461 }
1462 #endif  /* _KERNEL */
1463
1464 extern  int around[9];
1465 extern  int inside[9];
1466 extern  uchar_t *fragtbl[];
1467
1468 /*
1469  * Update the frsum fields to reflect addition or deletion
1470  * of some frags.
1471  */
1472 void
1473 fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
1474 {
1475         int inblk;
1476         int field, subfield;
1477         int siz, pos;
1478
1479         /*
1480          * ufsvfsp->vfs_lock is held when calling this.
1481          */
1482         inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
1483         fragmap <<= 1;
1484         for (siz = 1; siz < fs->fs_frag; siz++) {
1485                 if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
1486                         continue;
1487                 field = around[siz];
1488                 subfield = inside[siz];
1489                 for (pos = siz; pos <= fs->fs_frag; pos++) {
1490                         if ((fragmap & field) == subfield) {
1491                                 fraglist[siz] += cnt;
1492                                 ASSERT(fraglist[siz] >= 0);
1493                                 pos += siz;
1494                                 field <<= siz;
1495                                 subfield <<= siz;
1496                         }
1497                         field <<= 1;
1498                         subfield <<= 1;
1499                 }
1500         }
1501 }
1502
1503 /*
1504  * Block operations
1505  */
1506
1507 /*
1508  * Check if a block is available
1509  */
1510 int
1511 isblock(struct fs *fs, uchar_t *cp, daddr_t h)
1512 {
1513         uchar_t mask;
1514
1515         ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1516             fs->fs_frag == 1);
1517         /*
1518          * ufsvfsp->vfs_lock is held when calling this.
1519          */
1520         switch ((int)fs->fs_frag) {
1521         case 8:
1522                 return (cp[h] == 0xff);
1523         case 4:
1524                 mask = 0x0f << ((h & 0x1) << 2);
1525                 return ((cp[h >> 1] & mask) == mask);
1526         case 2:
1527                 mask = 0x03 << ((h & 0x3) << 1);
1528                 return ((cp[h >> 2] & mask) == mask);
1529         case 1:
1530                 mask = 0x01 << (h & 0x7);
1531                 return ((cp[h >> 3] & mask) == mask);
1532         default:
1533 #ifndef _KERNEL
1534                 cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
1535                     fs->fs_frag);
1536 #endif /* _KERNEL */
1537                 return (0);
1538         }
1539 }
1540
1541 /*
1542  * Take a block out of the map
1543  */
1544 void
1545 clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
1546 {
1547         ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1548             fs->fs_frag == 1);
1549         /*
1550          * ufsvfsp->vfs_lock is held when calling this.
1551          */
1552         switch ((int)fs->fs_frag) {
1553         case 8:
1554                 cp[h] = 0;
1555                 return;
1556         case 4:
1557                 cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
1558                 return;
1559         case 2:
1560                 cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
1561                 return;
1562         case 1:
1563                 cp[h >> 3] &= ~(0x01 << (h & 0x7));
1564                 return;
1565         default:
1566 #ifndef _KERNEL
1567                 cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
1568                     fs->fs_frag);
1569 #endif /* _KERNEL */
1570                 return;
1571         }
1572 }
1573
1574 /*
1575  * Is block allocated?
1576  */
1577 int
1578 isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
1579 {
1580         uchar_t mask;
1581         int     frag;
1582         /*
1583          * ufsvfsp->vfs_lock is held when calling this.
1584          */
1585         frag = fs->fs_frag;
1586         ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
1587         switch (frag) {
1588         case 8:
1589                 return (cp[h] == 0);
1590         case 4:
1591                 mask = ~(0x0f << ((h & 0x1) << 2));
1592                 return (cp[h >> 1] == (cp[h >> 1] & mask));
1593         case 2:
1594                 mask =  ~(0x03 << ((h & 0x3) << 1));
1595                 return (cp[h >> 2] == (cp[h >> 2] & mask));
1596         case 1:
1597                 mask = ~(0x01 << (h & 0x7));
1598                 return (cp[h >> 3] == (cp[h >> 3] & mask));
1599         default:
1600 #ifndef _KERNEL
1601                 cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
1602                     fs->fs_frag);
1603 #endif /* _KERNEL */
1604                 break;
1605         }
1606         return (0);
1607 }
1608
1609 /*
1610  * Put a block into the map
1611  */
1612 void
1613 setblock(struct fs *fs, uchar_t *cp, daddr_t h)
1614 {
1615         ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1616             fs->fs_frag == 1);
1617         /*
1618          * ufsvfsp->vfs_lock is held when calling this.
1619          */
1620         switch ((int)fs->fs_frag) {
1621         case 8:
1622                 cp[h] = 0xff;
1623                 return;
1624         case 4:
1625                 cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
1626                 return;
1627         case 2:
1628                 cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
1629                 return;
1630         case 1:
1631                 cp[h >> 3] |= (0x01 << (h & 0x7));
1632                 return;
1633         default:
1634 #ifndef _KERNEL
1635                 cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
1636                     fs->fs_frag);
1637 #endif /* _KERNEL */
1638                 return;
1639         }
1640 }
1641
/*
 * Skip over leading bytes equal to c in the len-byte buffer cp.
 *
 * Returns the number of bytes left in the buffer starting at the first
 * byte that differs from c, or 0 if all len bytes equal c (or len is 0).
 */
int
skpc(char c, uint_t len, char *cp)
{
        uint_t left;

        for (left = len; left != 0; left--, cp++) {
                if (*cp != c)
                        break;
        }
        return (left);
}