kernel/fs/ufs/ufs_lockfs.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 #include <sys/types.h>
  26 #include <sys/t_lock.h>
  27 #include <sys/param.h>
  28 #include <sys/time.h>
  29 #include <sys/systm.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/resource.h>
  32 #include <sys/signal.h>
  33 #include <sys/cred.h>
  34 #include <sys/user.h>
  35 #include <sys/buf.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vnode.h>
  38 #include <sys/proc.h>
  39 #include <sys/disp.h>
  40 #include <sys/file.h>
  41 #include <sys/fcntl.h>
  42 #include <sys/flock.h>
  43 #include <sys/atomic.h>
  44 #include <sys/kmem.h>
  45 #include <sys/uio.h>
  46 #include <sys/conf.h>
  47 #include <sys/mman.h>
  48 #include <sys/pathname.h>
  49 #include <sys/debug.h>
  50 #include <sys/vmsystm.h>
  51 #include <sys/cmn_err.h>
  52 #include <sys/acct.h>
  53 #include <sys/dnlc.h>
  54 #include <sys/swap.h>
  55
  56 #include <sys/fs/ufs_fs.h>
  57 #include <sys/fs/ufs_inode.h>
  58 #include <sys/fs/ufs_fsdir.h>
  59 #include <sys/fs/ufs_trans.h>
  60 #include <sys/fs/ufs_panic.h>
  61 #include <sys/fs/ufs_mount.h>
  62 #include <sys/fs/ufs_bio.h>
  63 #include <sys/fs/ufs_log.h>
  64 #include <sys/fs/ufs_quota.h>
  65 #include <sys/dirent.h>         /* must be AFTER <sys/fs/fsdir.h>! */
  66 #include <sys/errno.h>
  67 #include <sys/sysinfo.h>
  68
  69 #include <vm/hat.h>
  70 #include <vm/pvn.h>
  71 #include <vm/as.h>
  72 #include <vm/seg.h>
  73 #include <vm/seg_map.h>
  74 #include <vm/seg_vn.h>
  75 #include <vm/rm.h>
  76 #include <vm/anon.h>
  77 #include <sys/swap.h>
  78 #include <sys/dnlc.h>
  79
  80 extern struct vnode *common_specvp(struct vnode *vp);
  81
  82 /* error lock status */
  83 #define UN_ERRLCK       (-1)
  84 #define SET_ERRLCK      1
  85 #define RE_ERRLCK       2
  86 #define NO_ERRLCK       0
  87
  88 /*
  89  * Index to be used in TSD for storing lockfs data
  90  */
  91 uint_t ufs_lockfs_key;
  92
  93 typedef struct _ulockfs_info {
  94         struct _ulockfs_info *next;
  95         struct ulockfs *ulp;
  96         uint_t flags;
  97 } ulockfs_info_t;
  98
  99 #define ULOCK_INFO_FALLOCATE    0x00000001      /* fallocate thread */
 100
 101 /*
 102  * Check in TSD that whether we are already doing any VOP on this filesystem
 103  */
 104 #define IS_REC_VOP(found, head, ulp, free)              \
 105 {                                                       \
 106         ulockfs_info_t *_curr;                          \
 107                                                         \
 108         for (found = 0, free = NULL, _curr = head;      \
 109             _curr != NULL; _curr = _curr->next) {       \
 110                 if ((free == NULL) &&                   \
 111                     (_curr->ulp == NULL))               \
 112                         free = _curr;                   \
 113                 if (_curr->ulp == ulp) {                \
 114                         found = 1;                      \
 115                         break;                          \
 116                 }                                       \
 117         }                                               \
 118 }
 119
 120 /*
 121  * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 122  * properly
 123  */
 124 #define SEARCH_ULOCKFSP(head, ulp, info)                \
 125 {                                                       \
 126         ulockfs_info_t *_curr;                          \
 127                                                         \
 128         for (_curr = head; _curr != NULL;               \
 129             _curr = _curr->next) {                      \
 130                 if (_curr->ulp == ulp) {                \
 131                         break;                          \
 132                 }                                       \
 133         }                                               \
 134                                                         \
 135         info = _curr;                                   \
 136 }
 137
 138 /*
 139  * Validate lockfs request
 140  */
 141 static int
 142 ufs_getlfd(
 143         struct lockfs *lockfsp,         /* new lock request */
 144         struct lockfs *ul_lockfsp)      /* old lock state */
 145 {
 146         int     error = 0;
 147
 148         /*
 149          * no input flags defined
 150          */
 151         if (lockfsp->lf_flags != 0) {
 152                 error = EINVAL;
 153                 goto errout;
 154         }
 155
 156         /*
 157          * check key
 158          */
 159         if (!LOCKFS_IS_ULOCK(ul_lockfsp))
 160                 if (lockfsp->lf_key != ul_lockfsp->lf_key) {
 161                         error = EINVAL;
 162                         goto errout;
 163         }
 164
 165         lockfsp->lf_key = ul_lockfsp->lf_key + 1;
 166
 167 errout:
 168         return (error);
 169 }
 170
 171 /*
 172  * ufs_checkaccton
 173  *      check if accounting is turned on on this fs
 174  */
 175
 176 int
 177 ufs_checkaccton(struct vnode *vp)
 178 {
 179         if (acct_fs_in_use(vp))
 180                 return (EDEADLK);
 181         return (0);
 182 }
 183
 184 /*
 185  * ufs_checkswapon
 186  *      check if local swapping is to file on this fs
 187  */
 188 int
 189 ufs_checkswapon(struct vnode *vp)
 190 {
 191         struct swapinfo *sip;
 192
 193         mutex_enter(&swapinfo_lock);
 194         for (sip = swapinfo; sip; sip = sip->si_next)
 195                 if (sip->si_vp->v_vfsp == vp->v_vfsp) {
 196                         mutex_exit(&swapinfo_lock);
 197                         return (EDEADLK);
 198                 }
 199         mutex_exit(&swapinfo_lock);
 200         return (0);
 201 }
 202
 203 /*
 204  * ufs_freeze
 205  *      pend future accesses for current lock and desired lock
 206  */
 207 void
 208 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
 209 {
 210         /*
 211          * set to new lock type
 212          */
 213         ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
 214         ulp->ul_lockfs.lf_key = lockfsp->lf_key;
 215         ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
 216         ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
 217
 218         ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
 219 }
 220
 221 /*
 222  * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 223  * starting ufs_quiesce() protocol and decrement it only when a file system no
 224  * longer has to be in quiescent state. This allows ufs_pageio() to detect
 225  * that another thread wants to quiesce a file system. See more comments in
 226  * ufs_pageio().
 227  */
 228 ulong_t ufs_quiesce_pend = 0;
 229
 230 /*
 231  * ufs_quiesce
 232  *      wait for outstanding accesses to finish
 233  */
 234 int
 235 ufs_quiesce(struct ulockfs *ulp)
 236 {
 237         int error = 0;
 238         ulockfs_info_t *head;
 239         ulockfs_info_t *info;
 240         klwp_t *lwp = ttolwp(curthread);
 241
 242         head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
 243         SEARCH_ULOCKFSP(head, ulp, info);
 244
 245         /*
 246          * We have to keep /proc away from stopping us after we applied
 247          * the softlock but before we got a chance to clear it again.
 248          * prstop() may pagefault and become stuck on the softlock still
 249          * pending.
 250          */
 251         if (lwp != NULL)
 252                 lwp->lwp_nostop++;
 253
 254         /*
 255          * Set a softlock to suspend future ufs_vnops so that
 256          * this lockfs request will not be starved
 257          */
 258         ULOCKFS_SET_SLOCK(ulp);
 259         ASSERT(ufs_quiesce_pend);
 260
 261         /* check if there is any outstanding ufs vnodeops calls */
 262         while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
 263                 /*
 264                  * use timed version of cv_wait_sig() to make sure we don't
 265                  * miss a wake up call from ufs_pageio() when it doesn't use
 266                  * ul_lock.
 267                  *
 268                  * when a fallocate thread comes in, the only way it returns
 269                  * from this function is if there are no other vnode operations
 270                  * going on (remember fallocate threads are tracked using
 271                  * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
 272                  * hasn't already grabbed the fs write lock.
 273                  */
 274                 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
 275                         if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
 276                                 goto out;
 277                 }
 278                 if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz,
 279                     TR_CLOCK_TICK)) {
 280                         error = EINTR;
 281                         goto out;
 282                 }
 283         }
 284
 285 out:
 286         /*
 287          * unlock the soft lock
 288          */
 289         ULOCKFS_CLR_SLOCK(ulp);
 290
 291         if (lwp != NULL)
 292                 lwp->lwp_nostop--;
 293
 294         return (error);
 295 }
 296
 297 /*
 298  * ufs_flush_inode
 299  */
 300 int
 301 ufs_flush_inode(struct inode *ip, void *arg)
 302 {
 303         int     error;
 304         int     saverror        = 0;
 305
 306         /*
 307          * wrong file system; keep looking
 308          */
 309         if (ip->i_ufsvfs != (struct ufsvfs *)arg)
 310                 return (0);
 311
 312         /*
 313          * asynchronously push all the dirty pages
 314          */
 315         if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
 316             (error != EAGAIN))
 317                 saverror = error;
 318         /*
 319          * wait for io and discard all mappings
 320          */
 321         if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
 322                 saverror = error;
 323
 324         if (ITOV(ip)->v_type == VDIR) {
 325                 dnlc_dir_purge(&ip->i_danchor);
 326         }
 327
 328         return (saverror);
 329 }
 330
 331 /*
 332  * ufs_flush
 333  *      Flush everything that is currently dirty; this includes invalidating
 334  *      any mappings.
 335  */
 336 int
 337 ufs_flush(struct vfs *vfsp)
 338 {
 339         int             error;
 340         int             saverror = 0;
 341         struct ufsvfs   *ufsvfsp        = (struct ufsvfs *)vfsp->vfs_data;
 342         struct fs       *fs             = ufsvfsp->vfs_fs;
 343         int             tdontblock = 0;
 344
 345         ASSERT(vfs_lock_held(vfsp));
 346
 347         /*
 348          * purge dnlc
 349          */
 350         (void) dnlc_purge_vfsp(vfsp, 0);
 351
 352         /*
 353          * drain the delete and idle threads
 354          */
 355         ufs_delete_drain(vfsp, 0, 0);
 356         ufs_idle_drain(vfsp);
 357
 358         /*
 359          * flush and invalidate quota records
 360          */
 361         (void) qsync(ufsvfsp);
 362
 363         /*
 364          * flush w/invalidate the inodes for vfsp
 365          */
 366         if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
 367                 saverror = error;
 368
 369         /*
 370          * synchronously flush superblock and summary info
 371          */
 372         if (fs->fs_ronly == 0 && fs->fs_fmod) {
 373                 fs->fs_fmod = 0;
 374                 TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
 375         }
 376         /*
 377          * flush w/invalidate block device pages and buf cache
 378          */
 379         if ((error = fop_putpage(common_specvp(ufsvfsp->vfs_devvp),
 380             0, 0, B_INVAL, CRED(), NULL)) > 0)
 381                 saverror = error;
 382
 383         (void) bflush((dev_t)vfsp->vfs_dev);
 384         (void) bfinval((dev_t)vfsp->vfs_dev, 0);
 385
 386         /*
 387          * drain the delete and idle threads again
 388          */
 389         ufs_delete_drain(vfsp, 0, 0);
 390         ufs_idle_drain(vfsp);
 391
 392         /*
 393          * play with the clean flag
 394          */
 395         if (saverror == 0)
 396                 ufs_checkclean(vfsp);
 397
 398         /*
 399          * Flush any outstanding transactions and roll the log
 400          * only if we are supposed to do, i.e. LDL_NOROLL not set.
 401          * We can not simply check for fs_ronly here since fsck also may
 402          * use this code to roll the log on a read-only filesystem, e.g.
 403          * root during early stages of boot, if other then a sanity check is
 404          * done, it will clear LDL_NOROLL before.
 405          * In addition we assert that the deltamap does not contain any deltas
 406          * in case LDL_NOROLL is set since this is not supposed to happen.
 407          */
 408         if (TRANS_ISTRANS(ufsvfsp)) {
 409                 ml_unit_t       *ul     = ufsvfsp->vfs_log;
 410                 mt_map_t        *mtm    = ul->un_deltamap;
 411
 412                 if (ul->un_flags & LDL_NOROLL) {
 413                         ASSERT(mtm->mtm_nme == 0);
 414                 } else {
 415                         /*
 416                          * Do not set T_DONTBLOCK if there is a
 417                          * transaction opened by caller.
 418                          */
 419                         if (curthread->t_flag & T_DONTBLOCK)
 420                                 tdontblock = 1;
 421                         else
 422                                 curthread->t_flag |= T_DONTBLOCK;
 423
 424                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
 425                                          TOP_COMMIT_SIZE, &error);
 426
 427                         if (!error) {
 428                                 TRANS_END_SYNC(ufsvfsp, &saverror,
 429                                                TOP_COMMIT_FLUSH,
 430                                                TOP_COMMIT_SIZE);
 431                         }
 432
 433                         if (tdontblock == 0)
 434                                 curthread->t_flag &= ~T_DONTBLOCK;
 435
 436                         logmap_roll_dev(ufsvfsp->vfs_log);
 437                 }
 438         }
 439
 440         return (saverror);
 441 }
 442
 443 /*
 444  * ufs_thaw_wlock
 445  *      special processing when thawing down to wlock
 446  */
 447 static int
 448 ufs_thaw_wlock(struct inode *ip, void *arg)
 449 {
 450         /*
 451          * wrong file system; keep looking
 452          */
 453         if (ip->i_ufsvfs != (struct ufsvfs *)arg)
 454                 return (0);
 455
 456         /*
 457          * iupdat refuses to clear flags if the fs is read only.  The fs
 458          * may become read/write during the lock and we wouldn't want
 459          * these inodes being written to disk.  So clear the flags.
 460          */
 461         rw_enter(&ip->i_contents, RW_WRITER);
 462         ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
 463         rw_exit(&ip->i_contents);
 464
 465         /*
 466          * pages are mlocked -- fail wlock
 467          */
 468         if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
 469                 return (EBUSY);
 470
 471         return (0);
 472 }
 473
 474 /*
 475  * ufs_thaw_hlock
 476  *      special processing when thawing down to hlock or elock
 477  */
 478 static int
 479 ufs_thaw_hlock(struct inode *ip, void *arg)
 480 {
 481         struct vnode    *vp     = ITOV(ip);
 482
 483         /*
 484          * wrong file system; keep looking
 485          */
 486         if (ip->i_ufsvfs != (struct ufsvfs *)arg)
 487                 return (0);
 488
 489         /*
 490          * blow away all pages - even if they are mlocked
 491          */
 492         do {
 493                 (void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
 494         } while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
 495         rw_enter(&ip->i_contents, RW_WRITER);
 496         ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
 497         rw_exit(&ip->i_contents);
 498
 499         return (0);
 500 }
 501
 502 /*
 503  * ufs_thaw
 504  *      thaw file system lock down to current value
 505  */
 506 int
 507 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
 508 {
 509         int             error   = 0;
 510         int             noidel  = (int)(ulp->ul_flag & ULOCKFS_NOIDEL);
 511
 512         /*
 513          * if wlock or hlock or elock
 514          */
 515         if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
 516             ULOCKFS_IS_ELOCK(ulp)) {
 517
 518                 /*
 519                  * don't keep access times
 520                  * don't free deleted files
 521                  * if superblock writes are allowed, limit them to me for now
 522                  */
 523                 ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
 524                 if (ulp->ul_sbowner != (kthread_id_t)-1)
 525                         ulp->ul_sbowner = curthread;
 526
 527                 /*
 528                  * wait for writes for deleted files and superblock updates
 529                  */
 530                 (void) ufs_flush(vfsp);
 531
 532                 /*
 533                  * now make sure the quota file is up-to-date
 534                  *      expensive; but effective
 535                  */
 536                 error = ufs_flush(vfsp);
 537                 /*
 538                  * no one can write the superblock
 539                  */
 540                 ulp->ul_sbowner = (kthread_id_t)-1;
 541
 542                 /*
 543                  * special processing for wlock/hlock/elock
 544                  */
 545                 if (ULOCKFS_IS_WLOCK(ulp)) {
 546                         if (error)
 547                                 goto errout;
 548                         error = bfinval(ufsvfsp->vfs_dev, 0);
 549                         if (error)
 550                                 goto errout;
 551                         error = ufs_scan_inodes(0, ufs_thaw_wlock,
 552                             (void *)ufsvfsp, ufsvfsp);
 553                         if (error)
 554                                 goto errout;
 555                 }
 556                 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
 557                         error = 0;
 558                         (void) ufs_scan_inodes(0, ufs_thaw_hlock,
 559                             (void *)ufsvfsp, ufsvfsp);
 560                         (void) bfinval(ufsvfsp->vfs_dev, 1);
 561                 }
 562         } else {
 563
 564                 /*
 565                  * okay to keep access times
 566                  * okay to free deleted files
 567                  * okay to write the superblock
 568                  */
 569                 ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
 570                 ulp->ul_sbowner = NULL;
 571
 572                 /*
 573                  * flush in case deleted files are in memory
 574                  */
 575                 if (noidel) {
 576                         if (error = ufs_flush(vfsp))
 577                                 goto errout;
 578                 }
 579         }
 580
 581 errout:
 582         cv_broadcast(&ulp->ul_cv);
 583         return (error);
 584 }
 585
 586 /*
 587  * ufs_reconcile_fs
 588  *      reconcile incore superblock with ondisk superblock
 589  */
 590 int
 591 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
 592 {
 593         struct fs       *mfs;   /* in-memory superblock */
 594         struct fs       *dfs;   /* on-disk   superblock */
 595         struct buf      *bp;    /* on-disk   superblock buf */
 596         int              needs_unlock;
 597         char             finished_fsclean;
 598
 599         mfs = ufsvfsp->vfs_fs;
 600
 601         /*
 602          * get the on-disk copy of the superblock
 603          */
 604         bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
 605         bp->b_flags |= (B_STALE|B_AGE);
 606         if (bp->b_flags & B_ERROR) {
 607                 brelse(bp);
 608                 return (EIO);
 609         }
 610         dfs = bp->b_un.b_fs;
 611
 612         /* error locks may only unlock after the fs has been made consistent */
 613         if (errlck == UN_ERRLCK) {
 614                 if (dfs->fs_clean == FSFIX) {   /* being repaired */
 615                         brelse(bp);
 616                         return (EAGAIN);
 617                 }
 618                 /* repair not yet started? */
 619                 finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
 620                 if (dfs->fs_clean != finished_fsclean) {
 621                         brelse(bp);
 622                         return (EBUSY);
 623                 }
 624         }
 625
 626         /*
 627          * if superblock has changed too much, abort
 628          */
 629         if ((mfs->fs_sblkno             != dfs->fs_sblkno) ||
 630             (mfs->fs_cblkno             != dfs->fs_cblkno) ||
 631             (mfs->fs_iblkno             != dfs->fs_iblkno) ||
 632             (mfs->fs_dblkno             != dfs->fs_dblkno) ||
 633             (mfs->fs_cgoffset           != dfs->fs_cgoffset) ||
 634             (mfs->fs_cgmask             != dfs->fs_cgmask) ||
 635             (mfs->fs_bsize              != dfs->fs_bsize) ||
 636             (mfs->fs_fsize              != dfs->fs_fsize) ||
 637             (mfs->fs_frag               != dfs->fs_frag) ||
 638             (mfs->fs_bmask              != dfs->fs_bmask) ||
 639             (mfs->fs_fmask              != dfs->fs_fmask) ||
 640             (mfs->fs_bshift             != dfs->fs_bshift) ||
 641             (mfs->fs_fshift             != dfs->fs_fshift) ||
 642             (mfs->fs_fragshift          != dfs->fs_fragshift) ||
 643             (mfs->fs_fsbtodb            != dfs->fs_fsbtodb) ||
 644             (mfs->fs_sbsize             != dfs->fs_sbsize) ||
 645             (mfs->fs_nindir             != dfs->fs_nindir) ||
 646             (mfs->fs_nspf               != dfs->fs_nspf) ||
 647             (mfs->fs_trackskew          != dfs->fs_trackskew) ||
 648             (mfs->fs_cgsize             != dfs->fs_cgsize) ||
 649             (mfs->fs_ntrak              != dfs->fs_ntrak) ||
 650             (mfs->fs_nsect              != dfs->fs_nsect) ||
 651             (mfs->fs_spc                != dfs->fs_spc) ||
 652             (mfs->fs_cpg                != dfs->fs_cpg) ||
 653             (mfs->fs_ipg                != dfs->fs_ipg) ||
 654             (mfs->fs_fpg                != dfs->fs_fpg) ||
 655             (mfs->fs_postblformat       != dfs->fs_postblformat) ||
 656             (mfs->fs_magic              != dfs->fs_magic)) {
 657                 brelse(bp);
 658                 return (EACCES);
 659         }
 660         if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
 661                 if (mfs->fs_clean == FSLOG) {
 662                         brelse(bp);
 663                         return (EACCES);
 664                 }
 665
 666         /*
 667          * get new summary info
 668          */
 669         if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
 670                 brelse(bp);
 671                 return (EIO);
 672         }
 673
 674         /*
 675          * release old summary info and update in-memory superblock
 676          */
 677         kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
 678         mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;    /* Only entry 0 used */
 679
 680         /*
 681          * update fields allowed to change
 682          */
 683         mfs->fs_size            = dfs->fs_size;
 684         mfs->fs_dsize           = dfs->fs_dsize;
 685         mfs->fs_ncg             = dfs->fs_ncg;
 686         mfs->fs_minfree         = dfs->fs_minfree;
 687         mfs->fs_rotdelay        = dfs->fs_rotdelay;
 688         mfs->fs_rps             = dfs->fs_rps;
 689         mfs->fs_maxcontig       = dfs->fs_maxcontig;
 690         mfs->fs_maxbpg          = dfs->fs_maxbpg;
 691         mfs->fs_csmask          = dfs->fs_csmask;
 692         mfs->fs_csshift         = dfs->fs_csshift;
 693         mfs->fs_optim           = dfs->fs_optim;
 694         mfs->fs_csaddr          = dfs->fs_csaddr;
 695         mfs->fs_cssize          = dfs->fs_cssize;
 696         mfs->fs_ncyl            = dfs->fs_ncyl;
 697         mfs->fs_cstotal         = dfs->fs_cstotal;
 698         mfs->fs_reclaim         = dfs->fs_reclaim;
 699
 700         if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
 701                 mfs->fs_reclaim &= ~FS_RECLAIM;
 702                 mfs->fs_reclaim |=  FS_RECLAIMING;
 703                 ufs_thread_start(&ufsvfsp->vfs_reclaim,
 704                     ufs_thread_reclaim, vfsp);
 705         }
 706
 707         /* XXX What to do about sparecon? */
 708
 709         /* XXX need to copy volume label */
 710
 711         /*
 712          * ondisk clean flag overrides inmemory clean flag iff == FSBAD
 713          * or if error-locked and ondisk is now clean
 714          */
 715         needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
 716         if (needs_unlock)
 717                 mutex_enter(&ufsvfsp->vfs_lock);
 718
 719         if (errlck == UN_ERRLCK) {
 720                 if (finished_fsclean == dfs->fs_clean)
 721                         mfs->fs_clean = finished_fsclean;
 722                 else
 723                         mfs->fs_clean = FSBAD;
 724                 mfs->fs_state = FSOKAY - dfs->fs_time;
 725         }
 726
 727         if (FSOKAY != dfs->fs_state + dfs->fs_time ||
 728             (dfs->fs_clean == FSBAD))
 729                 mfs->fs_clean = FSBAD;
 730
 731         if (needs_unlock)
 732                 mutex_exit(&ufsvfsp->vfs_lock);
 733
 734         brelse(bp);
 735
 736         return (0);
 737 }
 738
 739 /*
 740  * ufs_reconcile_inode
 741  *      reconcile ondisk inode with incore inode
 742  */
 743 static int
 744 ufs_reconcile_inode(struct inode *ip, void *arg)
 745 {
 746         int             i;
 747         int             ndaddr;
 748         int             niaddr;
 749         struct dinode   *dp;            /* ondisk inode */
 750         struct buf      *bp     = NULL;
 751         uid_t           d_uid;
 752         gid_t           d_gid;
 753         int             error = 0;
 754         struct fs       *fs;
 755
 756         /*
 757          * not an inode we care about
 758          */
 759         if (ip->i_ufsvfs != (struct ufsvfs *)arg)
 760                 return (0);
 761
 762         fs = ip->i_fs;
 763
 764         /*
 765          * Inode reconciliation fails: we made the filesystem quiescent
 766          * and we did a ufs_flush() before calling ufs_reconcile_inode()
 767          * and thus the inode should not have been changed inbetween.
 768          * Any discrepancies indicate a logic error and a pretty
 769          * significant run-state inconsistency we should complain about.
 770          */
 771         if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
 772                 cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
 773                     "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
 774                 return (EINVAL);
 775         }
 776
 777         /*
 778          * get the dinode
 779          */
 780         bp = UFS_BREAD(ip->i_ufsvfs,
 781             ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
 782             (int)fs->fs_bsize);
 783         if (bp->b_flags & B_ERROR) {
 784                 brelse(bp);
 785                 return (EIO);
 786         }
 787         dp  = bp->b_un.b_dino;
 788         dp += itoo(fs, ip->i_number);
 789
 790         /*
 791          * handle Sun's implementation of EFT
 792          */
 793         d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
 794         d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
 795
 796         rw_enter(&ip->i_contents, RW_WRITER);
 797
 798         /*
 799          * some fields are not allowed to change
 800          */
 801         if ((ip->i_mode  != dp->di_mode) ||
 802             (ip->i_gen   != dp->di_gen) ||
 803             (ip->i_uid   != d_uid) ||
 804             (ip->i_gid   != d_gid)) {
 805                 error = EACCES;
 806                 goto out;
 807         }
 808
 809         /*
 810          * and some are allowed to change
 811          */
 812         ip->i_size              = dp->di_size;
 813         ip->i_ic.ic_flags       = dp->di_ic.ic_flags;
 814         ip->i_blocks            = dp->di_blocks;
 815         ip->i_nlink             = dp->di_nlink;
 816         if (ip->i_flag & IFASTSYMLNK) {
 817                 ndaddr = 1;
 818                 niaddr = 0;
 819         } else {
 820                 ndaddr = NDADDR;
 821                 niaddr = NIADDR;
 822         }
 823         for (i = 0; i < ndaddr; ++i)
 824                 ip->i_db[i] = dp->di_db[i];
 825         for (i = 0; i < niaddr; ++i)
 826                 ip->i_ib[i] = dp->di_ib[i];
 827
 828 out:
 829         rw_exit(&ip->i_contents);
 830         brelse(bp);
 831         return (error);
 832 }
 833
 834 /*
 835  * ufs_reconcile
 836  *      reconcile ondisk superblock/inodes with any incore
 837  */
 838 static int
 839 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
 840 {
 841         int     error = 0;
 842
 843         /*
 844          * get rid of as much inmemory data as possible
 845          */
 846         (void) ufs_flush(vfsp);
 847
 848         /*
 849          * reconcile the superblock and inodes
 850          */
 851         if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
 852                 return (error);
 853         if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
 854                 return (error);
 855         /*
 856          * allocation blocks may be incorrect; get rid of them
 857          */
 858         (void) ufs_flush(vfsp);
 859
 860         return (error);
 861 }
 862
 863 /*
 864  * File system locking
 865  */
 866 int
 867 ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
 868 {
 869         return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log));
 870 }
 871
 872 /* kernel-internal interface, also used by fix-on-panic */
 873 int
 874 ufs__fiolfs(
 875         struct vnode *vp,
 876         struct lockfs *lockfsp,
 877         int from_user,
 878         int from_log)
 879 {
 880         struct ulockfs  *ulp;
 881         struct lockfs   lfs;
 882         int             error;
 883         struct vfs      *vfsp;
 884         struct ufsvfs   *ufsvfsp;
 885         int              errlck         = NO_ERRLCK;
 886         int              poll_events    = POLLPRI;
 887         extern struct pollhead ufs_pollhd;
 888         ulockfs_info_t *head;
 889         ulockfs_info_t *info;
 890         int signal = 0;
 891
 892         /* check valid lock type */
 893         if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
 894                 return (EINVAL);
 895
 896         if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
 897                 return (EIO);
 898
 899         vfsp = vp->v_vfsp;
 900
 901         if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
 902                 return (EIO);
 903
 904         /* take the lock and check again */
 905         vfs_lock_wait(vfsp);
 906         if (vfsp->vfs_flag & VFS_UNMOUNTED) {
 907                 vfs_unlock(vfsp);
 908                 return (EIO);
 909         }
 910
 911         /*
 912          * Can't wlock or ro/elock fs with accounting or local swap file
 913          * We need to check for this before we grab the ul_lock to avoid
 914          * deadlocks with the accounting framework.
 915          */
 916         if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
 917             LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
 918                 if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
 919                         vfs_unlock(vfsp);
 920                         return (EDEADLK);
 921                 }
 922         }
 923
 924         ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
 925         ulp = &ufsvfsp->vfs_ulockfs;
 926         head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
 927         SEARCH_ULOCKFSP(head, ulp, info);
 928
 929         /*
 930          * Suspend both the reclaim thread and the delete thread.
 931          * This must be done outside the lockfs locking protocol.
 932          */
 933         ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
 934         ufs_thread_suspend(&ufsvfsp->vfs_delete);
 935
 936         mutex_enter(&ulp->ul_lock);
 937         atomic_inc_ulong(&ufs_quiesce_pend);
 938
 939         /*
 940          * Quit if there is another lockfs request in progress
 941          * that is waiting for existing ufs_vnops to complete.
 942          */
 943         if (ULOCKFS_IS_BUSY(ulp)) {
 944                 error = EBUSY;
 945                 goto errexit;
 946         }
 947
 948         /* cannot ulocked or downgrade a hard-lock */
 949         if (ULOCKFS_IS_HLOCK(ulp)) {
 950                 error = EIO;
 951                 goto errexit;
 952         }
 953
 954         /* an error lock may be unlocked or relocked, only */
 955         if (ULOCKFS_IS_ELOCK(ulp)) {
 956                 if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
 957                         error = EBUSY;
 958                         goto errexit;
 959                 }
 960         }
 961
 962         /*
 963          * a read-only error lock may only be upgraded to an
 964          * error lock or hard lock
 965          */
 966         if (ULOCKFS_IS_ROELOCK(ulp)) {
 967                 if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
 968                         error = EBUSY;
 969                         goto errexit;
 970                 }
 971         }
 972
 973         /*
 974          * until read-only error locks are fully implemented
 975          * just return EINVAL
 976          */
 977         if (LOCKFS_IS_ROELOCK(lockfsp)) {
 978                 error = EINVAL;
 979                 goto errexit;
 980         }
 981
 982         /*
 983          * an error lock may only be applied if the file system is
 984          * unlocked or already error locked.
 985          * (this is to prevent the case where a fs gets changed out from
 986          * underneath a fs that is locked for backup,
 987          * that is, name/delete/write-locked.)
 988          */
 989         if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
 990             !ULOCKFS_IS_ROELOCK(ulp)) &&
 991             (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
 992                 error = EBUSY;
 993                 goto errexit;
 994         }
 995
 996         /* get and validate the input lockfs request */
 997         if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
 998                 goto errexit;
 999
1000         /*
1001          * save current ulockfs struct
1002          */
1003         bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));
1004
1005         /*
1006          * Freeze the file system (pend future accesses)
1007          */
1008         ufs_freeze(ulp, lockfsp);
1009
1010         /*
1011          * Set locking in progress because ufs_quiesce may free the
1012          * ul_lock mutex.
1013          */
1014         ULOCKFS_SET_BUSY(ulp);
1015         /* update the ioctl copy */
1016         LOCKFS_SET_BUSY(&ulp->ul_lockfs);
1017
1018         /*
1019          * We  need to unset FWLOCK status before we call ufs_quiesce
1020          * so that the thread doesnt get suspended. We do this only if
1021          * this (fallocate) thread requested an unlock operation.
1022          */
1023         if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
1024                 if (!ULOCKFS_IS_WLOCK(ulp))
1025                         ULOCKFS_CLR_FWLOCK(ulp);
1026         }
1027
1028         /*
1029          * Quiesce (wait for outstanding accesses to finish)
1030          */
1031         if (error = ufs_quiesce(ulp)) {
1032                 /*
1033                  * Interrupted due to signal. There could still be
1034                  * pending vnops.
1035                  */
1036                 signal = 1;
1037
1038                 /*
1039                  * We do broadcast because lock-status
1040                  * could be reverted to old status.
1041                  */
1042                 cv_broadcast(&ulp->ul_cv);
1043                 goto errout;
1044         }
1045
1046         /*
1047          * If the fallocate thread requested a write fs lock operation
1048          * then we set fwlock status in the ulp.
1049          */
1050         if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
1051                 if (ULOCKFS_IS_WLOCK(ulp))
1052                         ULOCKFS_SET_FWLOCK(ulp);
1053         }
1054
1055         /*
1056          * save error lock status to pass down to reconcilation
1057          * routines and for later cleanup
1058          */
1059         if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
1060                 errlck = UN_ERRLCK;
1061
1062         if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
1063                 int needs_unlock;
1064                 int needs_sbwrite;
1065
1066                 poll_events |= POLLERR;
1067                 errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
1068                     RE_ERRLCK : SET_ERRLCK;
1069
1070                 needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
1071                 if (needs_unlock)
1072                         mutex_enter(&ufsvfsp->vfs_lock);
1073
1074                 /* disable delayed i/o */
1075                 needs_sbwrite = 0;
1076
1077                 if (errlck == SET_ERRLCK) {
1078                         ufsvfsp->vfs_fs->fs_clean = FSBAD;
1079                         needs_sbwrite = 1;
1080                 }
1081
1082                 needs_sbwrite |= ufsvfsp->vfs_dio;
1083                 ufsvfsp->vfs_dio = 0;
1084
1085                 if (needs_unlock)
1086                         mutex_exit(&ufsvfsp->vfs_lock);
1087
1088                 if (needs_sbwrite) {
1089                         ulp->ul_sbowner = curthread;
1090                         TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
1091
1092                         if (needs_unlock)
1093                                 mutex_enter(&ufsvfsp->vfs_lock);
1094
1095                         ufsvfsp->vfs_fs->fs_fmod = 0;
1096
1097                         if (needs_unlock)
1098                                 mutex_exit(&ufsvfsp->vfs_lock);
1099                 }
1100         }
1101
1102         /*
1103          * reconcile superblock and inodes if was wlocked
1104          */
1105         if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
1106                 if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
1107                         goto errout;
1108                 /*
1109                  * in case the fs grew; reset the metadata map for logging tests
1110                  */
1111                 TRANS_MATA_UMOUNT(ufsvfsp);
1112                 TRANS_MATA_MOUNT(ufsvfsp);
1113                 TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
1114         }
1115
1116         /*
1117          * At least everything *currently* dirty goes out.
1118          */
1119
1120         if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
1121             !ULOCKFS_IS_ELOCK(ulp))
1122                 goto errout;
1123
1124         /*
1125          * thaw file system and wakeup pended processes
1126          */
1127         if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
1128                 if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
1129                         goto errout;
1130
1131         /*
1132          * reset modified flag if not already write locked
1133          */
1134         if (!LOCKFS_IS_WLOCK(&lfs))
1135                 ULOCKFS_CLR_MOD(ulp);
1136
1137         /*
1138          * idle the lock struct
1139          */
1140         ULOCKFS_CLR_BUSY(ulp);
1141         /* update the ioctl copy */
1142         LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1143
1144         /*
1145          * free current comment
1146          */
1147         if (lfs.lf_comment && lfs.lf_comlen != 0) {
1148                 kmem_free(lfs.lf_comment, lfs.lf_comlen);
1149                 lfs.lf_comment = NULL;
1150                 lfs.lf_comlen = 0;
1151         }
1152
1153         /* do error lock cleanup */
1154         if (errlck == UN_ERRLCK)
1155                 ufsfx_unlockfs(ufsvfsp);
1156
1157         else if (errlck == RE_ERRLCK)
1158                 ufsfx_lockfs(ufsvfsp);
1159
1160         /* don't allow error lock from user to invoke panic */
1161         else if (from_user && errlck == SET_ERRLCK &&
1162             !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
1163                 (void) ufs_fault(ufsvfsp->vfs_root,
1164                     ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
1165                     ulp->ul_lockfs.lf_comment: "user-applied error lock");
1166
1167         atomic_dec_ulong(&ufs_quiesce_pend);
1168         mutex_exit(&ulp->ul_lock);
1169         vfs_unlock(vfsp);
1170
1171         if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
1172                 poll_events |= POLLERR;
1173
1174         pollwakeup(&ufs_pollhd, poll_events);
1175
1176         /*
1177          * Allow both the delete thread and the reclaim thread to
1178          * continue.
1179          */
1180         ufs_thread_continue(&ufsvfsp->vfs_delete);
1181         ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1182
1183         return (0);
1184
1185 errout:
1186         /*
1187          * Lock failed. Reset the old lock in ufsvfs if not hard locked.
1188          */
1189         if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
1190                 bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
1191                 ulp->ul_fs_lock = (1 << lfs.lf_lock);
1192         }
1193
1194         /*
1195          * Don't call ufs_thaw() when there's a signal during
1196          * ufs quiesce operation as it can lead to deadlock
1197          * with getpage.
1198          */
1199         if (signal == 0)
1200                 (void) ufs_thaw(vfsp, ufsvfsp, ulp);
1201
1202         ULOCKFS_CLR_BUSY(ulp);
1203         LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1204
1205 errexit:
1206         atomic_dec_ulong(&ufs_quiesce_pend);
1207         mutex_exit(&ulp->ul_lock);
1208         vfs_unlock(vfsp);
1209
1210         /*
1211          * Allow both the delete thread and the reclaim thread to
1212          * continue.
1213          */
1214         ufs_thread_continue(&ufsvfsp->vfs_delete);
1215         ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1216
1217         return (error);
1218 }
1219
1220 /*
1221  * fiolfss
1222  *      return the current file system locking state info
1223  */
1224 int
1225 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1226 {
1227         struct ulockfs  *ulp;
1228
1229         if (!vp || !vp->v_vfsp || !VTOI(vp))
1230                 return (EINVAL);
1231
1232         /* file system has been forcibly unmounted */
1233         if (VTOI(vp)->i_ufsvfs == NULL)
1234                 return (EIO);
1235
1236         ulp = VTOUL(vp);
1237
1238         if (ULOCKFS_IS_HLOCK(ulp)) {
1239                 *lockfsp = ulp->ul_lockfs;      /* structure assignment */
1240                 return (0);
1241         }
1242
1243         mutex_enter(&ulp->ul_lock);
1244
1245         *lockfsp = ulp->ul_lockfs;      /* structure assignment */
1246
1247         if (ULOCKFS_IS_MOD(ulp))
1248                 lockfsp->lf_flags |= LOCKFS_MOD;
1249
1250         mutex_exit(&ulp->ul_lock);
1251
1252         return (0);
1253 }
1254
1255 /*
1256  * ufs_check_lockfs
1257  *      check whether a ufs_vnops conflicts with the file system lock
1258  */
1259 int
1260 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1261 {
1262         k_sigset_t      smask;
1263         int             sig, slock;
1264
1265         ASSERT(MUTEX_HELD(&ulp->ul_lock));
1266
1267         while (ulp->ul_fs_lock & mask) {
1268                 slock = (int)ULOCKFS_IS_SLOCK(ulp);
1269                 if ((curthread->t_flag & T_DONTPEND) && !slock) {
1270                         curthread->t_flag |= T_WOULDBLOCK;
1271                         return (EAGAIN);
1272                 }
1273                 curthread->t_flag &= ~T_WOULDBLOCK;
1274
1275                 /*
1276                  * In the case of an onerr umount of the fs, threads could
1277                  * have blocked before coming into ufs_check_lockfs and
1278                  * need to check for the special case of ELOCK and
1279                  * vfs_dontblock being set which would indicate that the fs
1280                  * is on its way out and will not return therefore making
1281                  * EIO the appropriate response.
1282                  */
1283                 if (ULOCKFS_IS_HLOCK(ulp) ||
1284                     (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1285                         return (EIO);
1286
1287                 /*
1288                  * wait for lock status to change
1289                  */
1290                 if (slock || ufsvfsp->vfs_nointr) {
1291                         cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1292                 } else {
1293                         sigintr(&smask, 1);
1294                         sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1295                         sigunintr(&smask);
1296                         if ((!sig && (ulp->ul_fs_lock & mask)) ||
1297                             ufsvfsp->vfs_dontblock)
1298                                 return (EINTR);
1299                 }
1300         }
1301
1302         if (mask & ULOCKFS_FWLOCK) {
1303                 atomic_inc_ulong(&ulp->ul_falloc_cnt);
1304                 ULOCKFS_SET_FALLOC(ulp);
1305         } else {
1306                 atomic_inc_ulong(&ulp->ul_vnops_cnt);
1307         }
1308
1309         return (0);
1310 }
1311
1312 /*
1313  * Check whether we came across the handcrafted lockfs protocol path. We can't
1314  * simply check for T_DONTBLOCK here as one would assume since this can also
1315  * falsely catch recursive VOP's going to a different filesystem, instead we
1316  * check if we already hold the ulockfs->ul_lock mutex.
1317  */
1318 static int
1319 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1320 {
1321         return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1322 }
1323
1324 /*
1325  * ufs_lockfs_begin - start the lockfs locking protocol
1326  */
1327 int
1328 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1329 {
1330         int             error;
1331         int             rec_vop;
1332         ushort_t        op_cnt_incremented = 0;
1333         ulong_t         *ctr;
1334         struct ulockfs *ulp;
1335         ulockfs_info_t  *ulockfs_info;
1336         ulockfs_info_t  *ulockfs_info_free;
1337         ulockfs_info_t  *ulockfs_info_temp;
1338
1339         /*
1340          * file system has been forcibly unmounted
1341          */
1342         if (ufsvfsp == NULL)
1343                 return (EIO);
1344
1345         *ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1346
1347         /*
1348          * Do lockfs protocol
1349          */
1350         ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1351         IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1352
1353         /*
1354          * Detect recursive VOP call or handcrafted internal lockfs protocol
1355          * path and bail out in that case.
1356          */
1357         if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1358                 *ulpp = NULL;
1359                 return (0);
1360         } else {
1361                 if (ulockfs_info_free == NULL) {
1362                         if ((ulockfs_info_temp = (ulockfs_info_t *)
1363                             kmem_zalloc(sizeof (ulockfs_info_t),
1364                             KM_NOSLEEP)) == NULL) {
1365                                 *ulpp = NULL;
1366                                 return (ENOMEM);
1367                         }
1368                 }
1369         }
1370
1371         /*
1372          * First time VOP call
1373          *
1374          * Increment the ctr irrespective of the lockfs state. If the lockfs
1375          * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1376          * before incrementing we need to check if there is a pending quiesce
1377          * request because if we have a continuous stream of ufs_lockfs_begin
1378          * requests pounding on a few cpu's then the ufs_quiesce thread might
1379          * never see the value of zero for ctr - a livelock kind of scenario.
1380          */
1381         ctr = (mask & ULOCKFS_FWLOCK) ?
1382             &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1383         if (!ULOCKFS_IS_SLOCK(ulp)) {
1384                 atomic_inc_ulong(ctr);
1385                 op_cnt_incremented++;
1386         }
1387
1388         /*
1389          * If the lockfs state (indicated by ul_fs_lock) is not just
1390          * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
1391          * where there is a check with an appropriate mask to selectively allow
1392          * operations permitted for that kind of lockfs state.
1393          *
1394          * Even these selective operations should not be allowed to go through
1395          * if a lockfs request is in progress because that could result in inode
1396          * modifications during a quiesce and could hence result in inode
1397          * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
1398          * so make use of ufs_quiesce_pend to disallow vnode operations when a
1399          * quiesce is in progress.
1400          */
1401         if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1402                 if (op_cnt_incremented)
1403                         if (!atomic_dec_ulong_nv(ctr))
1404                                 cv_broadcast(&ulp->ul_cv);
1405                 mutex_enter(&ulp->ul_lock);
1406                 error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1407                 mutex_exit(&ulp->ul_lock);
1408                 if (error) {
1409                         if (ulockfs_info_free == NULL)
1410                                 kmem_free(ulockfs_info_temp,
1411                                     sizeof (ulockfs_info_t));
1412                         return (error);
1413                 }
1414         } else {
1415                 /*
1416                  * This is the common case of file system in a unlocked state.
1417                  *
1418                  * If a file system is unlocked, we would expect the ctr to have
1419                  * been incremented by now. But this will not be true when a
1420                  * quiesce is winding up - SLOCK was set when we checked before
1421                  * incrementing the ctr, but by the time we checked for
1422                  * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
1423                  * to take ul_lock and go through the slow path in this uncommon
1424                  * case.
1425                  */
1426                 if (op_cnt_incremented == 0) {
1427                         mutex_enter(&ulp->ul_lock);
1428                         error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1429                         if (error) {
1430                                 mutex_exit(&ulp->ul_lock);
1431                                 if (ulockfs_info_free == NULL)
1432                                         kmem_free(ulockfs_info_temp,
1433                                             sizeof (ulockfs_info_t));
1434                                 return (error);
1435                         }
1436                         if (mask & ULOCKFS_FWLOCK)
1437                                 ULOCKFS_SET_FALLOC(ulp);
1438                         mutex_exit(&ulp->ul_lock);
1439                 } else if (mask & ULOCKFS_FWLOCK) {
1440                         mutex_enter(&ulp->ul_lock);
1441                         ULOCKFS_SET_FALLOC(ulp);
1442                         mutex_exit(&ulp->ul_lock);
1443                 }
1444         }
1445
1446         if (ulockfs_info_free != NULL) {
1447                 ulockfs_info_free->ulp = ulp;
1448                 if (mask & ULOCKFS_FWLOCK)
1449                         ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1450         } else {
1451                 ulockfs_info_temp->ulp = ulp;
1452                 ulockfs_info_temp->next = ulockfs_info;
1453                 if (mask & ULOCKFS_FWLOCK)
1454                         ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1455                 ASSERT(ufs_lockfs_key != 0);
1456                 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1457         }
1458
1459         curthread->t_flag |= T_DONTBLOCK;
1460         return (0);
1461 }
1462
1463 /*
1464  * Check whether we are returning from the top level VOP.
1465  */
1466 static int
1467 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1468 {
1469         ulockfs_info_t *info;
1470         int result = 1;
1471
1472         for (info = head; info != NULL; info = info->next) {
1473                 if (info->ulp != NULL) {
1474                         result = 0;
1475                         break;
1476                 }
1477         }
1478
1479         return (result);
1480 }
1481
1482 /*
1483  * ufs_lockfs_end - terminate the lockfs locking protocol
1484  */
1485 void
1486 ufs_lockfs_end(struct ulockfs *ulp)
1487 {
1488         ulockfs_info_t *info;
1489         ulockfs_info_t *head;
1490
1491         /*
1492          * end-of-VOP protocol
1493          */
1494         if (ulp == NULL)
1495                 return;
1496
1497         head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1498         SEARCH_ULOCKFSP(head, ulp, info);
1499
1500         /*
1501          * If we're called from a first level VOP, we have to have a
1502          * valid ulockfs record in the TSD.
1503          */
1504         ASSERT(info != NULL);
1505
1506         /*
1507          * Invalidate the ulockfs record.
1508          */
1509         info->ulp = NULL;
1510
1511         if (ufs_lockfs_top_vop_return(head))
1512                 curthread->t_flag &= ~T_DONTBLOCK;
1513
1514         /* fallocate thread */
1515         if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1516                 /* Clear the thread's fallocate state */
1517                 info->flags &= ~ULOCK_INFO_FALLOCATE;
1518                 if (!atomic_dec_ulong_nv(&ulp->ul_falloc_cnt)) {
1519                         mutex_enter(&ulp->ul_lock);
1520                         ULOCKFS_CLR_FALLOC(ulp);
1521                         cv_broadcast(&ulp->ul_cv);
1522                         mutex_exit(&ulp->ul_lock);
1523                 }
1524         } else  { /* normal thread */
1525                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
1526                         cv_broadcast(&ulp->ul_cv);
1527         }
1528 }
1529
1530 /*
1531  * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
1532  * blocking.
1533  */
1534 int
1535 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1536 {
1537         int             error = 0;
1538         int             rec_vop;
1539         ushort_t        op_cnt_incremented = 0;
1540         ulong_t         *ctr;
1541         struct ulockfs *ulp;
1542         ulockfs_info_t  *ulockfs_info;
1543         ulockfs_info_t  *ulockfs_info_free;
1544         ulockfs_info_t  *ulockfs_info_temp;
1545
1546         /*
1547          * file system has been forcibly unmounted
1548          */
1549         if (ufsvfsp == NULL)
1550                 return (EIO);
1551
1552         *ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1553
1554         /*
1555          * Do lockfs protocol
1556          */
1557         ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1558         IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1559
1560         /*
1561          * Detect recursive VOP call or handcrafted internal lockfs protocol
1562          * path and bail out in that case.
1563          */
1564         if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1565                 *ulpp = NULL;
1566                 return (0);
1567         } else {
1568                 if (ulockfs_info_free == NULL) {
1569                         if ((ulockfs_info_temp = (ulockfs_info_t *)
1570                             kmem_zalloc(sizeof (ulockfs_info_t),
1571                             KM_NOSLEEP)) == NULL) {
1572                                 *ulpp = NULL;
1573                                 return (ENOMEM);
1574                         }
1575                 }
1576         }
1577
1578         /*
1579          * First time VOP call
1580          *
1581          * Increment the ctr irrespective of the lockfs state. If the lockfs
1582          * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1583          * before incrementing we need to check if there is a pending quiesce
1584          * request because if we have a continuous stream of ufs_lockfs_begin
1585          * requests pounding on a few cpu's then the ufs_quiesce thread might
1586          * never see the value of zero for ctr - a livelock kind of scenario.
1587          */
1588         ctr = (mask & ULOCKFS_FWLOCK) ?
1589             &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1590         if (!ULOCKFS_IS_SLOCK(ulp)) {
1591                 atomic_inc_ulong(ctr);
1592                 op_cnt_incremented++;
1593         }
1594
1595         if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1596                 /*
1597                  * Non-blocking version of ufs_check_lockfs() code.
1598                  *
1599                  * If the file system is not hard locked or error locked
1600                  * and if ulp->ul_fs_lock allows this operation, increment
1601                  * the appropriate counter and proceed (For eg., In case the
1602                  * file system is delete locked, a mmap can still go through).
1603                  */
1604                 if (op_cnt_incremented)
1605                         if (!atomic_dec_ulong_nv(ctr))
1606                                 cv_broadcast(&ulp->ul_cv);
1607                 mutex_enter(&ulp->ul_lock);
1608                 if (ULOCKFS_IS_HLOCK(ulp) ||
1609                     (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1610                         error = EIO;
1611                 else if (ulp->ul_fs_lock & mask)
1612                         error = EAGAIN;
1613
1614                 if (error) {
1615                         mutex_exit(&ulp->ul_lock);
1616                         if (ulockfs_info_free == NULL)
1617                                 kmem_free(ulockfs_info_temp,
1618                                     sizeof (ulockfs_info_t));
1619                         return (error);
1620                 }
1621                 atomic_inc_ulong(ctr);
1622                 if (mask & ULOCKFS_FWLOCK)
1623                         ULOCKFS_SET_FALLOC(ulp);
1624                 mutex_exit(&ulp->ul_lock);
1625         } else {
1626                 /*
1627                  * This is the common case of file system in a unlocked state.
1628                  *
1629                  * If a file system is unlocked, we would expect the ctr to have
1630                  * been incremented by now. But this will not be true when a
1631                  * quiesce is winding up - SLOCK was set when we checked before
1632                  * incrementing the ctr, but by the time we checked for
1633                  * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
1634                  * ul_lock and go through the non-blocking version of
1635                  * ufs_check_lockfs() code.
1636                  */
1637                 if (op_cnt_incremented == 0) {
1638                         mutex_enter(&ulp->ul_lock);
1639                         if (ULOCKFS_IS_HLOCK(ulp) ||
1640                             (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1641                                 error = EIO;
1642                         else if (ulp->ul_fs_lock & mask)
1643                                 error = EAGAIN;
1644
1645                         if (error) {
1646                                 mutex_exit(&ulp->ul_lock);
1647                                 if (ulockfs_info_free == NULL)
1648                                         kmem_free(ulockfs_info_temp,
1649                                             sizeof (ulockfs_info_t));
1650                                 return (error);
1651                         }
1652                         atomic_inc_ulong(ctr);
1653                         if (mask & ULOCKFS_FWLOCK)
1654                                 ULOCKFS_SET_FALLOC(ulp);
1655                         mutex_exit(&ulp->ul_lock);
1656                 } else if (mask & ULOCKFS_FWLOCK) {
1657                         mutex_enter(&ulp->ul_lock);
1658                         ULOCKFS_SET_FALLOC(ulp);
1659                         mutex_exit(&ulp->ul_lock);
1660                 }
1661         }
1662
1663         if (ulockfs_info_free != NULL) {
1664                 ulockfs_info_free->ulp = ulp;
1665                 if (mask & ULOCKFS_FWLOCK)
1666                         ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1667         } else {
1668                 ulockfs_info_temp->ulp = ulp;
1669                 ulockfs_info_temp->next = ulockfs_info;
1670                 if (mask & ULOCKFS_FWLOCK)
1671                         ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1672                 ASSERT(ufs_lockfs_key != 0);
1673                 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1674         }
1675
1676         curthread->t_flag |= T_DONTBLOCK;
1677         return (0);
1678 }
1679
1680 /*
1681  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1682  */
1683 int
1684 ufs_lockfs_begin_getpage(
1685         struct ufsvfs   *ufsvfsp,
1686         struct ulockfs  **ulpp,
1687         struct seg      *seg,
1688         int             read_access,
1689         uint_t          *protp)
1690 {
1691         ulong_t                 mask;
1692         int                     error;
1693         int                     rec_vop;
1694         struct ulockfs          *ulp;
1695         ulockfs_info_t          *ulockfs_info;
1696         ulockfs_info_t          *ulockfs_info_free;
1697         ulockfs_info_t          *ulockfs_info_temp;
1698
1699         /*
1700          * file system has been forcibly unmounted
1701          */
1702         if (ufsvfsp == NULL)
1703                 return (EIO);
1704
1705         *ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1706
1707         /*
1708          * Do lockfs protocol
1709          */
1710         ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1711         IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1712
1713         /*
1714          * Detect recursive VOP call or handcrafted internal lockfs protocol
1715          * path and bail out in that case.
1716          */
1717         if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1718                 *ulpp = NULL;
1719                 return (0);
1720         } else {
1721                 if (ulockfs_info_free == NULL) {
1722                         if ((ulockfs_info_temp = (ulockfs_info_t *)
1723                             kmem_zalloc(sizeof (ulockfs_info_t),
1724                             KM_NOSLEEP)) == NULL) {
1725                                 *ulpp = NULL;
1726                                 return (ENOMEM);
1727                         }
1728                 }
1729         }
1730
1731         /*
1732          * First time VOP call
1733          */
1734         atomic_inc_ulong(&ulp->ul_vnops_cnt);
1735         if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1736                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
1737                         cv_broadcast(&ulp->ul_cv);
1738                 mutex_enter(&ulp->ul_lock);
1739                 if (seg->s_ops == &segvn_ops &&
1740                     ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1741                         mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1742                 } else if (protp && read_access) {
1743                         /*
1744                          * Restrict the mapping to readonly.
1745                          * Writes to this mapping will cause
1746                          * another fault which will then
1747                          * be suspended if fs is write locked
1748                          */
1749                         *protp &= ~PROT_WRITE;
1750                         mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1751                 } else
1752                         mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1753
1754                 /*
1755                  * will sleep if this fs is locked against this VOP
1756                  */
1757                 error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1758                 mutex_exit(&ulp->ul_lock);
1759                 if (error) {
1760                         if (ulockfs_info_free == NULL)
1761                                 kmem_free(ulockfs_info_temp,
1762                                     sizeof (ulockfs_info_t));
1763                         return (error);
1764                 }
1765         }
1766
1767         if (ulockfs_info_free != NULL) {
1768                 ulockfs_info_free->ulp = ulp;
1769         } else {
1770                 ulockfs_info_temp->ulp = ulp;
1771                 ulockfs_info_temp->next = ulockfs_info;
1772                 ASSERT(ufs_lockfs_key != 0);
1773                 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1774         }
1775
1776         curthread->t_flag |= T_DONTBLOCK;
1777         return (0);
1778 }
1779
1780 void
1781 ufs_lockfs_tsd_destructor(void *head)
1782 {
1783         ulockfs_info_t *curr = (ulockfs_info_t *)head;
1784         ulockfs_info_t *temp;
1785
1786         for (; curr != NULL; ) {
1787                 /*
1788                  * The TSD destructor is being called when the thread exits
1789                  * (via thread_exit()). At that time it must have cleaned up
1790                  * all VOPs via ufs_lockfs_end() and there must not be a
1791                  * valid ulockfs record exist while a thread is exiting.
1792                  */
1793                 temp = curr;
1794                 curr = curr->next;
1795                 ASSERT(temp->ulp == NULL);
1796                 kmem_free(temp, sizeof (ulockfs_info_t));
1797         }
1798 }