kernel/fs/ufs/ufs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent, Inc.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  29 /*        All Rights Reserved   */
  30
  31 /*
  32  * Portions of this source code were derived from Berkeley 4.3 BSD
  33  * under license from the Regents of the University of California.
  34  */
  35
  36 #include <sys/types.h>
  37 #include <sys/t_lock.h>
  38 #include <sys/ksynch.h>
  39 #include <sys/param.h>
  40 #include <sys/time.h>
  41 #include <sys/systm.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/resource.h>
  44 #include <sys/signal.h>
  45 #include <sys/cred.h>
  46 #include <sys/user.h>
  47 #include <sys/buf.h>
  48 #include <sys/vfs.h>
  49 #include <sys/vnode.h>
  50 #include <sys/proc.h>
  51 #include <sys/disp.h>
  52 #include <sys/file.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/flock.h>
  55 #include <sys/atomic.h>
  56 #include <sys/kmem.h>
  57 #include <sys/uio.h>
  58 #include <sys/dnlc.h>
  59 #include <sys/conf.h>
  60 #include <sys/mman.h>
  61 #include <sys/pathname.h>
  62 #include <sys/debug.h>
  63 #include <sys/vmsystm.h>
  64 #include <sys/cmn_err.h>
  65 #include <sys/filio.h>
  66 #include <sys/policy.h>
  67
  68 #include <sys/fs/ufs_fs.h>
  69 #include <sys/fs/ufs_lockfs.h>
  70 #include <sys/fs/ufs_filio.h>
  71 #include <sys/fs/ufs_inode.h>
  72 #include <sys/fs/ufs_fsdir.h>
  73 #include <sys/fs/ufs_quota.h>
  74 #include <sys/fs/ufs_log.h>
  75 #include <sys/fs/ufs_snap.h>
  76 #include <sys/fs/ufs_trans.h>
  77 #include <sys/fs/ufs_panic.h>
  78 #include <sys/fs/ufs_bio.h>
  79 #include <sys/dirent.h>         /* must be AFTER <sys/fs/fsdir.h>! */
  80 #include <sys/errno.h>
  81 #include <sys/fssnap_if.h>
  82 #include <sys/unistd.h>
  83 #include <sys/sunddi.h>
  84
  85 #include <sys/filio.h>          /* _FIOIO */
  86
  87 #include <vm/hat.h>
  88 #include <vm/page.h>
  89 #include <vm/pvn.h>
  90 #include <vm/as.h>
  91 #include <vm/seg.h>
  92 #include <vm/seg_map.h>
  93 #include <vm/seg_vn.h>
  94 #include <vm/seg_kmem.h>
  95 #include <vm/rm.h>
  96 #include <sys/swap.h>
  97
  98 #include <sys/fs_subr.h>
  99
 100 #include <sys/fs/decomp.h>
 101
/*
 * File-local statistics counters.  ufs_close() bumps ins.in_poc each time
 * it pushes a partially filled cluster.
 */
static struct instats ins;

/*
 * Forward declarations.  Most of the routines below are installed in the
 * ufs_vnodeops table that follows; the remainder are helpers referenced
 * from those routines.
 */
static  int ufs_getpage_ra(struct vnode *, uoff_t, struct seg *, caddr_t);
static  int ufs_getpage_miss(struct vnode *, uoff_t, size_t, struct seg *,
                caddr_t, struct page **, size_t, enum seg_rw, int);
static  int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
static  int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
                caller_context_t *);
static  int ufs_read(struct vnode *, struct uio *, int, struct cred *,
                struct caller_context *);
static  int ufs_write(struct vnode *, struct uio *, int, struct cred *,
                struct caller_context *);
static  int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
                int *, caller_context_t *);
static  int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
                caller_context_t *);
static  int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
                caller_context_t *);
static  int ufs_access(struct vnode *, int, int, struct cred *,
                caller_context_t *);
static  int ufs_lookup(struct vnode *, char *, struct vnode **,
                struct pathname *, int, struct vnode *, struct cred *,
                caller_context_t *, int *, pathname_t *);
static  int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
                int, struct vnode **, struct cred *, int,
                caller_context_t *, vsecattr_t  *);
static  int ufs_remove(struct vnode *, char *, struct cred *,
                caller_context_t *, int);
static  int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
                caller_context_t *, int);
static  int ufs_rename(struct vnode *, char *, struct vnode *, char *,
                struct cred *, caller_context_t *, int);
static  int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
                struct cred *, caller_context_t *, int, vsecattr_t *);
static  int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
                caller_context_t *, int);
static  int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
                caller_context_t *, int);
static  int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
                struct cred *, caller_context_t *, int);
static  int ufs_readlink(struct vnode *, struct uio *, struct cred *,
                caller_context_t *);
static  int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static  void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
static  int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
static  int ufs_rwlock(struct vnode *, int, caller_context_t *);
static  void ufs_rwunlock(struct vnode *, int, caller_context_t *);
static  int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static  int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
                struct flk_callback *, struct cred *,
                caller_context_t *);
static  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
                cred_t *, caller_context_t *);
static  int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
                struct page **, size_t, struct seg *, caddr_t,
                enum seg_rw, struct cred *, caller_context_t *);
static  int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
                caller_context_t *);
static  int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
static  int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
                uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static  int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
                uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static  int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
                uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
static  int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
                caller_context_t *);
static  int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
    caller_context_t *);
static  int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
                caller_context_t *);
static  int ufs_pageio(struct vnode *, struct page *, uoff_t, size_t, int,
                struct cred *, caller_context_t *);
static  int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
static  daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
                daddr32_t *, int, int);
static  int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
                caller_context_t *);
static  int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
                caller_context_t *);
static  int ufs_priv_access(void *, int, struct cred *);
static  int ufs_eventlookup(struct vnode *, char *, struct cred *,
    struct vnode **);
/* Declared locally instead of being picked up from a header. */
extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
 186
/*
 * Vnode operations vector for UFS.
 *
 * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
 *
 * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
 *
 * NOTE: "not blkd" below means that the operation isn't blocked by lockfs.
 */
const struct vnodeops ufs_vnodeops = {
        .vnop_name = "ufs",
        .vop_open = ufs_open,   /* not blkd */
        .vop_close = ufs_close, /* not blkd */
        .vop_read = ufs_read,
        .vop_write = ufs_write,
        .vop_ioctl = ufs_ioctl,
        .vop_getattr = ufs_getattr,
        .vop_setattr = ufs_setattr,
        .vop_access = ufs_access,
        .vop_lookup = ufs_lookup,
        .vop_create = ufs_create,
        .vop_remove = ufs_remove,
        .vop_link = ufs_link,
        .vop_rename = ufs_rename,
        .vop_mkdir = ufs_mkdir,
        .vop_rmdir = ufs_rmdir,
        .vop_readdir = ufs_readdir,
        .vop_symlink = ufs_symlink,
        .vop_readlink = ufs_readlink,
        .vop_fsync = ufs_fsync,
        .vop_inactive = ufs_inactive, /* not blkd */
        .vop_fid = ufs_fid,
        .vop_rwlock = ufs_rwlock,       /* not blkd */
        .vop_rwunlock = ufs_rwunlock, /* not blkd */
        .vop_seek = ufs_seek,
        .vop_frlock = ufs_frlock,
        .vop_space = ufs_space,
        .vop_getpage = ufs_getpage,
        .vop_putpage = ufs_putpage,
        .vop_map = ufs_map,
        .vop_addmap = ufs_addmap,       /* not blkd */
        .vop_delmap = ufs_delmap,       /* not blkd */
        .vop_poll = ufs_poll,   /* not blkd */
        .vop_dump = ufs_dump,
        .vop_pathconf = ufs_l_pathconf,
        .vop_pageio = ufs_pageio,
        .vop_dumpctl = ufs_dumpctl,
        .vop_getsecattr = ufs_getsecattr,
        .vop_setsecattr = ufs_setsecattr,
        .vop_vnevent = fs_vnevent_support,      /* generic handler, not a ufs_ routine */
};
 236
#define MAX_BACKFILE_COUNT      9999

/*
 * Created by ufs_dumpctl() to store a file's disk block info into memory.
 * Used by ufs_dump() to dump data to disk directly.
 *
 * NOTE(review): dblk[] is declared with a single element as a place holder;
 * presumably the structure is allocated with room for fsbs entries --
 * confirm against ufs_dumpctl().
 */
struct dump {
        struct inode    *ip;            /* the file we contain */
        daddr_t         fsbs;           /* number of blocks stored */
        struct timeval32 time;          /* time stamp for the struct */
        daddr32_t       dblk[1];        /* place holder for block info */
};

/* File-scope pointer to a struct dump; initially NULL. */
static struct dump *dump_info = NULL;
 251
 252 /*
 253  * Previously there was no special action required for ordinary files.
 254  * (Devices are handled through the device file system.)
 255  * Now we support Large Files and Large File API requires open to
 256  * fail if file is large.
 257  * We could take care to prevent data corruption
 258  * by doing an atomic check of size and truncate if file is opened with
 259  * FTRUNC flag set but traditionally this is being done by the vfs/vnode
 260  * layers. So taking care of truncation here is a change in the existing
 261  * semantics of fop_open and therefore we chose not to implement any thing
 262  * here. The check for the size of the file > 2GB is being done at the
 263  * vfs layer in routine vn_open().
 264  */
 265
 266 /* ARGSUSED */
 267 static int
 268 ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
 269 {
 270         return (0);
 271 }
 272
 273 /*ARGSUSED*/
 274 static int
 275 ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
 276         struct cred *cr, caller_context_t *ct)
 277 {
 278         cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 279         cleanshares(vp, ttoproc(curthread)->p_pid);
 280
 281         /*
 282          * Push partially filled cluster at last close.
 283          * ``last close'' is approximated because the dnlc
 284          * may have a hold on the vnode.
 285          * Checking for VBAD here will also act as a forced umount check.
 286          */
 287         if (vp->v_count <= 2 && vp->v_type != VBAD) {
 288                 struct inode *ip = VTOI(vp);
 289                 if (ip->i_delaylen) {
 290                         ins.in_poc.value.ul++;
 291                         (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
 292                             B_ASYNC | B_FREE, cr);
 293                         ip->i_delaylen = 0;
 294                 }
 295         }
 296
 297         return (0);
 298 }
 299
 300 /*ARGSUSED*/
 301 static int
 302 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
 303         struct caller_context *ct)
 304 {
 305         struct inode *ip = VTOI(vp);
 306         struct ufsvfs *ufsvfsp;
 307         struct ulockfs *ulp = NULL;
 308         int error = 0;
 309         int intrans = 0;
 310
 311         ASSERT(RW_READ_HELD(&ip->i_rwlock));
 312
 313         /*
 314          * Mandatory locking needs to be done before ufs_lockfs_begin()
 315          * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
 316          */
 317         if (MANDLOCK(vp, ip->i_mode)) {
 318                 /*
 319                  * ufs_getattr ends up being called by chklock
 320                  */
 321                 error = chklock(vp, FREAD, uiop->uio_loffset,
 322                     uiop->uio_resid, uiop->uio_fmode, ct);
 323                 if (error)
 324                         goto out;
 325         }
 326
 327         ufsvfsp = ip->i_ufsvfs;
 328         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
 329         if (error)
 330                 goto out;
 331
 332         /*
 333          * In the case that a directory is opened for reading as a file
 334          * (eg "cat .") with the  O_RSYNC, O_SYNC and O_DSYNC flags set.
 335          * The locking order had to be changed to avoid a deadlock with
 336          * an update taking place on that directory at the same time.
 337          */
 338         if ((ip->i_mode & IFMT) == IFDIR) {
 339
 340                 rw_enter(&ip->i_contents, RW_READER);
 341                 error = rdip(ip, uiop, ioflag, cr);
 342                 rw_exit(&ip->i_contents);
 343
 344                 if (error) {
 345                         if (ulp)
 346                                 ufs_lockfs_end(ulp);
 347                         goto out;
 348                 }
 349
 350                 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 351                     TRANS_ISTRANS(ufsvfsp)) {
 352                         rw_exit(&ip->i_rwlock);
 353                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC,
 354                                          TOP_READ_SIZE, &error);
 355                         ASSERT(!error);
 356                         TRANS_END_SYNC(ufsvfsp, &error, TOP_READ_SYNC,
 357                                        TOP_READ_SIZE);
 358                         rw_enter(&ip->i_rwlock, RW_READER);
 359                 }
 360         } else {
 361                 /*
 362                  * Only transact reads to files opened for sync-read and
 363                  * sync-write on a file system that is not write locked.
 364                  *
 365                  * The ``not write locked'' check prevents problems with
 366                  * enabling/disabling logging on a busy file system.  E.g.,
 367                  * logging exists at the beginning of the read but does not
 368                  * at the end.
 369                  *
 370                  */
 371                 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 372                     TRANS_ISTRANS(ufsvfsp)) {
 373                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC,
 374                                          TOP_READ_SIZE, &error);
 375                         ASSERT(!error);
 376                         intrans = 1;
 377                 }
 378
 379                 rw_enter(&ip->i_contents, RW_READER);
 380                 error = rdip(ip, uiop, ioflag, cr);
 381                 rw_exit(&ip->i_contents);
 382
 383                 if (intrans) {
 384                         TRANS_END_SYNC(ufsvfsp, &error, TOP_READ_SYNC,
 385                                        TOP_READ_SIZE);
 386                 }
 387         }
 388
 389         if (ulp) {
 390                 ufs_lockfs_end(ulp);
 391         }
 392 out:
 393
 394         return (error);
 395 }
 396
/*
 * Write-path tunables and counters.  ufs_write() throttles a writer while
 * ip->i_writes exceeds ufs_HW, provided ufs_WRITES is non-zero, and counts
 * every such wait in ufs_throttles.  ufs_allow_shared_writes is consulted
 * by ufs_check_rewrite().
 */
extern  int     ufs_HW;         /* high water mark */
extern  int     ufs_LW;         /* low water mark */
int     ufs_WRITES = 1;         /* XXX - enable/disable */
int     ufs_throttles = 0;      /* throttling count */
int     ufs_allow_shared_writes = 1;    /* directio shared writes */
 402
 403 static int
 404 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
 405 {
 406         int     shared_write;
 407
 408         /*
 409          * If the FDSYNC flag is set then ignore the global
 410          * ufs_allow_shared_writes in this case.
 411          */
 412         shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;
 413
 414         /*
 415          * Filter to determine if this request is suitable as a
 416          * concurrent rewrite. This write must not allocate blocks
 417          * by extending the file or filling in holes. No use trying
 418          * through FSYNC descriptors as the inode will be synchronously
 419          * updated after the write. The uio structure has not yet been
 420          * checked for sanity, so assume nothing.
 421          */
 422         return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
 423             (uiop->uio_loffset >= 0) &&
 424             (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
 425             ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
 426             !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
 427             shared_write);
 428 }
 429
 430 /*ARGSUSED*/
 431 static int
 432 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
 433         caller_context_t *ct)
 434 {
 435         struct inode *ip = VTOI(vp);
 436         struct ufsvfs *ufsvfsp;
 437         struct ulockfs *ulp;
 438         int retry = 1;
 439         int error, resv, resid = 0;
 440         int directio_status;
 441         int exclusive;
 442         int rewriteflg;
 443         long start_resid = uiop->uio_resid;
 444
 445         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
 446
 447 retry_mandlock:
 448         /*
 449          * Mandatory locking needs to be done before ufs_lockfs_begin()
 450          * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
 451          * Check for forced unmounts normally done in ufs_lockfs_begin().
 452          */
 453         if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
 454                 error = EIO;
 455                 goto out;
 456         }
 457         if (MANDLOCK(vp, ip->i_mode)) {
 458
 459                 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
 460
 461                 /*
 462                  * ufs_getattr ends up being called by chklock
 463                  */
 464                 error = chklock(vp, FWRITE, uiop->uio_loffset,
 465                     uiop->uio_resid, uiop->uio_fmode, ct);
 466                 if (error)
 467                         goto out;
 468         }
 469
 470         /* i_rwlock can change in chklock */
 471         exclusive = rw_write_held(&ip->i_rwlock);
 472         rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);
 473
 474         /*
 475          * Check for fast-path special case of directio re-writes.
 476          */
 477         if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
 478             !exclusive && rewriteflg) {
 479
 480                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 481                 if (error)
 482                         goto out;
 483
 484                 rw_enter(&ip->i_contents, RW_READER);
 485                 error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
 486                     &directio_status);
 487                 if (directio_status == DIRECTIO_SUCCESS) {
 488                         uint_t i_flag_save;
 489
 490                         if (start_resid != uiop->uio_resid)
 491                                 error = 0;
 492                         /*
 493                          * Special treatment of access times for re-writes.
 494                          * If IMOD is not already set, then convert it
 495                          * to IMODACC for this operation. This defers
 496                          * entering a delta into the log until the inode
 497                          * is flushed. This mimics what is done for read
 498                          * operations and inode access time.
 499                          */
 500                         mutex_enter(&ip->i_tlock);
 501                         i_flag_save = ip->i_flag;
 502                         ip->i_flag |= IUPD | ICHG;
 503                         ip->i_seq++;
 504                         ITIMES_NOLOCK(ip);
 505                         if ((i_flag_save & IMOD) == 0) {
 506                                 ip->i_flag &= ~IMOD;
 507                                 ip->i_flag |= IMODACC;
 508                         }
 509                         mutex_exit(&ip->i_tlock);
 510                         rw_exit(&ip->i_contents);
 511                         if (ulp)
 512                                 ufs_lockfs_end(ulp);
 513                         goto out;
 514                 }
 515                 rw_exit(&ip->i_contents);
 516                 if (ulp)
 517                         ufs_lockfs_end(ulp);
 518         }
 519
 520         if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
 521                 rw_exit(&ip->i_rwlock);
 522                 rw_enter(&ip->i_rwlock, RW_WRITER);
 523                 /*
 524                  * Mandatory locking could have been enabled
 525                  * after dropping the i_rwlock.
 526                  */
 527                 if (MANDLOCK(vp, ip->i_mode))
 528                         goto retry_mandlock;
 529         }
 530
 531         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 532         if (error)
 533                 goto out;
 534
 535         /*
 536          * Amount of log space needed for this write
 537          */
 538         if (!rewriteflg || !(ioflag & FDSYNC))
 539                 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
 540
 541         /*
 542          * Throttle writes.
 543          */
 544         if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
 545                 mutex_enter(&ip->i_tlock);
 546                 while (ip->i_writes > ufs_HW) {
 547                         ufs_throttles++;
 548                         cv_wait(&ip->i_wrcv, &ip->i_tlock);
 549                 }
 550                 mutex_exit(&ip->i_tlock);
 551         }
 552
 553         /*
 554          * Enter Transaction
 555          *
 556          * If the write is a rewrite there is no need to open a transaction
 557          * if the FDSYNC flag is set and not the FSYNC.  In this case just
 558          * set the IMODACC flag to modify do the update at a later time
 559          * thus avoiding the overhead of the logging transaction that is
 560          * not required.
 561          */
 562         if (ioflag & (FSYNC|FDSYNC)) {
 563                 if (ulp) {
 564                         if (rewriteflg) {
 565                                 uint_t i_flag_save;
 566
 567                                 rw_enter(&ip->i_contents, RW_READER);
 568                                 mutex_enter(&ip->i_tlock);
 569                                 i_flag_save = ip->i_flag;
 570                                 ip->i_flag |= IUPD | ICHG;
 571                                 ip->i_seq++;
 572                                 ITIMES_NOLOCK(ip);
 573                                 if ((i_flag_save & IMOD) == 0) {
 574                                         ip->i_flag &= ~IMOD;
 575                                         ip->i_flag |= IMODACC;
 576                                 }
 577                                 mutex_exit(&ip->i_tlock);
 578                                 rw_exit(&ip->i_contents);
 579                         } else {
 580                                 int terr = 0;
 581                                 TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC,
 582                                                  resv, &terr);
 583                                 ASSERT(!terr);
 584                         }
 585                 }
 586         } else {
 587                 if (ulp)
 588                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
 589         }
 590
 591         /*
 592          * Write the file
 593          */
 594         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
 595         rw_enter(&ip->i_contents, RW_WRITER);
 596         if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
 597                 /*
 598                  * In append mode start at end of file.
 599                  */
 600                 uiop->uio_loffset = ip->i_size;
 601         }
 602
 603         /*
 604          * Mild optimisation, don't call ufs_trans_write() unless we have to
 605          * Also, suppress file system full messages if we will retry.
 606          */
 607         if (retry)
 608                 ip->i_flag |= IQUIET;
 609         if (resid) {
 610                 TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
 611         } else {
 612                 error = wrip(ip, uiop, ioflag, cr);
 613         }
 614         ip->i_flag &= ~IQUIET;
 615
 616         rw_exit(&ip->i_contents);
 617         rw_exit(&ufsvfsp->vfs_dqrwlock);
 618
 619         /*
 620          * Leave Transaction
 621          */
 622         if (ulp) {
 623                 if (ioflag & (FSYNC|FDSYNC)) {
 624                         if (!rewriteflg) {
 625                                 int terr = 0;
 626
 627                                 TRANS_END_SYNC(ufsvfsp, &terr,
 628                                                TOP_WRITE_SYNC, resv);
 629                                 if (error == 0)
 630                                         error = terr;
 631                         }
 632                 } else {
 633                         TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
 634                 }
 635                 ufs_lockfs_end(ulp);
 636         }
 637 out:
 638         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
 639                 /*
 640                  * Any blocks tied up in pending deletes?
 641                  */
 642                 ufs_delete_drain_wait(ufsvfsp, 1);
 643                 retry = 0;
 644                 goto retry_mandlock;
 645         }
 646
 647         if (error == ENOSPC && (start_resid != uiop->uio_resid))
 648                 error = 0;
 649
 650         return (error);
 651 }
 652
/*
 * Don't cache write blocks to files with the sticky bit set.
 * Used to keep swap files from blowing the page cache on a server.
 * Enabled (1) by default.
 */
int stickyhack = 1;

/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 *
 * NOTE(review): smallfile and smallfile64 appear to act as lower bounds for
 * the freebehind offset range described further down -- confirm against
 * the code that recomputes that range.
 */
int     freebehind = 1;
int     smallfile = 0;
uoff_t smallfile64 = 32 * 1024;

/*
 * While we should, in most cases, cache the pages for write, we
 * may also want to cache the pages for read as long as they are
 * frequently re-usable.
 *
 * If cache_read_ahead = 1, the pages for read will go to the tail
 * of the cache list when they are released, otherwise they go to the head.
 * Disabled (0) by default.
 */
int     cache_read_ahead = 0;
 677
 678 /*
 * Freebehind exists so that as we read large files sequentially we
 * don't consume most of memory with pages from a few files. It takes
 * longer to re-read multiple small files from disk than it does to read
 * one large one sequentially.  As system memory grows customers need
 * to retain bigger chunks of files in memory.  The advent of the
 * cachelist opens up the possibility of freeing pages to the head or
 * tail of the list.
 686  *
 * Not freeing a page is a bet that the page will be read again before
 * its segmap slot is needed for something else. If we lose the bet,
 * it means some other thread is burdened with the page free we did
 * not do. If we win we save a free and reclaim.
 691  *
 692  * Freeing it at the tail  vs the head of cachelist  is a bet that the
 693  * page will survive until the next  read.  It's also saying that this
 694  * page is more likely to  be re-used than a  page freed some time ago
 695  * and never reclaimed.
 696  *
 697  * Freebehind maintains a  range of  file offset [smallfile1; smallfile2]
 698  *
 699  *            0 < offset < smallfile1 : pages are not freed.
 700  *   smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 701  *   smallfile2 < offset              : pages freed to head of cachelist.
 702  *
 703  * The range  is  computed  at most  once  per second  and  depends on
 704  * freemem  and  ncpus_online.  Both parameters  are   bounded to be
 705  * >= smallfile && >= smallfile64.
 706  *
 707  * smallfile1 = (free memory / ncpu) / 1000
 708  * smallfile2 = (free memory / ncpu) / 10
 709  *
 * A few example values:
 711  *
 712  *       Free Mem (in Bytes) [smallfile1; smallfile2]  [smallfile1; smallfile2]
 713  *                                 ncpus_online = 4          ncpus_online = 64
 714  *       ------------------  -----------------------   -----------------------
 715  *             1G                   [256K;  25M]               [32K; 1.5M]
 716  *            10G                   [2.5M; 250M]              [156K; 15M]
 717  *           100G                    [25M; 2.5G]              [1.5M; 150M]
 718  *
 719  */
 720
/*
 * Default divisors for the freebehind range: per the description above,
 * smallfile1 = (free memory / ncpu) / 1000 and
 * smallfile2 = (free memory / ncpu) / 10.
 */
#define SMALLFILE1_D 1000
#define SMALLFILE2_D 10
/* Current bounds of the freebehind range; both start out at 32K. */
static uoff_t smallfile1 = 32 * 1024;
static uoff_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0; /* lbolt value of when to recompute */
/* Non-static copies of the divisors, initialised to the defaults above. */
uint_t smallfile1_d = SMALLFILE1_D;
uint_t smallfile2_d = SMALLFILE2_D;
 728
 729 /*
 730  * wrip does the real work of write requests for ufs.
      *
      * The caller must hold ip->i_contents as writer (asserted below).  For
      * each chunk the lock is dropped around the copy-in and reacquired
      * afterwards; vfs_dqrwlock is dropped and retaken as reader alongside
      * it unless the inode is a shadow inode or the current thread is the
      * rw_owner() of vfs_dqrwlock.
      *
      *   ip     - inode being written
      *   uio    - describes the source data, the starting file offset and
      *            the file size limit (uio_llimit)
      *   ioflag - FSYNC/FDSYNC select synchronous write back
      *   cr     - credentials handed to block allocation, truncation, the
      *            direct I/O path and the set-id privilege check
      *
      * Returns 0 or an errno value: EIO when ip->i_ufsvfs is NULL or the file
      * type is not one of those accepted below, EFBIG when the starting
      * offset is at or beyond the size limit, EINVAL for a negative offset,
      * otherwise whatever allocation, the copy or the page push reported.
      * Once part of the request has been written, every error except ENOSPC
      * is turned into success; the caller sees the short count in uio_resid.
 731  */
 732 int
 733 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
 734 {
 735         rlim64_t limit = uio->uio_llimit;
 736         uoff_t off;
 737         uoff_t old_i_size;
 738         struct fs *fs;
 739         struct vnode *vp;
 740         struct ufsvfs *ufsvfsp;
 741         caddr_t base;
 742         long start_resid = uio->uio_resid;      /* save starting resid */
 743         long premove_resid;                     /* resid before uiomove() */
 744         uint_t flags;                   /* SM_* flags for the page release */
 745         int newpage;                    /* copy path created new pages */
 746         int iupdat_flag, directio_status;
 747         int n, on, mapon;       /* chunk length, blkoff(), MAXBOFFSET offset */
 748         int error, pagecreate;
 749         int do_dqrwlock;                /* drop/reacquire vfs_dqrwlock */
 750         int32_t iblocks;                /* i_blocks before bmap_write() */
 751         int     new_iblocks;            /* 0 if i_blocks stayed unchanged */
 752
 753         /*
 754          * ip->i_size is incremented before the uiomove
 755          * is done on a write.  If the move fails (bad user
 756          * address) reset ip->i_size.
 757          * The better way would be to increment ip->i_size
 758          * only if the uiomove succeeds.
              *
              * Note that i_size_changed is set when a chunk extends the
              * file and is not cleared again for later chunks.
 759          */
 760         int i_size_changed = 0;
 761         o_mode_t type;
 762         int i_seq_needed = 0;
 763
 764         vp = ITOV(ip);
 765
 766         /*
 767          * check for forced unmount - should not happen as
 768          * the request passed the lockfs checks.
 769          */
 770         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
 771                 return (EIO);
 772
 773         fs = ip->i_fs;
 774
 775         ASSERT(RW_WRITE_HELD(&ip->i_contents));
 776
 777         /* check for valid filetype */
 778         type = ip->i_mode & IFMT;
 779         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
 780             (type != IFLNK) && (type != IFSHAD)) {
 781                 return (EIO);
 782         }
 783
 784         /*
 785          * An unlimited or oversized limit is first clamped to
 786          * MAXOFFSET_T; the UFS-specific maximum is applied further below.
 787          */
 788         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 789                 limit = MAXOFFSET_T;
 790
             /*
              * Starting at or past the limit: run the RLIMIT_FSIZE resource
              * control action for the process and fail with EFBIG.
              *
              * NOTE(review): this test runs before the negative-offset check
              * below; if rlim64_t is unsigned, a negative uio_loffset would
              * presumably be caught here as EFBIG rather than EINVAL --
              * confirm against the callers.
              */
 791         if (uio->uio_loffset >= limit) {
 792                 proc_t *p = ttoproc(curthread);
 793
 794                 mutex_enter(&p->p_lock);
 795                 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
 796                     p, RCA_UNSAFE_SIGINFO);
 797                 mutex_exit(&p->p_lock);
 798                 return (EFBIG);
 799         }
 800
 801         /*
 802          * if largefiles are disallowed, the limit is
 803          * the pre-largefiles value of 2GB
 804          */
 805         if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
 806                 limit = MIN(UFS_MAXOFFSET_T, limit);
 807         else
 808                 limit = MIN(MAXOFF32_T, limit);
 809
 810         if (uio->uio_loffset < 0) {
 811                 return (EINVAL);
 812         }
 813         if (uio->uio_resid == 0) {
 814                 return (0);
 815         }
 816
             /*
              * Recheck against the possibly lowered limit; unlike the check
              * above this one does not invoke the resource control action.
              */
 817         if (uio->uio_loffset >= limit)
 818                 return (EFBIG);
 819
 820         ip->i_flag |= INOACC;   /* don't update ref time in getpage */
 821
             /*
              * iupdat_flag is only given a value for synchronous writes; the
              * single read of it at "out" is guarded by (ioflag & FSYNC).
              */
 822         if (ioflag & (FSYNC|FDSYNC)) {
 823                 ip->i_flag |= ISYNC;
 824                 iupdat_flag = 1;
 825         }
 826         /*
 827          * Try to go direct
 828          */
 829         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
 830                 uio->uio_llimit = limit;
 831                 error = ufs_directio_write(ip, uio, ioflag, 0, cr,
 832                     &directio_status);
 833                 /*
 834                  * If ufs_directio wrote to the file or set the flags,
 835                  * we need to update i_seq, but it may be deferred.
 836                  */
 837                 if (start_resid != uio->uio_resid ||
 838                     (ip->i_flag & (ICHG|IUPD))) {
 839                         i_seq_needed = 1;
 840                         ip->i_flag |= ISEQ;
 841                 }
                     /*
                      * Any other status falls through to the buffered path
                      * below, which overwrites "error".
                      */
 842                 if (directio_status == DIRECTIO_SUCCESS)
 843                         goto out;
 844         }
 845
 846         /*
 847          * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
 848          *
 849          * o shadow inodes: vfs_dqrwlock is not held at all
 850          * o quota updates: vfs_dqrwlock is read or write held
 851          * o other updates: vfs_dqrwlock is read held
 852          *
 853          * The first case is the only one where we do not hold
 854          * vfs_dqrwlock at all while entering wrip().
 855          * We must make sure not to downgrade/drop vfs_dqrwlock if we
 856          * have it as writer, i.e. if we are updating the quota inode.
 857          * There is no potential deadlock scenario in this case as
 858          * ufs_getpage() takes care of this and avoids reacquiring
 859          * vfs_dqrwlock in that case.
 860          *
 861          * This check is done here since the above conditions do not change
 862          * and we possibly loop below, so save a few cycles.
 863          */
 864         if ((type == IFSHAD) ||
 865             (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
 866                 do_dqrwlock = 0;
 867         } else {
 868                 do_dqrwlock = 1;
 869         }
 870
 871         /*
 872          * Large Files: We cast MAXBMASK to offset_t
 873          * in order to mask out the higher bits. Since offset_t
 874          * is a signed value, the high order bit set in MAXBMASK
 875          * value makes it do the right thing by having all bits 1
 876          * in the higher word. May be removed for _SOLARIS64_.
 877          */
 878
 879         fs = ip->i_fs;
             /*
              * One pass per chunk: a chunk never crosses a file system block
              * boundary (n is at most fs_bsize - on).
              */
 880         do {
 881                 uoff_t uoff = uio->uio_loffset;
 882                 off = uoff & (offset_t)MAXBMASK;
 883                 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
 884                 on = (int)blkoff(fs, uoff);
 885                 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
 886                 new_iblocks = 1;
 887
 888                 if (type == IFREG && uoff + n >= limit) {
 889                         if (uoff >= limit) {
 890                                 error = EFBIG;
 891                                 goto out;
 892                         }
 893                         /*
 894                          * since uoff + n >= limit,
 895                          * therefore n >= limit - uoff, and n is an int
 896                          * so it is safe to cast it to an int
 897                          */
 898                         n = (int)(limit - (rlim64_t)uoff);
 899                 }
 900                 if (uoff + n > ip->i_size) {
 901                         /*
 902                          * We are extending the length of the file.
 903                          * bmap is used so that we are sure that
 904                          * if we need to allocate new blocks, that it
 905                          * is done here before we up the file size.
 906                          */
 907                         error = bmap_write(ip, uoff, (int)(on + n),
 908                             mapon == 0, NULL, cr);
 909                         /*
 910                          * bmap_write never drops i_contents so if
 911                          * the flags are set it changed the file.
 912                          */
 913                         if (ip->i_flag & (ICHG|IUPD)) {
 914                                 i_seq_needed = 1;
 915                                 ip->i_flag |= ISEQ;
 916                         }
 917                         if (error)
 918                                 break;
 919                         /*
 920                          * There is a window of vulnerability here.
 921                          * The sequence of operations: allocate file
 922                          * system blocks, uiomove the data into pages,
 923                          * and then update the size of the file in the
 924                          * inode, must happen atomically.  However, due
 925                          * to current locking constraints, this can not
 926                          * be done.
 927                          */
 928                         ASSERT(ip->i_writer == NULL);
 929                         ip->i_writer = curthread;
 930                         i_size_changed = 1;
 931                         /*
 932                          * If we are writing from the beginning of
 933                          * the mapping, we can just create the
 934                          * pages without having to read them.
 935                          */
 936                         pagecreate = (mapon == 0);
 937                 } else if (n == MAXBSIZE) {
 938                         /*
 939                          * Going to do a whole mappings worth,
 940                          * so we can just create the pages w/o
 941                          * having to read them in.  But before
 942                          * we do that, we need to make sure any
 943                          * needed blocks are allocated first.
 944                          */
 945                         iblocks = ip->i_blocks;
 946                         error = bmap_write(ip, uoff, (int)(on + n),
 947                             BI_ALLOC_ONLY, NULL, cr);
 948                         /*
 949                          * bmap_write never drops i_contents so if
 950                          * the flags are set it changed the file.
 951                          */
 952                         if (ip->i_flag & (ICHG|IUPD)) {
 953                                 i_seq_needed = 1;
 954                                 ip->i_flag |= ISEQ;
 955                         }
 956                         if (error)
 957                                 break;
 958                         pagecreate = 1;
 959                         /*
 960                          * check if the new created page needed the
 961                          * allocation of new disk blocks.
 962                          */
 963                         if (iblocks == ip->i_blocks)
 964                                 new_iblocks = 0; /* no new blocks allocated */
 965                 } else {
 966                         pagecreate = 0;
 967                         /*
 968                          * In sync mode flush the indirect blocks which
 969                          * may have been allocated and not written on
 970                          * disk. In above cases bmap_write will allocate
 971                          * in sync mode.
 972                          */
 973                         if (ioflag & (FSYNC|FDSYNC)) {
 974                                 error = ufs_indirblk_sync(ip, uoff);
 975                                 if (error)
 976                                         break;
 977                         }
 978                 }
 979
 980                 /*
 981                  * At this point we can enter ufs_getpage() in one
 982                  * of two ways:
 983                  * 1) segmap_getmapflt() calls ufs_getpage() when the
 984                  *    forcefault parameter is true (pagecreate == 0)
 985                  * 2) uiomove() causes a page fault.
 986                  *
 987                  * We have to drop the contents lock to prevent the VM
 988                  * system from trying to reacquire it in ufs_getpage()
 989                  * should the uiomove cause a pagefault.
 990                  *
 991                  * We have to drop the reader vfs_dqrwlock here as well.
 992                  */
 993                 rw_exit(&ip->i_contents);
 994                 if (do_dqrwlock) {
 995                         ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
 996                         ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
 997                         rw_exit(&ufsvfsp->vfs_dqrwlock);
 998                 }
 999
1000                 newpage = 0;
1001                 premove_resid = uio->uio_resid;
1002
1003                 /*
1004                  * Touch the page and fault it in if it is not in core
1005                  * before segmap_getmapflt or vpm_data_copy can lock it.
1006                  * This is to avoid the deadlock if the buffer is mapped
1007                  * to the same file through mmap which we want to write.
1008                  */
1009                 uio_prefaultpages((long)n, uio);
1010
1011                 if (vpm_enable) {
1012                         /*
1013                          * Copy data. If new pages are created, part of
1014                          * the page that is not written will be initialized
1015                          * with zeros.
1016                          */
1017                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1018                             uio, !pagecreate, &newpage, 0, S_WRITE);
1019                 } else {
1020
1021                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1022                             (uint_t)n, !pagecreate, S_WRITE);
1023
1024                         /*
1025                          * segmap_pagecreate() returns 1 if it calls
1026                          * page_create_va() to allocate any pages.
1027                          */
1028
1029                         if (pagecreate)
1030                                 newpage = segmap_pagecreate(segkmap, base,
1031                                     (size_t)n, 0);
1032
1033                         error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
1034                 }
1035
1036                 /*
1037                  * If "newpage" is set, then a new page was created and it
1038                  * does not contain valid data, so it needs to be initialized
1039                  * at this point.
1040                  * Otherwise the page contains old data, which was overwritten
1041                  * partially or as a whole in uiomove.
1042                  * If there is only one iovec structure within uio, then
1043                  * on error uiomove will not be able to update uio->uio_loffset
1044                  * and we would zero the whole page here!
1045                  *
1046                  * If uiomove fails because of an error, the old valid data
1047                  * is kept instead of filling the rest of the page with zero's.
1048                  */
1049                 if (!vpm_enable && newpage &&
1050                     uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
1051                         /*
1052                          * We created pages w/o initializing them completely,
1053                          * thus we need to zero the part that wasn't set up.
1054                          * This happens on most EOF write cases and if
1055                          * we had some sort of error during the uiomove.
1056                          */
1057                         int nzero, nmoved;
1058
1059                         nmoved = (int)(uio->uio_loffset - (off + mapon));
1060                         ASSERT(nmoved >= 0 && nmoved <= n);
1061                         nzero = roundup(on + n, PAGESIZE) - nmoved;
1062                         ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
1063                         (void) kzero(base + mapon + nmoved, (uint_t)nzero);
1064                 }
1065
1066                 /*
1067                  * Unlock the pages allocated by page_create_va()
1068                  * in segmap_pagecreate()
1069                  */
1070                 if (!vpm_enable && newpage)
1071                         segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
1072
1073                 /*
1074                  * If the size of the file changed, then update the
1075                  * size field in the inode now.  This can't be done
1076                  * before the call to segmap_pageunlock or there is
1077                  * a potential deadlock with callers to ufs_putpage().
1078                  * They will be holding i_contents and trying to lock
1079                  * a page, while this thread is holding a page locked
1080                  * and trying to acquire i_contents.
                      *
                      * The size is updated here even when the copy above
                      * failed; old_i_size is remembered so that the error
                      * handling further down can truncate back to it.
1081                  */
1082                 if (i_size_changed) {
1083                         rw_enter(&ip->i_contents, RW_WRITER);
1084                         old_i_size = ip->i_size;
1085                         UFS_SET_ISIZE(uoff + n, ip);
1086                         TRANS_INODE(ufsvfsp, ip);
1087                         /*
1088                          * file has grown larger than 2GB. Set flag
1089                          * in superblock to indicate this, if it
1090                          * is not already set.
1091                          */
1092                         if ((ip->i_size > MAXOFF32_T) &&
1093                             !(fs->fs_flags & FSLARGEFILES)) {
1094                                 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1095                                 mutex_enter(&ufsvfsp->vfs_lock);
1096                                 fs->fs_flags |= FSLARGEFILES;
1097                                 ufs_sbwrite(ufsvfsp);
1098                                 mutex_exit(&ufsvfsp->vfs_lock);
1099                         }
                             /*
                              * Clear i_writer under i_tlock and wake any
                              * threads waiting on i_wrcv.
                              */
1100                         mutex_enter(&ip->i_tlock);
1101                         ip->i_writer = NULL;
1102                         cv_broadcast(&ip->i_wrcv);
1103                         mutex_exit(&ip->i_tlock);
1104                         rw_exit(&ip->i_contents);
1105                 }
1106
1107                 if (error) {
1108                         /*
1109                          * If we failed on a write, we may have already
1110                          * allocated file blocks as well as pages.  It's
1111                          * hard to undo the block allocation, but we must
1112                          * be sure to invalidate any pages that may have
1113                          * been allocated.
1114                          *
1115                          * If the page was created without initialization
1116                          * then we must check if it should be possible
1117                          * to destroy the new page and to keep the old data
1118                          * on the disk.
1119                          *
1120                          * It is possible to destroy the page without
1121                          * having to write back its contents only when
1122                          * - the size of the file keeps unchanged
1123                          * - bmap_write() did not allocate new disk blocks
1124                          *   it is possible to create big files using "seek" and
1125                          *   write to the end of the file. A "write" to a
1126                          *   position before the end of the file would not
1127                          *   change the size of the file but it would allocate
1128                          *   new disk blocks.
1129                          * - uiomove intended to overwrite the whole page.
1130                          * - a new page was created (newpage == 1).
1131                          */
1132
1133                         if (i_size_changed == 0 && new_iblocks == 0 &&
1134                             newpage) {
1135
1136                                 /* unwind what uiomove eventually last did */
1137                                 uio->uio_resid = premove_resid;
1138
1139                                 /*
1140                                  * destroy the page, do not write ambiguous
1141                                  * data to the disk.
1142                                  */
1143                                 flags = SM_DESTROY;
1144                         } else {
1145                                 /*
1146                                  * write the page back to the disk, if dirty,
1147                                  * and remove the page from the cache.
1148                                  */
1149                                 flags = SM_INVAL;
1150                         }
1151
                             /*
                              * The copy error is what gets reported, so the
                              * result of the release is deliberately ignored.
                              */
1152                         if (vpm_enable) {
1153                                 /*
1154                                  *  Flush pages.
1155                                  */
1156                                 (void) vpm_sync_pages(vp, off, n, flags);
1157                         } else {
1158                                 (void) segmap_release(segkmap, base, flags);
1159                         }
1160                 } else {
1161                         flags = 0;
1162                         /*
1163                          * Force write back for synchronous write cases.
1164                          */
1165                         if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
1166                                 /*
1167                                  * If the sticky bit is set but the
1168                                  * execute bit is not set, we do a
1169                                  * synchronous write back and free
1170                                  * the page when done.  We set up swap
1171                                  * files to be handled this way to
1172                                  * prevent servers from keeping around
1173                                  * the client's swap pages too long.
1174                                  * XXX - there ought to be a better way.
1175                                  */
1176                                 if (IS_SWAPVP(vp)) {
1177                                         flags = SM_WRITE | SM_FREE |
1178                                             SM_DONTNEED;
1179                                         iupdat_flag = 0;
1180                                 } else {
1181                                         flags = SM_WRITE;
1182                                 }
1183                         } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
1184                                 /*
1185                                  * Have written a whole block.
1186                                  * Start an asynchronous write and
1187                                  * mark the buffer to indicate that
1188                                  * it won't be needed again soon.
1189                                  */
1190                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
1191                         }
1192                         if (vpm_enable) {
1193                                 /*
1194                                  * Flush pages.
1195                                  */
1196                                 error = vpm_sync_pages(vp, off, n, flags);
1197                         } else {
1198                                 error = segmap_release(segkmap, base, flags);
1199                         }
1200                         /*
1201                          * If the operation failed and is synchronous,
1202                          * then we need to unwind what uiomove() last
1203                          * did so we can potentially return an error to
1204                          * the caller.  If this write operation was
1205                          * done in two pieces and the first succeeded,
1206                          * then we won't return an error for the second
1207                          * piece that failed.  However, we only want to
1208                          * return a resid value that reflects what was
1209                          * really done.
1210                          *
1211                          * Failures for non-synchronous operations can
1212                          * be ignored since the page subsystem will
1213                          * retry the operation until it succeeds or the
1214                          * file system is unmounted.
1215                          */
1216                         if (error) {
1217                                 if ((ioflag & (FSYNC | FDSYNC)) ||
1218                                     type == IFDIR) {
1219                                         uio->uio_resid = premove_resid;
1220                                 } else {
1221                                         error = 0;
1222                                 }
1223                         }
1224                 }
1225
1226                 /*
1227                  * Re-acquire contents lock.
1228                  * If it was dropped, reacquire reader vfs_dqrwlock as well.
                      * vfs_dqrwlock is taken before i_contents.
1229                  */
1230                 if (do_dqrwlock)
1231                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1232                 rw_enter(&ip->i_contents, RW_WRITER);
1233
1234                 /*
1235                  * If the uiomove() failed or if a synchronous
1236                  * page push failed, fix up i_size.
1237                  */
1238                 if (error) {
1239                         if (i_size_changed) {
1240                                 /*
1241                                  * The uiomove failed, and we
1242                                  * allocated blocks, so get rid
1243                                  * of them.
1244                                  */
1245                                 (void) ufs_itrunc(ip, old_i_size, 0, cr);
1246                         }
1247                 } else {
1248                         /*
1249                          * XXX - Can this be out of the loop?
1250                          */
1251                         ip->i_flag |= IUPD | ICHG;
1252                         /*
1253                          * Only do one increase of i_seq for multiple
1254                          * pieces.  Because we drop locks, record
1255                          * the fact that we changed the timestamp and
1256                          * are deferring the increase in case another thread
1257                          * pushes our timestamp update.
1258                          */
1259                         i_seq_needed = 1;
1260                         ip->i_flag |= ISEQ;
1261                         if (i_size_changed)
1262                                 ip->i_flag |= IATTCHG;
1263                         if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
1264                             (IEXEC >> 6))) != 0 &&
1265                             (ip->i_mode & (ISUID | ISGID)) != 0 &&
1266                             secpolicy_vnode_setid_retain(cr,
1267                             (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
1268                                 /*
1269                                  * Clear Set-UID & Set-GID bits on
1270                                  * successful write if not privileged
1271                                  * and at least one of the execute bits
1272                                  * is set.  If we always clear Set-GID,
1273                                  * mandatory file and record locking is
1274                                  * unusable.
1275                                  */
1276                                 ip->i_mode &= ~(ISUID | ISGID);
1277                         }
1278                 }
1279                 /*
1280                  * In the case the FDSYNC flag is set and this is a
1281                  * "rewrite" we won't log a delta.
1282                  * The FSYNC flag overrides all cases.
1283                  */
1284                 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) {
1285                         TRANS_INODE(ufsvfsp, ip);
1286                 }
1287         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1288
1289 out:
1290         /*
1291          * Make sure i_seq is increased at least once per write
1292          */
1293         if (i_seq_needed) {
1294                 ip->i_seq++;
1295                 ip->i_flag &= ~ISEQ;    /* no longer deferred */
1296         }
1297
1298         /*
1299          * Inode is updated according to this table -
1300          *
1301          *   FSYNC        FDSYNC(posix.4)
1302          *   --------------------------
1303          *   always@      IATTCHG|IBDWRITE
1304          *
1305          * @ -  If we are doing synchronous write the only time we should
1306          *      not be sync'ing the ip here is if we have the stickyhack
1307          *      activated, the file is marked with the sticky bit and
1308          *      no exec bit, the file length has not been changed and
1309          *      no new blocks have been allocated during this write.
1310          */
1311
1312         if ((ip->i_flag & ISYNC) != 0) {
1313                 /*
1314                  * we have eliminated nosync
1315                  */
1316                 if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
1317                     ((ioflag & FSYNC) && iupdat_flag)) {
1318                         ufs_iupdat(ip, 1);
1319                 }
1320         }
1321
1322         /*
1323          * If we've already done a partial-write, terminate
1324          * the write but return no error unless the error is ENOSPC
1325          * because the caller can detect this and free resources and
1326          * try again.
1327          */
1328         if ((start_resid != uio->uio_resid) && (error != ENOSPC))
1329                 error = 0;
1330
             /* INOACC and ISYNC were set above for this write only. */
1331         ip->i_flag &= ~(INOACC | ISYNC);
1332         ITIMES_NOLOCK(ip);
1333         return (error);
1334 }
1335
1336 /*
1337  * rdip does the real work of read requests for ufs.
1338  */
1339 int
1340 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
1341 {
1342         uoff_t off;
1343         caddr_t base;
1344         struct fs *fs;
1345         struct ufsvfs *ufsvfsp;
1346         struct vnode *vp;
1347         long oresid = uio->uio_resid;
1348         uoff_t n, on, mapon;
1349         int error = 0;
1350         int doupdate = 1;
1351         uint_t flags;
1352         int dofree, directio_status;
1353         krw_t rwtype;
1354         o_mode_t type;
1355         clock_t now;
1356
1357         vp = ITOV(ip);
1358
1359         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1360
1361         ufsvfsp = ip->i_ufsvfs;
1362
1363         if (ufsvfsp == NULL)
1364                 return (EIO);
1365
1366         fs = ufsvfsp->vfs_fs;
1367
1368         /* check for valid filetype */
1369         type = ip->i_mode & IFMT;
1370         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
1371             (type != IFLNK) && (type != IFSHAD)) {
1372                 return (EIO);
1373         }
1374
1375         if (uio->uio_loffset > UFS_MAXOFFSET_T) {
1376                 error = 0;
1377                 goto out;
1378         }
1379         if (uio->uio_loffset < 0) {
1380                 return (EINVAL);
1381         }
1382         if (uio->uio_resid == 0) {
1383                 return (0);
1384         }
1385
1386         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
1387             (!ufsvfsp->vfs_noatime)) {
1388                 mutex_enter(&ip->i_tlock);
1389                 ip->i_flag |= IACC;
1390                 mutex_exit(&ip->i_tlock);
1391         }
1392         /*
1393          * Try to go direct
1394          */
1395         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
1396                 error = ufs_directio_read(ip, uio, cr, &directio_status);
1397                 if (directio_status == DIRECTIO_SUCCESS)
1398                         goto out;
1399         }
1400
1401         rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
1402
1403         do {
1404                 offset_t diff;
1405                 uoff_t uoff = uio->uio_loffset;
1406                 off = uoff & (offset_t)MAXBMASK;
1407                 mapon = (uoff_t)(uoff & (offset_t)MAXBOFFSET);
1408                 on = (uoff_t)blkoff(fs, uoff);
1409                 n = MIN((uoff_t)fs->fs_bsize - on,
1410                     (uoff_t)uio->uio_resid);
1411
1412                 diff = ip->i_size - uoff;
1413
1414                 if (diff <= 0) {
1415                         error = 0;
1416                         goto out;
1417                 }
1418                 if (diff < (offset_t)n)
1419                         n = (int)diff;
1420
1421                 /*
1422                  * We update smallfile2 and smallfile1 at most every second.
1423                  */
1424                 now = ddi_get_lbolt();
1425                 if (now >= smallfile_update) {
1426                         uint64_t percpufreeb;
1427                         if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
1428                         if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
1429                         percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
1430                         smallfile1 = percpufreeb / smallfile1_d;
1431                         smallfile2 = percpufreeb / smallfile2_d;
1432                         smallfile1 = MAX(smallfile1, smallfile);
1433                         smallfile1 = MAX(smallfile1, smallfile64);
1434                         smallfile2 = MAX(smallfile1, smallfile2);
1435                         smallfile_update = now + hz;
1436                 }
1437
1438                 dofree = freebehind &&
1439                     ip->i_nextr == (off & PAGEMASK) && off > smallfile1;
1440
1441                 /*
1442                  * At this point we can enter ufs_getpage() in one of two
1443                  * ways:
1444                  * 1) segmap_getmapflt() calls ufs_getpage() when the
1445                  *    forcefault parameter is true (value of 1 is passed)
1446                  * 2) uiomove() causes a page fault.
1447                  *
1448                  * We cannot hold onto an i_contents reader lock without
1449                  * risking deadlock in ufs_getpage() so drop a reader lock.
1450                  * The ufs_getpage() dolock logic already allows for a
1451                  * thread holding i_contents as writer to work properly
1452                  * so we keep a writer lock.
1453                  */
1454                 if (rwtype == RW_READER)
1455                         rw_exit(&ip->i_contents);
1456
1457                 if (vpm_enable) {
1458                         /*
1459                          * Copy data.
1460                          */
1461                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1462                             uio, 1, NULL, 0, S_READ);
1463                 } else {
1464                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1465                             (uint_t)n, 1, S_READ);
1466                         error = uiomove(base + mapon, (long)n, UIO_READ, uio);
1467                 }
1468
1469                 flags = 0;
1470                 if (!error) {
1471                         /*
1472                          * If  reading sequential  we won't need  this
1473                          * buffer again  soon.  For  offsets in  range
1474                          * [smallfile1,  smallfile2] release the pages
1475                          * at   the  tail  of the   cache list, larger
1476                          * offsets are released at the head.
1477                          */
1478                         if (dofree) {
1479                                 flags = SM_FREE | SM_ASYNC;
1480                                 if ((cache_read_ahead == 0) &&
1481                                     (off > smallfile2))
1482                                         flags |=  SM_DONTNEED;
1483                         }
1484                         /*
1485                          * In POSIX SYNC (FSYNC and FDSYNC) read mode,
1486                          * we want to make sure that the page which has
1487                          * been read, is written on disk if it is dirty.
1488                          * And corresponding indirect blocks should also
1489                          * be flushed out.
1490                          */
1491                         if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
1492                                 flags &= ~SM_ASYNC;
1493                                 flags |= SM_WRITE;
1494                         }
1495                         if (vpm_enable) {
1496                                 error = vpm_sync_pages(vp, off, n, flags);
1497                         } else {
1498                                 error = segmap_release(segkmap, base, flags);
1499                         }
1500                 } else {
1501                         if (vpm_enable) {
1502                                 (void) vpm_sync_pages(vp, off, n, flags);
1503                         } else {
1504                                 (void) segmap_release(segkmap, base, flags);
1505                         }
1506                 }
1507
1508                 if (rwtype == RW_READER)
1509                         rw_enter(&ip->i_contents, rwtype);
1510         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1511 out:
1512         /*
1513          * Inode is updated according to this table if FRSYNC is set.
1514          *
1515          *   FSYNC        FDSYNC(posix.4)
1516          *   --------------------------
1517          *   always       IATTCHG|IBDWRITE
1518          */
1519         /*
1520          * The inode is not updated if we're logging and the inode is a
1521          * directory with FRSYNC, FSYNC and FDSYNC flags set.
1522          */
1523         if (ioflag & FRSYNC) {
1524                 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
1525                         doupdate = 0;
1526                 }
1527                 if (doupdate) {
1528                         if ((ioflag & FSYNC) ||
1529                             ((ioflag & FDSYNC) &&
1530                             (ip->i_flag & (IATTCHG|IBDWRITE)))) {
1531                                 ufs_iupdat(ip, 1);
1532                         }
1533                 }
1534         }
1535         /*
1536          * If we've already done a partial read, terminate
1537          * the read but return no error.
1538          */
1539         if (oresid != uio->uio_resid)
1540                 error = 0;
1541         ITIMES(ip);
1542
1543         return (error);
1544 }
1545
1546 /* ARGSUSED */
1547 static int
1548 ufs_ioctl(
1549         struct vnode    *vp,
1550         int             cmd,
1551         intptr_t        arg,
1552         int             flag,
1553         struct cred     *cr,
1554         int             *rvalp,
1555         caller_context_t *ct)
1556 {
1557         struct lockfs   lockfs, lockfs_out;
1558         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
1559         char            *comment, *original_comment;
1560         struct fs       *fs;
1561         struct ulockfs  *ulp;
1562         offset_t        off;
1563         extern int      maxphys;
1564         int             error;
1565         int             issync;
1566         int             trans_size;
1567
1568
1569         /*
1570          * forcibly unmounted
1571          */
1572         if (ufsvfsp == NULL || vp->v_vfsp == NULL ||
1573             vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
1574                 return (EIO);
1575         fs = ufsvfsp->vfs_fs;
1576
1577         if (cmd == Q_QUOTACTL) {
1578                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK);
1579                 if (error)
1580                         return (error);
1581
1582                 if (ulp) {
1583                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA,
1584                             TOP_SETQUOTA_SIZE(fs));
1585                 }
1586
1587                 error = quotactl(vp, arg, flag, cr);
1588
1589                 if (ulp) {
1590                         TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA,
1591                             TOP_SETQUOTA_SIZE(fs));
1592                         ufs_lockfs_end(ulp);
1593                 }
1594                 return (error);
1595         }
1596
1597         switch (cmd) {
1598                 case _FIOLFS:
1599                         /*
1600                          * file system locking
1601                          */
1602                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1603                                 return (EPERM);
1604
1605                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1606                                 if (copyin((caddr_t)arg, &lockfs,
1607                                     sizeof (struct lockfs)))
1608                                         return (EFAULT);
1609                         }
1610 #ifdef _SYSCALL32_IMPL
1611                         else {
1612                                 struct lockfs32 lockfs32;
1613                                 /* Translate ILP32 lockfs to LP64 lockfs */
1614                                 if (copyin((caddr_t)arg, &lockfs32,
1615                                     sizeof (struct lockfs32)))
1616                                         return (EFAULT);
1617                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1618                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1619                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1620                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1621                                 lockfs.lf_comment =
1622                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1623                         }
1624 #endif /* _SYSCALL32_IMPL */
1625
1626                         if (lockfs.lf_comlen) {
1627                                 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN)
1628                                         return (ENAMETOOLONG);
1629                                 comment =
1630                                     kmem_alloc(lockfs.lf_comlen, KM_SLEEP);
1631                                 if (copyin(lockfs.lf_comment, comment,
1632                                     lockfs.lf_comlen)) {
1633                                         kmem_free(comment, lockfs.lf_comlen);
1634                                         return (EFAULT);
1635                                 }
1636                                 original_comment = lockfs.lf_comment;
1637                                 lockfs.lf_comment = comment;
1638                         }
1639                         if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) {
1640                                 lockfs.lf_comment = original_comment;
1641
1642                                 if ((flag & DATAMODEL_MASK) ==
1643                                     DATAMODEL_NATIVE) {
1644                                         (void) copyout(&lockfs, (caddr_t)arg,
1645                                             sizeof (struct lockfs));
1646                                 }
1647 #ifdef _SYSCALL32_IMPL
1648                                 else {
1649                                         struct lockfs32 lockfs32;
1650                                         /* Translate LP64 to ILP32 lockfs */
1651                                         lockfs32.lf_lock =
1652                                             (uint32_t)lockfs.lf_lock;
1653                                         lockfs32.lf_flags =
1654                                             (uint32_t)lockfs.lf_flags;
1655                                         lockfs32.lf_key =
1656                                             (uint32_t)lockfs.lf_key;
1657                                         lockfs32.lf_comlen =
1658                                             (uint32_t)lockfs.lf_comlen;
1659                                         lockfs32.lf_comment =
1660                                             (uint32_t)(uintptr_t)
1661                                             lockfs.lf_comment;
1662                                         (void) copyout(&lockfs32, (caddr_t)arg,
1663                                             sizeof (struct lockfs32));
1664                                 }
1665 #endif /* _SYSCALL32_IMPL */
1666
1667                         } else {
1668                                 if (lockfs.lf_comlen)
1669                                         kmem_free(comment, lockfs.lf_comlen);
1670                         }
1671                         return (error);
1672
1673                 case _FIOLFSS:
1674                         /*
1675                          * get file system locking status
1676                          */
1677
1678                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1679                                 if (copyin((caddr_t)arg, &lockfs,
1680                                     sizeof (struct lockfs)))
1681                                         return (EFAULT);
1682                         }
1683 #ifdef _SYSCALL32_IMPL
1684                         else {
1685                                 struct lockfs32 lockfs32;
1686                                 /* Translate ILP32 lockfs to LP64 lockfs */
1687                                 if (copyin((caddr_t)arg, &lockfs32,
1688                                     sizeof (struct lockfs32)))
1689                                         return (EFAULT);
1690                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1691                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1692                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1693                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1694                                 lockfs.lf_comment =
1695                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1696                         }
1697 #endif /* _SYSCALL32_IMPL */
1698
1699                         if (error =  ufs_fiolfss(vp, &lockfs_out))
1700                                 return (error);
1701                         lockfs.lf_lock = lockfs_out.lf_lock;
1702                         lockfs.lf_key = lockfs_out.lf_key;
1703                         lockfs.lf_flags = lockfs_out.lf_flags;
1704                         lockfs.lf_comlen = MIN(lockfs.lf_comlen,
1705                             lockfs_out.lf_comlen);
1706
1707                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1708                                 if (copyout(&lockfs, (caddr_t)arg,
1709                                     sizeof (struct lockfs)))
1710                                         return (EFAULT);
1711                         }
1712 #ifdef _SYSCALL32_IMPL
1713                         else {
1714                                 /* Translate LP64 to ILP32 lockfs */
1715                                 struct lockfs32 lockfs32;
1716                                 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock;
1717                                 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags;
1718                                 lockfs32.lf_key = (uint32_t)lockfs.lf_key;
1719                                 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen;
1720                                 lockfs32.lf_comment =
1721                                     (uint32_t)(uintptr_t)lockfs.lf_comment;
1722                                 if (copyout(&lockfs32, (caddr_t)arg,
1723                                     sizeof (struct lockfs32)))
1724                                         return (EFAULT);
1725                         }
1726 #endif /* _SYSCALL32_IMPL */
1727
1728                         if (lockfs.lf_comlen &&
1729                             lockfs.lf_comment && lockfs_out.lf_comment)
1730                                 if (copyout(lockfs_out.lf_comment,
1731                                     lockfs.lf_comment, lockfs.lf_comlen))
1732                                         return (EFAULT);
1733                         return (0);
1734
1735                 case _FIOSATIME:
1736                         /*
1737                          * set access time
1738                          */
1739
1740                         /*
1741                          * if mounted w/o atime, return quietly.
1742                          * I briefly thought about returning ENOSYS, but
1743                          * figured that most apps would consider this fatal
1744                          * but the idea is to make this as seamless as poss.
1745                          */
1746                         if (ufsvfsp->vfs_noatime)
1747                                 return (0);
1748
1749                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1750                             ULOCKFS_SETATTR_MASK);
1751                         if (error)
1752                                 return (error);
1753
1754                         if (ulp) {
1755                                 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp));
1756                                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync,
1757                                                   TOP_SETATTR, trans_size);
1758                         }
1759
1760                         error = ufs_fiosatime(vp, (struct timeval *)arg,
1761                             flag, cr);
1762
1763                         if (ulp) {
1764                                 TRANS_END_CSYNC(ufsvfsp, &error, issync,
1765                                                 TOP_SETATTR, trans_size);
1766                                 ufs_lockfs_end(ulp);
1767                         }
1768                         return (error);
1769
1770                 case _FIOSDIO:
1771                         /*
1772                          * set delayed-io
1773                          */
1774                         return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr));
1775
1776                 case _FIOGDIO:
1777                         /*
1778                          * get delayed-io
1779                          */
1780                         return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr));
1781
1782                 case _FIOIO:
1783                         /*
1784                          * inode open
1785                          */
1786                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1787                             ULOCKFS_VGET_MASK);
1788                         if (error)
1789                                 return (error);
1790
1791                         error = ufs_fioio(vp, (struct fioio *)arg, flag, cr);
1792
1793                         if (ulp) {
1794                                 ufs_lockfs_end(ulp);
1795                         }
1796                         return (error);
1797
1798                 case _FIOFFS:
1799                         /*
1800                          * file system flush (push w/invalidate)
1801                          */
1802                         if ((caddr_t)arg != NULL)
1803                                 return (EINVAL);
1804                         return (ufs_fioffs(vp, NULL, cr));
1805
1806                 case _FIOISBUSY:
1807                         /*
1808                          * Contract-private interface for Legato
1809                          * Purge this vnode from the DNLC and decide
1810                          * if this vnode is busy (*arg == 1) or not
1811                          * (*arg == 0)
1812                          */
1813                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1814                                 return (EPERM);
1815                         error = ufs_fioisbusy(vp, (int *)arg, cr);
1816                         return (error);
1817
1818                 case _FIODIRECTIO:
1819                         return (ufs_fiodirectio(vp, (int)arg, cr));
1820
1821                 case _FIOTUNE:
1822                         /*
1823                          * Tune the file system (aka setting fs attributes)
1824                          */
1825                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1826                             ULOCKFS_SETATTR_MASK);
1827                         if (error)
1828                                 return (error);
1829
1830                         error = ufs_fiotune(vp, (struct fiotune *)arg, cr);
1831
1832                         if (ulp)
1833                                 ufs_lockfs_end(ulp);
1834                         return (error);
1835
1836                 case _FIOLOGENABLE:
1837                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1838                                 return (EPERM);
1839                         return (ufs_fiologenable(vp, (void *)arg, cr, flag));
1840
1841                 case _FIOLOGDISABLE:
1842                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1843                                 return (EPERM);
1844                         return (ufs_fiologdisable(vp, (void *)arg, cr, flag));
1845
1846                 case _FIOISLOG:
1847                         return (ufs_fioislog(vp, (void *)arg, cr, flag));
1848
1849                 case _FIOSNAPSHOTCREATE_MULTI:
1850                 {
1851                         struct fiosnapcreate_multi      fc, *fcp;
1852                         size_t  fcm_size;
1853
1854                         if (copyin((void *)arg, &fc, sizeof (fc)))
1855                                 return (EFAULT);
1856                         if (fc.backfilecount > MAX_BACKFILE_COUNT)
1857                                 return (EINVAL);
1858                         fcm_size = sizeof (struct fiosnapcreate_multi) +
1859                             (fc.backfilecount - 1) * sizeof (int);
1860                         fcp = (struct fiosnapcreate_multi *)
1861                             kmem_alloc(fcm_size, KM_SLEEP);
1862                         if (copyin((void *)arg, fcp, fcm_size)) {
1863                                 kmem_free(fcp, fcm_size);
1864                                 return (EFAULT);
1865                         }
1866                         error = ufs_snap_create(vp, fcp, cr);
1867                         /*
1868                          * Do copyout even if there is an error because
1869                          * the details of error is stored in fcp.
1870                          */
1871                         if (copyout(fcp, (void *)arg, fcm_size))
1872                                 error = EFAULT;
1873                         kmem_free(fcp, fcm_size);
1874                         return (error);
1875                 }
1876
1877                 case _FIOSNAPSHOTDELETE:
1878                 {
1879                         struct fiosnapdelete    fc;
1880
1881                         if (copyin((void *)arg, &fc, sizeof (fc)))
1882                                 return (EFAULT);
1883                         error = ufs_snap_delete(vp, &fc, cr);
1884                         if (!error && copyout(&fc, (void *)arg, sizeof (fc)))
1885                                 error = EFAULT;
1886                         return (error);
1887                 }
1888
1889                 case _FIOGETSUPERBLOCK:
1890                         if (copyout(fs, (void *)arg, SBSIZE))
1891                                 return (EFAULT);
1892                         return (0);
1893
1894                 case _FIOGETMAXPHYS:
1895                         if (copyout(&maxphys, (void *)arg, sizeof (maxphys)))
1896                                 return (EFAULT);
1897                         return (0);
1898
1899                 /*
1900                  * The following 3 ioctls are for TSufs support
1901                  * although could potentially be used elsewhere
1902                  */
1903                 case _FIO_SET_LUFS_DEBUG:
1904                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1905                                 return (EPERM);
1906                         lufs_debug = (uint32_t)arg;
1907                         return (0);
1908
1909                 case _FIO_SET_LUFS_ERROR:
1910                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1911                                 return (EPERM);
1912                         TRANS_SETERROR(ufsvfsp);
1913                         return (0);
1914
1915                 case _FIO_GET_TOP_STATS:
1916                 {
1917                         fio_lufs_stats_t *ls;
1918                         ml_unit_t *ul = ufsvfsp->vfs_log;
1919
1920                         ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
1921                         ls->ls_debug = ul->un_debug; /* return debug value */
1922                         /* Copy stucture if statistics are being kept */
1923                         if (ul->un_logmap->mtm_tops) {
1924                                 ls->ls_topstats = *(ul->un_logmap->mtm_tops);
1925                         }
1926                         error = 0;
1927                         if (copyout(ls, (void *)arg, sizeof (*ls)))
1928                                 error = EFAULT;
1929                         kmem_free(ls, sizeof (*ls));
1930                         return (error);
1931                 }
1932
1933                 case _FIO_SEEK_DATA:
1934                 case _FIO_SEEK_HOLE:
1935                         if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
1936                                 return (EFAULT);
1937                         /* offset paramater is in/out */
1938                         error = ufs_fio_holey(vp, cmd, &off);
1939                         if (error)
1940                                 return (error);
1941                         if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
1942                                 return (EFAULT);
1943                         return (0);
1944
1945                 case _FIO_COMPRESSED:
1946                 {
1947                         /*
1948                          * This is a project private ufs ioctl() to mark
1949                          * the inode as that belonging to a compressed
1950                          * file. This is used to mark individual
1951                          * compressed files in a miniroot archive.
1952                          * The files compressed in this manner are
1953                          * automatically decompressed by the dcfs filesystem
1954                          * (via an interception in ufs_lookup - see decompvp())
1955                          * which is layered on top of ufs on a system running
1956                          * from the archive. See uts/common/fs/dcfs for details.
1957                          * This ioctl only marks the file as compressed - the
1958                          * actual compression is done by fiocompress (a
1959                          * userland utility) which invokes this ioctl().
1960                          */
1961                         struct inode *ip = VTOI(vp);
1962
1963                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1964                             ULOCKFS_SETATTR_MASK);
1965                         if (error)
1966                                 return (error);
1967
1968                         if (ulp) {
1969                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT,
1970                                     TOP_IUPDAT_SIZE(ip));
1971                         }
1972
1973                         error = ufs_mark_compressed(vp);
1974
1975                         if (ulp) {
1976                                 TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT,
1977                                     TOP_IUPDAT_SIZE(ip));
1978                                 ufs_lockfs_end(ulp);
1979                         }
1980
1981                         return (error);
1982
1983                 }
1984
1985                 default:
1986                         return (ENOTTY);
1987         }
1988 }
1989
1990
1991 /* ARGSUSED */
1992 static int
1993 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
1994         struct cred *cr, caller_context_t *ct)
1995 {
1996         struct inode *ip = VTOI(vp);
1997         struct ufsvfs *ufsvfsp;
1998         int err;
1999
2000         if (vap->va_mask == AT_SIZE) {
2001                 /*
2002                  * for performance, if only the size is requested don't bother
2003                  * with anything else.
2004                  */
2005                 UFS_GET_ISIZE(&vap->va_size, ip);
2006                 return (0);
2007         }
2008
2009         /*
2010          * inlined lockfs checks
2011          */
2012         ufsvfsp = ip->i_ufsvfs;
2013         if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
2014                 err = EIO;
2015                 goto out;
2016         }
2017
2018         rw_enter(&ip->i_contents, RW_READER);
2019         /*
2020          * Return all the attributes.  This should be refined so
2021          * that it only returns what's asked for.
2022          */
2023
2024         /*
2025          * Copy from inode table.
2026          */
2027         vap->va_type = vp->v_type;
2028         vap->va_mode = ip->i_mode & MODEMASK;
2029         /*
2030          * If there is an ACL and there is a mask entry, then do the
2031          * extra work that completes the equivalent of an acltomode(3)
2032          * call.  According to POSIX P1003.1e, the acl mask should be
2033          * returned in the group permissions field.
2034          *
2035          * - start with the original permission and mode bits (from above)
2036          * - clear the group owner bits
2037          * - add in the mask bits.
2038          */
2039         if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) {
2040                 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3);
2041                 vap->va_mode |=
2042                     (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3;
2043         }
2044         vap->va_uid = ip->i_uid;
2045         vap->va_gid = ip->i_gid;
2046         vap->va_fsid = ip->i_dev;
2047         vap->va_nodeid = (ino64_t)ip->i_number;
2048         vap->va_nlink = ip->i_nlink;
2049         vap->va_size = ip->i_size;
2050         if (vp->v_type == VCHR || vp->v_type == VBLK)
2051                 vap->va_rdev = ip->i_rdev;
2052         else
2053                 vap->va_rdev = 0;       /* not a b/c spec. */
2054         mutex_enter(&ip->i_tlock);
2055         ITIMES_NOLOCK(ip);      /* mark correct time in inode */
2056         vap->va_seq = ip->i_seq;
2057         vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
2058         vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000;
2059         vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
2060         vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000;
2061         vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
2062         vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000;
2063         mutex_exit(&ip->i_tlock);
2064
2065         switch (ip->i_mode & IFMT) {
2066
2067         case IFBLK:
2068                 vap->va_blksize = MAXBSIZE;             /* was BLKDEV_IOSIZE */
2069                 break;
2070
2071         case IFCHR:
2072                 vap->va_blksize = MAXBSIZE;
2073                 break;
2074
2075         default:
2076                 vap->va_blksize = ip->i_fs->fs_bsize;
2077                 break;
2078         }
2079         vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks;
2080         rw_exit(&ip->i_contents);
2081         err = 0;
2082
2083 out:
2084         return (err);
2085 }
2086
/*
 * Access-check callback for secpolicy_vnode_setattr().
 *
 * The callback receives its object as an opaque 'void *', so the inode
 * arrives untyped and is converted back before being passed on to
 * ufs_iaccess().  The caller already holds the i_contents lock.
 */
static int
ufs_priv_access(void *vip, int mode, struct cred *cr)
{
	return (ufs_iaccess((struct inode *)vip, mode, cr, 0));
}
2099
2100 /*ARGSUSED4*/
2101 static int
2102 ufs_setattr(
2103         struct vnode *vp,
2104         struct vattr *vap,
2105         int flags,
2106         struct cred *cr,
2107         caller_context_t *ct)
2108 {
2109         struct inode *ip = VTOI(vp);
2110         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2111         struct fs *fs;
2112         struct ulockfs *ulp;
2113         char *errmsg1;
2114         char *errmsg2;
2115         long blocks;
2116         long int mask = vap->va_mask;
2117         size_t len1, len2;
2118         int issync;
2119         int trans_size;
2120         int dotrans;
2121         int dorwlock;
2122         int error;
2123         int owner_change;
2124         int dodqlock;
2125         timestruc_t now;
2126         vattr_t oldva;
2127         int retry = 1;
2128         int indeadlock;
2129
2130         /*
2131          * Cannot set these attributes.
2132          */
2133         if ((mask & AT_NOSET) || (mask & AT_XVATTR))
2134                 return (EINVAL);
2135
2136         /*
2137          * check for forced unmount
2138          */
2139         if (ufsvfsp == NULL)
2140                 return (EIO);
2141
2142         fs = ufsvfsp->vfs_fs;
2143         if (fs->fs_ronly != 0)
2144                 return (EROFS);
2145
2146 again:
2147         errmsg1 = NULL;
2148         errmsg2 = NULL;
2149         dotrans = 0;
2150         dorwlock = 0;
2151         dodqlock = 0;
2152
2153         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK);
2154         if (error)
2155                 goto out;
2156
2157         /*
2158          * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
2159          * This follows the protocol for read()/write().
2160          */
2161         if (vp->v_type != VDIR) {
2162                 /*
2163                  * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
2164                  * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
2165                  * possible, retries the operation.
2166                  */
2167                 indeadlock = ufs_tryirwlock(ulp, &ip->i_rwlock, RW_WRITER);
2168                 if (indeadlock) {
2169                         if (ulp)
2170                                 ufs_lockfs_end(ulp);
2171                         goto again;
2172                 }
2173                 dorwlock = 1;
2174         }
2175
2176         /*
2177          * Truncate file.  Must have write permission and not be a directory.
2178          */
2179         if (mask & AT_SIZE) {
2180                 rw_enter(&ip->i_contents, RW_WRITER);
2181                 if (vp->v_type == VDIR) {
2182                         error = EISDIR;
2183                         goto update_inode;
2184                 }
2185                 if (error = ufs_iaccess(ip, IWRITE, cr, 0))
2186                         goto update_inode;
2187
2188                 rw_exit(&ip->i_contents);
2189                 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr);
2190                 if (error) {
2191                         rw_enter(&ip->i_contents, RW_WRITER);
2192                         goto update_inode;
2193                 }
2194
2195                 if (error == 0 && vap->va_size)
2196                         vnevent_truncate(vp, ct);
2197         }
2198
2199         if (ulp) {
2200                 trans_size = (int)TOP_SETATTR_SIZE(ip);
2201                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_SETATTR, trans_size);
2202                 ++dotrans;
2203         }
2204
2205         /*
2206          * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
2207          * This follows the protocol established by
2208          * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
2209          */
2210         if (vp->v_type == VDIR) {
2211                 indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock,
2212                                                   RW_WRITER, TOP_SETATTR,
2213                                                   ufsvfsp, &error, issync,
2214                                                   trans_size);
2215                 if (indeadlock)
2216                         goto again;
2217                 dorwlock = 1;
2218         }
2219
2220         /*
2221          * Grab quota lock if we are changing the file's owner.
2222          */
2223         if (mask & AT_UID) {
2224                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2225                 dodqlock = 1;
2226         }
2227         rw_enter(&ip->i_contents, RW_WRITER);
2228
2229         oldva.va_mode = ip->i_mode;
2230         oldva.va_uid = ip->i_uid;
2231         oldva.va_gid = ip->i_gid;
2232
2233         vap->va_mask &= ~AT_SIZE;
2234
2235         error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2236             ufs_priv_access, ip);
2237         if (error)
2238                 goto update_inode;
2239
2240         mask = vap->va_mask;
2241
2242         /*
2243          * Change file access modes.
2244          */
2245         if (mask & AT_MODE) {
2246                 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT);
2247                 TRANS_INODE(ufsvfsp, ip);
2248                 ip->i_flag |= ICHG;
2249                 if (stickyhack) {
2250                         mutex_enter(&vp->v_lock);
2251                         if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
2252                                 vp->v_flag |= VSWAPLIKE;
2253                         else
2254                                 vp->v_flag &= ~VSWAPLIKE;
2255                         mutex_exit(&vp->v_lock);
2256                 }
2257         }
2258         if (mask & (AT_UID|AT_GID)) {
2259                 if (mask & AT_UID) {
2260                         /*
2261                          * Don't change ownership of the quota inode.
2262                          */
2263                         if (ufsvfsp->vfs_qinod == ip) {
2264                                 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED);
2265                                 error = EINVAL;
2266                                 goto update_inode;
2267                         }
2268
2269                         /*
2270                          * No real ownership change.
2271                          */
2272                         if (ip->i_uid == vap->va_uid) {
2273                                 blocks = 0;
2274                                 owner_change = 0;
2275                         }
2276                         /*
2277                          * Remove the blocks and the file, from the old user's
2278                          * quota.
2279                          */
2280                         else {
2281                                 blocks = ip->i_blocks;
2282                                 owner_change = 1;
2283
2284                                 (void) chkdq(ip, -blocks, /* force */ 1, cr,
2285                                     (char **)NULL, NULL);
2286                                 (void) chkiq(ufsvfsp, /* change */ -1, ip,
2287                                     (uid_t)ip->i_uid, /* force */ 1, cr,
2288                                     (char **)NULL, NULL);
2289                                 dqrele(ip->i_dquot);
2290                         }
2291
2292                         ip->i_uid = vap->va_uid;
2293
2294                         /*
2295                          * There is a real ownership change.
2296                          */
2297                         if (owner_change) {
2298                                 /*
2299                                  * Add the blocks and the file to the new
2300                                  * user's quota.
2301                                  */
2302                                 ip->i_dquot = getinoquota(ip);
2303                                 (void) chkdq(ip, blocks, /* force */ 1, cr,
2304                                     &errmsg1, &len1);
2305                                 (void) chkiq(ufsvfsp, /* change */ 1,
2306                                     NULL, (uid_t)ip->i_uid,
2307                                     /* force */ 1, cr, &errmsg2, &len2);
2308                         }
2309                 }
2310                 if (mask & AT_GID) {
2311                         ip->i_gid = vap->va_gid;
2312                 }
2313                 TRANS_INODE(ufsvfsp, ip);
2314                 ip->i_flag |= ICHG;
2315         }
2316         /*
2317          * Change file access or modified times.
2318          */
2319         if (mask & (AT_ATIME|AT_MTIME)) {
2320                 /* Check that the time value is within ufs range */
2321                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2322                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2323                         error = EOVERFLOW;
2324                         goto update_inode;
2325                 }
2326
2327                 /*
2328                  * if the "noaccess" mount option is set and only atime
2329                  * update is requested, do nothing. No error is returned.
2330                  */
2331                 if ((ufsvfsp->vfs_noatime) &&
2332                     ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME))
2333                         goto skip_atime;
2334
2335                 if (mask & AT_ATIME) {
2336                         ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2337                         ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2338                         ip->i_flag &= ~IACC;
2339                 }
2340                 if (mask & AT_MTIME) {
2341                         ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2342                         ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2343                         gethrestime(&now);
2344                         if (now.tv_sec > TIME32_MAX) {
2345                                 /*
2346                                  * In 2038, ctime sticks forever..
2347                                  */
2348                                 ip->i_ctime.tv_sec = TIME32_MAX;
2349                                 ip->i_ctime.tv_usec = 0;
2350                         } else {
2351                                 ip->i_ctime.tv_sec = now.tv_sec;
2352                                 ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2353                         }
2354                         ip->i_flag &= ~(IUPD|ICHG);
2355                         ip->i_flag |= IMODTIME;
2356                 }
2357                 TRANS_INODE(ufsvfsp, ip);
2358                 ip->i_flag |= IMOD;
2359         }
2360
2361 skip_atime:
2362         /*
2363          * The presence of a shadow inode may indicate an ACL, but does
2364          * not imply an ACL.  Future FSD types should be handled here too
2365          * and check for the presence of the attribute-specific data
2366          * before referencing it.
2367          */
2368         if (ip->i_shadow) {
2369                 /*
2370                  * XXX if ufs_iupdat is changed to sandbagged write fix
2371                  * ufs_acl_setattr to push ip to keep acls consistent
2372                  *
2373                  * Suppress out of inodes messages if we will retry.
2374                  */
2375                 if (retry)
2376                         ip->i_flag |= IQUIET;
2377                 error = ufs_acl_setattr(ip, vap, cr);
2378                 ip->i_flag &= ~IQUIET;
2379         }
2380
2381 update_inode:
2382         /*
2383          * Setattr always increases the sequence number
2384          */
2385         ip->i_seq++;
2386
2387         /*
2388          * if nfsd and not logging; push synchronously
2389          */
2390         if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
2391                 ufs_iupdat(ip, 1);
2392         } else {
2393                 ITIMES_NOLOCK(ip);
2394         }
2395
2396         rw_exit(&ip->i_contents);
2397         if (dodqlock) {
2398                 rw_exit(&ufsvfsp->vfs_dqrwlock);
2399         }
2400         if (dorwlock)
2401                 rw_exit(&ip->i_rwlock);
2402
2403         if (ulp) {
2404                 if (dotrans) {
2405                         int terr = 0;
2406                         TRANS_END_CSYNC(ufsvfsp, &terr, issync, TOP_SETATTR,
2407                                         trans_size);
2408                         if (error == 0)
2409                                 error = terr;
2410                 }
2411                 ufs_lockfs_end(ulp);
2412         }
2413 out:
2414         /*
2415          * If out of inodes or blocks, see if we can free something
2416          * up from the delete queue.
2417          */
2418         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
2419                 ufs_delete_drain_wait(ufsvfsp, 1);
2420                 retry = 0;
2421                 if (errmsg1 != NULL)
2422                         kmem_free(errmsg1, len1);
2423                 if (errmsg2 != NULL)
2424                         kmem_free(errmsg2, len2);
2425                 goto again;
2426         }
2427         if (errmsg1 != NULL) {
2428                 uprintf(errmsg1);
2429                 kmem_free(errmsg1, len1);
2430         }
2431         if (errmsg2 != NULL) {
2432                 uprintf(errmsg2);
2433                 kmem_free(errmsg2, len2);
2434         }
2435         return (error);
2436 }
2437
2438 /*ARGSUSED*/
2439 static int
2440 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
2441         caller_context_t *ct)
2442 {
2443         struct inode *ip = VTOI(vp);
2444
2445         if (ip->i_ufsvfs == NULL)
2446                 return (EIO);
2447
2448         /*
2449          * The ufs_iaccess function wants to be called with
2450          * mode bits expressed as "ufs specific" bits.
2451          * I.e., VWRITE|VREAD|VEXEC do not make sense to
2452          * ufs_iaccess() but IWRITE|IREAD|IEXEC do.
2453          * But since they're the same we just pass the vnode mode
2454          * bit but just verify that assumption at compile time.
2455          */
2456 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC
2457 #error "ufs_access needs to map Vmodes to Imodes"
2458 #endif
2459         return (ufs_iaccess(ip, mode, cr, 1));
2460 }
2461
2462 /* ARGSUSED */
2463 static int
2464 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr,
2465         caller_context_t *ct)
2466 {
2467         struct inode *ip = VTOI(vp);
2468         struct ufsvfs *ufsvfsp;
2469         struct ulockfs *ulp;
2470         int error;
2471         int fastsymlink;
2472
2473         if (vp->v_type != VLNK) {
2474                 error = EINVAL;
2475                 goto nolockout;
2476         }
2477
2478         /*
2479          * If the symbolic link is empty there is nothing to read.
2480          * Fast-track these empty symbolic links
2481          */
2482         if (ip->i_size == 0) {
2483                 error = 0;
2484                 goto nolockout;
2485         }
2486
2487         ufsvfsp = ip->i_ufsvfs;
2488         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK);
2489         if (error)
2490                 goto nolockout;
2491         /*
2492          * The ip->i_rwlock protects the data blocks used for FASTSYMLINK
2493          */
2494 again:
2495         fastsymlink = 0;
2496         if (ip->i_flag & IFASTSYMLNK) {
2497                 rw_enter(&ip->i_rwlock, RW_READER);
2498                 rw_enter(&ip->i_contents, RW_READER);
2499                 if (ip->i_flag & IFASTSYMLNK) {
2500                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
2501                             (ip->i_fs->fs_ronly == 0) &&
2502                             (!ufsvfsp->vfs_noatime)) {
2503                                 mutex_enter(&ip->i_tlock);
2504                                 ip->i_flag |= IACC;
2505                                 mutex_exit(&ip->i_tlock);
2506                         }
2507                         error = uiomove((caddr_t)&ip->i_db[1],
2508                             MIN(ip->i_size, uiop->uio_resid),
2509                             UIO_READ, uiop);
2510                         ITIMES(ip);
2511                         ++fastsymlink;
2512                 }
2513                 rw_exit(&ip->i_contents);
2514                 rw_exit(&ip->i_rwlock);
2515         }
2516         if (!fastsymlink) {
2517                 ssize_t size;   /* number of bytes read  */
2518                 caddr_t basep;  /* pointer to input data */
2519                 ino_t ino;
2520                 long  igen;
2521                 struct uio tuio;        /* temp uio struct */
2522                 struct uio *tuiop;
2523                 iovec_t tiov;           /* temp iovec struct */
2524                 char kbuf[FSL_SIZE];    /* buffer to hold fast symlink */
2525                 int tflag = 0;          /* flag to indicate temp vars used */
2526
2527                 ino = ip->i_number;
2528                 igen = ip->i_gen;
2529                 size = uiop->uio_resid;
2530                 basep = uiop->uio_iov->iov_base;
2531                 tuiop = uiop;
2532
2533                 rw_enter(&ip->i_rwlock, RW_WRITER);
2534                 rw_enter(&ip->i_contents, RW_WRITER);
2535                 if (ip->i_flag & IFASTSYMLNK) {
2536                         rw_exit(&ip->i_contents);
2537                         rw_exit(&ip->i_rwlock);
2538                         goto again;
2539                 }
2540
2541                 /* can this be a fast symlink and is it a user buffer? */
2542                 if (ip->i_size <= FSL_SIZE &&
2543                     (uiop->uio_segflg == UIO_USERSPACE ||
2544                     uiop->uio_segflg == UIO_USERISPACE)) {
2545
2546                         bzero(&tuio, sizeof (struct uio));
2547                         /*
2548                          * setup a kernel buffer to read link into.  this
2549                          * is to fix a race condition where the user buffer
2550                          * got corrupted before copying it into the inode.
2551                          */
2552                         size = ip->i_size;
2553                         tiov.iov_len = size;
2554                         tiov.iov_base = kbuf;
2555                         tuio.uio_iov = &tiov;
2556                         tuio.uio_iovcnt = 1;
2557                         tuio.uio_offset = uiop->uio_offset;
2558                         tuio.uio_segflg = UIO_SYSSPACE;
2559                         tuio.uio_fmode = uiop->uio_fmode;
2560                         tuio.uio_extflg = uiop->uio_extflg;
2561                         tuio.uio_limit = uiop->uio_limit;
2562                         tuio.uio_resid = size;
2563
2564                         basep = tuio.uio_iov->iov_base;
2565                         tuiop = &tuio;
2566                         tflag = 1;
2567                 }
2568
2569                 error = rdip(ip, tuiop, 0, cr);
2570                 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) {
2571                         rw_exit(&ip->i_contents);
2572                         rw_exit(&ip->i_rwlock);
2573                         goto out;
2574                 }
2575
2576                 if (tflag == 0)
2577                         size -= uiop->uio_resid;
2578
2579                 if ((tflag == 0 && ip->i_size <= FSL_SIZE &&
2580                     ip->i_size == size) || (tflag == 1 &&
2581                     tuio.uio_resid == 0)) {
2582                         error = kcopy(basep, &ip->i_db[1], ip->i_size);
2583                         if (error == 0) {
2584                                 ip->i_flag |= IFASTSYMLNK;
2585                                 /*
2586                                  * free page
2587                                  */
2588                                 (void) fop_putpage(ITOV(ip),
2589                                     0, PAGESIZE,
2590                                     (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC),
2591                                     cr, ct);
2592                         } else {
2593                                 int i;
2594                                 /* error, clear garbage left behind */
2595                                 for (i = 1; i < NDADDR; i++)
2596                                         ip->i_db[i] = 0;
2597                                 for (i = 0; i < NIADDR; i++)
2598                                         ip->i_ib[i] = 0;
2599                         }
2600                 }
2601                 if (tflag == 1) {
2602                         /* now, copy it into the user buffer */
2603                         error = uiomove((caddr_t)kbuf,
2604                             MIN(size, uiop->uio_resid),
2605                             UIO_READ, uiop);
2606                 }
2607                 rw_exit(&ip->i_contents);
2608                 rw_exit(&ip->i_rwlock);
2609         }
2610 out:
2611         if (ulp) {
2612                 ufs_lockfs_end(ulp);
2613         }
2614 nolockout:
2615         return (error);
2616 }
2617
2618 /* ARGSUSED */
2619 static int
2620 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr,
2621         caller_context_t *ct)
2622 {
2623         struct inode *ip = VTOI(vp);
2624         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2625         struct ulockfs *ulp;
2626         int error;
2627
2628         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK);
2629         if (error)
2630                 return (error);
2631
2632         if (TRANS_ISTRANS(ufsvfsp)) {
2633                 /*
2634                  * First push out any data pages
2635                  */
2636                 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2637                     (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) {
2638                         error = fop_putpage(vp, 0, (size_t)0,
2639                             0, CRED(), ct);
2640                         if (error)
2641                                 goto out;
2642                 }
2643
2644                 /*
2645                  * Delta any delayed inode times updates
2646                  * and push inode to log.
2647                  * All other inode deltas will have already been delta'd
2648                  * and will be pushed during the commit.
2649                  */
2650                 if (!(syncflag & FDSYNC) &&
2651                     ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) {
2652                         if (ulp) {
2653                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC,
2654                                     TOP_SYNCIP_SIZE);
2655                         }
2656                         rw_enter(&ip->i_contents, RW_READER);
2657                         mutex_enter(&ip->i_tlock);
2658                         ip->i_flag &= ~IMODTIME;
2659                         mutex_exit(&ip->i_tlock);
2660                         ufs_iupdat(ip, I_SYNC);
2661                         rw_exit(&ip->i_contents);
2662                         if (ulp) {
2663                                 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC,
2664                                     TOP_SYNCIP_SIZE);
2665                         }
2666                 }
2667
2668                 /*
2669                  * Commit the Moby transaction
2670                  *
2671                  * Deltas have already been made so we just need to
2672                  * commit them with a synchronous transaction.
2673                  * TRANS_BEGIN_SYNC() will return an error
2674                  * if there are no deltas to commit, for an
2675                  * empty transaction.
2676                  */
2677                 if (ulp) {
2678                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE,
2679                                          &error);
2680                         if (error) {
2681                                 error = 0; /* commit wasn't needed */
2682                                 goto out;
2683                         }
2684                         TRANS_END_SYNC(ufsvfsp, &error, TOP_FSYNC,
2685                                        TOP_COMMIT_SIZE);
2686                 }
2687         } else {        /* not logging */
2688                 if (!(IS_SWAPVP(vp)))
2689                         if (syncflag & FNODSYNC) {
2690                                 /* Just update the inode only */
2691                                 TRANS_IUPDAT(ip, 1);
2692                                 error = 0;
2693                         } else if (syncflag & FDSYNC)
2694                                 /* Do data-synchronous writes */
2695                                 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC);
2696                         else
2697                                 /* Do synchronous writes */
2698                                 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC);
2699
2700                 rw_enter(&ip->i_contents, RW_WRITER);
2701                 if (!error)
2702                         error = ufs_sync_indir(ip);
2703                 rw_exit(&ip->i_contents);
2704         }
2705 out:
2706         if (ulp) {
2707                 ufs_lockfs_end(ulp);
2708         }
2709         return (error);
2710 }
2711
2712 /*ARGSUSED*/
2713 static void
2714 ufs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
2715 {
2716         ufs_iinactive(VTOI(vp));
2717 }
2718
2719 /*
2720  * Unix file system operations having to do with directory manipulation.
2721  */
/*
 * Number of inodes ufs_lookup() passes to ufs_idle_some() each time it
 * finds ufs_idle_q.uq_ne above ufs_idle_q.uq_hiwat.
 */
int ufs_lookup_idle_count = 2;
2723 /* ARGSUSED */
2724 static int
2725 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
2726         struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr,
2727         caller_context_t *ct, int *direntflags, pathname_t *realpnp)
2728 {
2729         struct inode *ip;
2730         struct inode *sip;
2731         struct inode *xip;
2732         struct ufsvfs *ufsvfsp;
2733         struct ulockfs *ulp;
2734         struct vnode *vp;
2735         int error;
2736
2737         /*
2738          * Check flags for type of lookup (regular file or attribute file)
2739          */
2740
2741         ip = VTOI(dvp);
2742
2743         if (flags & LOOKUP_XATTR) {
2744
2745                 /*
2746                  * If not mounted with XATTR support then return EINVAL
2747                  */
2748
2749                 if (!(ip->i_ufsvfs->vfs_vfs->vfs_flag & VFS_XATTR))
2750                         return (EINVAL);
2751                 /*
2752                  * We don't allow recursive attributes...
2753                  * Maybe someday we will.
2754                  */
2755                 if ((ip->i_cflags & IXATTR)) {
2756                         return (EINVAL);
2757                 }
2758
2759                 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) {
2760                         error = ufs_xattr_getattrdir(dvp, &sip, flags, cr);
2761                         if (error) {
2762                                 *vpp = NULL;
2763                                 goto out;
2764                         }
2765
2766                         vp = ITOV(sip);
2767                         dnlc_update(dvp, XATTR_DIR_NAME, vp);
2768                 }
2769
2770                 /*
2771                  * Check accessibility of directory.
2772                  */
2773                 if (vp == DNLC_NO_VNODE) {
2774                         VN_RELE(vp);
2775                         error = ENOENT;
2776                         goto out;
2777                 }
2778                 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr, 1)) != 0) {
2779                         VN_RELE(vp);
2780                         goto out;
2781                 }
2782
2783                 *vpp = vp;
2784                 return (0);
2785         }
2786
2787         /*
2788          * Check for a null component, which we should treat as
2789          * looking at dvp from within it's parent, so we don't
2790          * need a call to ufs_iaccess(), as it has already been
2791          * done.
2792          */
2793         if (nm[0] == 0) {
2794                 VN_HOLD(dvp);
2795                 error = 0;
2796                 *vpp = dvp;
2797                 goto out;
2798         }
2799
2800         /*
2801          * Check for "." ie itself. this is a quick check and
2802          * avoids adding "." into the dnlc (which have been seen
2803          * to occupy >10% of the cache).
2804          */
2805         if ((nm[0] == '.') && (nm[1] == 0)) {
2806                 /*
2807                  * Don't return without checking accessibility
2808                  * of the directory. We only need the lock if
2809                  * we are going to return it.
2810                  */
2811                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) == 0) {
2812                         VN_HOLD(dvp);
2813                         *vpp = dvp;
2814                 }
2815                 goto out;
2816         }
2817
2818         /*
2819          * Fast path: Check the directory name lookup cache.
2820          */
2821         if (vp = dnlc_lookup(dvp, nm)) {
2822                 /*
2823                  * Check accessibility of directory.
2824                  */
2825                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) != 0) {
2826                         VN_RELE(vp);
2827                         goto out;
2828                 }
2829                 if (vp == DNLC_NO_VNODE) {
2830                         VN_RELE(vp);
2831                         error = ENOENT;
2832                         goto out;
2833                 }
2834                 xip = VTOI(vp);
2835                 ulp = NULL;
2836                 goto fastpath;
2837         }
2838
2839         /*
2840          * Keep the idle queue from getting too long by
2841          * idling two inodes before attempting to allocate another.
2842          *    This operation must be performed before entering
2843          *    lockfs or a transaction.
2844          */
2845         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
2846                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
2847                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
2848                         ufs_idle_some(ufs_lookup_idle_count);
2849                 }
2850
2851 retry_lookup:
2852         /*
2853          * Check accessibility of directory.
2854          */
2855         if (error = ufs_diraccess(ip, IEXEC, cr))
2856                 goto out;
2857
2858         ufsvfsp = ip->i_ufsvfs;
2859         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK);
2860         if (error)
2861                 goto out;
2862
2863         error = ufs_dirlook(ip, nm, &xip, cr, 1, 0);
2864
2865 fastpath:
2866         if (error == 0) {
2867                 ip = xip;
2868                 *vpp = ITOV(ip);
2869
2870                 /*
2871                  * If vnode is a device return special vnode instead.
2872                  */
2873                 if (IS_DEVVP(*vpp)) {
2874                         struct vnode *newvp;
2875
2876                         newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
2877                             cr);
2878                         VN_RELE(*vpp);
2879                         if (newvp == NULL)
2880                                 error = ENOSYS;
2881                         else
2882                                 *vpp = newvp;
2883                 } else if (ip->i_cflags & ICOMPRESS) {
2884                         struct vnode *newvp;
2885
2886                         /*
2887                          * Compressed file, substitute dcfs vnode
2888                          */
2889                         newvp = decompvp(*vpp, cr, ct);
2890                         VN_RELE(*vpp);
2891                         if (newvp == NULL)
2892                                 error = ENOSYS;
2893                         else
2894                                 *vpp = newvp;
2895                 }
2896         }
2897         if (ulp) {
2898                 ufs_lockfs_end(ulp);
2899         }
2900
2901         if (error == EAGAIN)
2902                 goto retry_lookup;
2903
2904 out:
2905         return (error);
2906 }
2907
2908 /*ARGSUSED*/
2909 static int
2910 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl,
2911         int mode, struct vnode **vpp, struct cred *cr, int flag,
2912         caller_context_t *ct, vsecattr_t *vsecp)
2913 {
2914         struct inode *ip;
2915         struct inode *xip;
2916         struct inode *dip;
2917         struct vnode *xvp;
2918         struct ufsvfs *ufsvfsp;
2919         struct ulockfs *ulp;
2920         int error;
2921         int issync;
2922         int truncflag;
2923         int trans_size;
2924         int noentry;
2925         int defer_dip_seq_update = 0;   /* need to defer update of dip->i_seq */
2926         int retry = 1;
2927         int indeadlock;
2928
2929 again:
2930         ip = VTOI(dvp);
2931         ufsvfsp = ip->i_ufsvfs;
2932         truncflag = 0;
2933
2934         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK);
2935         if (error)
2936                 goto out;
2937
2938         if (ulp) {
2939                 trans_size = (int)TOP_CREATE_SIZE(ip);
2940                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_CREATE, trans_size);
2941         }
2942
2943         if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
2944                 vap->va_mode &= ~VSVTX;
2945
2946         if (*name == '\0') {
2947                 /*
2948                  * Null component name refers to the directory itself.
2949                  */
2950                 VN_HOLD(dvp);
2951                 /*
2952                  * Even though this is an error case, we need to grab the
2953                  * quota lock since the error handling code below is common.
2954                  */
2955                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2956                 rw_enter(&ip->i_contents, RW_WRITER);
2957                 error = EEXIST;
2958         } else {
2959                 xip = NULL;
2960                 noentry = 0;
2961                 /*
2962                  * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
2963                  * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
2964                  * possible, retries the operation.
2965                  */
2966                 indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock,
2967                                                   RW_WRITER, TOP_CREATE,
2968                                                   ufsvfsp, &error, issync,
2969                                                   trans_size);
2970                 if (indeadlock)
2971                         goto again;
2972
2973                 xvp = dnlc_lookup(dvp, name);
2974                 if (xvp == DNLC_NO_VNODE) {
2975                         noentry = 1;
2976                         VN_RELE(xvp);
2977                         xvp = NULL;
2978                 }
2979                 if (xvp) {
2980                         rw_exit(&ip->i_rwlock);
2981                         if (error = ufs_iaccess(ip, IEXEC, cr, 1)) {
2982                                 VN_RELE(xvp);
2983                         } else {
2984                                 error = EEXIST;
2985                                 xip = VTOI(xvp);
2986                         }
2987                 } else {
2988                         /*
2989                          * Suppress file system full message if we will retry
2990                          */
2991                         error = ufs_direnter_cm(ip, name, DE_CREATE,
2992                             vap, &xip, cr, (noentry | (retry ? IQUIET : 0)));
2993                         if (error == EAGAIN) {
2994                                 if (ulp) {
2995                                         TRANS_END_CSYNC(ufsvfsp, &error,
2996                                                         issync, TOP_CREATE,
2997                                                         trans_size);
2998                                         ufs_lockfs_end(ulp);
2999                                 }
3000                                 goto again;
3001                         }
3002                         rw_exit(&ip->i_rwlock);
3003                 }
3004                 ip = xip;
3005                 if (ip != NULL) {
3006                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3007                         rw_enter(&ip->i_contents, RW_WRITER);
3008                 }
3009         }
3010
3011         /*
3012          * If the file already exists and this is a non-exclusive create,
3013          * check permissions and allow access for non-directories.
3014          * Read-only create of an existing directory is also allowed.
3015          * We fail an exclusive create of anything which already exists.
3016          */
3017         if (error == EEXIST) {
3018                 dip = VTOI(dvp);
3019                 if (excl == NONEXCL) {
3020                         if ((((ip->i_mode & IFMT) == IFDIR) ||
3021                             ((ip->i_mode & IFMT) == IFATTRDIR)) &&
3022                             (mode & IWRITE))
3023                                 error = EISDIR;
3024                         else if (mode)
3025                                 error = ufs_iaccess(ip, mode, cr, 0);
3026                         else
3027                                 error = 0;
3028                 }
3029                 if (error) {
3030                         rw_exit(&ip->i_contents);
3031                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3032                         VN_RELE(ITOV(ip));
3033                         goto unlock;
3034                 }
3035                 /*
3036                  * If the error EEXIST was set, then i_seq can not
3037                  * have been updated. The sequence number interface
3038                  * is defined such that a non-error fop_create must
3039                  * increase the dir va_seq it by at least one. If we
3040                  * have cleared the error, increase i_seq. Note that
3041                  * we are increasing the dir i_seq and in rare cases
3042                  * ip may actually be from the dvp, so we already have
3043                  * the locks and it will not be subject to truncation.
3044                  * In case we have to update i_seq of the parent
3045                  * directory dip, we have to defer it till we have
3046                  * released our locks on ip due to lock ordering requirements.
3047                  */
3048                 if (ip != dip)
3049                         defer_dip_seq_update = 1;
3050                 else
3051                         ip->i_seq++;
3052
3053                 if (((ip->i_mode & IFMT) == IFREG) &&
3054                     (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
3055                         /*
3056                          * Truncate regular files, if requested by caller.
3057                          * Grab i_rwlock to make sure no one else is
3058                          * currently writing to the file (we promised
3059                          * bmap we would do this).
3060                          * Must get the locks in the correct order.
3061                          */
3062                         if (ip->i_size == 0) {
3063                                 ip->i_flag |= ICHG | IUPD;
3064                                 ip->i_seq++;
3065                                 TRANS_INODE(ufsvfsp, ip);
3066                         } else {
3067                                 /*
3068                                  * Large Files: Why this check here?
3069                                  * Though we do it in vn_create() we really
3070                                  * want to guarantee that we do not destroy
3071                                  * Large file data by atomically checking
3072                                  * the size while holding the contents
3073                                  * lock.
3074                                  */
3075                                 if (flag && !(flag & FOFFMAX) &&
3076                                     ((ip->i_mode & IFMT) == IFREG) &&
3077                                     (ip->i_size > (offset_t)MAXOFF32_T)) {
3078                                         rw_exit(&ip->i_contents);
3079                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3080                                         error = EOVERFLOW;
3081                                         goto unlock;
3082                                 }
3083                                 if (TRANS_ISTRANS(ufsvfsp))
3084                                         truncflag++;
3085                                 else {
3086                                         rw_exit(&ip->i_contents);
3087                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3088                                         indeadlock = ufs_tryirwlock_trans(ulp,
3089                                                                           &ip->i_rwlock,
3090                                                                           RW_WRITER,
3091                                                                           TOP_CREATE,
3092                                                                           ufsvfsp,
3093                                                                           &error,
3094                                                                           issync,
3095                                                                           trans_size);
3096                                         if (indeadlock) {
3097                                                 VN_RELE(ITOV(ip));
3098                                                 goto again;
3099                                         }
3100                                         rw_enter(&ufsvfsp->vfs_dqrwlock,
3101                                             RW_READER);
3102                                         rw_enter(&ip->i_contents, RW_WRITER);
3103                                         (void) ufs_itrunc(ip, 0, 0,
3104                                             cr);
3105                                         rw_exit(&ip->i_rwlock);
3106                                 }
3107
3108                         }
3109                         if (error == 0) {
3110                                 vnevent_create(ITOV(ip), ct);
3111                         }
3112                 }
3113         }
3114
3115         if (error) {
3116                 if (ip != NULL) {
3117                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3118                         rw_exit(&ip->i_contents);
3119                 }
3120                 goto unlock;
3121         }
3122
3123         *vpp = ITOV(ip);
3124         ITIMES(ip);
3125         rw_exit(&ip->i_contents);
3126         rw_exit(&ufsvfsp->vfs_dqrwlock);
3127
3128         /*
3129          * If vnode is a device return special vnode instead.
3130          */
3131         if (!error && IS_DEVVP(*vpp)) {
3132                 struct vnode *newvp;
3133
3134                 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
3135                 VN_RELE(*vpp);
3136                 if (newvp == NULL) {
3137                         error = ENOSYS;
3138                         goto unlock;
3139                 }
3140                 truncflag = 0;
3141                 *vpp = newvp;
3142         }
3143 unlock:
3144
3145         /*
3146          * Do the deferred update of the parent directory's sequence
3147          * number now.
3148          */
3149         if (defer_dip_seq_update == 1) {
3150                 rw_enter(&dip->i_contents, RW_READER);
3151                 mutex_enter(&dip->i_tlock);
3152                 dip->i_seq++;
3153                 mutex_exit(&dip->i_tlock);
3154                 rw_exit(&dip->i_contents);
3155         }
3156
3157         if (ulp) {
3158                 int terr = 0;
3159
3160                 TRANS_END_CSYNC(ufsvfsp, &terr, issync, TOP_CREATE,
3161                                 trans_size);
3162
3163                 /*
3164                  * If we haven't had a more interesting failure
3165                  * already, then anything that might've happened
3166                  * here should be reported.
3167                  */
3168                 if (error == 0)
3169                         error = terr;
3170         }
3171
3172         if (!error && truncflag) {
3173                 indeadlock = ufs_tryirwlock(ulp, &ip->i_rwlock, RW_WRITER);
3174                 if (indeadlock) {
3175                         if (ulp)
3176                                 ufs_lockfs_end(ulp);
3177                         VN_RELE(ITOV(ip));
3178                         goto again;
3179                 }
3180                 (void) TRANS_ITRUNC(ip, 0, 0, cr);
3181                 rw_exit(&ip->i_rwlock);
3182         }
3183
3184         if (ulp)
3185                 ufs_lockfs_end(ulp);
3186
3187         /*
3188          * If no inodes available, try to free one up out of the
3189          * pending delete queue.
3190          */
3191         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3192                 ufs_delete_drain_wait(ufsvfsp, 1);
3193                 retry = 0;
3194                 goto again;
3195         }
3196
3197 out:
3198         return (error);
3199 }
3200
3201 extern int ufs_idle_max;
3202 /*ARGSUSED*/
3203 static int
3204 ufs_remove(struct vnode *vp, char *nm, struct cred *cr,
3205         caller_context_t *ct, int flags)
3206 {
3207         struct inode *ip = VTOI(vp);
3208         struct ufsvfs *ufsvfsp  = ip->i_ufsvfs;
3209         struct ulockfs *ulp;
3210         vnode_t *rmvp = NULL;   /* Vnode corresponding to name being removed */
3211         int indeadlock;
3212         int error;
3213         int issync;
3214         int trans_size;
3215
3216         /*
3217          * don't let the delete queue get too long
3218          */
3219         if (ufsvfsp == NULL) {
3220                 error = EIO;
3221                 goto out;
3222         }
3223         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3224                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3225
3226         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3227         if (rmvp != NULL) {
3228                 /* Only send the event if there were no errors */
3229                 if (error == 0)
3230                         vnevent_remove(rmvp, vp, nm, ct);
3231                 VN_RELE(rmvp);
3232         }
3233
3234 retry_remove:
3235         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK);
3236         if (error)
3237                 goto out;
3238
3239         if (ulp)
3240                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_REMOVE,
3241                                   trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp)));
3242
3243         /*
3244          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3245          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3246          * possible, retries the operation.
3247          */
3248         indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock, RW_WRITER,
3249                                           TOP_REMOVE, ufsvfsp, &error,
3250                                           issync, trans_size);
3251         if (indeadlock)
3252                 goto retry_remove;
3253         error = ufs_dirremove(ip, nm, NULL, NULL, DR_REMOVE, cr);
3254         rw_exit(&ip->i_rwlock);
3255
3256         if (ulp) {
3257                 TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_REMOVE,
3258                                 trans_size);
3259                 ufs_lockfs_end(ulp);
3260         }
3261
3262 out:
3263         return (error);
3264 }
3265
3266 /*
3267  * Link a file or a directory.  Only privileged processes are allowed to
3268  * make links to directories.
3269  */
3270 /*ARGSUSED*/
3271 static int
3272 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr,
3273         caller_context_t *ct, int flags)
3274 {
3275         struct inode *sip;
3276         struct inode *tdp = VTOI(tdvp);
3277         struct ufsvfs *ufsvfsp = tdp->i_ufsvfs;
3278         struct ulockfs *ulp;
3279         struct vnode *realvp;
3280         int error;
3281         int issync;
3282         int trans_size;
3283         int isdev;
3284         int indeadlock;
3285
3286 retry_link:
3287         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK);
3288         if (error)
3289                 goto out;
3290
3291         if (ulp)
3292                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_LINK,
3293                                   trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp)));
3294
3295         if (fop_realvp(svp, &realvp, ct) == 0)
3296                 svp = realvp;
3297
3298         /*
3299          * Make sure link for extended attributes is valid
3300          * We only support hard linking of attr in ATTRDIR to ATTRDIR
3301          *
3302          * Make certain we don't attempt to look at a device node as
3303          * a ufs inode.
3304          */
3305
3306         isdev = IS_DEVVP(svp);
3307         if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) &&
3308             ((tdp->i_mode & IFMT) == IFATTRDIR)) ||
3309             ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) &&
3310             ((tdp->i_mode & IFMT) == IFDIR))) {
3311                 error = EINVAL;
3312                 goto unlock;
3313         }
3314
3315         sip = VTOI(svp);
3316         if ((svp->v_type == VDIR &&
3317             secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) ||
3318             (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) {
3319                 error = EPERM;
3320                 goto unlock;
3321         }
3322
3323         /*
3324          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3325          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3326          * possible, retries the operation.
3327          */
3328         indeadlock = ufs_tryirwlock_trans(ulp, &tdp->i_rwlock, RW_WRITER,
3329                                           TOP_LINK, ufsvfsp, &error, issync,
3330                                           trans_size);
3331         if (indeadlock)
3332                 goto retry_link;
3333         error = ufs_direnter_lr(tdp, tnm, DE_LINK, NULL, sip, cr);
3334         rw_exit(&tdp->i_rwlock);
3335
3336 unlock:
3337         if (ulp) {
3338                 TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_LINK, trans_size);
3339                 ufs_lockfs_end(ulp);
3340         }
3341
3342         if (!error) {
3343                 vnevent_link(svp, ct);
3344         }
3345 out:
3346         return (error);
3347 }
3348
/*
 * Counters and back-off tunable used by ufs_rename().
 *
 * ufs_rename_retry_cnt is bumped each time the second directory lock could
 * not be taken (without SLOCK set) and the lock order is reversed.
 * ufs_rename_upgrade_retry_cnt is bumped each time rw_tryupgrade() on a
 * directory i_rwlock fails and the locks are dropped for a retry.
 * ufs_rename_dircheck_retry_cnt is bumped each time ufs_dircheckpath()
 * returns EAGAIN and the locks are dropped for a retry.
 * ufs_rename_backoff_delay is the value passed to delay() before the
 * upgrade and dircheck retries.
 */
uint64_t ufs_rename_retry_cnt;
uint64_t ufs_rename_upgrade_retry_cnt;
uint64_t ufs_rename_dircheck_retry_cnt;
clock_t  ufs_rename_backoff_delay = 1;
3353
3354 /*
3355  * Rename a file or directory.
3356  * We are given the vnode and entry string of the source and the
3357  * vnode and entry string of the place we want to move the source
3358  * to (the target). The essential operation is:
3359  *      unlink(target);
3360  *      link(source, target);
3361  *      unlink(source);
3362  * but "atomically".  Can't do full commit without saving state in
3363  * the inode on disk, which isn't feasible at this time.  Best we
3364  * can do is always guarantee that the TARGET exists.
3365  */
3366
3367 /*ARGSUSED*/
3368 static int
3369 ufs_rename(
3370         struct vnode *sdvp,             /* old (source) parent vnode */
3371         char *snm,                      /* old (source) entry name */
3372         struct vnode *tdvp,             /* new (target) parent vnode */
3373         char *tnm,                      /* new (target) entry name */
3374         struct cred *cr,
3375         caller_context_t *ct,
3376         int flags)
3377 {
3378         struct inode *sip = NULL;       /* source inode */
3379         struct inode *ip = NULL;        /* check inode */
3380         struct inode *sdp;              /* old (source) parent inode */
3381         struct inode *tdp;              /* new (target) parent inode */
3382         struct vnode *svp = NULL;       /* source vnode */
3383         struct vnode *tvp = NULL;       /* target vnode, if it exists */
3384         struct vnode *realvp;
3385         struct ufsvfs *ufsvfsp;
3386         struct ulockfs *ulp = NULL;
3387         struct ufs_slot slot;
3388         timestruc_t now;
3389         int error;
3390         int issync;
3391         int trans_size;
3392         krwlock_t *first_lock;
3393         krwlock_t *second_lock;
3394         krwlock_t *reverse_lock;
3395         int serr, terr;
3396
3397         sdp = VTOI(sdvp);
3398         slot.fbp = NULL;
3399         ufsvfsp = sdp->i_ufsvfs;
3400
3401         if (fop_realvp(tdvp, &realvp, ct) == 0)
3402                 tdvp = realvp;
3403
3404         /* Must do this before taking locks in case of DNLC miss */
3405         terr = ufs_eventlookup(tdvp, tnm, cr, &tvp);
3406         serr = ufs_eventlookup(sdvp, snm, cr, &svp);
3407
3408         if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) {
3409                 if (tvp != NULL)
3410                         vnevent_pre_rename_dest(tvp, tdvp, tnm, ct);
3411
3412                 /*
3413                  * Notify the target directory of the rename event
3414                  * if source and target directories are not the same.
3415                  */
3416                 if (sdvp != tdvp)
3417                         vnevent_pre_rename_dest_dir(tdvp, svp, tnm, ct);
3418
3419                 if (svp != NULL)
3420                         vnevent_pre_rename_src(svp, sdvp, snm, ct);
3421         }
3422
3423         if (svp != NULL)
3424                 VN_RELE(svp);
3425
3426 retry_rename:
3427         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
3428         if (error)
3429                 goto unlock;
3430
3431         if (ulp)
3432                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_RENAME,
3433                                   trans_size = (int)TOP_RENAME_SIZE(sdp));
3434
3435         if (fop_realvp(tdvp, &realvp, ct) == 0)
3436                 tdvp = realvp;
3437
3438         tdp = VTOI(tdvp);
3439
3440         /*
3441          * We only allow renaming of attributes from ATTRDIR to ATTRDIR.
3442          */
3443         if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) {
3444                 error = EINVAL;
3445                 goto unlock;
3446         }
3447
3448         /*
3449          * Check accessibility of directory.
3450          */
3451         if (error = ufs_diraccess(sdp, IEXEC, cr))
3452                 goto unlock;
3453
3454         /*
3455          * Look up inode of file we're supposed to rename.
3456          */
3457         gethrestime(&now);
3458         if (error = ufs_dirlook(sdp, snm, &sip, cr, 0, 0)) {
3459                 if (error == EAGAIN) {
3460                         if (ulp) {
3461                                 TRANS_END_CSYNC(ufsvfsp, &error, issync,
3462                                                 TOP_RENAME, trans_size);
3463                                 ufs_lockfs_end(ulp);
3464                         }
3465                         goto retry_rename;
3466                 }
3467
3468                 goto unlock;
3469         }
3470
3471         /*
3472          * Lock both the source and target directories (they may be
3473          * the same) to provide the atomicity semantics that was
3474          * previously provided by the per file system vfs_rename_lock
3475          *
3476          * with vfs_rename_lock removed to allow simultaneous renames
3477          * within a file system, ufs_dircheckpath can deadlock while
3478          * traversing back to ensure that source is not a parent directory
3479          * of target parent directory. This is because we get into
3480          * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER.
3481          * If the tdp and sdp of the simultaneous renames happen to be
3482          * in the path of each other, it can lead to a deadlock. This
3483          * can be avoided by getting the locks as RW_READER here and then
3484          * upgrading to RW_WRITER after completing the ufs_dircheckpath.
3485          *
3486          * We hold the target directory's i_rwlock after calling
3487          * ufs_lockfs_begin but in many other operations (like ufs_readdir)
3488          * fop_rwlock is explicitly called by the filesystem independent code
3489          * before calling the file system operation. In these cases the order
3490          * is reversed (i.e i_rwlock is taken first and then ufs_lockfs_begin
3491          * is called). This is fine as long as ufs_lockfs_begin acts as a VOP
3492          * counter but with ufs_quiesce setting the SLOCK bit this becomes a
3493          * synchronizing object which might lead to a deadlock. So we use
3494          * rw_tryenter instead of rw_enter. If we fail to get this lock and
3495          * find that SLOCK bit is set, we call ufs_lockfs_end and restart the
3496          * operation.
3497          */
3498 retry:
3499         first_lock = &tdp->i_rwlock;
3500         second_lock = &sdp->i_rwlock;
3501 retry_firstlock:
3502         if (!rw_tryenter(first_lock, RW_READER)) {
3503                 /*
3504                  * We didn't get the lock. Check if the SLOCK is set in the
3505                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3506                  * and wait for SLOCK to be cleared.
3507                  */
3508
3509                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3510                         TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_RENAME,
3511                                         trans_size);
3512                         ufs_lockfs_end(ulp);
3513                         goto retry_rename;
3514
3515                 } else {
3516                         /*
3517                          * SLOCK isn't set so this is a genuine synchronization
3518                          * case. Let's try again after giving them a breather.
3519                          */
3520                         delay(RETRY_LOCK_DELAY);
3521                         goto  retry_firstlock;
3522                 }
3523         }
3524         /*
3525          * Need to check if the tdp and sdp are same !!!
3526          */
3527         if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) {
3528                 /*
3529                  * We didn't get the lock. Check if the SLOCK is set in the
3530                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3531                  * and wait for SLOCK to be cleared.
3532                  */
3533
3534                 rw_exit(first_lock);
3535                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3536                         TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_RENAME,
3537                                         trans_size);
3538                         ufs_lockfs_end(ulp);
3539                         goto retry_rename;
3540
3541                 } else {
3542                         /*
                         * So we couldn't get the second level peer lock *and*
                         * the SLOCK bit isn't set. We may be contending with
                         * someone who wants these locks the other way round.
                         * Reverse the lock order in case there is heavy
                         * contention for the second level lock.
3548                          */
3549                         reverse_lock = first_lock;
3550                         first_lock = second_lock;
3551                         second_lock = reverse_lock;
3552                         ufs_rename_retry_cnt++;
3553                         goto  retry_firstlock;
3554                 }
3555         }
3556
3557         if (sip == tdp) {
3558                 error = EINVAL;
3559                 goto errout;
3560         }
3561         /*
3562          * Make sure we can delete the source entry.  This requires
3563          * write permission on the containing directory.
3564          * Check for sticky directories.
3565          */
3566         rw_enter(&sdp->i_contents, RW_READER);
3567         rw_enter(&sip->i_contents, RW_READER);
3568         if ((error = ufs_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
3569             (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) {
3570                 rw_exit(&sip->i_contents);
3571                 rw_exit(&sdp->i_contents);
3572                 goto errout;
3573         }
3574
3575         /*
3576          * If this is a rename of a directory and the parent is
3577          * different (".." must be changed), then the source
3578          * directory must not be in the directory hierarchy
3579          * above the target, as this would orphan everything
3580          * below the source directory.  Also the user must have
3581          * write permission in the source so as to be able to
3582          * change "..".
3583          */
3584         if ((((sip->i_mode & IFMT) == IFDIR) ||
3585             ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) {
3586                 ino_t   inum;
3587
3588                 if (error = ufs_iaccess(sip, IWRITE, cr, 0)) {
3589                         rw_exit(&sip->i_contents);
3590                         rw_exit(&sdp->i_contents);
3591                         goto errout;
3592                 }
3593                 inum = sip->i_number;
3594                 rw_exit(&sip->i_contents);
3595                 rw_exit(&sdp->i_contents);
3596                 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) {
3597                         /*
3598                          * If we got EAGAIN ufs_dircheckpath detected a
3599                          * potential deadlock and backed out. We need
3600                          * to retry the operation since sdp and tdp have
3601                          * to be released to avoid the deadlock.
3602                          */
3603                         if (error == EAGAIN) {
3604                                 rw_exit(&tdp->i_rwlock);
3605                                 if (tdp != sdp)
3606                                         rw_exit(&sdp->i_rwlock);
3607                                 delay(ufs_rename_backoff_delay);
3608                                 ufs_rename_dircheck_retry_cnt++;
3609                                 goto retry;
3610                         }
3611                         goto errout;
3612                 }
3613         } else {
3614                 rw_exit(&sip->i_contents);
3615                 rw_exit(&sdp->i_contents);
3616         }
3617
3618
3619         /*
3620          * Check for renaming '.' or '..' or alias of '.'
3621          */
3622         if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) {
3623                 error = EINVAL;
3624                 goto errout;
3625         }
3626
3627         /*
3628          * Simultaneous renames can deadlock in ufs_dircheckpath since it
3629          * tries to traverse back the file tree with both tdp and sdp held
3630          * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks
3631          * as RW_READERS  till ufs_dircheckpath is done.
3632          * Now that ufs_dircheckpath is done with, we can upgrade the locks
3633          * to RW_WRITER.
3634          */
3635         if (!rw_tryupgrade(&tdp->i_rwlock)) {
3636                 /*
3637                  * The upgrade failed. We got to give away the lock
3638                  * as to avoid deadlocking with someone else who is
3639                  * waiting for writer lock. With the lock gone, we
3640                  * cannot be sure the checks done above will hold
3641                  * good when we eventually get them back as writer.
3642                  * So if we can't upgrade we drop the locks and retry
3643                  * everything again.
3644                  */
3645                 rw_exit(&tdp->i_rwlock);
3646                 if (tdp != sdp)
3647                         rw_exit(&sdp->i_rwlock);
3648                 delay(ufs_rename_backoff_delay);
3649                 ufs_rename_upgrade_retry_cnt++;
3650                 goto retry;
3651         }
3652         if (tdp != sdp) {
3653                 if (!rw_tryupgrade(&sdp->i_rwlock)) {
3654                         /*
3655                          * The upgrade failed. We got to give away the lock
3656                          * as to avoid deadlocking with someone else who is
3657                          * waiting for writer lock. With the lock gone, we
3658                          * cannot be sure the checks done above will hold
3659                          * good when we eventually get them back as writer.
3660                          * So if we can't upgrade we drop the locks and retry
3661                          * everything again.
3662                          */
3663                         rw_exit(&tdp->i_rwlock);
3664                         rw_exit(&sdp->i_rwlock);
3665                         delay(ufs_rename_backoff_delay);
3666                         ufs_rename_upgrade_retry_cnt++;
3667                         goto retry;
3668                 }
3669         }
3670
3671         /*
3672          * Now that all the locks are held check to make sure another thread
3673          * didn't slip in and take out the sip.
3674          */
3675         slot.status = NONE;
3676         if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec ||
3677             sip->i_ctime.tv_sec > now.tv_sec) {
3678                 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
3679                 rw_enter(&sdp->i_contents, RW_WRITER);
3680                 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot,
3681                     &ip, cr, 0);
3682                 rw_exit(&sdp->i_contents);
3683                 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock);
3684                 if (error) {
3685                         goto errout;
3686                 }
3687                 if (ip == NULL) {
3688                         error = ENOENT;
3689                         goto errout;
3690                 } else {
3691                         /*
3692                          * If the inode was found need to drop the v_count
3693                          * so as not to keep the filesystem from being
3694                          * unmounted at a later time.
3695                          */
3696                         VN_RELE(ITOV(ip));
3697                 }
3698
3699                 /*
3700                  * Release the slot.fbp that has the page mapped and
3701                  * locked SE_SHARED, and could be used in in
3702                  * ufs_direnter_lr() which needs to get the SE_EXCL lock
3703                  * on said page.
3704                  */
3705                 if (slot.fbp) {
3706                         fbrelse(slot.fbp, S_OTHER);
3707                         slot.fbp = NULL;
3708                 }
3709         }
3710
3711         /*
3712          * Link source to the target.
3713          */
3714         if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr)) {
3715                 /*
3716                  * ESAME isn't really an error; it indicates that the
3717                  * operation should not be done because the source and target
3718                  * are the same file, but that no error should be reported.
3719                  */
3720                 if (error == ESAME)
3721                         error = 0;
3722                 goto errout;
3723         }
3724
3725         if (error == 0 && tvp != NULL)
3726                 vnevent_rename_dest(tvp, tdvp, tnm, ct);
3727
3728         /*
3729          * Unlink the source.
3730          * Remove the source entry.  ufs_dirremove() checks that the entry
3731          * still reflects sip, and returns an error if it doesn't.
3732          * If the entry has changed just forget about it.  Release
3733          * the source inode.
3734          */
3735         if ((error = ufs_dirremove(sdp, snm, sip, NULL,
3736             DR_RENAME, cr)) == ENOENT)
3737                 error = 0;
3738
3739         if (error == 0) {
3740                 vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
3741                 /*
3742                  * Notify the target directory of the rename event
3743                  * if source and target directories are not the same.
3744                  */
3745                 if (sdvp != tdvp)
3746                         vnevent_rename_dest_dir(tdvp, ct);
3747         }
3748
3749 errout:
3750         if (slot.fbp)
3751                 fbrelse(slot.fbp, S_OTHER);
3752
3753         rw_exit(&tdp->i_rwlock);
3754         if (sdp != tdp) {
3755                 rw_exit(&sdp->i_rwlock);
3756         }
3757
3758 unlock:
3759         if (tvp != NULL)
3760                 VN_RELE(tvp);
3761         if (sip != NULL)
3762                 VN_RELE(ITOV(sip));
3763
3764         if (ulp) {
3765                 TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_RENAME,
3766                                 trans_size);
3767                 ufs_lockfs_end(ulp);
3768         }
3769
3770         return (error);
3771 }
3772
3773 /*ARGSUSED*/
3774 static int
3775 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
3776         struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
3777         vsecattr_t *vsecp)
3778 {
3779         struct inode *ip;
3780         struct inode *xip;
3781         struct ufsvfs *ufsvfsp;
3782         struct ulockfs *ulp;
3783         int error;
3784         int issync;
3785         int trans_size;
3786         int indeadlock;
3787         int retry = 1;
3788
3789         ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
3790
3791         /*
3792          * Can't make directory in attr hidden dir
3793          */
3794         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
3795                 return (EINVAL);
3796
3797 again:
3798         ip = VTOI(dvp);
3799         ufsvfsp = ip->i_ufsvfs;
3800         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3801         if (error)
3802                 goto out;
3803         if (ulp)
3804                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_MKDIR,
3805                                   trans_size = (int)TOP_MKDIR_SIZE(ip));
3806
3807         /*
3808          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3809          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3810          * possible, retries the operation.
3811          */
3812         indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock, RW_WRITER,
3813                                           TOP_MKDIR, ufsvfsp, &error, issync,
3814                                           trans_size);
3815         if (indeadlock)
3816                 goto again;
3817
3818         error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr,
3819             (retry ? IQUIET : 0));
3820         if (error == EAGAIN) {
3821                 if (ulp) {
3822                         TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_MKDIR,
3823                                         trans_size);
3824                         ufs_lockfs_end(ulp);
3825                 }
3826                 goto again;
3827         }
3828
3829         rw_exit(&ip->i_rwlock);
3830         if (error == 0) {
3831                 ip = xip;
3832                 *vpp = ITOV(ip);
3833         } else if (error == EEXIST)
3834                 VN_RELE(ITOV(xip));
3835
3836         if (ulp) {
3837                 int terr = 0;
3838                 TRANS_END_CSYNC(ufsvfsp, &terr, issync, TOP_MKDIR, trans_size);
3839                 ufs_lockfs_end(ulp);
3840                 if (error == 0)
3841                         error = terr;
3842         }
3843 out:
3844         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3845                 ufs_delete_drain_wait(ufsvfsp, 1);
3846                 retry = 0;
3847                 goto again;
3848         }
3849
3850         return (error);
3851 }
3852
3853 /*ARGSUSED*/
3854 static int
3855 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr,
3856         caller_context_t *ct, int flags)
3857 {
3858         struct inode *ip = VTOI(vp);
3859         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
3860         struct ulockfs *ulp;
3861         vnode_t *rmvp = NULL;   /* Vnode of removed directory */
3862         int error;
3863         int issync;
3864         int trans_size;
3865         int indeadlock;
3866
3867         /*
3868          * don't let the delete queue get too long
3869          */
3870         if (ufsvfsp == NULL) {
3871                 error = EIO;
3872                 goto out;
3873         }
3874         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3875                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3876
3877         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3878         if (rmvp != NULL) {
3879                 /* Only send the event if there were no errors */
3880                 if (error == 0)
3881                         vnevent_rmdir(rmvp, vp, nm, ct);
3882                 VN_RELE(rmvp);
3883         }
3884
3885 retry_rmdir:
3886         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK);
3887         if (error)
3888                 goto out;
3889
3890         if (ulp)
3891                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_RMDIR,
3892                                   trans_size = TOP_RMDIR_SIZE);
3893
3894         /*
3895          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3896          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3897          * possible, retries the operation.
3898          */
3899         indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock, RW_WRITER,
3900                                           TOP_RMDIR, ufsvfsp, &error, issync,
3901                                           trans_size);
3902         if (indeadlock)
3903                 goto retry_rmdir;
3904         error = ufs_dirremove(ip, nm, NULL, cdir, DR_RMDIR, cr);
3905
3906         rw_exit(&ip->i_rwlock);
3907
3908         if (ulp) {
3909                 TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_RMDIR,
3910                                 trans_size);
3911                 ufs_lockfs_end(ulp);
3912         }
3913
3914 out:
3915         return (error);
3916 }
3917
3918 /* ARGSUSED */
3919 static int
3920 ufs_readdir(
3921         struct vnode *vp,
3922         struct uio *uiop,
3923         struct cred *cr,
3924         int *eofp,
3925         caller_context_t *ct,
3926         int flags)
3927 {
3928         struct iovec *iovp;
3929         struct inode *ip;
3930         struct direct *idp;
3931         struct dirent64 *odp;
3932         struct fbuf *fbp;
3933         struct ufsvfs *ufsvfsp;
3934         struct ulockfs *ulp;
3935         caddr_t outbuf;
3936         size_t bufsize;
3937         uint_t offset;
3938         uint_t bytes_wanted, total_bytes_wanted;
3939         int incount = 0;
3940         int outcount = 0;
3941         int error;
3942
3943         ip = VTOI(vp);
3944         ASSERT(RW_READ_HELD(&ip->i_rwlock));
3945
3946         if (uiop->uio_loffset >= MAXOFF32_T) {
3947                 if (eofp)
3948                         *eofp = 1;
3949                 return (0);
3950         }
3951
3952         /*
3953          * Check if we have been called with a valid iov_len
3954          * and bail out if not, otherwise we may potentially loop
3955          * forever further down.
3956          */
3957         if (uiop->uio_iov->iov_len <= 0) {
3958                 error = EINVAL;
3959                 goto out;
3960         }
3961
3962         /*
3963          * Large Files: When we come here we are guaranteed that
3964          * uio_offset can be used safely. The high word is zero.
3965          */
3966
3967         ufsvfsp = ip->i_ufsvfs;
3968         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK);
3969         if (error)
3970                 goto out;
3971
3972         iovp = uiop->uio_iov;
3973         total_bytes_wanted = iovp->iov_len;
3974
3975         /* Large Files: directory files should not be "large" */
3976
3977         ASSERT(ip->i_size <= MAXOFF32_T);
3978
3979         /* Force offset to be valid (to guard against bogus lseek() values) */
3980         offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1);
3981
3982         /* Quit if at end of file or link count of zero (posix) */
3983         if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) {
3984                 if (eofp)
3985                         *eofp = 1;
3986                 error = 0;
3987                 goto unlock;
3988         }
3989
3990         /*
3991          * Get space to change directory entries into fs independent format.
3992          * Do fast alloc for the most commonly used-request size (filesystem
3993          * block size).
3994          */
3995         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) {
3996                 bufsize = total_bytes_wanted;
3997                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
3998                 odp = (struct dirent64 *)outbuf;
3999         } else {
4000                 bufsize = total_bytes_wanted;
4001                 odp = (struct dirent64 *)iovp->iov_base;
4002         }
4003
4004 nextblk:
4005         bytes_wanted = total_bytes_wanted;
4006
4007         /* Truncate request to file size */
4008         if (offset + bytes_wanted > (int)ip->i_size)
4009                 bytes_wanted = (int)(ip->i_size - offset);
4010
4011         /* Comply with MAXBSIZE boundary restrictions of fbread() */
4012         if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
4013                 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
4014
4015         /*
4016          * Read in the next chunk.
4017          * We are still holding the i_rwlock.
4018          */
4019         error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp);
4020
4021         if (error)
4022                 goto update_inode;
4023         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) &&
4024             (!ufsvfsp->vfs_noatime)) {
4025                 ip->i_flag |= IACC;
4026         }
4027         incount = 0;
4028         idp = (struct direct *)fbp->fb_addr;
4029         if (idp->d_ino == 0 && idp->d_reclen == 0 && idp->d_namlen == 0) {
4030                 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, "
4031                     "fs = %s\n",
4032                     (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt);
4033                 fbrelse(fbp, S_OTHER);
4034                 error = ENXIO;
4035                 goto update_inode;
4036         }
4037         /* Transform to file-system independent format */
4038         while (incount < bytes_wanted) {
4039                 /*
4040                  * If the current directory entry is mangled, then skip
4041                  * to the next block.  It would be nice to set the FSBAD
4042                  * flag in the super-block so that a fsck is forced on
4043                  * next reboot, but locking is a problem.
4044                  */
4045                 if (idp->d_reclen & 0x3) {
4046                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4047                         break;
4048                 }
4049
4050                 /* Skip to requested offset and skip empty entries */
4051                 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) {
4052                         ushort_t this_reclen =
4053                             DIRENT64_RECLEN(idp->d_namlen);
4054                         /* Buffer too small for any entries */
4055                         if (!outcount && this_reclen > bufsize) {
4056                                 fbrelse(fbp, S_OTHER);
4057                                 error = EINVAL;
4058                                 goto update_inode;
4059                         }
4060                         /* If would overrun the buffer, quit */
4061                         if (outcount + this_reclen > bufsize) {
4062                                 break;
4063                         }
4064                         /* Take this entry */
4065                         odp->d_ino = (ino64_t)idp->d_ino;
4066                         odp->d_reclen = (ushort_t)this_reclen;
4067                         odp->d_off = (offset_t)(offset + idp->d_reclen);
4068
4069                         /* use strncpy(9f) to zero out uninitialized bytes */
4070
4071                         ASSERT(strlen(idp->d_name) + 1 <=
4072                             DIRENT64_NAMELEN(this_reclen));
4073                         (void) strncpy(odp->d_name, idp->d_name,
4074                             DIRENT64_NAMELEN(this_reclen));
4075                         outcount += odp->d_reclen;
4076                         odp = (struct dirent64 *)
4077                             ((intptr_t)odp + odp->d_reclen);
4078                         ASSERT(outcount <= bufsize);
4079                 }
4080                 if (idp->d_reclen) {
4081                         incount += idp->d_reclen;
4082                         offset += idp->d_reclen;
4083                         idp = (struct direct *)((intptr_t)idp + idp->d_reclen);
4084                 } else {
4085                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4086                         break;
4087                 }
4088         }
4089         /* Release the chunk */
4090         fbrelse(fbp, S_OTHER);
4091
4092         /* Read whole block, but got no entries, read another if not eof */
4093
4094         /*
4095          * Large Files: casting i_size to int here is not a problem
4096          * because directory sizes are always less than MAXOFF32_T.
4097          * See assertion above.
4098          */
4099
4100         if (offset < (int)ip->i_size && !outcount)
4101                 goto nextblk;
4102
4103         /* Copy out the entry data */
4104         if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) {
4105                 iovp->iov_base += outcount;
4106                 iovp->iov_len -= outcount;
4107                 uiop->uio_resid -= outcount;
4108                 uiop->uio_offset = offset;
4109         } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
4110             uiop)) == 0)
4111                 uiop->uio_offset = offset;
4112 update_inode:
4113         ITIMES(ip);
4114         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1)
4115                 kmem_free(outbuf, bufsize);
4116
4117         if (eofp && error == 0)
4118                 *eofp = (uiop->uio_offset >= (int)ip->i_size);
4119 unlock:
4120         if (ulp) {
4121                 ufs_lockfs_end(ulp);
4122         }
4123 out:
4124         return (error);
4125 }
4126
4127 /*ARGSUSED*/
4128 static int
4129 ufs_symlink(
4130         struct vnode *dvp,              /* ptr to parent dir vnode */
4131         char *linkname,                 /* name of symbolic link */
4132         struct vattr *vap,              /* attributes */
4133         char *target,                   /* target path */
4134         struct cred *cr,                /* user credentials */
4135         caller_context_t *ct,
4136         int flags)
4137 {
4138         struct inode *ip, *dip = VTOI(dvp);
4139         struct ufsvfs *ufsvfsp = dip->i_ufsvfs;
4140         struct ulockfs *ulp;
4141         int error;
4142         int issync;
4143         int trans_size;
4144         int residual;
4145         int ioflag;
4146         int retry = 1;
4147
4148         /*
4149          * No symlinks in attrdirs at this time
4150          */
4151         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
4152                 return (EINVAL);
4153
4154 again:
4155         ip = NULL;
4156         vap->va_type = VLNK;
4157         vap->va_rdev = 0;
4158
4159         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK);
4160         if (error)
4161                 goto out;
4162
4163         if (ulp)
4164                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_SYMLINK,
4165                                   trans_size = (int)TOP_SYMLINK_SIZE(dip));
4166
4167         /*
4168          * We must create the inode before the directory entry, to avoid
4169          * racing with readlink().  ufs_dirmakeinode requires that we
4170          * hold the quota lock as reader, and directory locks as writer.
4171          */
4172
4173         rw_enter(&dip->i_rwlock, RW_WRITER);
4174         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4175         rw_enter(&dip->i_contents, RW_WRITER);
4176
4177         /*
4178          * Suppress any out of inodes messages if we will retry on
4179          * ENOSP
4180          */
4181         if (retry)
4182                 dip->i_flag |= IQUIET;
4183
4184         error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr);
4185
4186         dip->i_flag &= ~IQUIET;
4187
4188         rw_exit(&dip->i_contents);
4189         rw_exit(&ufsvfsp->vfs_dqrwlock);
4190         rw_exit(&dip->i_rwlock);
4191
4192         if (error)
4193                 goto unlock;
4194
4195         /*
4196          * OK.  The inode has been created.  Write out the data of the
4197          * symbolic link.  Since symbolic links are metadata, and should
4198          * remain consistent across a system crash, we need to force the
4199          * data out synchronously.
4200          *
4201          * (This is a change from the semantics in earlier releases, which
4202          * only created symbolic links synchronously if the semi-documented
4203          * 'syncdir' option was set, or if we were being invoked by the NFS
4204          * server, which requires symbolic links to be created synchronously.)
4205          *
4206          * We need to pass in a pointer for the residual length; otherwise
4207          * ufs_rdwri() will always return EIO if it can't write the data,
4208          * even if the error was really ENOSPC or EDQUOT.
4209          */
4210
4211         ioflag = FWRITE | FDSYNC;
4212         residual = 0;
4213
4214         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4215         rw_enter(&ip->i_contents, RW_WRITER);
4216
4217         /*
4218          * Suppress file system full messages if we will retry
4219          */
4220         if (retry)
4221                 ip->i_flag |= IQUIET;
4222
4223         error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target),
4224             0, UIO_SYSSPACE, &residual, cr);
4225
4226         ip->i_flag &= ~IQUIET;
4227
4228         if (error) {
4229                 rw_exit(&ip->i_contents);
4230                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4231                 goto remove;
4232         }
4233
4234         /*
4235          * If the link's data is small enough, we can cache it in the inode.
4236          * This is a "fast symbolic link".  We don't use the first direct
4237          * block because that's actually used to point at the symbolic link's
4238          * contents on disk; but we know that none of the other direct or
4239          * indirect blocks can be used because symbolic links are restricted
4240          * to be smaller than a file system block.
4241          */
4242
4243         ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip)));
4244
4245         if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) {
4246                 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) {
4247                         ip->i_flag |= IFASTSYMLNK;
4248                 } else {
4249                         int i;
4250                         /* error, clear garbage left behind */
4251                         for (i = 1; i < NDADDR; i++)
4252                                 ip->i_db[i] = 0;
4253                         for (i = 0; i < NIADDR; i++)
4254                                 ip->i_ib[i] = 0;
4255                 }
4256         }
4257
4258         rw_exit(&ip->i_contents);
4259         rw_exit(&ufsvfsp->vfs_dqrwlock);
4260
4261         /*
4262          * OK.  We've successfully created the symbolic link.  All that
4263          * remains is to insert it into the appropriate directory.
4264          */
4265
4266         rw_enter(&dip->i_rwlock, RW_WRITER);
4267         error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr);
4268         rw_exit(&dip->i_rwlock);
4269
4270         /*
4271          * Fall through into remove-on-error code.  We're either done, or we
4272          * need to remove the inode (if we couldn't insert it).
4273          */
4274
4275 remove:
4276         if (error && (ip != NULL)) {
4277                 rw_enter(&ip->i_contents, RW_WRITER);
4278                 ip->i_nlink--;
4279                 ip->i_flag |= ICHG;
4280                 ip->i_seq++;
4281                 ufs_setreclaim(ip);
4282                 rw_exit(&ip->i_contents);
4283         }
4284
4285 unlock:
4286         if (ip != NULL)
4287                 VN_RELE(ITOV(ip));
4288
4289         if (ulp) {
4290                 int terr = 0;
4291
4292                 TRANS_END_CSYNC(ufsvfsp, &terr, issync, TOP_SYMLINK,
4293                                 trans_size);
4294                 ufs_lockfs_end(ulp);
4295                 if (error == 0)
4296                         error = terr;
4297         }
4298
4299         /*
4300          * We may have failed due to lack of an inode or of a block to
4301          * store the target in.  Try flushing the delete queue to free
4302          * logically-available things up and try again.
4303          */
4304         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
4305                 ufs_delete_drain_wait(ufsvfsp, 1);
4306                 retry = 0;
4307                 goto again;
4308         }
4309
4310 out:
4311         return (error);
4312 }
4313
4314 /*
4315  * Ufs specific routine used to do ufs io.
4316  */
4317 int
4318 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base,
4319         ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
4320         struct cred *cr)
4321 {
4322         struct uio auio;
4323         struct iovec aiov;
4324         int error;
4325
4326         ASSERT(RW_LOCK_HELD(&ip->i_contents));
4327
4328         bzero((caddr_t)&auio, sizeof (uio_t));
4329         bzero((caddr_t)&aiov, sizeof (iovec_t));
4330
4331         aiov.iov_base = base;
4332         aiov.iov_len = len;
4333         auio.uio_iov = &aiov;
4334         auio.uio_iovcnt = 1;
4335         auio.uio_loffset = offset;
4336         auio.uio_segflg = (short)seg;
4337         auio.uio_resid = len;
4338
4339         if (rw == UIO_WRITE) {
4340                 auio.uio_fmode = FWRITE;
4341                 auio.uio_extflg = UIO_COPY_DEFAULT;
4342                 auio.uio_llimit = curproc->p_fsz_ctl;
4343                 error = wrip(ip, &auio, ioflag, cr);
4344         } else {
4345                 auio.uio_fmode = FREAD;
4346                 auio.uio_extflg = UIO_COPY_CACHED;
4347                 auio.uio_llimit = MAXOFFSET_T;
4348                 error = rdip(ip, &auio, ioflag, cr);
4349         }
4350
4351         if (aresid) {
4352                 *aresid = auio.uio_resid;
4353         } else if (auio.uio_resid) {
4354                 error = EIO;
4355         }
4356         return (error);
4357 }
4358
4359 /*ARGSUSED*/
4360 static int
4361 ufs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
4362 {
4363         struct ufid *ufid;
4364         struct inode *ip = VTOI(vp);
4365
4366         if (ip->i_ufsvfs == NULL)
4367                 return (EIO);
4368
4369         if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) {
4370                 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t);
4371                 return (ENOSPC);
4372         }
4373
4374         ufid = (struct ufid *)fidp;
4375         bzero((char *)ufid, sizeof (struct ufid));
4376         ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t);
4377         ufid->ufid_ino = ip->i_number;
4378         ufid->ufid_gen = ip->i_gen;
4379
4380         return (0);
4381 }
4382
4383 /* ARGSUSED2 */
4384 static int
4385 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4386 {
4387         struct inode    *ip = VTOI(vp);
4388         struct ufsvfs   *ufsvfsp;
4389         int             forcedirectio;
4390
4391         /*
4392          * Read case is easy.
4393          */
4394         if (!write_lock) {
4395                 rw_enter(&ip->i_rwlock, RW_READER);
4396                 return (V_WRITELOCK_FALSE);
4397         }
4398
4399         /*
4400          * Caller has requested a writer lock, but that inhibits any
4401          * concurrency in the VOPs that follow. Acquire the lock shared
4402          * and defer exclusive access until it is known to be needed in
4403          * other VOP handlers. Some cases can be determined here.
4404          */
4405
4406         /*
4407          * If directio is not set, there is no chance of concurrency,
4408          * so just acquire the lock exclusive. Beware of a forced
4409          * unmount before looking at the mount option.
4410          */
4411         ufsvfsp = ip->i_ufsvfs;
4412         forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0;
4413         if (!(ip->i_flag & IDIRECTIO || forcedirectio) ||
4414             !ufs_allow_shared_writes) {
4415                 rw_enter(&ip->i_rwlock, RW_WRITER);
4416                 return (V_WRITELOCK_TRUE);
4417         }
4418
4419         /*
4420          * Mandatory locking forces acquiring i_rwlock exclusive.
4421          */
4422         if (MANDLOCK(vp, ip->i_mode)) {
4423                 rw_enter(&ip->i_rwlock, RW_WRITER);
4424                 return (V_WRITELOCK_TRUE);
4425         }
4426
4427         /*
4428          * Acquire the lock shared in case a concurrent write follows.
4429          * Mandatory locking could have become enabled before the lock
4430          * was acquired. Re-check and upgrade if needed.
4431          */
4432         rw_enter(&ip->i_rwlock, RW_READER);
4433         if (MANDLOCK(vp, ip->i_mode)) {
4434                 rw_exit(&ip->i_rwlock);
4435                 rw_enter(&ip->i_rwlock, RW_WRITER);
4436                 return (V_WRITELOCK_TRUE);
4437         }
4438         return (V_WRITELOCK_FALSE);
4439 }
4440
4441 /*ARGSUSED*/
4442 static void
4443 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4444 {
4445         struct inode    *ip = VTOI(vp);
4446
4447         rw_exit(&ip->i_rwlock);
4448 }
4449
4450 /* ARGSUSED */
4451 static int
4452 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
4453         caller_context_t *ct)
4454 {
4455         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4456 }
4457
4458 /* ARGSUSED */
4459 static int
4460 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4461         offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
4462         caller_context_t *ct)
4463 {
4464         struct inode *ip = VTOI(vp);
4465
4466         if (ip->i_ufsvfs == NULL)
4467                 return (EIO);
4468
4469         /*
4470          * If file is being mapped, disallow frlock.
4471          * XXX I am not holding tlock while checking i_mapcnt because the
4472          * current locking strategy drops all locks before calling fs_frlock.
4473          * So, mapcnt could change before we enter fs_frlock making is
4474          * meaningless to have held tlock in the first place.
4475          */
4476         if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode))
4477                 return (EAGAIN);
4478         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4479 }
4480
4481 /* ARGSUSED */
4482 static int
4483 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4484         offset_t offset, cred_t *cr, caller_context_t *ct)
4485 {
4486         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
4487         struct ulockfs *ulp;
4488         int error;
4489
4490         if ((error = convoff(vp, bfp, 0, offset)) == 0) {
4491                 if (cmd == F_FREESP) {
4492                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4493                             ULOCKFS_SPACE_MASK);
4494                         if (error)
4495                                 return (error);
4496                         error = ufs_freesp(vp, bfp, flag, cr);
4497
4498                         if (error == 0 && bfp->l_start == 0)
4499                                 vnevent_truncate(vp, ct);
4500                 } else if (cmd == F_ALLOCSP) {
4501                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4502                             ULOCKFS_FALLOCATE_MASK);
4503                         if (error)
4504                                 return (error);
4505                         error = ufs_allocsp(vp, bfp, cr);
4506                 } else
4507                         return (EINVAL); /* Command not handled here */
4508
4509                 if (ulp)
4510                         ufs_lockfs_end(ulp);
4511
4512         }
4513         return (error);
4514 }
4515
4516 /*
4517  * Used to determine if read ahead should be done. Also used
4518  * to determine when write back occurs.
      *
      * Expands to the vfs_ioclustsz field of the ufsvfs that the inode's
      * i_ufsvfs pointer refers to, so i_ufsvfs must be non-NULL at the
      * point of use.
4519  */
4520 #define CLUSTSZ(ip)             ((ip)->i_ufsvfs->vfs_ioclustsz)
4521
4522 /*
4523  * A faster version of ufs_getpage.
4524  *
4525  * We optimize by inlining the pvn_getpages iterator, eliminating
4526  * calls to bmap_read if file doesn't have UFS holes, and avoiding
4527  * the overhead of page_exists().
4528  *
4529  * When a file has UFS holes and ufs_getpage is called with S_READ,
4530  * we set *protp to PROT_READ to avoid calling bmap_read. This approach
4531  * victimizes performance when a file with UFS holes is faulted
4532  * first in the S_READ mode, and then in the S_WRITE mode. We will get
4533  * two MMU faults in this case.
4534  *
4535  * XXX - the inode fields which control the sequential mode are not
4536  *       protected by any mutex. The read ahead will act wild if
4537  *       multiple processes will access the file concurrently and
4538  *       some of them in sequential mode. One particularly bad case
4539  *       is if another thread will change the value of i_nextrio between
4540  *       the time this thread tests the i_nextrio value and then reads it
4541  *       again to use it as the offset for the read ahead.
      *
      * Arguments, as used by the code below:
      *      vp, off, len    vnode and byte range [off, off + len) to look up;
      *                      off must be page aligned (asserted).
      *      protp           if non-NULL, set to PROT_ALL on entry; PROT_WRITE
      *                      is cleared when the file has holes and rw is
      *                      neither S_WRITE nor S_CREATE.
      *      plarr           if NULL, only read ahead is started for each page
      *                      of the range and no pages are returned; otherwise
      *                      it receives the pages, terminated by a NULL entry.
      *      plsz            room in plarr, in bytes; once the requested range
      *                      is satisfied, further pages already in the cache
      *                      are added without waiting until plsz is used up
      *                      (not done for S_WRITE/S_CREATE on a file with
      *                      holes).
      *      seg, addr       segment and address of the fault; handed to the
      *                      read ahead and miss routines.
      *      rw              access type; S_WRITE and S_CREATE may allocate
      *                      blocks for holes through bmap_write().
      *      cr              credentials handed to bmap_write().
      *
      * Returns 0 on success, otherwise:
      *      the error from ufs_lockfs_begin_getpage(),
      *      EDEADLK  if the async transaction would block and the fault is
      *               not from the kernel address space,
      *      ENOSYS   if the vnode has VNOMAP set,
      *      EFAULT   if the range extends beyond EOF (rounded up to a page)
      *               and seg is not segkmap,
      *      or the error from bmap_write() or ufs_getpage_miss().
      * When ufs_getpage_miss() fails, the pages collected so far are
      * unlocked and plarr[0] is set to NULL.
4542  */
4543 /*ARGSUSED*/
4544 static int
4545 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
4546         page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
4547         enum seg_rw rw, struct cred *cr, caller_context_t *ct)
4548 {
4549         uoff_t  uoff = (uoff_t)off; /* type conversion */
4550         uoff_t  pgoff;
4551         uoff_t  eoff;
4552         struct inode    *ip = VTOI(vp);
4553         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
4554         struct fs       *fs;
4555         struct ulockfs  *ulp;
4556         page_t          **pl;
4557         caddr_t         pgaddr;
4558         krw_t           rwtype;
4559         int             err;
4560         int             has_holes;
4561         int             beyond_eof;
4562         int             seqmode;
4563         int             pgsize = PAGESIZE;
4564         int             dolock;
4565         int             do_qlock;
4566         int             trans_size;
4567
4568         ASSERT((uoff & PAGEOFFSET) == 0);
4569
             /*
              * Start out granting every protection; PROT_WRITE may be
              * taken away further down if the file has holes.
              */
4570         if (protp)
4571                 *protp = PROT_ALL;
4572
4573         /*
4574          * Obey the lockfs protocol
4575          */
4576         err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
4577             rw == S_READ || rw == S_EXEC, protp);
4578         if (err)
4579                 goto out;
4580
4581         fs = ufsvfsp->vfs_fs;
4582
4583         if (ulp && (rw == S_CREATE || rw == S_WRITE) &&
4584             !(vp->v_flag & VISSWAP)) {
4585                 /*
4586                  * Try to start a transaction, will return if blocking is
4587                  * expected to occur and the address space is not the
4588                  * kernel address space.
4589                  */
4590                 trans_size = TOP_GETPAGE_SIZE(ip);
4591                 if (seg->s_as != &kas) {
4592                         TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE,
4593                             trans_size, &err);
4594                         if (err == EWOULDBLOCK) {
4595                                 /*
4596                                  * Use EDEADLK here because the VM code
4597                                  * can normally never see this error.
4598                                  */
4599                                 err = EDEADLK;
4600                                 ufs_lockfs_end(ulp);
4601                                 goto out;
4602                         }
4603                 } else {
4604                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4605                 }
4606         }
4607
4608         if (vp->v_flag & VNOMAP) {
4609                 err = ENOSYS;
4610                 goto unlock;
4611         }
4612
             /*
              * Sequential mode: this request starts where the previous
              * one ended (i_nextr) and is not a page creation.
              */
4613         seqmode = ip->i_nextr == uoff && rw != S_CREATE;
4614
4615         rwtype = RW_READER;             /* start as a reader */
4616         dolock = (rw_owner(&ip->i_contents) != curthread);
4617         /*
4618          * If this thread owns the lock, i.e., this thread grabbed it
4619          * as writer somewhere above, then we don't need to grab the
4620          * lock as reader in this routine.
4621          */
4622         do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread);
4623
4624 retrylock:
4625         if (dolock) {
4626                 /*
4627                  * Grab the quota lock if we need to call
4628                  * bmap_write() below (with i_contents as writer).
4629                  */
4630                 if (do_qlock && rwtype == RW_WRITER)
4631                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4632                 rw_enter(&ip->i_contents, rwtype);
4633         }
4634
4635         /*
4636          * We may be getting called as a side effect of a bmap using
4637          * fbread() when the blocks might be being allocated and the
4638          * size has not yet been up'ed.  In this case we want to be
4639          * able to return zero pages if we get back UFS_HOLE from
4640          * calling bmap for a non write case here.  We also might have
4641          * to read some frags from the disk into a page if we are
4642          * extending the number of frags for a given lbn in bmap().
4643          * Large Files: The read of i_size here is atomic because
4644          * i_contents is held here. If dolock is zero, the lock
4645          * is held in bmap routines.
4646          */
4647         beyond_eof = uoff + len >
4648             P2ROUNDUP_TYPED(ip->i_size, PAGESIZE, uoff_t);
4649         if (beyond_eof && seg != segkmap) {
4650                 if (dolock) {
4651                         rw_exit(&ip->i_contents);
4652                         if (do_qlock && rwtype == RW_WRITER)
4653                                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4654                 }
4655                 err = EFAULT;
4656                 goto unlock;
4657         }
4658
4659         /*
4660          * Must hold i_contents lock throughout the call to pvn_getpages
4661          * since locked pages are returned from each call to ufs_getapage.
4662          * Must *not* return locked pages and then try for contents lock
4663          * due to lock ordering requirements (inode > page)
4664          */
4665
4666         has_holes = bmap_has_holes(ip);
4667
4668         if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) {
4669                 int     blk_size;
4670                 uoff_t offset;
4671
4672                 /*
4673                  * We must acquire the RW_WRITER lock in order to
4674                  * call bmap_write().
4675                  */
4676                 if (dolock && rwtype == RW_READER) {
                             /*
                              * rwtype is switched first so that a jump back
                              * to retrylock re-acquires both locks in
                              * writer mode, in the blocking order.
                              */
4677                         rwtype = RW_WRITER;
4678
4679                         /*
4680                          * Grab the quota lock before
4681                          * upgrading i_contents, but if we can't grab it
4682                          * don't wait here due to lock order:
4683                          * vfs_dqrwlock > i_contents.
4684                          */
4685                         if (do_qlock &&
4686                             rw_tryenter(&ufsvfsp->vfs_dqrwlock, RW_READER)
4687                             == 0) {
4688                                 rw_exit(&ip->i_contents);
4689                                 goto retrylock;
4690                         }
4691                         if (!rw_tryupgrade(&ip->i_contents)) {
4692                                 rw_exit(&ip->i_contents);
4693                                 if (do_qlock)
4694                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4695                                 goto retrylock;
4696                         }
4697                 }
4698
4699                 /*
4700                  * May be allocating disk blocks for holes here as
4701                  * a result of mmap faults. write(2) does the bmap_write
4702                  * in rdip/wrip, not here. We are not dealing with frags
4703                  * in this case.
4704                  */
4705                 /*
4706                  * Large Files: We cast fs_bmask field to offset_t
4707                  * just as we do for MAXBMASK because uoff is a 64-bit
4708                  * data type. fs_bmask will still be a 32-bit type
4709                  * as we cannot change any ondisk data structures.
4710                  */
4711
4712                 offset = uoff & (offset_t)fs->fs_bmask;
4713                 while (offset < uoff + len) {
4714                         blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
4715                         err = bmap_write(ip, offset, blk_size,
4716                             BI_NORMAL, NULL, cr);
4717                         if (ip->i_flag & (ICHG|IUPD))
4718                                 ip->i_seq++;
                             /*
                              * No page has been collected yet (pl is not
                              * set up until the lookup loop), so the page
                              * release code at "error:" is skipped.
                              */
4719                         if (err)
4720                                 goto update_inode;
4721                         offset += blk_size; /* XXX - make this contig */
4722                 }
4723         }
4724
4725         /*
4726          * Can be a reader from now on.
4727          */
4728         if (dolock && rwtype == RW_WRITER) {
4729                 rw_downgrade(&ip->i_contents);
4730                 /*
4731                  * We can release vfs_dqrwlock early so do it, but make
4732                  * sure we don't try to release it again at the bottom.
4733                  */
4734                 if (do_qlock) {
4735                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4736                         do_qlock = 0;
4737                 }
4738         }
4739
4740         /*
4741          * We remove PROT_WRITE in cases when the file has UFS holes
4742          * because we don't  want to call bmap_read() to check each
4743          * page if it is backed with a disk block.
4744          */
4745         if (protp && has_holes && rw != S_WRITE && rw != S_CREATE)
4746                 *protp &= ~PROT_WRITE;
4747
4748         err = 0;
4749
4750         /*
4751          * The loop looks up pages in the range [off, off + len).
4752          * For each page, we first check if we should initiate an asynchronous
4753          * read ahead before we call page_lookup (we may sleep in page_lookup
4754          * for a previously initiated disk read).
4755          */
4756         eoff = (uoff + len);
4757         for (pgoff = uoff, pgaddr = addr, pl = plarr;
4758             pgoff < eoff; /* empty */) {
4759                 page_t  *pp;
4760                 uoff_t  nextrio;
4761                 se_t    se;
4762                 int retval;
4763
4764                 se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED);
4765
4766                 /* Handle async getpage (faultahead) */
4767                 if (plarr == NULL) {
4768                         ip->i_nextrio = pgoff;
4769                         (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4770                         pgoff += pgsize;
4771                         pgaddr += pgsize;
4772                         continue;
4773                 }
4774                 /*
4775                  * Check if we should initiate read ahead of next cluster.
4776                  * We call page_exists only when we need to confirm that
4777                  * we have the current page before we initiate the read ahead.
4778                  */
4779                 nextrio = ip->i_nextrio;
4780                 if (seqmode &&
4781                     pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
4782                     nextrio < ip->i_size && page_exists(&vp->v_object, pgoff)) {
4783                         retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4784                         /*
4785                          * We always read ahead the next cluster of data
4786                          * starting from i_nextrio. If the page (vp,nextrio)
4787                          * is actually in core at this point, the routine
4788                          * ufs_getpage_ra() will stop pre-fetching data
4789                          * until we read that page in a synchronized manner
4790                          * through ufs_getpage_miss(). So, we should increase
4791                          * i_nextrio if the page (vp, nextrio) exists.
4792                          */
4793                         if ((retval == 0) && page_exists(&vp->v_object, nextrio)) {
4794                                 ip->i_nextrio = nextrio + pgsize;
4795                         }
4796                 }
4797
4798                 if ((pp = page_lookup(&vp->v_object, pgoff, se)) != NULL) {
4799                         /*
4800                          * We found the page in the page cache.
4801                          */
4802                         *pl++ = pp;
4803                         pgoff += pgsize;
4804                         pgaddr += pgsize;
4805                         len -= pgsize;
4806                         plsz -= pgsize;
4807                 } else  {
4808                         /*
4809                          * We have to create the page, or read it from disk.
4810                          */
4811                         if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr,
4812                             pl, plsz, rw, seqmode))
4813                                 goto error;
4814
                             /*
                              * Step past every page ufs_getpage_miss() stored
                              * in pl[]; the walk stops at the first NULL
                              * entry.  If it stored none (pl[0] == NULL),
                              * pgoff is unchanged and the lookup above is
                              * retried for the same page.
                              */
4815                         while (*pl != NULL) {
4816                                 pl++;
4817                                 pgoff += pgsize;
4818                                 pgaddr += pgsize;
4819                                 len -= pgsize;
4820                                 plsz -= pgsize;
4821                         }
4822                 }
4823         }
4824
4825         /*
4826          * Return pages up to plsz if they are in the page cache.
4827          * We cannot return pages if there is a chance that they are
4828          * backed with a UFS hole and rw is S_WRITE or S_CREATE.
4829          */
4830         if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
4831
4832                 ASSERT((protp == NULL) ||
4833                     !(has_holes && (*protp & PROT_WRITE)));
4834
4835                 eoff = pgoff + plsz;
4836                 while (pgoff < eoff) {
4837                         page_t          *pp;
4838
4839                         if ((pp = page_lookup_nowait(&vp->v_object, pgoff, SE_SHARED)) == NULL)
4840                                 break;
4841
4842                         *pl++ = pp;
4843                         pgoff += pgsize;
4844                         plsz -= pgsize;
4845                 }
4846         }
4847
4848         if (plarr)
4849                 *pl = NULL;                     /* Terminate page list */
             /* Remember where this request ended for the seqmode test. */
4850         ip->i_nextr = pgoff;
4851
4852 error:
4853         if (err && plarr) {
4854                 /*
4855                  * Release any pages we have locked.
4856                  */
4857                 while (pl > &plarr[0])
4858                         page_unlock(*--pl);
4859
4860                 plarr[0] = NULL;
4861         }
4862
4863 update_inode:
4864         /*
4865          * If the inode is not already marked for IACC (in rdip() for read)
4866          * and the inode is not marked for no access time update (in wrip()
4867          * for write) then update the inode access time and mod time now.
4868          */
4869         if ((ip->i_flag & (IACC | INOACC)) == 0) {
4870                 if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) {
4871                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
4872                             (fs->fs_ronly == 0) &&
4873                             (!ufsvfsp->vfs_noatime)) {
4874                                 mutex_enter(&ip->i_tlock);
4875                                 ip->i_flag |= IACC;
4876                                 ITIMES_NOLOCK(ip);
4877                                 mutex_exit(&ip->i_tlock);
4878                         }
4879                 }
4880         }
4881
             /*
              * Drop i_contents, and the quota lock too unless it was
              * already released at the downgrade (do_qlock cleared).
              */
4882         if (dolock) {
4883                 rw_exit(&ip->i_contents);
4884                 if (do_qlock && rwtype == RW_WRITER)
4885                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4886         }
4887
4888 unlock:
             /*
              * End the async transaction under the same rw/VISSWAP test
              * that started it, then leave the lockfs protocol.
              */
4889         if (ulp) {
4890                 if ((rw == S_CREATE || rw == S_WRITE) &&
4891                     !(vp->v_flag & VISSWAP)) {
4892                         TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4893                 }
4894                 ufs_lockfs_end(ulp);
4895         }
4896 out:
4897         return (err);
4898 }
4899
4900 /*
4901  * ufs_getpage_miss is called when ufs_getpage missed the page in the page
4902  * cache. The page is either read from the disk, or it's created.
4903  * A page is created (without disk read) if rw == S_CREATE, or if
4904  * the page is not backed with a real disk block (UFS hole).
4905  */
4906 /* ARGSUSED */
4907 static int
4908 ufs_getpage_miss(struct vnode *vp, uoff_t off, size_t len, struct seg *seg,
4909         caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
4910 {
4911         struct inode    *ip = VTOI(vp);
4912         page_t          *pp;
4913         daddr_t         bn;
4914         size_t          io_len;
4915         int             crpage = 0;
4916         int             err;
4917         int             contig;
4918         int             bsize = ip->i_fs->fs_bsize;
4919
4920         /*
4921          * Figure out whether the page can be created, or must be
4922          * must be read from the disk.
4923          */
4924         if (rw == S_CREATE)
4925                 crpage = 1;
4926         else {
4927                 contig = 0;
4928                 if (err = bmap_read(ip, off, &bn, &contig))
4929                         return (err);
4930
4931                 crpage = (bn == UFS_HOLE);
4932
4933                 /*
4934                  * If its also a fallocated block that hasn't been written to
4935                  * yet, we will treat it just like a UFS_HOLE and create
4936                  * a zero page for it
4937                  */
4938                 if (ISFALLOCBLK(ip, bn))
4939                         crpage = 1;
4940         }
4941
4942         if (crpage) {
4943                 if ((pp = page_create_va(&vp->v_object, off, PAGESIZE, PG_WAIT,
4944                                          seg, addr)) == NULL) {
4945                         return (ufs_fault(vp,
4946                             "ufs_getpage_miss: page_create == NULL"));
4947                 }
4948
4949                 if (rw != S_CREATE)
4950                         pagezero(pp, 0, PAGESIZE);
4951
4952                 io_len = PAGESIZE;
4953         } else {
4954                 uoff_t  io_off;
4955                 uint_t  xlen;
4956                 struct buf      *bp;
4957                 ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
4958
4959                 /*
4960                  * If access is not in sequential order, we read from disk
4961                  * in bsize units.
4962                  *
4963                  * We limit the size of the transfer to bsize if we are reading
4964                  * from the beginning of the file. Note in this situation we
4965                  * will hedge our bets and initiate an async read ahead of
4966                  * the second block.
4967                  */
4968                 if (!seq || off == 0)
4969                         contig = MIN(contig, bsize);
4970
4971                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4972                     &io_len, off, contig, 0);
4973
4974                 /*
4975                  * Some other thread has entered the page.
4976                  * ufs_getpage will retry page_lookup.
4977                  */
4978                 if (pp == NULL) {
4979                         pl[0] = NULL;
4980                         return (0);
4981                 }
4982
4983                 /*
4984                  * Zero part of the page which we are not
4985                  * going to read from the disk.
4986                  */
4987                 xlen = io_len & PAGEOFFSET;
4988                 if (xlen != 0)
4989                         pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
4990
4991                 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ);
4992                 bp->b_edev = ip->i_dev;
4993                 bp->b_dev = cmpdev(ip->i_dev);
4994                 bp->b_blkno = bn;
4995                 bp->b_un.b_addr = (caddr_t)0;
4996                 bp->b_file = ip->i_vnode;
4997                 bp->b_offset = off;
4998
4999                 if (ufsvfsp->vfs_log) {
5000                         lufs_read_strategy(ufsvfsp->vfs_log, bp);
5001                 } else if (ufsvfsp->vfs_snapshot) {
5002                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5003                 } else {
5004                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5005                         ub.ub_getpages.value.ul++;
5006                         (void) bdev_strategy(bp);
5007                         lwp_stat_update(LWP_STAT_INBLK, 1);
5008                 }
5009
5010                 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK);
5011
5012                 /*
5013                  * If the file access is sequential, initiate read ahead
5014                  * of the next cluster.
5015                  */
5016                 if (seq && ip->i_nextrio < ip->i_size)
5017                         (void) ufs_getpage_ra(vp, off, seg, addr);
5018                 err = biowait(bp);
5019                 pageio_done(bp);
5020
5021                 if (err) {
5022                         pvn_read_done(pp, B_ERROR);
5023                         return (err);
5024                 }
5025         }
5026
5027         pvn_plist_init(pp, pl, plsz, off, io_len, rw);
5028         return (0);
5029 }
5030
5031 /*
5032  * Read ahead a cluster from the disk. Returns the length in bytes.
5033  */
5034 static int
5035 ufs_getpage_ra(struct vnode *vp, uoff_t off, struct seg *seg, caddr_t addr)
5036 {
5037         struct inode    *ip = VTOI(vp);
5038         page_t          *pp;
5039         uoff_t  io_off = ip->i_nextrio;
5040         ufsvfs_t        *ufsvfsp;
5041         caddr_t         addr2 = addr + (io_off - off);
5042         struct buf      *bp;
5043         daddr_t         bn;
5044         size_t          io_len;
5045         int             err;
5046         int             contig;
5047         int             xlen;
5048         int             bsize = ip->i_fs->fs_bsize;
5049
5050         /*
5051          * If the directio advisory is in effect on this file,
5052          * then do not do buffered read ahead. Read ahead makes
5053          * it more difficult on threads using directio as they
5054          * will be forced to flush the pages from this vnode.
5055          */
5056         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5057                 return (0);
5058         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio)
5059                 return (0);
5060
5061         /*
5062          * Is this test needed?
5063          */
5064         if (addr2 >= seg->s_base + seg->s_size)
5065                 return (0);
5066
5067         contig = 0;
5068         err = bmap_read(ip, io_off, &bn, &contig);
5069         /*
5070          * If its a UFS_HOLE or a fallocated block, do not perform
5071          * any read ahead's since there probably is nothing to read ahead
5072          */
5073         if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn))
5074                 return (0);
5075
5076         /*
5077          * Limit the transfer size to bsize if this is the 2nd block.
5078          */
5079         if (io_off == (uoff_t)bsize)
5080                 contig = MIN(contig, bsize);
5081
5082         if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off,
5083             &io_len, io_off, contig, 1)) == NULL)
5084                 return (0);
5085
5086         /*
5087          * Zero part of page which we are not going to read from disk
5088          */
5089         if ((xlen = (io_len & PAGEOFFSET)) > 0)
5090                 pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
5091
5092         ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK;
5093
5094         bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC);
5095         bp->b_edev = ip->i_dev;
5096         bp->b_dev = cmpdev(ip->i_dev);
5097         bp->b_blkno = bn;
5098         bp->b_un.b_addr = (caddr_t)0;
5099         bp->b_file = ip->i_vnode;
5100         bp->b_offset = off;
5101
5102         if (ufsvfsp->vfs_log) {
5103                 lufs_read_strategy(ufsvfsp->vfs_log, bp);
5104         } else if (ufsvfsp->vfs_snapshot) {
5105                 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5106         } else {
5107                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5108                 ub.ub_getras.value.ul++;
5109                 (void) bdev_strategy(bp);
5110                 lwp_stat_update(LWP_STAT_INBLK, 1);
5111         }
5112
5113         return (io_len);
5114 }
5115
5116 int     ufs_delay = 1;
5117 /*
5118  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC}
5119  *
5120  * LMXXX - the inode really ought to contain a pointer to one of these
5121  * async args.  Stuff gunk in there and just hand the whole mess off.
5122  * This would replace i_delaylen, i_delayoff.
5123  */
5124 /*ARGSUSED*/
5125 static int
5126 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
5127         struct cred *cr, caller_context_t *ct)
5128 {
5129         struct inode *ip = VTOI(vp);
5130         int err = 0;
5131
5132         if (vp->v_count == 0) {
5133                 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0"));
5134         }
5135
5136         /*
5137          * XXX - Why should this check be made here?
5138          */
5139         if (vp->v_flag & VNOMAP) {
5140                 err = ENOSYS;
5141                 goto errout;
5142         }
5143
5144         if (ip->i_ufsvfs == NULL) {
5145                 err = EIO;
5146                 goto errout;
5147         }
5148
5149         if (flags & B_ASYNC) {
5150                 if (ufs_delay && len &&
5151                     (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
5152                         mutex_enter(&ip->i_tlock);
5153                         /*
5154                          * If nobody stalled, start a new cluster.
5155                          */
5156                         if (ip->i_delaylen == 0) {
5157                                 ip->i_delayoff = off;
5158                                 ip->i_delaylen = len;
5159                                 mutex_exit(&ip->i_tlock);
5160                                 goto errout;
5161                         }
5162                         /*
5163                          * If we have a full cluster or they are not contig,
5164                          * then push last cluster and start over.
5165                          */
5166                         if (ip->i_delaylen >= CLUSTSZ(ip) ||
5167                             ip->i_delayoff + ip->i_delaylen != off) {
5168                                 uoff_t doff;
5169                                 size_t dlen;
5170
5171                                 doff = ip->i_delayoff;
5172                                 dlen = ip->i_delaylen;
5173                                 ip->i_delayoff = off;
5174                                 ip->i_delaylen = len;
5175                                 mutex_exit(&ip->i_tlock);
5176                                 err = ufs_putpages(vp, doff, dlen,
5177                                     flags, cr);
5178                                 /* LMXXX - flags are new val, not old */
5179                                 goto errout;
5180                         }
5181                         /*
5182                          * There is something there, it's not full, and
5183                          * it is contig.
5184                          */
5185                         ip->i_delaylen += len;
5186                         mutex_exit(&ip->i_tlock);
5187                         goto errout;
5188                 }
5189                 /*
5190                  * Must have weird flags or we are not clustering.
5191                  */
5192         }
5193
5194         err = ufs_putpages(vp, off, len, flags, cr);
5195
5196 errout:
5197         return (err);
5198 }
5199
5200 /*
5201  * If len == 0, do from off to EOF.
5202  *
5203  * The normal cases should be len == 0 & off == 0 (entire vp list),
5204  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
5205  * (from pageout).
5206  */
5207 /*ARGSUSED*/
     /*
      * ufs_putpages() pushes pages of vp, starting at byte offset off,
      * through ufs_putapage().
      *
      *   vp     vnode to operate on; a v_count of 0 is reported via
      *          ufs_fault()
      *   off    first byte offset of interest
      *   len    number of bytes, or 0 to let pvn_vplist_dirty() search the
      *          whole page list for pages >= off
      *   flags  B_* flags; B_ASYNC, B_INVAL and B_FREE select the page
      *          lookup routine and lock mode below, and the value is passed
      *          on to pvn_getdirty(), pvn_vplist_dirty() and ufs_putapage()
      *   cr     credentials, passed through unchanged
      *
      * Returns 0 when nothing had to be done or everything succeeded, and
      * otherwise the error from ufs_fault(), pvn_vplist_dirty() or the
      * first failing ufs_putapage() call.  A B_ASYNC request that finds a
      * thread recorded in i_writer returns 0 without looking at any page.
      */
5208 static int
5209 ufs_putpages(
5210         struct vnode *vp,
5211         offset_t off,
5212         size_t len,
5213         int flags,
5214         struct cred *cr)
5215 {
5216         uoff_t io_off;
5217         uoff_t eoff;
5218         struct inode *ip = VTOI(vp);
5219         page_t *pp;
5220         size_t io_len;
5221         int err = 0;
5222         int dolock;
5223
5224         if (vp->v_count == 0)
5225                 return (ufs_fault(vp, "ufs_putpages: v_count == 0"));
5226         /*
5227          * Acquire the readers/writer inode lock before locking
5228          * any pages in this inode.
5229          * The inode lock is held during i/o.
5230          */
5231         if (len == 0) {
                     /*
                      * Whole-list request: clear i_delayoff/i_delaylen.
                      * NOTE(review): presumably the pending delayed-write
                      * range -- confirm where these fields are maintained.
                      */
5232                 mutex_enter(&ip->i_tlock);
5233                 ip->i_delayoff = ip->i_delaylen = 0;
5234                 mutex_exit(&ip->i_tlock);
5235         }
             /*
              * Skip taking i_contents when rw_owner() reports that this
              * thread already owns it; dolock remembers whether the lock
              * has to be dropped again before returning.
              */
5236         dolock = (rw_owner(&ip->i_contents) != curthread);
5237         if (dolock) {
5238                 /*
5239                  * Must synchronize this thread and any possible thread
5240                  * operating in the window of vulnerability in wrip().
5241                  * It is dangerous to allow both a thread doing a putpage
5242                  * and a thread writing, so serialize them.  The exception
5243                  * is when the thread in wrip() does something which causes
5244                  * a putpage operation.  Then, the thread must be allowed
5245                  * to continue.  It may encounter a bmap_read problem in
5246                  * ufs_putapage, but that is handled in ufs_putapage.
5247                  * Allow async writers to proceed, we don't want to block
5248                  * the pageout daemon.
5249                  */
5250                 if (ip->i_writer == curthread)
5251                         rw_enter(&ip->i_contents, RW_READER);
5252                 else {
5253                         for (;;) {
5254                                 rw_enter(&ip->i_contents, RW_READER);
5255                                 mutex_enter(&ip->i_tlock);
5256                                 /*
5257                                  * If there is no thread in the critical
5258                                  * section of wrip(), then proceed.
5259                                  * Otherwise, wait until there isn't one.
5260                                  */
5261                                 if (ip->i_writer == NULL) {
5262                                         mutex_exit(&ip->i_tlock);
5263                                         break;
5264                                 }
5265                                 rw_exit(&ip->i_contents);
5266                                 /*
5267                                  * Bounce async writers when we have a writer
5268                                  * working on this file so we don't deadlock
5269                                  * the pageout daemon.
5270                                  */
5271                                 if (flags & B_ASYNC) {
5272                                         mutex_exit(&ip->i_tlock);
5273                                         return (0);
5274                                 }
                                     /*
                                      * Sleep on i_wrcv with i_tlock as the
                                      * cv mutex, then retry from the top of
                                      * the loop (i_contents was dropped).
                                      */
5275                                 cv_wait(&ip->i_wrcv, &ip->i_tlock);
5276                                 mutex_exit(&ip->i_tlock);
5277                         }
5278                 }
5279         }
5280
             /* Nothing to do when vn_has_cached_data() is false for vp. */
5281         if (!vn_has_cached_data(vp)) {
5282                 if (dolock)
5283                         rw_exit(&ip->i_contents);
5284                 return (0);
5285         }
5286
5287         if (len == 0) {
5288                 /*
5289                  * Search the entire vp list for pages >= off.
5290                  */
5291                 err = pvn_vplist_dirty(vp, (uoff_t)off, ufs_putapage,
5292                     flags, cr);
5293         } else {
5294                 /*
5295                  * Loop over all offsets in the range looking for
5296                  * pages to deal with.
5297                  */
                     /*
                      * The walk ends at off + len, clamped to the file size
                      * as rounded by blkroundup() unless that value is 0.
                      */
5298                 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0)
5299                         eoff = MIN(off + len, eoff);
5300                 else
5301                         eoff = off + len;
5302
5303                 for (io_off = off; io_off < eoff; io_off += io_len) {
5304                         /*
5305                          * If we are not invalidating, synchronously
5306                          * freeing or writing pages, use the routine
5307                          * page_lookup_nowait() to prevent reclaiming
5308                          * them from the free list.
5309                          */
5310                         if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
5311                                 pp = page_lookup(&vp->v_object, io_off,
5312                                                  (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
5313                         } else {
5314                                 pp = page_lookup_nowait(&vp->v_object,
5315                                                         io_off,
5316                                                         (flags & B_FREE) ? SE_EXCL : SE_SHARED);
5317                         }
5318
                             /*
                              * No page found, or pvn_getdirty() returned 0
                              * for it: just step to the next page.
                              */
5319                         if (pp == NULL || pvn_getdirty(pp, flags) == 0)
5320                                 io_len = PAGESIZE;
5321                         else {
5322                                 uoff_t *io_offp = &io_off;
5323
5324                                 err = ufs_putapage(vp, pp, io_offp, &io_len,
5325                                     flags, cr);
5326                                 if (err != 0)
5327                                         break;
5328                                 /*
5329                                  * "io_off" and "io_len" are returned as
5330                                  * the range of pages we actually wrote.
5331                                  * This allows us to skip ahead more quickly
5332                                  * since several pages may've been dealt
5333                                  * with by this iteration of the loop.
5334                                  */
5335                         }
5336                 }
5337         }
5338         if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
5339                 /*
5340                  * We have just sync'ed back all the pages on
5341                  * the inode, turn off the IMODTIME flag.
5342                  */
5343                 mutex_enter(&ip->i_tlock);
5344                 ip->i_flag &= ~IMODTIME;
5345                 mutex_exit(&ip->i_tlock);
5346         }
5347         if (dolock)
5348                 rw_exit(&ip->i_contents);
5349         return (err);
5350 }
5351
5352 /*
5353  * Write-completion hook (ufs_putapage() installs it as b_iodone): give
5354  * b_bcount back to the inode's i_writes count, then run iodone().
5355  */
5356 static void
5357 ufs_iodone(buf_t *bp)
5358 {
5359         struct inode *inode;
5360         int was_at_limit;
5361
5362         VERIFY(bp->b_pages->p_object != NULL);
5363         ASSERT(bp->b_pages->p_vnode != NULL);
5364         ASSERT(!(bp->b_flags & B_READ));
5365
5366         bp->b_iodone = NULL;
5367         inode = VTOI(bp->b_pages->p_vnode);
5368         mutex_enter(&inode->i_tlock);
5369         was_at_limit = (inode->i_writes >= ufs_LW);
5370         inode->i_writes -= bp->b_bcount;
5371         /* Wake i_wrcv waiters once the count is back at/below ufs_LW. */
5372         if (was_at_limit && inode->i_writes <= ufs_LW && ufs_WRITES)
5373                 cv_broadcast(&inode->i_wrcv);
5374         mutex_exit(&inode->i_tlock);
5375         iodone(bp);
5376 }
5377
5378 /*
5379  * Write out a single page, possibly klustering adjacent
5380  * dirty pages.  The inode lock must be held.
5381  *
5382  * LMXXX - bsize < pagesize not done.
      *
      *   vp     vnode the page belongs to (VTOI(vp) is the inode used)
      *   pp     page to write; pvn_write_kluster() may replace it with a
      *          list of pages covering a larger extent
      *   offp   if non-NULL, receives the start offset of the i/o range
      *   lenp   if non-NULL, receives the length of the i/o range
      *   flags  B_* flags; OR-ed with B_WRITE for pageio_setup(), and
      *          B_ASYNC in the resulting buf decides whether we biowait()
      *   cr     not used here
      *
      * Returns 0 on success.  Errors: EIO when the inode has no ufsvfs;
      * the bmap_read()/bmap_set_bn() error; for a UFS_HOLE block EIO
      * (log in error state, or caller is i_writer), EAGAIN (B_ASYNC) or
      * the ufs_fault() result; for synchronous writes the biowait() result.
      *
      * On every path through the "out" label *offp and *lenp are stored.
      * When the failure happens before pvn_write_kluster() has computed
      * the range, both are reported as 0 rather than as stack garbage.
      * The ufsvfs == NULL path bypasses "out": nothing is stored and pp
      * is not passed to pvn_write_done() there.
5383  */
5384 /*ARGSUSED*/
5385 int
5386 ufs_putapage(
5387         struct vnode *vp,
5388         page_t *pp,
5389         uoff_t *offp,
5390         size_t *lenp,           /* return values */
5391         int flags,
5392         struct cred *cr)
5393 {
5394         uoff_t io_off = 0;      /* defined even when we fail early */
5395         uoff_t off;
5396         struct inode *ip = VTOI(vp);
5397         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
5398         struct fs *fs;
5399         struct buf *bp;
5400         size_t io_len = 0;      /* defined even when we fail early */
5401         daddr_t bn;
5402         int err;
5403         int contig;
5404         int dotrans;
5405
5406         ASSERT(RW_LOCK_HELD(&ip->i_contents));
5407
5408         if (ufsvfsp == NULL) {
5409                 err = EIO;
5410                 goto out_trace;
5411         }
5412
5413         fs = ip->i_fs;
5414         ASSERT(fs->fs_ronly == 0);
5415
5416         /*
5417          * If the modified time on the inode has not already been
5418          * set elsewhere (e.g. for write/setattr) we set the time now.
5419          * This gives us approximate modified times for mmap'ed files
5420          * which are modified via stores in the user address space.
5421          */
5422         if ((ip->i_flag & IMODTIME) == 0) {
5423                 mutex_enter(&ip->i_tlock);
5424                 ip->i_flag |= IUPD;
5425                 ip->i_seq++;
5426                 ITIMES_NOLOCK(ip);
5427                 mutex_exit(&ip->i_tlock);
5428         }
5429
5430         /*
5431          * Align the request to a block boundary (for old file systems),
5432          * and go ask bmap() how contiguous things are for this file.
5433          */
5434         off = pp->p_offset & (offset_t)fs->fs_bmask;    /* block align it */
5435         contig = 0;
5436         err = bmap_read(ip, off, &bn, &contig);
5437         if (err)
5438                 goto out;
5439         if (bn == UFS_HOLE) {                   /* putpage never allocates */
5440                 /*
5441                  * logging device is in error mode; simply return EIO
5442                  */
5443                 if (TRANS_ISERROR(ufsvfsp)) {
5444                         err = EIO;
5445                         goto out;
5446                 }
5447                 /*
5448                  * Oops, the thread in the window in wrip() did some
5449                  * sort of operation which caused a putpage in the bad
5450                  * range.  In this case, just return an error which will
5451                  * cause the software modified bit on the page to set
5452                  * and the page will get written out again later.
5453                  */
5454                 if (ip->i_writer == curthread) {
5455                         err = EIO;
5456                         goto out;
5457                 }
5458                 /*
5459                  * If the pager is trying to push a page in the bad range
5460                  * just tell it to try again later when things are better.
5461                  */
5462                 if (flags & B_ASYNC) {
5463                         err = EAGAIN;
5464                         goto out;
5465                 }
5466                 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE");
5467                 goto out;
5468         }
5469
5470         /*
5471          * If it is a fallocate'd block, reverse the negativity since
5472          * we are now writing to it
5473          */
5474         if (ISFALLOCBLK(ip, bn)) {
5475                 err = bmap_set_bn(vp, off, dbtofsb(fs, -bn));
5476                 if (err)
5477                         goto out;
5478
5479                 bn = -bn;
5480         }
5481
5482         /*
5483          * Take the length (of contiguous bytes) passed back from bmap()
5484          * and _try_ and get a set of pages covering that extent.
5485          */
5486         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags);
5487
5488         /*
5489          * May have run out of memory and not clustered backwards.
5490          * off          p_offset
5491          * [  pp - 1  ][   pp   ]
5492          * [    block           ]
5493          * We told bmap off, so we have to adjust the bn accordingly.
5494          */
5495         if (io_off > off) {
5496                 bn += btod(io_off - off);
5497                 contig -= (io_off - off);
5498         }
5499
5500         /*
5501          * bmap was careful to tell us the right size so use that.
5502          * There might be unallocated frags at the end.
5503          * LMXXX - bzero the end of the page?  We must be writing after EOF.
5504          */
5505         if (io_len > contig) {
5506                 ASSERT(io_len - contig < fs->fs_bsize);
5507                 io_len -= (io_len - contig);
5508         }
5509
5510         /*
5511          * Handle the case where we are writing the last page after EOF.
5512          *
5513          * XXX - just a patch for i-mt3.
5514          */
5515         if (io_len == 0) {
5516                 ASSERT(pp->p_offset >=
5517                     (uoff_t)(roundup(ip->i_size, PAGESIZE)));
5518                 io_len = PAGESIZE;
5519         }
5520
5521         bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags);
5522
5523         ULOCKFS_SET_MOD(ITOUL(ip));
5524
5525         bp->b_edev = ip->i_dev;
5526         bp->b_dev = cmpdev(ip->i_dev);
5527         bp->b_blkno = bn;
5528         bp->b_un.b_addr = (caddr_t)0;
5529         bp->b_file = ip->i_vnode;
5530
5531         /*
5532          * File contents of shadow or quota inodes are metadata, and updates
5533          * to these need to be put into a logging transaction. All direct
5534          * callers in UFS do that, but fsflush can come here _before_ the
5535          * normal codepath. An example would be updating ACL information, for
5536          * which the normal codepath would be:
5537          *      ufs_si_store()
5538          *      ufs_rdwri()
5539          *      wrip()
5540          *      segmap_release()
5541          *      fop_putpage()
5542          * Here, fsflush can pick up the dirty page before segmap_release()
5543          * forces it out. If that happens, there's no transaction.
5544          * We therefore need to test whether a transaction exists, and if not
5545          * create one - for fsflush.
5546          */
5547         dotrans =
5548             (((ip->i_mode & IFMT) == IFSHAD || ufsvfsp->vfs_qinod == ip) &&
5549             ((curthread->t_flag & T_DONTBLOCK) == 0) &&
5550             (TRANS_ISTRANS(ufsvfsp)));
5551
5552         if (dotrans) {
5553                 curthread->t_flag |= T_DONTBLOCK;
5554                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5555         }
5556         if (TRANS_ISTRANS(ufsvfsp)) {
5557                 if ((ip->i_mode & IFMT) == IFSHAD) {
5558                         TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD);
5559                 } else if (ufsvfsp->vfs_qinod == ip) {
5560                         TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR,
5561                             0, 0);
5562                 }
5563         }
5564         if (dotrans) {
5565                 TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5566                 curthread->t_flag &= ~T_DONTBLOCK;
5567         }
5568
5569         /* write throttle */
5570
             /*
              * Hook ufs_iodone() into the buf and add the bytes about to be
              * written to i_writes; ufs_iodone() subtracts them again.
              */
5571         ASSERT(bp->b_iodone == NULL);
5572         bp->b_iodone = (int (*)())ufs_iodone;
5573         mutex_enter(&ip->i_tlock);
5574         ip->i_writes += bp->b_bcount;
5575         mutex_exit(&ip->i_tlock);
5576
             /*
              * Submit through the log, the snapshot driver or directly to
              * the block device, in that order of preference.  Only the
              * synchronous branch waits and finishes the pages here.
              */
5577         if (bp->b_flags & B_ASYNC) {
5578                 if (ufsvfsp->vfs_log) {
5579                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5580                 } else if (ufsvfsp->vfs_snapshot) {
5581                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5582                 } else {
5583                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5584                         ub.ub_putasyncs.value.ul++;
5585                         (void) bdev_strategy(bp);
5586                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5587                 }
5588         } else {
5589                 if (ufsvfsp->vfs_log) {
5590                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5591                 } else if (ufsvfsp->vfs_snapshot) {
5592                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5593                 } else {
5594                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5595                         ub.ub_putsyncs.value.ul++;
5596                         (void) bdev_strategy(bp);
5597                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5598                 }
5599                 err = biowait(bp);
5600                 pageio_done(bp);
5601                 pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
5602         }
5603
             /*
              * Nothing more may be done with pp from here on; clearing it
              * keeps the error path below from calling pvn_write_done() on
              * it a second time.
              */
5604         pp = NULL;
5605
5606 out:
5607         if (err != 0 && pp != NULL)
5608                 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
5609
5610         if (offp)
5611                 *offp = io_off;
5612         if (lenp)
5613                 *lenp = io_len;
5614 out_trace:
5615         return (err);
5616 }
5617
5618 uint64_t ufs_map_alock_retry_cnt;
5619 uint64_t ufs_map_lockfs_retry_cnt;
     /*
      * The two counters above are bumped by ufs_map(): the first each time
      * AS_LOCK_TRYENTER() fails, the second each time ufs_lockfs_trybegin()
      * fails.
      */
5620
5621 /* ARGSUSED */
     /*
      * ufs_map() creates a segvn mapping of len bytes of the regular file
      * vp, starting at file offset off, in address space as.
      *
      *   addrp    in: address hint (restored before every retry);
      *            handed to choose_addr(), and *addrp is then used as the
      *            address for as_map_locked()
      *   prot, maxprot   copied into the segvn_crargs
      *   flags    split into MAP_TYPE (type) and the remaining bits (flags)
      *   cr       stored as the segvn_crargs credentials
      *   ct       not used
      *
      * Returns 0 or an error: ENOSYS for VNOMAP vnodes, ENXIO when off or
      * off + len is negative, ENODEV for anything but VREG, EAGAIN when
      * vn_has_mandatory_locks() is true, EINTR when the wait for the
      * file-system lock is abandoned, or whatever choose_addr(),
      * ufs_lockfs_trybegin() (EIO) or as_map_locked() report.
      */
5622 static int
5623 ufs_map(struct vnode *vp,
5624         offset_t off,
5625         struct as *as,
5626         caddr_t *addrp,
5627         size_t len,
5628         uchar_t prot,
5629         uchar_t maxprot,
5630         uint_t flags,
5631         struct cred *cr,
5632         caller_context_t *ct)
5633 {
5634         struct segvn_crargs vn_a;
5635         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
5636         struct ulockfs *ulp;
5637         int error, sig;
5638         k_sigset_t smask;
             /* Remember the caller's hint; retry_map puts it back. */
5639         caddr_t hint = *addrp;
5640
5641         if (vp->v_flag & VNOMAP) {
5642                 error = ENOSYS;
5643                 goto out;
5644         }
5645
5646         if (off < 0 || (offset_t)(off + len) < 0) {
5647                 error = ENXIO;
5648                 goto out;
5649         }
5650
5651         if (vp->v_type != VREG) {
5652                 error = ENODEV;
5653                 goto out;
5654         }
5655
5656 retry_map:
5657         *addrp = hint;
5658         /*
5659          * If file is being locked, disallow mapping.
5660          */
5661         if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) {
5662                 error = EAGAIN;
5663                 goto out;
5664         }
5665
5666         as_rangelock(as);
5667         /*
5668          * Note that if we are retrying (because ufs_lockfs_trybegin failed in
5669          * the previous attempt), some other thread could have grabbed
5670          * the same VA range if MAP_FIXED is set. In that case, choose_addr
5671          * would unmap the valid VA range, that is ok.
5672          */
5673         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5674         if (error != 0) {
5675                 as_rangeunlock(as);
5676                 goto out;
5677         }
5678
5679         /*
5680          * a_lock has to be acquired before entering the lockfs protocol
5681          * because that is the order in which pagefault works. Also we cannot
5682          * block on a_lock here because this waiting writer will prevent
5683          * further readers like ufs_read from progressing and could cause
5684          * deadlock between ufs_read/ufs_map/pagefault when a quiesce is
5685          * pending.
5686          */
5687         while (!AS_LOCK_TRYENTER(as, RW_WRITER)) {
5688                 ufs_map_alock_retry_cnt++;
5689                 delay(RETRY_LOCK_DELAY);
5690         }
5691
5692         /*
5693          * We can't hold as->a_lock and wait for lockfs to succeed because
5694          * the proc tools might hang on a_lock, so call ufs_lockfs_trybegin()
5695          * instead.
5696          */
5697         if (error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK)) {
5698                 /*
5699                  * ufs_lockfs_trybegin() did not succeed. It is safer to give up
5700                  * as->a_lock and wait for ulp->ul_fs_lock status to change.
5701                  */
5702                 ufs_map_lockfs_retry_cnt++;
5703                 AS_LOCK_EXIT(as);
5704                 as_rangeunlock(as);
5705                 if (error == EIO)
5706                         goto out;
5707
                     /*
                      * Wait on ul_cv until none of the ULOCKFS_MAP_MASK bits
                      * is set in ul_fs_lock.  The wait ignores signals when
                      * ULOCKFS_IS_SLOCK() or vfs_nointr is set.  Otherwise
                      * it gives up with EINTR if cv_wait_sig() returned 0
                      * while the bits are still set, or if vfs_dontblock is
                      * set.
                      */
5708                 mutex_enter(&ulp->ul_lock);
5709                 while (ulp->ul_fs_lock & ULOCKFS_MAP_MASK) {
5710                         if (ULOCKFS_IS_SLOCK(ulp) || ufsvfsp->vfs_nointr) {
5711                                 cv_wait(&ulp->ul_cv, &ulp->ul_lock);
5712                         } else {
5713                                 sigintr(&smask, 1);
5714                                 sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
5715                                 sigunintr(&smask);
5716                                 if (((ulp->ul_fs_lock & ULOCKFS_MAP_MASK) &&
5717                                     !sig) || ufsvfsp->vfs_dontblock) {
5718                                         mutex_exit(&ulp->ul_lock);
                                             /*
                                              * The as lock and the range
                                              * lock were already released
                                              * above, so return directly.
                                              */
5719                                         return (EINTR);
5720                                 }
5721                         }
5722                 }
5723                 mutex_exit(&ulp->ul_lock);
5724                 goto retry_map;
5725         }
5726
5727         vn_a.vp = vp;
5728         vn_a.offset = (uoff_t)off;
5729         vn_a.type = flags & MAP_TYPE;
5730         vn_a.prot = prot;
5731         vn_a.maxprot = maxprot;
5732         vn_a.cred = cr;
5733         vn_a.amp = NULL;
5734         vn_a.flags = flags & ~MAP_TYPE;
5735         vn_a.szc = 0;
5736         vn_a.lgrp_mem_policy_flags = 0;
5737
             /*
              * NOTE(review): AS_LOCK_EXIT() is not called on this path;
              * presumably as_map_locked() drops the as lock itself --
              * confirm against its implementation.
              */
5738         error = as_map_locked(as, *addrp, len, segvn_create, &vn_a);
5739         if (ulp)
5740                 ufs_lockfs_end(ulp);
5741         as_rangeunlock(as);
5742 out:
5743         return (error);
5744 }
5745
5746 /*
5747  * Mapping-added callback: add btopr(len) to the inode's i_mapcnt while
5748  * holding i_tlock.  A vnode flagged VNOMAP is refused with ENOSYS and
5749  * its count is left alone; every other call returns 0.
5750  */
5751 /* ARGSUSED */
5752 static int
5753 ufs_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5754         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5755         struct cred *cr, caller_context_t *ct)
5756 {
5757         struct inode *inode = VTOI(vp);
5758         int rv = ENOSYS;
5759
5760         /* Only vnodes without VNOMAP take part in the accounting. */
5761         if ((vp->v_flag & VNOMAP) == 0) {
5762                 mutex_enter(&inode->i_tlock);
5763                 inode->i_mapcnt += btopr(len);
5764                 mutex_exit(&inode->i_tlock);
5765                 rv = 0;
5766         }
5767
5768         return (rv);
5769 }
5770
5771 /* Undo ufs_addmap(): i_mapcnt -= btopr(len); ENOSYS if VNOMAP, else 0. */
5772 /*ARGSUSED*/
5773 static int
5774 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5775         size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5776         struct cred *cr, caller_context_t *ct)
5777 {
5778         struct inode *inode = VTOI(vp);
5779
5780         if ((vp->v_flag & VNOMAP) != 0)
5781                 return (ENOSYS);
5782         /* The count is adjusted, and sanity-checked, under i_tlock. */
5783         mutex_enter(&inode->i_tlock);
5784         inode->i_mapcnt -= btopr(len);
5785         ASSERT(inode->i_mapcnt >= 0);
5786         mutex_exit(&inode->i_tlock);
5787         return (0);
5788 }
5789 /*
5790  * Pollhead returned through *phpp by ufs_poll() when nothing is ready.
5791  */
5792 struct pollhead ufs_pollhd;
5793
5794 /*
5795  * Return the answer requested to poll() for non-device files.
5796  *
5797  * *revp is POLLHUP alone when the inode has no ufsvfs.  Otherwise it is
5798  * POLLERR if ULOCKFS_IS_HLOCK() or ULOCKFS_IS_ELOCK() is true, or else
5799  * every requested read event plus those requested write events that are
5800  * allowed (fs_ronly clear and ULOCKFS_IS_WLOCK() false); POLLPRI is added
5801  * when it was requested and POLLERR is set.  *phpp gets &ufs_pollhd only
5802  * if nothing is ready and any is zero, NULL otherwise.  Always returns 0.
5803  */
5804 /* ARGSUSED */
5805 int
5806 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp,
5807         caller_context_t *ct)
5808 {
5809         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
5810         short           ready = 0;
5811
5812         if (ufsvfsp == NULL) {
5813                 ready = POLLHUP;
5814         } else {
5815                 struct ulockfs *ulp = &ufsvfsp->vfs_ulockfs;
5816
5817                 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
5818                         ready |= POLLERR;
5819                 } else {
5820                         if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly &&
5821                             !ULOCKFS_IS_WLOCK(ulp))
5822                                 ready |= POLLOUT;
5823                         if ((ev & POLLWRBAND) &&
5824                             !ufsvfsp->vfs_fs->fs_ronly &&
5825                             !ULOCKFS_IS_WLOCK(ulp))
5826                                 ready |= POLLWRBAND;
5827
5828                         /* Requested read events are always granted. */
5829                         ready |= ev & (POLLIN | POLLRDNORM | POLLRDBAND);
5830                 }
5831
5832                 if ((ev & POLLPRI) && (ready & (POLLERR | POLLHUP)))
5833                         ready |= POLLPRI;
5834         }
5835         *revp = ready;
5836         *phpp = (!any && !ready) ? &ufs_pollhd : NULL;
5837         return (0);
5838 }
5839
5840 /* ARGSUSED */
     /*
      * ufs_l_pathconf() answers the _PC_* query cmd for vp, storing the
      * result in *valp.  The whole call runs between ufs_lockfs_begin()
      * (ULOCKFS_PATHCONF_MASK) and ufs_lockfs_end().
      *
      * Handled here:
      *   _PC_NAME_MAX              MAXNAMLEN
      *   _PC_FILESIZEBITS          UFS_FILESIZE_BITS with UFS_LARGEFILES,
      *                             else 32
      *   _PC_XATTR_EXISTS          1 if the attribute directory is not
      *                             empty, else 0 (an empty one is unhooked
      *                             via ufs_unhook_shadow())
      *   _PC_ACL_ENABLED           _ACL_ACLENT_ENABLED
      *   _PC_MIN_HOLE_SIZE         fs_bsize of the file system
      *   _PC_SATTR_ENABLED/_EXISTS VFSFT_SYSATTR_VIEWS feature present and
      *                             vp is VREG or VDIR
      *   _PC_TIMESTAMP_RESOLUTION  1000
      * Everything else, and _PC_XATTR_EXISTS without VFS_XATTR, is left to
      * fs_pathconf().
      *
      * Returns 0 or the error from ufs_lockfs_begin(), fs_pathconf(),
      * ufs_xattr_getattrdir() (ENOENT is turned into *valp = 0, success)
      * or, via TRANS_END_CSYNC(), the transaction.
      */
5841 static int
5842 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr,
5843         caller_context_t *ct)
5844 {
5845         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
5846         struct ulockfs  *ulp = NULL;
5847         struct inode    *sip = NULL;
5848         int             error;
5849         struct inode    *ip = VTOI(vp);
5850         int             issync;
5851
5852         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK);
5853         if (error)
5854                 return (error);
5855
5856         switch (cmd) {
5857                 /*
5858                  * Have to handle _PC_NAME_MAX here, because the normal way
5859                  * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()]
5860                  * results in a lock ordering reversal between
5861                  * ufs_lockfs_{begin,end}() and
5862                  * ufs_thread_{suspend,continue}().
5863                  *
5864                  * Keep in sync with ufs_statvfs().
5865                  */
5866         case _PC_NAME_MAX:
5867                 *valp = MAXNAMLEN;
5868                 break;
5869
5870         case _PC_FILESIZEBITS:
5871                 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
5872                         *valp = UFS_FILESIZE_BITS;
5873                 else
5874                         *valp = 32;
5875                 break;
5876
5877         case _PC_XATTR_EXISTS:
5878                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5879
                             /*
                              * NOTE(review): if this returns 0 with sip
                              * still NULL, or an error other than ENOENT,
                              * *valp is not written on this path.
                              */
5880                         error =
5881                             ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, cr);
5882                         if (error ==  0 && sip != NULL) {
5883                                 /* Start transaction */
5884                                 if (ulp) {
5885                                         TRANS_BEGIN_CSYNC(ufsvfsp, &issync,
5886                                                           TOP_RMDIR,
5887                                                           TOP_RMDIR_SIZE);
5888                                 }
5889                                 /*
5890                                  * Is directory empty
5891                                  */
                                     /*
                                      * Lock order used here: sip i_rwlock,
                                      * then sip i_contents, then (only for
                                      * the unhook) ip i_contents.  Note that
                                      * the emptiness check is made with
                                      * CRED(), not with the cr argument.
                                      */
5892                                 rw_enter(&sip->i_rwlock, RW_WRITER);
5893                                 rw_enter(&sip->i_contents, RW_WRITER);
5894                                 if (ufs_xattrdirempty(sip,
5895                                     sip->i_number, CRED())) {
5896                                         rw_enter(&ip->i_contents, RW_WRITER);
5897                                         ufs_unhook_shadow(ip, sip);
5898                                         rw_exit(&ip->i_contents);
5899
5900                                         *valp = 0;
5901
5902                                 } else
5903                                         *valp = 1;
5904                                 rw_exit(&sip->i_contents);
5905                                 rw_exit(&sip->i_rwlock);
5906                                 if (ulp) {
5907                                         TRANS_END_CSYNC(ufsvfsp, &error,
5908                                                         issync, TOP_RMDIR,
5909                                                         TOP_RMDIR_SIZE);
5910                                 }
                                     /* Drop the hold on the attribute dir. */
5911                                 VN_RELE(ITOV(sip));
5912                         } else if (error == ENOENT) {
5913                                 *valp = 0;
5914                                 error = 0;
5915                         }
5916                 } else {
5917                         error = fs_pathconf(vp, cmd, valp, cr, ct);
5918                 }
5919                 break;
5920
5921         case _PC_ACL_ENABLED:
5922                 *valp = _ACL_ACLENT_ENABLED;
5923                 break;
5924
5925         case _PC_MIN_HOLE_SIZE:
5926                 *valp = (ulong_t)ip->i_fs->fs_bsize;
5927                 break;
5928
5929         case _PC_SATTR_ENABLED:
5930         case _PC_SATTR_EXISTS:
5931                 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5932                     (vp->v_type == VREG || vp->v_type == VDIR);
5933                 break;
5934
5935         case _PC_TIMESTAMP_RESOLUTION:
5936                 /*
5937                  * UFS keeps only microsecond timestamp resolution.
5938                  * This is historical and will probably never change.
5939                  */
5940                 *valp = 1000L;
5941                 break;
5942
5943         default:
5944                 error = fs_pathconf(vp, cmd, valp, cr, ct);
5945                 break;
5946         }
5947
5948         if (ulp != NULL) {
5949                 ufs_lockfs_end(ulp);
5950         }
5951         return (error);
5952 }
5953
5954 int ufs_pageio_writes, ufs_pageio_reads;
     /*
      * NOTE(review): the two variables above are presumably counters of
      * write and read requests handled by ufs_pageio(); no update of them
      * is visible in the part of the file reviewed here -- confirm.
      */
5955
5956 /*ARGSUSED*/
5957 static int
5958 ufs_pageio(struct vnode *vp, page_t *pp, uoff_t io_off, size_t io_len,
5959         int flags, struct cred *cr, caller_context_t *ct)
5960 {
5961         struct inode *ip = VTOI(vp);
5962         struct ufsvfs *ufsvfsp;
5963         page_t *npp = NULL, *opp = NULL, *cpp = pp;
5964         struct buf *bp;
5965         daddr_t bn;
5966         size_t done_len = 0, cur_len = 0;
5967         int err = 0;
5968         int contig = 0;
5969         int dolock;
5970         int vmpss = 0;
5971         struct ulockfs *ulp;
5972
5973         if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp &&
5974             vp->v_mpssdata != NULL) {
5975                 vmpss = 1;
5976         }
5977
5978         dolock = (rw_owner(&ip->i_contents) != curthread);
5979         /*
5980          * We need a better check.  Ideally, we would use another
5981          * vnodeops so that hlocked and forcibly unmounted file
5982          * systems would return EIO where appropriate and w/o the
5983          * need for these checks.
5984          */
5985         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5986                 return (EIO);
5987
5988         /*
5989          * For vmpss (pp can be NULL) case respect the quiesce protocol.
5990          * ul_lock must be taken before locking pages so we can't use it here
5991          * if pp is non NULL because segvn already locked pages
5992          * SE_EXCL. Instead we rely on the fact that a forced umount or
5993          * applying a filesystem lock via ufs_fiolfs() will block in the
5994          * implicit call to ufs_flush() until we unlock the pages after the
5995          * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend
5996          * above 0 until they are done. We have to be careful not to increment
5997          * ul_vnops_cnt here after forceful unmount hlocks the file system.
5998          *
5999          * If pp is NULL use ul_lock to make sure we don't increment
6000          * ul_vnops_cnt after forceful unmount hlocks the file system.
6001          */
6002         if (vmpss || pp == NULL) {
6003                 ulp = &ufsvfsp->vfs_ulockfs;
6004                 if (pp == NULL)
6005                         mutex_enter(&ulp->ul_lock);
6006                 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) {
6007                         if (pp == NULL) {
6008                                 mutex_exit(&ulp->ul_lock);
6009                         }
6010                         return (vmpss ? EIO : EINVAL);
6011                 }
6012                 atomic_inc_ulong(&ulp->ul_vnops_cnt);
6013                 if (pp == NULL)
6014                         mutex_exit(&ulp->ul_lock);
6015                 if (ufs_quiesce_pend) {
6016                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6017                                 cv_broadcast(&ulp->ul_cv);
6018                         return (vmpss ? EIO : EINVAL);
6019                 }
6020         }
6021
6022         if (dolock) {
6023                 /*
6024                  * segvn may call fop_pageio() instead of fop_getpage() to
6025                  * handle a fault against a segment that maps vnode pages with
6026                  * large mappings.  Segvn creates pages and holds them locked
6027                  * SE_EXCL during fop_pageio() call. In this case we have to
6028                  * use rw_tryenter() to avoid a potential deadlock since in
6029                  * lock order i_contents needs to be taken first.
6030                  * Segvn will retry via fop_getpage() if fop_pageio() fails.
6031                  */
6032                 if (!vmpss) {
6033                         rw_enter(&ip->i_contents, RW_READER);
6034                 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) {
6035                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6036                                 cv_broadcast(&ulp->ul_cv);
6037                         return (EDEADLK);
6038                 }
6039         }
6040
6041         /*
6042          * Return an error to segvn because the pagefault request is beyond
6043          * PAGESIZE rounded EOF.
6044          */
6045         if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) {
6046                 if (dolock)
6047                         rw_exit(&ip->i_contents);
6048                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6049                         cv_broadcast(&ulp->ul_cv);
6050                 return (EFAULT);
6051         }
6052
6053         if (pp == NULL) {
6054                 if (bmap_has_holes(ip)) {
6055                         err = ENOSYS;
6056                 } else {
6057                         err = EINVAL;
6058                 }
6059                 if (dolock)
6060                         rw_exit(&ip->i_contents);
6061                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6062                         cv_broadcast(&ulp->ul_cv);
6063                 return (err);
6064         }
6065
6066         /*
6067          * Break the io request into chunks, one for each contiguous
6068          * stretch of disk blocks in the target file.
6069          */
6070         while (done_len < io_len) {
6071                 ASSERT(cpp);
6072                 contig = 0;
6073                 if (err = bmap_read(ip, (uoff_t)(io_off + done_len),
6074                     &bn, &contig))
6075                         break;
6076
6077                 if (bn == UFS_HOLE) {   /* No holey swapfiles */
6078                         if (vmpss) {
6079                                 err = EFAULT;
6080                                 break;
6081                         }
6082                         err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE");
6083                         break;
6084                 }
6085
6086                 cur_len = MIN(io_len - done_len, contig);
6087                 /*
6088                  * Zero out a page beyond EOF, when the last block of
6089                  * a file is a UFS fragment so that ufs_pageio() can be used
6090                  * instead of ufs_getpage() to handle faults against
6091                  * segvn segments that use large pages.
6092                  */
6093                 page_list_break(&cpp, &npp, btopr(cur_len));
6094                 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) {
6095                         size_t xlen = cur_len & PAGEOFFSET;
6096                         pagezero(cpp->p_prev, xlen, PAGESIZE - xlen);
6097                 }
6098
6099                 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
6100                 ASSERT(bp != NULL);
6101
6102                 bp->b_edev = ip->i_dev;
6103                 bp->b_dev = cmpdev(ip->i_dev);
6104                 bp->b_blkno = bn;
6105                 bp->b_un.b_addr = (caddr_t)0;
6106                 bp->b_file = ip->i_vnode;
6107
6108                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
6109                 ub.ub_pageios.value.ul++;
6110                 if (ufsvfsp->vfs_snapshot)
6111                         fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp);
6112                 else
6113                         (void) bdev_strategy(bp);
6114
6115                 if (flags & B_READ)
6116                         ufs_pageio_reads++;
6117                 else
6118                         ufs_pageio_writes++;
6119                 if (flags & B_READ)
6120                         lwp_stat_update(LWP_STAT_INBLK, 1);
6121                 else
6122                         lwp_stat_update(LWP_STAT_OUBLK, 1);
6123                 /*
6124                  * If the request is not B_ASYNC, wait for i/o to complete
6125                  * and re-assemble the page list to return to the caller.
6126                  * If it is B_ASYNC we leave the page list in pieces and
6127                  * cleanup() will dispose of them.
6128                  */
6129                 if ((flags & B_ASYNC) == 0) {
6130                         err = biowait(bp);
6131                         pageio_done(bp);
6132                         if (err)
6133                                 break;
6134                         page_list_concat(&opp, &cpp);
6135                 }
6136                 cpp = npp;
6137                 npp = NULL;
6138                 if (flags & B_READ)
6139                         cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t);
6140                 done_len += cur_len;
6141         }
6142         ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len));
6143         if (err) {
6144                 if (flags & B_ASYNC) {
6145                         /* Cleanup unprocessed parts of list */
6146                         page_list_concat(&cpp, &npp);
6147                         if (flags & B_READ)
6148                                 pvn_read_done(cpp, B_ERROR);
6149                         else
6150                                 pvn_write_done(cpp, B_ERROR);
6151                 } else {
6152                         /* Re-assemble list and let caller clean up */
6153                         page_list_concat(&opp, &cpp);
6154                         page_list_concat(&opp, &npp);
6155                 }
6156         }
6157
6158         if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) &&
6159             ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) {
6160                 mutex_enter(&ip->i_tlock);
6161                 ip->i_flag |= IACC;
6162                 ITIMES_NOLOCK(ip);
6163                 mutex_exit(&ip->i_tlock);
6164         }
6165
6166         if (dolock)
6167                 rw_exit(&ip->i_contents);
6168         if (vmpss && !atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6169                 cv_broadcast(&ulp->ul_cv);
6170         return (err);
6171 }
6172
6173 /*
6174  * Called when the kernel is in a frozen state to dump data
6175  * directly to the device. It uses a private dump data structure,
6176  * set up by dump_ctl, to locate the correct disk block to which to dump.
6177  */
6178 /*ARGSUSED*/
6179 static int
6180 ufs_dump(vnode_t *vp, caddr_t addr, offset_t ldbn, offset_t dblks,
6181     caller_context_t *ct)
6182 {
6183         uoff_t  file_size;
6184         struct inode    *ip = VTOI(vp);
6185         struct fs       *fs = ip->i_fs;
6186         daddr_t         dbn, lfsbn;
6187         int             disk_blks = fs->fs_bsize >> DEV_BSHIFT;
6188         int             error = 0;
6189         int             ndbs, nfsbs;
6190
6191         /*
6192          * forced unmount case
6193          */
6194         if (ip->i_ufsvfs == NULL)
6195                 return (EIO);
6196         /*
6197          * Validate the inode that it has not been modified since
6198          * the dump structure is allocated.
6199          */
6200         mutex_enter(&ip->i_tlock);
6201         if ((dump_info == NULL) ||
6202             (dump_info->ip != ip) ||
6203             (dump_info->time.tv_sec != ip->i_mtime.tv_sec) ||
6204             (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) {
6205                 mutex_exit(&ip->i_tlock);
6206                 return (-1);
6207         }
6208         mutex_exit(&ip->i_tlock);
6209
6210         /*
6211          * See that the file has room for this write
6212          */
6213         UFS_GET_ISIZE(&file_size, ip);
6214
6215         if (ldbtob(ldbn + dblks) > file_size)
6216                 return (ENOSPC);
6217
6218         /*
6219          * Find the physical disk block numbers from the dump
6220          * private data structure directly and write out the data
6221          * in contiguous block lumps
6222          */
6223         while (dblks > 0 && !error) {
6224                 lfsbn = (daddr_t)lblkno(fs, ldbtob(ldbn));
6225                 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks;
6226                 nfsbs = 1;
6227                 ndbs = disk_blks - ldbn % disk_blks;
6228                 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn +
6229                     nfsbs]) == dbn + ndbs) {
6230                         nfsbs++;
6231                         ndbs += disk_blks;
6232                 }
6233                 if (ndbs > dblks)
6234                         ndbs = dblks;
6235                 error = bdev_dump(ip->i_dev, addr, dbn, ndbs);
6236                 addr += ldbtob((offset_t)ndbs);
6237                 dblks -= ndbs;
6238                 ldbn += ndbs;
6239         }
6240         return (error);
6241
6242 }
6243
6244 /*
6245  * Prepare the file system before and after the dump operation.
6246  *
6247  * action = DUMP_ALLOC:
6248  * Preparation before dump, allocate dump private data structure
6249  * to hold all the direct and indirect block info for dump.
6250  *
6251  * action = DUMP_FREE:
6252  * Clean up after dump, deallocate the dump private data structure.
6253  *
6254  * action = DUMP_SCAN:
6255  * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space;
6256  * if found, the starting file-relative DEV_BSIZE lbn is written
6257  * to *bklp; that lbn is intended for use with fop_dump()
6258  */
6259 /*ARGSUSED*/
6260 static int
6261 ufs_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
6262 {
6263         struct inode    *ip = VTOI(vp);
6264         ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
6265         struct fs       *fs;
6266         daddr32_t       *dblk, *storeblk;
6267         daddr32_t       *nextblk, *endblk;
6268         struct buf      *bp;
6269         int             i, entry, entries;
6270         int             n, ncontig;
6271
6272         /*
6273          * check for forced unmount
6274          */
6275         if (ufsvfsp == NULL)
6276                 return (EIO);
6277
6278         if (action == DUMP_ALLOC) {
6279                 /*
6280                  * alloc and record dump_info
6281                  */
6282                 if (dump_info != NULL)
6283                         return (EINVAL);
6284
6285                 ASSERT(vp->v_type == VREG);
6286                 fs = ufsvfsp->vfs_fs;
6287
6288                 rw_enter(&ip->i_contents, RW_READER);
6289
6290                 if (bmap_has_holes(ip)) {
6291                         rw_exit(&ip->i_contents);
6292                         return (EFAULT);
6293                 }
6294
6295                 /*
6296                  * calculate and allocate space needed according to i_size
6297                  */
6298                 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
6299                 dump_info = kmem_alloc(sizeof (struct dump) +
6300                     (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP);
6301                 if (dump_info == NULL) {
6302                         rw_exit(&ip->i_contents);
6303                         return (ENOMEM);
6304                 }
6305
6306                 /* Start saving the info */
6307                 dump_info->fsbs = entries;
6308                 dump_info->ip = ip;
6309                 storeblk = &dump_info->dblk[0];
6310
6311                 /* Direct Blocks */
6312                 for (entry = 0; entry < NDADDR && entry < entries; entry++)
6313                         *storeblk++ = ip->i_db[entry];
6314
6315                 /* Indirect Blocks */
6316                 for (i = 0; i < NIADDR; i++) {
6317                         int error = 0;
6318
6319                         bp = UFS_BREAD(ufsvfsp,
6320                             ip->i_dev, fsbtodb(fs, ip->i_ib[i]), fs->fs_bsize);
6321                         if (bp->b_flags & B_ERROR)
6322                                 error = EIO;
6323                         else {
6324                                 dblk = bp->b_un.b_daddr;
6325                                 if ((storeblk = save_dblks(ip, ufsvfsp,
6326                                     storeblk, dblk, i, entries)) == NULL)
6327                                         error = EIO;
6328                         }
6329
6330                         brelse(bp);
6331
6332                         if (error != 0) {
6333                                 kmem_free(dump_info, sizeof (struct dump) +
6334                                     (entries - 1) * sizeof (daddr32_t));
6335                                 rw_exit(&ip->i_contents);
6336                                 dump_info = NULL;
6337                                 return (error);
6338                         }
6339                 }
6340                 /* and time stamp the information */
6341                 mutex_enter(&ip->i_tlock);
6342                 dump_info->time = ip->i_mtime;
6343                 mutex_exit(&ip->i_tlock);
6344
6345                 rw_exit(&ip->i_contents);
6346         } else if (action == DUMP_FREE) {
6347                 /*
6348                  * free dump_info
6349                  */
6350                 if (dump_info == NULL)
6351                         return (EINVAL);
6352                 entries = dump_info->fsbs - 1;
6353                 kmem_free(dump_info, sizeof (struct dump) +
6354                     entries * sizeof (daddr32_t));
6355                 dump_info = NULL;
6356         } else if (action == DUMP_SCAN) {
6357                 /*
6358                  * scan dump_info
6359                  */
6360                 if (dump_info == NULL)
6361                         return (EINVAL);
6362
6363                 dblk = dump_info->dblk;
6364                 nextblk = dblk + 1;
6365                 endblk = dblk + dump_info->fsbs - 1;
6366                 fs = ufsvfsp->vfs_fs;
6367                 ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);
6368
6369                 /*
6370                  * scan dblk[] entries; contig fs space is found when:
6371                  * ((current blkno + frags per block) == next blkno)
6372                  */
6373                 n = 0;
6374                 while (n < ncontig && dblk < endblk) {
6375                         if ((*dblk + fs->fs_frag) == *nextblk)
6376                                 n++;
6377                         else
6378                                 n = 0;
6379                         dblk++;
6380                         nextblk++;
6381                 }
6382
6383                 /*
6384                  * index is where size bytes of contig space begins;
6385                  * conversion from index to the file's DEV_BSIZE lbn
6386                  * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
6387                  */
6388                 if (n == ncontig) {
6389                         i = (dblk - dump_info->dblk) - ncontig;
6390                         *blkp = i << (fs->fs_bshift - DEV_BSHIFT);
6391                 } else
6392                         return (EFAULT);
6393         }
6394         return (0);
6395 }
6396
6397 /*
6398  * Recursive helper function for ufs_dumpctl().  It follows the indirect file
6399  * system  blocks until it reaches the the disk block addresses, which are
6400  * then stored into the given buffer, storeblk.
6401  */
6402 static daddr32_t *
6403 save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp,  daddr32_t *storeblk,
6404     daddr32_t *dblk, int level, int entries)
6405 {
6406         struct fs       *fs = ufsvfsp->vfs_fs;
6407         struct buf      *bp;
6408         int             i;
6409
6410         if (level == 0) {
6411                 for (i = 0; i < NINDIR(fs); i++) {
6412                         if (storeblk - dump_info->dblk >= entries)
6413                                 break;
6414                         *storeblk++ = dblk[i];
6415                 }
6416                 return (storeblk);
6417         }
6418         for (i = 0; i < NINDIR(fs); i++) {
6419                 if (storeblk - dump_info->dblk >= entries)
6420                         break;
6421                 bp = UFS_BREAD(ufsvfsp,
6422                     ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
6423                 if (bp->b_flags & B_ERROR) {
6424                         brelse(bp);
6425                         return (NULL);
6426                 }
6427                 storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
6428                     level - 1, entries);
6429                 brelse(bp);
6430
6431                 if (storeblk == NULL)
6432                         return (NULL);
6433         }
6434         return (storeblk);
6435 }
6436
6437 /* ARGSUSED */
6438 static int
6439 ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
6440         struct cred *cr, caller_context_t *ct)
6441 {
6442         struct inode    *ip = VTOI(vp);
6443         struct ulockfs  *ulp;
6444         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
6445         ulong_t         vsa_mask = vsap->vsa_mask;
6446         int             err = EINVAL;
6447
6448         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6449
6450         /*
6451          * Only grab locks if needed - they're not needed to check vsa_mask
6452          * or if the mask contains no acl flags.
6453          */
6454         if (vsa_mask != 0) {
6455                 if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
6456                     ULOCKFS_GETATTR_MASK))
6457                         return (err);
6458
6459                 rw_enter(&ip->i_contents, RW_READER);
6460                 err = ufs_acl_get(ip, vsap, flag, cr);
6461                 rw_exit(&ip->i_contents);
6462
6463                 if (ulp)
6464                         ufs_lockfs_end(ulp);
6465         }
6466         return (err);
6467 }
6468
6469 /* ARGSUSED */
6470 static int
6471 ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr,
6472         caller_context_t *ct)
6473 {
6474         struct inode    *ip = VTOI(vp);
6475         struct ulockfs  *ulp = NULL;
6476         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
6477         ulong_t         vsa_mask = vsap->vsa_mask;
6478         int             err;
6479         int             haverwlock = 1;
6480         int             trans_size;
6481         int             donetrans = 0;
6482         int             retry = 1;
6483
6484         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
6485
6486         /* Abort now if the request is either empty or invalid. */
6487         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6488         if ((vsa_mask == 0) ||
6489             ((vsap->vsa_aclentp == NULL) &&
6490             (vsap->vsa_dfaclentp == NULL))) {
6491                 err = EINVAL;
6492                 goto out;
6493         }
6494
6495         /*
6496          * Following convention, if this is a directory then we acquire the
6497          * inode's i_rwlock after starting a UFS logging transaction;
6498          * otherwise, we acquire it beforehand. Since we were called (and
6499          * must therefore return) with the lock held, we will have to drop it,
6500          * and later reacquire it, if operating on a directory.
6501          */
6502         if (vp->v_type == VDIR) {
6503                 rw_exit(&ip->i_rwlock);
6504                 haverwlock = 0;
6505         } else {
6506                 /* Upgrade the lock if required. */
6507                 if (!rw_write_held(&ip->i_rwlock)) {
6508                         rw_exit(&ip->i_rwlock);
6509                         rw_enter(&ip->i_rwlock, RW_WRITER);
6510                 }
6511         }
6512
6513 again:
6514         ASSERT(!(vp->v_type == VDIR && haverwlock));
6515         if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
6516                 ulp = NULL;
6517                 retry = 0;
6518                 goto out;
6519         }
6520
6521         /*
6522          * Check that the file system supports this operation. Note that
6523          * ufs_lockfs_begin() will have checked that the file system had
6524          * not been forcibly unmounted.
6525          */
6526         if (ufsvfsp->vfs_fs->fs_ronly) {
6527                 err = EROFS;
6528                 goto out;
6529         }
6530         if (ufsvfsp->vfs_nosetsec) {
6531                 err = ENOSYS;
6532                 goto out;
6533         }
6534
6535         if (ulp) {
6536                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
6537                     trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
6538                 donetrans = 1;
6539         }
6540
6541         if (vp->v_type == VDIR) {
6542                 rw_enter(&ip->i_rwlock, RW_WRITER);
6543                 haverwlock = 1;
6544         }
6545
6546         ASSERT(haverwlock);
6547
6548         /* Do the actual work. */
6549         rw_enter(&ip->i_contents, RW_WRITER);
6550         /*
6551          * Suppress out of inodes messages if we will retry.
6552          */
6553         if (retry)
6554                 ip->i_flag |= IQUIET;
6555         err = ufs_acl_set(ip, vsap, flag, cr);
6556         ip->i_flag &= ~IQUIET;
6557         rw_exit(&ip->i_contents);
6558
6559 out:
6560         if (ulp) {
6561                 if (donetrans) {
6562                         /*
6563                          * top_end_async() can eventually call
6564                          * top_end_sync(), which can block. We must
6565                          * therefore observe the lock-ordering protocol
6566                          * here as well.
6567                          */
6568                         if (vp->v_type == VDIR) {
6569                                 rw_exit(&ip->i_rwlock);
6570                                 haverwlock = 0;
6571                         }
6572                         TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
6573                 }
6574                 ufs_lockfs_end(ulp);
6575         }
6576         /*
6577          * If no inodes available, try scaring a logically-
6578          * free one out of the delete queue to someplace
6579          * that we can find it.
6580          */
6581         if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
6582                 ufs_delete_drain_wait(ufsvfsp, 1);
6583                 retry = 0;
6584                 if (vp->v_type == VDIR && haverwlock) {
6585                         rw_exit(&ip->i_rwlock);
6586                         haverwlock = 0;
6587                 }
6588                 goto again;
6589         }
6590         /*
6591          * If we need to reacquire the lock then it is safe to do so
6592          * as a reader. This is because ufs_rwunlock(), which will be
6593          * called by our caller after we return, does not differentiate
6594          * between shared and exclusive locks.
6595          */
6596         if (!haverwlock) {
6597                 ASSERT(vp->v_type == VDIR);
6598                 rw_enter(&ip->i_rwlock, RW_READER);
6599         }
6600
6601         return (err);
6602 }
6603
6604 /*
6605  * Locate the vnode to be used for an event notification. As this will
6606  * be called prior to the name space change perform basic verification
6607  * that the change will be allowed.
6608  */
6609
6610 static int
6611 ufs_eventlookup(struct vnode *dvp, char *nm, struct cred *cr,
6612     struct vnode **vpp)
6613 {
6614         int     namlen;
6615         int     error;
6616         struct vnode    *vp;
6617         struct inode    *ip;
6618         struct inode    *xip;
6619         struct ufsvfs   *ufsvfsp;
6620         struct ulockfs  *ulp;
6621
6622         ip = VTOI(dvp);
6623         *vpp = NULL;
6624
6625         if ((namlen = strlen(nm)) == 0)
6626                 return (EINVAL);
6627
6628         if (nm[0] == '.') {
6629                 if (namlen == 1)
6630                         return (EINVAL);
6631                 else if ((namlen == 2) && nm[1] == '.') {
6632                         return (EEXIST);
6633                 }
6634         }
6635
6636         /*
6637          * Check accessibility and write access of parent directory as we
6638          * only want to post the event if we're able to make a change.
6639          */
6640         if (error = ufs_diraccess(ip, IEXEC|IWRITE, cr))
6641                 return (error);
6642
6643         if (vp = dnlc_lookup(dvp, nm)) {
6644                 if (vp == DNLC_NO_VNODE) {
6645                         VN_RELE(vp);
6646                         return (ENOENT);
6647                 }
6648
6649                 *vpp = vp;
6650                 return (0);
6651         }
6652
6653         /*
6654          * Keep the idle queue from getting too long by idling two
6655          * inodes before attempting to allocate another.
6656          * This operation must be performed before entering lockfs
6657          * or a transaction.
6658          */
6659         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
6660                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
6661                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
6662                         ufs_idle_some(ufs_lookup_idle_count);
6663                 }
6664
6665         ufsvfsp = ip->i_ufsvfs;
6666
6667 retry_lookup:
6668         if (error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK))
6669                 return (error);
6670
6671         if ((error = ufs_dirlook(ip, nm, &xip, cr, 1, 1)) == 0) {
6672                 vp = ITOV(xip);
6673                 *vpp = vp;
6674         }
6675
6676         if (ulp) {
6677                 ufs_lockfs_end(ulp);
6678         }
6679
6680         if (error == EAGAIN)
6681                 goto retry_lookup;
6682
6683         return (error);
6684 }