kernel/fs/ufs/ufs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2018 Joyent, Inc.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  29 /*        All Rights Reserved   */
  30
  31 /*
  32  * Portions of this source code were derived from Berkeley 4.3 BSD
  33  * under license from the Regents of the University of California.
  34  */
  35
  36 #include <sys/types.h>
  37 #include <sys/t_lock.h>
  38 #include <sys/ksynch.h>
  39 #include <sys/param.h>
  40 #include <sys/time.h>
  41 #include <sys/systm.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/resource.h>
  44 #include <sys/signal.h>
  45 #include <sys/cred.h>
  46 #include <sys/user.h>
  47 #include <sys/buf.h>
  48 #include <sys/vfs.h>
  49 #include <sys/vnode.h>
  50 #include <sys/proc.h>
  51 #include <sys/disp.h>
  52 #include <sys/file.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/flock.h>
  55 #include <sys/atomic.h>
  56 #include <sys/kmem.h>
  57 #include <sys/uio.h>
  58 #include <sys/dnlc.h>
  59 #include <sys/conf.h>
  60 #include <sys/mman.h>
  61 #include <sys/pathname.h>
  62 #include <sys/debug.h>
  63 #include <sys/vmsystm.h>
  64 #include <sys/cmn_err.h>
  65 #include <sys/filio.h>
  66 #include <sys/policy.h>
  67
  68 #include <sys/fs/ufs_fs.h>
  69 #include <sys/fs/ufs_lockfs.h>
  70 #include <sys/fs/ufs_filio.h>
  71 #include <sys/fs/ufs_inode.h>
  72 #include <sys/fs/ufs_fsdir.h>
  73 #include <sys/fs/ufs_quota.h>
  74 #include <sys/fs/ufs_log.h>
  75 #include <sys/fs/ufs_snap.h>
  76 #include <sys/fs/ufs_trans.h>
  77 #include <sys/fs/ufs_panic.h>
  78 #include <sys/fs/ufs_bio.h>
  79 #include <sys/dirent.h>         /* must be AFTER <sys/fs/fsdir.h>! */
  80 #include <sys/errno.h>
  81 #include <sys/fssnap_if.h>
  82 #include <sys/unistd.h>
  83 #include <sys/sunddi.h>
  84
  85 #include <sys/filio.h>          /* _FIOIO */
  86
  87 #include <vm/hat.h>
  88 #include <vm/page.h>
  89 #include <vm/pvn.h>
  90 #include <vm/as.h>
  91 #include <vm/seg.h>
  92 #include <vm/seg_map.h>
  93 #include <vm/seg_vn.h>
  94 #include <vm/seg_kmem.h>
  95 #include <vm/rm.h>
  96 #include <sys/swap.h>
  97
  98 #include <sys/fs_subr.h>
  99
 100 #include <sys/fs/decomp.h>
 101
 102 static struct instats ins;      /* ufs_close() bumps ins.in_poc */
 103
     /*
      * Forward declarations for the file-local routines.  Most of these are
      * installed as entry points in the ufs_vnodeops table below; a few
      * (ufs_getpage_ra, ufs_getpage_miss, ufs_putpages, save_dblks,
      * ufs_priv_access, ufs_eventlookup) do not appear in that table.
      */
 104 static  int ufs_getpage_ra(struct vnode *, uoff_t, struct seg *, caddr_t);
 105 static  int ufs_getpage_miss(struct vnode *, uoff_t, size_t, struct seg *,
 106                 caddr_t, struct page **, size_t, enum seg_rw, int);
 107 static  int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
 108 static  int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
 109                 caller_context_t *);
 110 static  int ufs_read(struct vnode *, struct uio *, int, struct cred *,
 111                 struct caller_context *);
 112 static  int ufs_write(struct vnode *, struct uio *, int, struct cred *,
 113                 struct caller_context *);
 114 static  int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
 115                 int *, caller_context_t *);
 116 static  int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
 117                 caller_context_t *);
 118 static  int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
 119                 caller_context_t *);
 120 static  int ufs_access(struct vnode *, int, int, struct cred *,
 121                 caller_context_t *);
 122 static  int ufs_lookup(struct vnode *, char *, struct vnode **,
 123                 struct pathname *, int, struct vnode *, struct cred *,
 124                 caller_context_t *, int *, pathname_t *);
 125 static  int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
 126                 int, struct vnode **, struct cred *, int,
 127                 caller_context_t *, vsecattr_t  *);
 128 static  int ufs_remove(struct vnode *, char *, struct cred *,
 129                 caller_context_t *, int);
 130 static  int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
 131                 caller_context_t *, int);
 132 static  int ufs_rename(struct vnode *, char *, struct vnode *, char *,
 133                 struct cred *, caller_context_t *, int);
 134 static  int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
 135                 struct cred *, caller_context_t *, int, vsecattr_t *);
 136 static  int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
 137                 caller_context_t *, int);
 138 static  int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
 139                 caller_context_t *, int);
 140 static  int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
 141                 struct cred *, caller_context_t *, int);
 142 static  int ufs_readlink(struct vnode *, struct uio *, struct cred *,
 143                 caller_context_t *);
 144 static  int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
 145 static  void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
 146 static  int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
 147 static  int ufs_rwlock(struct vnode *, int, caller_context_t *);
 148 static  void ufs_rwunlock(struct vnode *, int, caller_context_t *);
 149 static  int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
 150 static  int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
 151                 struct flk_callback *, struct cred *,
 152                 caller_context_t *);
 153 static  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
 154                 cred_t *, caller_context_t *);
 155 static  int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
 156                 struct page **, size_t, struct seg *, caddr_t,
 157                 enum seg_rw, struct cred *, caller_context_t *);
 158 static  int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
 159                 caller_context_t *);
 160 static  int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
 161 static  int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
 162                 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
 163 static  int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
 164                 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
 165 static  int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
 166                 uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
 167 static  int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
 168                 caller_context_t *);
 169 static  int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
 170     caller_context_t *);
 171 static  int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
 172                 caller_context_t *);
 173 static  int ufs_pageio(struct vnode *, struct page *, uoff_t, size_t, int,
 174                 struct cred *, caller_context_t *);
 175 static  int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
 176 static  daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
 177                 daddr32_t *, int, int);
 178 static  int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
 179                 caller_context_t *);
 180 static  int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
 181                 caller_context_t *);
 182 static  int ufs_priv_access(void *, int, struct cred *);
 183 static  int ufs_eventlookup(struct vnode *, char *, struct cred *,
 184     struct vnode **);
 185
 186 /*
      * Vnode operations vector for UFS: binds each vop_* slot to the
      * file-local ufs_* routine declared above.  vop_vnevent is the one
      * slot filled with a routine that is not declared in this file
      * (fs_vnevent_support).
      *
 187  * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
 188  *
 189  * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
 190  *
 191  * NOTE: "not blkd" below  means that the operation isn't blocked by lockfs
 192  */
 193 const struct vnodeops ufs_vnodeops = {
 194         .vnop_name = "ufs",
 195         .vop_open = ufs_open,   /* not blkd */
 196         .vop_close = ufs_close, /* not blkd */
 197         .vop_read = ufs_read,
 198         .vop_write = ufs_write,
 199         .vop_ioctl = ufs_ioctl,
 200         .vop_getattr = ufs_getattr,
 201         .vop_setattr = ufs_setattr,
 202         .vop_access = ufs_access,
 203         .vop_lookup = ufs_lookup,
 204         .vop_create = ufs_create,
 205         .vop_remove = ufs_remove,
 206         .vop_link = ufs_link,
 207         .vop_rename = ufs_rename,
 208         .vop_mkdir = ufs_mkdir,
 209         .vop_rmdir = ufs_rmdir,
 210         .vop_readdir = ufs_readdir,
 211         .vop_symlink = ufs_symlink,
 212         .vop_readlink = ufs_readlink,
 213         .vop_fsync = ufs_fsync,
 214         .vop_inactive = ufs_inactive, /* not blkd */
 215         .vop_fid = ufs_fid,
 216         .vop_rwlock = ufs_rwlock,       /* not blkd */
 217         .vop_rwunlock = ufs_rwunlock, /* not blkd */
 218         .vop_seek = ufs_seek,
 219         .vop_frlock = ufs_frlock,
 220         .vop_space = ufs_space,
 221         .vop_getpage = ufs_getpage,
 222         .vop_putpage = ufs_putpage,
 223         .vop_map = ufs_map,
 224         .vop_addmap = ufs_addmap,       /* not blkd */
 225         .vop_delmap = ufs_delmap,       /* not blkd */
 226         .vop_poll = ufs_poll,   /* not blkd */
 227         .vop_dump = ufs_dump,
 228         .vop_pathconf = ufs_l_pathconf,
 229         .vop_pageio = ufs_pageio,
 230         .vop_dumpctl = ufs_dumpctl,
 231         .vop_getsecattr = ufs_getsecattr,
 232         .vop_setsecattr = ufs_setsecattr,
 233         .vop_vnevent = fs_vnevent_support,
 234 };
 235
 236 #define MAX_BACKFILE_COUNT      9999
     /* NOTE(review): not referenced in the part of the file reviewed here. */
 237
 238 /*
 239  * Created by ufs_dumpctl() to store a file's disk block info into memory.
 240  * Used by ufs_dump() to dump data to disk directly.
      *
      * NOTE(review): dblk[1] is declared as a one-element place holder;
      * presumably the structure is over-allocated so that dblk[] can hold
      * more block entries -- confirm against ufs_dumpctl().
 241  */
 242 struct dump {
 243         struct inode    *ip;            /* the file we contain */
 244         daddr_t         fsbs;           /* number of blocks stored */
 245         struct timeval32 time;          /* time stamp for the struct */
 246         daddr32_t       dblk[1];        /* place holder for block info */
 247 };
 248
     /* The single file-wide dump record; initially NULL (nothing stored). */
 249 static struct dump *dump_info = NULL;
 250
 251 /*
      * ufs_open - open entry point (vop_open) for UFS vnodes.
      *
      * Always returns 0; none of the arguments (vpp, flag, cr, ct) is
      * examined.  The history below explains why nothing is done here.
      *
 252  * Previously there was no special action required for ordinary files.
 253  * (Devices are handled through the device file system.)
 254  * Now we support Large Files and Large File API requires open to
 255  * fail if file is large.
 256  * We could take care to prevent data corruption
 257  * by doing an atomic check of size and truncate if file is opened with
 258  * FTRUNC flag set but traditionally this is being done by the vfs/vnode
 259  * layers. So taking care of truncation here is a change in the existing
 260  * semantics of fop_open and therefore we chose not to implement anything
 261  * here. The check for the size of the file > 2GB is being done at the
 262  * vfs layer in routine vn_open().
 263  */
 264
 265 /* ARGSUSED */
 266 static int
 267 ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
 268 {
 269         return (0);
 270 }
 271
     /*
      * ufs_close - close entry point (vop_close) for UFS vnodes.
      *
      * Calls cleanlocks() and cleanshares() on vp for the current process
      * (ttoproc(curthread)->p_pid).  Then, on what is approximately the
      * last close, it hands the range recorded in i_delayoff/i_delaylen
      * to ufs_putpages() with B_ASYNC | B_FREE and clears i_delaylen.
      *
      * flag, count, offset and ct are unused.  Always returns 0; the
      * result of ufs_putpages() is explicitly discarded.
      */
 272 /*ARGSUSED*/
 273 static int
 274 ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
 275     struct cred *cr, caller_context_t *ct)
 276 {
 277         cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 278         cleanshares(vp, ttoproc(curthread)->p_pid);
 279
 280         /*
 281          * Push partially filled cluster at last close.
 282          * ``last close'' is approximated because the dnlc
 283          * may have a hold on the vnode.
 284          * Checking for VBAD here will also act as a forced umount check.
 285          */
 286         if (vp->v_count <= 2 && vp->v_type != VBAD) {
 287                 struct inode *ip = VTOI(vp);
 288                 if (ip->i_delaylen) {
 289                         ins.in_poc.value.ul++;  /* count this push */
 290                         (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
 291                             B_ASYNC | B_FREE, cr);
 292                         ip->i_delaylen = 0;
 293                 }
 294         }
 295
 296         return (0);
 297 }
 298
     /*
      * ufs_read - read entry point (vop_read) for UFS vnodes.
      *
      *   vp     - vnode to read from
      *   uiop   - describes the transfer; handed to rdip()
      *   ioflag - FRSYNC together with FSYNC or FDSYNC requests a
      *            TOP_READ_SYNC transaction when TRANS_ISTRANS() is true
      *   cr     - caller's credentials, handed to rdip()
      *   ct     - caller context, handed to chklock()
      *
      * The caller must hold ip->i_rwlock as reader (asserted below).
      * Returns 0, or the error produced by chklock(), ufs_lockfs_begin()
      * or rdip(); TRANS_END_SYNC() is also handed the address of error.
      */
 299 /*ARGSUSED*/
 300 static int
 301 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
 302     struct caller_context *ct)
 303 {
 304         struct inode *ip = VTOI(vp);
 305         struct ufsvfs *ufsvfsp;
 306         struct ulockfs *ulp = NULL;
 307         int error = 0;
 308         int intrans = 0;        /* set once TRANS_BEGIN_SYNC has run */
 309
 310         ASSERT(RW_READ_HELD(&ip->i_rwlock));
 311
 312         /*
 313          * Mandatory locking needs to be done before ufs_lockfs_begin()
 314          * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
 315          */
 316         if (MANDLOCK(vp, ip->i_mode)) {
 317                 /*
 318                  * ufs_getattr ends up being called by chklock
 319                  */
 320                 error = chklock(vp, FREAD, uiop->uio_loffset,
 321                     uiop->uio_resid, uiop->uio_fmode, ct);
 322                 if (error)
 323                         goto out;
 324         }
 325
 326         ufsvfsp = ip->i_ufsvfs;
 327         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
 328         if (error)
 329                 goto out;
 330
 331         /*
 332          * A directory may be opened for reading as a file (eg "cat .")
 333          * with the O_RSYNC, O_SYNC and O_DSYNC flags set.  For that case
 334          * the locking order had to be changed to avoid a deadlock with
 335          * an update taking place on that directory at the same time:
              * the read is done first, and i_rwlock is dropped around the
              * sync transaction that follows it.
 336          */
 337         if ((ip->i_mode & IFMT) == IFDIR) {
 338
 339                 rw_enter(&ip->i_contents, RW_READER);
 340                 error = rdip(ip, uiop, ioflag, cr);
 341                 rw_exit(&ip->i_contents);
 342
 343                 if (error) {
 344                         if (ulp)
 345                                 ufs_lockfs_end(ulp);
 346                         goto out;
 347                 }
 348
 349                 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 350                     TRANS_ISTRANS(ufsvfsp)) {
                             /*
                              * Nothing is done between BEGIN and END; the
                              * reader hold on i_rwlock is released first and
                              * retaken afterwards so the caller still sees
                              * the lock held on return.
                              */
 351                         rw_exit(&ip->i_rwlock);
 352                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC,
 353                                          TOP_READ_SIZE, &error);
 354                         ASSERT(!error);
 355                         TRANS_END_SYNC(ufsvfsp, &error, TOP_READ_SYNC,
 356                                        TOP_READ_SIZE);
 357                         rw_enter(&ip->i_rwlock, RW_READER);
 358                 }
 359         } else {
 360                 /*
 361                  * Only transact reads to files opened for sync-read and
 362                  * sync-write on a file system that is not write locked.
 363                  *
 364                  * The ``not write locked'' check prevents problems with
 365                  * enabling/disabling logging on a busy file system.  E.g.,
 366                  * logging exists at the beginning of the read but does not
 367                  * at the end.
 368                  *
 369                  */
 370                 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 371                     TRANS_ISTRANS(ufsvfsp)) {
 372                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC,
 373                                          TOP_READ_SIZE, &error);
 374                         ASSERT(!error);
 375                         intrans = 1;
 376                 }
 377
 378                 rw_enter(&ip->i_contents, RW_READER);
 379                 error = rdip(ip, uiop, ioflag, cr);
 380                 rw_exit(&ip->i_contents);
 381
                     /* Close the transaction only if one was opened above. */
 382                 if (intrans) {
 383                         TRANS_END_SYNC(ufsvfsp, &error, TOP_READ_SYNC,
 384                                        TOP_READ_SIZE);
 385                 }
 386         }
 387
 388         if (ulp) {
 389                 ufs_lockfs_end(ulp);
 390         }
 391 out:
 392
 393         return (error);
 394 }
 395
 396 extern  int     ufs_HW;         /* high water mark; ufs_write() waits above it */
 397 extern  int     ufs_LW;         /* low water mark */
 398 int     ufs_WRITES = 1;         /* XXX - nonzero enables ufs_write() throttling */
 399 int     ufs_throttles = 0;      /* throttling count; bumped per wait in ufs_write() */
 400 int     ufs_allow_shared_writes = 1;    /* directio shared writes; see ufs_check_rewrite() */
 401
     /*
      * ufs_check_rewrite - decide whether a write request is suitable as a
      * concurrent rewrite of existing file data.
      *
      *   ip     - inode being written
      *   uiop   - write request; only uio_loffset and uio_resid are read
      *   ioflag - write flags; FAPPEND, FSYNC and FDSYNC are examined
      *
      * Returns nonzero only when every condition of the filter below holds,
      * otherwise 0.  The code shown here only reads ip and uiop.
      */
 402 static int
 403 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
 404 {
 405         int     shared_write;
 406
 407         /*
 408          * If the FDSYNC flag is set then ignore the global
 409          * ufs_allow_shared_writes in this case.
              * (shared_write is nonzero if either one is nonzero.)
 410          */
 411         shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;
 412
 413         /*
 414          * Filter to determine if this request is suitable as a
 415          * concurrent rewrite. This write must not allocate blocks
 416          * by extending the file or filling in holes. No use trying
 417          * through FSYNC descriptors as the inode will be synchronously
 418          * updated after the write. The uio structure has not yet been
 419          * checked for sanity, so assume nothing.
              *
              * In order: regular file, not an append, non-negative offset
              * inside the file, non-empty transfer that ends at or before
              * i_size, not FSYNC, no holes, and shared writes permitted.
 420          */
 421         return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
 422             (uiop->uio_loffset >= 0) &&
 423             (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
 424             ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
 425             !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
 426             shared_write);
 427 }
 428
     /*
      * ufs_write - write entry point (vop_write) for UFS vnodes.
      *
      *   vp     - vnode to write to
      *   uiop   - describes the transfer; uio_loffset is reset to i_size for
      *            FAPPEND writes to regular files
      *   ioflag - FAPPEND, FSYNC and FDSYNC are examined here
      *   cr     - caller's credentials
      *   ct     - caller context, handed to chklock()
      *
      * The caller must hold ip->i_rwlock (asserted below).  If it is held
      * as reader and the directio rewrite fast path does not complete the
      * request, the lock is upgraded to writer here, dropping and
      * re-acquiring it when rw_tryupgrade() fails.
      *
      * Returns 0 or an errno value.  An ENOSPC failure is retried once when
      * TRANS_ISTRANS() is true, after waiting for pending deletes, and an
      * ENOSPC that follows a partial transfer is reported as success.
      */
 429 /*ARGSUSED*/
 430 static int
 431 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
 432     caller_context_t *ct)
 433 {
 434         struct inode *ip = VTOI(vp);
 435         struct ufsvfs *ufsvfsp;
 436         struct ulockfs *ulp;
 437         int retry = 1;          /* cleared after the single ENOSPC retry */
 438         int error, resv, resid = 0;
 439         int directio_status;
 440         int exclusive;
 441         int rewriteflg;
 442         long start_resid = uiop->uio_resid;
 443
 444         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
 445
 446 retry_mandlock:
 447         /*
 448          * Mandatory locking needs to be done before ufs_lockfs_begin()
 449          * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
 450          * Check for forced unmounts normally done in ufs_lockfs_begin().
 451          */
 452         if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
 453                 error = EIO;
 454                 goto out;
 455         }
 456         if (MANDLOCK(vp, ip->i_mode)) {
 457
 458                 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
 459
 460                 /*
 461                  * ufs_getattr ends up being called by chklock
 462                  */
 463                 error = chklock(vp, FWRITE, uiop->uio_loffset,
 464                     uiop->uio_resid, uiop->uio_fmode, ct);
 465                 if (error)
 466                         goto out;
 467         }
 468
 469         /* i_rwlock can change in chklock */
 470         exclusive = rw_write_held(&ip->i_rwlock);
 471         rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);
 472
 473         /*
 474          * Check for fast-path special case of directio re-writes.
              * It is attempted only while i_rwlock is not held as writer.
 475          */
 476         if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
 477             !exclusive && rewriteflg) {
 478
 479                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 480                 if (error)
 481                         goto out;
 482
 483                 rw_enter(&ip->i_contents, RW_READER);
 484                 error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
 485                     &directio_status);
 486                 if (directio_status == DIRECTIO_SUCCESS) {
 487                         uint_t i_flag_save;
 488
                             /* Any progress at all is reported as success. */
 489                         if (start_resid != uiop->uio_resid)
 490                                 error = 0;
 491                         /*
 492                          * Special treatment of access times for re-writes.
 493                          * If IMOD is not already set, then convert it
 494                          * to IMODACC for this operation. This defers
 495                          * entering a delta into the log until the inode
 496                          * is flushed. This mimics what is done for read
 497                          * operations and inode access time.
 498                          */
 499                         mutex_enter(&ip->i_tlock);
 500                         i_flag_save = ip->i_flag;
 501                         ip->i_flag |= IUPD | ICHG;
 502                         ip->i_seq++;
 503                         ITIMES_NOLOCK(ip);
 504                         if ((i_flag_save & IMOD) == 0) {
 505                                 ip->i_flag &= ~IMOD;
 506                                 ip->i_flag |= IMODACC;
 507                         }
 508                         mutex_exit(&ip->i_tlock);
 509                         rw_exit(&ip->i_contents);
 510                         if (ulp)
 511                                 ufs_lockfs_end(ulp);
 512                         goto out;
 513                 }
                     /* Directio did not handle it; fall through to the slow path. */
 514                 rw_exit(&ip->i_contents);
 515                 if (ulp)
 516                         ufs_lockfs_end(ulp);
 517         }
 518
 519         if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
 520                 rw_exit(&ip->i_rwlock);
 521                 rw_enter(&ip->i_rwlock, RW_WRITER);
 522                 /*
 523                  * Mandatory locking could have been enabled
 524                  * after dropping the i_rwlock.
 525                  */
 526                 if (MANDLOCK(vp, ip->i_mode))
 527                         goto retry_mandlock;
 528         }
 529
 530         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 531         if (error)
 532                 goto out;
 533
 534         /*
 535          * Amount of log space needed for this write
              * (skipped only for an FDSYNC rewrite, which opens no
              * transaction below).
 536          */
 537         if (!rewriteflg || !(ioflag & FDSYNC))
 538                 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
 539
 540         /*
 541          * Throttle writes.
              * The first test is made without i_tlock; the loop re-checks
              * i_writes under i_tlock before each cv_wait().
 542          */
 543         if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
 544                 mutex_enter(&ip->i_tlock);
 545                 while (ip->i_writes > ufs_HW) {
 546                         ufs_throttles++;
 547                         cv_wait(&ip->i_wrcv, &ip->i_tlock);
 548                 }
 549                 mutex_exit(&ip->i_tlock);
 550         }
 551
 552         /*
 553          * Enter Transaction
 554          *
 555          * If the write is a rewrite there is no need to open a transaction
 556          * if the FDSYNC flag is set and not the FSYNC.  In this case just
 557          * set the IMODACC flag so that the update is done at a later time,
 558          * thus avoiding the overhead of the logging transaction that is
 559          * not required.
 560          */
 561         if (ioflag & (FSYNC|FDSYNC)) {
 562                 if (ulp) {
 563                         if (rewriteflg) {
 564                                 uint_t i_flag_save;
 565
 566                                 rw_enter(&ip->i_contents, RW_READER);
 567                                 mutex_enter(&ip->i_tlock);
 568                                 i_flag_save = ip->i_flag;
 569                                 ip->i_flag |= IUPD | ICHG;
 570                                 ip->i_seq++;
 571                                 ITIMES_NOLOCK(ip);
 572                                 if ((i_flag_save & IMOD) == 0) {
 573                                         ip->i_flag &= ~IMOD;
 574                                         ip->i_flag |= IMODACC;
 575                                 }
 576                                 mutex_exit(&ip->i_tlock);
 577                                 rw_exit(&ip->i_contents);
 578                         } else {
 579                                 int terr = 0;
 580                                 TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC,
 581                                                  resv, &terr);
 582                                 ASSERT(!terr);
 583                         }
 584                 }
 585         } else {
 586                 if (ulp)
 587                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
 588         }
 589
 590         /*
 591          * Write the file
 592          */
 593         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
 594         rw_enter(&ip->i_contents, RW_WRITER);
 595         if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
 596                 /*
 597                  * In append mode start at end of file.
 598                  */
 599                 uiop->uio_loffset = ip->i_size;
 600         }
 601
 602         /*
 603          * Mild optimisation, don't call ufs_trans_write() unless we have to
 604          * Also, suppress file system full messages if we will retry.
 605          */
 606         if (retry)
 607                 ip->i_flag |= IQUIET;
 608         if (resid) {
 609                 TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
 610         } else {
 611                 error = wrip(ip, uiop, ioflag, cr);
 612         }
 613         ip->i_flag &= ~IQUIET;
 614
 615         rw_exit(&ip->i_contents);
 616         rw_exit(&ufsvfsp->vfs_dqrwlock);
 617
 618         /*
 619          * Leave Transaction
              * (mirrors the Enter Transaction cases above; a sync rewrite
              * opened no transaction, so there is nothing to end for it).
 620          */
 621         if (ulp) {
 622                 if (ioflag & (FSYNC|FDSYNC)) {
 623                         if (!rewriteflg) {
 624                                 int terr = 0;
 625
 626                                 TRANS_END_SYNC(ufsvfsp, &terr,
 627                                                TOP_WRITE_SYNC, resv);
                                     /* A write error takes precedence over terr. */
 628                                 if (error == 0)
 629                                         error = terr;
 630                         }
 631                 } else {
 632                         TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
 633                 }
 634                 ufs_lockfs_end(ulp);
 635         }
 636 out:
             /* Retry the whole request once after draining pending deletes. */
 637         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
 638                 /*
 639                  * Any blocks tied up in pending deletes?
 640                  */
 641                 ufs_delete_drain_wait(ufsvfsp, 1);
 642                 retry = 0;
 643                 goto retry_mandlock;
 644         }
 645
             /* Some data was transferred before ENOSPC: report success. */
 646         if (error == ENOSPC && (start_resid != uiop->uio_resid))
 647                 error = 0;
 648
 649         return (error);
 650 }
 651
 652 /*
 653  * Don't cache write blocks to files with the sticky bit set.
 654  * Used to keep swap files from blowing the page cache on a server.
 655  */
 656 int stickyhack = 1;     /* on by default; presumably 0 disables -- confirm at use site */
 657
 658 /*
 659  * Free behind hacks.  The pager is busted.
 660  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 661  * or B_FREE_IF_TIGHT_ON_MEMORY.
      *
      * NOTE(review): the code that consumes these three variables is not
      * in the part of the file reviewed here.  The freebehind comment
      * further down names smallfile and smallfile64 as lower bounds.
 662  */
 663 int     freebehind = 1;
 664 int     smallfile = 0;
 665 uoff_t smallfile64 = 32 * 1024;
 666
 667 /*
 668  * While we should, in most cases, cache the pages for write, we
 669  * may also want to cache the pages for read as long as they are
 670  * frequently re-usable.
 671  *
 672  * If cache_read_ahead = 1, the pages for read will go to the tail
 673  * of the cache list when they are released, otherwise go to the head.
 674  */
 675 int     cache_read_ahead = 0;   /* default 0: released read pages go to the head */
 676
 677 /*
 678  * Freebehind exists  so that as we read  large files  sequentially we
 679  * don't consume most of memory with pages  from a few files. It takes
 680  * longer to re-read from disk multiple small files than it does reading
 681  * one large one sequentially.  As system  memory grows customers need
 682  * to retain bigger chunks   of files in  memory.   The advent of  the
 683  * cachelist opens up the possibility of freeing pages  to the head or
 684  * tail of the list.
 685  *
 686  * Not freeing a page is a bet that the page will be read again before
 687  * its segmap slot is needed for something else. If we lose the bet,
 688  * it means some  other thread is  burdened with the  page free we did
 689  * not do. If we win we save a free and reclaim.
 690  *
 691  * Freeing it at the tail  vs the head of cachelist  is a bet that the
 692  * page will survive until the next  read.  It's also saying that this
 693  * page is more likely to  be re-used than a  page freed some time ago
 694  * and never reclaimed.
 695  *
 696  * Freebehind maintains a  range of  file offset [smallfile1; smallfile2]
 697  *
 698  *            0 < offset < smallfile1 : pages are not freed.
 699  *   smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 700  *   smallfile2 < offset              : pages freed to head of cachelist.
 701  *
 702  * The range  is  computed  at most  once  per second  and  depends on
 703  * freemem  and  ncpus_online.  Both parameters  are   bounded to be
 704  * >= smallfile && >= smallfile64.
 705  *
 706  * smallfile1 = (free memory / ncpu) / 1000
 707  * smallfile2 = (free memory / ncpu) / 10
 708  *
 709  * A few example values:
 710  *
 711  *       Free Mem (in Bytes) [smallfile1; smallfile2]  [smallfile1; smallfile2]
 712  *                                 ncpus_online = 4          ncpus_online = 64
 713  *       ------------------  -----------------------   -----------------------
 714  *             1G                   [256K;  25M]               [32K; 1.5M]
 715  *            10G                   [2.5M; 250M]              [156K; 15M]
 716  *           100G                    [25M; 2.5G]              [1.5M; 150M]
 717  *
 718  */
 719
 720 #define SMALLFILE1_D 1000       /* initial value of smallfile1_d */
 721 #define SMALLFILE2_D 10         /* initial value of smallfile2_d */
 722 static uoff_t smallfile1 = 32 * 1024;   /* starts at 32K */
 723 static uoff_t smallfile2 = 32 * 1024;   /* starts at 32K */
 724 static clock_t smallfile_update = 0; /* lbolt value of when to recompute */
 725 uint_t smallfile1_d = SMALLFILE1_D;     /* presumably the smallfile1 divisor -- confirm at use site */
 726 uint_t smallfile2_d = SMALLFILE2_D;     /* presumably the smallfile2 divisor -- confirm at use site */
 727
 728 /*
 729  * wrip does the real work of write requests for ufs.
      *
      * Parameters:
      *   ip     - inode to write.  The caller must hold ip->i_contents as
      *            writer (asserted below); the lock is dropped and retaken
      *            around each data copy in the loop.
      *   uio    - source data, starting offset and residual count.
      *            uio_llimit supplies the caller's file size limit; on the
      *            direct I/O path it is overwritten with the clamped limit.
      *   ioflag - FSYNC/FDSYNC select synchronous write-back and inode
      *            update.
      *   cr     - credentials handed to the block allocator, direct I/O,
      *            truncation and the set-id privilege check.
      *
      * Returns 0 on success, and also when at least part of the data was
      * written and the failure was not ENOSPC.  Returns EIO for a forced
      * unmount or an unsupported file type, EINVAL for a negative offset and
      * EFBIG when the starting offset is at or beyond the limit; otherwise
      * the error of the failing helper is returned.
      *
      * Unless direct I/O handles the request, the data is moved at most one
      * file system block per loop iteration.
 730  */
 731 int
 732 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
 733 {
 734         rlim64_t limit = uio->uio_llimit;
 735         uoff_t off;
 736         uoff_t old_i_size;
 737         struct fs *fs;
 738         struct vnode *vp;
 739         struct ufsvfs *ufsvfsp;
 740         caddr_t base;
 741         long start_resid = uio->uio_resid;      /* save starting resid */
 742         long premove_resid;                     /* resid before uiomove() */
 743         uint_t flags;
 744         int newpage;
 745         int iupdat_flag, directio_status;
 746         int n, on, mapon;
 747         int error, pagecreate;
 748         int do_dqrwlock;                /* drop/reacquire vfs_dqrwlock */
 749         int32_t iblocks;
 750         int     new_iblocks;
 751
 752         /*
 753          * ip->i_size is incremented before the uiomove
 754          * is done on a write.  If the move fails (bad user
 755          * address) reset ip->i_size.
 756          * The better way would be to increment ip->i_size
 757          * only if the uiomove succeeds.
 758          */
 759         int i_size_changed = 0;
 760         o_mode_t type;
 761         int i_seq_needed = 0;
 762
 763         vp = ITOV(ip);
 764
 765         /*
 766          * check for forced unmount - should not happen as
 767          * the request passed the lockfs checks.
 768          */
 769         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
 770                 return (EIO);
 771
 772         fs = ip->i_fs;
 773
 774         ASSERT(RW_WRITE_HELD(&ip->i_contents));
 775
 776         /* check for valid filetype */
 777         type = ip->i_mode & IFMT;
 778         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
 779             (type != IFLNK) && (type != IFSHAD)) {
 780                 return (EIO);
 781         }
 782
 783         /*
 784          * The actual limit of UFS file size is UFS_MAXOFFSET_T;
 785          * clamp to MAXOFFSET_T here, the UFS limit is applied below.
 786          */
 787         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 788                 limit = MAXOFFSET_T;
 789
 790         if (uio->uio_loffset >= limit) {
 791                 proc_t *p = ttoproc(curthread);
 792
 793                 mutex_enter(&p->p_lock);
 794                 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
 795                     p, RCA_UNSAFE_SIGINFO);
 796                 mutex_exit(&p->p_lock);
 797                 return (EFBIG);
 798         }
 799
 800         /*
 801          * if largefiles are disallowed, the limit is
 802          * the pre-largefiles value of 2GB
 803          */
 804         if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
 805                 limit = MIN(UFS_MAXOFFSET_T, limit);
 806         else
 807                 limit = MIN(MAXOFF32_T, limit);
 808
 809         if (uio->uio_loffset < 0) {
 810                 return (EINVAL);
 811         }
 812         if (uio->uio_resid == 0) {
 813                 return (0);
 814         }
 815
 816         if (uio->uio_loffset >= limit)
 817                 return (EFBIG);
 818
 819         ip->i_flag |= INOACC;   /* don't update ref time in getpage */
 820
             /*
              * iupdat_flag is assigned here only for synchronous writes; the
              * test after the "out" label reads it only when FSYNC is set.
              */
 821         if (ioflag & (FSYNC|FDSYNC)) {
 822                 ip->i_flag |= ISYNC;
 823                 iupdat_flag = 1;
 824         }
 825         /*
 826          * Try to go direct
 827          */
 828         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
 829                 uio->uio_llimit = limit;
 830                 error = ufs_directio_write(ip, uio, ioflag, 0, cr,
 831                     &directio_status);
 832                 /*
 833                  * If ufs_directio wrote to the file or set the flags,
 834                  * we need to update i_seq, but it may be deferred.
 835                  */
 836                 if (start_resid != uio->uio_resid ||
 837                     (ip->i_flag & (ICHG|IUPD))) {
 838                         i_seq_needed = 1;
 839                         ip->i_flag |= ISEQ;
 840                 }
 841                 if (directio_status == DIRECTIO_SUCCESS)
 842                         goto out;
 843         }
 844
 845         /*
 846          * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
 847          *
 848          * o shadow inodes: vfs_dqrwlock is not held at all
 849          * o quota updates: vfs_dqrwlock is read or write held
 850          * o other updates: vfs_dqrwlock is read held
 851          *
 852          * The first case is the only one where we do not hold
 853          * vfs_dqrwlock at all while entering wrip().
 854          * We must make sure not to downgrade/drop vfs_dqrwlock if we
 855          * have it as writer, i.e. if we are updating the quota inode.
 856          * There is no potential deadlock scenario in this case as
 857          * ufs_getpage() takes care of this and avoids reacquiring
 858          * vfs_dqrwlock in that case.
 859          *
 860          * This check is done here since the above conditions do not change
 861          * and we possibly loop below, so save a few cycles.
 862          */
 863         if ((type == IFSHAD) ||
 864             (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
 865                 do_dqrwlock = 0;
 866         } else {
 867                 do_dqrwlock = 1;
 868         }
 869
 870         /*
 871          * Large Files: We cast MAXBMASK to offset_t
 872          * in order to mask out the higher bits. Since offset_t
 873          * is a signed value, the high order bit set in MAXBMASK
 874          * value makes it do the right thing by having all bits 1
 875          * in the higher word. May be removed for _SOLARIS64_.
 876          */
 877
 878         fs = ip->i_fs;
 879         do {
                     /*
                      * off   - uoff masked with MAXBMASK (base of the mapping)
                      * mapon - offset of uoff within that mapping
                      * on    - offset of uoff within its file system block
                      * n     - bytes to move this pass: the rest of the block
                      *         or the remaining resid, whichever is smaller
                      *
                      * new_iblocks starts at 1 and is cleared only when the
                      * whole-mapping case below sees i_blocks unchanged.
                      */
 880                 uoff_t uoff = uio->uio_loffset;
 881                 off = uoff & (offset_t)MAXBMASK;
 882                 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
 883                 on = (int)blkoff(fs, uoff);
 884                 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
 885                 new_iblocks = 1;
 886
 887                 if (type == IFREG && uoff + n >= limit) {
 888                         if (uoff >= limit) {
 889                                 error = EFBIG;
 890                                 goto out;
 891                         }
 892                         /*
 893                          * since uoff + n >= limit,
 894                          * therefore n >= limit - uoff, and n is an int
 895                          * so it is safe to cast it to an int
 896                          */
 897                         n = (int)(limit - (rlim64_t)uoff);
 898                 }
 899                 if (uoff + n > ip->i_size) {
 900                         /*
 901                          * We are extending the length of the file.
 902                          * bmap is used so that we are sure that
 903                          * if we need to allocate new blocks, that it
 904                          * is done here before we up the file size.
 905                          */
 906                         error = bmap_write(ip, uoff, (int)(on + n),
 907                             mapon == 0, NULL, cr);
 908                         /*
 909                          * bmap_write never drops i_contents so if
 910                          * the flags are set it changed the file.
 911                          */
 912                         if (ip->i_flag & (ICHG|IUPD)) {
 913                                 i_seq_needed = 1;
 914                                 ip->i_flag |= ISEQ;
 915                         }
 916                         if (error)
 917                                 break;
 918                         /*
 919                          * There is a window of vulnerability here.
 920                          * The sequence of operations: allocate file
 921                          * system blocks, uiomove the data into pages,
 922                          * and then update the size of the file in the
 923                          * inode, must happen atomically.  However, due
 924                          * to current locking constraints, this can not
 925                          * be done.
 926                          */
 927                         ASSERT(ip->i_writer == NULL);
 928                         ip->i_writer = curthread;
 929                         i_size_changed = 1;
 930                         /*
 931                          * If we are writing from the beginning of
 932                          * the mapping, we can just create the
 933                          * pages without having to read them.
 934                          */
 935                         pagecreate = (mapon == 0);
 936                 } else if (n == MAXBSIZE) {
 937                         /*
 938                          * Going to do a whole mappings worth,
 939                          * so we can just create the pages w/o
 940                          * having to read them in.  But before
 941                          * we do that, we need to make sure any
 942                          * needed blocks are allocated first.
 943                          */
 944                         iblocks = ip->i_blocks;
 945                         error = bmap_write(ip, uoff, (int)(on + n),
 946                             BI_ALLOC_ONLY, NULL, cr);
 947                         /*
 948                          * bmap_write never drops i_contents so if
 949                          * the flags are set it changed the file.
 950                          */
 951                         if (ip->i_flag & (ICHG|IUPD)) {
 952                                 i_seq_needed = 1;
 953                                 ip->i_flag |= ISEQ;
 954                         }
 955                         if (error)
 956                                 break;
 957                         pagecreate = 1;
 958                         /*
 959                          * check if the new created page needed the
 960                          * allocation of new disk blocks.
 961                          */
 962                         if (iblocks == ip->i_blocks)
 963                                 new_iblocks = 0; /* no new blocks allocated */
 964                 } else {
 965                         pagecreate = 0;
 966                         /*
 967                          * In sync mode flush the indirect blocks which
 968                          * may have been allocated and not written on
 969                          * disk. In above cases bmap_write will allocate
 970                          * in sync mode.
 971                          */
 972                         if (ioflag & (FSYNC|FDSYNC)) {
 973                                 error = ufs_indirblk_sync(ip, uoff);
 974                                 if (error)
 975                                         break;
 976                         }
 977                 }
 978
 979                 /*
 980                  * At this point we can enter ufs_getpage() in one
 981                  * of two ways:
 982                  * 1) segmap_getmapflt() calls ufs_getpage() when the
 983                  *    forcefault parameter is true (pagecreate == 0)
 984                  * 2) uiomove() causes a page fault.
 985                  *
 986                  * We have to drop the contents lock to prevent the VM
 987                  * system from trying to reacquire it in ufs_getpage()
 988                  * should the uiomove cause a pagefault.
 989                  *
 990                  * We have to drop the reader vfs_dqrwlock here as well.
 991                  */
 992                 rw_exit(&ip->i_contents);
 993                 if (do_dqrwlock) {
 994                         ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
 995                         ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
 996                         rw_exit(&ufsvfsp->vfs_dqrwlock);
 997                 }
 998
 999                 newpage = 0;
1000                 premove_resid = uio->uio_resid;
1001
1002                 /*
1003                  * Touch the page and fault it in if it is not in core
1004                  * before segmap_getmapflt or vpm_data_copy can lock it.
1005                  * This is to avoid the deadlock if the buffer is mapped
1006                  * to the same file through mmap which we want to write.
1007                  */
1008                 uio_prefaultpages((long)n, uio);
1009
1010                 if (vpm_enable) {
1011                         /*
1012                          * Copy data. If new pages are created, part of
1013                          * the page that is not written will be initialized
1014                          * with zeros.
1015                          */
1016                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1017                             uio, !pagecreate, &newpage, 0, S_WRITE);
1018                 } else {
1019
1020                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1021                             (uint_t)n, !pagecreate, S_WRITE);
1022
1023                         /*
1024                          * segmap_pagecreate() returns 1 if it calls
1025                          * page_create_va() to allocate any pages.
1026                          */
1027
1028                         if (pagecreate)
1029                                 newpage = segmap_pagecreate(segkmap, base,
1030                                     (size_t)n, 0);
1031
1032                         error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
1033                 }
1034
1035                 /*
1036                  * If "newpage" is set, then a new page was created and it
1037                  * does not contain valid data, so it needs to be initialized
1038                  * at this point.
1039                  * Otherwise the page contains old data, which was overwritten
1040                  * partially or as a whole in uiomove.
1041                  * If there is only one iovec structure within uio, then
1042                  * on error uiomove will not be able to update uio->uio_loffset
1043                  * and we would zero the whole page here!
1044                  *
1045                  * If uiomove fails because of an error, the old valid data
1046                  * is kept instead of filling the rest of the page with zeros.
1047                  */
1048                 if (!vpm_enable && newpage &&
1049                     uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
1050                         /*
1051                          * We created pages w/o initializing them completely,
1052                          * thus we need to zero the part that wasn't set up.
1053                          * This happens on most EOF write cases and if
1054                          * we had some sort of error during the uiomove.
1055                          */
1056                         int nzero, nmoved;
1057
1058                         nmoved = (int)(uio->uio_loffset - (off + mapon));
1059                         ASSERT(nmoved >= 0 && nmoved <= n);
1060                         nzero = roundup(on + n, PAGESIZE) - nmoved;
1061                         ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
1062                         (void) kzero(base + mapon + nmoved, (uint_t)nzero);
1063                 }
1064
1065                 /*
1066                  * Unlock the pages allocated by page_create_va()
1067                  * in segmap_pagecreate()
1068                  */
1069                 if (!vpm_enable && newpage)
1070                         segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
1071
1072                 /*
1073                  * If the size of the file changed, then update the
1074                  * size field in the inode now.  This can't be done
1075                  * before the call to segmap_pageunlock or there is
1076                  * a potential deadlock with callers to ufs_putpage().
1077                  * They will be holding i_contents and trying to lock
1078                  * a page, while this thread is holding a page locked
1079                  * and trying to acquire i_contents.
1080                  */
1081                 if (i_size_changed) {
1082                         rw_enter(&ip->i_contents, RW_WRITER);
1083                         old_i_size = ip->i_size;
1084                         UFS_SET_ISIZE(uoff + n, ip);
1085                         TRANS_INODE(ufsvfsp, ip);
1086                         /*
1087                          * file has grown larger than 2GB. Set flag
1088                          * in superblock to indicate this, if it
1089                          * is not already set.
1090                          */
1091                         if ((ip->i_size > MAXOFF32_T) &&
1092                             !(fs->fs_flags & FSLARGEFILES)) {
1093                                 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1094                                 mutex_enter(&ufsvfsp->vfs_lock);
1095                                 fs->fs_flags |= FSLARGEFILES;
1096                                 ufs_sbwrite(ufsvfsp);
1097                                 mutex_exit(&ufsvfsp->vfs_lock);
1098                         }
1099                         mutex_enter(&ip->i_tlock);
1100                         ip->i_writer = NULL;
1101                         cv_broadcast(&ip->i_wrcv);
1102                         mutex_exit(&ip->i_tlock);
1103                         rw_exit(&ip->i_contents);
1104                 }
1105
1106                 if (error) {
1107                         /*
1108                          * If we failed on a write, we may have already
1109                          * allocated file blocks as well as pages.  It's
1110                          * hard to undo the block allocation, but we must
1111                          * be sure to invalidate any pages that may have
1112                          * been allocated.
1113                          *
1114                          * If the page was created without initialization
1115                          * then we must check if it should be possible
1116                          * to destroy the new page and to keep the old data
1117                          * on the disk.
1118                          *
1119                          * It is possible to destroy the page without
1120                          * having to write back its contents only when
1121                          * - the size of the file keeps unchanged
1122                          * - bmap_write() did not allocate new disk blocks
1123                          *   it is possible to create big files using "seek" and
1124                          *   write to the end of the file. A "write" to a
1125                          *   position before the end of the file would not
1126                          *   change the size of the file but it would allocate
1127                          *   new disk blocks.
1128                          * - uiomove intended to overwrite the whole page.
1129                          * - a new page was created (newpage == 1).
1130                          */
1131
1132                         if (i_size_changed == 0 && new_iblocks == 0 &&
1133                             newpage) {
1134
1135                                 /* unwind what uiomove eventually last did */
1136                                 uio->uio_resid = premove_resid;
1137
1138                                 /*
1139                                  * destroy the page, do not write ambiguous
1140                                  * data to the disk.
1141                                  */
1142                                 flags = SM_DESTROY;
1143                         } else {
1144                                 /*
1145                                  * write the page back to the disk, if dirty,
1146                                  * and remove the page from the cache.
1147                                  */
1148                                 flags = SM_INVAL;
1149                         }
1150
1151                         if (vpm_enable) {
1152                                 /*
1153                                  *  Flush pages.
1154                                  */
1155                                 (void) vpm_sync_pages(vp, off, n, flags);
1156                         } else {
1157                                 (void) segmap_release(segkmap, base, flags);
1158                         }
1159                 } else {
1160                         flags = 0;
1161                         /*
1162                          * Force write back for synchronous write cases.
1163                          */
1164                         if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
1165                                 /*
1166                                  * If the sticky bit is set but the
1167                                  * execute bit is not set, we do a
1168                                  * synchronous write back and free
1169                                  * the page when done.  We set up swap
1170                                  * files to be handled this way to
1171                                  * prevent servers from keeping around
1172                                  * the client's swap pages too long.
1173                                  * XXX - there ought to be a better way.
1174                                  */
1175                                 if (IS_SWAPVP(vp)) {
1176                                         flags = SM_WRITE | SM_FREE |
1177                                             SM_DONTNEED;
1178                                         iupdat_flag = 0;
1179                                 } else {
1180                                         flags = SM_WRITE;
1181                                 }
1182                         } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
1183                                 /*
1184                                  * Have written a whole block.
1185                                  * Start an asynchronous write and
1186                                  * mark the buffer to indicate that
1187                                  * it won't be needed again soon.
1188                                  */
1189                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
1190                         }
1191                         if (vpm_enable) {
1192                                 /*
1193                                  * Flush pages.
1194                                  */
1195                                 error = vpm_sync_pages(vp, off, n, flags);
1196                         } else {
1197                                 error = segmap_release(segkmap, base, flags);
1198                         }
1199                         /*
1200                          * If the operation failed and is synchronous,
1201                          * then we need to unwind what uiomove() last
1202                          * did so we can potentially return an error to
1203                          * the caller.  If this write operation was
1204                          * done in two pieces and the first succeeded,
1205                          * then we won't return an error for the second
1206                          * piece that failed.  However, we only want to
1207                          * return a resid value that reflects what was
1208                          * really done.
1209                          *
1210                          * Failures for non-synchronous operations can
1211                          * be ignored since the page subsystem will
1212                          * retry the operation until it succeeds or the
1213                          * file system is unmounted.
1214                          */
1215                         if (error) {
1216                                 if ((ioflag & (FSYNC | FDSYNC)) ||
1217                                     type == IFDIR) {
1218                                         uio->uio_resid = premove_resid;
1219                                 } else {
1220                                         error = 0;
1221                                 }
1222                         }
1223                 }
1224
1225                 /*
1226                  * Re-acquire contents lock.
1227                  * If it was dropped, reacquire reader vfs_dqrwlock as well.
1228                  */
1229                 if (do_dqrwlock)
1230                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1231                 rw_enter(&ip->i_contents, RW_WRITER);
1232
1233                 /*
1234                  * If the uiomove() failed or if a synchronous
1235                  * page push failed, fix up i_size.
1236                  */
1237                 if (error) {
1238                         if (i_size_changed) {
1239                                 /*
1240                                  * The uiomove failed, and we
1241                                  * allocated blocks, so get rid
1242                                  * of them.
1243                                  */
1244                                 (void) ufs_itrunc(ip, old_i_size, 0, cr);
1245                         }
1246                 } else {
1247                         /*
1248                          * XXX - Can this be out of the loop?
1249                          */
1250                         ip->i_flag |= IUPD | ICHG;
1251                         /*
1252                          * Only do one increase of i_seq for multiple
1253                          * pieces.  Because we drop locks, record
1254                          * the fact that we changed the timestamp and
1255                          * are deferring the increase in case another thread
1256                          * pushes our timestamp update.
1257                          */
1258                         i_seq_needed = 1;
1259                         ip->i_flag |= ISEQ;
1260                         if (i_size_changed)
1261                                 ip->i_flag |= IATTCHG;
1262                         if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
1263                             (IEXEC >> 6))) != 0 &&
1264                             (ip->i_mode & (ISUID | ISGID)) != 0 &&
1265                             secpolicy_vnode_setid_retain(cr,
1266                             (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
1267                                 /*
1268                                  * Clear Set-UID & Set-GID bits on
1269                                  * successful write if not privileged
1270                                  * and at least one of the execute bits
1271                                  * is set.  If we always clear Set-GID,
1272                                  * mandatory file and record locking is
1273                                  * unusable.
1274                                  */
1275                                 ip->i_mode &= ~(ISUID | ISGID);
1276                         }
1277                 }
1278                 /*
1279                  * In the case the FDSYNC flag is set and this is a
1280                  * "rewrite" we won't log a delta.
1281                  * The FSYNC flag overrides all cases.
1282                  */
1283                 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) {
1284                         TRANS_INODE(ufsvfsp, ip);
1285                 }
1286         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1287
1288 out:
1289         /*
1290          * Make sure i_seq is increased at least once per write
1291          */
1292         if (i_seq_needed) {
1293                 ip->i_seq++;
1294                 ip->i_flag &= ~ISEQ;    /* no longer deferred */
1295         }
1296
1297         /*
1298          * Inode is updated according to this table -
1299          *
1300          *   FSYNC        FDSYNC(posix.4)
1301          *   --------------------------
1302          *   always@      IATTCHG|IBDWRITE
1303          *
1304          * @ -  If we are doing synchronous write the only time we should
1305          *      not be sync'ing the ip here is if we have the stickyhack
1306          *      activated, the file is marked with the sticky bit and
1307          *      no exec bit, the file length has not been changed and
1308          *      no new blocks have been allocated during this write.
1309          */
1310
1311         if ((ip->i_flag & ISYNC) != 0) {
1312                 /*
1313                  * we have eliminated nosync
1314                  */
1315                 if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
1316                     ((ioflag & FSYNC) && iupdat_flag)) {
1317                         ufs_iupdat(ip, 1);
1318                 }
1319         }
1320
1321         /*
1322          * If we've already done a partial-write, terminate
1323          * the write but return no error unless the error is ENOSPC
1324          * because the caller can detect this and free resources and
1325          * try again.
1326          */
1327         if ((start_resid != uio->uio_resid) && (error != ENOSPC))
1328                 error = 0;
1329
1330         ip->i_flag &= ~(INOACC | ISYNC);
1331         ITIMES_NOLOCK(ip);
1332         return (error);
1333 }
1334
1335 /*
1336  * rdip does the real work of read requests for ufs.
1337  */
1338 int
1339 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
1340 {
1341         uoff_t off;
1342         caddr_t base;
1343         struct fs *fs;
1344         struct ufsvfs *ufsvfsp;
1345         struct vnode *vp;
1346         long oresid = uio->uio_resid;
1347         uoff_t n, on, mapon;
1348         int error = 0;
1349         int doupdate = 1;
1350         uint_t flags;
1351         int dofree, directio_status;
1352         krw_t rwtype;
1353         o_mode_t type;
1354         clock_t now;
1355
1356         vp = ITOV(ip);
1357
1358         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1359
1360         ufsvfsp = ip->i_ufsvfs;
1361
1362         if (ufsvfsp == NULL)
1363                 return (EIO);
1364
1365         fs = ufsvfsp->vfs_fs;
1366
1367         /* check for valid filetype */
1368         type = ip->i_mode & IFMT;
1369         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
1370             (type != IFLNK) && (type != IFSHAD)) {
1371                 return (EIO);
1372         }
1373
1374         if (uio->uio_loffset > UFS_MAXOFFSET_T) {
1375                 error = 0;
1376                 goto out;
1377         }
1378         if (uio->uio_loffset < 0) {
1379                 return (EINVAL);
1380         }
1381         if (uio->uio_resid == 0) {
1382                 return (0);
1383         }
1384
1385         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
1386             (!ufsvfsp->vfs_noatime)) {
1387                 mutex_enter(&ip->i_tlock);
1388                 ip->i_flag |= IACC;
1389                 mutex_exit(&ip->i_tlock);
1390         }
1391         /*
1392          * Try to go direct
1393          */
1394         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
1395                 error = ufs_directio_read(ip, uio, cr, &directio_status);
1396                 if (directio_status == DIRECTIO_SUCCESS)
1397                         goto out;
1398         }
1399
1400         rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
1401
1402         do {
1403                 offset_t diff;
1404                 uoff_t uoff = uio->uio_loffset;
1405                 off = uoff & (offset_t)MAXBMASK;
1406                 mapon = (uoff_t)(uoff & (offset_t)MAXBOFFSET);
1407                 on = (uoff_t)blkoff(fs, uoff);
1408                 n = MIN((uoff_t)fs->fs_bsize - on,
1409                     (uoff_t)uio->uio_resid);
1410
1411                 diff = ip->i_size - uoff;
1412
1413                 if (diff <= 0) {
1414                         error = 0;
1415                         goto out;
1416                 }
1417                 if (diff < (offset_t)n)
1418                         n = (int)diff;
1419
1420                 /*
1421                  * We update smallfile2 and smallfile1 at most every second.
1422                  */
1423                 now = ddi_get_lbolt();
1424                 if (now >= smallfile_update) {
1425                         uint64_t percpufreeb;
1426                         if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
1427                         if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
1428                         percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
1429                         smallfile1 = percpufreeb / smallfile1_d;
1430                         smallfile2 = percpufreeb / smallfile2_d;
1431                         smallfile1 = MAX(smallfile1, smallfile);
1432                         smallfile1 = MAX(smallfile1, smallfile64);
1433                         smallfile2 = MAX(smallfile1, smallfile2);
1434                         smallfile_update = now + hz;
1435                 }
1436
1437                 dofree = freebehind &&
1438                     ip->i_nextr == (off & PAGEMASK) && off > smallfile1;
1439
1440                 /*
1441                  * At this point we can enter ufs_getpage() in one of two
1442                  * ways:
1443                  * 1) segmap_getmapflt() calls ufs_getpage() when the
1444                  *    forcefault parameter is true (value of 1 is passed)
1445                  * 2) uiomove() causes a page fault.
1446                  *
1447                  * We cannot hold onto an i_contents reader lock without
1448                  * risking deadlock in ufs_getpage() so drop a reader lock.
1449                  * The ufs_getpage() dolock logic already allows for a
1450                  * thread holding i_contents as writer to work properly
1451                  * so we keep a writer lock.
1452                  */
1453                 if (rwtype == RW_READER)
1454                         rw_exit(&ip->i_contents);
1455
1456                 if (vpm_enable) {
1457                         /*
1458                          * Copy data.
1459                          */
1460                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1461                             uio, 1, NULL, 0, S_READ);
1462                 } else {
1463                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1464                             (uint_t)n, 1, S_READ);
1465                         error = uiomove(base + mapon, (long)n, UIO_READ, uio);
1466                 }
1467
1468                 flags = 0;
1469                 if (!error) {
1470                         /*
1471                          * If  reading sequential  we won't need  this
1472                          * buffer again  soon.  For  offsets in  range
1473                          * [smallfile1,  smallfile2] release the pages
1474                          * at   the  tail  of the   cache list, larger
1475                          * offsets are released at the head.
1476                          */
1477                         if (dofree) {
1478                                 flags = SM_FREE | SM_ASYNC;
1479                                 if ((cache_read_ahead == 0) &&
1480                                     (off > smallfile2))
1481                                         flags |=  SM_DONTNEED;
1482                         }
1483                         /*
1484                          * In POSIX SYNC (FSYNC and FDSYNC) read mode,
1485                          * we want to make sure that the page which has
1486                          * been read, is written on disk if it is dirty.
1487                          * And corresponding indirect blocks should also
1488                          * be flushed out.
1489                          */
1490                         if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
1491                                 flags &= ~SM_ASYNC;
1492                                 flags |= SM_WRITE;
1493                         }
1494                         if (vpm_enable) {
1495                                 error = vpm_sync_pages(vp, off, n, flags);
1496                         } else {
1497                                 error = segmap_release(segkmap, base, flags);
1498                         }
1499                 } else {
1500                         if (vpm_enable) {
1501                                 (void) vpm_sync_pages(vp, off, n, flags);
1502                         } else {
1503                                 (void) segmap_release(segkmap, base, flags);
1504                         }
1505                 }
1506
1507                 if (rwtype == RW_READER)
1508                         rw_enter(&ip->i_contents, rwtype);
1509         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1510 out:
1511         /*
1512          * Inode is updated according to this table if FRSYNC is set.
1513          *
1514          *   FSYNC        FDSYNC(posix.4)
1515          *   --------------------------
1516          *   always       IATTCHG|IBDWRITE
1517          */
1518         /*
1519          * The inode is not updated if we're logging and the inode is a
1520          * directory with FRSYNC, FSYNC and FDSYNC flags set.
1521          */
1522         if (ioflag & FRSYNC) {
1523                 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
1524                         doupdate = 0;
1525                 }
1526                 if (doupdate) {
1527                         if ((ioflag & FSYNC) ||
1528                             ((ioflag & FDSYNC) &&
1529                             (ip->i_flag & (IATTCHG|IBDWRITE)))) {
1530                                 ufs_iupdat(ip, 1);
1531                         }
1532                 }
1533         }
1534         /*
1535          * If we've already done a partial read, terminate
1536          * the read but return no error.
1537          */
1538         if (oresid != uio->uio_resid)
1539                 error = 0;
1540         ITIMES(ip);
1541
1542         return (error);
1543 }
1544
1545 /* ARGSUSED */
1546 static int
1547 ufs_ioctl(
1548         struct vnode    *vp,
1549         int             cmd,
1550         intptr_t        arg,
1551         int             flag,
1552         struct cred     *cr,
1553         int             *rvalp,
1554         caller_context_t *ct)
1555 {
1556         struct lockfs   lockfs, lockfs_out;
1557         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
1558         char            *comment, *original_comment;
1559         struct fs       *fs;
1560         struct ulockfs  *ulp;
1561         offset_t        off;
1562         extern int      maxphys;
1563         int             error;
1564         int             issync;
1565         int             trans_size;
1566
1567
1568         /*
1569          * forcibly unmounted
1570          */
1571         if (ufsvfsp == NULL || vp->v_vfsp == NULL ||
1572             vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
1573                 return (EIO);
1574         fs = ufsvfsp->vfs_fs;
1575
1576         if (cmd == Q_QUOTACTL) {
1577                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK);
1578                 if (error)
1579                         return (error);
1580
1581                 if (ulp) {
1582                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA,
1583                             TOP_SETQUOTA_SIZE(fs));
1584                 }
1585
1586                 error = quotactl(vp, arg, flag, cr);
1587
1588                 if (ulp) {
1589                         TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA,
1590                             TOP_SETQUOTA_SIZE(fs));
1591                         ufs_lockfs_end(ulp);
1592                 }
1593                 return (error);
1594         }
1595
1596         switch (cmd) {
1597                 case _FIOLFS:
1598                         /*
1599                          * file system locking
1600                          */
1601                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1602                                 return (EPERM);
1603
1604                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1605                                 if (copyin((caddr_t)arg, &lockfs,
1606                                     sizeof (struct lockfs)))
1607                                         return (EFAULT);
1608                         }
1609 #ifdef _SYSCALL32_IMPL
1610                         else {
1611                                 struct lockfs32 lockfs32;
1612                                 /* Translate ILP32 lockfs to LP64 lockfs */
1613                                 if (copyin((caddr_t)arg, &lockfs32,
1614                                     sizeof (struct lockfs32)))
1615                                         return (EFAULT);
1616                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1617                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1618                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1619                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1620                                 lockfs.lf_comment =
1621                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1622                         }
1623 #endif /* _SYSCALL32_IMPL */
1624
1625                         if (lockfs.lf_comlen) {
1626                                 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN)
1627                                         return (ENAMETOOLONG);
1628                                 comment =
1629                                     kmem_alloc(lockfs.lf_comlen, KM_SLEEP);
1630                                 if (copyin(lockfs.lf_comment, comment,
1631                                     lockfs.lf_comlen)) {
1632                                         kmem_free(comment, lockfs.lf_comlen);
1633                                         return (EFAULT);
1634                                 }
1635                                 original_comment = lockfs.lf_comment;
1636                                 lockfs.lf_comment = comment;
1637                         }
1638                         if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) {
1639                                 lockfs.lf_comment = original_comment;
1640
1641                                 if ((flag & DATAMODEL_MASK) ==
1642                                     DATAMODEL_NATIVE) {
1643                                         (void) copyout(&lockfs, (caddr_t)arg,
1644                                             sizeof (struct lockfs));
1645                                 }
1646 #ifdef _SYSCALL32_IMPL
1647                                 else {
1648                                         struct lockfs32 lockfs32;
1649                                         /* Translate LP64 to ILP32 lockfs */
1650                                         lockfs32.lf_lock =
1651                                             (uint32_t)lockfs.lf_lock;
1652                                         lockfs32.lf_flags =
1653                                             (uint32_t)lockfs.lf_flags;
1654                                         lockfs32.lf_key =
1655                                             (uint32_t)lockfs.lf_key;
1656                                         lockfs32.lf_comlen =
1657                                             (uint32_t)lockfs.lf_comlen;
1658                                         lockfs32.lf_comment =
1659                                             (uint32_t)(uintptr_t)
1660                                             lockfs.lf_comment;
1661                                         (void) copyout(&lockfs32, (caddr_t)arg,
1662                                             sizeof (struct lockfs32));
1663                                 }
1664 #endif /* _SYSCALL32_IMPL */
1665
1666                         } else {
1667                                 if (lockfs.lf_comlen)
1668                                         kmem_free(comment, lockfs.lf_comlen);
1669                         }
1670                         return (error);
1671
1672                 case _FIOLFSS:
1673                         /*
1674                          * get file system locking status
1675                          */
1676
1677                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1678                                 if (copyin((caddr_t)arg, &lockfs,
1679                                     sizeof (struct lockfs)))
1680                                         return (EFAULT);
1681                         }
1682 #ifdef _SYSCALL32_IMPL
1683                         else {
1684                                 struct lockfs32 lockfs32;
1685                                 /* Translate ILP32 lockfs to LP64 lockfs */
1686                                 if (copyin((caddr_t)arg, &lockfs32,
1687                                     sizeof (struct lockfs32)))
1688                                         return (EFAULT);
1689                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1690                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1691                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1692                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1693                                 lockfs.lf_comment =
1694                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1695                         }
1696 #endif /* _SYSCALL32_IMPL */
1697
1698                         if (error =  ufs_fiolfss(vp, &lockfs_out))
1699                                 return (error);
1700                         lockfs.lf_lock = lockfs_out.lf_lock;
1701                         lockfs.lf_key = lockfs_out.lf_key;
1702                         lockfs.lf_flags = lockfs_out.lf_flags;
1703                         lockfs.lf_comlen = MIN(lockfs.lf_comlen,
1704                             lockfs_out.lf_comlen);
1705
1706                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1707                                 if (copyout(&lockfs, (caddr_t)arg,
1708                                     sizeof (struct lockfs)))
1709                                         return (EFAULT);
1710                         }
1711 #ifdef _SYSCALL32_IMPL
1712                         else {
1713                                 /* Translate LP64 to ILP32 lockfs */
1714                                 struct lockfs32 lockfs32;
1715                                 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock;
1716                                 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags;
1717                                 lockfs32.lf_key = (uint32_t)lockfs.lf_key;
1718                                 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen;
1719                                 lockfs32.lf_comment =
1720                                     (uint32_t)(uintptr_t)lockfs.lf_comment;
1721                                 if (copyout(&lockfs32, (caddr_t)arg,
1722                                     sizeof (struct lockfs32)))
1723                                         return (EFAULT);
1724                         }
1725 #endif /* _SYSCALL32_IMPL */
1726
1727                         if (lockfs.lf_comlen &&
1728                             lockfs.lf_comment && lockfs_out.lf_comment)
1729                                 if (copyout(lockfs_out.lf_comment,
1730                                     lockfs.lf_comment, lockfs.lf_comlen))
1731                                         return (EFAULT);
1732                         return (0);
1733
1734                 case _FIOSATIME:
1735                         /*
1736                          * set access time
1737                          */
1738
1739                         /*
1740                          * if mounted w/o atime, return quietly.
1741                          * I briefly thought about returning ENOSYS, but
1742                          * figured that most apps would consider this fatal
1743                          * but the idea is to make this as seamless as poss.
1744                          */
1745                         if (ufsvfsp->vfs_noatime)
1746                                 return (0);
1747
1748                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1749                             ULOCKFS_SETATTR_MASK);
1750                         if (error)
1751                                 return (error);
1752
1753                         if (ulp) {
1754                                 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp));
1755                                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync,
1756                                                   TOP_SETATTR, trans_size);
1757                         }
1758
1759                         error = ufs_fiosatime(vp, (struct timeval *)arg,
1760                             flag, cr);
1761
1762                         if (ulp) {
1763                                 TRANS_END_CSYNC(ufsvfsp, &error, issync,
1764                                                 TOP_SETATTR, trans_size);
1765                                 ufs_lockfs_end(ulp);
1766                         }
1767                         return (error);
1768
1769                 case _FIOSDIO:
1770                         /*
1771                          * set delayed-io
1772                          */
1773                         return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr));
1774
1775                 case _FIOGDIO:
1776                         /*
1777                          * get delayed-io
1778                          */
1779                         return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr));
1780
1781                 case _FIOIO:
1782                         /*
1783                          * inode open
1784                          */
1785                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1786                             ULOCKFS_VGET_MASK);
1787                         if (error)
1788                                 return (error);
1789
1790                         error = ufs_fioio(vp, (struct fioio *)arg, flag, cr);
1791
1792                         if (ulp) {
1793                                 ufs_lockfs_end(ulp);
1794                         }
1795                         return (error);
1796
1797                 case _FIOFFS:
1798                         /*
1799                          * file system flush (push w/invalidate)
1800                          */
1801                         if ((caddr_t)arg != NULL)
1802                                 return (EINVAL);
1803                         return (ufs_fioffs(vp, NULL, cr));
1804
1805                 case _FIOISBUSY:
1806                         /*
1807                          * Contract-private interface for Legato
1808                          * Purge this vnode from the DNLC and decide
1809                          * if this vnode is busy (*arg == 1) or not
1810                          * (*arg == 0)
1811                          */
1812                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1813                                 return (EPERM);
1814                         error = ufs_fioisbusy(vp, (int *)arg, cr);
1815                         return (error);
1816
1817                 case _FIODIRECTIO:
1818                         return (ufs_fiodirectio(vp, (int)arg, cr));
1819
1820                 case _FIOTUNE:
1821                         /*
1822                          * Tune the file system (aka setting fs attributes)
1823                          */
1824                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1825                             ULOCKFS_SETATTR_MASK);
1826                         if (error)
1827                                 return (error);
1828
1829                         error = ufs_fiotune(vp, (struct fiotune *)arg, cr);
1830
1831                         if (ulp)
1832                                 ufs_lockfs_end(ulp);
1833                         return (error);
1834
1835                 case _FIOLOGENABLE:
1836                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1837                                 return (EPERM);
1838                         return (ufs_fiologenable(vp, (void *)arg, cr, flag));
1839
1840                 case _FIOLOGDISABLE:
1841                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1842                                 return (EPERM);
1843                         return (ufs_fiologdisable(vp, (void *)arg, cr, flag));
1844
1845                 case _FIOISLOG:
1846                         return (ufs_fioislog(vp, (void *)arg, cr, flag));
1847
1848                 case _FIOSNAPSHOTCREATE_MULTI:
1849                 {
1850                         struct fiosnapcreate_multi      fc, *fcp;
1851                         size_t  fcm_size;
1852
1853                         if (copyin((void *)arg, &fc, sizeof (fc)))
1854                                 return (EFAULT);
1855                         if (fc.backfilecount > MAX_BACKFILE_COUNT)
1856                                 return (EINVAL);
1857                         fcm_size = sizeof (struct fiosnapcreate_multi) +
1858                             (fc.backfilecount - 1) * sizeof (int);
1859                         fcp = (struct fiosnapcreate_multi *)
1860                             kmem_alloc(fcm_size, KM_SLEEP);
1861                         if (copyin((void *)arg, fcp, fcm_size)) {
1862                                 kmem_free(fcp, fcm_size);
1863                                 return (EFAULT);
1864                         }
1865                         error = ufs_snap_create(vp, fcp, cr);
1866                         /*
1867                          * Do copyout even if there is an error because
1868                          * the details of error is stored in fcp.
1869                          */
1870                         if (copyout(fcp, (void *)arg, fcm_size))
1871                                 error = EFAULT;
1872                         kmem_free(fcp, fcm_size);
1873                         return (error);
1874                 }
1875
1876                 case _FIOSNAPSHOTDELETE:
1877                 {
1878                         struct fiosnapdelete    fc;
1879
1880                         if (copyin((void *)arg, &fc, sizeof (fc)))
1881                                 return (EFAULT);
1882                         error = ufs_snap_delete(vp, &fc, cr);
1883                         if (!error && copyout(&fc, (void *)arg, sizeof (fc)))
1884                                 error = EFAULT;
1885                         return (error);
1886                 }
1887
1888                 case _FIOGETSUPERBLOCK:
1889                         if (copyout(fs, (void *)arg, SBSIZE))
1890                                 return (EFAULT);
1891                         return (0);
1892
1893                 case _FIOGETMAXPHYS:
1894                         if (copyout(&maxphys, (void *)arg, sizeof (maxphys)))
1895                                 return (EFAULT);
1896                         return (0);
1897
1898                 /*
1899                  * The following 3 ioctls are for TSufs support
1900                  * although could potentially be used elsewhere
1901                  */
1902                 case _FIO_SET_LUFS_DEBUG:
1903                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1904                                 return (EPERM);
1905                         lufs_debug = (uint32_t)arg;
1906                         return (0);
1907
1908                 case _FIO_SET_LUFS_ERROR:
1909                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1910                                 return (EPERM);
1911                         TRANS_SETERROR(ufsvfsp);
1912                         return (0);
1913
1914                 case _FIO_GET_TOP_STATS:
1915                 {
1916                         fio_lufs_stats_t *ls;
1917                         ml_unit_t *ul = ufsvfsp->vfs_log;
1918
1919                         ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
1920                         ls->ls_debug = ul->un_debug; /* return debug value */
1921                         /* Copy stucture if statistics are being kept */
1922                         if (ul->un_logmap->mtm_tops) {
1923                                 ls->ls_topstats = *(ul->un_logmap->mtm_tops);
1924                         }
1925                         error = 0;
1926                         if (copyout(ls, (void *)arg, sizeof (*ls)))
1927                                 error = EFAULT;
1928                         kmem_free(ls, sizeof (*ls));
1929                         return (error);
1930                 }
1931
1932                 case _FIO_SEEK_DATA:
1933                 case _FIO_SEEK_HOLE:
1934                         if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
1935                                 return (EFAULT);
1936                         /* offset paramater is in/out */
1937                         error = ufs_fio_holey(vp, cmd, &off);
1938                         if (error)
1939                                 return (error);
1940                         if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
1941                                 return (EFAULT);
1942                         return (0);
1943
1944                 case _FIO_COMPRESSED:
1945                 {
1946                         /*
1947                          * This is a project private ufs ioctl() to mark
1948                          * the inode as that belonging to a compressed
1949                          * file. This is used to mark individual
1950                          * compressed files in a miniroot archive.
1951                          * The files compressed in this manner are
1952                          * automatically decompressed by the dcfs filesystem
1953                          * (via an interception in ufs_lookup - see decompvp())
1954                          * which is layered on top of ufs on a system running
1955                          * from the archive. See uts/common/fs/dcfs for details.
1956                          * This ioctl only marks the file as compressed - the
1957                          * actual compression is done by fiocompress (a
1958                          * userland utility) which invokes this ioctl().
1959                          */
1960                         struct inode *ip = VTOI(vp);
1961
1962                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1963                             ULOCKFS_SETATTR_MASK);
1964                         if (error)
1965                                 return (error);
1966
1967                         if (ulp) {
1968                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT,
1969                                     TOP_IUPDAT_SIZE(ip));
1970                         }
1971
1972                         error = ufs_mark_compressed(vp);
1973
1974                         if (ulp) {
1975                                 TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT,
1976                                     TOP_IUPDAT_SIZE(ip));
1977                                 ufs_lockfs_end(ulp);
1978                         }
1979
1980                         return (error);
1981
1982                 }
1983
1984                 default:
1985                         return (ENOTTY);
1986         }
1987 }
1988
1989
1990 /* ARGSUSED */
1991 static int
1992 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
1993     struct cred *cr, caller_context_t *ct)
1994 {
1995         struct inode *ip = VTOI(vp);
1996         struct ufsvfs *ufsvfsp;
1997         int err;
1998
1999         if (vap->va_mask == VATTR_SIZE) {
2000                 /*
2001                  * for performance, if only the size is requested don't bother
2002                  * with anything else.
2003                  */
2004                 UFS_GET_ISIZE(&vap->va_size, ip);
2005                 return (0);
2006         }
2007
2008         /*
2009          * inlined lockfs checks
2010          */
2011         ufsvfsp = ip->i_ufsvfs;
2012         if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
2013                 err = EIO;
2014                 goto out;
2015         }
2016
2017         rw_enter(&ip->i_contents, RW_READER);
2018         /*
2019          * Return all the attributes.  This should be refined so
2020          * that it only returns what's asked for.
2021          */
2022
2023         /*
2024          * Copy from inode table.
2025          */
2026         vap->va_type = vp->v_type;
2027         vap->va_mode = ip->i_mode & MODEMASK;
2028         /*
2029          * If there is an ACL and there is a mask entry, then do the
2030          * extra work that completes the equivalent of an acltomode(3)
2031          * call.  According to POSIX P1003.1e, the acl mask should be
2032          * returned in the group permissions field.
2033          *
2034          * - start with the original permission and mode bits (from above)
2035          * - clear the group owner bits
2036          * - add in the mask bits.
2037          */
2038         if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) {
2039                 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3);
2040                 vap->va_mode |=
2041                     (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3;
2042         }
2043         vap->va_uid = ip->i_uid;
2044         vap->va_gid = ip->i_gid;
2045         vap->va_fsid = ip->i_dev;
2046         vap->va_nodeid = (ino64_t)ip->i_number;
2047         vap->va_nlink = ip->i_nlink;
2048         vap->va_size = ip->i_size;
2049         if (vp->v_type == VCHR || vp->v_type == VBLK)
2050                 vap->va_rdev = ip->i_rdev;
2051         else
2052                 vap->va_rdev = 0;       /* not a b/c spec. */
2053         mutex_enter(&ip->i_tlock);
2054         ITIMES_NOLOCK(ip);      /* mark correct time in inode */
2055         vap->va_seq = ip->i_seq;
2056         vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
2057         vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000;
2058         vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
2059         vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000;
2060         vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
2061         vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000;
2062         mutex_exit(&ip->i_tlock);
2063
2064         switch (ip->i_mode & IFMT) {
2065
2066         case IFBLK:
2067                 vap->va_blksize = MAXBSIZE;             /* was BLKDEV_IOSIZE */
2068                 break;
2069
2070         case IFCHR:
2071                 vap->va_blksize = MAXBSIZE;
2072                 break;
2073
2074         default:
2075                 vap->va_blksize = ip->i_fs->fs_bsize;
2076                 break;
2077         }
2078         vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks;
2079         rw_exit(&ip->i_contents);
2080         err = 0;
2081
2082 out:
2083         return (err);
2084 }
2085
/*
 * Access-check callback handed to secpolicy_vnode_setattr().
 *
 * The callback takes its object as a 'void *', so the inode arrives
 * untyped and is converted back here before being passed on to
 * ufs_iaccess().  The caller already holds the i_contents lock.
 */
static int
ufs_priv_access(void *vip, int mode, struct cred *cr)
{
        return (ufs_iaccess((struct inode *)vip, mode, cr, 0));
}
2098
2099 /*ARGSUSED4*/
2100 static int
2101 ufs_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
2102     caller_context_t *ct)
2103 {
2104         struct inode *ip = VTOI(vp);
2105         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2106         struct fs *fs;
2107         struct ulockfs *ulp;
2108         char *errmsg1;
2109         char *errmsg2;
2110         long blocks;
2111         long int mask = vap->va_mask;
2112         size_t len1, len2;
2113         int issync;
2114         int trans_size;
2115         int dotrans;
2116         int dorwlock;
2117         int error;
2118         int owner_change;
2119         int dodqlock;
2120         timestruc_t now;
2121         vattr_t oldva;
2122         int retry = 1;
2123         int indeadlock;
2124
2125         /*
2126          * Cannot set these attributes.
2127          */
2128         if ((mask & VATTR_NOSET) || (mask & VATTR_XVATTR))
2129                 return (EINVAL);
2130
2131         /*
2132          * check for forced unmount
2133          */
2134         if (ufsvfsp == NULL)
2135                 return (EIO);
2136
2137         fs = ufsvfsp->vfs_fs;
2138         if (fs->fs_ronly != 0)
2139                 return (EROFS);
2140
2141 again:
2142         errmsg1 = NULL;
2143         errmsg2 = NULL;
2144         dotrans = 0;
2145         dorwlock = 0;
2146         dodqlock = 0;
2147
2148         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK);
2149         if (error)
2150                 goto out;
2151
2152         /*
2153          * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
2154          * This follows the protocol for read()/write().
2155          */
2156         if (vp->v_type != VDIR) {
2157                 /*
2158                  * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
2159                  * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
2160                  * possible, retries the operation.
2161                  */
2162                 indeadlock = ufs_tryirwlock(ulp, &ip->i_rwlock, RW_WRITER);
2163                 if (indeadlock) {
2164                         if (ulp)
2165                                 ufs_lockfs_end(ulp);
2166                         goto again;
2167                 }
2168                 dorwlock = 1;
2169         }
2170
2171         /*
2172          * Truncate file.  Must have write permission and not be a directory.
2173          */
2174         if (mask & VATTR_SIZE) {
2175                 rw_enter(&ip->i_contents, RW_WRITER);
2176                 if (vp->v_type == VDIR) {
2177                         error = EISDIR;
2178                         goto update_inode;
2179                 }
2180                 if (error = ufs_iaccess(ip, IWRITE, cr, 0))
2181                         goto update_inode;
2182
2183                 rw_exit(&ip->i_contents);
2184                 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr);
2185                 if (error) {
2186                         rw_enter(&ip->i_contents, RW_WRITER);
2187                         goto update_inode;
2188                 }
2189
2190                 if (error == 0 && vap->va_size)
2191                         vnevent_truncate(vp, ct);
2192         }
2193
2194         if (ulp) {
2195                 trans_size = (int)TOP_SETATTR_SIZE(ip);
2196                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_SETATTR, trans_size);
2197                 ++dotrans;
2198         }
2199
2200         /*
2201          * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
2202          * This follows the protocol established by
2203          * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
2204          */
2205         if (vp->v_type == VDIR) {
2206                 indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock,
2207                                                   RW_WRITER, TOP_SETATTR,
2208                                                   ufsvfsp, &error, issync,
2209                                                   trans_size);
2210                 if (indeadlock)
2211                         goto again;
2212                 dorwlock = 1;
2213         }
2214
2215         /*
2216          * Grab quota lock if we are changing the file's owner.
2217          */
2218         if (mask & VATTR_UID) {
2219                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2220                 dodqlock = 1;
2221         }
2222         rw_enter(&ip->i_contents, RW_WRITER);
2223
2224         oldva.va_mode = ip->i_mode;
2225         oldva.va_uid = ip->i_uid;
2226         oldva.va_gid = ip->i_gid;
2227
2228         vap->va_mask &= ~VATTR_SIZE;
2229
2230         error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2231             ufs_priv_access, ip);
2232         if (error)
2233                 goto update_inode;
2234
2235         mask = vap->va_mask;
2236
2237         /*
2238          * Change file access modes.
2239          */
2240         if (mask & VATTR_MODE) {
2241                 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT);
2242                 TRANS_INODE(ufsvfsp, ip);
2243                 ip->i_flag |= ICHG;
2244                 if (stickyhack) {
2245                         mutex_enter(&vp->v_lock);
2246                         if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
2247                                 vp->v_flag |= VSWAPLIKE;
2248                         else
2249                                 vp->v_flag &= ~VSWAPLIKE;
2250                         mutex_exit(&vp->v_lock);
2251                 }
2252         }
2253         if (mask & (VATTR_UID|VATTR_GID)) {
2254                 if (mask & VATTR_UID) {
2255                         /*
2256                          * Don't change ownership of the quota inode.
2257                          */
2258                         if (ufsvfsp->vfs_qinod == ip) {
2259                                 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED);
2260                                 error = EINVAL;
2261                                 goto update_inode;
2262                         }
2263
2264                         /*
2265                          * No real ownership change.
2266                          */
2267                         if (ip->i_uid == vap->va_uid) {
2268                                 blocks = 0;
2269                                 owner_change = 0;
2270                         }
2271                         /*
2272                          * Remove the blocks and the file, from the old user's
2273                          * quota.
2274                          */
2275                         else {
2276                                 blocks = ip->i_blocks;
2277                                 owner_change = 1;
2278
2279                                 (void) chkdq(ip, -blocks, /* force */ 1, cr,
2280                                     (char **)NULL, NULL);
2281                                 (void) chkiq(ufsvfsp, /* change */ -1, ip,
2282                                     (uid_t)ip->i_uid, /* force */ 1, cr,
2283                                     (char **)NULL, NULL);
2284                                 dqrele(ip->i_dquot);
2285                         }
2286
2287                         ip->i_uid = vap->va_uid;
2288
2289                         /*
2290                          * There is a real ownership change.
2291                          */
2292                         if (owner_change) {
2293                                 /*
2294                                  * Add the blocks and the file to the new
2295                                  * user's quota.
2296                                  */
2297                                 ip->i_dquot = getinoquota(ip);
2298                                 (void) chkdq(ip, blocks, /* force */ 1, cr,
2299                                     &errmsg1, &len1);
2300                                 (void) chkiq(ufsvfsp, /* change */ 1,
2301                                     NULL, (uid_t)ip->i_uid,
2302                                     /* force */ 1, cr, &errmsg2, &len2);
2303                         }
2304                 }
2305                 if (mask & VATTR_GID) {
2306                         ip->i_gid = vap->va_gid;
2307                 }
2308                 TRANS_INODE(ufsvfsp, ip);
2309                 ip->i_flag |= ICHG;
2310         }
2311         /*
2312          * Change file access or modified times.
2313          */
2314         if (mask & (VATTR_ATIME|VATTR_MTIME)) {
2315                 /* Check that the time value is within ufs range */
2316                 if (((mask & VATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2317                     ((mask & VATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2318                         error = EOVERFLOW;
2319                         goto update_inode;
2320                 }
2321
2322                 /*
2323                  * if the "noaccess" mount option is set and only atime
2324                  * update is requested, do nothing. No error is returned.
2325                  */
2326                 if ((ufsvfsp->vfs_noatime) &&
2327                     ((mask & (VATTR_ATIME|VATTR_MTIME)) == VATTR_ATIME))
2328                         goto skip_atime;
2329
2330                 if (mask & VATTR_ATIME) {
2331                         ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2332                         ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2333                         ip->i_flag &= ~IACC;
2334                 }
2335                 if (mask & VATTR_MTIME) {
2336                         ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2337                         ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2338                         gethrestime(&now);
2339                         if (now.tv_sec > TIME32_MAX) {
2340                                 /*
2341                                  * In 2038, ctime sticks forever..
2342                                  */
2343                                 ip->i_ctime.tv_sec = TIME32_MAX;
2344                                 ip->i_ctime.tv_usec = 0;
2345                         } else {
2346                                 ip->i_ctime.tv_sec = now.tv_sec;
2347                                 ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2348                         }
2349                         ip->i_flag &= ~(IUPD|ICHG);
2350                         ip->i_flag |= IMODTIME;
2351                 }
2352                 TRANS_INODE(ufsvfsp, ip);
2353                 ip->i_flag |= IMOD;
2354         }
2355
2356 skip_atime:
2357         /*
2358          * The presence of a shadow inode may indicate an ACL, but does
2359          * not imply an ACL.  Future FSD types should be handled here too
2360          * and check for the presence of the attribute-specific data
2361          * before referencing it.
2362          */
2363         if (ip->i_shadow) {
2364                 /*
2365                  * XXX if ufs_iupdat is changed to sandbagged write fix
2366                  * ufs_acl_setattr to push ip to keep acls consistent
2367                  *
2368                  * Suppress out of inodes messages if we will retry.
2369                  */
2370                 if (retry)
2371                         ip->i_flag |= IQUIET;
2372                 error = ufs_acl_setattr(ip, vap, cr);
2373                 ip->i_flag &= ~IQUIET;
2374         }
2375
2376 update_inode:
2377         /*
2378          * Setattr always increases the sequence number
2379          */
2380         ip->i_seq++;
2381
2382         /*
2383          * if nfsd and not logging; push synchronously
2384          */
2385         if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
2386                 ufs_iupdat(ip, 1);
2387         } else {
2388                 ITIMES_NOLOCK(ip);
2389         }
2390
2391         rw_exit(&ip->i_contents);
2392         if (dodqlock) {
2393                 rw_exit(&ufsvfsp->vfs_dqrwlock);
2394         }
2395         if (dorwlock)
2396                 rw_exit(&ip->i_rwlock);
2397
2398         if (ulp) {
2399                 if (dotrans) {
2400                         int terr = 0;
2401                         TRANS_END_CSYNC(ufsvfsp, &terr, issync, TOP_SETATTR,
2402                                         trans_size);
2403                         if (error == 0)
2404                                 error = terr;
2405                 }
2406                 ufs_lockfs_end(ulp);
2407         }
2408 out:
2409         /*
2410          * If out of inodes or blocks, see if we can free something
2411          * up from the delete queue.
2412          */
2413         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
2414                 ufs_delete_drain_wait(ufsvfsp, 1);
2415                 retry = 0;
2416                 if (errmsg1 != NULL)
2417                         kmem_free(errmsg1, len1);
2418                 if (errmsg2 != NULL)
2419                         kmem_free(errmsg2, len2);
2420                 goto again;
2421         }
2422         if (errmsg1 != NULL) {
2423                 uprintf(errmsg1);
2424                 kmem_free(errmsg1, len1);
2425         }
2426         if (errmsg2 != NULL) {
2427                 uprintf(errmsg2);
2428                 kmem_free(errmsg2, len2);
2429         }
2430         return (error);
2431 }
2432
2433 /*ARGSUSED*/
2434 static int
2435 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
2436     caller_context_t *ct)
2437 {
2438         struct inode *ip = VTOI(vp);
2439
2440         if (ip->i_ufsvfs == NULL)
2441                 return (EIO);
2442
2443         /*
2444          * The ufs_iaccess function wants to be called with
2445          * mode bits expressed as "ufs specific" bits.
2446          * I.e., VWRITE|VREAD|VEXEC do not make sense to
2447          * ufs_iaccess() but IWRITE|IREAD|IEXEC do.
2448          * But since they're the same we just pass the vnode mode
2449          * bit but just verify that assumption at compile time.
2450          */
2451 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC
2452 #error "ufs_access needs to map Vmodes to Imodes"
2453 #endif
2454         return (ufs_iaccess(ip, mode, cr, 1));
2455 }
2456
2457 /* ARGSUSED */
2458 static int
2459 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr,
2460     caller_context_t *ct)
2461 {
2462         struct inode *ip = VTOI(vp);
2463         struct ufsvfs *ufsvfsp;
2464         struct ulockfs *ulp;
2465         int error;
2466         int fastsymlink;
2467
2468         if (vp->v_type != VLNK) {
2469                 error = EINVAL;
2470                 goto nolockout;
2471         }
2472
2473         /*
2474          * If the symbolic link is empty there is nothing to read.
2475          * Fast-track these empty symbolic links
2476          */
2477         if (ip->i_size == 0) {
2478                 error = 0;
2479                 goto nolockout;
2480         }
2481
2482         ufsvfsp = ip->i_ufsvfs;
2483         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK);
2484         if (error)
2485                 goto nolockout;
2486         /*
2487          * The ip->i_rwlock protects the data blocks used for FASTSYMLINK
2488          */
2489 again:
2490         fastsymlink = 0;
2491         if (ip->i_flag & IFASTSYMLNK) {
2492                 rw_enter(&ip->i_rwlock, RW_READER);
2493                 rw_enter(&ip->i_contents, RW_READER);
2494                 if (ip->i_flag & IFASTSYMLNK) {
2495                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
2496                             (ip->i_fs->fs_ronly == 0) &&
2497                             (!ufsvfsp->vfs_noatime)) {
2498                                 mutex_enter(&ip->i_tlock);
2499                                 ip->i_flag |= IACC;
2500                                 mutex_exit(&ip->i_tlock);
2501                         }
2502                         error = uiomove((caddr_t)&ip->i_db[1],
2503                             MIN(ip->i_size, uiop->uio_resid),
2504                             UIO_READ, uiop);
2505                         ITIMES(ip);
2506                         ++fastsymlink;
2507                 }
2508                 rw_exit(&ip->i_contents);
2509                 rw_exit(&ip->i_rwlock);
2510         }
2511         if (!fastsymlink) {
2512                 ssize_t size;   /* number of bytes read  */
2513                 caddr_t basep;  /* pointer to input data */
2514                 ino_t ino;
2515                 long  igen;
2516                 struct uio tuio;        /* temp uio struct */
2517                 struct uio *tuiop;
2518                 iovec_t tiov;           /* temp iovec struct */
2519                 char kbuf[FSL_SIZE];    /* buffer to hold fast symlink */
2520                 int tflag = 0;          /* flag to indicate temp vars used */
2521
2522                 ino = ip->i_number;
2523                 igen = ip->i_gen;
2524                 size = uiop->uio_resid;
2525                 basep = uiop->uio_iov->iov_base;
2526                 tuiop = uiop;
2527
2528                 rw_enter(&ip->i_rwlock, RW_WRITER);
2529                 rw_enter(&ip->i_contents, RW_WRITER);
2530                 if (ip->i_flag & IFASTSYMLNK) {
2531                         rw_exit(&ip->i_contents);
2532                         rw_exit(&ip->i_rwlock);
2533                         goto again;
2534                 }
2535
2536                 /* can this be a fast symlink and is it a user buffer? */
2537                 if (ip->i_size <= FSL_SIZE &&
2538                     (uiop->uio_segflg == UIO_USERSPACE ||
2539                     uiop->uio_segflg == UIO_USERISPACE)) {
2540
2541                         bzero(&tuio, sizeof (struct uio));
2542                         /*
2543                          * setup a kernel buffer to read link into.  this
2544                          * is to fix a race condition where the user buffer
2545                          * got corrupted before copying it into the inode.
2546                          */
2547                         size = ip->i_size;
2548                         tiov.iov_len = size;
2549                         tiov.iov_base = kbuf;
2550                         tuio.uio_iov = &tiov;
2551                         tuio.uio_iovcnt = 1;
2552                         tuio.uio_offset = uiop->uio_offset;
2553                         tuio.uio_segflg = UIO_SYSSPACE;
2554                         tuio.uio_fmode = uiop->uio_fmode;
2555                         tuio.uio_extflg = uiop->uio_extflg;
2556                         tuio.uio_limit = uiop->uio_limit;
2557                         tuio.uio_resid = size;
2558
2559                         basep = tuio.uio_iov->iov_base;
2560                         tuiop = &tuio;
2561                         tflag = 1;
2562                 }
2563
2564                 error = rdip(ip, tuiop, 0, cr);
2565                 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) {
2566                         rw_exit(&ip->i_contents);
2567                         rw_exit(&ip->i_rwlock);
2568                         goto out;
2569                 }
2570
2571                 if (tflag == 0)
2572                         size -= uiop->uio_resid;
2573
2574                 if ((tflag == 0 && ip->i_size <= FSL_SIZE &&
2575                     ip->i_size == size) || (tflag == 1 &&
2576                     tuio.uio_resid == 0)) {
2577                         error = kcopy(basep, &ip->i_db[1], ip->i_size);
2578                         if (error == 0) {
2579                                 ip->i_flag |= IFASTSYMLNK;
2580                                 /*
2581                                  * free page
2582                                  */
2583                                 (void) fop_putpage(ITOV(ip),
2584                                     0, PAGESIZE,
2585                                     (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC),
2586                                     cr, ct);
2587                         } else {
2588                                 int i;
2589                                 /* error, clear garbage left behind */
2590                                 for (i = 1; i < NDADDR; i++)
2591                                         ip->i_db[i] = 0;
2592                                 for (i = 0; i < NIADDR; i++)
2593                                         ip->i_ib[i] = 0;
2594                         }
2595                 }
2596                 if (tflag == 1) {
2597                         /* now, copy it into the user buffer */
2598                         error = uiomove((caddr_t)kbuf,
2599                             MIN(size, uiop->uio_resid),
2600                             UIO_READ, uiop);
2601                 }
2602                 rw_exit(&ip->i_contents);
2603                 rw_exit(&ip->i_rwlock);
2604         }
2605 out:
2606         if (ulp) {
2607                 ufs_lockfs_end(ulp);
2608         }
2609 nolockout:
2610         return (error);
2611 }
2612
2613 /* ARGSUSED */
2614 static int
2615 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr, caller_context_t *ct)
2616 {
2617         struct inode *ip = VTOI(vp);
2618         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2619         struct ulockfs *ulp;
2620         int error;
2621
2622         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK);
2623         if (error)
2624                 return (error);
2625
2626         if (TRANS_ISTRANS(ufsvfsp)) {
2627                 /*
2628                  * First push out any data pages
2629                  */
2630                 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2631                     (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) {
2632                         error = fop_putpage(vp, 0, (size_t)0,
2633                             0, CRED(), ct);
2634                         if (error)
2635                                 goto out;
2636                 }
2637
2638                 /*
2639                  * Delta any delayed inode times updates
2640                  * and push inode to log.
2641                  * All other inode deltas will have already been delta'd
2642                  * and will be pushed during the commit.
2643                  */
2644                 if (!(syncflag & FDSYNC) &&
2645                     ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) {
2646                         if (ulp) {
2647                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC,
2648                                     TOP_SYNCIP_SIZE);
2649                         }
2650                         rw_enter(&ip->i_contents, RW_READER);
2651                         mutex_enter(&ip->i_tlock);
2652                         ip->i_flag &= ~IMODTIME;
2653                         mutex_exit(&ip->i_tlock);
2654                         ufs_iupdat(ip, I_SYNC);
2655                         rw_exit(&ip->i_contents);
2656                         if (ulp) {
2657                                 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC,
2658                                     TOP_SYNCIP_SIZE);
2659                         }
2660                 }
2661
2662                 /*
2663                  * Commit the Moby transaction
2664                  *
2665                  * Deltas have already been made so we just need to
2666                  * commit them with a synchronous transaction.
2667                  * TRANS_BEGIN_SYNC() will return an error
2668                  * if there are no deltas to commit, for an
2669                  * empty transaction.
2670                  */
2671                 if (ulp) {
2672                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE,
2673                                          &error);
2674                         if (error) {
2675                                 error = 0; /* commit wasn't needed */
2676                                 goto out;
2677                         }
2678                         TRANS_END_SYNC(ufsvfsp, &error, TOP_FSYNC,
2679                                        TOP_COMMIT_SIZE);
2680                 }
2681         } else {        /* not logging */
2682                 if (!(IS_SWAPVP(vp)))
2683                         if (syncflag & FNODSYNC) {
2684                                 /* Just update the inode only */
2685                                 TRANS_IUPDAT(ip, 1);
2686                                 error = 0;
2687                         } else if (syncflag & FDSYNC)
2688                                 /* Do data-synchronous writes */
2689                                 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC);
2690                         else
2691                                 /* Do synchronous writes */
2692                                 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC);
2693
2694                 rw_enter(&ip->i_contents, RW_WRITER);
2695                 if (!error)
2696                         error = ufs_sync_indir(ip);
2697                 rw_exit(&ip->i_contents);
2698         }
2699 out:
2700         if (ulp) {
2701                 ufs_lockfs_end(ulp);
2702         }
2703         return (error);
2704 }
2705
2706 /*ARGSUSED*/
2707 static void
2708 ufs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
2709 {
2710         ufs_iinactive(VTOI(vp));
2711 }
2712
2713 /*
2714  * Unix file system operations having to do with directory manipulation.
2715  */
2716 int ufs_lookup_idle_count = 2;  /* Number of inodes to idle each time */
2717 /* ARGSUSED */
2718 static int
2719 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
2720     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr,
2721     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
2722 {
2723         struct inode *ip;
2724         struct inode *sip;
2725         struct inode *xip;
2726         struct ufsvfs *ufsvfsp;
2727         struct ulockfs *ulp;
2728         struct vnode *vp;
2729         int error;
2730
2731         /*
2732          * Check flags for type of lookup (regular file or attribute file)
2733          */
2734
2735         ip = VTOI(dvp);
2736
2737         if (flags & LOOKUP_XATTR) {
2738
2739                 /*
2740                  * If not mounted with XATTR support then return EINVAL
2741                  */
2742
2743                 if (!(ip->i_ufsvfs->vfs_vfs->vfs_flag & VFS_XATTR))
2744                         return (EINVAL);
2745                 /*
2746                  * We don't allow recursive attributes...
2747                  * Maybe someday we will.
2748                  */
2749                 if ((ip->i_cflags & IXATTR)) {
2750                         return (EINVAL);
2751                 }
2752
2753                 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) {
2754                         error = ufs_xattr_getattrdir(dvp, &sip, flags, cr);
2755                         if (error) {
2756                                 *vpp = NULL;
2757                                 goto out;
2758                         }
2759
2760                         vp = ITOV(sip);
2761                         dnlc_update(dvp, XATTR_DIR_NAME, vp);
2762                 }
2763
2764                 /*
2765                  * Check accessibility of directory.
2766                  */
2767                 if (vp == DNLC_NO_VNODE) {
2768                         VN_RELE(vp);
2769                         error = ENOENT;
2770                         goto out;
2771                 }
2772                 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr, 1)) != 0) {
2773                         VN_RELE(vp);
2774                         goto out;
2775                 }
2776
2777                 *vpp = vp;
2778                 return (0);
2779         }
2780
2781         /*
2782          * Check for a null component, which we should treat as
2783          * looking at dvp from within it's parent, so we don't
2784          * need a call to ufs_iaccess(), as it has already been
2785          * done.
2786          */
2787         if (nm[0] == 0) {
2788                 VN_HOLD(dvp);
2789                 error = 0;
2790                 *vpp = dvp;
2791                 goto out;
2792         }
2793
2794         /*
2795          * Check for "." ie itself. this is a quick check and
2796          * avoids adding "." into the dnlc (which have been seen
2797          * to occupy >10% of the cache).
2798          */
2799         if ((nm[0] == '.') && (nm[1] == 0)) {
2800                 /*
2801                  * Don't return without checking accessibility
2802                  * of the directory. We only need the lock if
2803                  * we are going to return it.
2804                  */
2805                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) == 0) {
2806                         VN_HOLD(dvp);
2807                         *vpp = dvp;
2808                 }
2809                 goto out;
2810         }
2811
2812         /*
2813          * Fast path: Check the directory name lookup cache.
2814          */
2815         if (vp = dnlc_lookup(dvp, nm)) {
2816                 /*
2817                  * Check accessibility of directory.
2818                  */
2819                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) != 0) {
2820                         VN_RELE(vp);
2821                         goto out;
2822                 }
2823                 if (vp == DNLC_NO_VNODE) {
2824                         VN_RELE(vp);
2825                         error = ENOENT;
2826                         goto out;
2827                 }
2828                 xip = VTOI(vp);
2829                 ulp = NULL;
2830                 goto fastpath;
2831         }
2832
2833         /*
2834          * Keep the idle queue from getting too long by
2835          * idling two inodes before attempting to allocate another.
2836          *    This operation must be performed before entering
2837          *    lockfs or a transaction.
2838          */
2839         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
2840                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
2841                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
2842                         ufs_idle_some(ufs_lookup_idle_count);
2843                 }
2844
2845 retry_lookup:
2846         /*
2847          * Check accessibility of directory.
2848          */
2849         if (error = ufs_diraccess(ip, IEXEC, cr))
2850                 goto out;
2851
2852         ufsvfsp = ip->i_ufsvfs;
2853         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK);
2854         if (error)
2855                 goto out;
2856
2857         error = ufs_dirlook(ip, nm, &xip, cr, 1, 0);
2858
2859 fastpath:
2860         if (error == 0) {
2861                 ip = xip;
2862                 *vpp = ITOV(ip);
2863
2864                 /*
2865                  * If vnode is a device return special vnode instead.
2866                  */
2867                 if (IS_DEVVP(*vpp)) {
2868                         struct vnode *newvp;
2869
2870                         newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
2871                             cr);
2872                         VN_RELE(*vpp);
2873                         if (newvp == NULL)
2874                                 error = ENOSYS;
2875                         else
2876                                 *vpp = newvp;
2877                 } else if (ip->i_cflags & ICOMPRESS) {
2878                         struct vnode *newvp;
2879
2880                         /*
2881                          * Compressed file, substitute dcfs vnode
2882                          */
2883                         newvp = decompvp(*vpp, cr, ct);
2884                         VN_RELE(*vpp);
2885                         if (newvp == NULL)
2886                                 error = ENOSYS;
2887                         else
2888                                 *vpp = newvp;
2889                 }
2890         }
2891         if (ulp) {
2892                 ufs_lockfs_end(ulp);
2893         }
2894
2895         if (error == EAGAIN)
2896                 goto retry_lookup;
2897
2898 out:
2899         return (error);
2900 }
2901
2902 /*ARGSUSED*/
2903 static int
2904 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl,
2905     int mode, struct vnode **vpp, struct cred *cr, int flag,
2906     caller_context_t *ct, vsecattr_t *vsecp)
2907 {
2908         struct inode *ip;
2909         struct inode *xip;
2910         struct inode *dip;
2911         struct vnode *xvp;
2912         struct ufsvfs *ufsvfsp;
2913         struct ulockfs *ulp;
2914         int error;
2915         int issync;
2916         int truncflag;
2917         int trans_size;
2918         int noentry;
2919         int defer_dip_seq_update = 0;   /* need to defer update of dip->i_seq */
2920         int retry = 1;
2921         int indeadlock;
2922
2923 again:
2924         ip = VTOI(dvp);
2925         ufsvfsp = ip->i_ufsvfs;
2926         truncflag = 0;
2927
2928         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK);
2929         if (error)
2930                 goto out;
2931
2932         if (ulp) {
2933                 trans_size = (int)TOP_CREATE_SIZE(ip);
2934                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_CREATE, trans_size);
2935         }
2936
2937         if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
2938                 vap->va_mode &= ~VSVTX;
2939
2940         if (*name == '\0') {
2941                 /*
2942                  * Null component name refers to the directory itself.
2943                  */
2944                 VN_HOLD(dvp);
2945                 /*
2946                  * Even though this is an error case, we need to grab the
2947                  * quota lock since the error handling code below is common.
2948                  */
2949                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2950                 rw_enter(&ip->i_contents, RW_WRITER);
2951                 error = EEXIST;
2952         } else {
2953                 xip = NULL;
2954                 noentry = 0;
2955                 /*
2956                  * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
2957                  * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
2958                  * possible, retries the operation.
2959                  */
2960                 indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock,
2961                                                   RW_WRITER, TOP_CREATE,
2962                                                   ufsvfsp, &error, issync,
2963                                                   trans_size);
2964                 if (indeadlock)
2965                         goto again;
2966
2967                 xvp = dnlc_lookup(dvp, name);
2968                 if (xvp == DNLC_NO_VNODE) {
2969                         noentry = 1;
2970                         VN_RELE(xvp);
2971                         xvp = NULL;
2972                 }
2973                 if (xvp) {
2974                         rw_exit(&ip->i_rwlock);
2975                         if (error = ufs_iaccess(ip, IEXEC, cr, 1)) {
2976                                 VN_RELE(xvp);
2977                         } else {
2978                                 error = EEXIST;
2979                                 xip = VTOI(xvp);
2980                         }
2981                 } else {
2982                         /*
2983                          * Suppress file system full message if we will retry
2984                          */
2985                         error = ufs_direnter_cm(ip, name, DE_CREATE,
2986                             vap, &xip, cr, (noentry | (retry ? IQUIET : 0)));
2987                         if (error == EAGAIN) {
2988                                 if (ulp) {
2989                                         TRANS_END_CSYNC(ufsvfsp, &error,
2990                                                         issync, TOP_CREATE,
2991                                                         trans_size);
2992                                         ufs_lockfs_end(ulp);
2993                                 }
2994                                 goto again;
2995                         }
2996                         rw_exit(&ip->i_rwlock);
2997                 }
2998                 ip = xip;
2999                 if (ip != NULL) {
3000                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3001                         rw_enter(&ip->i_contents, RW_WRITER);
3002                 }
3003         }
3004
3005         /*
3006          * If the file already exists and this is a non-exclusive create,
3007          * check permissions and allow access for non-directories.
3008          * Read-only create of an existing directory is also allowed.
3009          * We fail an exclusive create of anything which already exists.
3010          */
3011         if (error == EEXIST) {
3012                 dip = VTOI(dvp);
3013                 if (excl == NONEXCL) {
3014                         if ((((ip->i_mode & IFMT) == IFDIR) ||
3015                             ((ip->i_mode & IFMT) == IFATTRDIR)) &&
3016                             (mode & IWRITE))
3017                                 error = EISDIR;
3018                         else if (mode)
3019                                 error = ufs_iaccess(ip, mode, cr, 0);
3020                         else
3021                                 error = 0;
3022                 }
3023                 if (error) {
3024                         rw_exit(&ip->i_contents);
3025                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3026                         VN_RELE(ITOV(ip));
3027                         goto unlock;
3028                 }
3029                 /*
3030                  * If the error EEXIST was set, then i_seq can not
3031                  * have been updated. The sequence number interface
3032                  * is defined such that a non-error fop_create must
3033                  * increase the dir va_seq it by at least one. If we
3034                  * have cleared the error, increase i_seq. Note that
3035                  * we are increasing the dir i_seq and in rare cases
3036                  * ip may actually be from the dvp, so we already have
3037                  * the locks and it will not be subject to truncation.
3038                  * In case we have to update i_seq of the parent
3039                  * directory dip, we have to defer it till we have
3040                  * released our locks on ip due to lock ordering requirements.
3041                  */
3042                 if (ip != dip)
3043                         defer_dip_seq_update = 1;
3044                 else
3045                         ip->i_seq++;
3046
3047                 if (((ip->i_mode & IFMT) == IFREG) &&
3048                     (vap->va_mask & VATTR_SIZE) && vap->va_size == 0) {
3049                         /*
3050                          * Truncate regular files, if requested by caller.
3051                          * Grab i_rwlock to make sure no one else is
3052                          * currently writing to the file (we promised
3053                          * bmap we would do this).
3054                          * Must get the locks in the correct order.
3055                          */
3056                         if (ip->i_size == 0) {
3057                                 ip->i_flag |= ICHG | IUPD;
3058                                 ip->i_seq++;
3059                                 TRANS_INODE(ufsvfsp, ip);
3060                         } else {
3061                                 /*
3062                                  * Large Files: Why this check here?
3063                                  * Though we do it in vn_create() we really
3064                                  * want to guarantee that we do not destroy
3065                                  * Large file data by atomically checking
3066                                  * the size while holding the contents
3067                                  * lock.
3068                                  */
3069                                 if (flag && !(flag & FOFFMAX) &&
3070                                     ((ip->i_mode & IFMT) == IFREG) &&
3071                                     (ip->i_size > (offset_t)MAXOFF32_T)) {
3072                                         rw_exit(&ip->i_contents);
3073                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3074                                         error = EOVERFLOW;
3075                                         goto unlock;
3076                                 }
3077                                 if (TRANS_ISTRANS(ufsvfsp))
3078                                         truncflag++;
3079                                 else {
3080                                         rw_exit(&ip->i_contents);
3081                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3082                                         indeadlock = ufs_tryirwlock_trans(ulp,
3083                                                                           &ip->i_rwlock,
3084                                                                           RW_WRITER,
3085                                                                           TOP_CREATE,
3086                                                                           ufsvfsp,
3087                                                                           &error,
3088                                                                           issync,
3089                                                                           trans_size);
3090                                         if (indeadlock) {
3091                                                 VN_RELE(ITOV(ip));
3092                                                 goto again;
3093                                         }
3094                                         rw_enter(&ufsvfsp->vfs_dqrwlock,
3095                                             RW_READER);
3096                                         rw_enter(&ip->i_contents, RW_WRITER);
3097                                         (void) ufs_itrunc(ip, 0, 0,
3098                                             cr);
3099                                         rw_exit(&ip->i_rwlock);
3100                                 }
3101
3102                         }
3103                         if (error == 0) {
3104                                 vnevent_create(ITOV(ip), ct);
3105                         }
3106                 }
3107         }
3108
3109         if (error) {
3110                 if (ip != NULL) {
3111                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3112                         rw_exit(&ip->i_contents);
3113                 }
3114                 goto unlock;
3115         }
3116
3117         *vpp = ITOV(ip);
3118         ITIMES(ip);
3119         rw_exit(&ip->i_contents);
3120         rw_exit(&ufsvfsp->vfs_dqrwlock);
3121
3122         /*
3123          * If vnode is a device return special vnode instead.
3124          */
3125         if (!error && IS_DEVVP(*vpp)) {
3126                 struct vnode *newvp;
3127
3128                 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
3129                 VN_RELE(*vpp);
3130                 if (newvp == NULL) {
3131                         error = ENOSYS;
3132                         goto unlock;
3133                 }
3134                 truncflag = 0;
3135                 *vpp = newvp;
3136         }
3137 unlock:
3138
3139         /*
3140          * Do the deferred update of the parent directory's sequence
3141          * number now.
3142          */
3143         if (defer_dip_seq_update == 1) {
3144                 rw_enter(&dip->i_contents, RW_READER);
3145                 mutex_enter(&dip->i_tlock);
3146                 dip->i_seq++;
3147                 mutex_exit(&dip->i_tlock);
3148                 rw_exit(&dip->i_contents);
3149         }
3150
3151         if (ulp) {
3152                 int terr = 0;
3153
3154                 TRANS_END_CSYNC(ufsvfsp, &terr, issync, TOP_CREATE,
3155                                 trans_size);
3156
3157                 /*
3158                  * If we haven't had a more interesting failure
3159                  * already, then anything that might've happened
3160                  * here should be reported.
3161                  */
3162                 if (error == 0)
3163                         error = terr;
3164         }
3165
3166         if (!error && truncflag) {
3167                 indeadlock = ufs_tryirwlock(ulp, &ip->i_rwlock, RW_WRITER);
3168                 if (indeadlock) {
3169                         if (ulp)
3170                                 ufs_lockfs_end(ulp);
3171                         VN_RELE(ITOV(ip));
3172                         goto again;
3173                 }
3174                 (void) TRANS_ITRUNC(ip, 0, 0, cr);
3175                 rw_exit(&ip->i_rwlock);
3176         }
3177
3178         if (ulp)
3179                 ufs_lockfs_end(ulp);
3180
3181         /*
3182          * If no inodes available, try to free one up out of the
3183          * pending delete queue.
3184          */
3185         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3186                 ufs_delete_drain_wait(ufsvfsp, 1);
3187                 retry = 0;
3188                 goto again;
3189         }
3190
3191 out:
3192         return (error);
3193 }
3194
3195 extern int ufs_idle_max;
3196 /*ARGSUSED*/
3197 static int
3198 ufs_remove(struct vnode *vp, char *nm, struct cred *cr, caller_context_t *ct,
3199     int flags)
3200 {
3201         struct inode *ip = VTOI(vp);
3202         struct ufsvfs *ufsvfsp  = ip->i_ufsvfs;
3203         struct ulockfs *ulp;
3204         vnode_t *rmvp = NULL;   /* Vnode corresponding to name being removed */
3205         int indeadlock;
3206         int error;
3207         int issync;
3208         int trans_size;
3209
3210         /*
3211          * don't let the delete queue get too long
3212          */
3213         if (ufsvfsp == NULL) {
3214                 error = EIO;
3215                 goto out;
3216         }
3217         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3218                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3219
3220         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3221         if (rmvp != NULL) {
3222                 /* Only send the event if there were no errors */
3223                 if (error == 0)
3224                         vnevent_remove(rmvp, vp, nm, ct);
3225                 VN_RELE(rmvp);
3226         }
3227
3228 retry_remove:
3229         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK);
3230         if (error)
3231                 goto out;
3232
3233         if (ulp)
3234                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_REMOVE,
3235                                   trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp)));
3236
3237         /*
3238          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3239          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3240          * possible, retries the operation.
3241          */
3242         indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock, RW_WRITER,
3243                                           TOP_REMOVE, ufsvfsp, &error,
3244                                           issync, trans_size);
3245         if (indeadlock)
3246                 goto retry_remove;
3247         error = ufs_dirremove(ip, nm, NULL, NULL, DR_REMOVE, cr);
3248         rw_exit(&ip->i_rwlock);
3249
3250         if (ulp) {
3251                 TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_REMOVE,
3252                                 trans_size);
3253                 ufs_lockfs_end(ulp);
3254         }
3255
3256 out:
3257         return (error);
3258 }
3259
3260 /*
3261  * Link a file or a directory.  Only privileged processes are allowed to
3262  * make links to directories.
3263  */
3264 /*ARGSUSED*/
3265 static int
3266 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr,
3267     caller_context_t *ct, int flags)
3268 {
3269         struct inode *sip;
3270         struct inode *tdp = VTOI(tdvp);
3271         struct ufsvfs *ufsvfsp = tdp->i_ufsvfs;
3272         struct ulockfs *ulp;
3273         struct vnode *realvp;
3274         int error;
3275         int issync;
3276         int trans_size;
3277         int isdev;
3278         int indeadlock;
3279
3280 retry_link:
3281         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK);
3282         if (error)
3283                 goto out;
3284
3285         if (ulp)
3286                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_LINK,
3287                                   trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp)));
3288
3289         if (fop_realvp(svp, &realvp, ct) == 0)
3290                 svp = realvp;
3291
3292         /*
3293          * Make sure link for extended attributes is valid
3294          * We only support hard linking of attr in ATTRDIR to ATTRDIR
3295          *
3296          * Make certain we don't attempt to look at a device node as
3297          * a ufs inode.
3298          */
3299
3300         isdev = IS_DEVVP(svp);
3301         if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) &&
3302             ((tdp->i_mode & IFMT) == IFATTRDIR)) ||
3303             ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) &&
3304             ((tdp->i_mode & IFMT) == IFDIR))) {
3305                 error = EINVAL;
3306                 goto unlock;
3307         }
3308
3309         sip = VTOI(svp);
3310         if ((svp->v_type == VDIR &&
3311             secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) ||
3312             (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) {
3313                 error = EPERM;
3314                 goto unlock;
3315         }
3316
3317         /*
3318          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3319          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3320          * possible, retries the operation.
3321          */
3322         indeadlock = ufs_tryirwlock_trans(ulp, &tdp->i_rwlock, RW_WRITER,
3323                                           TOP_LINK, ufsvfsp, &error, issync,
3324                                           trans_size);
3325         if (indeadlock)
3326                 goto retry_link;
3327         error = ufs_direnter_lr(tdp, tnm, DE_LINK, NULL, sip, cr);
3328         rw_exit(&tdp->i_rwlock);
3329
3330 unlock:
3331         if (ulp) {
3332                 TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_LINK, trans_size);
3333                 ufs_lockfs_end(ulp);
3334         }
3335
3336         if (!error) {
3337                 vnevent_link(svp, ct);
3338         }
3339 out:
3340         return (error);
3341 }
3342
/*
 * Bookkeeping for the retry paths in ufs_rename().
 *
 * ufs_rename_retry_cnt is bumped when the second directory i_rwlock could
 * not be acquired (with SLOCK not set) and the lock order is swapped for
 * another attempt.
 */
uint64_t ufs_rename_retry_cnt = 0;

/* Bumped when rw_tryupgrade() of a directory i_rwlock fails. */
uint64_t ufs_rename_upgrade_retry_cnt = 0;

/* Bumped when ufs_dircheckpath() backs out with EAGAIN. */
uint64_t ufs_rename_dircheck_retry_cnt = 0;

/*
 * Value handed to delay() before ufs_rename() retries after a failed
 * upgrade or an EAGAIN from ufs_dircheckpath().
 */
clock_t  ufs_rename_backoff_delay = 1;
3347
3348 /*
3349  * Rename a file or directory.
3350  * We are given the vnode and entry string of the source and the
3351  * vnode and entry string of the place we want to move the source
3352  * to (the target). The essential operation is:
3353  *      unlink(target);
3354  *      link(source, target);
3355  *      unlink(source);
3356  * but "atomically".  Can't do full commit without saving state in
3357  * the inode on disk, which isn't feasible at this time.  Best we
3358  * can do is always guarantee that the TARGET exists.
3359  */
3360
3361 /*ARGSUSED*/
3362 static int
3363 ufs_rename(struct vnode *sdvp, char *snm, struct vnode *tdvp, char *tnm,
3364     struct cred *cr, caller_context_t *ct, int flags)
3365 {
3366         struct inode *sip = NULL;       /* source inode */
3367         struct inode *ip = NULL;        /* check inode */
3368         struct inode *sdp;              /* old (source) parent inode */
3369         struct inode *tdp;              /* new (target) parent inode */
3370         struct vnode *svp = NULL;       /* source vnode */
3371         struct vnode *tvp = NULL;       /* target vnode, if it exists */
3372         struct vnode *realvp;
3373         struct ufsvfs *ufsvfsp;
3374         struct ulockfs *ulp = NULL;
3375         struct ufs_slot slot;
3376         timestruc_t now;
3377         int error;
3378         int issync;
3379         int trans_size;
3380         krwlock_t *first_lock;
3381         krwlock_t *second_lock;
3382         krwlock_t *reverse_lock;
3383         int serr, terr;
3384
3385         sdp = VTOI(sdvp);
3386         slot.fbp = NULL;
3387         ufsvfsp = sdp->i_ufsvfs;
3388
3389         if (fop_realvp(tdvp, &realvp, ct) == 0)
3390                 tdvp = realvp;
3391
3392         /* Must do this before taking locks in case of DNLC miss */
3393         terr = ufs_eventlookup(tdvp, tnm, cr, &tvp);
3394         serr = ufs_eventlookup(sdvp, snm, cr, &svp);
3395
3396         if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) {
3397                 if (tvp != NULL)
3398                         vnevent_pre_rename_dest(tvp, tdvp, tnm, ct);
3399
3400                 /*
3401                  * Notify the target directory of the rename event
3402                  * if source and target directories are not the same.
3403                  */
3404                 if (sdvp != tdvp)
3405                         vnevent_pre_rename_dest_dir(tdvp, svp, tnm, ct);
3406
3407                 if (svp != NULL)
3408                         vnevent_pre_rename_src(svp, sdvp, snm, ct);
3409         }
3410
3411         if (svp != NULL)
3412                 VN_RELE(svp);
3413
3414 retry_rename:
3415         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
3416         if (error)
3417                 goto unlock;
3418
3419         if (ulp)
3420                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_RENAME,
3421                                   trans_size = (int)TOP_RENAME_SIZE(sdp));
3422
3423         if (fop_realvp(tdvp, &realvp, ct) == 0)
3424                 tdvp = realvp;
3425
3426         tdp = VTOI(tdvp);
3427
3428         /*
3429          * We only allow renaming of attributes from ATTRDIR to ATTRDIR.
3430          */
3431         if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) {
3432                 error = EINVAL;
3433                 goto unlock;
3434         }
3435
3436         /*
3437          * Check accessibility of directory.
3438          */
3439         if (error = ufs_diraccess(sdp, IEXEC, cr))
3440                 goto unlock;
3441
3442         /*
3443          * Look up inode of file we're supposed to rename.
3444          */
3445         gethrestime(&now);
3446         if (error = ufs_dirlook(sdp, snm, &sip, cr, 0, 0)) {
3447                 if (error == EAGAIN) {
3448                         if (ulp) {
3449                                 TRANS_END_CSYNC(ufsvfsp, &error, issync,
3450                                                 TOP_RENAME, trans_size);
3451                                 ufs_lockfs_end(ulp);
3452                         }
3453                         goto retry_rename;
3454                 }
3455
3456                 goto unlock;
3457         }
3458
3459         /*
3460          * Lock both the source and target directories (they may be
3461          * the same) to provide the atomicity semantics that was
3462          * previously provided by the per file system vfs_rename_lock
3463          *
3464          * with vfs_rename_lock removed to allow simultaneous renames
3465          * within a file system, ufs_dircheckpath can deadlock while
3466          * traversing back to ensure that source is not a parent directory
3467          * of target parent directory. This is because we get into
3468          * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER.
3469          * If the tdp and sdp of the simultaneous renames happen to be
3470          * in the path of each other, it can lead to a deadlock. This
3471          * can be avoided by getting the locks as RW_READER here and then
3472          * upgrading to RW_WRITER after completing the ufs_dircheckpath.
3473          *
3474          * We hold the target directory's i_rwlock after calling
3475          * ufs_lockfs_begin but in many other operations (like ufs_readdir)
3476          * fop_rwlock is explicitly called by the filesystem independent code
3477          * before calling the file system operation. In these cases the order
3478          * is reversed (i.e i_rwlock is taken first and then ufs_lockfs_begin
3479          * is called). This is fine as long as ufs_lockfs_begin acts as a VOP
3480          * counter but with ufs_quiesce setting the SLOCK bit this becomes a
3481          * synchronizing object which might lead to a deadlock. So we use
3482          * rw_tryenter instead of rw_enter. If we fail to get this lock and
3483          * find that SLOCK bit is set, we call ufs_lockfs_end and restart the
3484          * operation.
3485          */
3486 retry:
3487         first_lock = &tdp->i_rwlock;
3488         second_lock = &sdp->i_rwlock;
3489 retry_firstlock:
3490         if (!rw_tryenter(first_lock, RW_READER)) {
3491                 /*
3492                  * We didn't get the lock. Check if the SLOCK is set in the
3493                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3494                  * and wait for SLOCK to be cleared.
3495                  */
3496
3497                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3498                         TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_RENAME,
3499                                         trans_size);
3500                         ufs_lockfs_end(ulp);
3501                         goto retry_rename;
3502
3503                 } else {
3504                         /*
3505                          * SLOCK isn't set so this is a genuine synchronization
3506                          * case. Let's try again after giving them a breather.
3507                          */
3508                         delay(RETRY_LOCK_DELAY);
3509                         goto  retry_firstlock;
3510                 }
3511         }
3512         /*
3513          * Need to check if the tdp and sdp are same !!!
3514          */
3515         if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) {
3516                 /*
3517                  * We didn't get the lock. Check if the SLOCK is set in the
3518                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3519                  * and wait for SLOCK to be cleared.
3520                  */
3521
3522                 rw_exit(first_lock);
3523                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3524                         TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_RENAME,
3525                                         trans_size);
3526                         ufs_lockfs_end(ulp);
3527                         goto retry_rename;
3528
3529                 } else {
3530                         /*
3531                          * So we couldn't get the second level peer lock *and*
3532                          * the SLOCK bit isn't set. We may be contending with
3533                          * someone who wants these locks the other way
3534                          * round. Reverse the locks in case there is heavy
3535                          * contention for the second level lock.
3536                          */
3537                         reverse_lock = first_lock;
3538                         first_lock = second_lock;
3539                         second_lock = reverse_lock;
3540                         ufs_rename_retry_cnt++;
3541                         goto  retry_firstlock;
3542                 }
3543         }
3544
3545         if (sip == tdp) {
3546                 error = EINVAL;
3547                 goto errout;
3548         }
3549         /*
3550          * Make sure we can delete the source entry.  This requires
3551          * write permission on the containing directory.
3552          * Check for sticky directories.
3553          */
3554         rw_enter(&sdp->i_contents, RW_READER);
3555         rw_enter(&sip->i_contents, RW_READER);
3556         if ((error = ufs_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
3557             (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) {
3558                 rw_exit(&sip->i_contents);
3559                 rw_exit(&sdp->i_contents);
3560                 goto errout;
3561         }
3562
3563         /*
3564          * If this is a rename of a directory and the parent is
3565          * different (".." must be changed), then the source
3566          * directory must not be in the directory hierarchy
3567          * above the target, as this would orphan everything
3568          * below the source directory.  Also the user must have
3569          * write permission in the source so as to be able to
3570          * change "..".
3571          */
3572         if ((((sip->i_mode & IFMT) == IFDIR) ||
3573             ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) {
3574                 ino_t   inum;
3575
3576                 if (error = ufs_iaccess(sip, IWRITE, cr, 0)) {
3577                         rw_exit(&sip->i_contents);
3578                         rw_exit(&sdp->i_contents);
3579                         goto errout;
3580                 }
3581                 inum = sip->i_number;
3582                 rw_exit(&sip->i_contents);
3583                 rw_exit(&sdp->i_contents);
3584                 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) {
3585                         /*
3586                          * If we got EAGAIN ufs_dircheckpath detected a
3587                          * potential deadlock and backed out. We need
3588                          * to retry the operation since sdp and tdp have
3589                          * to be released to avoid the deadlock.
3590                          */
3591                         if (error == EAGAIN) {
3592                                 rw_exit(&tdp->i_rwlock);
3593                                 if (tdp != sdp)
3594                                         rw_exit(&sdp->i_rwlock);
3595                                 delay(ufs_rename_backoff_delay);
3596                                 ufs_rename_dircheck_retry_cnt++;
3597                                 goto retry;
3598                         }
3599                         goto errout;
3600                 }
3601         } else {
3602                 rw_exit(&sip->i_contents);
3603                 rw_exit(&sdp->i_contents);
3604         }
3605
3606
3607         /*
3608          * Check for renaming '.' or '..' or alias of '.'
3609          */
3610         if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) {
3611                 error = EINVAL;
3612                 goto errout;
3613         }
3614
3615         /*
3616          * Simultaneous renames can deadlock in ufs_dircheckpath since it
3617          * tries to traverse back the file tree with both tdp and sdp held
3618          * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks
3619          * as RW_READERS  till ufs_dircheckpath is done.
3620          * Now that ufs_dircheckpath is done with, we can upgrade the locks
3621          * to RW_WRITER.
3622          */
3623         if (!rw_tryupgrade(&tdp->i_rwlock)) {
3624                 /*
3625                  * The upgrade failed. We got to give away the lock
3626                  * as to avoid deadlocking with someone else who is
3627                  * waiting for writer lock. With the lock gone, we
3628                  * cannot be sure the checks done above will hold
3629                  * good when we eventually get them back as writer.
3630                  * So if we can't upgrade we drop the locks and retry
3631                  * everything again.
3632                  */
3633                 rw_exit(&tdp->i_rwlock);
3634                 if (tdp != sdp)
3635                         rw_exit(&sdp->i_rwlock);
3636                 delay(ufs_rename_backoff_delay);
3637                 ufs_rename_upgrade_retry_cnt++;
3638                 goto retry;
3639         }
3640         if (tdp != sdp) {
3641                 if (!rw_tryupgrade(&sdp->i_rwlock)) {
3642                         /*
3643                          * The upgrade failed. We got to give away the lock
3644                          * as to avoid deadlocking with someone else who is
3645                          * waiting for writer lock. With the lock gone, we
3646                          * cannot be sure the checks done above will hold
3647                          * good when we eventually get them back as writer.
3648                          * So if we can't upgrade we drop the locks and retry
3649                          * everything again.
3650                          */
3651                         rw_exit(&tdp->i_rwlock);
3652                         rw_exit(&sdp->i_rwlock);
3653                         delay(ufs_rename_backoff_delay);
3654                         ufs_rename_upgrade_retry_cnt++;
3655                         goto retry;
3656                 }
3657         }
3658
3659         /*
3660          * Now that all the locks are held check to make sure another thread
3661          * didn't slip in and take out the sip.
3662          */
3663         slot.status = NONE;
3664         if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec ||
3665             sip->i_ctime.tv_sec > now.tv_sec) {
3666                 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
3667                 rw_enter(&sdp->i_contents, RW_WRITER);
3668                 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot,
3669                     &ip, cr, 0);
3670                 rw_exit(&sdp->i_contents);
3671                 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock);
3672                 if (error) {
3673                         goto errout;
3674                 }
3675                 if (ip == NULL) {
3676                         error = ENOENT;
3677                         goto errout;
3678                 } else {
3679                         /*
3680                          * If the inode was found need to drop the v_count
3681                          * so as not to keep the filesystem from being
3682                          * unmounted at a later time.
3683                          */
3684                         VN_RELE(ITOV(ip));
3685                 }
3686
3687                 /*
3688                  * Release the slot.fbp that has the page mapped and
3689                  * locked SE_SHARED, and could be used in in
3690                  * ufs_direnter_lr() which needs to get the SE_EXCL lock
3691                  * on said page.
3692                  */
3693                 if (slot.fbp) {
3694                         fbrelse(slot.fbp, S_OTHER);
3695                         slot.fbp = NULL;
3696                 }
3697         }
3698
3699         /*
3700          * Link source to the target.
3701          */
3702         if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr)) {
3703                 /*
3704                  * ESAME isn't really an error; it indicates that the
3705                  * operation should not be done because the source and target
3706                  * are the same file, but that no error should be reported.
3707                  */
3708                 if (error == ESAME)
3709                         error = 0;
3710                 goto errout;
3711         }
3712
3713         if (error == 0 && tvp != NULL)
3714                 vnevent_rename_dest(tvp, tdvp, tnm, ct);
3715
3716         /*
3717          * Unlink the source.
3718          * Remove the source entry.  ufs_dirremove() checks that the entry
3719          * still reflects sip, and returns an error if it doesn't.
3720          * If the entry has changed just forget about it.  Release
3721          * the source inode.
3722          */
3723         if ((error = ufs_dirremove(sdp, snm, sip, NULL,
3724             DR_RENAME, cr)) == ENOENT)
3725                 error = 0;
3726
3727         if (error == 0) {
3728                 vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
3729                 /*
3730                  * Notify the target directory of the rename event
3731                  * if source and target directories are not the same.
3732                  */
3733                 if (sdvp != tdvp)
3734                         vnevent_rename_dest_dir(tdvp, ct);
3735         }
3736
3737 errout:
3738         if (slot.fbp)
3739                 fbrelse(slot.fbp, S_OTHER);
3740
3741         rw_exit(&tdp->i_rwlock);
3742         if (sdp != tdp) {
3743                 rw_exit(&sdp->i_rwlock);
3744         }
3745
3746 unlock:
3747         if (tvp != NULL)
3748                 VN_RELE(tvp);
3749         if (sip != NULL)
3750                 VN_RELE(ITOV(sip));
3751
3752         if (ulp) {
3753                 TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_RENAME,
3754                                 trans_size);
3755                 ufs_lockfs_end(ulp);
3756         }
3757
3758         return (error);
3759 }
3760
3761 /*ARGSUSED*/
3762 static int
3763 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
3764     struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
3765     vsecattr_t *vsecp)
3766 {
3767         struct inode *ip;
3768         struct inode *xip;
3769         struct ufsvfs *ufsvfsp;
3770         struct ulockfs *ulp;
3771         int error;
3772         int issync;
3773         int trans_size;
3774         int indeadlock;
3775         int retry = 1;
3776
3777         ASSERT((vap->va_mask & (VATTR_TYPE|VATTR_MODE)) == (VATTR_TYPE|VATTR_MODE));
3778
3779         /*
3780          * Can't make directory in attr hidden dir
3781          */
3782         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
3783                 return (EINVAL);
3784
3785 again:
3786         ip = VTOI(dvp);
3787         ufsvfsp = ip->i_ufsvfs;
3788         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3789         if (error)
3790                 goto out;
3791         if (ulp)
3792                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_MKDIR,
3793                                   trans_size = (int)TOP_MKDIR_SIZE(ip));
3794
3795         /*
3796          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3797          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3798          * possible, retries the operation.
3799          */
3800         indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock, RW_WRITER,
3801                                           TOP_MKDIR, ufsvfsp, &error, issync,
3802                                           trans_size);
3803         if (indeadlock)
3804                 goto again;
3805
3806         error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr,
3807             (retry ? IQUIET : 0));
3808         if (error == EAGAIN) {
3809                 if (ulp) {
3810                         TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_MKDIR,
3811                                         trans_size);
3812                         ufs_lockfs_end(ulp);
3813                 }
3814                 goto again;
3815         }
3816
3817         rw_exit(&ip->i_rwlock);
3818         if (error == 0) {
3819                 ip = xip;
3820                 *vpp = ITOV(ip);
3821         } else if (error == EEXIST)
3822                 VN_RELE(ITOV(xip));
3823
3824         if (ulp) {
3825                 int terr = 0;
3826                 TRANS_END_CSYNC(ufsvfsp, &terr, issync, TOP_MKDIR, trans_size);
3827                 ufs_lockfs_end(ulp);
3828                 if (error == 0)
3829                         error = terr;
3830         }
3831 out:
3832         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3833                 ufs_delete_drain_wait(ufsvfsp, 1);
3834                 retry = 0;
3835                 goto again;
3836         }
3837
3838         return (error);
3839 }
3840
3841 /*ARGSUSED*/
3842 static int
3843 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr,
3844     caller_context_t *ct, int flags)
3845 {
3846         struct inode *ip = VTOI(vp);
3847         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
3848         struct ulockfs *ulp;
3849         vnode_t *rmvp = NULL;   /* Vnode of removed directory */
3850         int error;
3851         int issync;
3852         int trans_size;
3853         int indeadlock;
3854
3855         /*
3856          * don't let the delete queue get too long
3857          */
3858         if (ufsvfsp == NULL) {
3859                 error = EIO;
3860                 goto out;
3861         }
3862         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3863                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3864
3865         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3866         if (rmvp != NULL) {
3867                 /* Only send the event if there were no errors */
3868                 if (error == 0)
3869                         vnevent_rmdir(rmvp, vp, nm, ct);
3870                 VN_RELE(rmvp);
3871         }
3872
3873 retry_rmdir:
3874         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK);
3875         if (error)
3876                 goto out;
3877
3878         if (ulp)
3879                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_RMDIR,
3880                                   trans_size = TOP_RMDIR_SIZE);
3881
3882         /*
3883          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3884          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3885          * possible, retries the operation.
3886          */
3887         indeadlock = ufs_tryirwlock_trans(ulp, &ip->i_rwlock, RW_WRITER,
3888                                           TOP_RMDIR, ufsvfsp, &error, issync,
3889                                           trans_size);
3890         if (indeadlock)
3891                 goto retry_rmdir;
3892         error = ufs_dirremove(ip, nm, NULL, cdir, DR_RMDIR, cr);
3893
3894         rw_exit(&ip->i_rwlock);
3895
3896         if (ulp) {
3897                 TRANS_END_CSYNC(ufsvfsp, &error, issync, TOP_RMDIR,
3898                                 trans_size);
3899                 ufs_lockfs_end(ulp);
3900         }
3901
3902 out:
3903         return (error);
3904 }
3905
3906 /* ARGSUSED */
3907 static int
3908 ufs_readdir(struct vnode *vp, struct uio *uiop, struct cred *cr, int *eofp,
3909     caller_context_t *ct, int flags)
3910 {
3911         struct iovec *iovp;
3912         struct inode *ip;
3913         struct direct *idp;
3914         struct dirent64 *odp;
3915         struct fbuf *fbp;
3916         struct ufsvfs *ufsvfsp;
3917         struct ulockfs *ulp;
3918         caddr_t outbuf;
3919         size_t bufsize;
3920         uint_t offset;
3921         uint_t bytes_wanted, total_bytes_wanted;
3922         int incount = 0;
3923         int outcount = 0;
3924         int error;
3925
3926         ip = VTOI(vp);
3927         ASSERT(RW_READ_HELD(&ip->i_rwlock));
3928
3929         if (uiop->uio_loffset >= MAXOFF32_T) {
3930                 if (eofp)
3931                         *eofp = 1;
3932                 return (0);
3933         }
3934
3935         /*
3936          * Check if we have been called with a valid iov_len
3937          * and bail out if not, otherwise we may potentially loop
3938          * forever further down.
3939          */
3940         if (uiop->uio_iov->iov_len <= 0) {
3941                 error = EINVAL;
3942                 goto out;
3943         }
3944
3945         /*
3946          * Large Files: When we come here we are guaranteed that
3947          * uio_offset can be used safely. The high word is zero.
3948          */
3949
3950         ufsvfsp = ip->i_ufsvfs;
3951         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK);
3952         if (error)
3953                 goto out;
3954
3955         iovp = uiop->uio_iov;
3956         total_bytes_wanted = iovp->iov_len;
3957
3958         /* Large Files: directory files should not be "large" */
3959
3960         ASSERT(ip->i_size <= MAXOFF32_T);
3961
3962         /* Force offset to be valid (to guard against bogus lseek() values) */
3963         offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1);
3964
3965         /* Quit if at end of file or link count of zero (posix) */
3966         if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) {
3967                 if (eofp)
3968                         *eofp = 1;
3969                 error = 0;
3970                 goto unlock;
3971         }
3972
3973         /*
3974          * Get space to change directory entries into fs independent format.
3975          * Do fast alloc for the most commonly used-request size (filesystem
3976          * block size).
3977          */
3978         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) {
3979                 bufsize = total_bytes_wanted;
3980                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
3981                 odp = (struct dirent64 *)outbuf;
3982         } else {
3983                 bufsize = total_bytes_wanted;
3984                 odp = (struct dirent64 *)iovp->iov_base;
3985         }
3986
3987 nextblk:
3988         bytes_wanted = total_bytes_wanted;
3989
3990         /* Truncate request to file size */
3991         if (offset + bytes_wanted > (int)ip->i_size)
3992                 bytes_wanted = (int)(ip->i_size - offset);
3993
3994         /* Comply with MAXBSIZE boundary restrictions of fbread() */
3995         if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
3996                 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
3997
3998         /*
3999          * Read in the next chunk.
4000          * We are still holding the i_rwlock.
4001          */
4002         error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp);
4003
4004         if (error)
4005                 goto update_inode;
4006         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) &&
4007             (!ufsvfsp->vfs_noatime)) {
4008                 ip->i_flag |= IACC;
4009         }
4010         incount = 0;
4011         idp = (struct direct *)fbp->fb_addr;
4012         if (idp->d_ino == 0 && idp->d_reclen == 0 && idp->d_namlen == 0) {
4013                 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, "
4014                     "fs = %s\n",
4015                     (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt);
4016                 fbrelse(fbp, S_OTHER);
4017                 error = ENXIO;
4018                 goto update_inode;
4019         }
4020         /* Transform to file-system independent format */
4021         while (incount < bytes_wanted) {
4022                 /*
4023                  * If the current directory entry is mangled, then skip
4024                  * to the next block.  It would be nice to set the FSBAD
4025                  * flag in the super-block so that a fsck is forced on
4026                  * next reboot, but locking is a problem.
4027                  */
4028                 if (idp->d_reclen & 0x3) {
4029                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4030                         break;
4031                 }
4032
4033                 /* Skip to requested offset and skip empty entries */
4034                 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) {
4035                         ushort_t this_reclen =
4036                             DIRENT64_RECLEN(idp->d_namlen);
4037                         /* Buffer too small for any entries */
4038                         if (!outcount && this_reclen > bufsize) {
4039                                 fbrelse(fbp, S_OTHER);
4040                                 error = EINVAL;
4041                                 goto update_inode;
4042                         }
4043                         /* If would overrun the buffer, quit */
4044                         if (outcount + this_reclen > bufsize) {
4045                                 break;
4046                         }
4047                         /* Take this entry */
4048                         odp->d_ino = (ino64_t)idp->d_ino;
4049                         odp->d_reclen = (ushort_t)this_reclen;
4050                         odp->d_off = (offset_t)(offset + idp->d_reclen);
4051
4052                         /* use strncpy(9f) to zero out uninitialized bytes */
4053
4054                         ASSERT(strlen(idp->d_name) + 1 <=
4055                             DIRENT64_NAMELEN(this_reclen));
4056                         (void) strncpy(odp->d_name, idp->d_name,
4057                             DIRENT64_NAMELEN(this_reclen));
4058                         outcount += odp->d_reclen;
4059                         odp = (struct dirent64 *)
4060                             ((intptr_t)odp + odp->d_reclen);
4061                         ASSERT(outcount <= bufsize);
4062                 }
4063                 if (idp->d_reclen) {
4064                         incount += idp->d_reclen;
4065                         offset += idp->d_reclen;
4066                         idp = (struct direct *)((intptr_t)idp + idp->d_reclen);
4067                 } else {
4068                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4069                         break;
4070                 }
4071         }
4072         /* Release the chunk */
4073         fbrelse(fbp, S_OTHER);
4074
4075         /* Read whole block, but got no entries, read another if not eof */
4076
4077         /*
4078          * Large Files: casting i_size to int here is not a problem
4079          * because directory sizes are always less than MAXOFF32_T.
4080          * See assertion above.
4081          */
4082
4083         if (offset < (int)ip->i_size && !outcount)
4084                 goto nextblk;
4085
4086         /* Copy out the entry data */
4087         if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) {
4088                 iovp->iov_base += outcount;
4089                 iovp->iov_len -= outcount;
4090                 uiop->uio_resid -= outcount;
4091                 uiop->uio_offset = offset;
4092         } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
4093             uiop)) == 0)
4094                 uiop->uio_offset = offset;
4095 update_inode:
4096         ITIMES(ip);
4097         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1)
4098                 kmem_free(outbuf, bufsize);
4099
4100         if (eofp && error == 0)
4101                 *eofp = (uiop->uio_offset >= (int)ip->i_size);
4102 unlock:
4103         if (ulp) {
4104                 ufs_lockfs_end(ulp);
4105         }
4106 out:
4107         return (error);
4108 }
4109
4110 /*ARGSUSED*/
4111 static int
4112 ufs_symlink(struct vnode *dvp, char *linkname, struct vattr *vap, char *target,
4113     struct cred *cr, caller_context_t *ct, int flags)
4114 {
4115         struct inode *ip, *dip = VTOI(dvp);
4116         struct ufsvfs *ufsvfsp = dip->i_ufsvfs;
4117         struct ulockfs *ulp;
4118         int error;
4119         int issync;
4120         int trans_size;
4121         int residual;
4122         int ioflag;
4123         int retry = 1;
4124
4125         /*
4126          * No symlinks in attrdirs at this time
4127          */
4128         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
4129                 return (EINVAL);
4130
4131 again:
4132         ip = NULL;
4133         vap->va_type = VLNK;
4134         vap->va_rdev = 0;
4135
4136         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK);
4137         if (error)
4138                 goto out;
4139
4140         if (ulp)
4141                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_SYMLINK,
4142                                   trans_size = (int)TOP_SYMLINK_SIZE(dip));
4143
4144         /*
4145          * We must create the inode before the directory entry, to avoid
4146          * racing with readlink().  ufs_dirmakeinode requires that we
4147          * hold the quota lock as reader, and directory locks as writer.
4148          */
4149
4150         rw_enter(&dip->i_rwlock, RW_WRITER);
4151         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4152         rw_enter(&dip->i_contents, RW_WRITER);
4153
4154         /*
4155          * Suppress any out of inodes messages if we will retry on
4156          * ENOSP
4157          */
4158         if (retry)
4159                 dip->i_flag |= IQUIET;
4160
4161         error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr);
4162
4163         dip->i_flag &= ~IQUIET;
4164
4165         rw_exit(&dip->i_contents);
4166         rw_exit(&ufsvfsp->vfs_dqrwlock);
4167         rw_exit(&dip->i_rwlock);
4168
4169         if (error)
4170                 goto unlock;
4171
4172         /*
4173          * OK.  The inode has been created.  Write out the data of the
4174          * symbolic link.  Since symbolic links are metadata, and should
4175          * remain consistent across a system crash, we need to force the
4176          * data out synchronously.
4177          *
4178          * (This is a change from the semantics in earlier releases, which
4179          * only created symbolic links synchronously if the semi-documented
4180          * 'syncdir' option was set, or if we were being invoked by the NFS
4181          * server, which requires symbolic links to be created synchronously.)
4182          *
4183          * We need to pass in a pointer for the residual length; otherwise
4184          * ufs_rdwri() will always return EIO if it can't write the data,
4185          * even if the error was really ENOSPC or EDQUOT.
4186          */
4187
4188         ioflag = FWRITE | FDSYNC;
4189         residual = 0;
4190
4191         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4192         rw_enter(&ip->i_contents, RW_WRITER);
4193
4194         /*
4195          * Suppress file system full messages if we will retry
4196          */
4197         if (retry)
4198                 ip->i_flag |= IQUIET;
4199
4200         error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target),
4201             0, UIO_SYSSPACE, &residual, cr);
4202
4203         ip->i_flag &= ~IQUIET;
4204
4205         if (error) {
4206                 rw_exit(&ip->i_contents);
4207                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4208                 goto remove;
4209         }
4210
4211         /*
4212          * If the link's data is small enough, we can cache it in the inode.
4213          * This is a "fast symbolic link".  We don't use the first direct
4214          * block because that's actually used to point at the symbolic link's
4215          * contents on disk; but we know that none of the other direct or
4216          * indirect blocks can be used because symbolic links are restricted
4217          * to be smaller than a file system block.
4218          */
4219
4220         ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip)));
4221
4222         if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) {
4223                 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) {
4224                         ip->i_flag |= IFASTSYMLNK;
4225                 } else {
4226                         int i;
4227                         /* error, clear garbage left behind */
4228                         for (i = 1; i < NDADDR; i++)
4229                                 ip->i_db[i] = 0;
4230                         for (i = 0; i < NIADDR; i++)
4231                                 ip->i_ib[i] = 0;
4232                 }
4233         }
4234
4235         rw_exit(&ip->i_contents);
4236         rw_exit(&ufsvfsp->vfs_dqrwlock);
4237
4238         /*
4239          * OK.  We've successfully created the symbolic link.  All that
4240          * remains is to insert it into the appropriate directory.
4241          */
4242
4243         rw_enter(&dip->i_rwlock, RW_WRITER);
4244         error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr);
4245         rw_exit(&dip->i_rwlock);
4246
4247         /*
4248          * Fall through into remove-on-error code.  We're either done, or we
4249          * need to remove the inode (if we couldn't insert it).
4250          */
4251
4252 remove:
4253         if (error && (ip != NULL)) {
4254                 rw_enter(&ip->i_contents, RW_WRITER);
4255                 ip->i_nlink--;
4256                 ip->i_flag |= ICHG;
4257                 ip->i_seq++;
4258                 ufs_setreclaim(ip);
4259                 rw_exit(&ip->i_contents);
4260         }
4261
4262 unlock:
4263         if (ip != NULL)
4264                 VN_RELE(ITOV(ip));
4265
4266         if (ulp) {
4267                 int terr = 0;
4268
4269                 TRANS_END_CSYNC(ufsvfsp, &terr, issync, TOP_SYMLINK,
4270                                 trans_size);
4271                 ufs_lockfs_end(ulp);
4272                 if (error == 0)
4273                         error = terr;
4274         }
4275
4276         /*
4277          * We may have failed due to lack of an inode or of a block to
4278          * store the target in.  Try flushing the delete queue to free
4279          * logically-available things up and try again.
4280          */
4281         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
4282                 ufs_delete_drain_wait(ufsvfsp, 1);
4283                 retry = 0;
4284                 goto again;
4285         }
4286
4287 out:
4288         return (error);
4289 }
4290
4291 /*
4292  * Ufs specific routine used to do ufs io.
4293  */
4294 int
4295 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base,
4296     ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
4297     struct cred *cr)
4298 {
4299         struct uio auio;
4300         struct iovec aiov;
4301         int error;
4302
4303         ASSERT(RW_LOCK_HELD(&ip->i_contents));
4304
4305         bzero((caddr_t)&auio, sizeof (uio_t));
4306         bzero((caddr_t)&aiov, sizeof (iovec_t));
4307
4308         aiov.iov_base = base;
4309         aiov.iov_len = len;
4310         auio.uio_iov = &aiov;
4311         auio.uio_iovcnt = 1;
4312         auio.uio_loffset = offset;
4313         auio.uio_segflg = (short)seg;
4314         auio.uio_resid = len;
4315
4316         if (rw == UIO_WRITE) {
4317                 auio.uio_fmode = FWRITE;
4318                 auio.uio_extflg = UIO_COPY_DEFAULT;
4319                 auio.uio_llimit = curproc->p_fsz_ctl;
4320                 error = wrip(ip, &auio, ioflag, cr);
4321         } else {
4322                 auio.uio_fmode = FREAD;
4323                 auio.uio_extflg = UIO_COPY_CACHED;
4324                 auio.uio_llimit = MAXOFFSET_T;
4325                 error = rdip(ip, &auio, ioflag, cr);
4326         }
4327
4328         if (aresid) {
4329                 *aresid = auio.uio_resid;
4330         } else if (auio.uio_resid) {
4331                 error = EIO;
4332         }
4333         return (error);
4334 }
4335
4336 /*ARGSUSED*/
4337 static int
4338 ufs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
4339 {
4340         struct ufid *ufid;
4341         struct inode *ip = VTOI(vp);
4342
4343         if (ip->i_ufsvfs == NULL)
4344                 return (EIO);
4345
4346         if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) {
4347                 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t);
4348                 return (ENOSPC);
4349         }
4350
4351         ufid = (struct ufid *)fidp;
4352         bzero((char *)ufid, sizeof (struct ufid));
4353         ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t);
4354         ufid->ufid_ino = ip->i_number;
4355         ufid->ufid_gen = ip->i_gen;
4356
4357         return (0);
4358 }
4359
4360 /* ARGSUSED2 */
4361 static int
4362 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4363 {
4364         struct inode    *ip = VTOI(vp);
4365         struct ufsvfs   *ufsvfsp;
4366         int             forcedirectio;
4367
4368         /*
4369          * Read case is easy.
4370          */
4371         if (!write_lock) {
4372                 rw_enter(&ip->i_rwlock, RW_READER);
4373                 return (V_WRITELOCK_FALSE);
4374         }
4375
4376         /*
4377          * Caller has requested a writer lock, but that inhibits any
4378          * concurrency in the VOPs that follow. Acquire the lock shared
4379          * and defer exclusive access until it is known to be needed in
4380          * other VOP handlers. Some cases can be determined here.
4381          */
4382
4383         /*
4384          * If directio is not set, there is no chance of concurrency,
4385          * so just acquire the lock exclusive. Beware of a forced
4386          * unmount before looking at the mount option.
4387          */
4388         ufsvfsp = ip->i_ufsvfs;
4389         forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0;
4390         if (!(ip->i_flag & IDIRECTIO || forcedirectio) ||
4391             !ufs_allow_shared_writes) {
4392                 rw_enter(&ip->i_rwlock, RW_WRITER);
4393                 return (V_WRITELOCK_TRUE);
4394         }
4395
4396         /*
4397          * Mandatory locking forces acquiring i_rwlock exclusive.
4398          */
4399         if (MANDLOCK(vp, ip->i_mode)) {
4400                 rw_enter(&ip->i_rwlock, RW_WRITER);
4401                 return (V_WRITELOCK_TRUE);
4402         }
4403
4404         /*
4405          * Acquire the lock shared in case a concurrent write follows.
4406          * Mandatory locking could have become enabled before the lock
4407          * was acquired. Re-check and upgrade if needed.
4408          */
4409         rw_enter(&ip->i_rwlock, RW_READER);
4410         if (MANDLOCK(vp, ip->i_mode)) {
4411                 rw_exit(&ip->i_rwlock);
4412                 rw_enter(&ip->i_rwlock, RW_WRITER);
4413                 return (V_WRITELOCK_TRUE);
4414         }
4415         return (V_WRITELOCK_FALSE);
4416 }
4417
4418 /*ARGSUSED*/
4419 static void
4420 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4421 {
4422         struct inode    *ip = VTOI(vp);
4423
4424         rw_exit(&ip->i_rwlock);
4425 }
4426
4427 /* ARGSUSED */
4428 static int
4429 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4430 {
4431         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4432 }
4433
4434 /* ARGSUSED */
4435 static int
4436 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4437     offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
4438     caller_context_t *ct)
4439 {
4440         struct inode *ip = VTOI(vp);
4441
4442         if (ip->i_ufsvfs == NULL)
4443                 return (EIO);
4444
4445         /*
4446          * If file is being mapped, disallow frlock.
4447          * XXX I am not holding tlock while checking i_mapcnt because the
4448          * current locking strategy drops all locks before calling fs_frlock.
4449          * So, mapcnt could change before we enter fs_frlock making is
4450          * meaningless to have held tlock in the first place.
4451          */
4452         if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode))
4453                 return (EAGAIN);
4454         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4455 }
4456
4457 /* ARGSUSED */
4458 static int
4459 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4460     offset_t offset, cred_t *cr, caller_context_t *ct)
4461 {
4462         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
4463         struct ulockfs *ulp;
4464         int error;
4465
4466         if ((error = convoff(vp, bfp, 0, offset)) == 0) {
4467                 if (cmd == F_FREESP) {
4468                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4469                             ULOCKFS_SPACE_MASK);
4470                         if (error)
4471                                 return (error);
4472                         error = ufs_freesp(vp, bfp, flag, cr);
4473
4474                         if (error == 0 && bfp->l_start == 0)
4475                                 vnevent_truncate(vp, ct);
4476                 } else if (cmd == F_ALLOCSP) {
4477                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4478                             ULOCKFS_FALLOCATE_MASK);
4479                         if (error)
4480                                 return (error);
4481                         error = ufs_allocsp(vp, bfp, cr);
4482                 } else
4483                         return (EINVAL); /* Command not handled here */
4484
4485                 if (ulp)
4486                         ufs_lockfs_end(ulp);
4487
4488         }
4489         return (error);
4490 }
4491
/*
 * I/O cluster size of the file system that owns inode `ip', taken from
 * the vfs_ioclustsz field of its ufsvfs.  `ip'->i_ufsvfs must be
 * non-NULL when this is evaluated.
 *
 * Used to determine if read ahead should be done. Also used to
 * determine when write back occurs.
 */
#define CLUSTSZ(ip)             ((ip)->i_ufsvfs->vfs_ioclustsz)
4497
/*
 * A faster version of ufs_getpage.
 *
 * We optimize by inlining the pvn_getpages iterator, eliminating
 * calls to bmap_read if file doesn't have UFS holes, and avoiding
 * the overhead of page_exists().
 *
 * When a file has UFS holes and ufs_getpage is called for anything
 * other than S_WRITE or S_CREATE, we clear PROT_WRITE in *protp to
 * avoid calling bmap_read. This approach
 * victimizes performance when a file with UFS holes is faulted
 * first in the S_READ mode, and then in the S_WRITE mode. We will get
 * two MMU faults in this case.
 *
 * XXX - the inode fields which control the sequential mode are not
 *       protected by any mutex. The read ahead will act wild if
 *       multiple processes will access the file concurrently and
 *       some of them in sequential mode. One particularly bad case
 *       is if another thread will change the value of i_nextrio between
 *       the time this thread tests the i_nextrio value and then reads it
 *       again to use it as the offset for the read ahead.
 *
 * Arguments:
 *      vp      - vnode whose pages are wanted.
 *      off     - starting file offset; must be page aligned (asserted).
 *      len     - length in bytes of the range [off, off + len).
 *      protp   - if non-NULL, receives the protections for the pages;
 *                initialized to PROT_ALL and possibly stripped of
 *                PROT_WRITE as described above.
 *      plarr   - receives the locked pages, NULL terminated.  A NULL
 *                plarr requests asynchronous read ahead only.
 *      plsz    - room left in plarr, counted in bytes of pages (it is
 *                reduced by PAGESIZE for every page stored).
 *      seg     - segment the request is made for.
 *      addr    - address within seg corresponding to off.
 *      rw      - type of access.
 *      cr      - credentials handed to bmap_write().
 *      ct      - not used here.
 *
 * Returns 0 on success, otherwise an errno value: EDEADLK if the
 * transaction could not be started without blocking, ENOSYS if the
 * vnode is flagged VNOMAP, EFAULT if the range extends beyond the
 * page-rounded file size and seg is not segkmap, or whatever
 * ufs_lockfs_begin_getpage(), bmap_write() or ufs_getpage_miss()
 * returned.  If the page gathering loop fails, every page already
 * locked into plarr is unlocked and plarr[0] is set to NULL.
 */
/*ARGSUSED*/
static int
ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ct)
{
        uoff_t  uoff = (uoff_t)off; /* type conversion */
        uoff_t  pgoff;
        uoff_t  eoff;
        struct inode    *ip = VTOI(vp);
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct fs       *fs;
        struct ulockfs  *ulp;
        page_t          **pl;
        caddr_t         pgaddr;
        krw_t           rwtype;
        int             err;
        int             has_holes;
        int             beyond_eof;
        int             seqmode;
        int             pgsize = PAGESIZE;
        int             dolock;
        int             do_qlock;
        int             trans_size;

        ASSERT((uoff & PAGEOFFSET) == 0);

        if (protp)
                *protp = PROT_ALL;

        /*
         * Obey the lockfs protocol
         */
        err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
            rw == S_READ || rw == S_EXEC, protp);
        if (err)
                goto out;

        fs = ufsvfsp->vfs_fs;

        /*
         * trans_size is assigned only when a transaction is begun here.
         * The "unlock" label below tests the same condition before
         * using it to end the transaction.
         */
        if (ulp && (rw == S_CREATE || rw == S_WRITE) &&
            !(vp->v_flag & VISSWAP)) {
                /*
                 * Try to start a transaction, will return if blocking is
                 * expected to occur and the address space is not the
                 * kernel address space.
                 */
                trans_size = TOP_GETPAGE_SIZE(ip);
                if (seg->s_as != &kas) {
                        TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE,
                            trans_size, &err);
                        if (err == EWOULDBLOCK) {
                                /*
                                 * Use EDEADLK here because the VM code
                                 * can normally never see this error.
                                 */
                                err = EDEADLK;
                                ufs_lockfs_end(ulp);
                                goto out;
                        }
                } else {
                        TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
                }
        }

        if (vp->v_flag & VNOMAP) {
                err = ENOSYS;
                goto unlock;
        }

        /*
         * Treat the access as sequential when it starts where the
         * previous call left off (i_nextr is stored at the bottom of
         * this routine) and pages are not being created.
         */
        seqmode = ip->i_nextr == uoff && rw != S_CREATE;

        rwtype = RW_READER;             /* start as a reader */
        dolock = (rw_owner(&ip->i_contents) != curthread);
        /*
         * If this thread owns the lock, i.e., this thread grabbed it
         * as writer somewhere above, then we don't need to grab the
         * lock as reader in this routine.
         */
        do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread);

        /*
         * We come back here, with rwtype already set to RW_WRITER and
         * no locks held, when the non-blocking upgrade below fails.
         */
retrylock:
        if (dolock) {
                /*
                 * Grab the quota lock if we need to call
                 * bmap_write() below (with i_contents as writer).
                 */
                if (do_qlock && rwtype == RW_WRITER)
                        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
                rw_enter(&ip->i_contents, rwtype);
        }

        /*
         * We may be getting called as a side effect of a bmap using
         * fbread() when the blocks might be being allocated and the
         * size has not yet been up'ed.  In this case we want to be
         * able to return zero pages if we get back UFS_HOLE from
         * calling bmap for a non write case here.  We also might have
         * to read some frags from the disk into a page if we are
         * extending the number of frags for a given lbn in bmap().
         * Large Files: The read of i_size here is atomic because
         * i_contents is held here. If dolock is zero, the lock
         * is held in bmap routines.
         */
        beyond_eof = uoff + len >
            P2ROUNDUP_TYPED(ip->i_size, PAGESIZE, uoff_t);
        if (beyond_eof && seg != segkmap) {
                if (dolock) {
                        rw_exit(&ip->i_contents);
                        if (do_qlock && rwtype == RW_WRITER)
                                rw_exit(&ufsvfsp->vfs_dqrwlock);
                }
                err = EFAULT;
                goto unlock;
        }

        /*
         * Must hold i_contents lock throughout the call to pvn_getpages
         * since locked pages are returned from each call to ufs_getapage.
         * Must *not* return locked pages and then try for contents lock
         * due to lock ordering requirements (inode > page)
         */

        has_holes = bmap_has_holes(ip);

        if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) {
                int     blk_size;
                uoff_t offset;

                /*
                 * We must acquire the RW_WRITER lock in order to
                 * call bmap_write().
                 */
                if (dolock && rwtype == RW_READER) {
                        rwtype = RW_WRITER;

                        /*
                         * Grab the quota lock before
                         * upgrading i_contents, but if we can't grab it
                         * don't wait here due to lock order:
                         * vfs_dqrwlock > i_contents.
                         */
                        if (do_qlock &&
                            rw_tryenter(&ufsvfsp->vfs_dqrwlock, RW_READER)
                            == 0) {
                                rw_exit(&ip->i_contents);
                                goto retrylock;
                        }
                        if (!rw_tryupgrade(&ip->i_contents)) {
                                rw_exit(&ip->i_contents);
                                if (do_qlock)
                                        rw_exit(&ufsvfsp->vfs_dqrwlock);
                                goto retrylock;
                        }
                }

                /*
                 * May be allocating disk blocks for holes here as
                 * a result of mmap faults. write(2) does the bmap_write
                 * in rdip/wrip, not here. We are not dealing with frags
                 * in this case.
                 */
                /*
                 * Large Files: We cast fs_bmask field to offset_t
                 * just as we do for MAXBMASK because uoff is a 64-bit
                 * data type. fs_bmask will still be a 32-bit type
                 * as we cannot change any ondisk data structures.
                 */

                offset = uoff & (offset_t)fs->fs_bmask;
                while (offset < uoff + len) {
                        blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
                        err = bmap_write(ip, offset, blk_size,
                            BI_NORMAL, NULL, cr);
                        if (ip->i_flag & (ICHG|IUPD))
                                ip->i_seq++;
                        /*
                         * No pages have been locked yet, so skip the
                         * page release code and go straight to the
                         * inode update and lock release.
                         */
                        if (err)
                                goto update_inode;
                        offset += blk_size; /* XXX - make this contig */
                }
        }

        /*
         * Can be a reader from now on.
         */
        if (dolock && rwtype == RW_WRITER) {
                rw_downgrade(&ip->i_contents);
                /*
                 * We can release vfs_dqrwlock early so do it, but make
                 * sure we don't try to release it again at the bottom.
                 */
                if (do_qlock) {
                        rw_exit(&ufsvfsp->vfs_dqrwlock);
                        do_qlock = 0;
                }
        }

        /*
         * We remove PROT_WRITE in cases when the file has UFS holes
         * because we don't  want to call bmap_read() to check each
         * page if it is backed with a disk block.
         */
        if (protp && has_holes && rw != S_WRITE && rw != S_CREATE)
                *protp &= ~PROT_WRITE;

        err = 0;

        /*
         * The loop looks up pages in the range [off, off + len).
         * For each page, we first check if we should initiate an asynchronous
         * read ahead before we call page_lookup (we may sleep in page_lookup
         * for a previously initiated disk read).
         */
        eoff = (uoff + len);
        for (pgoff = uoff, pgaddr = addr, pl = plarr;
            pgoff < eoff; /* empty */) {
                page_t  *pp;
                uoff_t  nextrio;
                se_t    se;
                int retval;

                se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED);

                /* Handle async getpage (faultahead) */
                if (plarr == NULL) {
                        ip->i_nextrio = pgoff;
                        (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr);
                        pgoff += pgsize;
                        pgaddr += pgsize;
                        continue;
                }
                /*
                 * Check if we should initiate read ahead of next cluster.
                 * We call page_exists only when we need to confirm that
                 * we have the current page before we initiate the read ahead.
                 */
                nextrio = ip->i_nextrio;
                if (seqmode &&
                    pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
                    nextrio < ip->i_size && page_exists(&vp->v_object, pgoff)) {
                        retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr);
                        /*
                         * We always read ahead the next cluster of data
                         * starting from i_nextrio. If the page (vp,nextrio)
                         * is actually in core at this point, the routine
                         * ufs_getpage_ra() will stop pre-fetching data
                         * until we read that page in a synchronized manner
                         * through ufs_getpage_miss(). So, we should increase
                         * i_nextrio if the page (vp, nextrio) exists.
                         */
                        if ((retval == 0) && page_exists(&vp->v_object, nextrio)) {
                                ip->i_nextrio = nextrio + pgsize;
                        }
                }

                if ((pp = page_lookup(&vp->v_object, pgoff, se)) != NULL) {
                        /*
                         * We found the page in the page cache.
                         */
                        *pl++ = pp;
                        pgoff += pgsize;
                        pgaddr += pgsize;
                        len -= pgsize;
                        plsz -= pgsize;
                } else  {
                        /*
                         * We have to create the page, or read it from disk.
                         */
                        if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr,
                            pl, plsz, rw, seqmode))
                                goto error;

                        /*
                         * Step over every page ufs_getpage_miss() stored.
                         * If it stored none (*pl is NULL), nothing
                         * advances and the loop retries the lookup of
                         * the same page.
                         */
                        while (*pl != NULL) {
                                pl++;
                                pgoff += pgsize;
                                pgaddr += pgsize;
                                len -= pgsize;
                                plsz -= pgsize;
                        }
                }
        }

        /*
         * Return pages up to plsz if they are in the page cache.
         * We cannot return pages if there is a chance that they are
         * backed with a UFS hole and rw is S_WRITE or S_CREATE.
         */
        if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {

                ASSERT((protp == NULL) ||
                    !(has_holes && (*protp & PROT_WRITE)));

                eoff = pgoff + plsz;
                while (pgoff < eoff) {
                        page_t          *pp;

                        if ((pp = page_lookup_nowait(&vp->v_object, pgoff, SE_SHARED)) == NULL)
                                break;

                        *pl++ = pp;
                        pgoff += pgsize;
                        plsz -= pgsize;
                }
        }

        if (plarr)
                *pl = NULL;                     /* Terminate page list */
        /* Remember where we stopped; seqmode above compares against this. */
        ip->i_nextr = pgoff;

error:
        if (err && plarr) {
                /*
                 * Release any pages we have locked.
                 */
                while (pl > &plarr[0])
                        page_unlock(*--pl);

                plarr[0] = NULL;
        }

update_inode:
        /*
         * If the inode is not already marked for IACC (in rdip() for read)
         * and the inode is not marked for no access time update (in wrip()
         * for write) then update the inode access time and mod time now.
         */
        if ((ip->i_flag & (IACC | INOACC)) == 0) {
                if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) {
                        if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
                            (fs->fs_ronly == 0) &&
                            (!ufsvfsp->vfs_noatime)) {
                                mutex_enter(&ip->i_tlock);
                                ip->i_flag |= IACC;
                                ITIMES_NOLOCK(ip);
                                mutex_exit(&ip->i_tlock);
                        }
                }
        }

        /*
         * do_qlock has been cleared if vfs_dqrwlock was already dropped
         * at the downgrade above, so it is only released here on the
         * bmap_write() failure path.
         */
        if (dolock) {
                rw_exit(&ip->i_contents);
                if (do_qlock && rwtype == RW_WRITER)
                        rw_exit(&ufsvfsp->vfs_dqrwlock);
        }

unlock:
        if (ulp) {
                if ((rw == S_CREATE || rw == S_WRITE) &&
                    !(vp->v_flag & VISSWAP)) {
                        TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
                }
                ufs_lockfs_end(ulp);
        }
out:
        return (err);
}
4875
4876 /*
4877  * ufs_getpage_miss is called when ufs_getpage missed the page in the page
4878  * cache. The page is either read from the disk, or it's created.
4879  * A page is created (without disk read) if rw == S_CREATE, or if
4880  * the page is not backed with a real disk block (UFS hole).
4881  */
4882 /* ARGSUSED */
4883 static int
4884 ufs_getpage_miss(struct vnode *vp, uoff_t off, size_t len, struct seg *seg,
4885         caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
4886 {
4887         struct inode    *ip = VTOI(vp);
4888         page_t          *pp;
4889         daddr_t         bn;
4890         size_t          io_len;
4891         int             crpage = 0;
4892         int             err;
4893         int             contig;
4894         int             bsize = ip->i_fs->fs_bsize;
4895
4896         /*
4897          * Figure out whether the page can be created, or must be
4898          * must be read from the disk.
4899          */
4900         if (rw == S_CREATE)
4901                 crpage = 1;
4902         else {
4903                 contig = 0;
4904                 if (err = bmap_read(ip, off, &bn, &contig))
4905                         return (err);
4906
4907                 crpage = (bn == UFS_HOLE);
4908
4909                 /*
4910                  * If its also a fallocated block that hasn't been written to
4911                  * yet, we will treat it just like a UFS_HOLE and create
4912                  * a zero page for it
4913                  */
4914                 if (ISFALLOCBLK(ip, bn))
4915                         crpage = 1;
4916         }
4917
4918         if (crpage) {
4919                 if ((pp = page_create_va(&vp->v_object, off, PAGESIZE, PG_WAIT,
4920                                          seg, addr)) == NULL) {
4921                         return (ufs_fault(vp,
4922                             "ufs_getpage_miss: page_create == NULL"));
4923                 }
4924
4925                 if (rw != S_CREATE)
4926                         pagezero(pp, 0, PAGESIZE);
4927
4928                 io_len = PAGESIZE;
4929         } else {
4930                 uoff_t  io_off;
4931                 uint_t  xlen;
4932                 struct buf      *bp;
4933                 ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
4934
4935                 /*
4936                  * If access is not in sequential order, we read from disk
4937                  * in bsize units.
4938                  *
4939                  * We limit the size of the transfer to bsize if we are reading
4940                  * from the beginning of the file. Note in this situation we
4941                  * will hedge our bets and initiate an async read ahead of
4942                  * the second block.
4943                  */
4944                 if (!seq || off == 0)
4945                         contig = MIN(contig, bsize);
4946
4947                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4948                     &io_len, off, contig, 0);
4949
4950                 /*
4951                  * Some other thread has entered the page.
4952                  * ufs_getpage will retry page_lookup.
4953                  */
4954                 if (pp == NULL) {
4955                         pl[0] = NULL;
4956                         return (0);
4957                 }
4958
4959                 /*
4960                  * Zero part of the page which we are not
4961                  * going to read from the disk.
4962                  */
4963                 xlen = io_len & PAGEOFFSET;
4964                 if (xlen != 0)
4965                         pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
4966
4967                 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ);
4968                 bp->b_edev = ip->i_dev;
4969                 bp->b_dev = cmpdev(ip->i_dev);
4970                 bp->b_blkno = bn;
4971                 bp->b_un.b_addr = (caddr_t)0;
4972                 bp->b_file = ip->i_vnode;
4973                 bp->b_offset = off;
4974
4975                 if (ufsvfsp->vfs_log) {
4976                         lufs_read_strategy(ufsvfsp->vfs_log, bp);
4977                 } else if (ufsvfsp->vfs_snapshot) {
4978                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
4979                 } else {
4980                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
4981                         ub.ub_getpages.value.ul++;
4982                         (void) bdev_strategy(bp);
4983                         lwp_stat_update(LWP_STAT_INBLK, 1);
4984                 }
4985
4986                 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK);
4987
4988                 /*
4989                  * If the file access is sequential, initiate read ahead
4990                  * of the next cluster.
4991                  */
4992                 if (seq && ip->i_nextrio < ip->i_size)
4993                         (void) ufs_getpage_ra(vp, off, seg, addr);
4994                 err = biowait(bp);
4995                 pageio_done(bp);
4996
4997                 if (err) {
4998                         pvn_read_done(pp, B_ERROR);
4999                         return (err);
5000                 }
5001         }
5002
5003         pvn_plist_init(pp, pl, plsz, off, io_len, rw);
5004         return (0);
5005 }
5006
5007 /*
5008  * Read ahead a cluster from the disk. Returns the length in bytes.
5009  */
5010 static int
5011 ufs_getpage_ra(struct vnode *vp, uoff_t off, struct seg *seg, caddr_t addr)
5012 {
5013         struct inode    *ip = VTOI(vp);
5014         page_t          *pp;
5015         uoff_t  io_off = ip->i_nextrio;
5016         ufsvfs_t        *ufsvfsp;
5017         caddr_t         addr2 = addr + (io_off - off);
5018         struct buf      *bp;
5019         daddr_t         bn;
5020         size_t          io_len;
5021         int             err;
5022         int             contig;
5023         int             xlen;
5024         int             bsize = ip->i_fs->fs_bsize;
5025
5026         /*
5027          * If the directio advisory is in effect on this file,
5028          * then do not do buffered read ahead. Read ahead makes
5029          * it more difficult on threads using directio as they
5030          * will be forced to flush the pages from this vnode.
5031          */
5032         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5033                 return (0);
5034         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio)
5035                 return (0);
5036
5037         /*
5038          * Is this test needed?
5039          */
5040         if (addr2 >= seg->s_base + seg->s_size)
5041                 return (0);
5042
5043         contig = 0;
5044         err = bmap_read(ip, io_off, &bn, &contig);
5045         /*
5046          * If its a UFS_HOLE or a fallocated block, do not perform
5047          * any read ahead's since there probably is nothing to read ahead
5048          */
5049         if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn))
5050                 return (0);
5051
5052         /*
5053          * Limit the transfer size to bsize if this is the 2nd block.
5054          */
5055         if (io_off == (uoff_t)bsize)
5056                 contig = MIN(contig, bsize);
5057
5058         if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off,
5059             &io_len, io_off, contig, 1)) == NULL)
5060                 return (0);
5061
5062         /*
5063          * Zero part of page which we are not going to read from disk
5064          */
5065         if ((xlen = (io_len & PAGEOFFSET)) > 0)
5066                 pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
5067
5068         ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK;
5069
5070         bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC);
5071         bp->b_edev = ip->i_dev;
5072         bp->b_dev = cmpdev(ip->i_dev);
5073         bp->b_blkno = bn;
5074         bp->b_un.b_addr = (caddr_t)0;
5075         bp->b_file = ip->i_vnode;
5076         bp->b_offset = off;
5077
5078         if (ufsvfsp->vfs_log) {
5079                 lufs_read_strategy(ufsvfsp->vfs_log, bp);
5080         } else if (ufsvfsp->vfs_snapshot) {
5081                 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5082         } else {
5083                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5084                 ub.ub_getras.value.ul++;
5085                 (void) bdev_strategy(bp);
5086                 lwp_stat_update(LWP_STAT_INBLK, 1);
5087         }
5088
5089         return (io_len);
5090 }
5091
5092 int     ufs_delay = 1;
5093 /*
5094  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC}
5095  *
5096  * LMXXX - the inode really ought to contain a pointer to one of these
5097  * async args.  Stuff gunk in there and just hand the whole mess off.
5098  * This would replace i_delaylen, i_delayoff.
5099  */
5100 /*ARGSUSED*/
5101 static int
5102 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
5103     struct cred *cr, caller_context_t *ct)
5104 {
5105         struct inode *ip = VTOI(vp);
5106         int err = 0;
5107
5108         if (vp->v_count == 0) {
5109                 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0"));
5110         }
5111
5112         /*
5113          * XXX - Why should this check be made here?
5114          */
5115         if (vp->v_flag & VNOMAP) {
5116                 err = ENOSYS;
5117                 goto errout;
5118         }
5119
5120         if (ip->i_ufsvfs == NULL) {
5121                 err = EIO;
5122                 goto errout;
5123         }
5124
5125         if (flags & B_ASYNC) {
5126                 if (ufs_delay && len &&
5127                     (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
5128                         mutex_enter(&ip->i_tlock);
5129                         /*
5130                          * If nobody stalled, start a new cluster.
5131                          */
5132                         if (ip->i_delaylen == 0) {
5133                                 ip->i_delayoff = off;
5134                                 ip->i_delaylen = len;
5135                                 mutex_exit(&ip->i_tlock);
5136                                 goto errout;
5137                         }
5138                         /*
5139                          * If we have a full cluster or they are not contig,
5140                          * then push last cluster and start over.
5141                          */
5142                         if (ip->i_delaylen >= CLUSTSZ(ip) ||
5143                             ip->i_delayoff + ip->i_delaylen != off) {
5144                                 uoff_t doff;
5145                                 size_t dlen;
5146
5147                                 doff = ip->i_delayoff;
5148                                 dlen = ip->i_delaylen;
5149                                 ip->i_delayoff = off;
5150                                 ip->i_delaylen = len;
5151                                 mutex_exit(&ip->i_tlock);
5152                                 err = ufs_putpages(vp, doff, dlen,
5153                                     flags, cr);
5154                                 /* LMXXX - flags are new val, not old */
5155                                 goto errout;
5156                         }
5157                         /*
5158                          * There is something there, it's not full, and
5159                          * it is contig.
5160                          */
5161                         ip->i_delaylen += len;
5162                         mutex_exit(&ip->i_tlock);
5163                         goto errout;
5164                 }
5165                 /*
5166                  * Must have weird flags or we are not clustering.
5167                  */
5168         }
5169
5170         err = ufs_putpages(vp, off, len, flags, cr);
5171
5172 errout:
5173         return (err);
5174 }
5175
5176 /*
5177  * If len == 0, do from off to EOF.
5178  *
5179  * The normal cases should be len == 0 & off == 0 (entire vp list),
5180  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
5181  * (from pageout).
5182  */
5183 /*ARGSUSED*/
5184 static int
5185 ufs_putpages(struct vnode *vp, offset_t off, size_t len, int flags,
5186     struct cred *cr)
5187 {
5188         uoff_t io_off;
5189         uoff_t eoff;
5190         struct inode *ip = VTOI(vp);
5191         page_t *pp;
5192         size_t io_len;
5193         int err = 0;
5194         int dolock;
5195
5196         if (vp->v_count == 0)
5197                 return (ufs_fault(vp, "ufs_putpages: v_count == 0"));
5198         /*
5199          * Acquire the readers/write inode lock before locking
5200          * any pages in this inode.
5201          * The inode lock is held during i/o.
5202          */
5203         if (len == 0) {
5204                 mutex_enter(&ip->i_tlock);
5205                 ip->i_delayoff = ip->i_delaylen = 0;
5206                 mutex_exit(&ip->i_tlock);
5207         }
5208         dolock = (rw_owner(&ip->i_contents) != curthread);
5209         if (dolock) {
5210                 /*
5211                  * Must synchronize this thread and any possible thread
5212                  * operating in the window of vulnerability in wrip().
5213                  * It is dangerous to allow both a thread doing a putpage
5214                  * and a thread writing, so serialize them.  The exception
5215                  * is when the thread in wrip() does something which causes
5216                  * a putpage operation.  Then, the thread must be allowed
5217                  * to continue.  It may encounter a bmap_read problem in
5218                  * ufs_putapage, but that is handled in ufs_putapage.
5219                  * Allow async writers to proceed, we don't want to block
5220                  * the pageout daemon.
5221                  */
5222                 if (ip->i_writer == curthread)
5223                         rw_enter(&ip->i_contents, RW_READER);
5224                 else {
5225                         for (;;) {
5226                                 rw_enter(&ip->i_contents, RW_READER);
5227                                 mutex_enter(&ip->i_tlock);
5228                                 /*
5229                                  * If there is no thread in the critical
5230                                  * section of wrip(), then proceed.
5231                                  * Otherwise, wait until there isn't one.
5232                                  */
5233                                 if (ip->i_writer == NULL) {
5234                                         mutex_exit(&ip->i_tlock);
5235                                         break;
5236                                 }
5237                                 rw_exit(&ip->i_contents);
5238                                 /*
5239                                  * Bounce async writers when we have a writer
5240                                  * working on this file so we don't deadlock
5241                                  * the pageout daemon.
5242                                  */
5243                                 if (flags & B_ASYNC) {
5244                                         mutex_exit(&ip->i_tlock);
5245                                         return (0);
5246                                 }
5247                                 cv_wait(&ip->i_wrcv, &ip->i_tlock);
5248                                 mutex_exit(&ip->i_tlock);
5249                         }
5250                 }
5251         }
5252
5253         if (!vn_has_cached_data(vp)) {
5254                 if (dolock)
5255                         rw_exit(&ip->i_contents);
5256                 return (0);
5257         }
5258
5259         if (len == 0) {
5260                 /*
5261                  * Search the entire vp list for pages >= off.
5262                  */
5263                 err = pvn_vplist_dirty(vp, (uoff_t)off, ufs_putapage,
5264                     flags, cr);
5265         } else {
5266                 /*
5267                  * Loop over all offsets in the range looking for
5268                  * pages to deal with.
5269                  */
5270                 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0)
5271                         eoff = MIN(off + len, eoff);
5272                 else
5273                         eoff = off + len;
5274
5275                 for (io_off = off; io_off < eoff; io_off += io_len) {
5276                         /*
5277                          * If we are not invalidating, synchronously
5278                          * freeing or writing pages, use the routine
5279                          * page_lookup_nowait() to prevent reclaiming
5280                          * them from the free list.
5281                          */
5282                         if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
5283                                 pp = page_lookup(&vp->v_object, io_off,
5284                                                  (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
5285                         } else {
5286                                 pp = page_lookup_nowait(&vp->v_object,
5287                                                         io_off,
5288                                                         (flags & B_FREE) ? SE_EXCL : SE_SHARED);
5289                         }
5290
5291                         if (pp == NULL || pvn_getdirty(pp, flags) == 0)
5292                                 io_len = PAGESIZE;
5293                         else {
5294                                 uoff_t *io_offp = &io_off;
5295
5296                                 err = ufs_putapage(vp, pp, io_offp, &io_len,
5297                                     flags, cr);
5298                                 if (err != 0)
5299                                         break;
5300                                 /*
5301                                  * "io_off" and "io_len" are returned as
5302                                  * the range of pages we actually wrote.
5303                                  * This allows us to skip ahead more quickly
5304                                  * since several pages may've been dealt
5305                                  * with by this iteration of the loop.
5306                                  */
5307                         }
5308                 }
5309         }
5310         if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
5311                 /*
5312                  * We have just sync'ed back all the pages on
5313                  * the inode, turn off the IMODTIME flag.
5314                  */
5315                 mutex_enter(&ip->i_tlock);
5316                 ip->i_flag &= ~IMODTIME;
5317                 mutex_exit(&ip->i_tlock);
5318         }
5319         if (dolock)
5320                 rw_exit(&ip->i_contents);
5321         return (err);
5322 }
5323
5324 static void
5325 ufs_iodone(buf_t *bp)
5326 {
5327         struct inode *ip;
5328
5329         VERIFY(bp->b_pages->p_object != NULL);
5330         ASSERT(bp->b_pages->p_vnode != NULL);
5331         ASSERT(!(bp->b_flags & B_READ));
5332
5333         bp->b_iodone = NULL;
5334
5335         ip = VTOI(bp->b_pages->p_vnode);
5336
5337         mutex_enter(&ip->i_tlock);
5338         if (ip->i_writes >= ufs_LW) {
5339                 if ((ip->i_writes -= bp->b_bcount) <= ufs_LW)
5340                         if (ufs_WRITES)
5341                                 cv_broadcast(&ip->i_wrcv); /* wake all up */
5342         } else {
5343                 ip->i_writes -= bp->b_bcount;
5344         }
5345
5346         mutex_exit(&ip->i_tlock);
5347         iodone(bp);
5348 }
5349
5350 /*
5351  * Write out a single page, possibly klustering adjacent
5352  * dirty pages.  The inode lock must be held.
5353  *
5354  * LMXXX - bsize < pagesize not done.
5355  */
5356 /*ARGSUSED*/
5357 int
5358 ufs_putapage(struct vnode *vp, page_t *pp, uoff_t *offp, size_t *lenp,
5359         int flags, struct cred *cr)
5360 {
5361         uoff_t io_off;
5362         uoff_t off;
5363         struct inode *ip = VTOI(vp);
5364         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
5365         struct fs *fs;
5366         struct buf *bp;
5367         size_t io_len;
5368         daddr_t bn;
5369         int err;
5370         int contig;
5371         int dotrans;
5372
5373         ASSERT(RW_LOCK_HELD(&ip->i_contents));
5374
5375         if (ufsvfsp == NULL) {
5376                 err = EIO;
5377                 goto out_trace;
5378         }
5379
5380         fs = ip->i_fs;
5381         ASSERT(fs->fs_ronly == 0);
5382
5383         /*
5384          * If the modified time on the inode has not already been
5385          * set elsewhere (e.g. for write/setattr) we set the time now.
5386          * This gives us approximate modified times for mmap'ed files
5387          * which are modified via stores in the user address space.
5388          */
5389         if ((ip->i_flag & IMODTIME) == 0) {
5390                 mutex_enter(&ip->i_tlock);
5391                 ip->i_flag |= IUPD;
5392                 ip->i_seq++;
5393                 ITIMES_NOLOCK(ip);
5394                 mutex_exit(&ip->i_tlock);
5395         }
5396
5397         /*
5398          * Align the request to a block boundry (for old file systems),
5399          * and go ask bmap() how contiguous things are for this file.
5400          */
5401         off = pp->p_offset & (offset_t)fs->fs_bmask;    /* block align it */
5402         contig = 0;
5403         err = bmap_read(ip, off, &bn, &contig);
5404         if (err)
5405                 goto out;
5406         if (bn == UFS_HOLE) {                   /* putpage never allocates */
5407                 /*
5408                  * logging device is in error mode; simply return EIO
5409                  */
5410                 if (TRANS_ISERROR(ufsvfsp)) {
5411                         err = EIO;
5412                         goto out;
5413                 }
5414                 /*
5415                  * Oops, the thread in the window in wrip() did some
5416                  * sort of operation which caused a putpage in the bad
5417                  * range.  In this case, just return an error which will
5418                  * cause the software modified bit on the page to set
5419                  * and the page will get written out again later.
5420                  */
5421                 if (ip->i_writer == curthread) {
5422                         err = EIO;
5423                         goto out;
5424                 }
5425                 /*
5426                  * If the pager is trying to push a page in the bad range
5427                  * just tell it to try again later when things are better.
5428                  */
5429                 if (flags & B_ASYNC) {
5430                         err = EAGAIN;
5431                         goto out;
5432                 }
5433                 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE");
5434                 goto out;
5435         }
5436
5437         /*
5438          * If it is an fallocate'd block, reverse the negativity since
5439          * we are now writing to it
5440          */
5441         if (ISFALLOCBLK(ip, bn)) {
5442                 err = bmap_set_bn(vp, off, dbtofsb(fs, -bn));
5443                 if (err)
5444                         goto out;
5445
5446                 bn = -bn;
5447         }
5448
5449         /*
5450          * Take the length (of contiguous bytes) passed back from bmap()
5451          * and _try_ and get a set of pages covering that extent.
5452          */
5453         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags);
5454
5455         /*
5456          * May have run out of memory and not clustered backwards.
5457          * off          p_offset
5458          * [  pp - 1  ][   pp   ]
5459          * [    block           ]
5460          * We told bmap off, so we have to adjust the bn accordingly.
5461          */
5462         if (io_off > off) {
5463                 bn += btod(io_off - off);
5464                 contig -= (io_off - off);
5465         }
5466
5467         /*
5468          * bmap was carefull to tell us the right size so use that.
5469          * There might be unallocated frags at the end.
5470          * LMXXX - bzero the end of the page?  We must be writing after EOF.
5471          */
5472         if (io_len > contig) {
5473                 ASSERT(io_len - contig < fs->fs_bsize);
5474                 io_len -= (io_len - contig);
5475         }
5476
5477         /*
5478          * Handle the case where we are writing the last page after EOF.
5479          *
5480          * XXX - just a patch for i-mt3.
5481          */
5482         if (io_len == 0) {
5483                 ASSERT(pp->p_offset >=
5484                     (uoff_t)(roundup(ip->i_size, PAGESIZE)));
5485                 io_len = PAGESIZE;
5486         }
5487
5488         bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags);
5489
5490         ULOCKFS_SET_MOD(ITOUL(ip));
5491
5492         bp->b_edev = ip->i_dev;
5493         bp->b_dev = cmpdev(ip->i_dev);
5494         bp->b_blkno = bn;
5495         bp->b_un.b_addr = (caddr_t)0;
5496         bp->b_file = ip->i_vnode;
5497
5498         /*
5499          * File contents of shadow or quota inodes are metadata, and updates
5500          * to these need to be put into a logging transaction. All direct
5501          * callers in UFS do that, but fsflush can come here _before_ the
5502          * normal codepath. An example would be updating ACL information, for
5503          * which the normal codepath would be:
5504          *      ufs_si_store()
5505          *      ufs_rdwri()
5506          *      wrip()
5507          *      segmap_release()
5508          *      fop_putpage()
5509          * Here, fsflush can pick up the dirty page before segmap_release()
5510          * forces it out. If that happens, there's no transaction.
5511          * We therefore need to test whether a transaction exists, and if not
5512          * create one - for fsflush.
5513          */
5514         dotrans =
5515             (((ip->i_mode & IFMT) == IFSHAD || ufsvfsp->vfs_qinod == ip) &&
5516             ((curthread->t_flag & T_DONTBLOCK) == 0) &&
5517             (TRANS_ISTRANS(ufsvfsp)));
5518
5519         if (dotrans) {
5520                 curthread->t_flag |= T_DONTBLOCK;
5521                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5522         }
5523         if (TRANS_ISTRANS(ufsvfsp)) {
5524                 if ((ip->i_mode & IFMT) == IFSHAD) {
5525                         TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD);
5526                 } else if (ufsvfsp->vfs_qinod == ip) {
5527                         TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR,
5528                             0, 0);
5529                 }
5530         }
5531         if (dotrans) {
5532                 TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5533                 curthread->t_flag &= ~T_DONTBLOCK;
5534         }
5535
5536         /* write throttle */
5537
5538         ASSERT(bp->b_iodone == NULL);
5539         bp->b_iodone = (int (*)())ufs_iodone;
5540         mutex_enter(&ip->i_tlock);
5541         ip->i_writes += bp->b_bcount;
5542         mutex_exit(&ip->i_tlock);
5543
5544         if (bp->b_flags & B_ASYNC) {
5545                 if (ufsvfsp->vfs_log) {
5546                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5547                 } else if (ufsvfsp->vfs_snapshot) {
5548                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5549                 } else {
5550                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5551                         ub.ub_putasyncs.value.ul++;
5552                         (void) bdev_strategy(bp);
5553                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5554                 }
5555         } else {
5556                 if (ufsvfsp->vfs_log) {
5557                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5558                 } else if (ufsvfsp->vfs_snapshot) {
5559                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5560                 } else {
5561                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5562                         ub.ub_putsyncs.value.ul++;
5563                         (void) bdev_strategy(bp);
5564                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5565                 }
5566                 err = biowait(bp);
5567                 pageio_done(bp);
5568                 pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
5569         }
5570
5571         pp = NULL;
5572
5573 out:
5574         if (err != 0 && pp != NULL)
5575                 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
5576
5577         if (offp)
5578                 *offp = io_off;
5579         if (lenp)
5580                 *lenp = io_len;
5581 out_trace:
5582         return (err);
5583 }
5584
/*
 * Retry statistics for ufs_map():
 *	ufs_map_alock_retry_cnt		bumped each time AS_LOCK_TRYENTER()
 *					fails and ufs_map() delays to retry
 *	ufs_map_lockfs_retry_cnt	bumped each time ufs_lockfs_trybegin()
 *					fails in ufs_map()
 * Both are incremented without any locking.
 */
uint64_t ufs_map_alock_retry_cnt;
uint64_t ufs_map_lockfs_retry_cnt;
5587
/*
 * Map [off, off + len) of the regular file vp into address space "as"
 * by creating a segvn segment (as_map_locked() with segvn_create).
 *
 * *addrp carries the caller's address hint in and the address chosen by
 * choose_addr() out; the hint is saved so every retry starts from it.
 *
 * Returns:
 *	ENOSYS	vnode is marked VNOMAP
 *	ENXIO	off is negative or off + len is negative as an offset_t
 *	ENODEV	vnode is not a regular file
 *	EAGAIN	vn_has_mandatory_locks() reports mandatory locking
 *	EIO	ufs_lockfs_trybegin() returned EIO
 *	EINTR	the wait for the file system lock to clear was abandoned
 *	otherwise the result of choose_addr() or as_map_locked()
 */
/* ARGSUSED */
static int
ufs_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
    caller_context_t *ct)
{
        struct segvn_crargs vn_a;
        struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
        struct ulockfs *ulp;
        int error, sig;
        k_sigset_t smask;
        caddr_t hint = *addrp;  /* original hint, restored on each retry */

        if (vp->v_flag & VNOMAP) {
                error = ENOSYS;
                goto out;
        }

        if (off < 0 || (offset_t)(off + len) < 0) {
                error = ENXIO;
                goto out;
        }

        if (vp->v_type != VREG) {
                error = ENODEV;
                goto out;
        }

retry_map:
        *addrp = hint;
        /*
         * If file is being locked, disallow mapping.
         */
        if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) {
                error = EAGAIN;
                goto out;
        }

        as_rangelock(as);
        /*
         * Note that if we are retrying (because ufs_lockfs_trybegin failed in
         * the previous attempt), some other thread could have grabbed
         * the same VA range if MAP_FIXED is set. In that case, choose_addr
         * would unmap the valid VA range, that is ok.
         */
        error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
        if (error != 0) {
                as_rangeunlock(as);
                goto out;
        }

        /*
         * a_lock has to be acquired before entering the lockfs protocol
         * because that is the order in which pagefault works. Also we cannot
         * block on a_lock here because this waiting writer will prevent
         * further readers like ufs_read from progressing and could cause
         * deadlock between ufs_read/ufs_map/pagefault when a quiesce is
         * pending.
         */
        while (!AS_LOCK_TRYENTER(as, RW_WRITER)) {
                ufs_map_alock_retry_cnt++;
                delay(RETRY_LOCK_DELAY);
        }

        /*
         * We can't hold as->a_lock and wait for lockfs to succeed because
         * the proc tools might hang on a_lock, so call ufs_lockfs_trybegin()
         * instead.
         */
        if (error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK)) {
                /*
                 * ufs_lockfs_trybegin() did not succeed. It is safer to give up
                 * as->a_lock and wait for ulp->ul_fs_lock status to change.
                 */
                ufs_map_lockfs_retry_cnt++;
                AS_LOCK_EXIT(as);
                as_rangeunlock(as);
                if (error == EIO)
                        goto out;

                /*
                 * Sleep on ul_cv until the MAP lock bits clear, then start
                 * over from retry_map with both locks dropped.  The wait is
                 * uninterruptible for an slock or when vfs_nointr is set;
                 * otherwise it goes through cv_wait_sig().
                 */
                mutex_enter(&ulp->ul_lock);
                while (ulp->ul_fs_lock & ULOCKFS_MAP_MASK) {
                        if (ULOCKFS_IS_SLOCK(ulp) || ufsvfsp->vfs_nointr) {
                                cv_wait(&ulp->ul_cv, &ulp->ul_lock);
                        } else {
                                sigintr(&smask, 1);
                                sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
                                sigunintr(&smask);
                                /*
                                 * Give up with EINTR if cv_wait_sig()
                                 * returned 0 while the lock is still set,
                                 * or if vfs_dontblock is set.
                                 */
                                if (((ulp->ul_fs_lock & ULOCKFS_MAP_MASK) &&
                                    !sig) || ufsvfsp->vfs_dontblock) {
                                        mutex_exit(&ulp->ul_lock);
                                        return (EINTR);
                                }
                        }
                }
                mutex_exit(&ulp->ul_lock);
                goto retry_map;
        }

        /* Describe the segvn segment to create for this mapping. */
        vn_a.vp = vp;
        vn_a.offset = (uoff_t)off;
        vn_a.type = flags & MAP_TYPE;
        vn_a.prot = prot;
        vn_a.maxprot = maxprot;
        vn_a.cred = cr;
        vn_a.amp = NULL;
        vn_a.flags = flags & ~MAP_TYPE;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;

        /*
         * NOTE(review): a_lock is not explicitly dropped after this call;
         * presumably as_map_locked() releases it - confirm.
         */
        error = as_map_locked(as, *addrp, len, segvn_create, &vn_a);
        if (ulp)
                ufs_lockfs_end(ulp);
        as_rangeunlock(as);
out:
        return (error);
}
5705
5706 /* ARGSUSED */
5707 static int
5708 ufs_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5709     size_t len, uchar_t  prot, uchar_t  maxprot, uint_t    flags,
5710     struct cred *cr, caller_context_t *ct)
5711 {
5712         struct inode *ip = VTOI(vp);
5713
5714         if (vp->v_flag & VNOMAP) {
5715                 return (ENOSYS);
5716         }
5717
5718         mutex_enter(&ip->i_tlock);
5719         ip->i_mapcnt += btopr(len);
5720         mutex_exit(&ip->i_tlock);
5721         return (0);
5722 }
5723
5724 /*ARGSUSED*/
5725 static int
5726 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5727     size_t len, uint_t prot,  uint_t maxprot,  uint_t flags, struct cred *cr,
5728     caller_context_t *ct)
5729 {
5730         struct inode *ip = VTOI(vp);
5731
5732         if (vp->v_flag & VNOMAP) {
5733                 return (ENOSYS);
5734         }
5735
5736         mutex_enter(&ip->i_tlock);
5737         ip->i_mapcnt -= btopr(len);     /* Count released mappings */
5738         ASSERT(ip->i_mapcnt >= 0);
5739         mutex_exit(&ip->i_tlock);
5740         return (0);
5741 }
5742 /*
5743  * Return the answer requested to poll() for non-device files
5744  */
5745 struct pollhead ufs_pollhd;
5746
5747 /* ARGSUSED */
5748 int
5749 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp,
5750     caller_context_t *ct)
5751 {
5752         struct ufsvfs   *ufsvfsp;
5753
5754         /*
5755          * Regular files reject edge-triggered pollers.
5756          * See the comment in fs_poll() for a more detailed explanation.
5757          */
5758         if (ev & POLLET)
5759                 return (EPERM);
5760
5761         *revp = 0;
5762         ufsvfsp = VTOI(vp)->i_ufsvfs;
5763
5764         if (!ufsvfsp) {
5765                 *revp = POLLHUP;
5766                 goto out;
5767         }
5768
5769         if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) ||
5770             ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) {
5771                 *revp |= POLLERR;
5772
5773         } else {
5774                 if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly &&
5775                     !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5776                         *revp |= POLLOUT;
5777
5778                 if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly &&
5779                     !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5780                         *revp |= POLLWRBAND;
5781
5782                 if (ev & POLLIN)
5783                         *revp |= POLLIN;
5784
5785                 if (ev & POLLRDNORM)
5786                         *revp |= POLLRDNORM;
5787
5788                 if (ev & POLLRDBAND)
5789                         *revp |= POLLRDBAND;
5790         }
5791
5792         if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP)))
5793                 *revp |= POLLPRI;
5794 out:
5795         if (*revp == 0 && ! any)
5796                 *phpp = &ufs_pollhd;
5797
5798         return (0);
5799 }
5800
5801 /* ARGSUSED */
5802 static int
5803 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr,
5804     caller_context_t *ct)
5805 {
5806         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
5807         struct ulockfs  *ulp = NULL;
5808         struct inode    *sip = NULL;
5809         int             error;
5810         struct inode    *ip = VTOI(vp);
5811         int             issync;
5812
5813         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK);
5814         if (error)
5815                 return (error);
5816
5817         switch (cmd) {
5818                 /*
5819                  * Have to handle _PC_NAME_MAX here, because the normal way
5820                  * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()]
5821                  * results in a lock ordering reversal between
5822                  * ufs_lockfs_{begin,end}() and
5823                  * ufs_thread_{suspend,continue}().
5824                  *
5825                  * Keep in sync with ufs_statvfs().
5826                  */
5827         case _PC_NAME_MAX:
5828                 *valp = MAXNAMLEN;
5829                 break;
5830
5831         case _PC_FILESIZEBITS:
5832                 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
5833                         *valp = UFS_FILESIZE_BITS;
5834                 else
5835                         *valp = 32;
5836                 break;
5837
5838         case _PC_XATTR_EXISTS:
5839                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5840
5841                         error =
5842                             ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, cr);
5843                         if (error ==  0 && sip != NULL) {
5844                                 /* Start transaction */
5845                                 if (ulp) {
5846                                         TRANS_BEGIN_CSYNC(ufsvfsp, &issync,
5847                                                           TOP_RMDIR,
5848                                                           TOP_RMDIR_SIZE);
5849                                 }
5850                                 /*
5851                                  * Is directory empty
5852                                  */
5853                                 rw_enter(&sip->i_rwlock, RW_WRITER);
5854                                 rw_enter(&sip->i_contents, RW_WRITER);
5855                                 if (ufs_xattrdirempty(sip,
5856                                     sip->i_number, CRED())) {
5857                                         rw_enter(&ip->i_contents, RW_WRITER);
5858                                         ufs_unhook_shadow(ip, sip);
5859                                         rw_exit(&ip->i_contents);
5860
5861                                         *valp = 0;
5862
5863                                 } else
5864                                         *valp = 1;
5865                                 rw_exit(&sip->i_contents);
5866                                 rw_exit(&sip->i_rwlock);
5867                                 if (ulp) {
5868                                         TRANS_END_CSYNC(ufsvfsp, &error,
5869                                                         issync, TOP_RMDIR,
5870                                                         TOP_RMDIR_SIZE);
5871                                 }
5872                                 VN_RELE(ITOV(sip));
5873                         } else if (error == ENOENT) {
5874                                 *valp = 0;
5875                                 error = 0;
5876                         }
5877                 } else {
5878                         error = fs_pathconf(vp, cmd, valp, cr, ct);
5879                 }
5880                 break;
5881
5882         case _PC_ACL_ENABLED:
5883                 *valp = _ACL_ACLENT_ENABLED;
5884                 break;
5885
5886         case _PC_MIN_HOLE_SIZE:
5887                 *valp = (ulong_t)ip->i_fs->fs_bsize;
5888                 break;
5889
5890         case _PC_SATTR_ENABLED:
5891         case _PC_SATTR_EXISTS:
5892                 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5893                     (vp->v_type == VREG || vp->v_type == VDIR);
5894                 break;
5895
5896         case _PC_TIMESTAMP_RESOLUTION:
5897                 /*
5898                  * UFS keeps only microsecond timestamp resolution.
5899                  * This is historical and will probably never change.
5900                  */
5901                 *valp = 1000L;
5902                 break;
5903
5904         default:
5905                 error = fs_pathconf(vp, cmd, valp, cr, ct);
5906                 break;
5907         }
5908
5909         if (ulp != NULL) {
5910                 ufs_lockfs_end(ulp);
5911         }
5912         return (error);
5913 }
5914
/*
 * Counts of I/O requests issued by ufs_pageio(), split by direction:
 * ufs_pageio_reads is bumped when B_READ is set in the request flags,
 * ufs_pageio_writes otherwise.  Updated without locking.
 */
int ufs_pageio_writes, ufs_pageio_reads;
5916
5917 /*ARGSUSED*/
5918 static int
5919 ufs_pageio(struct vnode *vp, page_t *pp, uoff_t io_off, size_t io_len,
5920         int flags, struct cred *cr, caller_context_t *ct)
5921 {
5922         struct inode *ip = VTOI(vp);
5923         struct ufsvfs *ufsvfsp;
5924         page_t *npp = NULL, *opp = NULL, *cpp = pp;
5925         struct buf *bp;
5926         daddr_t bn;
5927         size_t done_len = 0, cur_len = 0;
5928         int err = 0;
5929         int contig = 0;
5930         int dolock;
5931         int vmpss = 0;
5932         struct ulockfs *ulp;
5933
5934         if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp &&
5935             vp->v_mpssdata != NULL) {
5936                 vmpss = 1;
5937         }
5938
5939         dolock = (rw_owner(&ip->i_contents) != curthread);
5940         /*
5941          * We need a better check.  Ideally, we would use another
5942          * vnodeops so that hlocked and forcibly unmounted file
5943          * systems would return EIO where appropriate and w/o the
5944          * need for these checks.
5945          */
5946         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5947                 return (EIO);
5948
5949         /*
5950          * For vmpss (pp can be NULL) case respect the quiesce protocol.
5951          * ul_lock must be taken before locking pages so we can't use it here
5952          * if pp is non NULL because segvn already locked pages
5953          * SE_EXCL. Instead we rely on the fact that a forced umount or
5954          * applying a filesystem lock via ufs_fiolfs() will block in the
5955          * implicit call to ufs_flush() until we unlock the pages after the
5956          * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend
5957          * above 0 until they are done. We have to be careful not to increment
5958          * ul_vnops_cnt here after forceful unmount hlocks the file system.
5959          *
5960          * If pp is NULL use ul_lock to make sure we don't increment
5961          * ul_vnops_cnt after forceful unmount hlocks the file system.
5962          */
5963         if (vmpss || pp == NULL) {
5964                 ulp = &ufsvfsp->vfs_ulockfs;
5965                 if (pp == NULL)
5966                         mutex_enter(&ulp->ul_lock);
5967                 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) {
5968                         if (pp == NULL) {
5969                                 mutex_exit(&ulp->ul_lock);
5970                         }
5971                         return (vmpss ? EIO : EINVAL);
5972                 }
5973                 atomic_inc_ulong(&ulp->ul_vnops_cnt);
5974                 if (pp == NULL)
5975                         mutex_exit(&ulp->ul_lock);
5976                 if (ufs_quiesce_pend) {
5977                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5978                                 cv_broadcast(&ulp->ul_cv);
5979                         return (vmpss ? EIO : EINVAL);
5980                 }
5981         }
5982
5983         if (dolock) {
5984                 /*
5985                  * segvn may call fop_pageio() instead of fop_getpage() to
5986                  * handle a fault against a segment that maps vnode pages with
5987                  * large mappings.  Segvn creates pages and holds them locked
5988                  * SE_EXCL during fop_pageio() call. In this case we have to
5989                  * use rw_tryenter() to avoid a potential deadlock since in
5990                  * lock order i_contents needs to be taken first.
5991                  * Segvn will retry via fop_getpage() if fop_pageio() fails.
5992                  */
5993                 if (!vmpss) {
5994                         rw_enter(&ip->i_contents, RW_READER);
5995                 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) {
5996                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5997                                 cv_broadcast(&ulp->ul_cv);
5998                         return (EDEADLK);
5999                 }
6000         }
6001
6002         /*
6003          * Return an error to segvn because the pagefault request is beyond
6004          * PAGESIZE rounded EOF.
6005          */
6006         if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) {
6007                 if (dolock)
6008                         rw_exit(&ip->i_contents);
6009                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6010                         cv_broadcast(&ulp->ul_cv);
6011                 return (EFAULT);
6012         }
6013
6014         if (pp == NULL) {
6015                 if (bmap_has_holes(ip)) {
6016                         err = ENOSYS;
6017                 } else {
6018                         err = EINVAL;
6019                 }
6020                 if (dolock)
6021                         rw_exit(&ip->i_contents);
6022                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6023                         cv_broadcast(&ulp->ul_cv);
6024                 return (err);
6025         }
6026
6027         /*
6028          * Break the io request into chunks, one for each contiguous
6029          * stretch of disk blocks in the target file.
6030          */
6031         while (done_len < io_len) {
6032                 ASSERT(cpp);
6033                 contig = 0;
6034                 if (err = bmap_read(ip, (uoff_t)(io_off + done_len),
6035                     &bn, &contig))
6036                         break;
6037
6038                 if (bn == UFS_HOLE) {   /* No holey swapfiles */
6039                         if (vmpss) {
6040                                 err = EFAULT;
6041                                 break;
6042                         }
6043                         err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE");
6044                         break;
6045                 }
6046
6047                 cur_len = MIN(io_len - done_len, contig);
6048                 /*
6049                  * Zero out a page beyond EOF, when the last block of
6050                  * a file is a UFS fragment so that ufs_pageio() can be used
6051                  * instead of ufs_getpage() to handle faults against
6052                  * segvn segments that use large pages.
6053                  */
6054                 page_list_break(&cpp, &npp, btopr(cur_len));
6055                 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) {
6056                         size_t xlen = cur_len & PAGEOFFSET;
6057                         pagezero(cpp->p_prev, xlen, PAGESIZE - xlen);
6058                 }
6059
6060                 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
6061                 ASSERT(bp != NULL);
6062
6063                 bp->b_edev = ip->i_dev;
6064                 bp->b_dev = cmpdev(ip->i_dev);
6065                 bp->b_blkno = bn;
6066                 bp->b_un.b_addr = (caddr_t)0;
6067                 bp->b_file = ip->i_vnode;
6068
6069                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
6070                 ub.ub_pageios.value.ul++;
6071                 if (ufsvfsp->vfs_snapshot)
6072                         fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp);
6073                 else
6074                         (void) bdev_strategy(bp);
6075
6076                 if (flags & B_READ)
6077                         ufs_pageio_reads++;
6078                 else
6079                         ufs_pageio_writes++;
6080                 if (flags & B_READ)
6081                         lwp_stat_update(LWP_STAT_INBLK, 1);
6082                 else
6083                         lwp_stat_update(LWP_STAT_OUBLK, 1);
6084                 /*
6085                  * If the request is not B_ASYNC, wait for i/o to complete
6086                  * and re-assemble the page list to return to the caller.
6087                  * If it is B_ASYNC we leave the page list in pieces and
6088                  * cleanup() will dispose of them.
6089                  */
6090                 if ((flags & B_ASYNC) == 0) {
6091                         err = biowait(bp);
6092                         pageio_done(bp);
6093                         if (err)
6094                                 break;
6095                         page_list_concat(&opp, &cpp);
6096                 }
6097                 cpp = npp;
6098                 npp = NULL;
6099                 if (flags & B_READ)
6100                         cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t);
6101                 done_len += cur_len;
6102         }
6103         ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len));
6104         if (err) {
6105                 if (flags & B_ASYNC) {
6106                         /* Cleanup unprocessed parts of list */
6107                         page_list_concat(&cpp, &npp);
6108                         if (flags & B_READ)
6109                                 pvn_read_done(cpp, B_ERROR);
6110                         else
6111                                 pvn_write_done(cpp, B_ERROR);
6112                 } else {
6113                         /* Re-assemble list and let caller clean up */
6114                         page_list_concat(&opp, &cpp);
6115                         page_list_concat(&opp, &npp);
6116                 }
6117         }
6118
6119         if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) &&
6120             ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) {
6121                 mutex_enter(&ip->i_tlock);
6122                 ip->i_flag |= IACC;
6123                 ITIMES_NOLOCK(ip);
6124                 mutex_exit(&ip->i_tlock);
6125         }
6126
6127         if (dolock)
6128                 rw_exit(&ip->i_contents);
6129         if (vmpss && !atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6130                 cv_broadcast(&ulp->ul_cv);
6131         return (err);
6132 }
6133
6134 /*
6135  * Called when the kernel is in a frozen state to dump data
6136  * directly to the device. It uses a private dump data structure,
6137  * set up by dump_ctl, to locate the correct disk block to which to dump.
6138  */
6139 /*ARGSUSED*/
6140 static int
6141 ufs_dump(vnode_t *vp, caddr_t addr, offset_t ldbn, offset_t dblks,
6142     caller_context_t *ct)
6143 {
6144         uoff_t  file_size;
6145         struct inode    *ip = VTOI(vp);
6146         struct fs       *fs = ip->i_fs;
6147         daddr_t         dbn, lfsbn;
6148         int             disk_blks = fs->fs_bsize >> DEV_BSHIFT;
6149         int             error = 0;
6150         int             ndbs, nfsbs;
6151
6152         /*
6153          * forced unmount case
6154          */
6155         if (ip->i_ufsvfs == NULL)
6156                 return (EIO);
6157         /*
6158          * Validate the inode that it has not been modified since
6159          * the dump structure is allocated.
6160          */
6161         mutex_enter(&ip->i_tlock);
6162         if ((dump_info == NULL) ||
6163             (dump_info->ip != ip) ||
6164             (dump_info->time.tv_sec != ip->i_mtime.tv_sec) ||
6165             (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) {
6166                 mutex_exit(&ip->i_tlock);
6167                 return (-1);
6168         }
6169         mutex_exit(&ip->i_tlock);
6170
6171         /*
6172          * See that the file has room for this write
6173          */
6174         UFS_GET_ISIZE(&file_size, ip);
6175
6176         if (ldbtob(ldbn + dblks) > file_size)
6177                 return (ENOSPC);
6178
6179         /*
6180          * Find the physical disk block numbers from the dump
6181          * private data structure directly and write out the data
6182          * in contiguous block lumps
6183          */
6184         while (dblks > 0 && !error) {
6185                 lfsbn = (daddr_t)lblkno(fs, ldbtob(ldbn));
6186                 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks;
6187                 nfsbs = 1;
6188                 ndbs = disk_blks - ldbn % disk_blks;
6189                 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn +
6190                     nfsbs]) == dbn + ndbs) {
6191                         nfsbs++;
6192                         ndbs += disk_blks;
6193                 }
6194                 if (ndbs > dblks)
6195                         ndbs = dblks;
6196                 error = bdev_dump(ip->i_dev, addr, dbn, ndbs);
6197                 addr += ldbtob((offset_t)ndbs);
6198                 dblks -= ndbs;
6199                 ldbn += ndbs;
6200         }
6201         return (error);
6202
6203 }
6204
6205 /*
6206  * Prepare the file system before and after the dump operation.
6207  *
6208  * action = DUMP_ALLOC:
6209  * Preparation before dump, allocate dump private data structure
6210  * to hold all the direct and indirect block info for dump.
6211  *
6212  * action = DUMP_FREE:
6213  * Clean up after dump, deallocate the dump private data structure.
6214  *
6215  * action = DUMP_SCAN:
6216  * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space;
6217  * if found, the starting file-relative DEV_BSIZE lbn is written
6218  * to *bklp; that lbn is intended for use with fop_dump()
6219  */
6220 /*ARGSUSED*/
6221 static int
6222 ufs_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
6223 {
6224         struct inode    *ip = VTOI(vp);
6225         ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
6226         struct fs       *fs;
6227         daddr32_t       *dblk, *storeblk;
6228         daddr32_t       *nextblk, *endblk;
6229         struct buf      *bp;
6230         int             i, entry, entries;
6231         int             n, ncontig;
6232
6233         /*
6234          * check for forced unmount
6235          */
6236         if (ufsvfsp == NULL)
6237                 return (EIO);
6238
6239         if (action == DUMP_ALLOC) {
6240                 /*
6241                  * alloc and record dump_info
6242                  */
6243                 if (dump_info != NULL)
6244                         return (EINVAL);
6245
6246                 ASSERT(vp->v_type == VREG);
6247                 fs = ufsvfsp->vfs_fs;
6248
6249                 rw_enter(&ip->i_contents, RW_READER);
6250
6251                 if (bmap_has_holes(ip)) {
6252                         rw_exit(&ip->i_contents);
6253                         return (EFAULT);
6254                 }
6255
6256                 /*
6257                  * calculate and allocate space needed according to i_size
6258                  */
6259                 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
6260                 dump_info = kmem_alloc(sizeof (struct dump) +
6261                     (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP);
6262                 if (dump_info == NULL) {
6263                         rw_exit(&ip->i_contents);
6264                         return (ENOMEM);
6265                 }
6266
6267                 /* Start saving the info */
6268                 dump_info->fsbs = entries;
6269                 dump_info->ip = ip;
6270                 storeblk = &dump_info->dblk[0];
6271
6272                 /* Direct Blocks */
6273                 for (entry = 0; entry < NDADDR && entry < entries; entry++)
6274                         *storeblk++ = ip->i_db[entry];
6275
6276                 /* Indirect Blocks */
6277                 for (i = 0; i < NIADDR; i++) {
6278                         int error = 0;
6279
6280                         bp = UFS_BREAD(ufsvfsp,
6281                             ip->i_dev, fsbtodb(fs, ip->i_ib[i]), fs->fs_bsize);
6282                         if (bp->b_flags & B_ERROR)
6283                                 error = EIO;
6284                         else {
6285                                 dblk = bp->b_un.b_daddr;
6286                                 if ((storeblk = save_dblks(ip, ufsvfsp,
6287                                     storeblk, dblk, i, entries)) == NULL)
6288                                         error = EIO;
6289                         }
6290
6291                         brelse(bp);
6292
6293                         if (error != 0) {
6294                                 kmem_free(dump_info, sizeof (struct dump) +
6295                                     (entries - 1) * sizeof (daddr32_t));
6296                                 rw_exit(&ip->i_contents);
6297                                 dump_info = NULL;
6298                                 return (error);
6299                         }
6300                 }
6301                 /* and time stamp the information */
6302                 mutex_enter(&ip->i_tlock);
6303                 dump_info->time = ip->i_mtime;
6304                 mutex_exit(&ip->i_tlock);
6305
6306                 rw_exit(&ip->i_contents);
6307         } else if (action == DUMP_FREE) {
6308                 /*
6309                  * free dump_info
6310                  */
6311                 if (dump_info == NULL)
6312                         return (EINVAL);
6313                 entries = dump_info->fsbs - 1;
6314                 kmem_free(dump_info, sizeof (struct dump) +
6315                     entries * sizeof (daddr32_t));
6316                 dump_info = NULL;
6317         } else if (action == DUMP_SCAN) {
6318                 /*
6319                  * scan dump_info
6320                  */
6321                 if (dump_info == NULL)
6322                         return (EINVAL);
6323
6324                 dblk = dump_info->dblk;
6325                 nextblk = dblk + 1;
6326                 endblk = dblk + dump_info->fsbs - 1;
6327                 fs = ufsvfsp->vfs_fs;
6328                 ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);
6329
6330                 /*
6331                  * scan dblk[] entries; contig fs space is found when:
6332                  * ((current blkno + frags per block) == next blkno)
6333                  */
6334                 n = 0;
6335                 while (n < ncontig && dblk < endblk) {
6336                         if ((*dblk + fs->fs_frag) == *nextblk)
6337                                 n++;
6338                         else
6339                                 n = 0;
6340                         dblk++;
6341                         nextblk++;
6342                 }
6343
6344                 /*
6345                  * index is where size bytes of contig space begins;
6346                  * conversion from index to the file's DEV_BSIZE lbn
6347                  * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
6348                  */
6349                 if (n == ncontig) {
6350                         i = (dblk - dump_info->dblk) - ncontig;
6351                         *blkp = i << (fs->fs_bshift - DEV_BSHIFT);
6352                 } else
6353                         return (EFAULT);
6354         }
6355         return (0);
6356 }
6357
6358 /*
6359  * Recursive helper function for ufs_dumpctl().  It follows the indirect file
6360  * system  blocks until it reaches the the disk block addresses, which are
6361  * then stored into the given buffer, storeblk.
6362  */
6363 static daddr32_t *
6364 save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp,  daddr32_t *storeblk,
6365     daddr32_t *dblk, int level, int entries)
6366 {
6367         struct fs       *fs = ufsvfsp->vfs_fs;
6368         struct buf      *bp;
6369         int             i;
6370
6371         if (level == 0) {
6372                 for (i = 0; i < NINDIR(fs); i++) {
6373                         if (storeblk - dump_info->dblk >= entries)
6374                                 break;
6375                         *storeblk++ = dblk[i];
6376                 }
6377                 return (storeblk);
6378         }
6379         for (i = 0; i < NINDIR(fs); i++) {
6380                 if (storeblk - dump_info->dblk >= entries)
6381                         break;
6382                 bp = UFS_BREAD(ufsvfsp,
6383                     ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
6384                 if (bp->b_flags & B_ERROR) {
6385                         brelse(bp);
6386                         return (NULL);
6387                 }
6388                 storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
6389                     level - 1, entries);
6390                 brelse(bp);
6391
6392                 if (storeblk == NULL)
6393                         return (NULL);
6394         }
6395         return (storeblk);
6396 }
6397
6398 /* ARGSUSED */
6399 static int
6400 ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
6401     struct cred *cr, caller_context_t *ct)
6402 {
6403         struct inode    *ip = VTOI(vp);
6404         struct ulockfs  *ulp;
6405         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
6406         ulong_t         vsa_mask = vsap->vsa_mask;
6407         int             err = EINVAL;
6408
6409         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6410
6411         /*
6412          * Only grab locks if needed - they're not needed to check vsa_mask
6413          * or if the mask contains no acl flags.
6414          */
6415         if (vsa_mask != 0) {
6416                 if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
6417                     ULOCKFS_GETATTR_MASK))
6418                         return (err);
6419
6420                 rw_enter(&ip->i_contents, RW_READER);
6421                 err = ufs_acl_get(ip, vsap, flag, cr);
6422                 rw_exit(&ip->i_contents);
6423
6424                 if (ulp)
6425                         ufs_lockfs_end(ulp);
6426         }
6427         return (err);
6428 }
6429
6430 /* ARGSUSED */
6431 static int
6432 ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr,
6433     caller_context_t *ct)
6434 {
6435         struct inode    *ip = VTOI(vp);
6436         struct ulockfs  *ulp = NULL;
6437         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
6438         ulong_t         vsa_mask = vsap->vsa_mask;
6439         int             err;
6440         int             haverwlock = 1;
6441         int             trans_size;
6442         int             donetrans = 0;
6443         int             retry = 1;
6444
6445         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
6446
6447         /* Abort now if the request is either empty or invalid. */
6448         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6449         if ((vsa_mask == 0) ||
6450             ((vsap->vsa_aclentp == NULL) &&
6451             (vsap->vsa_dfaclentp == NULL))) {
6452                 err = EINVAL;
6453                 goto out;
6454         }
6455
6456         /*
6457          * Following convention, if this is a directory then we acquire the
6458          * inode's i_rwlock after starting a UFS logging transaction;
6459          * otherwise, we acquire it beforehand. Since we were called (and
6460          * must therefore return) with the lock held, we will have to drop it,
6461          * and later reacquire it, if operating on a directory.
6462          */
6463         if (vp->v_type == VDIR) {
6464                 rw_exit(&ip->i_rwlock);
6465                 haverwlock = 0;
6466         } else {
6467                 /* Upgrade the lock if required. */
6468                 if (!rw_write_held(&ip->i_rwlock)) {
6469                         rw_exit(&ip->i_rwlock);
6470                         rw_enter(&ip->i_rwlock, RW_WRITER);
6471                 }
6472         }
6473
6474 again:
6475         ASSERT(!(vp->v_type == VDIR && haverwlock));
6476         if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
6477                 ulp = NULL;
6478                 retry = 0;
6479                 goto out;
6480         }
6481
6482         /*
6483          * Check that the file system supports this operation. Note that
6484          * ufs_lockfs_begin() will have checked that the file system had
6485          * not been forcibly unmounted.
6486          */
6487         if (ufsvfsp->vfs_fs->fs_ronly) {
6488                 err = EROFS;
6489                 goto out;
6490         }
6491         if (ufsvfsp->vfs_nosetsec) {
6492                 err = ENOSYS;
6493                 goto out;
6494         }
6495
6496         if (ulp) {
6497                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
6498                     trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
6499                 donetrans = 1;
6500         }
6501
6502         if (vp->v_type == VDIR) {
6503                 rw_enter(&ip->i_rwlock, RW_WRITER);
6504                 haverwlock = 1;
6505         }
6506
6507         ASSERT(haverwlock);
6508
6509         /* Do the actual work. */
6510         rw_enter(&ip->i_contents, RW_WRITER);
6511         /*
6512          * Suppress out of inodes messages if we will retry.
6513          */
6514         if (retry)
6515                 ip->i_flag |= IQUIET;
6516         err = ufs_acl_set(ip, vsap, flag, cr);
6517         ip->i_flag &= ~IQUIET;
6518         rw_exit(&ip->i_contents);
6519
6520 out:
6521         if (ulp) {
6522                 if (donetrans) {
6523                         /*
6524                          * top_end_async() can eventually call
6525                          * top_end_sync(), which can block. We must
6526                          * therefore observe the lock-ordering protocol
6527                          * here as well.
6528                          */
6529                         if (vp->v_type == VDIR) {
6530                                 rw_exit(&ip->i_rwlock);
6531                                 haverwlock = 0;
6532                         }
6533                         TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
6534                 }
6535                 ufs_lockfs_end(ulp);
6536         }
6537         /*
6538          * If no inodes available, try scaring a logically-
6539          * free one out of the delete queue to someplace
6540          * that we can find it.
6541          */
6542         if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
6543                 ufs_delete_drain_wait(ufsvfsp, 1);
6544                 retry = 0;
6545                 if (vp->v_type == VDIR && haverwlock) {
6546                         rw_exit(&ip->i_rwlock);
6547                         haverwlock = 0;
6548                 }
6549                 goto again;
6550         }
6551         /*
6552          * If we need to reacquire the lock then it is safe to do so
6553          * as a reader. This is because ufs_rwunlock(), which will be
6554          * called by our caller after we return, does not differentiate
6555          * between shared and exclusive locks.
6556          */
6557         if (!haverwlock) {
6558                 ASSERT(vp->v_type == VDIR);
6559                 rw_enter(&ip->i_rwlock, RW_READER);
6560         }
6561
6562         return (err);
6563 }
6564
6565 /*
6566  * Locate the vnode to be used for an event notification. As this will
6567  * be called prior to the name space change perform basic verification
6568  * that the change will be allowed.
6569  */
6570
6571 static int
6572 ufs_eventlookup(struct vnode *dvp, char *nm, struct cred *cr,
6573     struct vnode **vpp)
6574 {
6575         int     namlen;
6576         int     error;
6577         struct vnode    *vp;
6578         struct inode    *ip;
6579         struct inode    *xip;
6580         struct ufsvfs   *ufsvfsp;
6581         struct ulockfs  *ulp;
6582
6583         ip = VTOI(dvp);
6584         *vpp = NULL;
6585
6586         if ((namlen = strlen(nm)) == 0)
6587                 return (EINVAL);
6588
6589         if (nm[0] == '.') {
6590                 if (namlen == 1)
6591                         return (EINVAL);
6592                 else if ((namlen == 2) && nm[1] == '.') {
6593                         return (EEXIST);
6594                 }
6595         }
6596
6597         /*
6598          * Check accessibility and write access of parent directory as we
6599          * only want to post the event if we're able to make a change.
6600          */
6601         if (error = ufs_diraccess(ip, IEXEC|IWRITE, cr))
6602                 return (error);
6603
6604         if (vp = dnlc_lookup(dvp, nm)) {
6605                 if (vp == DNLC_NO_VNODE) {
6606                         VN_RELE(vp);
6607                         return (ENOENT);
6608                 }
6609
6610                 *vpp = vp;
6611                 return (0);
6612         }
6613
6614         /*
6615          * Keep the idle queue from getting too long by idling two
6616          * inodes before attempting to allocate another.
6617          * This operation must be performed before entering lockfs
6618          * or a transaction.
6619          */
6620         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
6621                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
6622                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
6623                         ufs_idle_some(ufs_lookup_idle_count);
6624                 }
6625
6626         ufsvfsp = ip->i_ufsvfs;
6627
6628 retry_lookup:
6629         if (error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK))
6630                 return (error);
6631
6632         if ((error = ufs_dirlook(ip, nm, &xip, cr, 1, 1)) == 0) {
6633                 vp = ITOV(xip);
6634                 *vpp = vp;
6635         }
6636
6637         if (ulp) {
6638                 ufs_lockfs_end(ulp);
6639         }
6640
6641         if (error == EAGAIN)
6642                 goto retry_lookup;
6643
6644         return (error);
6645 }