1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * University Copyright- Copyright (c) 1982, 1986, 1988
34 * The Regents of the University of California
35 * All Rights Reserved
37 * University Acknowledgment- Portions of this document are derived from
38 * software developed by the University of California, Berkeley, and its
39 * contributors.
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/t_lock.h>
45 #include <sys/errno.h>
46 #include <sys/cred.h>
47 #include <sys/user.h>
48 #include <sys/uio.h>
49 #include <sys/file.h>
50 #include <sys/pathname.h>
51 #include <sys/atomic.h>
52 #include <sys/vfs.h>
53 #include <sys/vnode.h>
54 #include <sys/vnode_dispatch.h>
55 #include <sys/rwstlock.h>
56 #include <sys/fem.h>
57 #include <sys/stat.h>
58 #include <sys/mode.h>
59 #include <sys/conf.h>
60 #include <sys/sysmacros.h>
61 #include <sys/cmn_err.h>
62 #include <sys/systm.h>
63 #include <sys/kmem.h>
64 #include <sys/debug.h>
65 #include <sys/acl.h>
66 #include <sys/nbmlock.h>
67 #include <sys/fcntl.h>
68 #include <sys/fs_subr.h>
69 #include <sys/taskq.h>
70 #include <sys/fs_reparse.h>
71 #include <sys/time.h>
72 #include <sys/sdt.h>
74 /* Determine if this vnode is a file that is read-only */
75 #define ISROFILE(vp) \
76 ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
77 (vp)->v_type != VFIFO && vn_is_readonly(vp))
79 /* Tunable via /etc/system; used only by admin/install */
80 int nfs_global_client_only;
83 * Array of vopstats_t for per-FS-type vopstats. This array has the same
84 * number of entries as the vfssw table and parallels it. (Arguably, it could
85 * be part of the vfssw table.) Once it's initialized, it's accessed using
86 * the same fstype index that is used to index into the vfssw table.
88 vopstats_t **vopstats_fstype;
90 /* vopstats initialization template used for fast initialization via bcopy() */
91 static vopstats_t *vs_templatep;
93 /* Kmem cache handle for vsk_anchor_t allocations */
94 kmem_cache_t *vsk_anchor_cache;
96 /* file events cleanup routine */
97 extern void free_fopdata(vnode_t *);
100 * Root of AVL tree for the kstats associated with vopstats. Lock protects
101 * updates to vskstat_tree.
103 avl_tree_t vskstat_tree;
104 kmutex_t vskstat_tree_lock;
106 /* Global variable which enables/disables the vopstats collection */
107 int vopstats_enabled = 1;
109 /* Global used for empty/invalid v_path */
110 char *vn_vpath_empty = "";
113 * forward declarations for internal vnode specific data (vsd)
115 static void *vsd_realloc(void *, size_t, size_t);
118 * forward declarations for reparse point functions
120 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
123 * VSD -- VNODE SPECIFIC DATA
124 * The v_data pointer is typically used by a file system to store a
125 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
126 * However, there are times when additional project private data needs
127 * to be stored separately from the data (node) pointed to by v_data.
128 * This additional data could be stored by the file system itself or
129 * by a completely different kernel entity. VSD provides a way for
130 * callers to obtain a key and store a pointer to private data associated
131 * with a vnode.
133 * Callers are responsible for protecting the vsd by holding v_vsd_lock
134 * for calls to vsd_set() and vsd_get().
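/*
 * Illustrative usage sketch (the "mymod" names are hypothetical): a
 * consumer obtains a key once with vsd_create(), then pairs
 * vsd_set()/vsd_get() calls under v_vsd_lock:
 *
 *	static uint_t mymod_key;
 *
 *	vsd_create(&mymod_key, mymod_destructor);
 *
 *	mutex_enter(&vp->v_vsd_lock);
 *	(void) vsd_set(vp, mymod_key, mydata);
 *	data = vsd_get(vp, mymod_key);
 *	mutex_exit(&vp->v_vsd_lock);
 */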
138 * vsd_lock protects:
139 * vsd_nkeys - creation and deletion of vsd keys
140 * vsd_list - insertion and deletion of vsd_node in the vsd_list
141 * vsd_destructor - adding and removing destructors to the list
143 static kmutex_t vsd_lock;
144 static uint_t vsd_nkeys; /* size of destructor array */
145 /* list of vsd_node's */
146 static list_t *vsd_list = NULL;
147 /* per-key destructor funcs */
148 static void (**vsd_destructor)(void *);
151 * The following is the common set of actions needed to update the
152 * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and
153 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
154 * recording of the bytes transferred. Since the code is similar
155 * but small, it is nearly a duplicate. Consequently any changes
156 * to one may need to be reflected in the other.
157 * Rundown of the variables:
158 * vp - Pointer to the vnode
159 * counter - Partial name structure member to update in vopstats for counts
160 * bytecounter - Partial name structure member to update in vopstats for bytes
161 * bytesval - Value to update in vopstats for bytes
162 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
163 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
166 #define VOPSTATS_UPDATE(vp, counter) { \
167 vfs_t *vfsp = (vp)->v_vfsp; \
168 if (vfsp && vfsp->vfs_implp && \
169 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
170 vopstats_t *vsp = &vfsp->vfs_vopstats; \
171 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
172 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
173 size_t, uint64_t *); \
174 __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \
175 (*stataddr)++; \
176 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
177 vsp->n##counter.value.ui64++; \
178 } \
179 } \
180 }
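/*
 * Usage sketch: the fop_*() wrappers that dispatch vnode operations
 * apply the macro after each underlying call, e.g. (abridged sketch):
 *
 *	int
 *	fop_close(vnode_t *vp, int flag, int count, offset_t offset,
 *	    cred_t *cr, caller_context_t *ct)
 *	{
 *		int err;
 *
 *		err = ... invoke the file system's close routine ...;
 *		VOPSTATS_UPDATE(vp, close);	... bumps vsp->nclose ...
 *		return (err);
 *	}
 */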
182 #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \
183 vfs_t *vfsp = (vp)->v_vfsp; \
184 if (vfsp && vfsp->vfs_implp && \
185 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
186 vopstats_t *vsp = &vfsp->vfs_vopstats; \
187 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
188 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
189 size_t, uint64_t *); \
190 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
191 (*stataddr)++; \
192 vsp->bytecounter.value.ui64 += bytesval; \
193 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
194 vsp->n##counter.value.ui64++; \
195 vsp->bytecounter.value.ui64 += bytesval; \
196 } \
197 } \
198 }
201 * If the filesystem does not support XIDs, map the credential.
202 * If the vfsp is NULL, perhaps we should also map?
204 #define VOPXID_MAP_CR(vp, cr) { \
205 vfs_t *vfsp = (vp)->v_vfsp; \
206 if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \
207 cr = crgetmapped(cr); \
208 }
211 * Convert stat(2) formats to vnode types and vice versa. (Knows about
212 * numerical order of S_IFMT and vnode types.)
214 enum vtype iftovt_tab[] = {
215 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
216 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
219 ushort_t vttoif_tab[] = {
220 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
221 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
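/*
 * Worked example: the IFTOVT()/VTTOIF() macros index these tables, so
 * with S_IFDIR == 0x4000, IFTOVT(S_IFDIR | 0755) evaluates to
 * iftovt_tab[(0x41ed & S_IFMT) >> 12] == iftovt_tab[4] == VDIR, and
 * VTTOIF(VDIR) == vttoif_tab[2] == S_IFDIR.
 */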
225 * The system vnode cache.
228 kmem_cache_t *vn_cache;
231 /* Extensible attribute (xva) routines. */
234 * Zero out the structure, set the size of the requested/returned bitmaps,
235 * set VATTR_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
236 * to the returned attributes array.
238 void
239 xva_init(xvattr_t *xvap)
241 bzero(xvap, sizeof (xvattr_t));
242 xvap->xva_mapsize = XVA_MAPSIZE;
243 xvap->xva_magic = XVA_MAGIC;
244 xvap->xva_vattr.va_mask = VATTR_XVATTR;
245 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
249 * If VATTR_XVATTR is set, returns a pointer to the embedded xoptattr_t
250 * structure. Otherwise, returns NULL.
252 xoptattr_t *
253 xva_getxoptattr(xvattr_t *xvap)
255 xoptattr_t *xoap = NULL;
256 if (xvap->xva_vattr.va_mask & VATTR_XVATTR)
257 xoap = &xvap->xva_xoptattrs;
258 return (xoap);
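/*
 * Usage sketch (assuming the XVA_SET_REQ() helper and XAT_* attribute
 * ids from sys/vnode.h): a caller requesting an optional attribute
 * might do:
 *
 *	xvattr_t xva;
 *	xoptattr_t *xoap;
 *
 *	xva_init(&xva);
 *	XVA_SET_REQ(&xva, XAT_READONLY);
 *	if (fop_getattr(vp, &xva.xva_vattr, 0, CRED(), NULL) == 0 &&
 *	    (xoap = xva_getxoptattr(&xva)) != NULL) {
 *		... xoap->xoa_readonly holds the returned value ...
 *	}
 */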
262 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
263 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
264 * kstat name.
266 static int
267 vska_compar(const void *n1, const void *n2)
269 int ret;
270 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
271 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
273 if (p1 < p2) {
274 ret = -1;
275 } else if (p1 > p2) {
276 ret = 1;
277 } else {
278 ret = 0;
281 return (ret);
285 * Used to create a single template which will be bcopy()ed into each newly
286 * initialized vopstats_t structure in initialize_vopstats(), below.
288 static vopstats_t *
289 create_vopstats_template()
291 vopstats_t *vsp;
293 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
294 bzero(vsp, sizeof (*vsp)); /* Start fresh */
296 /* fop_open */
297 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
298 /* fop_close */
299 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
300 /* fop_read I/O */
301 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
302 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
303 /* fop_write I/O */
304 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
305 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
306 /* fop_ioctl */
307 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
308 /* fop_setfl */
309 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
310 /* fop_getattr */
311 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
312 /* fop_setattr */
313 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
314 /* fop_access */
315 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
316 /* fop_lookup */
317 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
318 /* fop_create */
319 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
320 /* fop_remove */
321 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
322 /* fop_link */
323 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
324 /* fop_rename */
325 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
326 /* fop_mkdir */
327 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
328 /* fop_rmdir */
329 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
330 /* fop_readdir I/O */
331 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
332 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
333 KSTAT_DATA_UINT64);
334 /* fop_symlink */
335 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
336 /* fop_readlink */
337 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
338 /* fop_fsync */
339 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
340 /* fop_inactive */
341 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
342 /* fop_fid */
343 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
344 /* fop_rwlock */
345 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
346 /* fop_rwunlock */
347 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
348 /* fop_seek */
349 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
350 /* fop_cmp */
351 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
352 /* fop_frlock */
353 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
354 /* fop_space */
355 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
356 /* fop_realvp */
357 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
358 /* fop_getpage */
359 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
360 /* fop_putpage */
361 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
362 /* fop_map */
363 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
364 /* fop_addmap */
365 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
366 /* fop_delmap */
367 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
368 /* fop_poll */
369 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
370 /* fop_dump */
371 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
372 /* fop_pathconf */
373 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
374 /* fop_pageio */
375 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
376 /* fop_dumpctl */
377 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
378 /* fop_dispose */
379 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
380 /* fop_setsecattr */
381 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
382 /* fop_getsecattr */
383 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
384 /* fop_shrlock */
385 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
386 /* fop_vnevent */
387 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
388 /* fop_reqzcbuf */
389 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
390 /* fop_retzcbuf */
391 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
393 return (vsp);
397 * Creates a kstat structure associated with a vopstats structure.
399 kstat_t *
400 new_vskstat(char *ksname, vopstats_t *vsp)
402 kstat_t *ksp;
404 if (!vopstats_enabled) {
405 return (NULL);
408 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
409 sizeof (vopstats_t)/sizeof (kstat_named_t),
410 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
411 if (ksp) {
412 ksp->ks_data = vsp;
413 kstat_install(ksp);
416 return (ksp);
420 * Called from vfsinit() to initialize the support mechanisms for vopstats
422 void
423 vopstats_startup()
425 if (!vopstats_enabled)
426 return;
429 * Creates the AVL tree which holds per-vfs vopstat anchors. This
430 * is necessary since we need to check if a kstat exists before we
431 * attempt to create it. Also, initialize its lock.
433 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
434 offsetof(vsk_anchor_t, vsk_node));
435 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
437 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
438 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
439 NULL, NULL, 0);
442 * Set up the array of pointers for the vopstats-by-FS-type.
443 * The entries will be allocated/initialized as each file system
444 * goes through modload/mod_installfs.
446 vopstats_fstype = (vopstats_t **)kmem_zalloc(
447 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
449 /* Set up the global vopstats initialization template */
450 vs_templatep = create_vopstats_template();
454 * We need to have all of the counters zeroed.
455 * The initialization of the vopstats_t includes on the order of
456 * 50 calls to kstat_named_init(). Rather than do that on every call,
457 * we do it once in a template (vs_templatep) then bcopy it over.
459 void
460 initialize_vopstats(vopstats_t *vsp)
462 if (vsp == NULL)
463 return;
465 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
469 * If possible, determine which vopstats by fstype to use and
470 * return a pointer to the caller.
472 vopstats_t *
473 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
475 int fstype = 0; /* Index into vfssw[] */
476 vopstats_t *vsp = NULL;
478 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
479 !vopstats_enabled)
480 return (NULL);
482 * Set up the fstype. We go to so much trouble because all versions
483 * of NFS use the same fstype in their vfs even though they have
484 * distinct entries in the vfssw[] table.
485 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
487 if (vswp) {
488 fstype = vswp - vfssw; /* Gets us the index */
489 } else {
490 fstype = vfsp->vfs_fstype;
494 * Point to the per-fstype vopstats. The only valid values are
495 * non-zero positive values less than the number of vfssw[] table
496 * entries.
498 if (fstype > 0 && fstype < nfstype) {
499 vsp = vopstats_fstype[fstype];
502 return (vsp);
506 * Generate a kstat name, create the kstat structure, and allocate a
507 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
508 * to the caller. This must only be called from a mount.
510 vsk_anchor_t *
511 get_vskstat_anchor(vfs_t *vfsp)
513 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
514 statvfs64_t statvfsbuf; /* Needed to find f_fsid */
515 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
516 kstat_t *ksp; /* Ptr to new kstat */
517 avl_index_t where; /* Location in the AVL tree */
519 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
520 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
521 return (NULL);
523 /* Need to get the fsid to build a kstat name */
524 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
525 /* Create a name for our kstats based on fsid */
526 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
527 VOPSTATS_STR, statvfsbuf.f_fsid);
529 /* Allocate and initialize the vsk_anchor_t */
530 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
531 bzero(vskp, sizeof (*vskp));
532 vskp->vsk_fsid = statvfsbuf.f_fsid;
534 mutex_enter(&vskstat_tree_lock);
535 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
536 avl_insert(&vskstat_tree, vskp, where);
537 mutex_exit(&vskstat_tree_lock);
540 * Now that we've got the anchor in the AVL
541 * tree, we can create the kstat.
543 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
544 if (ksp) {
545 vskp->vsk_ksp = ksp;
547 } else {
548 /* Oops, found one! Release memory and lock. */
549 mutex_exit(&vskstat_tree_lock);
550 kmem_cache_free(vsk_anchor_cache, vskp);
551 vskp = NULL;
554 return (vskp);
558 * We're in the process of tearing down the vfs and need to clean up
559 * the data structures associated with the vopstats. Must only be called
560 * from dounmount().
562 void
563 teardown_vopstats(vfs_t *vfsp)
565 vsk_anchor_t *vskap;
566 avl_index_t where;
568 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
569 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
570 return;
572 /* This is a safe check since VFS_STATS must be set (see above) */
573 if ((vskap = vfsp->vfs_vskap) == NULL)
574 return;
576 /* Whack the pointer right away */
577 vfsp->vfs_vskap = NULL;
579 /* Lock the tree, remove the node, and delete the kstat */
580 mutex_enter(&vskstat_tree_lock);
581 if (avl_find(&vskstat_tree, vskap, &where)) {
582 avl_remove(&vskstat_tree, vskap);
585 if (vskap->vsk_ksp) {
586 kstat_delete(vskap->vsk_ksp);
588 mutex_exit(&vskstat_tree_lock);
590 kmem_cache_free(vsk_anchor_cache, vskap);
594 * Read or write a vnode. Called from kernel code.
597 vn_rdwr(
598 enum uio_rw rw,
599 struct vnode *vp,
600 caddr_t base,
601 ssize_t len,
602 offset_t offset,
603 enum uio_seg seg,
604 int ioflag,
605 rlim_t ulimit, /* meaningful only if rw is UIO_WRITE */
606 cred_t *cr,
607 ssize_t *residp)
609 struct uio uio;
610 struct iovec iov;
611 int error;
612 int in_crit = 0;
614 if (rw == UIO_WRITE && ISROFILE(vp))
615 return (EROFS);
617 if (len < 0)
618 return (EIO);
620 VOPXID_MAP_CR(vp, cr);
622 iov.iov_base = base;
623 iov.iov_len = len;
624 uio.uio_iov = &iov;
625 uio.uio_iovcnt = 1;
626 uio.uio_loffset = offset;
627 uio.uio_segflg = (short)seg;
628 uio.uio_resid = len;
629 uio.uio_llimit = ulimit;
632 * We have to enter the critical region before calling fop_rwlock
633 * to avoid a deadlock with ufs.
635 if (nbl_need_check(vp)) {
636 int svmand;
638 nbl_start_crit(vp, RW_READER);
639 in_crit = 1;
640 error = nbl_svmand(vp, cr, &svmand);
641 if (error != 0)
642 goto done;
643 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
644 uio.uio_offset, uio.uio_resid, svmand, NULL)) {
645 error = EACCES;
646 goto done;
650 (void) fop_rwlock(vp,
651 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
652 if (rw == UIO_WRITE) {
653 uio.uio_fmode = FWRITE;
654 uio.uio_extflg = UIO_COPY_DEFAULT;
655 error = fop_write(vp, &uio, ioflag, cr, NULL);
656 } else {
657 uio.uio_fmode = FREAD;
658 uio.uio_extflg = UIO_COPY_CACHED;
659 error = fop_read(vp, &uio, ioflag, cr, NULL);
661 fop_rwunlock(vp,
662 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
663 if (residp)
664 *residp = uio.uio_resid;
665 else if (uio.uio_resid)
666 error = EIO;
668 done:
669 if (in_crit)
670 nbl_end_crit(vp);
671 return (error);
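/*
 * Usage sketch: reading the first 512 bytes of a held vnode from
 * kernel context (error handling elided; the ulimit argument is only
 * meaningful for writes, so RLIM64_INFINITY is conventional here):
 *
 *	char buf[512];
 *	ssize_t resid;
 *	int err;
 *
 *	err = vn_rdwr(UIO_READ, vp, (caddr_t)buf, sizeof (buf), 0,
 *	    UIO_SYSSPACE, 0, RLIM64_INFINITY, CRED(), &resid);
 */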
675 * Release a vnode. Call fop_inactive on last reference or
676 * decrement reference count.
678 * To avoid race conditions, the v_count is left at 1 for
679 * the call to fop_inactive. This prevents another thread
680 * from reclaiming and releasing the vnode *before* the
681 * fop_inactive routine has a chance to destroy the vnode.
682 * We can't have more than 1 thread calling fop_inactive
683 * on a vnode.
685 void
686 vn_rele(vnode_t *vp)
688 VERIFY(vp->v_count > 0);
689 mutex_enter(&vp->v_lock);
690 if (vp->v_count == 1) {
691 mutex_exit(&vp->v_lock);
692 fop_inactive(vp, CRED(), NULL);
693 return;
695 VN_RELE_LOCKED(vp);
696 mutex_exit(&vp->v_lock);
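/*
 * Sketch of the expected pairing: every reference taken with VN_HOLD()
 * is balanced by exactly one release:
 *
 *	VN_HOLD(vp);
 *	... use vp ...
 *	VN_RELE(vp);		... invokes vn_rele(vp) ...
 */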
700 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
701 * as a single reference, so v_count is not decremented until the last DNLC hold
702 * is released. This makes it possible to distinguish vnodes that are referenced
703 * only by the DNLC.
705 void
706 vn_rele_dnlc(vnode_t *vp)
708 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
709 mutex_enter(&vp->v_lock);
710 if (--vp->v_count_dnlc == 0) {
711 if (vp->v_count == 1) {
712 mutex_exit(&vp->v_lock);
713 fop_inactive(vp, CRED(), NULL);
714 return;
716 VN_RELE_LOCKED(vp);
718 mutex_exit(&vp->v_lock);
722 * Like vn_rele() except that it clears v_stream under v_lock.
723 * This is used by sockfs when it dismantles the association between
724 * the sockfs node and the vnode in the underlying file system.
725 * v_lock has to be held to prevent a thread coming through the lookupname
726 * path from accessing a stream head that is going away.
728 void
729 vn_rele_stream(vnode_t *vp)
731 VERIFY(vp->v_count > 0);
732 mutex_enter(&vp->v_lock);
733 vp->v_stream = NULL;
734 if (vp->v_count == 1) {
735 mutex_exit(&vp->v_lock);
736 fop_inactive(vp, CRED(), NULL);
737 return;
739 VN_RELE_LOCKED(vp);
740 mutex_exit(&vp->v_lock);
743 static void
744 vn_rele_inactive(vnode_t *vp)
746 fop_inactive(vp, CRED(), NULL);
750 * Like vn_rele() except if we are going to call fop_inactive() then do it
751 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
752 * the file system as a result of releasing the vnode. Note, file systems
753 * already have to handle the race where the vnode is incremented before the
754 * inactive routine is called and does its locking.
756 * Warning: Excessive use of this routine can lead to performance problems.
757 * This is because taskqs throttle back allocation if too many are created.
759 void
760 vn_rele_async(vnode_t *vp, taskq_t *taskq)
762 VERIFY(vp->v_count > 0);
763 mutex_enter(&vp->v_lock);
764 if (vp->v_count == 1) {
765 mutex_exit(&vp->v_lock);
766 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
767 vp, TQ_SLEEP) != (uintptr_t)NULL);
768 return;
770 VN_RELE_LOCKED(vp);
771 mutex_exit(&vp->v_lock);
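/*
 * Usage sketch (assuming the caller can tolerate deferred inactivation;
 * a dedicated taskq may be preferable to the global system_taskq):
 *
 *	vn_rele_async(vp, system_taskq);
 */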
775 vn_open(
776 char *pnamep,
777 enum uio_seg seg,
778 int filemode,
779 int createmode,
780 struct vnode **vpp,
781 enum create crwhy,
782 mode_t umask)
784 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
785 umask, NULL, -1));
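/*
 * Usage sketch (path illustrative): opening an existing file read-only
 * from kernel context and releasing it afterwards:
 *
 *	vnode_t *vp;
 *	int err;
 *
 *	err = vn_open("/etc/motd", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0);
 *	if (err == 0) {
 *		... vn_rdwr(UIO_READ, vp, ...) ...
 *		(void) fop_close(vp, FREAD, 1, 0, CRED(), NULL);
 *		VN_RELE(vp);
 *	}
 */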
790 * Open/create a vnode.
791 * This may be callable by the kernel, the only known use
792 * of user context being that the current user credentials
793 * are used for permissions. crwhy is defined iff filemode & FCREAT.
796 vn_openat(
797 char *pnamep,
798 enum uio_seg seg,
799 int filemode,
800 int createmode,
801 struct vnode **vpp,
802 enum create crwhy,
803 mode_t umask,
804 struct vnode *startvp,
805 int fd)
807 struct vnode *vp;
808 int mode;
809 int accessflags;
810 int error;
811 int in_crit = 0;
812 int open_done = 0;
813 int shrlock_done = 0;
814 struct vattr vattr;
815 enum symfollow follow;
816 int estale_retry = 0;
817 struct shrlock shr;
818 struct shr_locowner shr_own;
820 if (filemode & FSEARCH)
821 filemode |= FDIRECTORY;
823 mode = 0;
824 accessflags = 0;
825 if (filemode & FREAD)
826 mode |= VREAD;
827 if (filemode & (FWRITE|FTRUNC))
828 mode |= VWRITE;
829 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
830 mode |= VEXEC;
832 /* symlink interpretation */
833 if (filemode & FNOFOLLOW)
834 follow = NO_FOLLOW;
835 else
836 follow = FOLLOW;
838 if (filemode & FAPPEND)
839 accessflags |= V_APPEND;
841 top:
842 if (filemode & FCREAT && !(filemode & FDIRECTORY)) {
843 enum vcexcl excl;
845 /* Wish to create a file. */
846 vattr.va_type = VREG;
847 vattr.va_mode = createmode;
848 vattr.va_mask = VATTR_TYPE|VATTR_MODE;
849 if (filemode & FTRUNC) {
850 vattr.va_size = 0;
851 vattr.va_mask |= VATTR_SIZE;
853 if (filemode & FEXCL)
854 excl = EXCL;
855 else
856 excl = NONEXCL;
858 if (error =
859 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
860 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
861 return (error);
862 } else {
863 /* Wish to open a file. Just look it up. */
864 if (error = lookupnameat(pnamep, seg, follow,
865 NULLVPP, &vp, startvp)) {
866 if ((error == ESTALE) &&
867 fs_need_estale_retry(estale_retry++))
868 goto top;
869 return (error);
873 * Can't write directories, active texts, or
874 * read-only filesystems. Can't truncate files
875 * on which mandatory locking is in effect.
877 if (filemode & (FWRITE|FTRUNC)) {
879 * Allow writable directory if VDIROPEN flag is set.
881 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
882 error = EISDIR;
883 goto out;
885 if (ISROFILE(vp)) {
886 error = EROFS;
887 goto out;
890 * Can't truncate files on which
891 * sysv mandatory locking is in effect.
893 if (filemode & FTRUNC) {
894 vnode_t *rvp;
896 if (fop_realvp(vp, &rvp, NULL) != 0)
897 rvp = vp;
898 if (rvp->v_filocks != NULL) {
899 vattr.va_mask = VATTR_MODE;
900 if ((error = fop_getattr(vp,
901 &vattr, 0, CRED(), NULL)) == 0 &&
902 MANDLOCK(vp, vattr.va_mode))
903 error = EAGAIN;
906 if (error)
907 goto out;
910 * Check permissions.
912 if (error = fop_access(vp, mode, accessflags, CRED(), NULL))
913 goto out;
915 * Require FDIRECTORY to return a directory.
916 * Require FEXEC to return a regular file.
918 if ((filemode & FDIRECTORY) && vp->v_type != VDIR) {
919 error = ENOTDIR;
920 goto out;
922 if ((filemode & FEXEC) && vp->v_type != VREG) {
923 error = ENOEXEC; /* XXX: error code? */
924 goto out;
929 * Do remaining checks for FNOFOLLOW and FNOLINKS.
931 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
932 error = ELOOP;
933 goto out;
935 if (filemode & FNOLINKS) {
936 vattr.va_mask = VATTR_NLINK;
937 if ((error = fop_getattr(vp, &vattr, 0, CRED(), NULL))) {
938 goto out;
940 if (vattr.va_nlink != 1) {
941 error = EMLINK;
942 goto out;
947 * Opening a socket corresponding to the AF_UNIX pathname
948 * in the filesystem name space is not supported.
949 * However, VSOCK nodes in namefs are supported in order
950 * to make fattach work for sockets.
952 * XXX This uses fop_realvp to distinguish between
953 * an unopened namefs node (where fop_realvp returns a
954 * different VSOCK vnode) and a VSOCK created by vn_create
955 * in some file system (where fop_realvp would never return
956 * a different vnode).
958 if (vp->v_type == VSOCK) {
959 struct vnode *nvp;
961 error = fop_realvp(vp, &nvp, NULL);
962 if (error != 0 || nvp == NULL || nvp == vp ||
963 nvp->v_type != VSOCK) {
964 error = EOPNOTSUPP;
965 goto out;
969 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
970 /* get share reservation */
971 shr.s_access = 0;
972 if (filemode & FWRITE)
973 shr.s_access |= F_WRACC;
974 if (filemode & FREAD)
975 shr.s_access |= F_RDACC;
976 shr.s_deny = 0;
977 shr.s_sysid = 0;
978 shr.s_pid = ttoproc(curthread)->p_pid;
979 shr_own.sl_pid = shr.s_pid;
980 shr_own.sl_id = fd;
981 shr.s_own_len = sizeof (shr_own);
982 shr.s_owner = (caddr_t)&shr_own;
983 error = fop_shrlock(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
984 NULL);
985 if (error)
986 goto out;
987 shrlock_done = 1;
989 /* nbmand conflict check if truncating file */
990 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
991 nbl_start_crit(vp, RW_READER);
992 in_crit = 1;
994 vattr.va_mask = VATTR_SIZE;
995 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
996 goto out;
997 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
998 NULL)) {
999 error = EACCES;
1000 goto out;
1006 * Do opening protocol.
1008 error = fop_open(&vp, filemode, CRED(), NULL);
1009 if (error)
1010 goto out;
1011 open_done = 1;
1014 * Truncate if required.
1016 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1017 vattr.va_size = 0;
1018 vattr.va_mask = VATTR_SIZE;
1019 if ((error = fop_setattr(vp, &vattr, 0, CRED(), NULL)) != 0)
1020 goto out;
1022 out:
1023 ASSERT(vp->v_count > 0);
1025 if (in_crit) {
1026 nbl_end_crit(vp);
1027 in_crit = 0;
1029 if (error) {
1030 if (open_done) {
1031 (void) fop_close(vp, filemode, 1, 0, CRED(),
1032 NULL);
1033 open_done = 0;
1034 shrlock_done = 0;
1036 if (shrlock_done) {
1037 (void) fop_shrlock(vp, F_UNSHARE, &shr, 0, CRED(),
1038 NULL);
1039 shrlock_done = 0;
1043 * The following clause was added to handle a problem
1044 * with NFS consistency. It is possible that a lookup
1045 * of the file to be opened succeeded, but the file
1046 * itself doesn't actually exist on the server. This
1047 * is chiefly due to the DNLC containing an entry for
1048 * the file which has been removed on the server. In
1049 * this case, we just start over. If there was some
1050 * other cause for the ESTALE error, then the lookup
1051 * of the file will fail and the error will be returned
1052 * above instead of looping around from here.
1054 VN_RELE(vp);
1055 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1056 goto top;
1057 } else
1058 *vpp = vp;
1059 return (error);
1063 * The following two accessor functions are for the NFSv4 server. Since there
1064 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
1065 * vnode open counts correct when a client "upgrades" an open or does an
1066 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1067 * open mode (add or subtract read or write), but also change the share/deny
1068 * modes. However, share reservations are not integrated with OPEN, yet, so
1069 * we need to handle each separately. These functions are cleaner than having
1070 * the NFS server manipulate the counts directly, however, nobody else should
1071 * use these functions.
1073 void
1074 vn_open_upgrade(
1075 vnode_t *vp,
1076 int filemode)
1078 ASSERT(vp->v_type == VREG);
1080 if (filemode & FREAD)
1081 atomic_inc_32(&vp->v_rdcnt);
1082 if (filemode & FWRITE)
1083 atomic_inc_32(&vp->v_wrcnt);
1087 void
1088 vn_open_downgrade(
1089 vnode_t *vp,
1090 int filemode)
1092 ASSERT(vp->v_type == VREG);
1094 if (filemode & FREAD) {
1095 ASSERT(vp->v_rdcnt > 0);
1096 atomic_dec_32(&vp->v_rdcnt);
1098 if (filemode & FWRITE) {
1099 ASSERT(vp->v_wrcnt > 0);
1100 atomic_dec_32(&vp->v_wrcnt);
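/*
 * Sketch: when an NFSv4 client upgrades a read-only open to
 * read/write, the server adds the write count, and removes it again on
 * open_downgrade:
 *
 *	vn_open_upgrade(vp, FWRITE);
 *	...
 *	vn_open_downgrade(vp, FWRITE);
 */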
1106 vn_create(
1107 char *pnamep,
1108 enum uio_seg seg,
1109 struct vattr *vap,
1110 enum vcexcl excl,
1111 int mode,
1112 struct vnode **vpp,
1113 enum create why,
1114 int flag,
1115 mode_t umask)
1117 return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1118 umask, NULL));
1122 * Create a vnode (makenode).
1125 vn_createat(
1126 char *pnamep,
1127 enum uio_seg seg,
1128 struct vattr *vap,
1129 enum vcexcl excl,
1130 int mode,
1131 struct vnode **vpp,
1132 enum create why,
1133 int flag,
1134 mode_t umask,
1135 struct vnode *startvp)
1137 struct vnode *dvp; /* ptr to parent dir vnode */
1138 struct vnode *vp = NULL;
1139 struct pathname pn;
1140 int error;
1141 int in_crit = 0;
1142 struct vattr vattr;
1143 enum symfollow follow;
1144 int estale_retry = 0;
1146 ASSERT((vap->va_mask & (VATTR_TYPE|VATTR_MODE)) == (VATTR_TYPE|VATTR_MODE));
1148 /* symlink interpretation */
1149 if ((flag & FNOFOLLOW) || excl == EXCL)
1150 follow = NO_FOLLOW;
1151 else
1152 follow = FOLLOW;
1153 flag &= ~(FNOFOLLOW|FNOLINKS);
1155 top:
1157 * Lookup directory.
1158 * If new object is a file, call lower level to create it.
1159 * Note that it is up to the lower level to enforce exclusive
1160 * creation, if the file is already there.
1161 * This allows the lower level to do whatever
1162 * locking or protocol that is needed to prevent races.
1163 * If the new object is a directory, call the lower level to make
1164 * the new directory, with "." and "..".
1166 if (error = pn_get(pnamep, seg, &pn))
1167 return (error);
1168 dvp = NULL;
1169 *vpp = NULL;
1171 * lookup will find the parent directory for the vnode.
1172 * When it is done the pn holds the name of the entry
1173 * in the directory.
1174 * If this is a non-exclusive create we also find the node itself.
1176 error = lookuppnat(&pn, NULL, follow, &dvp,
1177 (excl == EXCL) ? NULLVPP : vpp, startvp);
1178 if (error) {
1179 pn_free(&pn);
1180 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1181 goto top;
1182 if (why == CRMKDIR && error == EINVAL)
1183 error = EEXIST; /* SVID */
1184 return (error);
1187 if (why != CRMKNOD)
1188 vap->va_mode &= ~VSVTX;
1191 * If a umask is passed, apply it only when no default ACLs are
1192 * defined for the directory.
1195 if (umask) {
1197 vsecattr_t vsec;
1199 vsec.vsa_aclcnt = 0;
1200 vsec.vsa_aclentp = NULL;
1201 vsec.vsa_dfaclcnt = 0;
1202 vsec.vsa_dfaclentp = NULL;
1203 vsec.vsa_mask = VSA_DFACLCNT;
1204 error = fop_getsecattr(dvp, &vsec, 0, CRED(), NULL);
1206 * If error is ENOSYS then treat it as no error.
1207 * We don't want to force all file systems to support
1208 * aclent_t-style ACLs.
1210 if (error == ENOSYS)
1211 error = 0;
1212 if (error) {
1213 if (*vpp != NULL)
1214 VN_RELE(*vpp);
1215 goto out;
1216 } else {
1218 * Apply the umask if no default ACLs.
1220 if (vsec.vsa_dfaclcnt == 0)
1221 vap->va_mode &= ~umask;
1224 * fop_getsecattr() may have allocated memory for
1225 * ACLs we didn't request, so double-check and
1226 * free it if necessary.
1228 if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1229 kmem_free((caddr_t)vsec.vsa_aclentp,
1230 vsec.vsa_aclcnt * sizeof (aclent_t));
1231 if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1232 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1233 vsec.vsa_dfaclcnt * sizeof (aclent_t));
1238 * In general we want to generate EROFS if the file system is
1239 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1240 * documents the open system call, and it says that O_CREAT has no
1241 * effect if the file already exists. Bug 1119649 states
1242 * that open(path, O_CREAT, ...) fails when attempting to open an
1243 * existing file on a read only file system. Thus, the first part
1244 * of the following if statement has 3 checks:
1245 * if the file exists &&
1246 * it is being open with write access &&
1247 * the file system is read only
1248 * then generate EROFS
1250 if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1251 (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1252 if (*vpp)
1253 VN_RELE(*vpp);
1254 error = EROFS;
1255 } else if (excl == NONEXCL && *vpp != NULL) {
1256 vnode_t *rvp;
1259 * File already exists. If a mandatory lock has been
1260 * applied, return error.
1262 vp = *vpp;
1263 if (fop_realvp(vp, &rvp, NULL) != 0)
1264 rvp = vp;
1265 if ((vap->va_mask & VATTR_SIZE) && nbl_need_check(vp)) {
1266 nbl_start_crit(vp, RW_READER);
1267 in_crit = 1;
1269 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1270 vattr.va_mask = VATTR_MODE|VATTR_SIZE;
1271 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL)) {
1272 goto out;
1274 if (MANDLOCK(vp, vattr.va_mode)) {
1275 error = EAGAIN;
1276 goto out;
1279 * File cannot be truncated if non-blocking mandatory
1280 * locks are currently on the file.
1282 if ((vap->va_mask & VATTR_SIZE) && in_crit) {
1283 uoff_t offset;
1284 ssize_t length;
1286 offset = vap->va_size > vattr.va_size ?
1287 vattr.va_size : vap->va_size;
1288 length = vap->va_size > vattr.va_size ?
1289 vap->va_size - vattr.va_size :
1290 vattr.va_size - vap->va_size;
1291 if (nbl_conflict(vp, NBL_WRITE, offset,
1292 length, 0, NULL)) {
1293 error = EACCES;
1294 goto out;
1300 * If the file is the root of a VFS, we've crossed a
1301 * mount point and the "containing" directory that we
1302 * acquired above (dvp) is irrelevant because it's in
1303 * a different file system. We apply fop_create to the
1304 * target itself instead of to the containing directory
1305 * and supply a null path name to indicate (conventionally)
1306 * the node itself as the "component" of interest.
1308 * The call to fop_create() is necessary to ensure
1309 * that the appropriate permission checks are made,
1310 * i.e. EISDIR, EACCES, etc. We already know that vpp
1311 * exists since we are in the else condition where this
1312 * was checked.
1314 if (vp->v_flag & VROOT) {
1315 ASSERT(why != CRMKDIR);
1316 error = fop_create(vp, "", vap, excl, mode, vpp,
1317 CRED(), flag, NULL, NULL);
1319 * If the create succeeded, it will have created a
1320 * new reference on a new vnode (*vpp) in the child
1321 * file system, so we want to drop our reference on
1322 * the old (vp) upon exit.
1324 goto out;
1328 if (error == 0) {
1330 * Call mkdir() if specified, otherwise create().
1332 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1334 if (why == CRMKDIR)
1336 * N.B., if vn_createat() ever requests
1337 * case-insensitive behavior then it will need
1338 * to be passed to fop_mkdir(). fop_create()
1339 * will already get it via "flag"
1341 error = fop_mkdir(dvp, pn.pn_path, vap, vpp, CRED(),
1342 NULL, 0, NULL);
1343 else if (!must_be_dir)
1344 error = fop_create(dvp, pn.pn_path, vap,
1345 excl, mode, vpp, CRED(), flag, NULL, NULL);
1346 else
1347 error = ENOTDIR;
1350 out:
1352 if (in_crit) {
1353 nbl_end_crit(vp);
1354 in_crit = 0;
1356 if (vp != NULL) {
1357 VN_RELE(vp);
1358 vp = NULL;
1360 pn_free(&pn);
1361 VN_RELE(dvp);
1363 * The following clause was added to handle a problem
1364 * with NFS consistency. It is possible that a lookup
1365 * of the file to be created succeeded, but the file
1366 * itself doesn't actually exist on the server. This
1367 * is chiefly due to the DNLC containing an entry for
1368 * the file which has been removed on the server. In
1369 * this case, we just start over. If there was some
1370 * other cause for the ESTALE error, then the lookup
1371 * of the file will fail and the error will be returned
1372 * above instead of looping around from here.
1374 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1375 goto top;
1376 return (error);
1380 vn_link(char *from, char *to, enum uio_seg seg)
1382 return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1386 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1387 vnode_t *tstartvp, char *to, enum uio_seg seg)
1389 struct vnode *fvp; /* from vnode ptr */
1390 struct vnode *tdvp; /* to directory vnode ptr */
1391 struct pathname pn;
1392 int error;
1393 struct vattr vattr;
1394 dev_t fsid;
1395 int estale_retry = 0;
1397 top:
1398 fvp = tdvp = NULL;
1399 if (error = pn_get(to, seg, &pn))
1400 return (error);
1401 if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1402 goto out;
1403 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1404 goto out;
1406 * Make sure both the source vnode and the target directory vnode are
1407 * in the same vfs and that the vfs is writable.
1409 vattr.va_mask = VATTR_FSID;
1410 if (error = fop_getattr(fvp, &vattr, 0, CRED(), NULL))
1411 goto out;
1412 fsid = vattr.va_fsid;
1413 vattr.va_mask = VATTR_FSID;
1414 if (error = fop_getattr(tdvp, &vattr, 0, CRED(), NULL))
1415 goto out;
1416 if (fsid != vattr.va_fsid) {
1417 error = EXDEV;
1418 goto out;
1420 if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1421 error = EROFS;
1422 goto out;
1425 * Do the link.
1427 (void) pn_fixslash(&pn);
1428 error = fop_link(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1429 out:
1430 pn_free(&pn);
1431 if (fvp)
1432 VN_RELE(fvp);
1433 if (tdvp)
1434 VN_RELE(tdvp);
1435 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1436 goto top;
1437 return (error);
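/*
 * Usage sketch (paths illustrative): creating a hard link from kernel
 * context:
 *
 *	err = vn_link("/export/data/file", "/export/data/link",
 *	    UIO_SYSSPACE);
 */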
1441 vn_rename(char *from, char *to, enum uio_seg seg)
1443 return (vn_renameat(NULL, from, NULL, to, seg));
1447 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1448 char *tname, enum uio_seg seg)
1450 int error;
1451 struct vattr vattr;
1452 struct pathname fpn; /* from pathname */
1453 struct pathname tpn; /* to pathname */
1454 dev_t fsid;
1455 int in_crit_src, in_crit_targ;
1456 vnode_t *fromvp, *fvp;
1457 vnode_t *tovp, *targvp;
1458 int estale_retry = 0;
1460 top:
1461 fvp = fromvp = tovp = targvp = NULL;
1462 in_crit_src = in_crit_targ = 0;
1464 * Get to and from pathnames.
1466 if (error = pn_get(fname, seg, &fpn))
1467 return (error);
1468 if (error = pn_get(tname, seg, &tpn)) {
1469 pn_free(&fpn);
1470 return (error);
1474 * First we need to resolve the correct directories.
1475 * The passed-in directories may only be a starting point,
1476 * but we need the real directories the file(s) live in.
1477 * For example the fname may be something like usr/lib/sparc
1478 * and we were passed in the / directory, but we need to
1479 * use the lib directory for the rename.
1483 * Lookup to and from directories.
1485 if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1486 goto out;
1490 * Make sure there is an entry.
1492 if (fvp == NULL) {
1493 error = ENOENT;
1494 goto out;
1497 if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1498 goto out;
1502 * Make sure both the from vnode directory and the to directory
1503 * are in the same vfs and the to directory is writable.
1504 * We check fsid's, not vfs pointers, so loopback fs works.
1506 if (fromvp != tovp) {
1507 vattr.va_mask = VATTR_FSID;
1508 if (error = fop_getattr(fromvp, &vattr, 0, CRED(), NULL))
1509 goto out;
1510 fsid = vattr.va_fsid;
1511 vattr.va_mask = VATTR_FSID;
1512 if (error = fop_getattr(tovp, &vattr, 0, CRED(), NULL))
1513 goto out;
1514 if (fsid != vattr.va_fsid) {
1515 error = EXDEV;
1516 goto out;
1520 if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1521 error = EROFS;
1522 goto out;
1526 * Make sure "from" vp is not a mount point.
1527 * Note, lookup did traverse() already, so
1528 * we'll be looking at the mounted FS root.
1529 * (but allow files like mnttab)
1531 if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
1532 error = EBUSY;
1533 goto out;
1536 if (targvp && (fvp != targvp)) {
1537 nbl_start_crit(targvp, RW_READER);
1538 in_crit_targ = 1;
1539 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1540 error = EACCES;
1541 goto out;
1545 if (nbl_need_check(fvp)) {
1546 nbl_start_crit(fvp, RW_READER);
1547 in_crit_src = 1;
1548 if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1549 error = EACCES;
1550 goto out;
1555 * Do the rename.
1557 (void) pn_fixslash(&tpn);
1558 error = fop_rename(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1559 NULL, 0);
1561 out:
1562 pn_free(&fpn);
1563 pn_free(&tpn);
1564 if (in_crit_src)
1565 nbl_end_crit(fvp);
1566 if (in_crit_targ)
1567 nbl_end_crit(targvp);
1568 if (fromvp)
1569 VN_RELE(fromvp);
1570 if (tovp)
1571 VN_RELE(tovp);
1572 if (targvp)
1573 VN_RELE(targvp);
1574 if (fvp)
1575 VN_RELE(fvp);
1576 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1577 goto top;
1578 return (error);
1582 * Remove a file or directory.
1585 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1587 return (vn_removeat(NULL, fnamep, seg, dirflag));
1591 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1593 struct vnode *vp; /* entry vnode */
1594 struct vnode *dvp; /* ptr to parent dir vnode */
1595 struct vnode *coveredvp;
1596 struct pathname pn; /* name of entry */
1597 enum vtype vtype;
1598 int error;
1599 struct vfs *vfsp;
1600 struct vfs *dvfsp; /* ptr to parent dir vfs */
1601 int in_crit = 0;
1602 int estale_retry = 0;
1604 top:
1605 if (error = pn_get(fnamep, seg, &pn))
1606 return (error);
1607 dvp = vp = NULL;
1608 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1609 pn_free(&pn);
1610 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1611 goto top;
1612 return (error);
1616 * Make sure there is an entry.
1618 if (vp == NULL) {
1619 error = ENOENT;
1620 goto out;
1623 vfsp = vp->v_vfsp;
1624 dvfsp = dvp->v_vfsp;
1627 * If the named file is the root of a mounted filesystem, fail,
1628 * unless it's marked unlinkable. In that case, unmount the
1629 * filesystem and proceed to unlink the covered vnode. (If the
1630 * covered vnode is a directory, use rmdir instead of unlink,
1631 * to avoid file system corruption.)
1633 if (vp->v_flag & VROOT) {
1634 if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1635 error = EBUSY;
1636 goto out;
1640 * Namefs specific code starts here.
1643 if (dirflag == RMDIRECTORY) {
1645 * User called rmdir(2) on a file that has
1646 * been namefs mounted on top of. Since
1647 * namefs doesn't allow directories to
1648 * be mounted on other files, we know
1649 * vp is not of type VDIR, so fail the operation.
1651 error = ENOTDIR;
1652 goto out;
1656 * If VROOT is still set after grabbing vp->v_lock,
1657 * no one has finished nm_unmount so far and coveredvp
1658 * is valid.
1659 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1660 * vp->v_lock, any race window is eliminated.
1663 mutex_enter(&vp->v_lock);
1664 if ((vp->v_flag & VROOT) == 0) {
1665 /* Someone beat us to the unmount */
1666 mutex_exit(&vp->v_lock);
1667 error = EBUSY;
1668 goto out;
1670 vfsp = vp->v_vfsp;
1671 coveredvp = vfsp->vfs_vnodecovered;
1672 ASSERT(coveredvp);
1674 * Note: Implementation of vn_vfswlock shows that ordering of
1675 * v_lock / vn_vfswlock is not an issue here.
1677 error = vn_vfswlock(coveredvp);
1678 mutex_exit(&vp->v_lock);
1680 if (error)
1681 goto out;
1683 VN_HOLD(coveredvp);
1684 VN_RELE(vp);
1685 error = dounmount(vfsp, 0, CRED());
1688 * Unmounted the namefs file system; now get
1689 * the object it was mounted over.
1691 vp = coveredvp;
1693 * If namefs was mounted over a directory, then
1694 * we want to use rmdir() instead of unlink().
1696 if (vp->v_type == VDIR)
1697 dirflag = RMDIRECTORY;
1699 if (error)
1700 goto out;
1704 * Make sure filesystem is writeable.
1705 * We check the parent directory's vfs in case this is an lofs vnode.
1707 if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1708 error = EROFS;
1709 goto out;
1712 vtype = vp->v_type;
1715 * If there is the possibility of an nbmand share reservation, make
1716 * sure it's okay to remove the file. Keep a reference to the
1717 * vnode, so that we can exit the nbl critical region after
1718 * calling fop_remove.
1719 * If there is no possibility of an nbmand share reservation,
1720 * release the vnode reference now. Filesystems like NFS may
1721 * behave differently if there is an extra reference, so get rid of
1722 * this one. Fortunately, we can't have nbmand mounts on NFS
1723 * filesystems.
1725 if (nbl_need_check(vp)) {
1726 nbl_start_crit(vp, RW_READER);
1727 in_crit = 1;
1728 if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1729 error = EACCES;
1730 goto out;
1732 } else {
1733 VN_RELE(vp);
1734 vp = NULL;
1737 if (dirflag == RMDIRECTORY) {
1739 * Caller is using rmdir(2), which can only be applied to
1740 * directories.
1742 if (vtype != VDIR) {
1743 error = ENOTDIR;
1744 } else {
1745 vnode_t *cwd;
1746 proc_t *pp = curproc;
1748 mutex_enter(&pp->p_lock);
1749 cwd = PTOU(pp)->u_cdir;
1750 VN_HOLD(cwd);
1751 mutex_exit(&pp->p_lock);
1752 error = fop_rmdir(dvp, pn.pn_path, cwd, CRED(),
1753 NULL, 0);
1754 VN_RELE(cwd);
1756 } else {
1758 * Unlink(2) can be applied to anything.
1760 error = fop_remove(dvp, pn.pn_path, CRED(), NULL, 0);
1763 out:
1764 pn_free(&pn);
1765 if (in_crit) {
1766 nbl_end_crit(vp);
1767 in_crit = 0;
1769 if (vp != NULL)
1770 VN_RELE(vp);
1771 if (dvp != NULL)
1772 VN_RELE(dvp);
1773 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1774 goto top;
1775 return (error);
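/*
 * Usage sketch (path illustrative): unlink(2)-style removal from
 * kernel context; passing RMDIRECTORY instead requests rmdir(2)
 * semantics:
 *
 *	err = vn_remove("/export/data/file", UIO_SYSSPACE, RMFILE);
 */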
1779 * Utility function to compare equality of vnodes.
1780 * Compare the underlying real vnodes, if there are underlying vnodes.
1781 * This is a more thorough comparison than the VN_CMP() macro provides.
1784 vn_compare(vnode_t *vp1, vnode_t *vp2)
1786 vnode_t *realvp;
1788 if (vp1 != NULL && fop_realvp(vp1, &realvp, NULL) == 0)
1789 vp1 = realvp;
1790 if (vp2 != NULL && fop_realvp(vp2, &realvp, NULL) == 0)
1791 vp2 = realvp;
1792 return (VN_CMP(vp1, vp2));
1796 * The number of locks to hash into. This value must be a power
1797 * of 2 minus 1 and should probably also be prime.
1799 #define NUM_BUCKETS 1023
1801 struct vn_vfslocks_bucket {
1802 kmutex_t vb_lock;
1803 vn_vfslocks_entry_t *vb_list;
1804 char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
1808 * Total number of buckets will be NUM_BUCKETS + 1.
1811 #pragma align 64(vn_vfslocks_buckets)
1812 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];
1814 #define VN_VFSLOCKS_SHIFT 9
1816 #define VN_VFSLOCKS_HASH(vfsvpptr) \
1817 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
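/*
 * Worked example (address illustrative): with VN_VFSLOCKS_SHIFT == 9
 * and NUM_BUCKETS == 1023, a vnode at 0x12345600 hashes to
 * ((0x12345600 >> 9) & 1023) == (0x91a2b & 0x3ff) == 0x22b, i.e.
 * bucket 555. The shift discards the low-order bits, which vary
 * little between objects allocated from the same cache.
 */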
1820 * vn_vfslocks_getlock() uses a hash scheme to generate an
1821 * rwstlock from the vfs/vnode pointer passed to it.
1823 * vn_vfslocks_rele() releases a reference in the
1824 * HASH table which allows the entry allocated by
1825 * vn_vfslocks_getlock() to be freed at a later
1826 * stage when the refcount drops to zero.
1829 vn_vfslocks_entry_t *
1830 vn_vfslocks_getlock(void *vfsvpptr)
1832 struct vn_vfslocks_bucket *bp;
1833 vn_vfslocks_entry_t *vep;
1834 vn_vfslocks_entry_t *tvep;
1836 ASSERT(vfsvpptr != NULL);
1837 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
1839 mutex_enter(&bp->vb_lock);
1840 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
1841 if (vep->ve_vpvfs == vfsvpptr) {
1842 vep->ve_refcnt++;
1843 mutex_exit(&bp->vb_lock);
1844 return (vep);
1847 mutex_exit(&bp->vb_lock);
1848 vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
1849 rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
1850 vep->ve_vpvfs = (char *)vfsvpptr;
1851 vep->ve_refcnt = 1;
1852 mutex_enter(&bp->vb_lock);
1853 for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
1854 if (tvep->ve_vpvfs == vfsvpptr) {
1855 tvep->ve_refcnt++;
1856 mutex_exit(&bp->vb_lock);
1859 * There is already an entry in the hash;
1860 * destroy what we just allocated.
1862 rwst_destroy(&vep->ve_lock);
1863 kmem_free(vep, sizeof (*vep));
1864 return (tvep);
1867 vep->ve_next = bp->vb_list;
1868 bp->vb_list = vep;
1869 mutex_exit(&bp->vb_lock);
1870 return (vep);
1873 void
1874 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
1876 struct vn_vfslocks_bucket *bp;
1877 vn_vfslocks_entry_t *vep;
1878 vn_vfslocks_entry_t *pvep;
1880 ASSERT(vepent != NULL);
1881 ASSERT(vepent->ve_vpvfs != NULL);
1883 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
1885 mutex_enter(&bp->vb_lock);
1886 vepent->ve_refcnt--;
1888 if ((int32_t)vepent->ve_refcnt < 0)
1889 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
1891 if (vepent->ve_refcnt == 0) {
1892 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
1893 if (vep->ve_vpvfs == vepent->ve_vpvfs) {
1894 if (bp->vb_list == vep)
1895 bp->vb_list = vep->ve_next;
1896 else {
1897 /* LINTED */
1898 pvep->ve_next = vep->ve_next;
1900 mutex_exit(&bp->vb_lock);
1901 rwst_destroy(&vep->ve_lock);
1902 kmem_free(vep, sizeof (*vep));
1903 return;
1905 pvep = vep;
1907 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
1909 mutex_exit(&bp->vb_lock);
1913 * vn_vfswlock_wait is used to implement a lock which is logically a writers
1914 * lock protecting the v_vfsmountedhere field.
1915 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
1916 * except that it blocks to acquire the lock VVFSLOCK.
1918 * traverse() and routines re-implementing part of traverse (e.g. autofs)
1919 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
1920 * need the non-blocking version of the writers lock i.e. vn_vfswlock
1923 vn_vfswlock_wait(vnode_t *vp)
1925 int retval;
1926 vn_vfslocks_entry_t *vpvfsentry;
1927 ASSERT(vp != NULL);
1929 vpvfsentry = vn_vfslocks_getlock(vp);
1930 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
1932 if (retval == EINTR) {
1933 vn_vfslocks_rele(vpvfsentry);
1934 return (EINTR);
1936 return (retval);
1940 vn_vfsrlock_wait(vnode_t *vp)
1942 int retval;
1943 vn_vfslocks_entry_t *vpvfsentry;
1944 ASSERT(vp != NULL);
1946 vpvfsentry = vn_vfslocks_getlock(vp);
1947 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
1949 if (retval == EINTR) {
1950 vn_vfslocks_rele(vpvfsentry);
1951 return (EINTR);
1954 return (retval);
1959 * vn_vfswlock is used to implement a lock which is logically a writers lock
1960 * protecting the v_vfsmountedhere field.
1963 vn_vfswlock(vnode_t *vp)
1965 vn_vfslocks_entry_t *vpvfsentry;
1968 * If vp is NULL then somebody is trying to lock the covered vnode
1969 * of /. (vfs_vnodecovered is NULL for /). This situation will
1970 * only happen when unmounting /. Since that operation will fail
1971 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
1973 if (vp == NULL)
1974 return (EBUSY);
1976 vpvfsentry = vn_vfslocks_getlock(vp);
1978 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
1979 return (0);
1981 vn_vfslocks_rele(vpvfsentry);
1982 return (EBUSY);
1986 vn_vfsrlock(vnode_t *vp)
1988 vn_vfslocks_entry_t *vpvfsentry;
1991 * If vp is NULL then somebody is trying to lock the covered vnode
1992 * of /. (vfs_vnodecovered is NULL for /). This situation will
1993 * only happen when unmounting /. Since that operation will fail
1994 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
1996 if (vp == NULL)
1997 return (EBUSY);
1999 vpvfsentry = vn_vfslocks_getlock(vp);
2001 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2002 return (0);
2004 vn_vfslocks_rele(vpvfsentry);
2005 return (EBUSY);
2008 void
2009 vn_vfsunlock(vnode_t *vp)
2011 vn_vfslocks_entry_t *vpvfsentry;
2014 * ve_refcnt needs to be decremented twice:
2015 * 1. To release the reference taken by the call to vn_vfslocks_getlock().
2016 * 2. To release the reference taken by the locking routines like
2017 * vn_vfsrlock/vn_vfswlock, etc.
2019 vpvfsentry = vn_vfslocks_getlock(vp);
2020 vn_vfslocks_rele(vpvfsentry);
2022 rwst_exit(&vpvfsentry->ve_lock);
2023 vn_vfslocks_rele(vpvfsentry);
2027 vn_vfswlock_held(vnode_t *vp)
2029 int held;
2030 vn_vfslocks_entry_t *vpvfsentry;
2032 ASSERT(vp != NULL);
2034 vpvfsentry = vn_vfslocks_getlock(vp);
2035 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2037 vn_vfslocks_rele(vpvfsentry);
2038 return (held);
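/*
 * Usage sketch: mount/unmount paths take the non-blocking writer lock
 * around updates to v_vfsmountedhere, roughly:
 *
 *	if (vn_vfswlock(coveredvp) != 0)
 *		return (EBUSY);
 *	... install or clear coveredvp->v_vfsmountedhere ...
 *	vn_vfsunlock(coveredvp);
 */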
2043 * Vnode cache.
2046 /* ARGSUSED */
2047 static int
2048 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2050 struct vnode *vp;
2052 vp = buf;
2054 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2055 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2056 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2057 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2058 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2059 vp->v_path = vn_vpath_empty;
2060 vp->v_path_stamp = 0;
2061 vp->v_mpssdata = NULL;
2062 vp->v_vsd = NULL;
2063 vp->v_fopdata = NULL;
2065 vmobject_init(&vp->v_object, vp);
2067 return (0);
2070 /* ARGSUSED */
2071 static void
2072 vn_cache_destructor(void *buf, void *cdrarg)
2074 struct vnode *vp;
2076 vp = buf;
2078 vmobject_fini(&vp->v_object);
2080 rw_destroy(&vp->v_nbllock);
2081 cv_destroy(&vp->v_cv);
2082 mutex_destroy(&vp->v_vsd_lock);
2083 mutex_destroy(&vp->v_lock);
2086 void
2087 vn_create_cache(void)
2089 /* LINTED */
2090 ASSERT((1 << VNODE_ALIGN_LOG2) ==
2091 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2092 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2093 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2094 NULL, 0);
2097 void
2098 vn_destroy_cache(void)
2100 kmem_cache_destroy(vn_cache);
2104 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2105 * cached by the file system and vnodes remain associated.
2107 void
2108 vn_recycle(vnode_t *vp)
2110 ASSERT(!vn_has_cached_data(vp));
2111 VERIFY(vp->v_path != NULL);
2114 * XXX - This really belongs in vn_reinit(), but we have some issues
2115 * with the counts. Best to have it here for clean initialization.
2117 vp->v_rdcnt = 0;
2118 vp->v_wrcnt = 0;
2119 vp->v_mmap_read = 0;
2120 vp->v_mmap_write = 0;
2123 * If FEM was in use, make sure everything gets cleaned up
2124 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2125 * constructor.
2127 if (vp->v_femhead) {
2128 /* XXX - There should be a free_femhead() that does all this */
2129 ASSERT(vp->v_femhead->femh_list == NULL);
2130 mutex_destroy(&vp->v_femhead->femh_lock);
2131 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2132 vp->v_femhead = NULL;
2134 if (vp->v_path != vn_vpath_empty) {
2135 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2136 vp->v_path = vn_vpath_empty;
2138 vp->v_path_stamp = 0;
2140 if (vp->v_fopdata != NULL) {
2141 free_fopdata(vp);
2143 vp->v_mpssdata = NULL;
2144 vsd_free(vp);
2148 * Used to reset the vnode fields including those that are directly accessible
2149 * as well as those which require an accessor function.
2151 * Does not initialize:
2152 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2153 * v_data (since FS-nodes and vnodes point to each other and should
2154 * be updated simultaneously)
2155 * v_op (in case someone needs to make a VOP call on this object)
2157 void
2158 vn_reinit(vnode_t *vp)
2160 vp->v_count = 1;
2161 vp->v_count_dnlc = 0;
2162 vp->v_vfsp = NULL;
2163 vp->v_stream = NULL;
2164 vp->v_vfsmountedhere = NULL;
2165 vp->v_flag = 0;
2166 vp->v_type = VNON;
2167 vp->v_rdev = NODEV;
2169 vp->v_filocks = NULL;
2170 vp->v_shrlocks = NULL;
2171 VERIFY(!vn_has_cached_data(vp));
2173 vp->v_locality = NULL;
2174 vp->v_xattrdir = NULL;
2177 * In a few specific instances, vn_reinit() is used to initialize
2178 * locally defined vnode_t instances. Lacking the construction offered
2179 * by vn_alloc(), these vnodes require v_path initialization.
2181 if (vp->v_path == NULL) {
2182 vp->v_path = vn_vpath_empty;
2185 /* Handles v_femhead, v_path, and the r/w/map counts */
2186 vn_recycle(vp);
2189 vnode_t *
2190 vn_alloc(int kmflag)
2192 vnode_t *vp;
2194 vp = kmem_cache_alloc(vn_cache, kmflag);
2196 if (vp != NULL) {
2197 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2198 vp->v_fopdata = NULL;
2199 vn_reinit(vp);
2202 return (vp);
2205 void
2206 vn_free(vnode_t *vp)
2208 ASSERT(vp->v_shrlocks == NULL);
2209 ASSERT(vp->v_filocks == NULL);
2212 * Some file systems call vn_free() with v_count of zero,
2213 * some with v_count of 1. In any case, the value should
2214 * never be anything else.
2216 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2217 ASSERT(vp->v_count_dnlc == 0);
2218 VERIFY(vp->v_path != NULL);
2219 if (vp->v_path != vn_vpath_empty) {
2220 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2221 vp->v_path = vn_vpath_empty;
2224 /* If FEM was in use, make sure everything gets cleaned up */
2225 if (vp->v_femhead) {
2226 /* XXX - There should be a free_femhead() that does all this */
2227 ASSERT(vp->v_femhead->femh_list == NULL);
2228 mutex_destroy(&vp->v_femhead->femh_lock);
2229 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2230 vp->v_femhead = NULL;
2233 if (vp->v_fopdata != NULL) {
2234 free_fopdata(vp);
2236 vp->v_mpssdata = NULL;
2237 vsd_free(vp);
2238 kmem_cache_free(vn_cache, vp);
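/*
 * Illustrative sketch: the life cycle a file system node typically follows
 * through the allocation interfaces above.  The names "myfs_ops", "myfs_node"
 * and "vfsp" are hypothetical; real file systems differ in detail.
 *
 *	vnode_t *vp = vn_alloc(KM_SLEEP);
 *	vn_setops(vp, &myfs_ops);
 *	vp->v_data = myfs_node;		(FS node and vnode reference each other)
 *	vp->v_vfsp = vfsp;
 *	...
 *	vn_free(vp);			(expects v_count of 0 or 1; frees v_path,
 *					 FEM data and VSD)
 *
 * File systems that keep their nodes cached call vn_recycle() or vn_reinit()
 * when reusing a vnode instead of going through the alloc/free pair.
 */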
2242 * vnode status changes, should define better states than 1, 0.
2244 void
2245 vn_reclaim(vnode_t *vp)
2247 vfs_t *vfsp = vp->v_vfsp;
2249 if (vfsp == NULL ||
2250 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2251 return;
2253 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2256 void
2257 vn_idle(vnode_t *vp)
2259 vfs_t *vfsp = vp->v_vfsp;
2261 if (vfsp == NULL ||
2262 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2263 return;
2265 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2267 void
2268 vn_exists(vnode_t *vp)
2270 vfs_t *vfsp = vp->v_vfsp;
2272 if (vfsp == NULL ||
2273 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2274 return;
2276 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2279 void
2280 vn_invalid(vnode_t *vp)
2282 vfs_t *vfsp = vp->v_vfsp;
2284 if (vfsp == NULL ||
2285 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2286 return;
2288 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2291 /* Vnode event notification */
2294 vnevent_support(vnode_t *vp, caller_context_t *ct)
2296 if (vp == NULL)
2297 return (EINVAL);
2299 return (fop_vnevent(vp, VE_SUPPORT, NULL, NULL, ct));
2302 void
2303 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2305 if (vp == NULL || vp->v_femhead == NULL) {
2306 return;
2308 (void) fop_vnevent(vp, VE_RENAME_SRC, dvp, name, ct);
2311 void
2312 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2313 caller_context_t *ct)
2315 if (vp == NULL || vp->v_femhead == NULL) {
2316 return;
2318 (void) fop_vnevent(vp, VE_RENAME_DEST, dvp, name, ct);
2321 void
2322 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2324 if (vp == NULL || vp->v_femhead == NULL) {
2325 return;
2327 (void) fop_vnevent(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2330 void
2331 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2333 if (vp == NULL || vp->v_femhead == NULL) {
2334 return;
2336 (void) fop_vnevent(vp, VE_REMOVE, dvp, name, ct);
2339 void
2340 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2342 if (vp == NULL || vp->v_femhead == NULL) {
2343 return;
2345 (void) fop_vnevent(vp, VE_RMDIR, dvp, name, ct);
2348 void
2349 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2350 caller_context_t *ct)
2352 if (vp == NULL || vp->v_femhead == NULL) {
2353 return;
2355 (void) fop_vnevent(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2358 void
2359 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2360 caller_context_t *ct)
2362 if (vp == NULL || vp->v_femhead == NULL) {
2363 return;
2365 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2368 void
2369 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2370 caller_context_t *ct)
2372 if (vp == NULL || vp->v_femhead == NULL) {
2373 return;
2375 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2378 void
2379 vnevent_create(vnode_t *vp, caller_context_t *ct)
2381 if (vp == NULL || vp->v_femhead == NULL) {
2382 return;
2384 (void) fop_vnevent(vp, VE_CREATE, NULL, NULL, ct);
2387 void
2388 vnevent_link(vnode_t *vp, caller_context_t *ct)
2390 if (vp == NULL || vp->v_femhead == NULL) {
2391 return;
2393 (void) fop_vnevent(vp, VE_LINK, NULL, NULL, ct);
2396 void
2397 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2399 if (vp == NULL || vp->v_femhead == NULL) {
2400 return;
2402 (void) fop_vnevent(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2405 void
2406 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2408 if (vp == NULL || vp->v_femhead == NULL) {
2409 return;
2411 (void) fop_vnevent(vp, VE_TRUNCATE, NULL, NULL, ct);
2415 * Vnode accessors.
2419 vn_is_readonly(vnode_t *vp)
2421 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2425 vn_has_flocks(vnode_t *vp)
2427 return (vp->v_filocks != NULL);
2431 vn_has_mandatory_locks(vnode_t *vp, int mode)
2433 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2437 vn_has_cached_data(vnode_t *vp)
2439 return (!list_is_empty(&vp->v_object.list));
2443 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2444 * zone_enter(2).
2447 vn_can_change_zones(vnode_t *vp)
2449 struct vfssw *vswp;
2450 int allow = 1;
2451 vnode_t *rvp;
2453 if (nfs_global_client_only != 0)
2454 return (1);
2457 * We always want to look at the underlying vnode if there is one.
2459 if (fop_realvp(vp, &rvp, NULL) != 0)
2460 rvp = vp;
2462 * Some pseudo filesystems (including doorfs) don't actually register
2463 * their vfsops_t, so the following may return NULL; we happily let
2464 * such vnodes switch zones.
2466 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2467 if (vswp != NULL) {
2468 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2469 allow = 0;
2470 vfs_unrefvfssw(vswp);
2472 return (allow);
2476 * Return nonzero if the vnode is a mount point, zero if not.
2479 vn_ismntpt(vnode_t *vp)
2481 return (vp->v_vfsmountedhere != NULL);
2484 /* Retrieve the vfs (if any) mounted on this vnode */
2485 vfs_t *
2486 vn_mountedvfs(vnode_t *vp)
2488 return (vp->v_vfsmountedhere);
2492 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2495 vn_in_dnlc(vnode_t *vp)
2497 return (vp->v_count_dnlc > 0);
2501 * vn_has_other_opens() checks whether a particular file is opened by more than
2502 * just the caller and whether the open is for read and/or write.
2503 * This routine is meant to be called after the caller has already called
2504 * fop_open() and wishes to know whether it is the only one with the file
2505 * open for the mode(s) specified.
2507 * Vnode counts are only kept on regular files (v_type=VREG).
2509 bool
2510 vn_has_other_opens(struct vnode *vp, v_mode_t mode)
2512 ASSERT(vp != NULL);
2514 switch (mode) {
2515 case V_WRITE:
2516 if (vp->v_wrcnt > 1)
2517 return true;
2518 break;
2519 case V_RDORWR:
2520 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2521 return true;
2522 break;
2523 case V_RDANDWR:
2524 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2525 return true;
2526 break;
2527 case V_READ:
2528 if (vp->v_rdcnt > 1)
2529 return true;
2530 break;
2533 return false;
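/*
 * Illustrative sketch: a caller that has already done its own fop_open() and
 * wants to refuse some optimization (for example handing out a delegation)
 * while anyone else has the file open for writing.  The surrounding logic and
 * the error chosen are assumptions.
 *
 *	if (vn_has_other_opens(vp, V_WRITE))
 *		return (EAGAIN);	(another writer exists; try again later)
 */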
2537 * vn_is_opened() checks whether a particular file is opened and
2538 * whether the open is for read and/or write.
2540 * Vnode counts are only kept on regular files (v_type=VREG).
2542 bool vn_is_opened(struct vnode *vp, v_mode_t mode)
2544 ASSERT(vp != NULL);
2546 switch (mode) {
2547 case V_WRITE:
2548 if (vp->v_wrcnt)
2549 return true;
2550 break;
2551 case V_RDANDWR:
2552 if (vp->v_rdcnt && vp->v_wrcnt)
2553 return true;
2554 break;
2555 case V_RDORWR:
2556 if (vp->v_rdcnt || vp->v_wrcnt)
2557 return true;
2558 break;
2559 case V_READ:
2560 if (vp->v_rdcnt)
2561 return true;
2562 break;
2565 return false;
2569 * vn_is_mapped() checks whether a particular file is mapped and whether
2570 * the file is mapped read and/or write.
2572 bool vn_is_mapped(struct vnode *vp, v_mode_t mode)
2574 ASSERT(vp != NULL);
2576 #if !defined(_LP64)
2577 switch (mode) {
2579 * The atomic_add_64_nv functions force atomicity in the
2580 * case of 32 bit architectures. Otherwise the 64 bit values
2581 * require two fetches. The value of the fields may be
2582 * (potentially) changed between the first fetch and the
2583 * second fetch.
2585 case V_WRITE:
2586 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2587 return true;
2588 break;
2589 case V_RDANDWR:
2590 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2591 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2592 return true;
2593 break;
2594 case V_RDORWR:
2595 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2596 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2597 return true;
2598 break;
2599 case V_READ:
2600 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2601 return true;
2602 break;
2604 #else
2605 switch (mode) {
2606 case V_WRITE:
2607 if (vp->v_mmap_write)
2608 return true;
2609 break;
2610 case V_RDANDWR:
2611 if (vp->v_mmap_read && vp->v_mmap_write)
2612 return true;
2613 break;
2614 case V_RDORWR:
2615 if (vp->v_mmap_read || vp->v_mmap_write)
2616 return true;
2617 break;
2618 case V_READ:
2619 if (vp->v_mmap_read)
2620 return true;
2621 break;
2623 #endif
2625 return false;
2629 * Set the operations vector for a vnode.
2631 void
2632 vn_setops(struct vnode *vnode, const struct vnodeops *ops)
2634 vnode->v_op = ops;
2638 * Retrieve the operations vector for a vnode
2640 const struct vnodeops *
2641 vn_getops(struct vnode *vnode)
2643 return vnode->v_op;
2647 * Returns non-zero (1) if the vnodeops matches that of the vnode.
2648 * Returns zero (0) if not.
2651 vn_matchops(struct vnode *vp, const struct vnodeops *vnodeops)
2653 return (vn_getops(vp) == vnodeops);
2657 * fs_new_caller_id() needs to return a unique ID on a given local system.
2658 * The IDs do not need to survive across reboots. These are primarily
2659 * used so that (FEM) monitors can detect particular callers (such as
2660 * the NFS server) to a given vnode/vfs operation.
2662 u_longlong_t
2663 fs_new_caller_id()
2665 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2667 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
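/*
 * Illustrative sketch: a monitor that wants to recognize its own calls to a
 * vnode operation can allocate an ID once and record it in the caller context
 * it passes down.  "my_ct" is a hypothetical caller_context_t, and the use of
 * its cc_caller_id field here is an assumption about the caller's needs.
 *
 *	static u_longlong_t my_caller_id;
 *
 *	if (my_caller_id == 0)
 *		my_caller_id = fs_new_caller_id();
 *	my_ct.cc_caller_id = my_caller_id;
 */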
2671 * The value stored in v_path is relative to rootdir, located in the global
2672 * zone. Zones or chroot environments which reside deeper inside the VFS
2673 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
2674 * what lies below their perceived root. In order to keep v_path usable for
2675 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
2677 * An upper bound of max_vnode_path is placed upon v_path allocations to
2678 * prevent the system from going too wild at the behest of pathological
2679 * behavior from the operator.
2681 size_t max_vnode_path = 4 * MAXPATHLEN;
2684 void
2685 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
2687 char *buf;
2689 mutex_enter(&vp->v_lock);
2691 * If the snapshot of v_path_stamp passed in via compare_stamp does not
2692 * match the present value on the vnode, it indicates that subsequent
2693 * changes have occurred. The v_path value is not cleared in this case
2694 * since the new value may be valid.
2696 if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
2697 mutex_exit(&vp->v_lock);
2698 return;
2700 buf = vp->v_path;
2701 vp->v_path = vn_vpath_empty;
2702 vp->v_path_stamp = 0;
2703 mutex_exit(&vp->v_lock);
2704 if (buf != vn_vpath_empty) {
2705 kmem_free(buf, strlen(buf) + 1);
2709 static void
2710 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
2711 boolean_t is_rename)
2713 char *buf, *oldbuf;
2714 hrtime_t pstamp;
2715 size_t baselen, buflen = 0;
2717 /* Handle the vn_setpath_str case. */
2718 if (pvp == NULL) {
2719 if (len + 1 > max_vnode_path) {
2720 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
2721 vnode_t *, vp, char *, name, size_t, len + 1);
2722 return;
2724 buf = kmem_alloc(len + 1, KM_SLEEP);
2725 bcopy(name, buf, len);
2726 buf[len] = '\0';
2728 mutex_enter(&vp->v_lock);
2729 oldbuf = vp->v_path;
2730 vp->v_path = buf;
2731 vp->v_path_stamp = gethrtime();
2732 mutex_exit(&vp->v_lock);
2733 if (oldbuf != vn_vpath_empty) {
2734 kmem_free(oldbuf, strlen(oldbuf) + 1);
2736 return;
2739 /* Take snapshot of parent dir */
2740 mutex_enter(&pvp->v_lock);
2742 if ((pvp->v_flag & VTRAVERSE) != 0) {
2744 * When the parent vnode has VTRAVERSE set in its flags, normal
2745 * assumptions about v_path calculation no longer apply. The
2746 * primary situation where this occurs is via the VFS tricks
2747 * which procfs plays in order to allow /proc/PID/(root|cwd) to
2748 * yield meaningful results.
2750 * When this flag is set, v_path on the child must not be
2751 * updated since the calculated value is likely to be
2752 * incorrect, given the current context.
2754 mutex_exit(&pvp->v_lock);
2755 return;
2758 retrybuf:
2759 if (pvp->v_path == vn_vpath_empty) {
2761 * Without v_path from the parent directory, generating a child
2762 * path from the name is impossible.
2764 if (len > 0) {
2765 pstamp = pvp->v_path_stamp;
2766 mutex_exit(&pvp->v_lock);
2767 vn_clearpath(vp, pstamp);
2768 return;
2772 * The only feasible case here is where a NUL lookup is being
2773 * performed on rootdir prior to its v_path being populated.
2775 ASSERT(pvp->v_path_stamp == 0);
2776 baselen = 0;
2777 pstamp = 0;
2778 } else {
2779 pstamp = pvp->v_path_stamp;
2780 baselen = strlen(pvp->v_path);
2781 /* ignore a trailing slash if present */
2782 if (pvp->v_path[baselen - 1] == '/') {
2783 /* This should only be the case for rootdir */
2784 ASSERT(baselen == 1 && pvp == rootdir);
2785 baselen--;
2788 mutex_exit(&pvp->v_lock);
2790 if (buflen != 0) {
2791 /* Free the existing (mis-sized) buffer in case of retry */
2792 kmem_free(buf, buflen);
2794 /* base, '/', name and trailing NUL */
2795 buflen = baselen + len + 2;
2796 if (buflen > max_vnode_path) {
2797 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
2798 vnode_t *, vp, char *, name, size_t, buflen);
2799 return;
2801 buf = kmem_alloc(buflen, KM_SLEEP);
2803 mutex_enter(&pvp->v_lock);
2804 if (pvp->v_path_stamp != pstamp) {
2805 size_t vlen;
2808 * Since v_path_stamp changed on the parent, it is likely that
2809 * v_path has been altered as well. If the length does not
2810 * exactly match what was previously measured, the buffer
2811 * allocation must be repeated for proper sizing.
2813 if (pvp->v_path == vn_vpath_empty) {
2814 /* Give up if the parent lacks v_path */
2815 mutex_exit(&pvp->v_lock);
2816 kmem_free(buf, buflen);
2817 return;
2819 vlen = strlen(pvp->v_path);
2820 if (pvp->v_path[vlen - 1] == '/') {
2821 vlen--;
2823 if (vlen != baselen) {
2824 goto retrybuf;
2827 bcopy(pvp->v_path, buf, baselen);
2828 mutex_exit(&pvp->v_lock);
2830 buf[baselen] = '/';
2831 baselen++;
2832 bcopy(name, &buf[baselen], len + 1);
2834 mutex_enter(&vp->v_lock);
2835 if (vp->v_path_stamp == 0) {
2836 /* never-visited vnode can inherit stamp from parent */
2837 ASSERT(vp->v_path == vn_vpath_empty);
2838 vp->v_path_stamp = pstamp;
2839 vp->v_path = buf;
2840 mutex_exit(&vp->v_lock);
2841 } else if (vp->v_path_stamp < pstamp || is_rename) {
2843 * Install the updated path and stamp, ensuring that the v_path
2844 * pointer is valid at all times for dtrace.
2846 oldbuf = vp->v_path;
2847 vp->v_path = buf;
2848 vp->v_path_stamp = gethrtime();
2849 mutex_exit(&vp->v_lock);
2850 kmem_free(oldbuf, strlen(oldbuf) + 1);
2851 } else {
2853 * If the timestamp matches or is greater, it means another
2854 * thread performed the update first while locks were dropped
2855 * here to make the allocation. We defer to the newer value.
2857 mutex_exit(&vp->v_lock);
2858 kmem_free(buf, buflen);
2860 ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
2863 void
2864 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
2866 size_t len;
2869 * If the parent is older or empty, there's nothing further to do.
2871 if (pvp->v_path == vn_vpath_empty ||
2872 pvp->v_path_stamp <= vp->v_path_stamp) {
2873 return;
2877 * Given the lack of appropriate context, meaningful updates to v_path
2878 * cannot be made during lookups for the '.' or '..' entries.
2880 len = strlen(name);
2881 if (len == 0 || (len == 1 && name[0] == '.') ||
2882 (len == 2 && name[0] == '.' && name[1] == '.')) {
2883 return;
2886 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2890 * Given a starting vnode and a path, updates the path in the target vnode in
2891 * a safe manner. If the vnode already has path information embedded, then the
2892 * cached path is left untouched.
2894 /* ARGSUSED */
2895 void
2896 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
2897 size_t len)
2899 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2903 * Sets the path to the vnode to be the given string, regardless of current
2904 * context. The string must be a complete path from rootdir. This is only used
2905 * by fsop_root() for setting the path based on the mountpoint.
2907 void
2908 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
2910 vn_setpath_common(NULL, vp, str, len, B_FALSE);
2914 * Called from within filesystem's vop_rename() to handle renames once the
2915 * target vnode is available.
2917 void
2918 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
2920 vn_setpath_common(pvp, vp, name, len, B_TRUE);
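/*
 * Illustrative sketch: a file system's rename entry point, once it holds the
 * target vnode, refreshes the cached path so later consumers of v_path see
 * the new name.  "tdvp", "tvp" and "tnm" stand for the usual rename arguments
 * and are otherwise hypothetical.
 *
 *	...directory entries have been switched by the file system...
 *	vn_renamepath(tdvp, tvp, tnm, strlen(tnm));
 */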
2924 * Similar to vn_setpath_str(), this function sets the path of the destination
2925 * vnode to the be the same as the source vnode.
2927 void
2928 vn_copypath(struct vnode *src, struct vnode *dst)
2930 char *buf;
2931 hrtime_t stamp;
2932 size_t buflen;
2934 mutex_enter(&src->v_lock);
2935 if (src->v_path == vn_vpath_empty) {
2936 mutex_exit(&src->v_lock);
2937 return;
2939 buflen = strlen(src->v_path) + 1;
2940 mutex_exit(&src->v_lock);
2942 buf = kmem_alloc(buflen, KM_SLEEP);
2944 mutex_enter(&src->v_lock);
2945 if (src->v_path == vn_vpath_empty ||
2946 strlen(src->v_path) + 1 != buflen) {
2947 mutex_exit(&src->v_lock);
2948 kmem_free(buf, buflen);
2949 return;
2951 bcopy(src->v_path, buf, buflen);
2952 stamp = src->v_path_stamp;
2953 mutex_exit(&src->v_lock);
2955 mutex_enter(&dst->v_lock);
2956 if (dst->v_path != vn_vpath_empty) {
2957 mutex_exit(&dst->v_lock);
2958 kmem_free(buf, buflen);
2959 return;
2961 dst->v_path = buf;
2962 dst->v_path_stamp = stamp;
2963 mutex_exit(&dst->v_lock);
2968 * XXX Private interface for segvn routines that handle vnode
2969 * large page segments.
2971 * return 1 if vp's file system fop_pageio() implementation
2972 * can be safely used instead of fop_getpage() for handling
2973 * pagefaults against regular non-swap files. The fop_pageio()
2974 * interface is considered safe here if its implementation
2975 * is very close to the fop_getpage() implementation.
2976 * e.g. it zeroes out the part of the page beyond EOF, doesn't
2977 * panic if there are file holes but instead returns an error,
2978 * and doesn't assume the file won't be changed by user writes, etc.
2980 * return 0 otherwise.
2982 * For now allow segvn to only use fop_pageio() with ufs and nfs.
2985 vn_vmpss_usepageio(vnode_t *vp)
2987 vfs_t *vfsp = vp->v_vfsp;
2988 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
2989 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
2990 char **fsok = pageio_ok_fss;
2992 if (fsname == NULL) {
2993 return (0);
2996 for (; *fsok; fsok++) {
2997 if (strcmp(*fsok, fsname) == 0) {
2998 return (1);
3001 return (0);
3004 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3007 fop_open(
3008 vnode_t **vpp,
3009 int mode,
3010 cred_t *cr,
3011 caller_context_t *ct)
3013 int ret;
3014 vnode_t *vp = *vpp;
3016 VN_HOLD(vp);
3018 * Adding to the vnode counts before calling open
3019 * avoids the need for a mutex. It circumvents a race
3020 * condition where a query made on the vnode counts results in a
3021 * false negative. The inquirer goes away believing the file is
3022 * not open when there is an open on the file already under way.
3024 * The counts are meant to prevent NFS from granting a delegation
3025 * when it would be dangerous to do so.
3027 * The vnode counts are only kept on regular files
3029 if ((*vpp)->v_type == VREG) {
3030 if (mode & FREAD)
3031 atomic_inc_32(&(*vpp)->v_rdcnt);
3032 if (mode & FWRITE)
3033 atomic_inc_32(&(*vpp)->v_wrcnt);
3036 VOPXID_MAP_CR(vp, cr);
3038 ret = fop_open_dispatch(vpp, mode, cr, ct, true);
3040 if (ret) {
3042 * Use the saved vp just in case the vnode ptr got trashed
3043 * by the error.
3045 VOPSTATS_UPDATE(vp, open);
3046 if ((vp->v_type == VREG) && (mode & FREAD))
3047 atomic_dec_32(&vp->v_rdcnt);
3048 if ((vp->v_type == VREG) && (mode & FWRITE))
3049 atomic_dec_32(&vp->v_wrcnt);
3050 } else {
3052 * Some filesystems will return a different vnode,
3053 * but the same path was still used to open it.
3054 * So if we do change the vnode and need to
3055 * copy over the path, do so here, rather than special
3056 * casing each filesystem. Adjust the vnode counts to
3057 * reflect the vnode switch.
3059 VOPSTATS_UPDATE(*vpp, open);
3060 if (*vpp != vp && *vpp != NULL) {
3061 vn_copypath(vp, *vpp);
3062 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3063 atomic_inc_32(&(*vpp)->v_rdcnt);
3064 if ((vp->v_type == VREG) && (mode & FREAD))
3065 atomic_dec_32(&vp->v_rdcnt);
3066 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3067 atomic_inc_32(&(*vpp)->v_wrcnt);
3068 if ((vp->v_type == VREG) && (mode & FWRITE))
3069 atomic_dec_32(&vp->v_wrcnt);
3072 VN_RELE(vp);
3073 return (ret);
3077 fop_close(
3078 vnode_t *vp,
3079 int flag,
3080 int count,
3081 offset_t offset,
3082 cred_t *cr,
3083 caller_context_t *ct)
3085 int err;
3087 VOPXID_MAP_CR(vp, cr);
3089 err = fop_close_dispatch(vp, flag, count, offset, cr, ct, true);
3091 VOPSTATS_UPDATE(vp, close);
3093 * Check passed in count to handle possible dups. Vnode counts are only
3094 * kept on regular files
3096 if ((vp->v_type == VREG) && (count == 1)) {
3097 if (flag & FREAD) {
3098 ASSERT(vp->v_rdcnt > 0);
3099 atomic_dec_32(&vp->v_rdcnt);
3101 if (flag & FWRITE) {
3102 ASSERT(vp->v_wrcnt > 0);
3103 atomic_dec_32(&vp->v_wrcnt);
3106 return (err);
3110 fop_read(
3111 vnode_t *vp,
3112 uio_t *uiop,
3113 int ioflag,
3114 cred_t *cr,
3115 caller_context_t *ct)
3117 int err;
3118 ssize_t resid_start = uiop->uio_resid;
3120 VOPXID_MAP_CR(vp, cr);
3122 err = fop_read_dispatch(vp, uiop, ioflag, cr, ct, true);
3124 VOPSTATS_UPDATE_IO(vp, read,
3125 read_bytes, (resid_start - uiop->uio_resid));
3126 return (err);
3130 fop_write(
3131 vnode_t *vp,
3132 uio_t *uiop,
3133 int ioflag,
3134 cred_t *cr,
3135 caller_context_t *ct)
3137 int err;
3138 ssize_t resid_start = uiop->uio_resid;
3140 VOPXID_MAP_CR(vp, cr);
3142 err = fop_write_dispatch(vp, uiop, ioflag, cr, ct, true);
3144 VOPSTATS_UPDATE_IO(vp, write,
3145 write_bytes, (resid_start - uiop->uio_resid));
3146 return (err);
3150 fop_ioctl(
3151 vnode_t *vp,
3152 int cmd,
3153 intptr_t arg,
3154 int flag,
3155 cred_t *cr,
3156 int *rvalp,
3157 caller_context_t *ct)
3159 int err;
3161 VOPXID_MAP_CR(vp, cr);
3163 err = fop_ioctl_dispatch(vp, cmd, arg, flag, cr, rvalp, ct, true);
3165 VOPSTATS_UPDATE(vp, ioctl);
3166 return (err);
3170 fop_setfl(
3171 vnode_t *vp,
3172 int oflags,
3173 int nflags,
3174 cred_t *cr,
3175 caller_context_t *ct)
3177 int err;
3179 VOPXID_MAP_CR(vp, cr);
3181 err = fop_setfl_dispatch(vp, oflags, nflags, cr, ct, true);
3183 VOPSTATS_UPDATE(vp, setfl);
3184 return (err);
3188 fop_getattr(
3189 vnode_t *vp,
3190 vattr_t *vap,
3191 int flags,
3192 cred_t *cr,
3193 caller_context_t *ct)
3195 int err;
3197 VOPXID_MAP_CR(vp, cr);
3200 * If this file system doesn't understand the xvattr extensions
3201 * then turn off the xvattr bit.
3203 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3204 vap->va_mask &= ~VATTR_XVATTR;
3208 * We're only allowed to skip the ACL check iff we used a 32 bit
3209 * ACE mask with fop_access() to determine permissions.
3211 if ((flags & ATTR_NOACLCHECK) &&
3212 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3213 return (EINVAL);
3215 err = fop_getattr_dispatch(vp, vap, flags, cr, ct, true);
3217 VOPSTATS_UPDATE(vp, getattr);
3218 return (err);
3222 fop_setattr(
3223 vnode_t *vp,
3224 vattr_t *vap,
3225 int flags,
3226 cred_t *cr,
3227 caller_context_t *ct)
3229 int err;
3231 VOPXID_MAP_CR(vp, cr);
3234 * If this file system doesn't understand the xvattr extensions
3235 * then turn off the xvattr bit.
3237 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3238 vap->va_mask &= ~VATTR_XVATTR;
3242 * We're only allowed to skip the ACL check iff we used a 32 bit
3243 * ACE mask with fop_access() to determine permissions.
3245 if ((flags & ATTR_NOACLCHECK) &&
3246 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3247 return (EINVAL);
3249 err = fop_setattr_dispatch(vp, vap, flags, cr, ct, true);
3251 VOPSTATS_UPDATE(vp, setattr);
3252 return (err);
3256 fop_access(
3257 vnode_t *vp,
3258 int mode,
3259 int flags,
3260 cred_t *cr,
3261 caller_context_t *ct)
3263 int err;
3265 if ((flags & V_ACE_MASK) &&
3266 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3267 return (EINVAL);
3270 VOPXID_MAP_CR(vp, cr);
3272 err = fop_access_dispatch(vp, mode, flags, cr, ct, true);
3274 VOPSTATS_UPDATE(vp, access);
3275 return (err);
3279 fop_lookup(
3280 vnode_t *dvp,
3281 char *nm,
3282 vnode_t **vpp,
3283 pathname_t *pnp,
3284 int flags,
3285 vnode_t *rdir,
3286 cred_t *cr,
3287 caller_context_t *ct,
3288 int *deflags, /* Returned per-dirent flags */
3289 pathname_t *ppnp) /* Returned case-preserved name in directory */
3291 int ret;
3294 * If this file system doesn't support case-insensitive access
3295 * and said access is requested, fail quickly. It is required
3296 * that if the vfs supports case-insensitive lookup, it also
3297 * supports extended dirent flags.
3299 if (flags & FIGNORECASE &&
3300 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3301 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3302 return (EINVAL);
3304 VOPXID_MAP_CR(dvp, cr);
3306 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3307 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3308 } else {
3309 ret = fop_lookup_dispatch(dvp, nm, vpp, pnp, flags, rdir, cr,
3310 ct, deflags, ppnp, true);
3313 if (ret == 0 && *vpp) {
3314 VOPSTATS_UPDATE(*vpp, lookup);
3315 vn_updatepath(dvp, *vpp, nm);
3318 return (ret);
3322 fop_create(
3323 vnode_t *dvp,
3324 char *name,
3325 vattr_t *vap,
3326 vcexcl_t excl,
3327 int mode,
3328 vnode_t **vpp,
3329 cred_t *cr,
3330 int flags,
3331 caller_context_t *ct,
3332 vsecattr_t *vsecp) /* ACL to set during create */
3334 int ret;
3336 if (vsecp != NULL &&
3337 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3338 return (EINVAL);
3341 * If this file system doesn't support case-insensitive access
3342 * and said access is requested, fail quickly.
3344 if (flags & FIGNORECASE &&
3345 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3346 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3347 return (EINVAL);
3349 VOPXID_MAP_CR(dvp, cr);
3351 ret = fop_create_dispatch(dvp, name, vap, excl, mode, vpp, cr, flags,
3352 ct, vsecp, true);
3354 if (ret == 0 && *vpp) {
3355 VOPSTATS_UPDATE(*vpp, create);
3356 vn_updatepath(dvp, *vpp, name);
3359 return (ret);
3363 fop_remove(
3364 vnode_t *dvp,
3365 char *nm,
3366 cred_t *cr,
3367 caller_context_t *ct,
3368 int flags)
3370 int err;
3373 * If this file system doesn't support case-insensitive access
3374 * and said access is requested, fail quickly.
3376 if (flags & FIGNORECASE &&
3377 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3378 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3379 return (EINVAL);
3381 VOPXID_MAP_CR(dvp, cr);
3383 err = fop_remove_dispatch(dvp, nm, cr, ct, flags, true);
3385 VOPSTATS_UPDATE(dvp, remove);
3386 return (err);
3390 fop_link(
3391 vnode_t *tdvp,
3392 vnode_t *svp,
3393 char *tnm,
3394 cred_t *cr,
3395 caller_context_t *ct,
3396 int flags)
3398 int err;
3401 * If the target file system doesn't support case-insensitive access
3402 * and said access is requested, fail quickly.
3404 if (flags & FIGNORECASE &&
3405 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3406 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3407 return (EINVAL);
3409 VOPXID_MAP_CR(tdvp, cr);
3411 err = fop_link_dispatch(tdvp, svp, tnm, cr, ct, flags, true);
3413 VOPSTATS_UPDATE(tdvp, link);
3414 return (err);
3418 fop_rename(
3419 vnode_t *sdvp,
3420 char *snm,
3421 vnode_t *tdvp,
3422 char *tnm,
3423 cred_t *cr,
3424 caller_context_t *ct,
3425 int flags)
3427 int err;
3430 * If the file system involved does not support
3431 * case-insensitive access and said access is requested, fail
3432 * quickly.
3434 if (flags & FIGNORECASE &&
3435 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3436 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3437 return (EINVAL);
3439 VOPXID_MAP_CR(tdvp, cr);
3441 err = fop_rename_dispatch(sdvp, snm, tdvp, tnm, cr, ct, flags, true);
3443 VOPSTATS_UPDATE(sdvp, rename);
3444 return (err);
3448 fop_mkdir(
3449 vnode_t *dvp,
3450 char *dirname,
3451 vattr_t *vap,
3452 vnode_t **vpp,
3453 cred_t *cr,
3454 caller_context_t *ct,
3455 int flags,
3456 vsecattr_t *vsecp) /* ACL to set during create */
3458 int ret;
3460 if (vsecp != NULL &&
3461 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3462 return (EINVAL);
3465 * If this file system doesn't support case-insensitive access
3466 * and said access is requested, fail quickly.
3468 if (flags & FIGNORECASE &&
3469 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3470 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3471 return (EINVAL);
3473 VOPXID_MAP_CR(dvp, cr);
3475 ret = fop_mkdir_dispatch(dvp, dirname, vap, vpp, cr, ct, flags, vsecp,
3476 true);
3478 if (ret == 0 && *vpp) {
3479 VOPSTATS_UPDATE(*vpp, mkdir);
3480 vn_updatepath(dvp, *vpp, dirname);
3483 return (ret);
3487 fop_rmdir(
3488 vnode_t *dvp,
3489 char *nm,
3490 vnode_t *cdir,
3491 cred_t *cr,
3492 caller_context_t *ct,
3493 int flags)
3495 int err;
3498 * If this file system doesn't support case-insensitive access
3499 * and said access is requested, fail quickly.
3501 if (flags & FIGNORECASE &&
3502 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3503 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3504 return (EINVAL);
3506 VOPXID_MAP_CR(dvp, cr);
3508 err = fop_rmdir_dispatch(dvp, nm, cdir, cr, ct, flags, true);
3510 VOPSTATS_UPDATE(dvp, rmdir);
3511 return (err);
3515 fop_readdir(
3516 vnode_t *vp,
3517 uio_t *uiop,
3518 cred_t *cr,
3519 int *eofp,
3520 caller_context_t *ct,
3521 int flags)
3523 int err;
3524 ssize_t resid_start = uiop->uio_resid;
3527 * If this file system doesn't support retrieving directory
3528 * entry flags and said access is requested, fail quickly.
3530 if (flags & V_RDDIR_ENTFLAGS &&
3531 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3532 return (EINVAL);
3534 VOPXID_MAP_CR(vp, cr);
3536 err = fop_readdir_dispatch(vp, uiop, cr, eofp, ct, flags, true);
3538 VOPSTATS_UPDATE_IO(vp, readdir,
3539 readdir_bytes, (resid_start - uiop->uio_resid));
3540 return (err);
3544 fop_symlink(
3545 vnode_t *dvp,
3546 char *linkname,
3547 vattr_t *vap,
3548 char *target,
3549 cred_t *cr,
3550 caller_context_t *ct,
3551 int flags)
3553 int err;
3554 xvattr_t xvattr;
3557 * If this file system doesn't support case-insensitive access
3558 * and said access is requested, fail quickly.
3560 if (flags & FIGNORECASE &&
3561 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3562 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3563 return (EINVAL);
3565 VOPXID_MAP_CR(dvp, cr);
3567 /* check for reparse point */
3568 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3569 (strncmp(target, FS_REPARSE_TAG_STR,
3570 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3571 if (!fs_reparse_mark(target, vap, &xvattr))
3572 vap = (vattr_t *)&xvattr;
3575 err = fop_symlink_dispatch(dvp, linkname, vap, target, cr, ct, flags,
3576 true);
3578 VOPSTATS_UPDATE(dvp, symlink);
3579 return (err);
3583 fop_readlink(
3584 vnode_t *vp,
3585 uio_t *uiop,
3586 cred_t *cr,
3587 caller_context_t *ct)
3589 int err;
3591 VOPXID_MAP_CR(vp, cr);
3593 err = fop_readlink_dispatch(vp, uiop, cr, ct, true);
3595 VOPSTATS_UPDATE(vp, readlink);
3596 return (err);
3600 fop_fsync(
3601 vnode_t *vp,
3602 int syncflag,
3603 cred_t *cr,
3604 caller_context_t *ct)
3606 int err;
3608 VOPXID_MAP_CR(vp, cr);
3610 err = fop_fsync_dispatch(vp, syncflag, cr, ct, true);
3612 VOPSTATS_UPDATE(vp, fsync);
3613 return (err);
3616 void
3617 fop_inactive(
3618 vnode_t *vp,
3619 cred_t *cr,
3620 caller_context_t *ct)
3622 /* Need to update stats before vop call since we may lose the vnode */
3623 VOPSTATS_UPDATE(vp, inactive);
3625 VOPXID_MAP_CR(vp, cr);
3627 fop_inactive_dispatch(vp, cr, ct, true);
3631 fop_fid(
3632 vnode_t *vp,
3633 fid_t *fidp,
3634 caller_context_t *ct)
3636 int err;
3638 err = fop_fid_dispatch(vp, fidp, ct, true);
3640 VOPSTATS_UPDATE(vp, fid);
3641 return (err);
3645 fop_rwlock(
3646 vnode_t *vp,
3647 int write_lock,
3648 caller_context_t *ct)
3650 int ret;
3652 ret = fop_rwlock_dispatch(vp, write_lock, ct, true);
3654 VOPSTATS_UPDATE(vp, rwlock);
3655 return (ret);
3658 void
3659 fop_rwunlock(
3660 vnode_t *vp,
3661 int write_lock,
3662 caller_context_t *ct)
3664 fop_rwunlock_dispatch(vp, write_lock, ct, true);
3666 VOPSTATS_UPDATE(vp, rwunlock);
3670 fop_seek(
3671 vnode_t *vp,
3672 offset_t ooff,
3673 offset_t *noffp,
3674 caller_context_t *ct)
3676 int err;
3678 err = fop_seek_dispatch(vp, ooff, noffp, ct, true);
3680 VOPSTATS_UPDATE(vp, seek);
3681 return (err);
3685 fop_cmp(
3686 vnode_t *vp1,
3687 vnode_t *vp2,
3688 caller_context_t *ct)
3690 int err;
3692 err = fop_cmp_dispatch(vp1, vp2, ct, true);
3694 VOPSTATS_UPDATE(vp1, cmp);
3695 return (err);
3699 fop_frlock(
3700 vnode_t *vp,
3701 int cmd,
3702 flock64_t *bfp,
3703 int flag,
3704 offset_t offset,
3705 struct flk_callback *flk_cbp,
3706 cred_t *cr,
3707 caller_context_t *ct)
3709 int err;
3711 VOPXID_MAP_CR(vp, cr);
3713 err = fop_frlock_dispatch(vp, cmd, bfp, flag, offset, flk_cbp, cr,
3714 ct, true);
3716 VOPSTATS_UPDATE(vp, frlock);
3717 return (err);
3721 fop_space(
3722 vnode_t *vp,
3723 int cmd,
3724 flock64_t *bfp,
3725 int flag,
3726 offset_t offset,
3727 cred_t *cr,
3728 caller_context_t *ct)
3730 int err;
3732 VOPXID_MAP_CR(vp, cr);
3734 err = fop_space_dispatch(vp, cmd, bfp, flag, offset, cr, ct, true);
3736 VOPSTATS_UPDATE(vp, space);
3737 return (err);
3741 fop_realvp(
3742 vnode_t *vp,
3743 vnode_t **vpp,
3744 caller_context_t *ct)
3746 int err;
3748 err = fop_realvp_dispatch(vp, vpp, ct, true);
3750 VOPSTATS_UPDATE(vp, realvp);
3751 return (err);
3755 fop_getpage(
3756 vnode_t *vp,
3757 offset_t off,
3758 size_t len,
3759 uint_t *protp,
3760 page_t **plarr,
3761 size_t plsz,
3762 struct seg *seg,
3763 caddr_t addr,
3764 enum seg_rw rw,
3765 cred_t *cr,
3766 caller_context_t *ct)
3768 int err;
3770 VOPXID_MAP_CR(vp, cr);
3772 err = fop_getpage_dispatch(vp, off, len, protp, plarr, plsz, seg,
3773 addr, rw, cr, ct, true);
3775 VOPSTATS_UPDATE(vp, getpage);
3776 return (err);
3780 fop_putpage(
3781 vnode_t *vp,
3782 offset_t off,
3783 size_t len,
3784 int flags,
3785 cred_t *cr,
3786 caller_context_t *ct)
3788 int err;
3790 VOPXID_MAP_CR(vp, cr);
3792 err = fop_putpage_dispatch(vp, off, len, flags, cr, ct, true);
3794 VOPSTATS_UPDATE(vp, putpage);
3795 return (err);
3799 fop_map(
3800 vnode_t *vp,
3801 offset_t off,
3802 struct as *as,
3803 caddr_t *addrp,
3804 size_t len,
3805 uchar_t prot,
3806 uchar_t maxprot,
3807 uint_t flags,
3808 cred_t *cr,
3809 caller_context_t *ct)
3811 int err;
3813 VOPXID_MAP_CR(vp, cr);
3815 err = fop_map_dispatch(vp, off, as, addrp, len, prot, maxprot,
3816 flags, cr, ct, true);
3818 VOPSTATS_UPDATE(vp, map);
3819 return (err);
3823 fop_addmap(
3824 vnode_t *vp,
3825 offset_t off,
3826 struct as *as,
3827 caddr_t addr,
3828 size_t len,
3829 uchar_t prot,
3830 uchar_t maxprot,
3831 uint_t flags,
3832 cred_t *cr,
3833 caller_context_t *ct)
3835 int error;
3836 u_longlong_t delta;
3838 VOPXID_MAP_CR(vp, cr);
3840 error = fop_addmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3841 flags, cr, ct, true);
3843 if ((!error) && (vp->v_type == VREG)) {
3844 delta = (u_longlong_t)btopr(len);
3846 * If file is declared MAP_PRIVATE, it can't be written back
3847 * even if open for write. Handle as read.
3849 if (flags & MAP_PRIVATE) {
3850 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3851 (int64_t)delta);
3852 } else {
3854 * atomic_add_64 forces the fetch of a 64 bit value to
3855 * be atomic on 32 bit machines
3857 if (maxprot & PROT_WRITE)
3858 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3859 (int64_t)delta);
3860 if (maxprot & PROT_READ)
3861 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3862 (int64_t)delta);
3863 if (maxprot & PROT_EXEC)
3864 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3865 (int64_t)delta);
3868 VOPSTATS_UPDATE(vp, addmap);
3869 return (error);
3873 fop_delmap(
3874 vnode_t *vp,
3875 offset_t off,
3876 struct as *as,
3877 caddr_t addr,
3878 size_t len,
3879 uint_t prot,
3880 uint_t maxprot,
3881 uint_t flags,
3882 cred_t *cr,
3883 caller_context_t *ct)
3885 int error;
3886 u_longlong_t delta;
3888 VOPXID_MAP_CR(vp, cr);
3890 error = fop_delmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3891 flags, cr, ct, true);
3894 * NFS calls into delmap twice: the first time
3895 * it simply establishes a callback mechanism and returns EAGAIN,
3896 * while the real work is done upon the second invocation.
3897 * We have to detect this here and only decrement the counts upon
3898 * the second delmap request.
3900 if ((error != EAGAIN) && (vp->v_type == VREG)) {
3902 delta = (u_longlong_t)btopr(len);
3904 if (flags & MAP_PRIVATE) {
3905 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3906 (int64_t)(-delta));
3907 } else {
3909 * atomic_add_64 forces the fetch of a 64 bit value
3910 * to be atomic on 32 bit machines
3912 if (maxprot & PROT_WRITE)
3913 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3914 (int64_t)(-delta));
3915 if (maxprot & PROT_READ)
3916 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3917 (int64_t)(-delta));
3918 if (maxprot & PROT_EXEC)
3919 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3920 (int64_t)(-delta));
3923 VOPSTATS_UPDATE(vp, delmap);
3924 return (error);
3929 fop_poll(
3930 vnode_t *vp,
3931 short events,
3932 int anyyet,
3933 short *reventsp,
3934 struct pollhead **phpp,
3935 caller_context_t *ct)
3937 int err;
3939 err = fop_poll_dispatch(vp, events, anyyet, reventsp, phpp, ct, true);
3941 VOPSTATS_UPDATE(vp, poll);
3942 return (err);
3946 fop_dump(
3947 vnode_t *vp,
3948 caddr_t addr,
3949 offset_t lbdn,
3950 offset_t dblks,
3951 caller_context_t *ct)
3953 int err;
3955 /* ensure lbdn and dblks can be passed safely to bdev_dump */
3956 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
3957 return (EIO);
3959 err = fop_dump_dispatch(vp, addr, lbdn, dblks, ct, true);
3961 VOPSTATS_UPDATE(vp, dump);
3962 return (err);
3966 fop_pathconf(
3967 vnode_t *vp,
3968 int cmd,
3969 ulong_t *valp,
3970 cred_t *cr,
3971 caller_context_t *ct)
3973 int err;
3975 VOPXID_MAP_CR(vp, cr);
3977 err = fop_pathconf_dispatch(vp, cmd, valp, cr, ct, true);
3979 VOPSTATS_UPDATE(vp, pathconf);
3980 return (err);
3984 fop_pageio(
3985 vnode_t *vp,
3986 struct page *pp,
3987 uoff_t io_off,
3988 size_t io_len,
3989 int flags,
3990 cred_t *cr,
3991 caller_context_t *ct)
3993 int err;
3995 VOPXID_MAP_CR(vp, cr);
3997 err = fop_pageio_dispatch(vp, pp, io_off, io_len, flags, cr, ct, true);
3999 VOPSTATS_UPDATE(vp, pageio);
4000 return (err);
4004 fop_dumpctl(
4005 vnode_t *vp,
4006 int action,
4007 offset_t *blkp,
4008 caller_context_t *ct)
4010 int err;
4012 err = fop_dumpctl_dispatch(vp, action, blkp, ct, true);
4014 VOPSTATS_UPDATE(vp, dumpctl);
4015 return (err);
4018 void
4019 fop_dispose(
4020 vnode_t *vp,
4021 page_t *pp,
4022 int flag,
4023 int dn,
4024 cred_t *cr,
4025 caller_context_t *ct)
4027 /* Must do stats first since it's possible to lose the vnode */
4028 VOPSTATS_UPDATE(vp, dispose);
4030 VOPXID_MAP_CR(vp, cr);
4032 fop_dispose_dispatch(vp, pp, flag, dn, cr, ct, true);
4036 fop_setsecattr(
4037 vnode_t *vp,
4038 vsecattr_t *vsap,
4039 int flag,
4040 cred_t *cr,
4041 caller_context_t *ct)
4043 int err;
4045 VOPXID_MAP_CR(vp, cr);
4048 * We're only allowed to skip the ACL check iff we used a 32 bit
4049 * ACE mask with fop_access() to determine permissions.
4051 if ((flag & ATTR_NOACLCHECK) &&
4052 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4053 return (EINVAL);
4056 err = fop_setsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4058 VOPSTATS_UPDATE(vp, setsecattr);
4059 return (err);
4063 fop_getsecattr(
4064 vnode_t *vp,
4065 vsecattr_t *vsap,
4066 int flag,
4067 cred_t *cr,
4068 caller_context_t *ct)
4070 int err;
4073 * We're only allowed to skip the ACL check iff we used a 32 bit
4074 * ACE mask with fop_access() to determine permissions.
4076 if ((flag & ATTR_NOACLCHECK) &&
4077 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4078 return (EINVAL);
4081 VOPXID_MAP_CR(vp, cr);
4083 err = fop_getsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4085 VOPSTATS_UPDATE(vp, getsecattr);
4086 return (err);
4090 fop_shrlock(
4091 vnode_t *vp,
4092 int cmd,
4093 struct shrlock *shr,
4094 int flag,
4095 cred_t *cr,
4096 caller_context_t *ct)
4098 int err;
4100 VOPXID_MAP_CR(vp, cr);
4102 err = fop_shrlock_dispatch(vp, cmd, shr, flag, cr, ct, true);
4104 VOPSTATS_UPDATE(vp, shrlock);
4105 return (err);
4109 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4110 caller_context_t *ct)
4112 int err;
4114 err = fop_vnevent_dispatch(vp, vnevent, dvp, fnm, ct, true);
4116 VOPSTATS_UPDATE(vp, vnevent);
4117 return (err);
4121 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4122 caller_context_t *ct)
4124 int err;
4126 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4127 return (ENOTSUP);
4129 err = fop_reqzcbuf_dispatch(vp, ioflag, uiop, cr, ct, true);
4131 VOPSTATS_UPDATE(vp, reqzcbuf);
4132 return (err);
4136 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4138 int err;
4140 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4141 return (ENOTSUP);
4143 err = fop_retzcbuf_dispatch(vp, uiop, cr, ct, true);
4145 VOPSTATS_UPDATE(vp, retzcbuf);
4146 return (err);
4150 * Default destructor
4151 * Needed because NULL destructor means that the key is unused
4153 /* ARGSUSED */
4154 void
4155 vsd_defaultdestructor(void *value)
4159 * Create a key (index into per vnode array)
4160 * Locks out vsd_create, vsd_destroy, and vsd_free
4161 * May allocate memory with lock held
4163 void
4164 vsd_create(uint_t *keyp, void (*destructor)(void *))
4166 int i;
4167 uint_t nkeys;
4170 * if key is allocated, do nothing
4172 mutex_enter(&vsd_lock);
4173 if (*keyp) {
4174 mutex_exit(&vsd_lock);
4175 return;
4178 * find an unused key
4180 if (destructor == NULL)
4181 destructor = vsd_defaultdestructor;
4183 for (i = 0; i < vsd_nkeys; ++i)
4184 if (vsd_destructor[i] == NULL)
4185 break;
4188 * if no unused keys, increase the size of the destructor array
4190 if (i == vsd_nkeys) {
4191 if ((nkeys = (vsd_nkeys << 1)) == 0)
4192 nkeys = 1;
4193 vsd_destructor =
4194 (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4195 (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4196 (size_t)(nkeys * sizeof (void (*)(void *))));
4197 vsd_nkeys = nkeys;
4201 * allocate the next available unused key
4203 vsd_destructor[i] = destructor;
4204 *keyp = i + 1;
4206 /* create vsd_list, if it doesn't exist */
4207 if (vsd_list == NULL) {
4208 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4209 list_create(vsd_list, sizeof (struct vsd_node),
4210 offsetof(struct vsd_node, vs_nodes));
4213 mutex_exit(&vsd_lock);
4217 * Destroy a key
4219 * Assumes that the caller is preventing vsd_set and vsd_get
4220 * Locks out vsd_create, vsd_destroy, and vsd_free
4221 * May free memory with lock held
4223 void
4224 vsd_destroy(uint_t *keyp)
4226 uint_t key;
4227 struct vsd_node *vsd;
4230 * protect the key namespace and our destructor lists
4232 mutex_enter(&vsd_lock);
4233 key = *keyp;
4234 *keyp = 0;
4236 ASSERT(key <= vsd_nkeys);
4239 * if the key is valid
4241 if (key != 0) {
4242 uint_t k = key - 1;
4244 * for every vnode with VSD, call key's destructor
4246 for (vsd = list_head(vsd_list); vsd != NULL;
4247 vsd = list_next(vsd_list, vsd)) {
4249 * no VSD for key in this vnode
4251 if (key > vsd->vs_nkeys)
4252 continue;
4254 * call destructor for key
4256 if (vsd->vs_value[k] && vsd_destructor[k])
4257 (*vsd_destructor[k])(vsd->vs_value[k]);
4259 * reset value for key
4261 vsd->vs_value[k] = NULL;
4264 * actually free the key (NULL destructor == unused)
4266 vsd_destructor[k] = NULL;
4269 mutex_exit(&vsd_lock);
4273 * Quickly return the per vnode value that was stored with the specified key
4274 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4275 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4277 void *
4278 vsd_get(vnode_t *vp, uint_t key)
4280 struct vsd_node *vsd;
4282 ASSERT(vp != NULL);
4283 ASSERT(mutex_owned(&vp->v_vsd_lock));
4285 vsd = vp->v_vsd;
4287 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4288 return (vsd->vs_value[key - 1]);
4289 return (NULL);
4293 * Set a per vnode value indexed with the specified key
4294 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4297 vsd_set(vnode_t *vp, uint_t key, void *value)
4299 struct vsd_node *vsd;
4301 ASSERT(vp != NULL);
4302 ASSERT(mutex_owned(&vp->v_vsd_lock));
4304 if (key == 0)
4305 return (EINVAL);
4307 vsd = vp->v_vsd;
4308 if (vsd == NULL)
4309 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4312 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4313 * code won't happen and we will continue down and allocate space for
4314 * the vs_value array.
4315 * If the caller is replacing one value with another, then it is up
4316 * to the caller to free/rele/destroy the previous value (if needed).
4318 if (key <= vsd->vs_nkeys) {
4319 vsd->vs_value[key - 1] = value;
4320 return (0);
4323 ASSERT(key <= vsd_nkeys);
4325 if (vsd->vs_nkeys == 0) {
4326 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4328 * Link onto list of all VSD nodes.
4330 list_insert_head(vsd_list, vsd);
4331 mutex_exit(&vsd_lock);
4335 * Allocate vnode local storage and set the value for key
4337 vsd->vs_value = vsd_realloc(vsd->vs_value,
4338 vsd->vs_nkeys * sizeof (void *),
4339 key * sizeof (void *));
4340 vsd->vs_nkeys = key;
4341 vsd->vs_value[key - 1] = value;
4343 return (0);
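/*
 * Illustrative sketch: typical use of the VSD interfaces.  A consumer creates
 * a key once, then stores and fetches its per-vnode value while holding
 * v_vsd_lock, as the comments above require.  "my_key", "my_dtor" and "data"
 * are hypothetical names.
 *
 *	static uint_t my_key;
 *
 *	vsd_create(&my_key, my_dtor);		(once, e.g. at module load)
 *	...
 *	mutex_enter(&vp->v_vsd_lock);
 *	(void) vsd_set(vp, my_key, data);
 *	data = vsd_get(vp, my_key);
 *	mutex_exit(&vp->v_vsd_lock);
 *	...
 *	vsd_destroy(&my_key);			(when the consumer goes away)
 */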
4347 * Called from vn_free() to run the destructor function for each vsd
4348 * Locks out vsd_create and vsd_destroy
4349 * Assumes that the destructor *DOES NOT* use vsd
4351 void
4352 vsd_free(vnode_t *vp)
4354 int i;
4355 struct vsd_node *vsd = vp->v_vsd;
4357 if (vsd == NULL)
4358 return;
4360 if (vsd->vs_nkeys == 0) {
4361 kmem_free(vsd, sizeof (*vsd));
4362 vp->v_vsd = NULL;
4363 return;
4367 * lock out vsd_create and vsd_destroy, call
4368 * the destructor, and mark the value as destroyed.
4370 mutex_enter(&vsd_lock);
4372 for (i = 0; i < vsd->vs_nkeys; i++) {
4373 if (vsd->vs_value[i] && vsd_destructor[i])
4374 (*vsd_destructor[i])(vsd->vs_value[i]);
4375 vsd->vs_value[i] = NULL;
4379 * remove from linked list of VSD nodes
4381 list_remove(vsd_list, vsd);
4383 mutex_exit(&vsd_lock);
4386 * free up the VSD
4388 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4389 kmem_free(vsd, sizeof (struct vsd_node));
4390 vp->v_vsd = NULL;
4394 * realloc
4396 static void *
4397 vsd_realloc(void *old, size_t osize, size_t nsize)
4399 void *new;
4401 new = kmem_zalloc(nsize, KM_SLEEP);
4402 if (old) {
4403 bcopy(old, new, osize);
4404 kmem_free(old, osize);
4406 return (new);
4410 * Set up the extensible system attribute for creating a reparse point.
4411 * The symlink data 'target' is validated for proper format of a reparse
4412 * string, and a check is also made to ensure the symlink data does not
4413 * point to an existing file.
4415 * return 0 if ok else -1.
4417 static int
4418 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4420 xoptattr_t *xoap;
4422 if ((!target) || (!vap) || (!xvattr))
4423 return (-1);
4425 /* validate reparse string */
4426 if (reparse_validate((const char *)target))
4427 return (-1);
4429 xva_init(xvattr);
4430 xvattr->xva_vattr = *vap;
4431 xvattr->xva_vattr.va_mask |= VATTR_XVATTR;
4432 xoap = xva_getxoptattr(xvattr);
4433 ASSERT(xoap);
4434 XVA_SET_REQ(xvattr, XAT_REPARSE);
4435 xoap->xoa_reparse = 1;
4437 return (0);
4441 * Function to check whether a symlink is a reparse point.
4442 * Return B_TRUE if it is a reparse point, else return B_FALSE
4444 boolean_t
4445 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4447 xvattr_t xvattr;
4448 xoptattr_t *xoap;
4450 if ((vp->v_type != VLNK) ||
4451 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4452 return (B_FALSE);
4454 xva_init(&xvattr);
4455 xoap = xva_getxoptattr(&xvattr);
4456 ASSERT(xoap);
4457 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4459 if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
4460 return (B_FALSE);
4462 if ((!(xvattr.xva_vattr.va_mask & VATTR_XVATTR)) ||
4463 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4464 return (B_FALSE);
4466 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
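/*
 * Illustrative sketch: a service that must treat reparse points specially
 * (for instance a file sharing server resolving symlinks) can use
 * vn_is_reparse() before following the link.  The error chosen below is a
 * hypothetical example, not mandated by this interface.
 *
 *	if (vp->v_type == VLNK && vn_is_reparse(vp, cr, NULL))
 *		return (EREMOTE);	(let the caller interpret the reparse data)
 */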