drop net-snmp dep
[unleashed.git] / kernel / fs / vnode.c
blob97ba98e76c810e6d38324f233063c663c90ad11c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * University Copyright- Copyright (c) 1982, 1986, 1988
34 * The Regents of the University of California
35 * All Rights Reserved
37 * University Acknowledgment- Portions of this document are derived from
38 * software developed by the University of California, Berkeley, and its
39 * contributors.
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/t_lock.h>
45 #include <sys/errno.h>
46 #include <sys/cred.h>
47 #include <sys/user.h>
48 #include <sys/uio.h>
49 #include <sys/file.h>
50 #include <sys/pathname.h>
51 #include <sys/atomic.h>
52 #include <sys/vfs.h>
53 #include <sys/vnode.h>
54 #include <sys/vnode_dispatch.h>
55 #include <sys/rwstlock.h>
56 #include <sys/fem.h>
57 #include <sys/stat.h>
58 #include <sys/mode.h>
59 #include <sys/conf.h>
60 #include <sys/sysmacros.h>
61 #include <sys/cmn_err.h>
62 #include <sys/systm.h>
63 #include <sys/kmem.h>
64 #include <sys/debug.h>
65 #include <sys/acl.h>
66 #include <sys/nbmlock.h>
67 #include <sys/fcntl.h>
68 #include <sys/fs_subr.h>
69 #include <sys/taskq.h>
70 #include <sys/fs_reparse.h>
71 #include <sys/time.h>
72 #include <sys/sdt.h>
/*
 * True if this vnode is an ordinary file on a read-only filesystem,
 * i.e. not a device/fifo special node and vn_is_readonly() holds.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
79 /* Tunable via /etc/system; used only by admin/install */
80 int nfs_global_client_only;
83 * Array of vopstats_t for per-FS-type vopstats. This array has the same
84 * number of entries as and parallel to the vfssw table. (Arguably, it could
85 * be part of the vfssw table.) Once it's initialized, it's accessed using
86 * the same fstype index that is used to index into the vfssw table.
88 vopstats_t **vopstats_fstype;
90 /* vopstats initialization template used for fast initialization via bcopy() */
91 static vopstats_t *vs_templatep;
93 /* Kmem cache handle for vsk_anchor_t allocations */
94 kmem_cache_t *vsk_anchor_cache;
96 /* file events cleanup routine */
97 extern void free_fopdata(vnode_t *);
100 * Root of AVL tree for the kstats associated with vopstats. Lock protects
101 * updates to vsktat_tree.
103 avl_tree_t vskstat_tree;
104 kmutex_t vskstat_tree_lock;
106 /* Global variable which enables/disables the vopstats collection */
107 int vopstats_enabled = 1;
109 /* Global used for empty/invalid v_path */
110 char *vn_vpath_empty = "";
113 * forward declarations for internal vnode specific data (vsd)
115 static void *vsd_realloc(void *, size_t, size_t);
118 * forward declarations for reparse point functions
120 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
123 * VSD -- VNODE SPECIFIC DATA
124 * The v_data pointer is typically used by a file system to store a
125 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
126 * However, there are times when additional project private data needs
127 * to be stored separately from the data (node) pointed to by v_data.
128 * This additional data could be stored by the file system itself or
129 * by a completely different kernel entity. VSD provides a way for
130 * callers to obtain a key and store a pointer to private data associated
131 * with a vnode.
133 * Callers are responsible for protecting the vsd by holding v_vsd_lock
134 * for calls to vsd_set() and vsd_get().
138 * vsd_lock protects:
139 * vsd_nkeys - creation and deletion of vsd keys
140 * vsd_list - insertion and deletion of vsd_node in the vsd_list
141 * vsd_destructor - adding and removing destructors to the list
143 static kmutex_t vsd_lock;
144 static uint_t vsd_nkeys; /* size of destructor array */
145 /* list of vsd_node's */
146 static list_t *vsd_list = NULL;
147 /* per-key destructor funcs */
148 static void (**vsd_destructor)(void *);
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 */
#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}
/* As VOPSTATS_UPDATE(), but also accumulates the bytes transferred. */
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
/*
 * If the filesystem does not support XIDs map credential.
 * If the vfsp is NULL, perhaps we should also map?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
}
211 * Convert stat(2) formats to vnode types and vice versa. (Knows about
212 * numerical order of S_IFMT and vnode types.)
214 enum vtype iftovt_tab[] = {
215 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
216 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
219 ushort_t vttoif_tab[] = {
220 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
221 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
225 * The system vnode cache.
228 kmem_cache_t *vn_cache;
231 /* Extensible attribute (xva) routines. */
234 * Zero out the structure, set the size of the requested/returned bitmaps,
235 * set VATTR_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
236 * to the returned attributes array.
238 void
239 xva_init(xvattr_t *xvap)
241 bzero(xvap, sizeof (xvattr_t));
242 xvap->xva_mapsize = XVA_MAPSIZE;
243 xvap->xva_magic = XVA_MAGIC;
244 xvap->xva_vattr.va_mask = VATTR_XVATTR;
245 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
249 * If VATTR_XVATTR is set, returns a pointer to the embedded xoptattr_t
250 * structure. Otherwise, returns NULL.
252 xoptattr_t *
253 xva_getxoptattr(xvattr_t *xvap)
255 xoptattr_t *xoap = NULL;
256 if (xvap->xva_vattr.va_mask & VATTR_XVATTR)
257 xoap = &xvap->xva_xoptattrs;
258 return (xoap);
262 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
263 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
264 * kstat name.
266 static int
267 vska_compar(const void *n1, const void *n2)
269 int ret;
270 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
271 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
273 if (p1 < p2) {
274 ret = -1;
275 } else if (p1 > p2) {
276 ret = 1;
277 } else {
278 ret = 0;
281 return (ret);
285 * Used to create a single template which will be bcopy()ed to a newly
286 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
288 static vopstats_t *
289 create_vopstats_template()
291 vopstats_t *vsp;
293 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
294 bzero(vsp, sizeof (*vsp)); /* Start fresh */
296 /* fop_open */
297 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
298 /* fop_close */
299 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
300 /* fop_read I/O */
301 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
302 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
303 /* fop_write I/O */
304 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
305 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
306 /* fop_ioctl */
307 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
308 /* fop_setfl */
309 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
310 /* fop_getattr */
311 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
312 /* fop_setattr */
313 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
314 /* fop_access */
315 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
316 /* fop_lookup */
317 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
318 /* fop_create */
319 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
320 /* fop_remove */
321 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
322 /* fop_link */
323 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
324 /* fop_rename */
325 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
326 /* fop_mkdir */
327 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
328 /* fop_rmdir */
329 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
330 /* fop_readdir I/O */
331 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
332 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
333 KSTAT_DATA_UINT64);
334 /* fop_symlink */
335 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
336 /* fop_readlink */
337 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
338 /* fop_fsync */
339 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
340 /* fop_inactive */
341 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
342 /* fop_fid */
343 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
344 /* fop_rwlock */
345 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
346 /* fop_rwunlock */
347 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
348 /* fop_seek */
349 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
350 /* fop_cmp */
351 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
352 /* fop_frlock */
353 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
354 /* fop_space */
355 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
356 /* fop_realvp */
357 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
358 /* fop_getpage */
359 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
360 /* fop_putpage */
361 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
362 /* fop_map */
363 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
364 /* fop_addmap */
365 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
366 /* fop_delmap */
367 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
368 /* fop_poll */
369 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
370 /* fop_dump */
371 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
372 /* fop_pathconf */
373 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
374 /* fop_pageio */
375 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
376 /* fop_dumpctl */
377 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
378 /* fop_dispose */
379 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
380 /* fop_setsecattr */
381 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
382 /* fop_getsecattr */
383 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
384 /* fop_shrlock */
385 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
386 /* fop_vnevent */
387 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
388 /* fop_reqzcbuf */
389 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
390 /* fop_retzcbuf */
391 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
393 return (vsp);
397 * Creates a kstat structure associated with a vopstats structure.
399 kstat_t *
400 new_vskstat(char *ksname, vopstats_t *vsp)
402 kstat_t *ksp;
404 if (!vopstats_enabled) {
405 return (NULL);
408 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
409 sizeof (vopstats_t)/sizeof (kstat_named_t),
410 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
411 if (ksp) {
412 ksp->ks_data = vsp;
413 kstat_install(ksp);
416 return (ksp);
420 * Called from vfsinit() to initialize the support mechanisms for vopstats
422 void
423 vopstats_startup()
425 if (!vopstats_enabled)
426 return;
429 * Creates the AVL tree which holds per-vfs vopstat anchors. This
430 * is necessary since we need to check if a kstat exists before we
431 * attempt to create it. Also, initialize its lock.
433 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
434 offsetof(vsk_anchor_t, vsk_node));
435 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
437 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
438 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
439 NULL, NULL, 0);
442 * Set up the array of pointers for the vopstats-by-FS-type.
443 * The entries will be allocated/initialized as each file system
444 * goes through modload/mod_installfs.
446 vopstats_fstype = (vopstats_t **)kmem_zalloc(
447 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
449 /* Set up the global vopstats initialization template */
450 vs_templatep = create_vopstats_template();
454 * We need to have the all of the counters zeroed.
455 * The initialization of the vopstats_t includes on the order of
456 * 50 calls to kstat_named_init(). Rather that do that on every call,
457 * we do it once in a template (vs_templatep) then bcopy it over.
459 void
460 initialize_vopstats(vopstats_t *vsp)
462 if (vsp == NULL)
463 return;
465 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
469 * If possible, determine which vopstats by fstype to use and
470 * return a pointer to the caller.
472 vopstats_t *
473 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
475 int fstype = 0; /* Index into vfssw[] */
476 vopstats_t *vsp = NULL;
478 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
479 !vopstats_enabled)
480 return (NULL);
482 * Set up the fstype. We go to so much trouble because all versions
483 * of NFS use the same fstype in their vfs even though they have
484 * distinct entries in the vfssw[] table.
485 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
487 if (vswp) {
488 fstype = vswp - vfssw; /* Gets us the index */
489 } else {
490 fstype = vfsp->vfs_fstype;
494 * Point to the per-fstype vopstats. The only valid values are
495 * non-zero positive values less than the number of vfssw[] table
496 * entries.
498 if (fstype > 0 && fstype < nfstype) {
499 vsp = vopstats_fstype[fstype];
502 return (vsp);
506 * Generate a kstat name, create the kstat structure, and allocate a
507 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
508 * to the caller. This must only be called from a mount.
510 vsk_anchor_t *
511 get_vskstat_anchor(vfs_t *vfsp)
513 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
514 statvfs64_t statvfsbuf; /* Needed to find f_fsid */
515 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
516 kstat_t *ksp; /* Ptr to new kstat */
517 avl_index_t where; /* Location in the AVL tree */
519 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
520 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
521 return (NULL);
523 /* Need to get the fsid to build a kstat name */
524 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
525 /* Create a name for our kstats based on fsid */
526 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
527 VOPSTATS_STR, statvfsbuf.f_fsid);
529 /* Allocate and initialize the vsk_anchor_t */
530 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
531 bzero(vskp, sizeof (*vskp));
532 vskp->vsk_fsid = statvfsbuf.f_fsid;
534 mutex_enter(&vskstat_tree_lock);
535 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
536 avl_insert(&vskstat_tree, vskp, where);
537 mutex_exit(&vskstat_tree_lock);
540 * Now that we've got the anchor in the AVL
541 * tree, we can create the kstat.
543 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
544 if (ksp) {
545 vskp->vsk_ksp = ksp;
547 } else {
548 /* Oops, found one! Release memory and lock. */
549 mutex_exit(&vskstat_tree_lock);
550 kmem_cache_free(vsk_anchor_cache, vskp);
551 vskp = NULL;
554 return (vskp);
558 * We're in the process of tearing down the vfs and need to cleanup
559 * the data structures associated with the vopstats. Must only be called
560 * from dounmount().
562 void
563 teardown_vopstats(vfs_t *vfsp)
565 vsk_anchor_t *vskap;
566 avl_index_t where;
568 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
569 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
570 return;
572 /* This is a safe check since VFS_STATS must be set (see above) */
573 if ((vskap = vfsp->vfs_vskap) == NULL)
574 return;
576 /* Whack the pointer right away */
577 vfsp->vfs_vskap = NULL;
579 /* Lock the tree, remove the node, and delete the kstat */
580 mutex_enter(&vskstat_tree_lock);
581 if (avl_find(&vskstat_tree, vskap, &where)) {
582 avl_remove(&vskstat_tree, vskap);
585 if (vskap->vsk_ksp) {
586 kstat_delete(vskap->vsk_ksp);
588 mutex_exit(&vskstat_tree_lock);
590 kmem_cache_free(vsk_anchor_cache, vskap);
594 * Read or write a vnode. Called from kernel code.
597 vn_rdwr(
598 enum uio_rw rw,
599 struct vnode *vp,
600 caddr_t base,
601 ssize_t len,
602 offset_t offset,
603 enum uio_seg seg,
604 int ioflag,
605 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */
606 cred_t *cr,
607 ssize_t *residp)
609 struct uio uio;
610 struct iovec iov;
611 int error;
612 int in_crit = 0;
614 if (rw == UIO_WRITE && ISROFILE(vp))
615 return (EROFS);
617 if (len < 0)
618 return (EIO);
620 VOPXID_MAP_CR(vp, cr);
622 iov.iov_base = base;
623 iov.iov_len = len;
624 uio.uio_iov = &iov;
625 uio.uio_iovcnt = 1;
626 uio.uio_loffset = offset;
627 uio.uio_segflg = (short)seg;
628 uio.uio_resid = len;
629 uio.uio_llimit = ulimit;
632 * We have to enter the critical region before calling fop_rwlock
633 * to avoid a deadlock with ufs.
635 if (nbl_need_check(vp)) {
636 int svmand;
638 nbl_start_crit(vp, RW_READER);
639 in_crit = 1;
640 error = nbl_svmand(vp, cr, &svmand);
641 if (error != 0)
642 goto done;
643 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
644 uio.uio_offset, uio.uio_resid, svmand, NULL)) {
645 error = EACCES;
646 goto done;
650 (void) fop_rwlock(vp,
651 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
652 if (rw == UIO_WRITE) {
653 uio.uio_fmode = FWRITE;
654 uio.uio_extflg = UIO_COPY_DEFAULT;
655 error = fop_write(vp, &uio, ioflag, cr, NULL);
656 } else {
657 uio.uio_fmode = FREAD;
658 uio.uio_extflg = UIO_COPY_CACHED;
659 error = fop_read(vp, &uio, ioflag, cr, NULL);
661 fop_rwunlock(vp,
662 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
663 if (residp)
664 *residp = uio.uio_resid;
665 else if (uio.uio_resid)
666 error = EIO;
668 done:
669 if (in_crit)
670 nbl_end_crit(vp);
671 return (error);
675 * Release a vnode. Call fop_inactive on last reference or
676 * decrement reference count.
678 * To avoid race conditions, the v_count is left at 1 for
679 * the call to fop_inactive. This prevents another thread
680 * from reclaiming and releasing the vnode *before* the
681 * fop_inactive routine has a chance to destroy the vnode.
682 * We can't have more than 1 thread calling fop_inactive
683 * on a vnode.
685 void
686 vn_rele(vnode_t *vp)
688 VERIFY(vp->v_count > 0);
689 mutex_enter(&vp->v_lock);
690 if (vp->v_count == 1) {
691 mutex_exit(&vp->v_lock);
692 fop_inactive(vp, CRED(), NULL);
693 return;
695 VN_RELE_LOCKED(vp);
696 mutex_exit(&vp->v_lock);
700 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
701 * as a single reference, so v_count is not decremented until the last DNLC hold
702 * is released. This makes it possible to distinguish vnodes that are referenced
703 * only by the DNLC.
705 void
706 vn_rele_dnlc(vnode_t *vp)
708 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
709 mutex_enter(&vp->v_lock);
710 if (--vp->v_count_dnlc == 0) {
711 if (vp->v_count == 1) {
712 mutex_exit(&vp->v_lock);
713 fop_inactive(vp, CRED(), NULL);
714 return;
716 VN_RELE_LOCKED(vp);
718 mutex_exit(&vp->v_lock);
722 * Like vn_rele() except that it clears v_stream under v_lock.
723 * This is used by sockfs when it dismantles the association between
724 * the sockfs node and the vnode in the underlying file system.
725 * v_lock has to be held to prevent a thread coming through the lookupname
726 * path from accessing a stream head that is going away.
728 void
729 vn_rele_stream(vnode_t *vp)
731 VERIFY(vp->v_count > 0);
732 mutex_enter(&vp->v_lock);
733 vp->v_stream = NULL;
734 if (vp->v_count == 1) {
735 mutex_exit(&vp->v_lock);
736 fop_inactive(vp, CRED(), NULL);
737 return;
739 VN_RELE_LOCKED(vp);
740 mutex_exit(&vp->v_lock);
743 static void
744 vn_rele_inactive(vnode_t *vp)
746 fop_inactive(vp, CRED(), NULL);
750 * Like vn_rele() except if we are going to call fop_inactive() then do it
751 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
752 * the file system as a result of releasing the vnode. Note, file systems
753 * already have to handle the race where the vnode is incremented before the
754 * inactive routine is called and does its locking.
756 * Warning: Excessive use of this routine can lead to performance problems.
757 * This is because taskqs throttle back allocation if too many are created.
759 void
760 vn_rele_async(vnode_t *vp, taskq_t *taskq)
762 VERIFY(vp->v_count > 0);
763 mutex_enter(&vp->v_lock);
764 if (vp->v_count == 1) {
765 mutex_exit(&vp->v_lock);
766 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
767 vp, TQ_SLEEP) != (uintptr_t)NULL);
768 return;
770 VN_RELE_LOCKED(vp);
771 mutex_exit(&vp->v_lock);
775 vn_open(
776 char *pnamep,
777 enum uio_seg seg,
778 int filemode,
779 int createmode,
780 struct vnode **vpp,
781 enum create crwhy,
782 mode_t umask)
784 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
785 umask, NULL, -1));
790 * Open/create a vnode.
791 * This may be callable by the kernel, the only known use
792 * of user context being that the current user credentials
793 * are used for permissions. crwhy is defined iff filemode & FCREAT.
796 vn_openat(
797 char *pnamep,
798 enum uio_seg seg,
799 int filemode,
800 int createmode,
801 struct vnode **vpp,
802 enum create crwhy,
803 mode_t umask,
804 struct vnode *startvp,
805 int fd)
807 struct vnode *vp;
808 int mode;
809 int accessflags;
810 int error;
811 int in_crit = 0;
812 int open_done = 0;
813 int shrlock_done = 0;
814 struct vattr vattr;
815 enum symfollow follow;
816 int estale_retry = 0;
817 struct shrlock shr;
818 struct shr_locowner shr_own;
820 if (filemode & FSEARCH)
821 filemode |= FDIRECTORY;
823 mode = 0;
824 accessflags = 0;
825 if (filemode & FREAD)
826 mode |= VREAD;
827 if (filemode & (FWRITE|FTRUNC))
828 mode |= VWRITE;
829 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
830 mode |= VEXEC;
832 /* symlink interpretation */
833 if (filemode & FNOFOLLOW)
834 follow = NO_FOLLOW;
835 else
836 follow = FOLLOW;
838 if (filemode & FAPPEND)
839 accessflags |= V_APPEND;
841 top:
842 if (filemode & FCREAT && !(filemode & FDIRECTORY)) {
843 enum vcexcl excl;
845 /* Wish to create a file. */
846 vattr.va_type = VREG;
847 vattr.va_mode = createmode;
848 vattr.va_mask = VATTR_TYPE|VATTR_MODE;
849 if (filemode & FTRUNC) {
850 vattr.va_size = 0;
851 vattr.va_mask |= VATTR_SIZE;
853 if (filemode & FEXCL)
854 excl = EXCL;
855 else
856 excl = NONEXCL;
858 if (error =
859 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
860 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
861 return (error);
862 } else {
863 /* Wish to open a file. Just look it up. */
864 if (error = lookupnameat(pnamep, seg, follow,
865 NULLVPP, &vp, startvp)) {
866 if ((error == ESTALE) &&
867 fs_need_estale_retry(estale_retry++))
868 goto top;
869 return (error);
873 * Get the attributes to check whether file is large.
874 * We do this only if the FOFFMAX flag is not set and
875 * only for regular files.
878 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
879 vattr.va_mask = VATTR_SIZE;
880 if ((error = fop_getattr(vp, &vattr, 0,
881 CRED(), NULL))) {
882 goto out;
884 if (vattr.va_size > (uoff_t)MAXOFF32_T) {
886 * Large File API - regular open fails
887 * if FOFFMAX flag is set in file mode
889 error = EOVERFLOW;
890 goto out;
894 * Can't write directories, active texts, or
895 * read-only filesystems. Can't truncate files
896 * on which mandatory locking is in effect.
898 if (filemode & (FWRITE|FTRUNC)) {
900 * Allow writable directory if VDIROPEN flag is set.
902 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
903 error = EISDIR;
904 goto out;
906 if (ISROFILE(vp)) {
907 error = EROFS;
908 goto out;
911 * Can't truncate files on which
912 * sysv mandatory locking is in effect.
914 if (filemode & FTRUNC) {
915 vnode_t *rvp;
917 if (fop_realvp(vp, &rvp, NULL) != 0)
918 rvp = vp;
919 if (rvp->v_filocks != NULL) {
920 vattr.va_mask = VATTR_MODE;
921 if ((error = fop_getattr(vp,
922 &vattr, 0, CRED(), NULL)) == 0 &&
923 MANDLOCK(vp, vattr.va_mode))
924 error = EAGAIN;
927 if (error)
928 goto out;
931 * Check permissions.
933 if (error = fop_access(vp, mode, accessflags, CRED(), NULL))
934 goto out;
936 * Require FDIRECTORY to return a directory.
937 * Require FEXEC to return a regular file.
939 if ((filemode & FDIRECTORY) && vp->v_type != VDIR) {
940 error = ENOTDIR;
941 goto out;
943 if ((filemode & FEXEC) && vp->v_type != VREG) {
944 error = ENOEXEC; /* XXX: error code? */
945 goto out;
950 * Do remaining checks for FNOFOLLOW and FNOLINKS.
952 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
953 error = ELOOP;
954 goto out;
956 if (filemode & FNOLINKS) {
957 vattr.va_mask = VATTR_NLINK;
958 if ((error = fop_getattr(vp, &vattr, 0, CRED(), NULL))) {
959 goto out;
961 if (vattr.va_nlink != 1) {
962 error = EMLINK;
963 goto out;
968 * Opening a socket corresponding to the AF_UNIX pathname
969 * in the filesystem name space is not supported.
970 * However, VSOCK nodes in namefs are supported in order
971 * to make fattach work for sockets.
973 * XXX This uses fop_realvp to distinguish between
974 * an unopened namefs node (where fop_realvp returns a
975 * different VSOCK vnode) and a VSOCK created by vn_create
976 * in some file system (where fop_realvp would never return
977 * a different vnode).
979 if (vp->v_type == VSOCK) {
980 struct vnode *nvp;
982 error = fop_realvp(vp, &nvp, NULL);
983 if (error != 0 || nvp == NULL || nvp == vp ||
984 nvp->v_type != VSOCK) {
985 error = EOPNOTSUPP;
986 goto out;
990 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
991 /* get share reservation */
992 shr.s_access = 0;
993 if (filemode & FWRITE)
994 shr.s_access |= F_WRACC;
995 if (filemode & FREAD)
996 shr.s_access |= F_RDACC;
997 shr.s_deny = 0;
998 shr.s_sysid = 0;
999 shr.s_pid = ttoproc(curthread)->p_pid;
1000 shr_own.sl_pid = shr.s_pid;
1001 shr_own.sl_id = fd;
1002 shr.s_own_len = sizeof (shr_own);
1003 shr.s_owner = (caddr_t)&shr_own;
1004 error = fop_shrlock(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1005 NULL);
1006 if (error)
1007 goto out;
1008 shrlock_done = 1;
1010 /* nbmand conflict check if truncating file */
1011 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1012 nbl_start_crit(vp, RW_READER);
1013 in_crit = 1;
1015 vattr.va_mask = VATTR_SIZE;
1016 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
1017 goto out;
1018 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1019 NULL)) {
1020 error = EACCES;
1021 goto out;
1027 * Do opening protocol.
1029 error = fop_open(&vp, filemode, CRED(), NULL);
1030 if (error)
1031 goto out;
1032 open_done = 1;
1035 * Truncate if required.
1037 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1038 vattr.va_size = 0;
1039 vattr.va_mask = VATTR_SIZE;
1040 if ((error = fop_setattr(vp, &vattr, 0, CRED(), NULL)) != 0)
1041 goto out;
1043 out:
1044 ASSERT(vp->v_count > 0);
1046 if (in_crit) {
1047 nbl_end_crit(vp);
1048 in_crit = 0;
1050 if (error) {
1051 if (open_done) {
1052 (void) fop_close(vp, filemode, 1, 0, CRED(),
1053 NULL);
1054 open_done = 0;
1055 shrlock_done = 0;
1057 if (shrlock_done) {
1058 (void) fop_shrlock(vp, F_UNSHARE, &shr, 0, CRED(),
1059 NULL);
1060 shrlock_done = 0;
1064 * The following clause was added to handle a problem
1065 * with NFS consistency. It is possible that a lookup
1066 * of the file to be opened succeeded, but the file
1067 * itself doesn't actually exist on the server. This
1068 * is chiefly due to the DNLC containing an entry for
1069 * the file which has been removed on the server. In
1070 * this case, we just start over. If there was some
1071 * other cause for the ESTALE error, then the lookup
1072 * of the file will fail and the error will be returned
1073 * above instead of looping around from here.
1075 VN_RELE(vp);
1076 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1077 goto top;
1078 } else
1079 *vpp = vp;
1080 return (error);
1084 * The following two accessor functions are for the NFSv4 server. Since there
1085 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
1086 * vnode open counts correct when a client "upgrades" an open or does an
1087 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1088 * open mode (add or subtract read or write), but also change the share/deny
1089 * modes. However, share reservations are not integrated with OPEN, yet, so
1090 * we need to handle each separately. These functions are cleaner than having
1091 * the NFS server manipulate the counts directly, however, nobody else should
1092 * use these functions.
1094 void
1095 vn_open_upgrade(
1096 vnode_t *vp,
1097 int filemode)
1099 ASSERT(vp->v_type == VREG);
1101 if (filemode & FREAD)
1102 atomic_inc_32(&vp->v_rdcnt);
1103 if (filemode & FWRITE)
1104 atomic_inc_32(&vp->v_wrcnt);
1108 void
1109 vn_open_downgrade(
1110 vnode_t *vp,
1111 int filemode)
1113 ASSERT(vp->v_type == VREG);
1115 if (filemode & FREAD) {
1116 ASSERT(vp->v_rdcnt > 0);
1117 atomic_dec_32(&vp->v_rdcnt);
1119 if (filemode & FWRITE) {
1120 ASSERT(vp->v_wrcnt > 0);
1121 atomic_dec_32(&vp->v_wrcnt);
/*
 * Create a vnode; thin wrapper around vn_createat() with no start
 * vnode (so relative paths resolve from the current directory).
 */
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}
/*
 * Create a vnode (makenode).
 *
 * Looks up the parent directory of pnamep (relative to startvp for
 * non-absolute paths), applies the umask unless the directory carries
 * default ACLs, performs read-only/mandatory-lock/large-file checks,
 * and finally calls fop_mkdir() or fop_create() as directed by "why".
 * On success *vpp holds a reference to the resulting vnode.  The whole
 * operation is retried on ESTALE, bounded by fs_need_estale_retry().
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;

	ASSERT((vap->va_mask & (VATTR_TYPE|VATTR_MODE)) == (VATTR_TYPE|VATTR_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	/* only mknod(2) may create an object with the sticky bit set */
	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {

		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = fop_getsecattr(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * fop_getsecattr() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *	it is being open with write access &&
	 *	the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (fop_realvp(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & VATTR_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = VATTR_MODE|VATTR_SIZE;
			if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & VATTR_SIZE) && in_crit) {
				uoff_t offset;
				ssize_t length;

				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply fop_create to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The call to fop_create() is necessary to ensure
		 * that the appropriate permission checks are made,
		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
		 * exists since we are in the else condition where this
		 * was checked.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = fop_create(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created a
			 * new reference on a new vnode (*vpp) in the child
			 * file system, so we want to drop our reference on
			 * the old (vp) upon exit.
			 */
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = VATTR_SIZE;
			if ((error = fop_getattr(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (uoff_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to fop_mkdir().  fop_create()
			 * will already get it via "flag"
			 */
			error = fop_mkdir(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = fop_create(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Hard-link "from" to the new name "to"; wrapper around vn_linkat()
 * with no start vnodes and symlinks in "from" not followed.
 */
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}
/*
 * Create a hard link: look up the source vnode and the target's parent
 * directory, verify both live in the same file system (by fsid, so
 * loopback mounts work) and that the target fs is writable, then call
 * fop_link().  Retries on ESTALE, bounded by fs_need_estale_retry().
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = VATTR_FSID;
	if (error = fop_getattr(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = VATTR_FSID;
	if (error = fop_getattr(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = fop_link(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Rename "from" to "to"; wrapper around vn_renameat() with no start
 * vnodes (paths resolve from the current directory).
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}
/*
 * Rename fname to tname, resolving each relative to its start vnode.
 * Verifies both parents share an fsid, the target fs is writable, the
 * source is not a mount-point directory, and that no non-blocking
 * mandatory (nbmand) locks conflict, before calling fop_rename().
 * Retries on ESTALE, bounded by fs_need_estale_retry().
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = VATTR_FSID;
		if (error = fop_getattr(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = VATTR_FSID;
		if (error = fop_getattr(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/* an existing rename target may not be busy with nbmand activity */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = fop_rename(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Remove a file or directory; wrapper around vn_removeat() with no
 * start vnode.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}
/*
 * Remove the file or directory named by fnamep (relative to startvp
 * for non-absolute paths).  Handles the namefs special case where the
 * target is an unlinkable mounted root (unmount first, then remove the
 * covered vnode), checks for a writable parent fs and nbmand lock
 * conflicts, then dispatches to fop_rmdir() or fop_remove() based on
 * dirflag.  Retries on ESTALE, bounded by fs_need_estale_retry().
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail to operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * noone has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling fop_remove.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = fop_rmdir(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = fop_remove(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1818 * Utility function to compare equality of vnodes.
1819 * Compare the underlying real vnodes, if there are underlying vnodes.
1820 * This is a more thorough comparison than the VN_CMP() macro provides.
1823 vn_compare(vnode_t *vp1, vnode_t *vp2)
1825 vnode_t *realvp;
1827 if (vp1 != NULL && fop_realvp(vp1, &realvp, NULL) == 0)
1828 vp1 = realvp;
1829 if (vp2 != NULL && fop_realvp(vp2, &realvp, NULL) == 0)
1830 vp2 = realvp;
1831 return (VN_CMP(vp1, vp2));
/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 * NOTE(review): 1023 = 3 * 11 * 31, which is not prime; the "prime"
 * suggestion above is unmet -- confirm whether that matters here.
 */
#define	NUM_BUCKETS	1023

/* One hash bucket, padded out to 64 bytes (see the pad computation). */
struct vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1 .
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct	vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
/*
 * vn_vfslocks_getlock() uses an HASH scheme to generate
 * rwstlock using vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * HASH table which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed at a later
 * stage when the refcount drops to zero.
 */

vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	/* fast path: an entry for this vfs/vnode already exists */
	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	mutex_exit(&bp->vb_lock);

	/*
	 * Allocate with the bucket lock dropped (kmem_alloc may sleep),
	 * then re-scan under the lock in case another thread inserted an
	 * entry for the same pointer in the meantime.
	 */
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
/*
 * Drop one reference on a hash entry returned by vn_vfslocks_getlock();
 * when the refcount hits zero the entry is unlinked from its bucket and
 * freed.  Panics on a negative refcount or a missing entry.
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (bp->vb_list == vep)
					bp->vb_list = vep->ve_next;
				else {
					/*
					 * LINTED: pvep is always set here --
					 * this branch is unreachable on the
					 * first iteration.
					 */
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
/*
 * vn_vfswlock_wait is used to implement a lock which is logically a writers
 * lock protecting the v_vfsmountedhere field.
 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
 * except that it blocks to acquire the lock VVFSLOCK.
 *
 * traverse() and routines re-implementing part of traverse (e.g. autofs)
 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
 * need the non-blocking version of the writers lock i.e. vn_vfswlock
 */
int
vn_vfswlock_wait(vnode_t *vp)
{
	int retval;
	vn_vfslocks_entry_t *vpvfsentry;
	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);

	/*
	 * Interrupted by a signal: the lock was not taken, so drop the
	 * hash reference acquired above.  On success the reference is
	 * kept; vn_vfsunlock() releases it.
	 */
	if (retval == EINTR) {
		vn_vfslocks_rele(vpvfsentry);
		return (EINTR);
	}
	return (retval);
}
1979 vn_vfsrlock_wait(vnode_t *vp)
1981 int retval;
1982 vn_vfslocks_entry_t *vpvfsentry;
1983 ASSERT(vp != NULL);
1985 vpvfsentry = vn_vfslocks_getlock(vp);
1986 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
1988 if (retval == EINTR) {
1989 vn_vfslocks_rele(vpvfsentry);
1990 return (EINTR);
1993 return (retval);
1998 * vn_vfswlock is used to implement a lock which is logically a writers lock
1999 * protecting the v_vfsmountedhere field.
2002 vn_vfswlock(vnode_t *vp)
2004 vn_vfslocks_entry_t *vpvfsentry;
2007 * If vp is NULL then somebody is trying to lock the covered vnode
2008 * of /. (vfs_vnodecovered is NULL for /). This situation will
2009 * only happen when unmounting /. Since that operation will fail
2010 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2012 if (vp == NULL)
2013 return (EBUSY);
2015 vpvfsentry = vn_vfslocks_getlock(vp);
2017 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2018 return (0);
2020 vn_vfslocks_rele(vpvfsentry);
2021 return (EBUSY);
2025 vn_vfsrlock(vnode_t *vp)
2027 vn_vfslocks_entry_t *vpvfsentry;
2030 * If vp is NULL then somebody is trying to lock the covered vnode
2031 * of /. (vfs_vnodecovered is NULL for /). This situation will
2032 * only happen when unmounting /. Since that operation will fail
2033 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2035 if (vp == NULL)
2036 return (EBUSY);
2038 vpvfsentry = vn_vfslocks_getlock(vp);
2040 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2041 return (0);
2043 vn_vfslocks_rele(vpvfsentry);
2044 return (EBUSY);
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release reference after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc,.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2066 vn_vfswlock_held(vnode_t *vp)
2068 int held;
2069 vn_vfslocks_entry_t *vpvfsentry;
2071 ASSERT(vp != NULL);
2073 vpvfsentry = vn_vfslocks_getlock(vp);
2074 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2076 vn_vfslocks_rele(vpvfsentry);
2077 return (held);
/*
 * Vnode cache.
 */

/*
 * kmem cache constructor: set up the synchronization primitives and
 * baseline field values for a freshly allocated vnode.
 */
/* ARGSUSED */
static int
vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct vnode *vp;

	vp = buf;

	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
	vp->v_path = vn_vpath_empty;
	vp->v_path_stamp = 0;
	vp->v_mpssdata = NULL;
	vp->v_vsd = NULL;
	vp->v_fopdata = NULL;

	vmobject_init(&vp->v_object, vp);

	return (0);
}
/*
 * kmem cache destructor: tear down everything set up by
 * vn_cache_constructor().
 */
/* ARGSUSED */
static void
vn_cache_destructor(void *buf, void *cdrarg)
{
	struct vnode *vp;

	vp = buf;

	vmobject_fini(&vp->v_object);

	rw_destroy(&vp->v_nbllock);
	cv_destroy(&vp->v_cv);
	mutex_destroy(&vp->v_vsd_lock);
	mutex_destroy(&vp->v_lock);
}
/*
 * Create the global vnode kmem cache ("vn_cache").  The assert checks
 * that the configured alignment covers the rounded-up vnode size.
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
/* Destroy the vnode kmem cache created by vn_create_cache(). */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
/*
 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
 * cached by the file system and vnodes remain associated.
 *
 * Resets the open/mmap counts, frees any FEM head, cached v_path,
 * fop data and vnode-specific data so the vnode can be reused.
 */
void
vn_recycle(vnode_t *vp)
{
	ASSERT(!vn_has_cached_data(vp));
	VERIFY(vp->v_path != NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}
	vp->v_path_stamp = 0;

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
}
/*
 * Used to reset the vnode fields including those that are directly accessible
 * as well as those which require an accessor function.
 *
 * Does not initialize:
 *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
 *	v_data (since FS-nodes and vnodes point to each other and should
 *		be updated simultaneously)
 *	v_op (in case someone needs to make a VOP call on this object)
 */
void
vn_reinit(vnode_t *vp)
{
	vp->v_count = 1;
	vp->v_count_dnlc = 0;
	vp->v_vfsp = NULL;
	vp->v_stream = NULL;
	vp->v_vfsmountedhere = NULL;
	vp->v_flag = 0;
	vp->v_type = VNON;
	vp->v_rdev = NODEV;

	vp->v_filocks = NULL;
	vp->v_shrlocks = NULL;
	VERIFY(!vn_has_cached_data(vp));

	vp->v_locality = NULL;
	vp->v_xattrdir = NULL;

	/*
	 * In a few specific instances, vn_reinit() is used to initialize
	 * locally defined vnode_t instances.  Lacking the construction offered
	 * by vn_alloc(), these vnodes require v_path initialization.
	 */
	if (vp->v_path == NULL) {
		vp->v_path = vn_vpath_empty;
	}

	/* Handles v_femhead, v_path, and the r/w/map counts */
	vn_recycle(vp);
}
2228 vnode_t *
2229 vn_alloc(int kmflag)
2231 vnode_t *vp;
2233 vp = kmem_cache_alloc(vn_cache, kmflag);
2235 if (vp != NULL) {
2236 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2237 vp->v_fopdata = NULL;
2238 vn_reinit(vp);
2241 return (vp);
/*
 * Release a vnode back to the cache, freeing any cached v_path, FEM
 * head, fop data and vnode-specific data first.
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	VERIFY(vp->v_path != NULL);
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
2281 * vnode status changes, should define better states than 1, 0.
2283 void
2284 vn_reclaim(vnode_t *vp)
2286 vfs_t *vfsp = vp->v_vfsp;
2288 if (vfsp == NULL ||
2289 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2290 return;
2292 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2295 void
2296 vn_idle(vnode_t *vp)
2298 vfs_t *vfsp = vp->v_vfsp;
2300 if (vfsp == NULL ||
2301 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2302 return;
2304 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2306 void
2307 vn_exists(vnode_t *vp)
2309 vfs_t *vfsp = vp->v_vfsp;
2311 if (vfsp == NULL ||
2312 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2313 return;
2315 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2318 void
2319 vn_invalid(vnode_t *vp)
2321 vfs_t *vfsp = vp->v_vfsp;
2323 if (vfsp == NULL ||
2324 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2325 return;
2327 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2330 /* Vnode event notification */
2333 vnevent_support(vnode_t *vp, caller_context_t *ct)
2335 if (vp == NULL)
2336 return (EINVAL);
2338 return (fop_vnevent(vp, VE_SUPPORT, NULL, NULL, ct));
2341 void
2342 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2344 if (vp == NULL || vp->v_femhead == NULL) {
2345 return;
2347 (void) fop_vnevent(vp, VE_RENAME_SRC, dvp, name, ct);
2350 void
2351 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2352 caller_context_t *ct)
2354 if (vp == NULL || vp->v_femhead == NULL) {
2355 return;
2357 (void) fop_vnevent(vp, VE_RENAME_DEST, dvp, name, ct);
2360 void
2361 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2363 if (vp == NULL || vp->v_femhead == NULL) {
2364 return;
2366 (void) fop_vnevent(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2369 void
2370 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2372 if (vp == NULL || vp->v_femhead == NULL) {
2373 return;
2375 (void) fop_vnevent(vp, VE_REMOVE, dvp, name, ct);
2378 void
2379 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2381 if (vp == NULL || vp->v_femhead == NULL) {
2382 return;
2384 (void) fop_vnevent(vp, VE_RMDIR, dvp, name, ct);
2387 void
2388 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2389 caller_context_t *ct)
2391 if (vp == NULL || vp->v_femhead == NULL) {
2392 return;
2394 (void) fop_vnevent(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2397 void
2398 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2399 caller_context_t *ct)
2401 if (vp == NULL || vp->v_femhead == NULL) {
2402 return;
2404 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2407 void
2408 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2409 caller_context_t *ct)
2411 if (vp == NULL || vp->v_femhead == NULL) {
2412 return;
2414 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2417 void
2418 vnevent_create(vnode_t *vp, caller_context_t *ct)
2420 if (vp == NULL || vp->v_femhead == NULL) {
2421 return;
2423 (void) fop_vnevent(vp, VE_CREATE, NULL, NULL, ct);
2426 void
2427 vnevent_link(vnode_t *vp, caller_context_t *ct)
2429 if (vp == NULL || vp->v_femhead == NULL) {
2430 return;
2432 (void) fop_vnevent(vp, VE_LINK, NULL, NULL, ct);
2435 void
2436 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2438 if (vp == NULL || vp->v_femhead == NULL) {
2439 return;
2441 (void) fop_vnevent(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2444 void
2445 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2447 if (vp == NULL || vp->v_femhead == NULL) {
2448 return;
2450 (void) fop_vnevent(vp, VE_TRUNCATE, NULL, NULL, ct);
2454 * Vnode accessors.
2458 vn_is_readonly(vnode_t *vp)
2460 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2464 vn_has_flocks(vnode_t *vp)
2466 return (vp->v_filocks != NULL);
2470 vn_has_mandatory_locks(vnode_t *vp, int mode)
2472 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2476 vn_has_cached_data(vnode_t *vp)
2478 return (!list_is_empty(&vp->v_object.list));
2482 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2483 * zone_enter(2).
2486 vn_can_change_zones(vnode_t *vp)
2488 struct vfssw *vswp;
2489 int allow = 1;
2490 vnode_t *rvp;
2492 if (nfs_global_client_only != 0)
2493 return (1);
2496 * We always want to look at the underlying vnode if there is one.
2498 if (fop_realvp(vp, &rvp, NULL) != 0)
2499 rvp = vp;
2501 * Some pseudo filesystems (including doorfs) don't actually register
2502 * their vfsops_t, so the following may return NULL; we happily let
2503 * such vnodes switch zones.
2505 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2506 if (vswp != NULL) {
2507 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2508 allow = 0;
2509 vfs_unrefvfssw(vswp);
2511 return (allow);
2515 * Return nonzero if the vnode is a mount point, zero if not.
2518 vn_ismntpt(vnode_t *vp)
2520 return (vp->v_vfsmountedhere != NULL);
2523 /* Retrieve the vfs (if any) mounted on this vnode */
2524 vfs_t *
2525 vn_mountedvfs(vnode_t *vp)
2527 return (vp->v_vfsmountedhere);
2531 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2534 vn_in_dnlc(vnode_t *vp)
2536 return (vp->v_count_dnlc > 0);
2540 * vn_has_other_opens() checks whether a particular file is opened by more than
2541 * just the caller and whether the open is for read and/or write.
2542 * This routine is for calling after the caller has already called fop_open()
2543 * and the caller wishes to know if they are the only one with it open for
2544 * the mode(s) specified.
2546 * Vnode counts are only kept on regular files (v_type=VREG).
2548 bool
2549 vn_has_other_opens(struct vnode *vp, v_mode_t mode)
2551 ASSERT(vp != NULL);
2553 switch (mode) {
2554 case V_WRITE:
2555 if (vp->v_wrcnt > 1)
2556 return true;
2557 break;
2558 case V_RDORWR:
2559 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2560 return true;
2561 break;
2562 case V_RDANDWR:
2563 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2564 return true;
2565 break;
2566 case V_READ:
2567 if (vp->v_rdcnt > 1)
2568 return true;
2569 break;
2572 return false;
2576 * vn_is_opened() checks whether a particular file is opened and
2577 * whether the open is for read and/or write.
2579 * Vnode counts are only kept on regular files (v_type=VREG).
2581 bool vn_is_opened(struct vnode *vp, v_mode_t mode)
2583 ASSERT(vp != NULL);
2585 switch (mode) {
2586 case V_WRITE:
2587 if (vp->v_wrcnt)
2588 return true;
2589 break;
2590 case V_RDANDWR:
2591 if (vp->v_rdcnt && vp->v_wrcnt)
2592 return true;
2593 break;
2594 case V_RDORWR:
2595 if (vp->v_rdcnt || vp->v_wrcnt)
2596 return true;
2597 break;
2598 case V_READ:
2599 if (vp->v_rdcnt)
2600 return true;
2601 break;
2604 return false;
2608 * vn_is_mapped() checks whether a particular file is mapped and whether
2609 * the file is mapped read and/or write.
2611 bool vn_is_mapped(struct vnode *vp, v_mode_t mode)
2613 ASSERT(vp != NULL);
2615 #if !defined(_LP64)
2616 switch (mode) {
2618 * The atomic_add_64_nv functions force atomicity in the
2619 * case of 32 bit architectures. Otherwise the 64 bit values
2620 * require two fetches. The value of the fields may be
2621 * (potentially) changed between the first fetch and the
2622 * second
2624 case V_WRITE:
2625 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2626 return true;
2627 break;
2628 case V_RDANDWR:
2629 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2630 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2631 return true;
2632 break;
2633 case V_RDORWR:
2634 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2635 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2636 return true;
2637 break;
2638 case V_READ:
2639 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2640 return true;
2641 break;
2643 #else
2644 switch (mode) {
2645 case V_WRITE:
2646 if (vp->v_mmap_write)
2647 return true;
2648 break;
2649 case V_RDANDWR:
2650 if (vp->v_mmap_read && vp->v_mmap_write)
2651 return true;
2652 break;
2653 case V_RDORWR:
2654 if (vp->v_mmap_read || vp->v_mmap_write)
2655 return true;
2656 break;
2657 case V_READ:
2658 if (vp->v_mmap_read)
2659 return true;
2660 break;
2662 #endif
2664 return false;
2668 * Set the operations vector for a vnode.
2670 void
2671 vn_setops(struct vnode *vnode, const struct vnodeops *ops)
2673 vnode->v_op = ops;
2677 * Retrieve the operations vector for a vnode
2679 const struct vnodeops *
2680 vn_getops(struct vnode *vnode)
2682 return vnode->v_op;
/*
 * Returns non-zero (1) if the vnodeops matches that of the vnode.
 * Returns zero (0) if not.
 */
int
vn_matchops(struct vnode *vp, const struct vnodeops *vnodeops)
{
	return (vn_getops(vp) == vnodeops);
}
2696 * fs_new_caller_id() needs to return a unique ID on a given local system.
2697 * The IDs do not need to survive across reboots. These are primarily
2698 * used so that (FEM) monitors can detect particular callers (such as
2699 * the NFS server) to a given vnode/vfs operation.
2701 u_longlong_t
2702 fs_new_caller_id()
2704 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2706 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2710 * The value stored in v_path is relative to rootdir, located in the global
2711 * zone. Zones or chroot environments which reside deeper inside the VFS
2712 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
2713 * what lies below their perceived root. In order to keep v_path usable for
2714 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
2716 * An upper bound of max_vnode_path is placed upon v_path allocations to
2717 * prevent the system from going too wild at the behest of pathological
2718 * behavior from the operator.
2720 size_t max_vnode_path = 4 * MAXPATHLEN;
2723 void
2724 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
2726 char *buf;
2728 mutex_enter(&vp->v_lock);
2730 * If the snapshot of v_path_stamp passed in via compare_stamp does not
2731 * match the present value on the vnode, it indicates that subsequent
2732 * changes have occurred. The v_path value is not cleared in this case
2733 * since the new value may be valid.
2735 if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
2736 mutex_exit(&vp->v_lock);
2737 return;
2739 buf = vp->v_path;
2740 vp->v_path = vn_vpath_empty;
2741 vp->v_path_stamp = 0;
2742 mutex_exit(&vp->v_lock);
2743 if (buf != vn_vpath_empty) {
2744 kmem_free(buf, strlen(buf) + 1);
2748 static void
2749 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
2750 boolean_t is_rename)
2752 char *buf, *oldbuf;
2753 hrtime_t pstamp;
2754 size_t baselen, buflen = 0;
2756 /* Handle the vn_setpath_str case. */
2757 if (pvp == NULL) {
2758 if (len + 1 > max_vnode_path) {
2759 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
2760 vnode_t *, vp, char *, name, size_t, len + 1);
2761 return;
2763 buf = kmem_alloc(len + 1, KM_SLEEP);
2764 bcopy(name, buf, len);
2765 buf[len] = '\0';
2767 mutex_enter(&vp->v_lock);
2768 oldbuf = vp->v_path;
2769 vp->v_path = buf;
2770 vp->v_path_stamp = gethrtime();
2771 mutex_exit(&vp->v_lock);
2772 if (oldbuf != vn_vpath_empty) {
2773 kmem_free(oldbuf, strlen(oldbuf) + 1);
2775 return;
2778 /* Take snapshot of parent dir */
2779 mutex_enter(&pvp->v_lock);
2781 if ((pvp->v_flag & VTRAVERSE) != 0) {
2783 * When the parent vnode has VTRAVERSE set in its flags, normal
2784 * assumptions about v_path calculation no longer apply. The
2785 * primary situation where this occurs is via the VFS tricks
2786 * which procfs plays in order to allow /proc/PID/(root|cwd) to
2787 * yield meaningful results.
2789 * When this flag is set, v_path on the child must not be
2790 * updated since the calculated value is likely to be
2791 * incorrect, given the current context.
2793 mutex_exit(&pvp->v_lock);
2794 return;
2797 retrybuf:
2798 if (pvp->v_path == vn_vpath_empty) {
2800 * Without v_path from the parent directory, generating a child
2801 * path from the name is impossible.
2803 if (len > 0) {
2804 pstamp = pvp->v_path_stamp;
2805 mutex_exit(&pvp->v_lock);
2806 vn_clearpath(vp, pstamp);
2807 return;
2811 * The only feasible case here is where a NUL lookup is being
2812 * performed on rootdir prior to its v_path being populated.
2814 ASSERT(pvp->v_path_stamp == 0);
2815 baselen = 0;
2816 pstamp = 0;
2817 } else {
2818 pstamp = pvp->v_path_stamp;
2819 baselen = strlen(pvp->v_path);
2820 /* ignore a trailing slash if present */
2821 if (pvp->v_path[baselen - 1] == '/') {
2822 /* This should only the be case for rootdir */
2823 ASSERT(baselen == 1 && pvp == rootdir);
2824 baselen--;
2827 mutex_exit(&pvp->v_lock);
2829 if (buflen != 0) {
2830 /* Free the existing (mis-sized) buffer in case of retry */
2831 kmem_free(buf, buflen);
2833 /* base, '/', name and trailing NUL */
2834 buflen = baselen + len + 2;
2835 if (buflen > max_vnode_path) {
2836 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
2837 vnode_t *, vp, char *, name, size_t, buflen);
2838 return;
2840 buf = kmem_alloc(buflen, KM_SLEEP);
2842 mutex_enter(&pvp->v_lock);
2843 if (pvp->v_path_stamp != pstamp) {
2844 size_t vlen;
2847 * Since v_path_stamp changed on the parent, it is likely that
2848 * v_path has been altered as well. If the length does not
2849 * exactly match what was previously measured, the buffer
2850 * allocation must be repeated for proper sizing.
2852 if (pvp->v_path == vn_vpath_empty) {
2853 /* Give up if parent lack v_path */
2854 mutex_exit(&pvp->v_lock);
2855 kmem_free(buf, buflen);
2856 return;
2858 vlen = strlen(pvp->v_path);
2859 if (pvp->v_path[vlen - 1] == '/') {
2860 vlen--;
2862 if (vlen != baselen) {
2863 goto retrybuf;
2866 bcopy(pvp->v_path, buf, baselen);
2867 mutex_exit(&pvp->v_lock);
2869 buf[baselen] = '/';
2870 baselen++;
2871 bcopy(name, &buf[baselen], len + 1);
2873 mutex_enter(&vp->v_lock);
2874 if (vp->v_path_stamp == 0) {
2875 /* never-visited vnode can inherit stamp from parent */
2876 ASSERT(vp->v_path == vn_vpath_empty);
2877 vp->v_path_stamp = pstamp;
2878 vp->v_path = buf;
2879 mutex_exit(&vp->v_lock);
2880 } else if (vp->v_path_stamp < pstamp || is_rename) {
2882 * Install the updated path and stamp, ensuring that the v_path
2883 * pointer is valid at all times for dtrace.
2885 oldbuf = vp->v_path;
2886 vp->v_path = buf;
2887 vp->v_path_stamp = gethrtime();
2888 mutex_exit(&vp->v_lock);
2889 kmem_free(oldbuf, strlen(oldbuf) + 1);
2890 } else {
2892 * If the timestamp matches or is greater, it means another
2893 * thread performed the update first while locks were dropped
2894 * here to make the allocation. We defer to the newer value.
2896 mutex_exit(&vp->v_lock);
2897 kmem_free(buf, buflen);
2899 ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
2902 void
2903 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
2905 size_t len;
2908 * If the parent is older or empty, there's nothing further to do.
2910 if (pvp->v_path == vn_vpath_empty ||
2911 pvp->v_path_stamp <= vp->v_path_stamp) {
2912 return;
2916 * Given the lack of appropriate context, meaningful updates to v_path
2917 * cannot be made for during lookups for the '.' or '..' entries.
2919 len = strlen(name);
2920 if (len == 0 || (len == 1 && name[0] == '.') ||
2921 (len == 2 && name[0] == '.' && name[1] == '.')) {
2922 return;
2925 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2929 * Given a starting vnode and a path, updates the path in the target vnode in
2930 * a safe manner. If the vnode already has path information embedded, then the
2931 * cached path is left untouched.
2933 /* ARGSUSED */
2934 void
2935 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
2936 size_t len)
2938 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2942 * Sets the path to the vnode to be the given string, regardless of current
2943 * context. The string must be a complete path from rootdir. This is only used
2944 * by fsop_root() for setting the path based on the mountpoint.
2946 void
2947 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
2949 vn_setpath_common(NULL, vp, str, len, B_FALSE);
2953 * Called from within filesystem's vop_rename() to handle renames once the
2954 * target vnode is available.
2956 void
2957 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
2959 vn_setpath_common(pvp, vp, name, len, B_TRUE);
2963 * Similar to vn_setpath_str(), this function sets the path of the destination
2964 * vnode to the be the same as the source vnode.
2966 void
2967 vn_copypath(struct vnode *src, struct vnode *dst)
2969 char *buf;
2970 hrtime_t stamp;
2971 size_t buflen;
2973 mutex_enter(&src->v_lock);
2974 if (src->v_path == vn_vpath_empty) {
2975 mutex_exit(&src->v_lock);
2976 return;
2978 buflen = strlen(src->v_path) + 1;
2979 mutex_exit(&src->v_lock);
2981 buf = kmem_alloc(buflen, KM_SLEEP);
2983 mutex_enter(&src->v_lock);
2984 if (src->v_path == vn_vpath_empty ||
2985 strlen(src->v_path) + 1 != buflen) {
2986 mutex_exit(&src->v_lock);
2987 kmem_free(buf, buflen);
2988 return;
2990 bcopy(src->v_path, buf, buflen);
2991 stamp = src->v_path_stamp;
2992 mutex_exit(&src->v_lock);
2994 mutex_enter(&dst->v_lock);
2995 if (dst->v_path != vn_vpath_empty) {
2996 mutex_exit(&dst->v_lock);
2997 kmem_free(buf, buflen);
2998 return;
3000 dst->v_path = buf;
3001 dst->v_path_stamp = stamp;
3002 mutex_exit(&dst->v_lock);
3007 * XXX Private interface for segvn routines that handle vnode
3008 * large page segments.
3010 * return 1 if vp's file system fop_pageio() implementation
3011 * can be safely used instead of fop_getpage() for handling
3012 * pagefaults against regular non swap files. fop_pageio()
3013 * interface is considered safe here if its implementation
3014 * is very close to fop_getpage() implementation.
3015 * e.g. It zero's out the part of the page beyond EOF. Doesn't
3016 * panic if there're file holes but instead returns an error.
3017 * Doesn't assume file won't be changed by user writes, etc.
3019 * return 0 otherwise.
3021 * For now allow segvn to only use fop_pageio() with ufs and nfs.
3024 vn_vmpss_usepageio(vnode_t *vp)
3026 vfs_t *vfsp = vp->v_vfsp;
3027 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3028 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3029 char **fsok = pageio_ok_fss;
3031 if (fsname == NULL) {
3032 return (0);
3035 for (; *fsok; fsok++) {
3036 if (strcmp(*fsok, fsname) == 0) {
3037 return (1);
3040 return (0);
3043 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3046 fop_open(
3047 vnode_t **vpp,
3048 int mode,
3049 cred_t *cr,
3050 caller_context_t *ct)
3052 int ret;
3053 vnode_t *vp = *vpp;
3055 VN_HOLD(vp);
3057 * Adding to the vnode counts before calling open
3058 * avoids the need for a mutex. It circumvents a race
3059 * condition where a query made on the vnode counts results in a
3060 * false negative. The inquirer goes away believing the file is
3061 * not open when there is an open on the file already under way.
3063 * The counts are meant to prevent NFS from granting a delegation
3064 * when it would be dangerous to do so.
3066 * The vnode counts are only kept on regular files
3068 if ((*vpp)->v_type == VREG) {
3069 if (mode & FREAD)
3070 atomic_inc_32(&(*vpp)->v_rdcnt);
3071 if (mode & FWRITE)
3072 atomic_inc_32(&(*vpp)->v_wrcnt);
3075 VOPXID_MAP_CR(vp, cr);
3077 ret = fop_open_dispatch(vpp, mode, cr, ct, true);
3079 if (ret) {
3081 * Use the saved vp just in case the vnode ptr got trashed
3082 * by the error.
3084 VOPSTATS_UPDATE(vp, open);
3085 if ((vp->v_type == VREG) && (mode & FREAD))
3086 atomic_dec_32(&vp->v_rdcnt);
3087 if ((vp->v_type == VREG) && (mode & FWRITE))
3088 atomic_dec_32(&vp->v_wrcnt);
3089 } else {
3091 * Some filesystems will return a different vnode,
3092 * but the same path was still used to open it.
3093 * So if we do change the vnode and need to
3094 * copy over the path, do so here, rather than special
3095 * casing each filesystem. Adjust the vnode counts to
3096 * reflect the vnode switch.
3098 VOPSTATS_UPDATE(*vpp, open);
3099 if (*vpp != vp && *vpp != NULL) {
3100 vn_copypath(vp, *vpp);
3101 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3102 atomic_inc_32(&(*vpp)->v_rdcnt);
3103 if ((vp->v_type == VREG) && (mode & FREAD))
3104 atomic_dec_32(&vp->v_rdcnt);
3105 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3106 atomic_inc_32(&(*vpp)->v_wrcnt);
3107 if ((vp->v_type == VREG) && (mode & FWRITE))
3108 atomic_dec_32(&vp->v_wrcnt);
3111 VN_RELE(vp);
3112 return (ret);
3116 fop_close(
3117 vnode_t *vp,
3118 int flag,
3119 int count,
3120 offset_t offset,
3121 cred_t *cr,
3122 caller_context_t *ct)
3124 int err;
3126 VOPXID_MAP_CR(vp, cr);
3128 err = fop_close_dispatch(vp, flag, count, offset, cr, ct, true);
3130 VOPSTATS_UPDATE(vp, close);
3132 * Check passed in count to handle possible dups. Vnode counts are only
3133 * kept on regular files
3135 if ((vp->v_type == VREG) && (count == 1)) {
3136 if (flag & FREAD) {
3137 ASSERT(vp->v_rdcnt > 0);
3138 atomic_dec_32(&vp->v_rdcnt);
3140 if (flag & FWRITE) {
3141 ASSERT(vp->v_wrcnt > 0);
3142 atomic_dec_32(&vp->v_wrcnt);
3145 return (err);
3149 fop_read(
3150 vnode_t *vp,
3151 uio_t *uiop,
3152 int ioflag,
3153 cred_t *cr,
3154 caller_context_t *ct)
3156 int err;
3157 ssize_t resid_start = uiop->uio_resid;
3159 VOPXID_MAP_CR(vp, cr);
3161 err = fop_read_dispatch(vp, uiop, ioflag, cr, ct, true);
3163 VOPSTATS_UPDATE_IO(vp, read,
3164 read_bytes, (resid_start - uiop->uio_resid));
3165 return (err);
3169 fop_write(
3170 vnode_t *vp,
3171 uio_t *uiop,
3172 int ioflag,
3173 cred_t *cr,
3174 caller_context_t *ct)
3176 int err;
3177 ssize_t resid_start = uiop->uio_resid;
3179 VOPXID_MAP_CR(vp, cr);
3181 err = fop_write_dispatch(vp, uiop, ioflag, cr, ct, true);
3183 VOPSTATS_UPDATE_IO(vp, write,
3184 write_bytes, (resid_start - uiop->uio_resid));
3185 return (err);
3189 fop_ioctl(
3190 vnode_t *vp,
3191 int cmd,
3192 intptr_t arg,
3193 int flag,
3194 cred_t *cr,
3195 int *rvalp,
3196 caller_context_t *ct)
3198 int err;
3200 VOPXID_MAP_CR(vp, cr);
3202 err = fop_ioctl_dispatch(vp, cmd, arg, flag, cr, rvalp, ct, true);
3204 VOPSTATS_UPDATE(vp, ioctl);
3205 return (err);
3209 fop_setfl(
3210 vnode_t *vp,
3211 int oflags,
3212 int nflags,
3213 cred_t *cr,
3214 caller_context_t *ct)
3216 int err;
3218 VOPXID_MAP_CR(vp, cr);
3220 err = fop_setfl_dispatch(vp, oflags, nflags, cr, ct, true);
3222 VOPSTATS_UPDATE(vp, setfl);
3223 return (err);
3227 fop_getattr(
3228 vnode_t *vp,
3229 vattr_t *vap,
3230 int flags,
3231 cred_t *cr,
3232 caller_context_t *ct)
3234 int err;
3236 VOPXID_MAP_CR(vp, cr);
3239 * If this file system doesn't understand the xvattr extensions
3240 * then turn off the xvattr bit.
3242 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3243 vap->va_mask &= ~VATTR_XVATTR;
3247 * We're only allowed to skip the ACL check iff we used a 32 bit
3248 * ACE mask with fop_access() to determine permissions.
3250 if ((flags & ATTR_NOACLCHECK) &&
3251 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3252 return (EINVAL);
3254 err = fop_getattr_dispatch(vp, vap, flags, cr, ct, true);
3256 VOPSTATS_UPDATE(vp, getattr);
3257 return (err);
3261 fop_setattr(
3262 vnode_t *vp,
3263 vattr_t *vap,
3264 int flags,
3265 cred_t *cr,
3266 caller_context_t *ct)
3268 int err;
3270 VOPXID_MAP_CR(vp, cr);
3273 * If this file system doesn't understand the xvattr extensions
3274 * then turn off the xvattr bit.
3276 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3277 vap->va_mask &= ~VATTR_XVATTR;
3281 * We're only allowed to skip the ACL check iff we used a 32 bit
3282 * ACE mask with fop_access() to determine permissions.
3284 if ((flags & ATTR_NOACLCHECK) &&
3285 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3286 return (EINVAL);
3288 err = fop_setattr_dispatch(vp, vap, flags, cr, ct, true);
3290 VOPSTATS_UPDATE(vp, setattr);
3291 return (err);
3295 fop_access(
3296 vnode_t *vp,
3297 int mode,
3298 int flags,
3299 cred_t *cr,
3300 caller_context_t *ct)
3302 int err;
3304 if ((flags & V_ACE_MASK) &&
3305 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3306 return (EINVAL);
3309 VOPXID_MAP_CR(vp, cr);
3311 err = fop_access_dispatch(vp, mode, flags, cr, ct, true);
3313 VOPSTATS_UPDATE(vp, access);
3314 return (err);
3318 fop_lookup(
3319 vnode_t *dvp,
3320 char *nm,
3321 vnode_t **vpp,
3322 pathname_t *pnp,
3323 int flags,
3324 vnode_t *rdir,
3325 cred_t *cr,
3326 caller_context_t *ct,
3327 int *deflags, /* Returned per-dirent flags */
3328 pathname_t *ppnp) /* Returned case-preserved name in directory */
3330 int ret;
3333 * If this file system doesn't support case-insensitive access
3334 * and said access is requested, fail quickly. It is required
3335 * that if the vfs supports case-insensitive lookup, it also
3336 * supports extended dirent flags.
3338 if (flags & FIGNORECASE &&
3339 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3340 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3341 return (EINVAL);
3343 VOPXID_MAP_CR(dvp, cr);
3345 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3346 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3347 } else {
3348 ret = fop_lookup_dispatch(dvp, nm, vpp, pnp, flags, rdir, cr,
3349 ct, deflags, ppnp, true);
3352 if (ret == 0 && *vpp) {
3353 VOPSTATS_UPDATE(*vpp, lookup);
3354 vn_updatepath(dvp, *vpp, nm);
3357 return (ret);
3361 fop_create(
3362 vnode_t *dvp,
3363 char *name,
3364 vattr_t *vap,
3365 vcexcl_t excl,
3366 int mode,
3367 vnode_t **vpp,
3368 cred_t *cr,
3369 int flags,
3370 caller_context_t *ct,
3371 vsecattr_t *vsecp) /* ACL to set during create */
3373 int ret;
3375 if (vsecp != NULL &&
3376 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3377 return (EINVAL);
3380 * If this file system doesn't support case-insensitive access
3381 * and said access is requested, fail quickly.
3383 if (flags & FIGNORECASE &&
3384 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3385 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3386 return (EINVAL);
3388 VOPXID_MAP_CR(dvp, cr);
3390 ret = fop_create_dispatch(dvp, name, vap, excl, mode, vpp, cr, flags,
3391 ct, vsecp, true);
3393 if (ret == 0 && *vpp) {
3394 VOPSTATS_UPDATE(*vpp, create);
3395 vn_updatepath(dvp, *vpp, name);
3398 return (ret);
3402 fop_remove(
3403 vnode_t *dvp,
3404 char *nm,
3405 cred_t *cr,
3406 caller_context_t *ct,
3407 int flags)
3409 int err;
3412 * If this file system doesn't support case-insensitive access
3413 * and said access is requested, fail quickly.
3415 if (flags & FIGNORECASE &&
3416 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3417 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3418 return (EINVAL);
3420 VOPXID_MAP_CR(dvp, cr);
3422 err = fop_remove_dispatch(dvp, nm, cr, ct, flags, true);
3424 VOPSTATS_UPDATE(dvp, remove);
3425 return (err);
3429 fop_link(
3430 vnode_t *tdvp,
3431 vnode_t *svp,
3432 char *tnm,
3433 cred_t *cr,
3434 caller_context_t *ct,
3435 int flags)
3437 int err;
3440 * If the target file system doesn't support case-insensitive access
3441 * and said access is requested, fail quickly.
3443 if (flags & FIGNORECASE &&
3444 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3445 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3446 return (EINVAL);
3448 VOPXID_MAP_CR(tdvp, cr);
3450 err = fop_link_dispatch(tdvp, svp, tnm, cr, ct, flags, true);
3452 VOPSTATS_UPDATE(tdvp, link);
3453 return (err);
3457 fop_rename(
3458 vnode_t *sdvp,
3459 char *snm,
3460 vnode_t *tdvp,
3461 char *tnm,
3462 cred_t *cr,
3463 caller_context_t *ct,
3464 int flags)
3466 int err;
3469 * If the file system involved does not support
3470 * case-insensitive access and said access is requested, fail
3471 * quickly.
3473 if (flags & FIGNORECASE &&
3474 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3475 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3476 return (EINVAL);
3478 VOPXID_MAP_CR(tdvp, cr);
3480 err = fop_rename_dispatch(sdvp, snm, tdvp, tnm, cr, ct, flags, true);
3482 VOPSTATS_UPDATE(sdvp, rename);
3483 return (err);
3487 fop_mkdir(
3488 vnode_t *dvp,
3489 char *dirname,
3490 vattr_t *vap,
3491 vnode_t **vpp,
3492 cred_t *cr,
3493 caller_context_t *ct,
3494 int flags,
3495 vsecattr_t *vsecp) /* ACL to set during create */
3497 int ret;
3499 if (vsecp != NULL &&
3500 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3501 return (EINVAL);
3504 * If this file system doesn't support case-insensitive access
3505 * and said access is requested, fail quickly.
3507 if (flags & FIGNORECASE &&
3508 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3509 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3510 return (EINVAL);
3512 VOPXID_MAP_CR(dvp, cr);
3514 ret = fop_mkdir_dispatch(dvp, dirname, vap, vpp, cr, ct, flags, vsecp,
3515 true);
3517 if (ret == 0 && *vpp) {
3518 VOPSTATS_UPDATE(*vpp, mkdir);
3519 vn_updatepath(dvp, *vpp, dirname);
3522 return (ret);
3526 fop_rmdir(
3527 vnode_t *dvp,
3528 char *nm,
3529 vnode_t *cdir,
3530 cred_t *cr,
3531 caller_context_t *ct,
3532 int flags)
3534 int err;
3537 * If this file system doesn't support case-insensitive access
3538 * and said access is requested, fail quickly.
3540 if (flags & FIGNORECASE &&
3541 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3542 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3543 return (EINVAL);
3545 VOPXID_MAP_CR(dvp, cr);
3547 err = fop_rmdir_dispatch(dvp, nm, cdir, cr, ct, flags, true);
3549 VOPSTATS_UPDATE(dvp, rmdir);
3550 return (err);
3554 fop_readdir(
3555 vnode_t *vp,
3556 uio_t *uiop,
3557 cred_t *cr,
3558 int *eofp,
3559 caller_context_t *ct,
3560 int flags)
3562 int err;
3563 ssize_t resid_start = uiop->uio_resid;
3566 * If this file system doesn't support retrieving directory
3567 * entry flags and said access is requested, fail quickly.
3569 if (flags & V_RDDIR_ENTFLAGS &&
3570 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3571 return (EINVAL);
3573 VOPXID_MAP_CR(vp, cr);
3575 err = fop_readdir_dispatch(vp, uiop, cr, eofp, ct, flags, true);
3577 VOPSTATS_UPDATE_IO(vp, readdir,
3578 readdir_bytes, (resid_start - uiop->uio_resid));
3579 return (err);
3583 fop_symlink(
3584 vnode_t *dvp,
3585 char *linkname,
3586 vattr_t *vap,
3587 char *target,
3588 cred_t *cr,
3589 caller_context_t *ct,
3590 int flags)
3592 int err;
3593 xvattr_t xvattr;
3596 * If this file system doesn't support case-insensitive access
3597 * and said access is requested, fail quickly.
3599 if (flags & FIGNORECASE &&
3600 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3601 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3602 return (EINVAL);
3604 VOPXID_MAP_CR(dvp, cr);
3606 /* check for reparse point */
3607 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3608 (strncmp(target, FS_REPARSE_TAG_STR,
3609 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3610 if (!fs_reparse_mark(target, vap, &xvattr))
3611 vap = (vattr_t *)&xvattr;
3614 err = fop_symlink_dispatch(dvp, linkname, vap, target, cr, ct, flags,
3615 true);
3617 VOPSTATS_UPDATE(dvp, symlink);
3618 return (err);
3622 fop_readlink(
3623 vnode_t *vp,
3624 uio_t *uiop,
3625 cred_t *cr,
3626 caller_context_t *ct)
3628 int err;
3630 VOPXID_MAP_CR(vp, cr);
3632 err = fop_readlink_dispatch(vp, uiop, cr, ct, true);
3634 VOPSTATS_UPDATE(vp, readlink);
3635 return (err);
3639 fop_fsync(
3640 vnode_t *vp,
3641 int syncflag,
3642 cred_t *cr,
3643 caller_context_t *ct)
3645 int err;
3647 VOPXID_MAP_CR(vp, cr);
3649 err = fop_fsync_dispatch(vp, syncflag, cr, ct, true);
3651 VOPSTATS_UPDATE(vp, fsync);
3652 return (err);
3655 void
3656 fop_inactive(
3657 vnode_t *vp,
3658 cred_t *cr,
3659 caller_context_t *ct)
3661 /* Need to update stats before vop call since we may lose the vnode */
3662 VOPSTATS_UPDATE(vp, inactive);
3664 VOPXID_MAP_CR(vp, cr);
3666 fop_inactive_dispatch(vp, cr, ct, true);
3670 fop_fid(
3671 vnode_t *vp,
3672 fid_t *fidp,
3673 caller_context_t *ct)
3675 int err;
3677 err = fop_fid_dispatch(vp, fidp, ct, true);
3679 VOPSTATS_UPDATE(vp, fid);
3680 return (err);
3684 fop_rwlock(
3685 vnode_t *vp,
3686 int write_lock,
3687 caller_context_t *ct)
3689 int ret;
3691 ret = fop_rwlock_dispatch(vp, write_lock, ct, true);
3693 VOPSTATS_UPDATE(vp, rwlock);
3694 return (ret);
3697 void
3698 fop_rwunlock(
3699 vnode_t *vp,
3700 int write_lock,
3701 caller_context_t *ct)
3703 fop_rwunlock_dispatch(vp, write_lock, ct, true);
3705 VOPSTATS_UPDATE(vp, rwunlock);
3709 fop_seek(
3710 vnode_t *vp,
3711 offset_t ooff,
3712 offset_t *noffp,
3713 caller_context_t *ct)
3715 int err;
3717 err = fop_seek_dispatch(vp, ooff, noffp, ct, true);
3719 VOPSTATS_UPDATE(vp, seek);
3720 return (err);
3724 fop_cmp(
3725 vnode_t *vp1,
3726 vnode_t *vp2,
3727 caller_context_t *ct)
3729 int err;
3731 err = fop_cmp_dispatch(vp1, vp2, ct, true);
3733 VOPSTATS_UPDATE(vp1, cmp);
3734 return (err);
3738 fop_frlock(
3739 vnode_t *vp,
3740 int cmd,
3741 flock64_t *bfp,
3742 int flag,
3743 offset_t offset,
3744 struct flk_callback *flk_cbp,
3745 cred_t *cr,
3746 caller_context_t *ct)
3748 int err;
3750 VOPXID_MAP_CR(vp, cr);
3752 err = fop_frlock_dispatch(vp, cmd, bfp, flag, offset, flk_cbp, cr,
3753 ct, true);
3755 VOPSTATS_UPDATE(vp, frlock);
3756 return (err);
3760 fop_space(
3761 vnode_t *vp,
3762 int cmd,
3763 flock64_t *bfp,
3764 int flag,
3765 offset_t offset,
3766 cred_t *cr,
3767 caller_context_t *ct)
3769 int err;
3771 VOPXID_MAP_CR(vp, cr);
3773 err = fop_space_dispatch(vp, cmd, bfp, flag, offset, cr, ct, true);
3775 VOPSTATS_UPDATE(vp, space);
3776 return (err);
3780 fop_realvp(
3781 vnode_t *vp,
3782 vnode_t **vpp,
3783 caller_context_t *ct)
3785 int err;
3787 err = fop_realvp_dispatch(vp, vpp, ct, true);
3789 VOPSTATS_UPDATE(vp, realvp);
3790 return (err);
3794 fop_getpage(
3795 vnode_t *vp,
3796 offset_t off,
3797 size_t len,
3798 uint_t *protp,
3799 page_t **plarr,
3800 size_t plsz,
3801 struct seg *seg,
3802 caddr_t addr,
3803 enum seg_rw rw,
3804 cred_t *cr,
3805 caller_context_t *ct)
3807 int err;
3809 VOPXID_MAP_CR(vp, cr);
3811 err = fop_getpage_dispatch(vp, off, len, protp, plarr, plsz, seg,
3812 addr, rw, cr, ct, true);
3814 VOPSTATS_UPDATE(vp, getpage);
3815 return (err);
3819 fop_putpage(
3820 vnode_t *vp,
3821 offset_t off,
3822 size_t len,
3823 int flags,
3824 cred_t *cr,
3825 caller_context_t *ct)
3827 int err;
3829 VOPXID_MAP_CR(vp, cr);
3831 err = fop_putpage_dispatch(vp, off, len, flags, cr, ct, true);
3833 VOPSTATS_UPDATE(vp, putpage);
3834 return (err);
3838 fop_map(
3839 vnode_t *vp,
3840 offset_t off,
3841 struct as *as,
3842 caddr_t *addrp,
3843 size_t len,
3844 uchar_t prot,
3845 uchar_t maxprot,
3846 uint_t flags,
3847 cred_t *cr,
3848 caller_context_t *ct)
3850 int err;
3852 VOPXID_MAP_CR(vp, cr);
3854 err = fop_map_dispatch(vp, off, as, addrp, len, prot, maxprot,
3855 flags, cr, ct, true);
3857 VOPSTATS_UPDATE(vp, map);
3858 return (err);
3862 fop_addmap(
3863 vnode_t *vp,
3864 offset_t off,
3865 struct as *as,
3866 caddr_t addr,
3867 size_t len,
3868 uchar_t prot,
3869 uchar_t maxprot,
3870 uint_t flags,
3871 cred_t *cr,
3872 caller_context_t *ct)
3874 int error;
3875 u_longlong_t delta;
3877 VOPXID_MAP_CR(vp, cr);
3879 error = fop_addmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3880 flags, cr, ct, true);
3882 if ((!error) && (vp->v_type == VREG)) {
3883 delta = (u_longlong_t)btopr(len);
3885 * If file is declared MAP_PRIVATE, it can't be written back
3886 * even if open for write. Handle as read.
3888 if (flags & MAP_PRIVATE) {
3889 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3890 (int64_t)delta);
3891 } else {
3893 * atomic_add_64 forces the fetch of a 64 bit value to
3894 * be atomic on 32 bit machines
3896 if (maxprot & PROT_WRITE)
3897 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3898 (int64_t)delta);
3899 if (maxprot & PROT_READ)
3900 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3901 (int64_t)delta);
3902 if (maxprot & PROT_EXEC)
3903 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3904 (int64_t)delta);
3907 VOPSTATS_UPDATE(vp, addmap);
3908 return (error);
3912 fop_delmap(
3913 vnode_t *vp,
3914 offset_t off,
3915 struct as *as,
3916 caddr_t addr,
3917 size_t len,
3918 uint_t prot,
3919 uint_t maxprot,
3920 uint_t flags,
3921 cred_t *cr,
3922 caller_context_t *ct)
3924 int error;
3925 u_longlong_t delta;
3927 VOPXID_MAP_CR(vp, cr);
3929 error = fop_delmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3930 flags, cr, ct, true);
3933 * NFS calls into delmap twice, the first time
3934 * it simply establishes a callback mechanism and returns EAGAIN
3935 * while the real work is being done upon the second invocation.
3936 * We have to detect this here and only decrement the counts upon
3937 * the second delmap request.
3939 if ((error != EAGAIN) && (vp->v_type == VREG)) {
3941 delta = (u_longlong_t)btopr(len);
3943 if (flags & MAP_PRIVATE) {
3944 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3945 (int64_t)(-delta));
3946 } else {
3948 * atomic_add_64 forces the fetch of a 64 bit value
3949 * to be atomic on 32 bit machines
3951 if (maxprot & PROT_WRITE)
3952 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3953 (int64_t)(-delta));
3954 if (maxprot & PROT_READ)
3955 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3956 (int64_t)(-delta));
3957 if (maxprot & PROT_EXEC)
3958 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3959 (int64_t)(-delta));
3962 VOPSTATS_UPDATE(vp, delmap);
3963 return (error);
3968 fop_poll(
3969 vnode_t *vp,
3970 short events,
3971 int anyyet,
3972 short *reventsp,
3973 struct pollhead **phpp,
3974 caller_context_t *ct)
3976 int err;
3978 err = fop_poll_dispatch(vp, events, anyyet, reventsp, phpp, ct, true);
3980 VOPSTATS_UPDATE(vp, poll);
3981 return (err);
3985 fop_dump(
3986 vnode_t *vp,
3987 caddr_t addr,
3988 offset_t lbdn,
3989 offset_t dblks,
3990 caller_context_t *ct)
3992 int err;
3994 /* ensure lbdn and dblks can be passed safely to bdev_dump */
3995 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
3996 return (EIO);
3998 err = fop_dump_dispatch(vp, addr, lbdn, dblks, ct, true);
4000 VOPSTATS_UPDATE(vp, dump);
4001 return (err);
4005 fop_pathconf(
4006 vnode_t *vp,
4007 int cmd,
4008 ulong_t *valp,
4009 cred_t *cr,
4010 caller_context_t *ct)
4012 int err;
4014 VOPXID_MAP_CR(vp, cr);
4016 err = fop_pathconf_dispatch(vp, cmd, valp, cr, ct, true);
4018 VOPSTATS_UPDATE(vp, pathconf);
4019 return (err);
4023 fop_pageio(
4024 vnode_t *vp,
4025 struct page *pp,
4026 uoff_t io_off,
4027 size_t io_len,
4028 int flags,
4029 cred_t *cr,
4030 caller_context_t *ct)
4032 int err;
4034 VOPXID_MAP_CR(vp, cr);
4036 err = fop_pageio_dispatch(vp, pp, io_off, io_len, flags, cr, ct, true);
4038 VOPSTATS_UPDATE(vp, pageio);
4039 return (err);
4043 fop_dumpctl(
4044 vnode_t *vp,
4045 int action,
4046 offset_t *blkp,
4047 caller_context_t *ct)
4049 int err;
4051 err = fop_dumpctl_dispatch(vp, action, blkp, ct, true);
4053 VOPSTATS_UPDATE(vp, dumpctl);
4054 return (err);
4057 void
4058 fop_dispose(
4059 vnode_t *vp,
4060 page_t *pp,
4061 int flag,
4062 int dn,
4063 cred_t *cr,
4064 caller_context_t *ct)
4066 /* Must do stats first since it's possible to lose the vnode */
4067 VOPSTATS_UPDATE(vp, dispose);
4069 VOPXID_MAP_CR(vp, cr);
4071 fop_dispose_dispatch(vp, pp, flag, dn, cr, ct, true);
4075 fop_setsecattr(
4076 vnode_t *vp,
4077 vsecattr_t *vsap,
4078 int flag,
4079 cred_t *cr,
4080 caller_context_t *ct)
4082 int err;
4084 VOPXID_MAP_CR(vp, cr);
4087 * We're only allowed to skip the ACL check iff we used a 32 bit
4088 * ACE mask with fop_access() to determine permissions.
4090 if ((flag & ATTR_NOACLCHECK) &&
4091 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4092 return (EINVAL);
4095 err = fop_setsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4097 VOPSTATS_UPDATE(vp, setsecattr);
4098 return (err);
4102 fop_getsecattr(
4103 vnode_t *vp,
4104 vsecattr_t *vsap,
4105 int flag,
4106 cred_t *cr,
4107 caller_context_t *ct)
4109 int err;
4112 * We're only allowed to skip the ACL check iff we used a 32 bit
4113 * ACE mask with fop_access() to determine permissions.
4115 if ((flag & ATTR_NOACLCHECK) &&
4116 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4117 return (EINVAL);
4120 VOPXID_MAP_CR(vp, cr);
4122 err = fop_getsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4124 VOPSTATS_UPDATE(vp, getsecattr);
4125 return (err);
4129 fop_shrlock(
4130 vnode_t *vp,
4131 int cmd,
4132 struct shrlock *shr,
4133 int flag,
4134 cred_t *cr,
4135 caller_context_t *ct)
4137 int err;
4139 VOPXID_MAP_CR(vp, cr);
4141 err = fop_shrlock_dispatch(vp, cmd, shr, flag, cr, ct, true);
4143 VOPSTATS_UPDATE(vp, shrlock);
4144 return (err);
4148 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4149 caller_context_t *ct)
4151 int err;
4153 err = fop_vnevent_dispatch(vp, vnevent, dvp, fnm, ct, true);
4155 VOPSTATS_UPDATE(vp, vnevent);
4156 return (err);
4160 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4161 caller_context_t *ct)
4163 int err;
4165 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4166 return (ENOTSUP);
4168 err = fop_reqzcbuf_dispatch(vp, ioflag, uiop, cr, ct, true);
4170 VOPSTATS_UPDATE(vp, reqzcbuf);
4171 return (err);
4175 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4177 int err;
4179 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4180 return (ENOTSUP);
4182 err = fop_retzcbuf_dispatch(vp, uiop, cr, ct, true);
4184 VOPSTATS_UPDATE(vp, retzcbuf);
4185 return (err);
/*
 * Default destructor
 * Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
	/*
	 * Intentionally empty: only the non-NULL function pointer matters,
	 * since vsd_create()/vsd_destroy() use NULL to mean "slot unused".
	 */
}
4198 * Create a key (index into per vnode array)
4199 * Locks out vsd_create, vsd_destroy, and vsd_free
4200 * May allocate memory with lock held
4202 void
4203 vsd_create(uint_t *keyp, void (*destructor)(void *))
4205 int i;
4206 uint_t nkeys;
4209 * if key is allocated, do nothing
4211 mutex_enter(&vsd_lock);
4212 if (*keyp) {
4213 mutex_exit(&vsd_lock);
4214 return;
4217 * find an unused key
4219 if (destructor == NULL)
4220 destructor = vsd_defaultdestructor;
4222 for (i = 0; i < vsd_nkeys; ++i)
4223 if (vsd_destructor[i] == NULL)
4224 break;
4227 * if no unused keys, increase the size of the destructor array
4229 if (i == vsd_nkeys) {
4230 if ((nkeys = (vsd_nkeys << 1)) == 0)
4231 nkeys = 1;
4232 vsd_destructor =
4233 (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4234 (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4235 (size_t)(nkeys * sizeof (void (*)(void *))));
4236 vsd_nkeys = nkeys;
4240 * allocate the next available unused key
4242 vsd_destructor[i] = destructor;
4243 *keyp = i + 1;
4245 /* create vsd_list, if it doesn't exist */
4246 if (vsd_list == NULL) {
4247 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4248 list_create(vsd_list, sizeof (struct vsd_node),
4249 offsetof(struct vsd_node, vs_nodes));
4252 mutex_exit(&vsd_lock);
4256 * Destroy a key
4258 * Assumes that the caller is preventing vsd_set and vsd_get
4259 * Locks out vsd_create, vsd_destroy, and vsd_free
4260 * May free memory with lock held
4262 void
4263 vsd_destroy(uint_t *keyp)
4265 uint_t key;
4266 struct vsd_node *vsd;
4269 * protect the key namespace and our destructor lists
4271 mutex_enter(&vsd_lock);
4272 key = *keyp;
4273 *keyp = 0;
4275 ASSERT(key <= vsd_nkeys);
4278 * if the key is valid
4280 if (key != 0) {
4281 uint_t k = key - 1;
4283 * for every vnode with VSD, call key's destructor
4285 for (vsd = list_head(vsd_list); vsd != NULL;
4286 vsd = list_next(vsd_list, vsd)) {
4288 * no VSD for key in this vnode
4290 if (key > vsd->vs_nkeys)
4291 continue;
4293 * call destructor for key
4295 if (vsd->vs_value[k] && vsd_destructor[k])
4296 (*vsd_destructor[k])(vsd->vs_value[k]);
4298 * reset value for key
4300 vsd->vs_value[k] = NULL;
4303 * actually free the key (NULL destructor == unused)
4305 vsd_destructor[k] = NULL;
4308 mutex_exit(&vsd_lock);
4312 * Quickly return the per vnode value that was stored with the specified key
4313 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4314 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4316 void *
4317 vsd_get(vnode_t *vp, uint_t key)
4319 struct vsd_node *vsd;
4321 ASSERT(vp != NULL);
4322 ASSERT(mutex_owned(&vp->v_vsd_lock));
4324 vsd = vp->v_vsd;
4326 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4327 return (vsd->vs_value[key - 1]);
4328 return (NULL);
4332 * Set a per vnode value indexed with the specified key
4333 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4336 vsd_set(vnode_t *vp, uint_t key, void *value)
4338 struct vsd_node *vsd;
4340 ASSERT(vp != NULL);
4341 ASSERT(mutex_owned(&vp->v_vsd_lock));
4343 if (key == 0)
4344 return (EINVAL);
4346 vsd = vp->v_vsd;
4347 if (vsd == NULL)
4348 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4351 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4352 * code won't happen and we will continue down and allocate space for
4353 * the vs_value array.
4354 * If the caller is replacing one value with another, then it is up
4355 * to the caller to free/rele/destroy the previous value (if needed).
4357 if (key <= vsd->vs_nkeys) {
4358 vsd->vs_value[key - 1] = value;
4359 return (0);
4362 ASSERT(key <= vsd_nkeys);
4364 if (vsd->vs_nkeys == 0) {
4365 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4367 * Link onto list of all VSD nodes.
4369 list_insert_head(vsd_list, vsd);
4370 mutex_exit(&vsd_lock);
4374 * Allocate vnode local storage and set the value for key
4376 vsd->vs_value = vsd_realloc(vsd->vs_value,
4377 vsd->vs_nkeys * sizeof (void *),
4378 key * sizeof (void *));
4379 vsd->vs_nkeys = key;
4380 vsd->vs_value[key - 1] = value;
4382 return (0);
4386 * Called from vn_free() to run the destructor function for each vsd
4387 * Locks out vsd_create and vsd_destroy
4388 * Assumes that the destructor *DOES NOT* use vsd
4390 void
4391 vsd_free(vnode_t *vp)
4393 int i;
4394 struct vsd_node *vsd = vp->v_vsd;
4396 if (vsd == NULL)
4397 return;
4399 if (vsd->vs_nkeys == 0) {
4400 kmem_free(vsd, sizeof (*vsd));
4401 vp->v_vsd = NULL;
4402 return;
4406 * lock out vsd_create and vsd_destroy, call
4407 * the destructor, and mark the value as destroyed.
4409 mutex_enter(&vsd_lock);
4411 for (i = 0; i < vsd->vs_nkeys; i++) {
4412 if (vsd->vs_value[i] && vsd_destructor[i])
4413 (*vsd_destructor[i])(vsd->vs_value[i]);
4414 vsd->vs_value[i] = NULL;
4418 * remove from linked list of VSD nodes
4420 list_remove(vsd_list, vsd);
4422 mutex_exit(&vsd_lock);
4425 * free up the VSD
4427 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4428 kmem_free(vsd, sizeof (struct vsd_node));
4429 vp->v_vsd = NULL;
4433 * realloc
4435 static void *
4436 vsd_realloc(void *old, size_t osize, size_t nsize)
4438 void *new;
4440 new = kmem_zalloc(nsize, KM_SLEEP);
4441 if (old) {
4442 bcopy(old, new, osize);
4443 kmem_free(old, osize);
4445 return (new);
4449 * Setup the extensible system attribute for creating a reparse point.
4450 * The symlink data 'target' is validated for proper format of a reparse
4451 * string and a check also made to make sure the symlink data does not
4452 * point to an existing file.
4454 * return 0 if ok else -1.
4456 static int
4457 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4459 xoptattr_t *xoap;
4461 if ((!target) || (!vap) || (!xvattr))
4462 return (-1);
4464 /* validate reparse string */
4465 if (reparse_validate((const char *)target))
4466 return (-1);
4468 xva_init(xvattr);
4469 xvattr->xva_vattr = *vap;
4470 xvattr->xva_vattr.va_mask |= VATTR_XVATTR;
4471 xoap = xva_getxoptattr(xvattr);
4472 ASSERT(xoap);
4473 XVA_SET_REQ(xvattr, XAT_REPARSE);
4474 xoap->xoa_reparse = 1;
4476 return (0);
4480 * Function to check whether a symlink is a reparse point.
4481 * Return B_TRUE if it is a reparse point, else return B_FALSE
4483 boolean_t
4484 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4486 xvattr_t xvattr;
4487 xoptattr_t *xoap;
4489 if ((vp->v_type != VLNK) ||
4490 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4491 return (B_FALSE);
4493 xva_init(&xvattr);
4494 xoap = xva_getxoptattr(&xvattr);
4495 ASSERT(xoap);
4496 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4498 if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
4499 return (B_FALSE);
4501 if ((!(xvattr.xva_vattr.va_mask & VATTR_XVATTR)) ||
4502 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4503 return (B_FALSE);
4505 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);