/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/pathname.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/rwstlock.h>
#include <sys/fem.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/conf.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <c2/audit.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/fcntl.h>
#include <sys/fs_subr.h>
#include <sys/taskq.h>
#include <sys/fs_reparse.h>
/* Determine if this vnode is a file that is read-only */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	(vp)->v_type != VFIFO && vn_is_readonly(vp))
/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;
/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 */
vopstats_t **vopstats_fstype;

/* vopstats initialization template used for fast initialization via bcopy() */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;
/*
 * forward declarations for internal vnode specific data (vsd)
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * forward declarations for reparse point functions
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
/*
 * VSD -- VNODE SPECIFIC DATA
 *	The v_data pointer is typically used by a file system to store a
 *	pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 *	However, there are times when additional project private data needs
 *	to be stored separately from the data (node) pointed to by v_data.
 *	This additional data could be stored by the file system itself or
 *	by a completely different kernel entity.  VSD provides a way for
 *	callers to obtain a key and store a pointer to private data associated
 *	with a vnode.
 *
 *	Callers are responsible for protecting the vsd by holding v_vsd_lock
 *	for calls to vsd_set() and vsd_get().
 */

/*
 * vsd_lock protects:
 *	vsd_nkeys - creation and deletion of vsd keys
 *	vsd_list - insertion and deletion of vsd_node in the vsd_list
 *	vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	/* size of destructor array */
/* list of vsd_node's */
static list_t *vsd_list = NULL;
/* per-key destructor funcs */
static void		(**vsd_destructor)(void *);
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 */

#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}

#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
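/*
 * Illustrative sketch (not part of the original source): a vnode-op
 * wrapper accounts for an I/O-moving operation like this; "read" and
 * "read_bytes" paste into the nread/read_bytes members of vopstats_t.
 * The helper below is hypothetical.
 */
#if 0
static void
example_account_read(vnode_t *vp, size_t bytes_moved)
{
	/* bumps nread and credits bytes_moved, per-vfs and per-fstype */
	VOPSTATS_UPDATE_IO(vp, read, read_bytes, bytes_moved);
}
#endif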
/*
 * If the filesystem does not support XIDs, map the credential.
 * If the vfsp is NULL, perhaps we should also map?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
}
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
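/*
 * For reference (sketch, not part of the original source): sys/mode.h
 * indexes these tables through IFTOVT()/VTTOIF(), essentially
 *
 *	vtype = iftovt_tab[(mode & S_IFMT) >> 12];	-- S_IFREG maps to VREG
 *	mode  = vttoif_tab[(int)vtype] | permbits;	-- VDIR maps to S_IFDIR
 *
 * which is why the entries above must track the numerical S_IFMT encoding.
 */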
/*
 * The system vnode cache.
 */

kmem_cache_t *vn_cache;
/*
 * Vnode operations vector.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p) fs_rwunlock,

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp,

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p) fs_dispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys,

	NULL, 0, NULL,
};
/* Extensible attribute (xva) routines. */

/*
 * Zero out the structure, set the size of the requested/returned bitmaps,
 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 * to the returned attributes array.
 */
void
xva_init(xvattr_t *xvap)
{
	bzero(xvap, sizeof (xvattr_t));
	xvap->xva_mapsize = XVA_MAPSIZE;
	xvap->xva_magic = XVA_MAGIC;
	xvap->xva_vattr.va_mask = AT_XVATTR;
	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
}
/*
 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 * structure.  Otherwise, returns NULL.
 */
xoptattr_t *
xva_getxoptattr(xvattr_t *xvap)
{
	xoptattr_t *xoap = NULL;
	if (xvap->xva_vattr.va_mask & AT_XVATTR)
		xoap = &xvap->xva_xoptattrs;
	return (xoap);
}
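/*
 * Usage sketch (illustrative, not part of the original source): requesting
 * a single optional attribute.  XVA_SET_REQ()/XVA_ISSET_RTN() and
 * XAT_REPARSE come from sys/vnode.h; the function itself is hypothetical.
 */
#if 0
static int
example_is_reparse_point(vnode_t *vp, cred_t *cr, boolean_t *reparsep)
{
	xvattr_t xva;
	xoptattr_t *xoap;
	int error;

	xva_init(&xva);			/* zeroes, sets XVA_MAGIC/AT_XVATTR */
	XVA_SET_REQ(&xva, XAT_REPARSE);	/* ask for one optional attribute */

	error = fop_getattr(vp, &xva.xva_vattr, 0, cr, NULL);
	if (error == 0 && (xoap = xva_getxoptattr(&xva)) != NULL &&
	    XVA_ISSET_RTN(&xva, XAT_REPARSE))
		*reparsep = xoap->xoa_reparse;
	return (error);
}
#endif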
/*
 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 * kstat name.
 */
static int
vska_compar(const void *n1, const void *n2)
{
	int ret;
	ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
	ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;

	if (p1 < p2) {
		ret = -1;
	} else if (p1 > p2) {
		ret = 1;
	} else {
		ret = 0;
	}

	return (ret);
}
/*
 * Used to create a single template which will be bcopy()ed to a newly
 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 */
static vopstats_t *
create_vopstats_template()
{
	vopstats_t *vsp;

	vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
	bzero(vsp, sizeof (*vsp));	/* Start fresh */

	/* fop_open */
	kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
	/* fop_close */
	kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
	/* fop_read I/O */
	kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
	/* fop_write I/O */
	kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
	/* fop_ioctl */
	kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
	/* fop_setfl */
	kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
	/* fop_getattr */
	kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
	/* fop_setattr */
	kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
	/* fop_access */
	kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
	/* fop_lookup */
	kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
	/* fop_create */
	kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
	/* fop_remove */
	kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
	/* fop_link */
	kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
	/* fop_rename */
	kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
	/* fop_mkdir */
	kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
	/* fop_rmdir */
	kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
	/* fop_readdir I/O */
	kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
	    KSTAT_DATA_UINT64);
	/* fop_symlink */
	kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
	/* fop_readlink */
	kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
	/* fop_fsync */
	kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
	/* fop_inactive */
	kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
	/* fop_fid */
	kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
	/* fop_rwlock */
	kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
	/* fop_rwunlock */
	kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
	/* fop_seek */
	kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
	/* fop_cmp */
	kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
	/* fop_frlock */
	kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
	/* fop_space */
	kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
	/* fop_realvp */
	kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
	/* fop_getpage */
	kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
	/* fop_putpage */
	kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
	/* fop_map */
	kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
	/* fop_addmap */
	kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
	/* fop_delmap */
	kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
	/* fop_poll */
	kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
	/* fop_dump */
	kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
	/* fop_pathconf */
	kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
	/* fop_pageio */
	kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
	/* fop_dumpctl */
	kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
	/* fop_dispose */
	kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
	/* fop_setsecattr */
	kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
	/* fop_getsecattr */
	kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
	/* fop_shrlock */
	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
	/* fop_vnevent */
	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
	/* fop_reqzcbuf */
	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
	/* fop_retzcbuf */
	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);

	return (vsp);
}
/*
 * Creates a kstat structure associated with a vopstats structure.
 */
kstat_t *
new_vskstat(char *ksname, vopstats_t *vsp)
{
	kstat_t *ksp;

	if (!vopstats_enabled) {
		return (NULL);
	}

	ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
	    sizeof (vopstats_t)/sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
	if (ksp) {
		ksp->ks_data = vsp;
		kstat_install(ksp);
	}

	return (ksp);
}
/*
 * Called from vfsinit() to initialize the support mechanisms for vopstats
 */
void
vopstats_startup()
{
	if (!vopstats_enabled)
		return;

	/*
	 * Creates the AVL tree which holds per-vfs vopstat anchors.  This
	 * is necessary since we need to check if a kstat exists before we
	 * attempt to create it.  Also, initialize its lock.
	 */
	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
	    offsetof(vsk_anchor_t, vsk_node));
	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);

	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
	    NULL, NULL, 0);

	/*
	 * Set up the array of pointers for the vopstats-by-FS-type.
	 * The entries will be allocated/initialized as each file system
	 * goes through modload/mod_installfs.
	 */
	vopstats_fstype = (vopstats_t **)kmem_zalloc(
	    (sizeof (vopstats_t *) * nfstype), KM_SLEEP);

	/* Set up the global vopstats initialization template */
	vs_templatep = create_vopstats_template();
}
/*
 * We need to have all of the counters zeroed.
 * The initialization of the vopstats_t includes on the order of
 * 50 calls to kstat_named_init().  Rather than do that on every call,
 * we do it once in a template (vs_templatep) then bcopy it over.
 */
void
initialize_vopstats(vopstats_t *vsp)
{
	if (vsp == NULL)
		return;

	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
}
/*
 * If possible, determine which vopstats by fstype to use and
 * return a pointer to the caller.
 */
vopstats_t *
get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
{
	int fstype = 0;	/* Index into vfssw[] */
	vopstats_t *vsp = NULL;

	if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
	    !vopstats_enabled)
		return (NULL);
	/*
	 * Set up the fstype.  We go to so much trouble because all versions
	 * of NFS use the same fstype in their vfs even though they have
	 * distinct entries in the vfssw[] table.
	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
	 */
	if (vswp) {
		fstype = vswp - vfssw;	/* Gets us the index */
	} else {
		fstype = vfsp->vfs_fstype;
	}

	/*
	 * Point to the per-fstype vopstats. The only valid values are
	 * non-zero positive values less than the number of vfssw[] table
	 * entries.
	 */
	if (fstype > 0 && fstype < nfstype) {
		vsp = vopstats_fstype[fstype];
	}

	return (vsp);
}
/*
 * Generate a kstat name, create the kstat structure, and allocate a
 * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 * to the caller.  This must only be called from a mount.
 */
vsk_anchor_t *
get_vskstat_anchor(vfs_t *vfsp)
{
	char kstatstr[KSTAT_STRLEN];	/* kstat name for vopstats */
	statvfs64_t statvfsbuf;		/* Needed to find f_fsid */
	vsk_anchor_t *vskp = NULL;	/* vfs <--> kstat anchor */
	kstat_t *ksp;			/* Ptr to new kstat */
	avl_index_t where;		/* Location in the AVL tree */

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return (NULL);

	/* Need to get the fsid to build a kstat name */
	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
		/* Create a name for our kstats based on fsid */
		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
		    VOPSTATS_STR, statvfsbuf.f_fsid);

		/* Allocate and initialize the vsk_anchor_t */
		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
		bzero(vskp, sizeof (*vskp));
		vskp->vsk_fsid = statvfsbuf.f_fsid;

		mutex_enter(&vskstat_tree_lock);
		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
			avl_insert(&vskstat_tree, vskp, where);
			mutex_exit(&vskstat_tree_lock);

			/*
			 * Now that we've got the anchor in the AVL
			 * tree, we can create the kstat.
			 */
			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
			if (ksp) {
				vskp->vsk_ksp = ksp;
			}
		} else {
			/* Oops, found one! Release memory and lock. */
			mutex_exit(&vskstat_tree_lock);
			kmem_cache_free(vsk_anchor_cache, vskp);
			vskp = NULL;
		}
	}
	return (vskp);
}
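/*
 * Observability note (not part of the original source): assuming the usual
 * VOPSTATS_STR prefix of "vopstats_", a file system whose f_fsid is, say,
 * 0x88001 surfaces in userland as a named kstat, e.g.
 *
 *	kstat -m unix -i 0 -n vopstats_88001
 *
 * with one uint64 counter per vnode op, as laid out in the template above.
 */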
/*
 * We're in the process of tearing down the vfs and need to cleanup
 * the data structures associated with the vopstats.  Must only be called
 * from dounmount().
 */
void
teardown_vopstats(vfs_t *vfsp)
{
	vsk_anchor_t	*vskap;
	avl_index_t	where;

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return;

	/* This is a safe check since VFS_STATS must be set (see above) */
	if ((vskap = vfsp->vfs_vskap) == NULL)
		return;

	/* Whack the pointer right away */
	vfsp->vfs_vskap = NULL;

	/* Lock the tree, remove the node, and delete the kstat */
	mutex_enter(&vskstat_tree_lock);
	if (avl_find(&vskstat_tree, vskap, &where)) {
		avl_remove(&vskstat_tree, vskap);
	}

	if (vskap->vsk_ksp) {
		kstat_delete(vskap->vsk_ksp);
	}
	mutex_exit(&vskstat_tree_lock);

	kmem_cache_free(vsk_anchor_cache, vskap);
}
/*
 * Read or write a vnode.  Called from kernel code.
 */
int
vn_rdwr(
	enum uio_rw rw,
	struct vnode *vp,
	caddr_t base,
	ssize_t len,
	offset_t offset,
	enum uio_seg seg,
	int ioflag,
	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp)
{
	struct uio uio;
	struct iovec iov;
	int error;
	int in_crit = 0;

	if (rw == UIO_WRITE && ISROFILE(vp))
		return (EROFS);

	if (len < 0)
		return (EIO);

	VOPXID_MAP_CR(vp, cr);

	iov.iov_base = base;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = offset;
	uio.uio_segflg = (short)seg;
	uio.uio_resid = len;
	uio.uio_llimit = ulimit;

	/*
	 * We have to enter the critical region before calling fop_rwlock
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, cr, &svmand);
		if (error != 0)
			goto done;
		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	(void) fop_rwlock(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (rw == UIO_WRITE) {
		uio.uio_fmode = FWRITE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		error = fop_write(vp, &uio, ioflag, cr, NULL);
	} else {
		uio.uio_fmode = FREAD;
		uio.uio_extflg = UIO_COPY_CACHED;
		error = fop_read(vp, &uio, ioflag, cr, NULL);
	}
	fop_rwunlock(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (residp)
		*residp = uio.uio_resid;
	else if (uio.uio_resid)
		error = EIO;

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
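/*
 * Usage sketch (illustrative, not part of the original source): reading
 * the start of a file from kernel context.  The function and its names
 * are hypothetical; only vn_open(), vn_rdwr(), fop_close() and VN_RELE()
 * are the real interfaces.
 */
#if 0
static int
example_kernel_read(char *path, char *buf, ssize_t len)
{
	vnode_t *vp;
	ssize_t resid;
	int error;

	if ((error = vn_open(path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) != 0)
		return (error);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, len, (offset_t)0,
	    UIO_SYSSPACE, 0, RLIM64_INFINITY, CRED(), &resid);
	(void) fop_close(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
	return (error);	/* resid holds the bytes left unread */
}
#endif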
/*
 * Release a vnode.  Call fop_inactive on last reference or
 * decrement reference count.
 *
 * To avoid race conditions, the v_count is left at 1 for
 * the call to fop_inactive. This prevents another thread
 * from reclaiming and releasing the vnode *before* the
 * fop_inactive routine has a chance to destroy the vnode.
 * We can't have more than 1 thread calling fop_inactive
 * on a vnode.
 */
void
vn_rele(vnode_t *vp)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		fop_inactive(vp, CRED(), NULL);
		return;
	}
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}
/*
 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 * as a single reference, so v_count is not decremented until the last DNLC
 * hold is released. This makes it possible to distinguish vnodes that are
 * referenced only by the DNLC.
 */
void
vn_rele_dnlc(vnode_t *vp)
{
	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
	mutex_enter(&vp->v_lock);
	if (--vp->v_count_dnlc == 0) {
		if (vp->v_count == 1) {
			mutex_exit(&vp->v_lock);
			fop_inactive(vp, CRED(), NULL);
			return;
		}
		vp->v_count--;
	}
	mutex_exit(&vp->v_lock);
}
/*
 * Like vn_rele() except that it clears v_stream under v_lock.
 * This is used by sockfs when it dismantles the association between
 * the sockfs node and the vnode in the underlying file system.
 * v_lock has to be held to prevent a thread coming through the lookupname
 * path from accessing a stream head that is going away.
 */
void
vn_rele_stream(vnode_t *vp)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	vp->v_stream = NULL;
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		fop_inactive(vp, CRED(), NULL);
		return;
	}
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}
static void
vn_rele_inactive(vnode_t *vp)
{
	fop_inactive(vp, CRED(), NULL);
}
/*
 * Like vn_rele() except if we are going to call fop_inactive() then do it
 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 * the file system as a result of releasing the vnode. Note, file systems
 * already have to handle the race where the vnode is incremented before the
 * inactive routine is called and does its locking.
 *
 * Warning: Excessive use of this routine can lead to performance problems.
 * This is because taskqs throttle back allocation if too many are created.
 */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != (uintptr_t)NULL);
		return;
	}
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}
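/*
 * Usage sketch (illustrative, not part of the original source): a file
 * system that must not re-enter itself from a release path keeps a private
 * taskq and defers the final hold to it.  The names and sizing below are
 * hypothetical.
 */
#if 0
static taskq_t *example_rele_taskq;

static void
example_rele_init(void)
{
	example_rele_taskq = taskq_create("example_vn_rele", 1, minclsyspri,
	    1, INT_MAX, 0);
}

static void
example_rele(vnode_t *vp)
{
	/* if this is the last hold, fop_inactive() runs in taskq context */
	vn_rele_async(vp, example_rele_taskq);
}
#endif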
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}
/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 */
int
vn_openat(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask,
	struct vnode *startvp,
	int fd)
{
	struct vnode *vp;
	int mode;
	int accessflags;
	int error;
	int in_crit = 0;
	int open_done = 0;
	int shrlock_done = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	struct shrlock shr;
	struct shr_locowner shr_own;

	if (filemode & FSEARCH)
		filemode |= FDIRECTORY;

	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

top:
	if (filemode & FCREAT && !(filemode & FDIRECTORY)) {
		enum vcexcl excl;

		/* Wish to create a file. */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			excl = EXCL;
		else
			excl = NONEXCL;

		if (error =
		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
			return (error);
	} else {
		/* Wish to open a file.  Just look it up. */
		if (error = lookupnameat(pnamep, seg, follow,
		    NULLVPP, &vp, startvp)) {
			if ((error == ESTALE) &&
			    fs_need_estale_retry(estale_retry++))
				goto top;
			return (error);
		}

		/*
		 * Get the attributes to check whether file is large.
		 * We do this only if the FOFFMAX flag is not set and
		 * only for regular files.
		 */

		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = fop_getattr(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if (vattr.va_size > (uoff_t)MAXOFF32_T) {
				/*
				 * Large File API - regular open fails
				 * if FOFFMAX flag is not set in file mode
				 */
				error = EOVERFLOW;
				goto out;
			}
		}
		/*
		 * Can't write directories, active texts, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			/*
			 * Allow writable directory if VDIROPEN flag is set.
			 */
			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
				error = EISDIR;
				goto out;
			}
			if (ISROFILE(vp)) {
				error = EROFS;
				goto out;
			}
			/*
			 * Can't truncate files on which
			 * sysv mandatory locking is in effect.
			 */
			if (filemode & FTRUNC) {
				vnode_t *rvp;

				if (fop_realvp(vp, &rvp, NULL) != 0)
					rvp = vp;
				if (rvp->v_filocks != NULL) {
					vattr.va_mask = AT_MODE;
					if ((error = fop_getattr(vp,
					    &vattr, 0, CRED(), NULL)) == 0 &&
					    MANDLOCK(vp, vattr.va_mode))
						error = EAGAIN;
				}
			}
			if (error)
				goto out;
		}
		/*
		 * Check permissions.
		 */
		if (error = fop_access(vp, mode, accessflags, CRED(), NULL))
			goto out;
		/*
		 * Require FDIRECTORY to return a directory.
		 * Require FEXEC to return a regular file.
		 */
		if ((filemode & FDIRECTORY) && vp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		}
		if ((filemode & FEXEC) && vp->v_type != VREG) {
			error = ENOEXEC;	/* XXX: error code? */
			goto out;
		}
	}

	/*
	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
	 */
	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
		error = ELOOP;
		goto out;
	}
	if (filemode & FNOLINKS) {
		vattr.va_mask = AT_NLINK;
		if ((error = fop_getattr(vp, &vattr, 0, CRED(), NULL))) {
			goto out;
		}
		if (vattr.va_nlink != 1) {
			error = EMLINK;
			goto out;
		}
	}

	/*
	 * Opening a socket corresponding to the AF_UNIX pathname
	 * in the filesystem name space is not supported.
	 * However, VSOCK nodes in namefs are supported in order
	 * to make fattach work for sockets.
	 *
	 * XXX This uses fop_realvp to distinguish between
	 * an unopened namefs node (where fop_realvp returns a
	 * different VSOCK vnode) and a VSOCK created by vn_create
	 * in some file system (where fop_realvp would never return
	 * a different vnode).
	 */
	if (vp->v_type == VSOCK) {
		struct vnode *nvp;

		error = fop_realvp(vp, &nvp, NULL);
		if (error != 0 || nvp == NULL || nvp == vp ||
		    nvp->v_type != VSOCK) {
			error = EOPNOTSUPP;
			goto out;
		}
	}

	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
		/* get share reservation */
		shr.s_access = 0;
		if (filemode & FWRITE)
			shr.s_access |= F_WRACC;
		if (filemode & FREAD)
			shr.s_access |= F_RDACC;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = ttoproc(curthread)->p_pid;
		shr_own.sl_pid = shr.s_pid;
		shr_own.sl_id = fd;
		shr.s_own_len = sizeof (shr_own);
		shr.s_owner = (caddr_t)&shr_own;
		error = fop_shrlock(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
		    NULL);
		if (error)
			goto out;
		shrlock_done = 1;

		/* nbmand conflict check if truncating file */
		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;

			vattr.va_mask = AT_SIZE;
			if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
				goto out;
			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
			    NULL)) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Do opening protocol.
	 */
	error = fop_open(&vp, filemode, CRED(), NULL);
	if (error)
		goto out;
	open_done = 1;

	/*
	 * Truncate if required.
	 */
	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
		vattr.va_size = 0;
		vattr.va_mask = AT_SIZE;
		if ((error = fop_setattr(vp, &vattr, 0, CRED(), NULL)) != 0)
			goto out;
	}
out:
	ASSERT(vp->v_count > 0);

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (error) {
		if (open_done) {
			(void) fop_close(vp, filemode, 1, (offset_t)0, CRED(),
			    NULL);
			open_done = 0;
			shrlock_done = 0;
		}
		if (shrlock_done) {
			(void) fop_shrlock(vp, F_UNSHARE, &shr, 0, CRED(),
			    NULL);
			shrlock_done = 0;
		}

		/*
		 * The following clause was added to handle a problem
		 * with NFS consistency.  It is possible that a lookup
		 * of the file to be opened succeeded, but the file
		 * itself doesn't actually exist on the server.  This
		 * is chiefly due to the DNLC containing an entry for
		 * the file which has been removed on the server.  In
		 * this case, we just start over.  If there was some
		 * other cause for the ESTALE error, then the lookup
		 * of the file will fail and the error will be returned
		 * above instead of looping around from here.
		 */
		VN_RELE(vp);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
	} else
		*vpp = vp;
	return (error);
}
/*
 * The following two accessor functions are for the NFSv4 server.  Since there
 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
 * vnode open counts correct when a client "upgrades" an open or does an
 * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
 * open mode (add or subtract read or write), but also change the share/deny
 * modes.  However, share reservations are not integrated with OPEN, yet, so
 * we need to handle each separately.  These functions are cleaner than having
 * the NFS server manipulate the counts directly, however, nobody else should
 * use these functions.
 */
void
vn_open_upgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD)
		atomic_inc_32(&vp->v_rdcnt);
	if (filemode & FWRITE)
		atomic_inc_32(&vp->v_wrcnt);
}

void
vn_open_downgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD) {
		ASSERT(vp->v_rdcnt > 0);
		atomic_dec_32(&vp->v_rdcnt);
	}
	if (filemode & FWRITE) {
		ASSERT(vp->v_wrcnt > 0);
		atomic_dec_32(&vp->v_wrcnt);
	}
}
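/*
 * Sketch (illustrative, not part of the original source): an NFSv4 server
 * handling an OPEN upgrade from read-only to read/write on an already-open
 * file would adjust the counts without a second fop_open(), roughly:
 */
#if 0
static void
example_nfs4_upgrade_to_rw(vnode_t *vp)
{
	/* the original open already holds v_rdcnt; account the new FWRITE */
	vn_open_upgrade(vp, FWRITE);
}
#endif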
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}
/*
 * Create a vnode (makenode).
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	if (auditing)
		audit_vncreate_start();
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {
		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = fop_getsecattr(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * fop_getsecattr() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *		it is being open with write access &&
	 *		the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (fop_realvp(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = AT_MODE|AT_SIZE;
			if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & AT_SIZE) && in_crit) {
				uoff_t offset;
				ssize_t length;

				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply fop_create to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The call to fop_create() is necessary to ensure
		 * that the appropriate permission checks are made,
		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
		 * exists since we are in the else condition where this
		 * was checked.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = fop_create(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created a
			 * new reference on a new vnode (*vpp) in the child
			 * file system, so we want to drop our reference on
			 * the old (vp) upon exit.
			 */
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = fop_getattr(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (uoff_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to fop_mkdir().  fop_create()
			 * will already get it via "flag"
			 */
			error = fop_mkdir(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = fop_create(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (auditing)
		audit_vncreate_finish(*vpp, error);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}

int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;

	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = fop_getattr(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = fop_getattr(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = fop_link(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}

int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = fop_getattr(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = fop_getattr(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = fop_rename(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Remove a file or directory.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}

int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling fop_remove.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = fop_rmdir(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = fop_remove(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Utility function to compare equality of vnodes.
 * Compare the underlying real vnodes, if there are underlying vnodes.
 * This is a more thorough comparison than the VN_CMP() macro provides.
 */
int
vn_compare(vnode_t *vp1, vnode_t *vp2)
{
	vnode_t *realvp;

	if (vp1 != NULL && fop_realvp(vp1, &realvp, NULL) == 0)
		vp1 = realvp;
	if (vp2 != NULL && fop_realvp(vp2, &realvp, NULL) == 0)
		vp2 = realvp;
	return (VN_CMP(vp1, vp2));
}
/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 */
#define	NUM_BUCKETS	1023

struct vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1.
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];
#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
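/*
 * For reference (sketch, not part of the original source): the 9-bit shift
 * drops the low, alignment-dominated bits of the address before masking
 * with NUM_BUCKETS, so the bucket for a pointer p is
 *
 *	&vn_vfslocks_buckets[(((intptr_t)p) >> 9) & 1023]
 *
 * spreading vnode/vfs addresses across the NUM_BUCKETS + 1 buckets.
 */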
/*
 * vn_vfslocks_getlock() uses a hash scheme to generate
 * rwstlock using vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * hash table which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed at a later
 * stage when the refcount drops to zero.
 */

vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
2067 void
2068 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2070 struct vn_vfslocks_bucket *bp;
2071 vn_vfslocks_entry_t *vep;
2072 vn_vfslocks_entry_t *pvep;
2074 ASSERT(vepent != NULL);
2075 ASSERT(vepent->ve_vpvfs != NULL);
2077 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2079 mutex_enter(&bp->vb_lock);
2080 vepent->ve_refcnt--;
2082 if ((int32_t)vepent->ve_refcnt < 0)
2083 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2085 if (vepent->ve_refcnt == 0) {
2086 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2087 if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2088 if (bp->vb_list == vep)
2089 bp->vb_list = vep->ve_next;
2090 else {
2091 /* LINTED */
2092 pvep->ve_next = vep->ve_next;
2094 mutex_exit(&bp->vb_lock);
2095 rwst_destroy(&vep->ve_lock);
2096 kmem_free(vep, sizeof (*vep));
2097 return;
2099 pvep = vep;
2101 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2103 mutex_exit(&bp->vb_lock);
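/*
 * Illustrative sketch (not part of the build): every reference taken by
 * vn_vfslocks_getlock() must eventually be dropped with vn_vfslocks_rele(),
 * or the hash entry is never freed.  The function name below is
 * hypothetical.
 */
#if 0
static void
example_vfslocks_pairing(vnode_t *vp)
{
	vn_vfslocks_entry_t *vep;

	vep = vn_vfslocks_getlock(vp);		/* takes a hash reference */
	rwst_enter(&vep->ve_lock, RW_READER);
	/* ... examine vp->v_vfsmountedhere ... */
	rwst_exit(&vep->ve_lock);
	vn_vfslocks_rele(vep);			/* drops the hash reference */
}
#endif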
2107 * vn_vfswlock_wait is used to implement a lock which is logically a writers
2108 * lock protecting the v_vfsmountedhere field.
2109 * vn_vfswlock_wait is similar to vn_vfswlock, except that it blocks
2110 * to acquire the lock VVFSLOCK.
2112 * traverse() and routines re-implementing part of traverse (e.g. autofs)
2113 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2114 * need the non-blocking version of the writers lock, i.e. vn_vfswlock().
2117 vn_vfswlock_wait(vnode_t *vp)
2119 int retval;
2120 vn_vfslocks_entry_t *vpvfsentry;
2121 ASSERT(vp != NULL);
2123 vpvfsentry = vn_vfslocks_getlock(vp);
2124 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2126 if (retval == EINTR) {
2127 vn_vfslocks_rele(vpvfsentry);
2128 return (EINTR);
2130 return (retval);
2134 vn_vfsrlock_wait(vnode_t *vp)
2136 int retval;
2137 vn_vfslocks_entry_t *vpvfsentry;
2138 ASSERT(vp != NULL);
2140 vpvfsentry = vn_vfslocks_getlock(vp);
2141 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2143 if (retval == EINTR) {
2144 vn_vfslocks_rele(vpvfsentry);
2145 return (EINTR);
2148 return (retval);
2153 * vn_vfswlock is used to implement a lock which is logically a writers lock
2154 * protecting the v_vfsmountedhere field.
2157 vn_vfswlock(vnode_t *vp)
2159 vn_vfslocks_entry_t *vpvfsentry;
2162 * If vp is NULL then somebody is trying to lock the covered vnode
2163 * of /. (vfs_vnodecovered is NULL for /). This situation will
2164 * only happen when unmounting /. Since that operation will fail
2165 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2167 if (vp == NULL)
2168 return (EBUSY);
2170 vpvfsentry = vn_vfslocks_getlock(vp);
2172 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2173 return (0);
2175 vn_vfslocks_rele(vpvfsentry);
2176 return (EBUSY);
2180 vn_vfsrlock(vnode_t *vp)
2182 vn_vfslocks_entry_t *vpvfsentry;
2185 * If vp is NULL then somebody is trying to lock the covered vnode
2186 * of /. (vfs_vnodecovered is NULL for /). This situation will
2187 * only happen when unmounting /. Since that operation will fail
2188 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2190 if (vp == NULL)
2191 return (EBUSY);
2193 vpvfsentry = vn_vfslocks_getlock(vp);
2195 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2196 return (0);
2198 vn_vfslocks_rele(vpvfsentry);
2199 return (EBUSY);
2202 void
2203 vn_vfsunlock(vnode_t *vp)
2205 vn_vfslocks_entry_t *vpvfsentry;
2208 * ve_refcnt needs to be decremented twice:
2209 * 1. To release the reference after a call to vn_vfslocks_getlock()
2210 * 2. To release the reference from the locking routines like
2211 * vn_vfsrlock/vn_vfswlock etc.
2213 vpvfsentry = vn_vfslocks_getlock(vp);
2214 vn_vfslocks_rele(vpvfsentry);
2216 rwst_exit(&vpvfsentry->ve_lock);
2217 vn_vfslocks_rele(vpvfsentry);
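/*
 * Illustrative sketch (not part of the build): the usual pattern for the
 * non-blocking writer lock.  vn_vfsunlock() drops both the lock and the
 * reference that vn_vfswlock() took.  The function name is hypothetical.
 */
#if 0
static int
example_cover_vnode(vnode_t *coveredvp)
{
	if (vn_vfswlock(coveredvp) != 0)
		return (EBUSY);	/* a traverse or mount is in progress */
	/* ... safe to examine/update coveredvp->v_vfsmountedhere ... */
	vn_vfsunlock(coveredvp);
	return (0);
}
#endif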
2221 vn_vfswlock_held(vnode_t *vp)
2223 int held;
2224 vn_vfslocks_entry_t *vpvfsentry;
2226 ASSERT(vp != NULL);
2228 vpvfsentry = vn_vfslocks_getlock(vp);
2229 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2231 vn_vfslocks_rele(vpvfsentry);
2232 return (held);
2237 vn_make_ops(
2238 const char *name, /* Name of file system */
2239 const fs_operation_def_t *templ, /* Operation specification */
2240 vnodeops_t **actual) /* Return the vnodeops */
2242 int unused_ops;
2243 int error;
2245 *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2247 (*actual)->vnop_name = name;
2249 error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2250 if (error) {
2251 kmem_free(*actual, sizeof (vnodeops_t));
2254 #if DEBUG
2255 if (unused_ops != 0)
2256 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2257 "but not used", name, unused_ops);
2258 #endif
2260 return (error);
2264 * Free the vnodeops created as a result of vn_make_ops()
2266 void
2267 vn_freevnodeops(vnodeops_t *vnops)
2269 kmem_free(vnops, sizeof (vnodeops_t));
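/*
 * Illustrative sketch (not part of the build): how a file system would
 * typically construct its vnodeops with vn_make_ops().  The template and
 * all examplefs_* entry points are hypothetical; operations not named in
 * the template get default handling in the fop_*() wrappers.
 */
#if 0
static vnodeops_t *examplefs_vnodeops;

static const fs_operation_def_t examplefs_vnodeops_template[] = {
	{ VOPNAME_OPEN,		{ .vop_open = examplefs_open } },
	{ VOPNAME_CLOSE,	{ .vop_close = examplefs_close } },
	{ VOPNAME_READ,		{ .vop_read = examplefs_read } },
	{ NULL,			{ NULL } }
};

static int
examplefs_init_ops(void)
{
	return (vn_make_ops("examplefs", examplefs_vnodeops_template,
	    &examplefs_vnodeops));
}
#endif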
2273 * Vnode cache.
2276 /* ARGSUSED */
2277 static int
2278 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2280 struct vnode *vp;
2282 vp = buf;
2284 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2285 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2286 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2287 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2288 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2289 vp->v_path = NULL;
2290 vp->v_mpssdata = NULL;
2291 vp->v_vsd = NULL;
2292 vp->v_fopdata = NULL;
2294 pagecache_init(vp);
2296 return (0);
2299 /* ARGSUSED */
2300 static void
2301 vn_cache_destructor(void *buf, void *cdrarg)
2303 struct vnode *vp;
2305 vp = buf;
2307 pagecache_fini(vp);
2309 rw_destroy(&vp->v_nbllock);
2310 cv_destroy(&vp->v_cv);
2311 mutex_destroy(&vp->v_vsd_lock);
2312 mutex_destroy(&vp->v_lock);
2315 void
2316 vn_create_cache(void)
2318 /* LINTED */
2319 ASSERT((1 << VNODE_ALIGN_LOG2) ==
2320 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2321 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2322 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2323 NULL, 0);
2326 void
2327 vn_destroy_cache(void)
2329 kmem_cache_destroy(vn_cache);
2333 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2334 * cached by the file system and vnodes remain associated.
2336 void
2337 vn_recycle(vnode_t *vp)
2339 ASSERT(!vn_has_cached_data(vp));
2342 * XXX - This really belongs in vn_reinit(), but we have some issues
2343 * with the counts. Best to have it here for clean initialization.
2345 vp->v_rdcnt = 0;
2346 vp->v_wrcnt = 0;
2347 vp->v_mmap_read = 0;
2348 vp->v_mmap_write = 0;
2351 * If FEM was in use, make sure everything gets cleaned up
2352 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2353 * constructor.
2355 if (vp->v_femhead) {
2356 /* XXX - There should be a free_femhead() that does all this */
2357 ASSERT(vp->v_femhead->femh_list == NULL);
2358 mutex_destroy(&vp->v_femhead->femh_lock);
2359 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2360 vp->v_femhead = NULL;
2362 if (vp->v_path) {
2363 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2364 vp->v_path = NULL;
2367 if (vp->v_fopdata != NULL) {
2368 free_fopdata(vp);
2370 vp->v_mpssdata = NULL;
2371 vsd_free(vp);
2375 * Used to reset the vnode fields including those that are directly accessible
2376 * as well as those which require an accessor function.
2378 * Does not initialize:
2379 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2380 * v_data (since FS-nodes and vnodes point to each other and should
2381 * be updated simultaneously)
2382 * v_op (in case someone needs to make a VOP call on this object)
2384 void
2385 vn_reinit(vnode_t *vp)
2387 vp->v_count = 1;
2388 vp->v_count_dnlc = 0;
2389 vp->v_vfsp = NULL;
2390 vp->v_stream = NULL;
2391 vp->v_vfsmountedhere = NULL;
2392 vp->v_flag = 0;
2393 vp->v_type = VNON;
2394 vp->v_rdev = NODEV;
2396 vp->v_filocks = NULL;
2397 vp->v_shrlocks = NULL;
2398 VERIFY(!vn_has_cached_data(vp));
2400 vp->v_locality = NULL;
2401 vp->v_xattrdir = NULL;
2403 /* Handles v_femhead, v_path, and the r/w/map counts */
2404 vn_recycle(vp);
2407 vnode_t *
2408 vn_alloc(int kmflag)
2410 vnode_t *vp;
2412 vp = kmem_cache_alloc(vn_cache, kmflag);
2414 if (vp != NULL) {
2415 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2416 vp->v_fopdata = NULL;
2417 vn_reinit(vp);
2420 return (vp);
2423 void
2424 vn_free(vnode_t *vp)
2426 ASSERT(vp->v_shrlocks == NULL);
2427 ASSERT(vp->v_filocks == NULL);
2430 * Some file systems call vn_free() with v_count of zero,
2431 * some with v_count of 1. In any case, the value should
2432 * never be anything else.
2434 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2435 ASSERT(vp->v_count_dnlc == 0);
2436 if (vp->v_path != NULL) {
2437 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2438 vp->v_path = NULL;
2441 /* If FEM was in use, make sure everything gets cleaned up */
2442 if (vp->v_femhead) {
2443 /* XXX - There should be a free_femhead() that does all this */
2444 ASSERT(vp->v_femhead->femh_list == NULL);
2445 mutex_destroy(&vp->v_femhead->femh_lock);
2446 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2447 vp->v_femhead = NULL;
2450 if (vp->v_fopdata != NULL) {
2451 free_fopdata(vp);
2453 vp->v_mpssdata = NULL;
2454 vsd_free(vp);
2455 kmem_cache_free(vn_cache, vp);
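/*
 * Illustrative sketch (not part of the build): the usual vnode life cycle
 * from a file system's point of view.  The examplefs_* names are
 * hypothetical; examplefs_vnodeops would come from vn_make_ops().
 */
#if 0
static vnode_t *
examplefs_node_create(vfs_t *vfsp)
{
	vnode_t *vp;

	vp = vn_alloc(KM_SLEEP);	/* constructed and vn_reinit()ed */
	vp->v_vfsp = vfsp;
	vn_setops(vp, examplefs_vnodeops);
	vn_exists(vp);			/* fire VNTRANS_EXISTS, if monitored */
	return (vp);
}

static void
examplefs_node_destroy(vnode_t *vp)
{
	vn_invalid(vp);			/* fire VNTRANS_DESTROYED */
	vn_free(vp);
}
#endif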
2459 * Vnode status changes; we should define better states than 1, 0.
2461 void
2462 vn_reclaim(vnode_t *vp)
2464 vfs_t *vfsp = vp->v_vfsp;
2466 if (vfsp == NULL ||
2467 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2468 return;
2470 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2473 void
2474 vn_idle(vnode_t *vp)
2476 vfs_t *vfsp = vp->v_vfsp;
2478 if (vfsp == NULL ||
2479 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2480 return;
2482 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2484 void
2485 vn_exists(vnode_t *vp)
2487 vfs_t *vfsp = vp->v_vfsp;
2489 if (vfsp == NULL ||
2490 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2491 return;
2493 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2496 void
2497 vn_invalid(vnode_t *vp)
2499 vfs_t *vfsp = vp->v_vfsp;
2501 if (vfsp == NULL ||
2502 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2503 return;
2505 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2508 /* Vnode event notification */
2511 vnevent_support(vnode_t *vp, caller_context_t *ct)
2513 if (vp == NULL)
2514 return (EINVAL);
2516 return (fop_vnevent(vp, VE_SUPPORT, NULL, NULL, ct));
2519 void
2520 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2522 if (vp == NULL || vp->v_femhead == NULL) {
2523 return;
2525 (void) fop_vnevent(vp, VE_RENAME_SRC, dvp, name, ct);
2528 void
2529 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2530 caller_context_t *ct)
2532 if (vp == NULL || vp->v_femhead == NULL) {
2533 return;
2535 (void) fop_vnevent(vp, VE_RENAME_DEST, dvp, name, ct);
2538 void
2539 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2541 if (vp == NULL || vp->v_femhead == NULL) {
2542 return;
2544 (void) fop_vnevent(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2547 void
2548 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2550 if (vp == NULL || vp->v_femhead == NULL) {
2551 return;
2553 (void) fop_vnevent(vp, VE_REMOVE, dvp, name, ct);
2556 void
2557 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2559 if (vp == NULL || vp->v_femhead == NULL) {
2560 return;
2562 (void) fop_vnevent(vp, VE_RMDIR, dvp, name, ct);
2565 void
2566 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2567 caller_context_t *ct)
2569 if (vp == NULL || vp->v_femhead == NULL) {
2570 return;
2572 (void) fop_vnevent(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2575 void
2576 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2577 caller_context_t *ct)
2579 if (vp == NULL || vp->v_femhead == NULL) {
2580 return;
2582 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2585 void
2586 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2587 caller_context_t *ct)
2589 if (vp == NULL || vp->v_femhead == NULL) {
2590 return;
2592 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2595 void
2596 vnevent_create(vnode_t *vp, caller_context_t *ct)
2598 if (vp == NULL || vp->v_femhead == NULL) {
2599 return;
2601 (void) fop_vnevent(vp, VE_CREATE, NULL, NULL, ct);
2604 void
2605 vnevent_link(vnode_t *vp, caller_context_t *ct)
2607 if (vp == NULL || vp->v_femhead == NULL) {
2608 return;
2610 (void) fop_vnevent(vp, VE_LINK, NULL, NULL, ct);
2613 void
2614 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2616 if (vp == NULL || vp->v_femhead == NULL) {
2617 return;
2619 (void) fop_vnevent(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2622 void
2623 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2625 if (vp == NULL || vp->v_femhead == NULL) {
2626 return;
2628 (void) fop_vnevent(vp, VE_TRUNCATE, NULL, NULL, ct);
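/*
 * Illustrative sketch (not part of the build): a file system fires the
 * vnevent_*() hooks once the corresponding name-space change has actually
 * happened, e.g. from its vop_remove().  examplefs_remove() and
 * examplefs_dirremove() are hypothetical.
 */
#if 0
extern int examplefs_dirremove(vnode_t *, char *, vnode_t **, cred_t *);

static int
examplefs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
	vnode_t *vp;
	int error;

	error = examplefs_dirremove(dvp, nm, &vp, cr);	/* hypothetical */
	if (error == 0) {
		vnevent_remove(vp, dvp, nm, ct);	/* entry is gone */
		VN_RELE(vp);
	}
	return (error);
}
#endif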
2632 * Vnode accessors.
2636 vn_is_readonly(vnode_t *vp)
2638 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2642 vn_has_flocks(vnode_t *vp)
2644 return (vp->v_filocks != NULL);
2648 vn_has_mandatory_locks(vnode_t *vp, int mode)
2650 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2654 vn_has_cached_data(vnode_t *vp)
2656 return (!list_is_empty(&vp->v_pagecache_list));
2660 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2661 * zone_enter(2).
2664 vn_can_change_zones(vnode_t *vp)
2666 struct vfssw *vswp;
2667 int allow = 1;
2668 vnode_t *rvp;
2670 if (nfs_global_client_only != 0)
2671 return (1);
2674 * We always want to look at the underlying vnode if there is one.
2676 if (fop_realvp(vp, &rvp, NULL) != 0)
2677 rvp = vp;
2679 * Some pseudo filesystems (including doorfs) don't actually register
2680 * their vfsops_t, so the following may return NULL; we happily let
2681 * such vnodes switch zones.
2683 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2684 if (vswp != NULL) {
2685 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2686 allow = 0;
2687 vfs_unrefvfssw(vswp);
2689 return (allow);
2693 * Return nonzero if the vnode is a mount point, zero if not.
2696 vn_ismntpt(vnode_t *vp)
2698 return (vp->v_vfsmountedhere != NULL);
2701 /* Retrieve the vfs (if any) mounted on this vnode */
2702 vfs_t *
2703 vn_mountedvfs(vnode_t *vp)
2705 return (vp->v_vfsmountedhere);
2709 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2712 vn_in_dnlc(vnode_t *vp)
2714 return (vp->v_count_dnlc > 0);
2718 * vn_has_other_opens() checks whether a particular file is opened by more than
2719 * just the caller and whether the open is for read and/or write.
2720 * This routine is meant to be called after the caller has already called
2721 * fop_open(), when the caller wishes to know if it is the only one with
2722 * the file open for the mode(s) specified.
2724 * Vnode counts are only kept on regular files (v_type=VREG).
2727 vn_has_other_opens(
2728 vnode_t *vp,
2729 v_mode_t mode)
2732 ASSERT(vp != NULL);
2734 switch (mode) {
2735 case V_WRITE:
2736 if (vp->v_wrcnt > 1)
2737 return (V_TRUE);
2738 break;
2739 case V_RDORWR:
2740 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2741 return (V_TRUE);
2742 break;
2743 case V_RDANDWR:
2744 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2745 return (V_TRUE);
2746 break;
2747 case V_READ:
2748 if (vp->v_rdcnt > 1)
2749 return (V_TRUE);
2750 break;
2753 return (V_FALSE);
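/*
 * Illustrative sketch (not part of the build): a server (e.g. the NFS
 * server) that has already done its own fop_open() can use
 * vn_has_other_opens() to decide whether a delegation is safe to grant.
 * The function name is hypothetical.
 */
#if 0
static boolean_t
example_write_delegation_safe(vnode_t *vp)
{
	/* another reader or writer exists; don't delegate */
	if (vn_has_other_opens(vp, V_RDORWR) == V_TRUE)
		return (B_FALSE);
	return (B_TRUE);
}
#endif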
2757 * vn_is_opened() checks whether a particular file is opened and
2758 * whether the open is for read and/or write.
2760 * Vnode counts are only kept on regular files (v_type=VREG).
2763 vn_is_opened(
2764 vnode_t *vp,
2765 v_mode_t mode)
2768 ASSERT(vp != NULL);
2770 switch (mode) {
2771 case V_WRITE:
2772 if (vp->v_wrcnt)
2773 return (V_TRUE);
2774 break;
2775 case V_RDANDWR:
2776 if (vp->v_rdcnt && vp->v_wrcnt)
2777 return (V_TRUE);
2778 break;
2779 case V_RDORWR:
2780 if (vp->v_rdcnt || vp->v_wrcnt)
2781 return (V_TRUE);
2782 break;
2783 case V_READ:
2784 if (vp->v_rdcnt)
2785 return (V_TRUE);
2786 break;
2789 return (V_FALSE);
2793 * vn_is_mapped() checks whether a particular file is mapped and whether
2794 * the file is mapped read and/or write.
2797 vn_is_mapped(
2798 vnode_t *vp,
2799 v_mode_t mode)
2802 ASSERT(vp != NULL);
2804 #if !defined(_LP64)
2805 switch (mode) {
2807 * The atomic_add_64_nv functions force atomicity in the
2808 * case of 32 bit architectures. Otherwise the 64 bit values
2809 * require two fetches. The value of the fields may be
2810 * (potentially) changed between the first fetch and the
2811 * second.
2813 case V_WRITE:
2814 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2815 return (V_TRUE);
2816 break;
2817 case V_RDANDWR:
2818 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2819 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2820 return (V_TRUE);
2821 break;
2822 case V_RDORWR:
2823 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2824 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2825 return (V_TRUE);
2826 break;
2827 case V_READ:
2828 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2829 return (V_TRUE);
2830 break;
2832 #else
2833 switch (mode) {
2834 case V_WRITE:
2835 if (vp->v_mmap_write)
2836 return (V_TRUE);
2837 break;
2838 case V_RDANDWR:
2839 if (vp->v_mmap_read && vp->v_mmap_write)
2840 return (V_TRUE);
2841 break;
2842 case V_RDORWR:
2843 if (vp->v_mmap_read || vp->v_mmap_write)
2844 return (V_TRUE);
2845 break;
2846 case V_READ:
2847 if (vp->v_mmap_read)
2848 return (V_TRUE);
2849 break;
2851 #endif
2853 return (V_FALSE);
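/*
 * Illustrative sketch (not part of the build): the atomic_add_64_nv(p, 0)
 * idiom used above is how a 64-bit counter can be read atomically on
 * 32-bit kernels, where a plain load would take two fetches.  The helper
 * name is hypothetical.
 */
#if 0
static uint64_t
example_atomic_read_64(volatile uint64_t *p)
{
#if !defined(_LP64)
	return (atomic_add_64_nv(p, 0));	/* single atomic fetch */
#else
	return (*p);				/* 64-bit loads are atomic */
#endif
}
#endif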
2857 * Set the operations vector for a vnode.
2859 * FEM ensures that the v_femhead pointer is filled in before the
2860 * v_op pointer is changed. This means that if the v_femhead pointer
2861 * is NULL, and the v_op field hasn't changed since we last checked
2862 * the v_femhead pointer, then our update is OK - we are not racing with
2863 * FEM.
2865 void
2866 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2868 vnodeops_t *op;
2870 ASSERT(vp != NULL);
2871 ASSERT(vnodeops != NULL);
2873 op = vp->v_op;
2874 membar_consumer();
2876 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
2877 * the compare-and-swap on vp->v_op. If either fails, then FEM is
2878 * in effect on the vnode and we need to have FEM deal with it.
2880 if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
2881 op) {
2882 fem_setvnops(vp, vnodeops);
2887 * Retrieve the operations vector for a vnode
2888 * As with vn_setops() above, make sure we aren't racing with FEM.
2889 * FEM sets the v_op to a special, internal vnodeops that wouldn't
2890 * make sense to the callers of this routine.
2892 vnodeops_t *
2893 vn_getops(vnode_t *vp)
2895 vnodeops_t *op;
2897 ASSERT(vp != NULL);
2899 op = vp->v_op;
2900 membar_consumer();
2901 if (vp->v_femhead == NULL && op == vp->v_op) {
2902 return (op);
2903 } else {
2904 return (fem_getvnops(vp));
2909 * Returns non-zero (1) if the vnodeops matches that of the vnode.
2910 * Returns zero (0) if not.
2913 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2915 return (vn_getops(vp) == vnodeops);
2919 * Returns non-zero (1) if the specified operation matches the
2920 * corresponding operation for that of the vnode.
2921 * Returns zero (0) if not.
2924 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2927 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2929 const fs_operation_trans_def_t *otdp;
2930 fs_generic_func_p *loc = NULL;
2931 vnodeops_t *vop = vn_getops(vp);
2933 ASSERT(vopname != NULL);
2935 for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2936 if (MATCHNAME(otdp->name, vopname)) {
2937 loc = (fs_generic_func_p *)
2938 ((char *)(vop) + otdp->offset);
2939 break;
2943 return ((loc != NULL) && (*loc == funcp));
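/*
 * Illustrative sketch (not part of the build): vn_matchopval() lets a
 * caller ask whether a vnode's entry point is one particular function,
 * e.g. to special-case a known file system.  examplefs_read is
 * hypothetical.
 */
#if 0
static int
example_is_examplefs_read(vnode_t *vp)
{
	return (vn_matchopval(vp, VOPNAME_READ,
	    (fs_generic_func_p)examplefs_read));
}
#endif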
2947 * fs_new_caller_id() needs to return a unique ID on a given local system.
2948 * The IDs do not need to survive across reboots. These are primarily
2949 * used so that (FEM) monitors can detect particular callers (such as
2950 * the NFS server) to a given vnode/vfs operation.
2952 u_longlong_t
2953 fs_new_caller_id()
2955 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2957 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
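/*
 * Illustrative sketch (not part of the build): a FEM monitor typically
 * obtains one caller ID at initialization and tags every operation it
 * issues with it, so its own calls can be recognized later.  Names are
 * hypothetical.
 */
#if 0
static u_longlong_t example_monitor_caller_id;

static void
example_monitor_init(void)
{
	example_monitor_caller_id = fs_new_caller_id();
}
#endif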
2961 * Given a starting vnode and a path, updates the path in the target vnode in
2962 * a safe manner. If the vnode already has path information embedded, then the
2963 * cached path is left untouched.
2966 size_t max_vnode_path = 4 * MAXPATHLEN;
2968 void
2969 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2970 const char *path, size_t plen)
2972 char *rpath;
2973 vnode_t *base;
2974 size_t rpathlen, rpathalloc;
2975 int doslash = 1;
2977 if (*path == '/') {
2978 base = rootvp;
2979 path++;
2980 plen--;
2981 } else {
2982 base = startvp;
2986 * We cannot grab base->v_lock while we hold vp->v_lock because of
2987 * the potential for deadlock.
2989 mutex_enter(&base->v_lock);
2990 if (base->v_path == NULL) {
2991 mutex_exit(&base->v_lock);
2992 return;
2995 rpathlen = strlen(base->v_path);
2996 rpathalloc = rpathlen + plen + 1;
2997 /* Avoid adding a slash if there's already one there */
2998 if (base->v_path[rpathlen-1] == '/')
2999 doslash = 0;
3000 else
3001 rpathalloc++;
3004 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
3005 * so we must do this dance. If, by chance, something changes the path,
3006 * just give up since there is no real harm.
3008 mutex_exit(&base->v_lock);
3010 /* Paths should stay within reason */
3011 if (rpathalloc > max_vnode_path)
3012 return;
3014 rpath = kmem_alloc(rpathalloc, KM_SLEEP);
3016 mutex_enter(&base->v_lock);
3017 if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
3018 mutex_exit(&base->v_lock);
3019 kmem_free(rpath, rpathalloc);
3020 return;
3022 bcopy(base->v_path, rpath, rpathlen);
3023 mutex_exit(&base->v_lock);
3025 if (doslash)
3026 rpath[rpathlen++] = '/';
3027 bcopy(path, rpath + rpathlen, plen);
3028 rpath[rpathlen + plen] = '\0';
3030 mutex_enter(&vp->v_lock);
3031 if (vp->v_path != NULL) {
3032 mutex_exit(&vp->v_lock);
3033 kmem_free(rpath, rpathalloc);
3034 } else {
3035 vp->v_path = rpath;
3036 mutex_exit(&vp->v_lock);
3041 * Sets the path of the vnode to the given string, regardless of current
3042 * context. The string must be a complete path from rootdir. This is only used
3043 * by fsop_root() for setting the path based on the mountpoint.
3045 void
3046 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3048 char *buf = kmem_alloc(len + 1, KM_SLEEP);
3050 mutex_enter(&vp->v_lock);
3051 if (vp->v_path != NULL) {
3052 mutex_exit(&vp->v_lock);
3053 kmem_free(buf, len + 1);
3054 return;
3057 vp->v_path = buf;
3058 bcopy(str, vp->v_path, len);
3059 vp->v_path[len] = '\0';
3061 mutex_exit(&vp->v_lock);
3065 * Called from within a filesystem's vop_rename() to handle renames once the
3066 * target vnode is available.
3068 void
3069 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3071 char *tmp;
3073 mutex_enter(&vp->v_lock);
3074 tmp = vp->v_path;
3075 vp->v_path = NULL;
3076 mutex_exit(&vp->v_lock);
3077 vn_setpath(rootdir, dvp, vp, nm, len);
3078 if (tmp != NULL)
3079 kmem_free(tmp, strlen(tmp) + 1);
3083 * Similar to vn_setpath_str(), this function sets the path of the destination
3084 * vnode to be the same as that of the source vnode.
3086 void
3087 vn_copypath(struct vnode *src, struct vnode *dst)
3089 char *buf;
3090 int alloc;
3092 mutex_enter(&src->v_lock);
3093 if (src->v_path == NULL) {
3094 mutex_exit(&src->v_lock);
3095 return;
3097 alloc = strlen(src->v_path) + 1;
3099 /* avoid kmem_alloc() with lock held */
3100 mutex_exit(&src->v_lock);
3101 buf = kmem_alloc(alloc, KM_SLEEP);
3102 mutex_enter(&src->v_lock);
3103 if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3104 mutex_exit(&src->v_lock);
3105 kmem_free(buf, alloc);
3106 return;
3108 bcopy(src->v_path, buf, alloc);
3109 mutex_exit(&src->v_lock);
3111 mutex_enter(&dst->v_lock);
3112 if (dst->v_path != NULL) {
3113 mutex_exit(&dst->v_lock);
3114 kmem_free(buf, alloc);
3115 return;
3117 dst->v_path = buf;
3118 mutex_exit(&dst->v_lock);
3122 * XXX Private interface for segvn routines that handle vnode
3123 * large page segments.
3125 * return 1 if vp's file system fop_pageio() implementation
3126 * can be safely used instead of fop_getpage() for handling
3127 * pagefaults against regular non-swap files. The fop_pageio()
3128 * interface is considered safe here if its implementation
3129 * is very close to the fop_getpage() implementation.
3130 * e.g. It zeroes out the part of the page beyond EOF. Doesn't
3131 * panic if there are file holes but instead returns an error.
3132 * Doesn't assume file won't be changed by user writes, etc.
3134 * return 0 otherwise.
3136 * For now allow segvn to only use fop_pageio() with ufs and nfs.
3139 vn_vmpss_usepageio(vnode_t *vp)
3141 vfs_t *vfsp = vp->v_vfsp;
3142 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3143 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3144 char **fsok = pageio_ok_fss;
3146 if (fsname == NULL) {
3147 return (0);
3150 for (; *fsok; fsok++) {
3151 if (strcmp(*fsok, fsname) == 0) {
3152 return (1);
3155 return (0);
3158 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3161 fop_open(
3162 vnode_t **vpp,
3163 int mode,
3164 cred_t *cr,
3165 caller_context_t *ct)
3167 int ret;
3168 vnode_t *vp = *vpp;
3170 VN_HOLD(vp);
3172 * Adding to the vnode counts before calling open
3173 * avoids the need for a mutex. It circumvents a race
3174 * condition where a query made on the vnode counts results in a
3175 * false negative. The inquirer goes away believing the file is
3176 * not open when there is an open on the file already under way.
3178 * The counts are meant to prevent NFS from granting a delegation
3179 * when it would be dangerous to do so.
3181 * The vnode counts are only kept on regular files
3183 if ((*vpp)->v_type == VREG) {
3184 if (mode & FREAD)
3185 atomic_inc_32(&(*vpp)->v_rdcnt);
3186 if (mode & FWRITE)
3187 atomic_inc_32(&(*vpp)->v_wrcnt);
3190 VOPXID_MAP_CR(vp, cr);
3192 if ((*vpp)->v_op->vop_open == NULL)
3193 ret = ENOSYS;
3194 else
3195 ret = (*vpp)->v_op->vop_open(vpp, mode, cr, ct);
3197 if (ret) {
3199 * Use the saved vp just in case the vnode ptr got trashed
3200 * by the error.
3202 VOPSTATS_UPDATE(vp, open);
3203 if ((vp->v_type == VREG) && (mode & FREAD))
3204 atomic_dec_32(&vp->v_rdcnt);
3205 if ((vp->v_type == VREG) && (mode & FWRITE))
3206 atomic_dec_32(&vp->v_wrcnt);
3207 } else {
3209 * Some filesystems will return a different vnode,
3210 * but the same path was still used to open it.
3211 * So if we do change the vnode and need to
3212 * copy over the path, do so here, rather than special
3213 * casing each filesystem. Adjust the vnode counts to
3214 * reflect the vnode switch.
3216 VOPSTATS_UPDATE(*vpp, open);
3217 if (*vpp != vp && *vpp != NULL) {
3218 vn_copypath(vp, *vpp);
3219 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3220 atomic_inc_32(&(*vpp)->v_rdcnt);
3221 if ((vp->v_type == VREG) && (mode & FREAD))
3222 atomic_dec_32(&vp->v_rdcnt);
3223 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3224 atomic_inc_32(&(*vpp)->v_wrcnt);
3225 if ((vp->v_type == VREG) && (mode & FWRITE))
3226 atomic_dec_32(&vp->v_wrcnt);
3229 VN_RELE(vp);
3230 return (ret);
3234 fop_close(
3235 vnode_t *vp,
3236 int flag,
3237 int count,
3238 offset_t offset,
3239 cred_t *cr,
3240 caller_context_t *ct)
3242 int err;
3244 VOPXID_MAP_CR(vp, cr);
3246 if (vp->v_op->vop_close == NULL)
3247 err = ENOSYS;
3248 else
3249 err = vp->v_op->vop_close(vp, flag, count, offset, cr, ct);
3251 VOPSTATS_UPDATE(vp, close);
3253 * Check the passed-in count to handle possible dups. Vnode counts are only
3254 * kept on regular files
3256 if ((vp->v_type == VREG) && (count == 1)) {
3257 if (flag & FREAD) {
3258 ASSERT(vp->v_rdcnt > 0);
3259 atomic_dec_32(&vp->v_rdcnt);
3261 if (flag & FWRITE) {
3262 ASSERT(vp->v_wrcnt > 0);
3263 atomic_dec_32(&vp->v_wrcnt);
3266 return (err);
3270 fop_read(
3271 vnode_t *vp,
3272 uio_t *uiop,
3273 int ioflag,
3274 cred_t *cr,
3275 caller_context_t *ct)
3277 int err;
3278 ssize_t resid_start = uiop->uio_resid;
3280 VOPXID_MAP_CR(vp, cr);
3282 if (vp->v_op->vop_read == NULL)
3283 err = ENOSYS;
3284 else
3285 err = vp->v_op->vop_read(vp, uiop, ioflag, cr, ct);
3287 VOPSTATS_UPDATE_IO(vp, read,
3288 read_bytes, (resid_start - uiop->uio_resid));
3289 return (err);
3293 fop_write(
3294 vnode_t *vp,
3295 uio_t *uiop,
3296 int ioflag,
3297 cred_t *cr,
3298 caller_context_t *ct)
3300 int err;
3301 ssize_t resid_start = uiop->uio_resid;
3303 VOPXID_MAP_CR(vp, cr);
3305 if (vp->v_op->vop_write == NULL)
3306 err = ENOSYS;
3307 else
3308 err = vp->v_op->vop_write(vp, uiop, ioflag, cr, ct);
3310 VOPSTATS_UPDATE_IO(vp, write,
3311 write_bytes, (resid_start - uiop->uio_resid));
3312 return (err);
3316 fop_ioctl(
3317 vnode_t *vp,
3318 int cmd,
3319 intptr_t arg,
3320 int flag,
3321 cred_t *cr,
3322 int *rvalp,
3323 caller_context_t *ct)
3325 int err;
3327 VOPXID_MAP_CR(vp, cr);
3329 if (vp->v_op->vop_ioctl == NULL)
3330 err = ENOSYS;
3331 else
3332 err = vp->v_op->vop_ioctl(vp, cmd, arg, flag, cr, rvalp, ct);
3334 VOPSTATS_UPDATE(vp, ioctl);
3335 return (err);
3339 fop_setfl(
3340 vnode_t *vp,
3341 int oflags,
3342 int nflags,
3343 cred_t *cr,
3344 caller_context_t *ct)
3346 int err;
3348 VOPXID_MAP_CR(vp, cr);
3350 if (vp->v_op->vop_setfl == NULL)
3351 err = fs_setfl(vp, oflags, nflags, cr, ct);
3352 else
3353 err = vp->v_op->vop_setfl(vp, oflags, nflags, cr, ct);
3355 VOPSTATS_UPDATE(vp, setfl);
3356 return (err);
3360 fop_getattr(
3361 vnode_t *vp,
3362 vattr_t *vap,
3363 int flags,
3364 cred_t *cr,
3365 caller_context_t *ct)
3367 int err;
3369 VOPXID_MAP_CR(vp, cr);
3372 * If this file system doesn't understand the xvattr extensions
3373 * then turn off the xvattr bit.
3375 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3376 vap->va_mask &= ~AT_XVATTR;
3380 * We're only allowed to skip the ACL check iff we used a 32 bit
3381 * ACE mask with fop_access() to determine permissions.
3383 if ((flags & ATTR_NOACLCHECK) &&
3384 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3385 return (EINVAL);
3387 if (vp->v_op->vop_getattr == NULL)
3388 err = ENOSYS;
3389 else
3390 err = vp->v_op->vop_getattr(vp, vap, flags, cr, ct);
3392 VOPSTATS_UPDATE(vp, getattr);
3393 return (err);
3397 fop_setattr(
3398 vnode_t *vp,
3399 vattr_t *vap,
3400 int flags,
3401 cred_t *cr,
3402 caller_context_t *ct)
3404 int err;
3406 VOPXID_MAP_CR(vp, cr);
3409 * If this file system doesn't understand the xvattr extensions
3410 * then turn off the xvattr bit.
3412 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3413 vap->va_mask &= ~AT_XVATTR;
3417 * We're only allowed to skip the ACL check iff we used a 32 bit
3418 * ACE mask with fop_access() to determine permissions.
3420 if ((flags & ATTR_NOACLCHECK) &&
3421 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3422 return (EINVAL);
3424 if (vp->v_op->vop_setattr == NULL)
3425 err = ENOSYS;
3426 else
3427 err = vp->v_op->vop_setattr(vp, vap, flags, cr, ct);
3429 VOPSTATS_UPDATE(vp, setattr);
3430 return (err);
3434 fop_access(
3435 vnode_t *vp,
3436 int mode,
3437 int flags,
3438 cred_t *cr,
3439 caller_context_t *ct)
3441 int err;
3443 if ((flags & V_ACE_MASK) &&
3444 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3445 return (EINVAL);
3448 VOPXID_MAP_CR(vp, cr);
3450 if (vp->v_op->vop_access == NULL)
3451 err = ENOSYS;
3452 else
3453 err = vp->v_op->vop_access(vp, mode, flags, cr, ct);
3455 VOPSTATS_UPDATE(vp, access);
3456 return (err);
3460 fop_lookup(
3461 vnode_t *dvp,
3462 char *nm,
3463 vnode_t **vpp,
3464 pathname_t *pnp,
3465 int flags,
3466 vnode_t *rdir,
3467 cred_t *cr,
3468 caller_context_t *ct,
3469 int *deflags, /* Returned per-dirent flags */
3470 pathname_t *ppnp) /* Returned case-preserved name in directory */
3472 int ret;
3475 * If this file system doesn't support case-insensitive access
3476 * and said access is requested, fail quickly. It is required
3477 * that if the vfs supports case-insensitive lookup, it also
3478 * supports extended dirent flags.
3480 if (flags & FIGNORECASE &&
3481 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3482 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3483 return (EINVAL);
3485 VOPXID_MAP_CR(dvp, cr);
3487 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3488 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3489 } else if (dvp->v_op->vop_lookup == NULL) {
3490 ret = ENOSYS;
3491 } else {
3492 ret = dvp->v_op->vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
3493 cr, ct, deflags, ppnp);
3496 if (ret == 0 && *vpp) {
3497 VOPSTATS_UPDATE(*vpp, lookup);
3498 if ((*vpp)->v_path == NULL) {
3499 vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3503 return (ret);
3507 fop_create(
3508 vnode_t *dvp,
3509 char *name,
3510 vattr_t *vap,
3511 vcexcl_t excl,
3512 int mode,
3513 vnode_t **vpp,
3514 cred_t *cr,
3515 int flags,
3516 caller_context_t *ct,
3517 vsecattr_t *vsecp) /* ACL to set during create */
3519 int ret;
3521 if (vsecp != NULL &&
3522 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3523 return (EINVAL);
3526 * If this file system doesn't support case-insensitive access
3527 * and said access is requested, fail quickly.
3529 if (flags & FIGNORECASE &&
3530 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3531 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3532 return (EINVAL);
3534 VOPXID_MAP_CR(dvp, cr);
3536 if (dvp->v_op->vop_create == NULL)
3537 ret = ENOSYS;
3538 else
3539 ret = dvp->v_op->vop_create(dvp, name, vap, excl, mode, vpp,
3540 cr, flags, ct, vsecp);
3542 if (ret == 0 && *vpp) {
3543 VOPSTATS_UPDATE(*vpp, create);
3544 if ((*vpp)->v_path == NULL) {
3545 vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3549 return (ret);
3553 fop_remove(
3554 vnode_t *dvp,
3555 char *nm,
3556 cred_t *cr,
3557 caller_context_t *ct,
3558 int flags)
3560 int err;
3563 * If this file system doesn't support case-insensitive access
3564 * and said access is requested, fail quickly.
3566 if (flags & FIGNORECASE &&
3567 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3568 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3569 return (EINVAL);
3571 VOPXID_MAP_CR(dvp, cr);
3573 if (dvp->v_op->vop_remove == NULL)
3574 err = ENOSYS;
3575 else
3576 err = dvp->v_op->vop_remove(dvp, nm, cr, ct, flags);
3578 VOPSTATS_UPDATE(dvp, remove);
3579 return (err);
3583 fop_link(
3584 vnode_t *tdvp,
3585 vnode_t *svp,
3586 char *tnm,
3587 cred_t *cr,
3588 caller_context_t *ct,
3589 int flags)
3591 int err;
3594 * If the target file system doesn't support case-insensitive access
3595 * and said access is requested, fail quickly.
3597 if (flags & FIGNORECASE &&
3598 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3599 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3600 return (EINVAL);
3602 VOPXID_MAP_CR(tdvp, cr);
3604 if (tdvp->v_op->vop_link == NULL)
3605 err = ENOSYS;
3606 else
3607 err = tdvp->v_op->vop_link(tdvp, svp, tnm, cr, ct, flags);
3609 VOPSTATS_UPDATE(tdvp, link);
3610 return (err);
3614 fop_rename(
3615 vnode_t *sdvp,
3616 char *snm,
3617 vnode_t *tdvp,
3618 char *tnm,
3619 cred_t *cr,
3620 caller_context_t *ct,
3621 int flags)
3623 int err;
3626 * If the file system involved does not support
3627 * case-insensitive access and said access is requested, fail
3628 * quickly.
3630 if (flags & FIGNORECASE &&
3631 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3632 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3633 return (EINVAL);
3635 VOPXID_MAP_CR(tdvp, cr);
3637 if (sdvp->v_op->vop_rename == NULL)
3638 err = ENOSYS;
3639 else
3640 err = sdvp->v_op->vop_rename(sdvp, snm, tdvp, tnm, cr, ct,
3641 flags);
3643 VOPSTATS_UPDATE(sdvp, rename);
3644 return (err);
3648 fop_mkdir(
3649 vnode_t *dvp,
3650 char *dirname,
3651 vattr_t *vap,
3652 vnode_t **vpp,
3653 cred_t *cr,
3654 caller_context_t *ct,
3655 int flags,
3656 vsecattr_t *vsecp) /* ACL to set during create */
3658 int ret;
3660 if (vsecp != NULL &&
3661 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3662 return (EINVAL);
3665 * If this file system doesn't support case-insensitive access
3666 * and said access is requested, fail quickly.
3668 if (flags & FIGNORECASE &&
3669 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3670 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3671 return (EINVAL);
3673 VOPXID_MAP_CR(dvp, cr);
3675 if (dvp->v_op->vop_mkdir == NULL)
3676 ret = ENOSYS;
3677 else
3678 ret = dvp->v_op->vop_mkdir(dvp, dirname, vap, vpp, cr, ct,
3679 flags, vsecp);
3681 if (ret == 0 && *vpp) {
3682 VOPSTATS_UPDATE(*vpp, mkdir);
3683 if ((*vpp)->v_path == NULL) {
3684 vn_setpath(rootdir, dvp, *vpp, dirname,
3685 strlen(dirname));
3689 return (ret);
3693 fop_rmdir(
3694 vnode_t *dvp,
3695 char *nm,
3696 vnode_t *cdir,
3697 cred_t *cr,
3698 caller_context_t *ct,
3699 int flags)
3701 int err;
3704 * If this file system doesn't support case-insensitive access
3705 * and said access is requested, fail quickly.
3707 if (flags & FIGNORECASE &&
3708 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3709 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3710 return (EINVAL);
3712 VOPXID_MAP_CR(dvp, cr);
3714 if (dvp->v_op->vop_rmdir == NULL)
3715 err = ENOSYS;
3716 else
3717 err = dvp->v_op->vop_rmdir(dvp, nm, cdir, cr, ct, flags);
3719 VOPSTATS_UPDATE(dvp, rmdir);
3720 return (err);
3724 fop_readdir(
3725 vnode_t *vp,
3726 uio_t *uiop,
3727 cred_t *cr,
3728 int *eofp,
3729 caller_context_t *ct,
3730 int flags)
3732 int err;
3733 ssize_t resid_start = uiop->uio_resid;
3736 * If this file system doesn't support retrieving directory
3737 * entry flags and said access is requested, fail quickly.
3739 if (flags & V_RDDIR_ENTFLAGS &&
3740 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3741 return (EINVAL);
3743 VOPXID_MAP_CR(vp, cr);
3745 if (vp->v_op->vop_readdir == NULL)
3746 err = ENOSYS;
3747 else
3748 err = vp->v_op->vop_readdir(vp, uiop, cr, eofp, ct, flags);
3750 VOPSTATS_UPDATE_IO(vp, readdir,
3751 readdir_bytes, (resid_start - uiop->uio_resid));
3752 return (err);
3756 fop_symlink(
3757 vnode_t *dvp,
3758 char *linkname,
3759 vattr_t *vap,
3760 char *target,
3761 cred_t *cr,
3762 caller_context_t *ct,
3763 int flags)
3765 int err;
3766 xvattr_t xvattr;
3769 * If this file system doesn't support case-insensitive access
3770 * and said access is requested, fail quickly.
3772 if (flags & FIGNORECASE &&
3773 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3774 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3775 return (EINVAL);
3777 VOPXID_MAP_CR(dvp, cr);
3779 /* check for reparse point */
3780 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3781 (strncmp(target, FS_REPARSE_TAG_STR,
3782 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3783 if (!fs_reparse_mark(target, vap, &xvattr))
3784 vap = (vattr_t *)&xvattr;
3787 if (dvp->v_op->vop_symlink == NULL)
3788 err = ENOSYS;
3789 else
3790 err = dvp->v_op->vop_symlink(dvp, linkname, vap, target, cr,
3791 ct, flags);
3793 VOPSTATS_UPDATE(dvp, symlink);
3794 return (err);
3798 fop_readlink(
3799 vnode_t *vp,
3800 uio_t *uiop,
3801 cred_t *cr,
3802 caller_context_t *ct)
3804 int err;
3806 VOPXID_MAP_CR(vp, cr);
3808 if (vp->v_op->vop_readlink == NULL)
3809 err = ENOSYS;
3810 else
3811 err = vp->v_op->vop_readlink(vp, uiop, cr, ct);
3813 VOPSTATS_UPDATE(vp, readlink);
3814 return (err);
3818 fop_fsync(
3819 vnode_t *vp,
3820 int syncflag,
3821 cred_t *cr,
3822 caller_context_t *ct)
3824 int err;
3826 VOPXID_MAP_CR(vp, cr);
3828 if (vp->v_op->vop_fsync == NULL)
3829 err = ENOSYS;
3830 else
3831 err = vp->v_op->vop_fsync(vp, syncflag, cr, ct);
3833 VOPSTATS_UPDATE(vp, fsync);
3834 return (err);
3837 void
3838 fop_inactive(
3839 vnode_t *vp,
3840 cred_t *cr,
3841 caller_context_t *ct)
3843 /* Need to update stats before vop call since we may lose the vnode */
3844 VOPSTATS_UPDATE(vp, inactive);
3846 VOPXID_MAP_CR(vp, cr);
3848 if (vp->v_op->vop_inactive != NULL)
3849 vp->v_op->vop_inactive(vp, cr, ct);
3853 fop_fid(
3854 vnode_t *vp,
3855 fid_t *fidp,
3856 caller_context_t *ct)
3858 int err;
3860 if (vp->v_op->vop_fid == NULL)
3861 err = ENOSYS;
3862 else
3863 err = vp->v_op->vop_fid(vp, fidp, ct);
3865 VOPSTATS_UPDATE(vp, fid);
3866 return (err);
3870 fop_rwlock(
3871 vnode_t *vp,
3872 int write_lock,
3873 caller_context_t *ct)
3875 int ret;
3877 if (vp->v_op->vop_rwlock == NULL)
3878 ret = fs_rwlock(vp, write_lock, ct);
3879 else
3880 ret = vp->v_op->vop_rwlock(vp, write_lock, ct);
3882 VOPSTATS_UPDATE(vp, rwlock);
3883 return (ret);
3886 void
3887 fop_rwunlock(
3888 vnode_t *vp,
3889 int write_lock,
3890 caller_context_t *ct)
3892 if (vp->v_op->vop_rwunlock == NULL)
3893 fs_rwunlock(vp, write_lock, ct);
3894 else
3895 vp->v_op->vop_rwunlock(vp, write_lock, ct);
3897 VOPSTATS_UPDATE(vp, rwunlock);
3901 fop_seek(
3902 vnode_t *vp,
3903 offset_t ooff,
3904 offset_t *noffp,
3905 caller_context_t *ct)
3907 int err;
3909 if (vp->v_op->vop_seek == NULL)
3910 err = ENOSYS;
3911 else
3912 err = vp->v_op->vop_seek(vp, ooff, noffp, ct);
3914 VOPSTATS_UPDATE(vp, seek);
3915 return (err);
3919 fop_cmp(
3920 vnode_t *vp1,
3921 vnode_t *vp2,
3922 caller_context_t *ct)
3924 int err;
3926 if (vp1->v_op->vop_cmp == NULL)
3927 err = fs_cmp(vp1, vp2, ct);
3928 else
3929 err = vp1->v_op->vop_cmp(vp1, vp2, ct);
3931 VOPSTATS_UPDATE(vp1, cmp);
3932 return (err);
3936 fop_frlock(
3937 vnode_t *vp,
3938 int cmd,
3939 flock64_t *bfp,
3940 int flag,
3941 offset_t offset,
3942 struct flk_callback *flk_cbp,
3943 cred_t *cr,
3944 caller_context_t *ct)
3946 int err;
3948 VOPXID_MAP_CR(vp, cr);
3950 if (vp->v_op->vop_frlock == NULL)
3951 err = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3952 else
3953 err = vp->v_op->vop_frlock(vp, cmd, bfp, flag, offset,
3954 flk_cbp, cr, ct);
3956 VOPSTATS_UPDATE(vp, frlock);
3957 return (err);
3961 fop_space(
3962 vnode_t *vp,
3963 int cmd,
3964 flock64_t *bfp,
3965 int flag,
3966 offset_t offset,
3967 cred_t *cr,
3968 caller_context_t *ct)
3970 int err;
3972 VOPXID_MAP_CR(vp, cr);
3974 if (vp->v_op->vop_space == NULL)
3975 err = ENOSYS;
3976 else
3977 err = vp->v_op->vop_space(vp, cmd, bfp, flag, offset, cr, ct);
3979 VOPSTATS_UPDATE(vp, space);
3980 return (err);
3984 fop_realvp(
3985 vnode_t *vp,
3986 vnode_t **vpp,
3987 caller_context_t *ct)
3989 int err;
3991 if (vp->v_op->vop_realvp == NULL)
3992 err = ENOSYS;
3993 else
3994 err = vp->v_op->vop_realvp(vp, vpp, ct);
3996 VOPSTATS_UPDATE(vp, realvp);
3997 return (err);
4001 fop_getpage(
4002 vnode_t *vp,
4003 offset_t off,
4004 size_t len,
4005 uint_t *protp,
4006 page_t **plarr,
4007 size_t plsz,
4008 struct seg *seg,
4009 caddr_t addr,
4010 enum seg_rw rw,
4011 cred_t *cr,
4012 caller_context_t *ct)
4014 int err;
4016 VOPXID_MAP_CR(vp, cr);
4018 if (vp->v_op->vop_getpage == NULL)
4019 err = ENOSYS;
4020 else
4021 err = vp->v_op->vop_getpage(vp, off, len, protp, plarr,
4022 plsz, seg, addr, rw, cr, ct);
4024 VOPSTATS_UPDATE(vp, getpage);
4025 return (err);
4029 fop_putpage(
4030 vnode_t *vp,
4031 offset_t off,
4032 size_t len,
4033 int flags,
4034 cred_t *cr,
4035 caller_context_t *ct)
4037 int err;
4039 VOPXID_MAP_CR(vp, cr);
4041 if (vp->v_op->vop_putpage == NULL)
4042 err = ENOSYS;
4043 else
4044 err = vp->v_op->vop_putpage(vp, off, len, flags, cr, ct);
4045 VOPSTATS_UPDATE(vp, putpage);
4046 return (err);
4050 fop_map(
4051 vnode_t *vp,
4052 offset_t off,
4053 struct as *as,
4054 caddr_t *addrp,
4055 size_t len,
4056 uchar_t prot,
4057 uchar_t maxprot,
4058 uint_t flags,
4059 cred_t *cr,
4060 caller_context_t *ct)
4062 int err;
4064 VOPXID_MAP_CR(vp, cr);
4066 if (vp->v_op->vop_map == NULL)
4067 err = ENOSYS;
4068 else
4069 err = vp->v_op->vop_map(vp, off, as, addrp, len, prot,
4070 maxprot, flags, cr, ct);
4072 VOPSTATS_UPDATE(vp, map);
4073 return (err);
4077 fop_addmap(
4078 vnode_t *vp,
4079 offset_t off,
4080 struct as *as,
4081 caddr_t addr,
4082 size_t len,
4083 uchar_t prot,
4084 uchar_t maxprot,
4085 uint_t flags,
4086 cred_t *cr,
4087 caller_context_t *ct)
4089 int error;
4090 u_longlong_t delta;
4092 VOPXID_MAP_CR(vp, cr);
4094 if (vp->v_op->vop_addmap == NULL)
4095 error = ENOSYS;
4096 else
4097 error = vp->v_op->vop_addmap(vp, off, as, addr, len, prot,
4098 maxprot, flags, cr, ct);
4100 if ((!error) && (vp->v_type == VREG)) {
4101 delta = (u_longlong_t)btopr(len);
4103 * If file is declared MAP_PRIVATE, it can't be written back
4104 * even if open for write. Handle as read.
4106 if (flags & MAP_PRIVATE) {
4107 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4108 (int64_t)delta);
4109 } else {
4111 * atomic_add_64 forces the fetch of a 64 bit value to
4112 * be atomic on 32 bit machines
4114 if (maxprot & PROT_WRITE)
4115 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4116 (int64_t)delta);
4117 if (maxprot & PROT_READ)
4118 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4119 (int64_t)delta);
4120 if (maxprot & PROT_EXEC)
4121 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4122 (int64_t)delta);
4125 VOPSTATS_UPDATE(vp, addmap);
4126 return (error);
4130 fop_delmap(
4131 vnode_t *vp,
4132 offset_t off,
4133 struct as *as,
4134 caddr_t addr,
4135 size_t len,
4136 uint_t prot,
4137 uint_t maxprot,
4138 uint_t flags,
4139 cred_t *cr,
4140 caller_context_t *ct)
4142 int error;
4143 u_longlong_t delta;
4145 VOPXID_MAP_CR(vp, cr);
4147 if (vp->v_op->vop_delmap == NULL)
4148 error = ENOSYS;
4149 else
4150 error = vp->v_op->vop_delmap(vp, off, as, addr, len, prot,
4151 maxprot, flags, cr, ct);
4154 * NFS calls into delmap twice: the first time
4155 * it simply establishes a callback mechanism and returns EAGAIN,
4156 * while the real work is done upon the second invocation.
4157 * We have to detect this here and only decrement the counts upon
4158 * the second delmap request.
4160 if ((error != EAGAIN) && (vp->v_type == VREG)) {
4162 delta = (u_longlong_t)btopr(len);
4164 if (flags & MAP_PRIVATE) {
4165 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4166 (int64_t)(-delta));
4167 } else {
4169 * atomic_add_64 forces the fetch of a 64 bit value
4170 * to be atomic on 32 bit machines
4172 if (maxprot & PROT_WRITE)
4173 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4174 (int64_t)(-delta));
4175 if (maxprot & PROT_READ)
4176 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4177 (int64_t)(-delta));
4178 if (maxprot & PROT_EXEC)
4179 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4180 (int64_t)(-delta));
4183 VOPSTATS_UPDATE(vp, delmap);
4184 return (error);
4189 fop_poll(
4190 vnode_t *vp,
4191 short events,
4192 int anyyet,
4193 short *reventsp,
4194 struct pollhead **phpp,
4195 caller_context_t *ct)
4197 int err;
4199 if (vp->v_op->vop_poll == NULL)
4200 err = fs_poll(vp, events, anyyet, reventsp, phpp, ct);
4201 else
4202 err = vp->v_op->vop_poll(vp, events, anyyet, reventsp, phpp,
4203 ct);
4205 VOPSTATS_UPDATE(vp, poll);
4206 return (err);
4210 fop_dump(
4211 vnode_t *vp,
4212 caddr_t addr,
4213 offset_t lbdn,
4214 offset_t dblks,
4215 caller_context_t *ct)
4217 int err;
4219 /* ensure lbdn and dblks can be passed safely to bdev_dump */
4220 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4221 return (EIO);
4223 if (vp->v_op->vop_dump == NULL)
4224 err = ENOSYS;
4225 else
4226 err = vp->v_op->vop_dump(vp, addr, lbdn, dblks, ct);
4228 VOPSTATS_UPDATE(vp, dump);
4229 return (err);
4233 fop_pathconf(
4234 vnode_t *vp,
4235 int cmd,
4236 ulong_t *valp,
4237 cred_t *cr,
4238 caller_context_t *ct)
4240 int err;
4242 VOPXID_MAP_CR(vp, cr);
4244 if (vp->v_op->vop_pathconf == NULL)
4245 err = fs_pathconf(vp, cmd, valp, cr, ct);
4246 else
4247 err = vp->v_op->vop_pathconf(vp, cmd, valp, cr, ct);
4249 VOPSTATS_UPDATE(vp, pathconf);
4250 return (err);
4254 fop_pageio(
4255 vnode_t *vp,
4256 struct page *pp,
4257 uoff_t io_off,
4258 size_t io_len,
4259 int flags,
4260 cred_t *cr,
4261 caller_context_t *ct)
4263 int err;
4265 VOPXID_MAP_CR(vp, cr);
4267 if (vp->v_op->vop_pageio == NULL)
4268 err = ENOSYS;
4269 else
4270 err = vp->v_op->vop_pageio(vp, pp, io_off, io_len, flags,
4271 cr, ct);
4273 VOPSTATS_UPDATE(vp, pageio);
4274 return (err);
4278 fop_dumpctl(
4279 vnode_t *vp,
4280 int action,
4281 offset_t *blkp,
4282 caller_context_t *ct)
4284 int err;
4286 if (vp->v_op->vop_dumpctl == NULL)
4287 err = ENOSYS;
4288 else
4289 err = vp->v_op->vop_dumpctl(vp, action, blkp, ct);
4291 VOPSTATS_UPDATE(vp, dumpctl);
4292 return (err);
4295 void
4296 fop_dispose(
4297 vnode_t *vp,
4298 page_t *pp,
4299 int flag,
4300 int dn,
4301 cred_t *cr,
4302 caller_context_t *ct)
4304 /* Must do stats first since it's possible to lose the vnode */
4305 VOPSTATS_UPDATE(vp, dispose);
4307 VOPXID_MAP_CR(vp, cr);
4309 if (vp->v_op->vop_dispose == NULL)
4310 fs_dispose(vp, pp, flag, dn, cr, ct);
4311 else
4312 vp->v_op->vop_dispose(vp, pp, flag, dn, cr, ct);
4316 fop_setsecattr(
4317 vnode_t *vp,
4318 vsecattr_t *vsap,
4319 int flag,
4320 cred_t *cr,
4321 caller_context_t *ct)
4323 int err;
4325 VOPXID_MAP_CR(vp, cr);
4328 * We're only allowed to skip the ACL check iff we used a 32 bit
4329 * ACE mask with fop_access() to determine permissions.
4331 if ((flag & ATTR_NOACLCHECK) &&
4332 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4333 return (EINVAL);
4336 if (vp->v_op->vop_setsecattr == NULL)
4337 err = ENOSYS;
4338 else
4339 err = vp->v_op->vop_setsecattr(vp, vsap, flag, cr, ct);
4341 VOPSTATS_UPDATE(vp, setsecattr);
4342 return (err);
4346 fop_getsecattr(
4347 vnode_t *vp,
4348 vsecattr_t *vsap,
4349 int flag,
4350 cred_t *cr,
4351 caller_context_t *ct)
4353 int err;
4356 * We're only allowed to skip the ACL check iff we used a 32 bit
4357 * ACE mask with fop_access() to determine permissions.
4359 if ((flag & ATTR_NOACLCHECK) &&
4360 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4361 return (EINVAL);
4364 VOPXID_MAP_CR(vp, cr);
4366 if (vp->v_op->vop_getsecattr == NULL)
4367 err = fs_fab_acl(vp, vsap, flag, cr, ct);
4368 else
4369 err = vp->v_op->vop_getsecattr(vp, vsap, flag, cr, ct);
4371 VOPSTATS_UPDATE(vp, getsecattr);
4372 return (err);
4376 fop_shrlock(
4377 vnode_t *vp,
4378 int cmd,
4379 struct shrlock *shr,
4380 int flag,
4381 cred_t *cr,
4382 caller_context_t *ct)
4384 int err;
4386 VOPXID_MAP_CR(vp, cr);
4388 if (vp->v_op->vop_shrlock == NULL)
4389 err = fs_shrlock(vp, cmd, shr, flag, cr, ct);
4390 else
4391 err = vp->v_op->vop_shrlock(vp, cmd, shr, flag, cr, ct);
4393 VOPSTATS_UPDATE(vp, shrlock);
4394 return (err);
4398 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4399 caller_context_t *ct)
4401 int err;
4403 if (vp->v_op->vop_vnevent == NULL)
4404 err = ENOTSUP;
4405 else
4406 err = vp->v_op->vop_vnevent(vp, vnevent, dvp, fnm, ct);
4408 VOPSTATS_UPDATE(vp, vnevent);
4409 return (err);
4413 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4414 caller_context_t *ct)
4416 int err;
4418 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4419 return (ENOTSUP);
4421 if (vp->v_op->vop_reqzcbuf == NULL)
4422 err = ENOSYS;
4423 else
4424 err = vp->v_op->vop_reqzcbuf(vp, ioflag, uiop, cr, ct);
4426 VOPSTATS_UPDATE(vp, reqzcbuf);
4427 return (err);
4431 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4433 int err;
4435 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4436 return (ENOTSUP);
4438 if (vp->v_op->vop_retzcbuf == NULL)
4439 err = ENOSYS;
4440 else
4441 err = vp->v_op->vop_retzcbuf(vp, uiop, cr, ct);
4443 VOPSTATS_UPDATE(vp, retzcbuf);
4444 return (err);
4448 * Default destructor
4449 * Needed because NULL destructor means that the key is unused
4451 /* ARGSUSED */
4452 void
4453 vsd_defaultdestructor(void *value)
4457 * Create a key (index into per vnode array)
4458 * Locks out vsd_create, vsd_destroy, and vsd_free
4459 * May allocate memory with lock held
4461 void
4462 vsd_create(uint_t *keyp, void (*destructor)(void *))
4464 int i;
4465 uint_t nkeys;
4468 * if key is allocated, do nothing
4470 mutex_enter(&vsd_lock);
4471 if (*keyp) {
4472 mutex_exit(&vsd_lock);
4473 return;
4476 * find an unused key
4478 if (destructor == NULL)
4479 destructor = vsd_defaultdestructor;
4481 for (i = 0; i < vsd_nkeys; ++i)
4482 if (vsd_destructor[i] == NULL)
4483 break;
4486 * if no unused keys, increase the size of the destructor array
4488 if (i == vsd_nkeys) {
4489 if ((nkeys = (vsd_nkeys << 1)) == 0)
4490 nkeys = 1;
4491 vsd_destructor =
4492 (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4493 (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4494 (size_t)(nkeys * sizeof (void (*)(void *))));
4495 vsd_nkeys = nkeys;
4499 * allocate the next available unused key
4501 vsd_destructor[i] = destructor;
4502 *keyp = i + 1;
4504 /* create vsd_list, if it doesn't exist */
4505 if (vsd_list == NULL) {
4506 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4507 list_create(vsd_list, sizeof (struct vsd_node),
4508 offsetof(struct vsd_node, vs_nodes));
4511 mutex_exit(&vsd_lock);
4515 * Destroy a key
4517 * Assumes that the caller is preventing vsd_set and vsd_get
4518 * Locks out vsd_create, vsd_destroy, and vsd_free
4519 * May free memory with lock held
4521 void
4522 vsd_destroy(uint_t *keyp)
4524 uint_t key;
4525 struct vsd_node *vsd;
4528 * protect the key namespace and our destructor lists
4530 mutex_enter(&vsd_lock);
4531 key = *keyp;
4532 *keyp = 0;
4534 ASSERT(key <= vsd_nkeys);
4537 * if the key is valid
4539 if (key != 0) {
4540 uint_t k = key - 1;
4542 * for every vnode with VSD, call key's destructor
4544 for (vsd = list_head(vsd_list); vsd != NULL;
4545 vsd = list_next(vsd_list, vsd)) {
4547 * no VSD for key in this vnode
4549 if (key > vsd->vs_nkeys)
4550 continue;
4552 * call destructor for key
4554 if (vsd->vs_value[k] && vsd_destructor[k])
4555 (*vsd_destructor[k])(vsd->vs_value[k]);
4557 * reset value for key
4559 vsd->vs_value[k] = NULL;
4562 * actually free the key (NULL destructor == unused)
4564 vsd_destructor[k] = NULL;
4567 mutex_exit(&vsd_lock);
4571 * Quickly return the per vnode value that was stored with the specified key
4572 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4573 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4575 void *
4576 vsd_get(vnode_t *vp, uint_t key)
4578 struct vsd_node *vsd;
4580 ASSERT(vp != NULL);
4581 ASSERT(mutex_owned(&vp->v_vsd_lock));
4583 vsd = vp->v_vsd;
4585 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4586 return (vsd->vs_value[key - 1]);
4587 return (NULL);
4591 * Set a per vnode value indexed with the specified key
4592 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4595 vsd_set(vnode_t *vp, uint_t key, void *value)
4597 struct vsd_node *vsd;
4599 ASSERT(vp != NULL);
4600 ASSERT(mutex_owned(&vp->v_vsd_lock));
4602 if (key == 0)
4603 return (EINVAL);
4605 vsd = vp->v_vsd;
4606 if (vsd == NULL)
4607 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4610 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4611 * code won't happen and we will continue down and allocate space for
4612 * the vs_value array.
4613 * If the caller is replacing one value with another, then it is up
4614 * to the caller to free/rele/destroy the previous value (if needed).
4616 if (key <= vsd->vs_nkeys) {
4617 vsd->vs_value[key - 1] = value;
4618 return (0);
4621 ASSERT(key <= vsd_nkeys);
4623 if (vsd->vs_nkeys == 0) {
4624 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4626 * Link onto list of all VSD nodes.
4628 list_insert_head(vsd_list, vsd);
4629 mutex_exit(&vsd_lock);
4633 * Allocate vnode local storage and set the value for key
4635 vsd->vs_value = vsd_realloc(vsd->vs_value,
4636 vsd->vs_nkeys * sizeof (void *),
4637 key * sizeof (void *));
4638 vsd->vs_nkeys = key;
4639 vsd->vs_value[key - 1] = value;
4641 return (0);
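/*
 * Illustrative sketch (not part of the build): typical VSD usage.  A key
 * is created once, and per-vnode values are stored and fetched while
 * holding v_vsd_lock, as vsd_set()/vsd_get() require.  All example_*
 * names are hypothetical.
 */
#if 0
static uint_t example_vsd_key;

static void
example_vsd_dtor(void *value)
{
	kmem_free(value, sizeof (uint64_t));
}

static void
example_vsd_attach(vnode_t *vp)
{
	uint64_t *datap;

	vsd_create(&example_vsd_key, example_vsd_dtor);	/* idempotent */
	datap = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	mutex_enter(&vp->v_vsd_lock);
	(void) vsd_set(vp, example_vsd_key, datap);
	mutex_exit(&vp->v_vsd_lock);
}

static uint64_t *
example_vsd_lookup(vnode_t *vp)
{
	uint64_t *datap;

	mutex_enter(&vp->v_vsd_lock);
	datap = vsd_get(vp, example_vsd_key);
	mutex_exit(&vp->v_vsd_lock);
	return (datap);
}
#endif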
4645 * Called from vn_free() to run the destructor function for each vsd
4646 * Locks out vsd_create and vsd_destroy
4647 * Assumes that the destructor *DOES NOT* use vsd
4649 void
4650 vsd_free(vnode_t *vp)
4652 int i;
4653 struct vsd_node *vsd = vp->v_vsd;
4655 if (vsd == NULL)
4656 return;
4658 if (vsd->vs_nkeys == 0) {
4659 kmem_free(vsd, sizeof (*vsd));
4660 vp->v_vsd = NULL;
4661 return;
4665 * lock out vsd_create and vsd_destroy, call
4666 * the destructor, and mark the value as destroyed.
4668 mutex_enter(&vsd_lock);
4670 for (i = 0; i < vsd->vs_nkeys; i++) {
4671 if (vsd->vs_value[i] && vsd_destructor[i])
4672 (*vsd_destructor[i])(vsd->vs_value[i]);
4673 vsd->vs_value[i] = NULL;
4677 * remove from linked list of VSD nodes
4679 list_remove(vsd_list, vsd);
4681 mutex_exit(&vsd_lock);
4684 * free up the VSD
4686 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4687 kmem_free(vsd, sizeof (struct vsd_node));
4688 vp->v_vsd = NULL;
4692 * realloc
4694 static void *
4695 vsd_realloc(void *old, size_t osize, size_t nsize)
4697 void *new;
4699 new = kmem_zalloc(nsize, KM_SLEEP);
4700 if (old) {
4701 bcopy(old, new, osize);
4702 kmem_free(old, osize);
4704 return (new);
4708 * Setup the extensible system attribute for creating a reparse point.
4709 * The symlink data 'target' is validated for proper format of a reparse
4710 * string, and a check is also made to make sure the symlink data does not
4711 * point to an existing file.
4713 * Returns 0 if OK, else -1.
4715 static int
4716 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4718 xoptattr_t *xoap;
4720 if ((!target) || (!vap) || (!xvattr))
4721 return (-1);
4723 /* validate reparse string */
4724 if (reparse_validate((const char *)target))
4725 return (-1);
4727 xva_init(xvattr);
4728 xvattr->xva_vattr = *vap;
4729 xvattr->xva_vattr.va_mask |= AT_XVATTR;
4730 xoap = xva_getxoptattr(xvattr);
4731 ASSERT(xoap);
4732 XVA_SET_REQ(xvattr, XAT_REPARSE);
4733 xoap->xoa_reparse = 1;
4735 return (0);
4739 * Function to check whether a symlink is a reparse point.
4740 * Return B_TRUE if it is a reparse point, else return B_FALSE.
4742 boolean_t
4743 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4745 xvattr_t xvattr;
4746 xoptattr_t *xoap;
4748 if ((vp->v_type != VLNK) ||
4749 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4750 return (B_FALSE);
4752 xva_init(&xvattr);
4753 xoap = xva_getxoptattr(&xvattr);
4754 ASSERT(xoap);
4755 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4757 if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
4758 return (B_FALSE);
4760 if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4761 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4762 return (B_FALSE);
4764 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);