fs: start using vnode dispatch functions
kernel/fs/vnode.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/pathname.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/vnode_dispatch.h>
#include <sys/rwstlock.h>
#include <sys/fem.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/conf.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <c2/audit.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/fcntl.h>
#include <sys/fs_subr.h>
#include <sys/taskq.h>
#include <sys/fs_reparse.h>
/* Determine if this vnode is a file that is read-only */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;
/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 */
vopstats_t **vopstats_fstype;
/* vopstats initialization template used for fast initialization via bcopy() */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);
/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;
/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;
/*
 * forward declarations for internal vnode specific data (vsd)
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * forward declarations for reparse point functions
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
/*
 * VSD -- VNODE SPECIFIC DATA
 * The v_data pointer is typically used by a file system to store a
 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 * However, there are times when additional project private data needs
 * to be stored separately from the data (node) pointed to by v_data.
 * This additional data could be stored by the file system itself or
 * by a completely different kernel entity.  VSD provides a way for
 * callers to obtain a key and store a pointer to private data associated
 * with a vnode.
 *
 * Callers are responsible for protecting the vsd by holding v_vsd_lock
 * for calls to vsd_set() and vsd_get().
 */
/*
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	/* size of destructor array */
/* list of vsd_node's */
static list_t *vsd_list = NULL;
/* per-key destructor funcs */
static void		(**vsd_destructor)(void *);
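
/*
 * Illustrative sketch of the VSD interfaces described above: how a
 * hypothetical consumer might allocate a key and attach private data to a
 * vnode.  The consumer names below are made up for illustration;
 * vsd_create(), vsd_set() and vsd_get() are the real entry points defined
 * later in this file.  Guarded out of the build.
 */
#ifdef VNODE_C_EXAMPLES
static uint_t example_vsd_key;		/* hypothetical consumer's key */

static void
example_vsd_destroy(void *data)		/* runs when the vnode is destroyed */
{
	kmem_free(data, sizeof (uint64_t));
}

static void
example_vsd_usage(vnode_t *vp)
{
	uint64_t *cookie;

	vsd_create(&example_vsd_key, example_vsd_destroy);

	cookie = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);

	/* Callers must hold v_vsd_lock around vsd_set()/vsd_get(). */
	mutex_enter(&vp->v_vsd_lock);
	(void) vsd_set(vp, example_vsd_key, cookie);
	cookie = vsd_get(vp, example_vsd_key);
	mutex_exit(&vp->v_vsd_lock);
}
#endif	/* VNODE_C_EXAMPLES */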
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 */
#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
/*
 * If the filesystem does not support XIDs, map the credential.
 * If the vfsp is NULL, perhaps we should also map?
 */
#define	VOPXID_MAP_CR(vp, cr) {						\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
}
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
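
/*
 * Illustrative sketch: the two tables above back the IFTOVT()/VTTOIF()
 * macros from <sys/mode.h>, so a stat(2) format such as S_IFDIR maps to
 * VDIR and back.  The function below is a made-up example and is guarded
 * out of the build.
 */
#ifdef VNODE_C_EXAMPLES
static void
example_mode_vtype_conversion(void)
{
	mode_t mode = S_IFDIR | 0755;
	enum vtype vt = IFTOVT(mode);	/* indexes iftovt_tab[] => VDIR */

	ASSERT(vt == VDIR);
	ASSERT(VTTOIF(vt) == S_IFDIR);	/* indexes vttoif_tab[] */
}
#endif	/* VNODE_C_EXAMPLES */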
/*
 * The system vnode cache.
 */
kmem_cache_t *vn_cache;
/*
 * Vnode operations vector.
 */
static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p) fs_rwunlock,

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp,

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p) fs_dispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys,

	NULL, 0, NULL,
};
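
/*
 * Illustrative sketch of how a file system registers against a table like
 * the one above: it supplies an fs_operation_def_t template, and
 * vn_make_ops() (defined at the bottom of this file) fills in a
 * vnodeops_t, substituting the defaults from vn_ops_table (fs_nosys etc.)
 * for any op the template leaves out.  "myfs" and its functions are
 * hypothetical.  Guarded out of the build.
 */
#ifdef VNODE_C_EXAMPLES
extern int myfs_open(vnode_t **, int, cred_t *, caller_context_t *);
extern int myfs_close(vnode_t *, int, int, offset_t, cred_t *,
    caller_context_t *);

static const fs_operation_def_t myfs_vnodeops_template[] = {
	VOPNAME_OPEN,	{ .vop_open = myfs_open },
	VOPNAME_CLOSE,	{ .vop_close = myfs_close },
	NULL,		{ NULL }
};

static vnodeops_t *myfs_vnodeops;

static int
example_register_vnodeops(void)
{
	/* Ops missing from the template fall back to vn_ops_table. */
	return (vn_make_ops("myfs", myfs_vnodeops_template,
	    &myfs_vnodeops));
}
#endif	/* VNODE_C_EXAMPLES */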
/* Extensible attribute (xva) routines. */

/*
 * Zero out the structure, set the size of the requested/returned bitmaps,
 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 * to the returned attributes array.
 */
void
xva_init(xvattr_t *xvap)
{
	bzero(xvap, sizeof (xvattr_t));
	xvap->xva_mapsize = XVA_MAPSIZE;
	xvap->xva_magic = XVA_MAGIC;
	xvap->xva_vattr.va_mask = AT_XVATTR;
	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
}
/*
 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 * structure.  Otherwise, returns NULL.
 */
xoptattr_t *
xva_getxoptattr(xvattr_t *xvap)
{
	xoptattr_t *xoap = NULL;
	if (xvap->xva_vattr.va_mask & AT_XVATTR)
		xoap = &xvap->xva_xoptattrs;
	return (xoap);
}
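
/*
 * Illustrative sketch of the xvattr routines above: initialize the
 * structure, request one optional attribute via XVA_SET_REQ() (from
 * <sys/vnode.h>), and use xva_getxoptattr() to reach the results after a
 * fop_getattr() call.  The function name is made up; guarded out of the
 * build.
 */
#ifdef VNODE_C_EXAMPLES
static int
example_get_xoptattr(vnode_t *vp, cred_t *cr)
{
	xvattr_t xva;
	xoptattr_t *xoap;
	int error;

	xva_init(&xva);			/* sets XVA_MAPSIZE, XVA_MAGIC, ... */
	XVA_SET_REQ(&xva, XAT_READONLY);	/* ask for one optional attr */

	error = fop_getattr(vp, &xva.xva_vattr, 0, cr, NULL);
	if (error == 0 && (xoap = xva_getxoptattr(&xva)) != NULL) {
		/* xoap->xoa_readonly now holds the attribute, if returned */
		(void) xoap;
	}
	return (error);
}
#endif	/* VNODE_C_EXAMPLES */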
/*
 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 * kstat name.
 */
static int
vska_compar(const void *n1, const void *n2)
{
	int ret;
	ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
	ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;

	if (p1 < p2) {
		ret = -1;
	} else if (p1 > p2) {
		ret = 1;
	} else {
		ret = 0;
	}

	return (ret);
}
/*
 * Used to create a single template which will be bcopy()ed to a newly
 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 */
static vopstats_t *
create_vopstats_template()
{
	vopstats_t		*vsp;

	vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
	bzero(vsp, sizeof (*vsp));	/* Start fresh */

	/* fop_open */
	kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
	/* fop_close */
	kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
	/* fop_read I/O */
	kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
	/* fop_write I/O */
	kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
	/* fop_ioctl */
	kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
	/* fop_setfl */
	kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
	/* fop_getattr */
	kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
	/* fop_setattr */
	kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
	/* fop_access */
	kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
	/* fop_lookup */
	kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
	/* fop_create */
	kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
	/* fop_remove */
	kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
	/* fop_link */
	kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
	/* fop_rename */
	kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
	/* fop_mkdir */
	kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
	/* fop_rmdir */
	kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
	/* fop_readdir I/O */
	kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
	    KSTAT_DATA_UINT64);
	/* fop_symlink */
	kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
	/* fop_readlink */
	kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
	/* fop_fsync */
	kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
	/* fop_inactive */
	kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
	/* fop_fid */
	kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
	/* fop_rwlock */
	kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
	/* fop_rwunlock */
	kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
	/* fop_seek */
	kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
	/* fop_cmp */
	kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
	/* fop_frlock */
	kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
	/* fop_space */
	kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
	/* fop_realvp */
	kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
	/* fop_getpage */
	kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
	/* fop_putpage */
	kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
	/* fop_map */
	kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
	/* fop_addmap */
	kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
	/* fop_delmap */
	kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
	/* fop_poll */
	kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
	/* fop_dump */
	kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
	/* fop_pathconf */
	kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
	/* fop_pageio */
	kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
	/* fop_dumpctl */
	kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
	/* fop_dispose */
	kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
	/* fop_setsecattr */
	kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
	/* fop_getsecattr */
	kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
	/* fop_shrlock */
	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
	/* fop_vnevent */
	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
	/* fop_reqzcbuf */
	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
	/* fop_retzcbuf */
	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);

	return (vsp);
}
/*
 * Creates a kstat structure associated with a vopstats structure.
 */
kstat_t *
new_vskstat(char *ksname, vopstats_t *vsp)
{
	kstat_t		*ksp;

	if (!vopstats_enabled) {
		return (NULL);
	}

	ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
	    sizeof (vopstats_t)/sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
	if (ksp) {
		ksp->ks_data = vsp;
		kstat_install(ksp);
	}

	return (ksp);
}
/*
 * Called from vfsinit() to initialize the support mechanisms for vopstats
 */
void
vopstats_startup()
{
	if (!vopstats_enabled)
		return;

	/*
	 * Creates the AVL tree which holds per-vfs vopstat anchors.  This
	 * is necessary since we need to check if a kstat exists before we
	 * attempt to create it.  Also, initialize its lock.
	 */
	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
	    offsetof(vsk_anchor_t, vsk_node));
	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);

	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
	    NULL, NULL, 0);

	/*
	 * Set up the array of pointers for the vopstats-by-FS-type.
	 * The entries will be allocated/initialized as each file system
	 * goes through modload/mod_installfs.
	 */
	vopstats_fstype = (vopstats_t **)kmem_zalloc(
	    (sizeof (vopstats_t *) * nfstype), KM_SLEEP);

	/* Set up the global vopstats initialization template */
	vs_templatep = create_vopstats_template();
}
/*
 * We need to have all of the counters zeroed.
 * The initialization of the vopstats_t includes on the order of
 * 50 calls to kstat_named_init().  Rather than do that on every call,
 * we do it once in a template (vs_templatep) then bcopy it over.
 */
void
initialize_vopstats(vopstats_t *vsp)
{
	if (vsp == NULL)
		return;

	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
}
/*
 * If possible, determine which vopstats by fstype to use and
 * return a pointer to the caller.
 */
vopstats_t *
get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
{
	int		fstype = 0;	/* Index into vfssw[] */
	vopstats_t	*vsp = NULL;

	if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
	    !vopstats_enabled)
		return (NULL);
	/*
	 * Set up the fstype.  We go to so much trouble because all versions
	 * of NFS use the same fstype in their vfs even though they have
	 * distinct entries in the vfssw[] table.
	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
	 */
	if (vswp) {
		fstype = vswp - vfssw;	/* Gets us the index */
	} else {
		fstype = vfsp->vfs_fstype;
	}

	/*
	 * Point to the per-fstype vopstats. The only valid values are
	 * non-zero positive values less than the number of vfssw[] table
	 * entries.
	 */
	if (fstype > 0 && fstype < nfstype) {
		vsp = vopstats_fstype[fstype];
	}

	return (vsp);
}
/*
 * Generate a kstat name, create the kstat structure, and allocate a
 * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 * to the caller.  This must only be called from a mount.
 */
vsk_anchor_t *
get_vskstat_anchor(vfs_t *vfsp)
{
	char kstatstr[KSTAT_STRLEN];	/* kstat name for vopstats */
	statvfs64_t statvfsbuf;		/* Needed to find f_fsid */
	vsk_anchor_t *vskp = NULL;	/* vfs <--> kstat anchor */
	kstat_t *ksp;			/* Ptr to new kstat */
	avl_index_t where;		/* Location in the AVL tree */

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return (NULL);

	/* Need to get the fsid to build a kstat name */
	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
		/* Create a name for our kstats based on fsid */
		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
		    VOPSTATS_STR, statvfsbuf.f_fsid);

		/* Allocate and initialize the vsk_anchor_t */
		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
		bzero(vskp, sizeof (*vskp));
		vskp->vsk_fsid = statvfsbuf.f_fsid;

		mutex_enter(&vskstat_tree_lock);
		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
			avl_insert(&vskstat_tree, vskp, where);
			mutex_exit(&vskstat_tree_lock);

			/*
			 * Now that we've got the anchor in the AVL
			 * tree, we can create the kstat.
			 */
			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
			if (ksp) {
				vskp->vsk_ksp = ksp;
			}
		} else {
			/* Oops, found one! Release memory and lock. */
			mutex_exit(&vskstat_tree_lock);
			kmem_cache_free(vsk_anchor_cache, vskp);
			vskp = NULL;
		}
	}
	return (vskp);
}
/*
 * We're in the process of tearing down the vfs and need to cleanup
 * the data structures associated with the vopstats.  Must only be called
 * from dounmount().
 */
void
teardown_vopstats(vfs_t *vfsp)
{
	vsk_anchor_t	*vskap;
	avl_index_t	where;

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return;

	/* This is a safe check since VFS_STATS must be set (see above) */
	if ((vskap = vfsp->vfs_vskap) == NULL)
		return;

	/* Whack the pointer right away */
	vfsp->vfs_vskap = NULL;

	/* Lock the tree, remove the node, and delete the kstat */
	mutex_enter(&vskstat_tree_lock);
	if (avl_find(&vskstat_tree, vskap, &where)) {
		avl_remove(&vskstat_tree, vskap);
	}

	if (vskap->vsk_ksp) {
		kstat_delete(vskap->vsk_ksp);
	}
	mutex_exit(&vskstat_tree_lock);

	kmem_cache_free(vsk_anchor_cache, vskap);
}
/*
 * Read or write a vnode.  Called from kernel code.
 */
int
vn_rdwr(
	enum uio_rw rw,
	struct vnode *vp,
	caddr_t base,
	ssize_t len,
	offset_t offset,
	enum uio_seg seg,
	int ioflag,
	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp)
{
	struct uio uio;
	struct iovec iov;
	int error;
	int in_crit = 0;

	if (rw == UIO_WRITE && ISROFILE(vp))
		return (EROFS);

	if (len < 0)
		return (EIO);

	VOPXID_MAP_CR(vp, cr);

	iov.iov_base = base;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = offset;
	uio.uio_segflg = (short)seg;
	uio.uio_resid = len;
	uio.uio_llimit = ulimit;

	/*
	 * We have to enter the critical region before calling fop_rwlock
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, cr, &svmand);
		if (error != 0)
			goto done;
		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	(void) fop_rwlock(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (rw == UIO_WRITE) {
		uio.uio_fmode = FWRITE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		error = fop_write(vp, &uio, ioflag, cr, NULL);
	} else {
		uio.uio_fmode = FREAD;
		uio.uio_extflg = UIO_COPY_CACHED;
		error = fop_read(vp, &uio, ioflag, cr, NULL);
	}
	fop_rwunlock(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);

	if (residp)
		*residp = uio.uio_resid;
	else if (uio.uio_resid)
		error = EIO;

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
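
/*
 * Illustrative sketch of a typical kernel caller of vn_rdwr() above: read
 * the first bytes of an already-held vnode into a kernel buffer.
 * UIO_SYSSPACE marks the buffer as kernel memory, and resid reports how
 * many bytes were *not* transferred.  The function name is made up;
 * guarded out of the build.
 */
#ifdef VNODE_C_EXAMPLES
static int
example_read_header(vnode_t *vp, char *buf, ssize_t buflen)
{
	ssize_t resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, buflen, 0,
	    UIO_SYSSPACE, 0, RLIM64_INFINITY, CRED(), &resid);

	/* On success, buflen - resid bytes were read into buf. */
	return (error);
}
#endif	/* VNODE_C_EXAMPLES */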
/*
 * Release a vnode.  Call fop_inactive on last reference or
 * decrement reference count.
 *
 * To avoid race conditions, the v_count is left at 1 for
 * the call to fop_inactive. This prevents another thread
 * from reclaiming and releasing the vnode *before* the
 * fop_inactive routine has a chance to destroy the vnode.
 * We can't have more than 1 thread calling fop_inactive
 * on a vnode.
 */
void
vn_rele(vnode_t *vp)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		fop_inactive(vp, CRED(), NULL);
		return;
	}
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}
/*
 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 * as a single reference, so v_count is not decremented until the last DNLC hold
 * is released. This makes it possible to distinguish vnodes that are referenced
 * only by the DNLC.
 */
void
vn_rele_dnlc(vnode_t *vp)
{
	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
	mutex_enter(&vp->v_lock);
	if (--vp->v_count_dnlc == 0) {
		if (vp->v_count == 1) {
			mutex_exit(&vp->v_lock);
			fop_inactive(vp, CRED(), NULL);
			return;
		}
		VN_RELE_LOCKED(vp);
	}
	mutex_exit(&vp->v_lock);
}
/*
 * Like vn_rele() except that it clears v_stream under v_lock.
 * This is used by sockfs when it dismantles the association between
 * the sockfs node and the vnode in the underlying file system.
 * v_lock has to be held to prevent a thread coming through the lookupname
 * path from accessing a stream head that is going away.
 */
void
vn_rele_stream(vnode_t *vp)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	vp->v_stream = NULL;
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		fop_inactive(vp, CRED(), NULL);
		return;
	}
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}
static void
vn_rele_inactive(vnode_t *vp)
{
	fop_inactive(vp, CRED(), NULL);
}
/*
 * Like vn_rele() except if we are going to call fop_inactive() then do it
 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 * the file system as a result of releasing the vnode. Note, file systems
 * already have to handle the race where the vnode is incremented before the
 * inactive routine is called and does its locking.
 *
 * Warning: Excessive use of this routine can lead to performance problems.
 * This is because taskqs throttle back allocation if too many are created.
 */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != (uintptr_t)NULL);
		return;
	}
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}
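
/*
 * Illustrative sketch of vn_rele_async() above: releases that might
 * re-enter the file system are pushed onto a taskq instead of calling
 * fop_inactive() in the current context.  The taskq name and parameters
 * below are made-up examples.  Guarded out of the build.
 */
#ifdef VNODE_C_EXAMPLES
static taskq_t *example_release_taskq;

static void
example_async_release(vnode_t *vp)
{
	if (example_release_taskq == NULL)
		example_release_taskq = taskq_create("example_vn_rele",
		    1, minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE);

	/* The last hold, if this is it, is dropped from taskq context. */
	vn_rele_async(vp, example_release_taskq);
}
#endif	/* VNODE_C_EXAMPLES */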
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}
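
/*
 * Illustrative sketch of vn_open()/vn_openat() usage from kernel code:
 * open a file by pathname, then release it with fop_close() and VN_RELE(),
 * much as vn_openat() itself does on its error path.  The pathname and
 * function name are placeholders; guarded out of the build.
 */
#ifdef VNODE_C_EXAMPLES
static int
example_open_close(void)
{
	vnode_t *vp;
	int error;

	error = vn_open("/path/to/file", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0);
	if (error != 0)
		return (error);

	/* ... vn_rdwr(UIO_READ, vp, ...) ... */

	(void) fop_close(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
	return (0);
}
#endif	/* VNODE_C_EXAMPLES */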
/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 */
int
vn_openat(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask,
	struct vnode *startvp,
	int fd)
{
	struct vnode *vp;
	int mode;
	int accessflags;
	int error;
	int in_crit = 0;
	int open_done = 0;
	int shrlock_done = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	struct shrlock shr;
	struct shr_locowner shr_own;

	if (filemode & FSEARCH)
		filemode |= FDIRECTORY;

	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

top:
	if (filemode & FCREAT && !(filemode & FDIRECTORY)) {
		enum vcexcl excl;

		/* Wish to create a file. */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			excl = EXCL;
		else
			excl = NONEXCL;

		if (error =
		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
			return (error);
	} else {
		/* Wish to open a file.  Just look it up. */
		if (error = lookupnameat(pnamep, seg, follow,
		    NULLVPP, &vp, startvp)) {
			if ((error == ESTALE) &&
			    fs_need_estale_retry(estale_retry++))
				goto top;
			return (error);
		}

		/*
		 * Get the attributes to check whether file is large.
		 * We do this only if the FOFFMAX flag is not set and
		 * only for regular files.
		 */

		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = fop_getattr(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if (vattr.va_size > (uoff_t)MAXOFF32_T) {
				/*
				 * Large File API - regular open fails
				 * if FOFFMAX flag is set in file mode
				 */
				error = EOVERFLOW;
				goto out;
			}
		}
		/*
		 * Can't write directories, active texts, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			/*
			 * Allow writable directory if VDIROPEN flag is set.
			 */
			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
				error = EISDIR;
				goto out;
			}
			if (ISROFILE(vp)) {
				error = EROFS;
				goto out;
			}
			/*
			 * Can't truncate files on which
			 * sysv mandatory locking is in effect.
			 */
			if (filemode & FTRUNC) {
				vnode_t *rvp;

				if (fop_realvp(vp, &rvp, NULL) != 0)
					rvp = vp;
				if (rvp->v_filocks != NULL) {
					vattr.va_mask = AT_MODE;
					if ((error = fop_getattr(vp,
					    &vattr, 0, CRED(), NULL)) == 0 &&
					    MANDLOCK(vp, vattr.va_mode))
						error = EAGAIN;
				}
			}
			if (error)
				goto out;
		}
		/*
		 * Check permissions.
		 */
		if (error = fop_access(vp, mode, accessflags, CRED(), NULL))
			goto out;
		/*
		 * Require FDIRECTORY to return a directory.
		 * Require FEXEC to return a regular file.
		 */
		if ((filemode & FDIRECTORY) && vp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		}
		if ((filemode & FEXEC) && vp->v_type != VREG) {
			error = ENOEXEC;	/* XXX: error code? */
			goto out;
		}
	}

	/*
	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
	 */
	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
		error = ELOOP;
		goto out;
	}
	if (filemode & FNOLINKS) {
		vattr.va_mask = AT_NLINK;
		if ((error = fop_getattr(vp, &vattr, 0, CRED(), NULL))) {
			goto out;
		}
		if (vattr.va_nlink != 1) {
			error = EMLINK;
			goto out;
		}
	}

	/*
	 * Opening a socket corresponding to the AF_UNIX pathname
	 * in the filesystem name space is not supported.
	 * However, VSOCK nodes in namefs are supported in order
	 * to make fattach work for sockets.
	 *
	 * XXX This uses fop_realvp to distinguish between
	 * an unopened namefs node (where fop_realvp returns a
	 * different VSOCK vnode) and a VSOCK created by vn_create
	 * in some file system (where fop_realvp would never return
	 * a different vnode).
	 */
	if (vp->v_type == VSOCK) {
		struct vnode *nvp;

		error = fop_realvp(vp, &nvp, NULL);
		if (error != 0 || nvp == NULL || nvp == vp ||
		    nvp->v_type != VSOCK) {
			error = EOPNOTSUPP;
			goto out;
		}
	}

	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
		/* get share reservation */
		shr.s_access = 0;
		if (filemode & FWRITE)
			shr.s_access |= F_WRACC;
		if (filemode & FREAD)
			shr.s_access |= F_RDACC;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = ttoproc(curthread)->p_pid;
		shr_own.sl_pid = shr.s_pid;
		shr_own.sl_id = fd;
		shr.s_own_len = sizeof (shr_own);
		shr.s_owner = (caddr_t)&shr_own;
		error = fop_shrlock(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
		    NULL);
		if (error)
			goto out;
		shrlock_done = 1;

		/* nbmand conflict check if truncating file */
		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;

			vattr.va_mask = AT_SIZE;
			if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
				goto out;
			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
			    NULL)) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Do opening protocol.
	 */
	error = fop_open(&vp, filemode, CRED(), NULL);
	if (error)
		goto out;
	open_done = 1;

	/*
	 * Truncate if required.
	 */
	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
		vattr.va_size = 0;
		vattr.va_mask = AT_SIZE;
		if ((error = fop_setattr(vp, &vattr, 0, CRED(), NULL)) != 0)
			goto out;
	}
out:
	ASSERT(vp->v_count > 0);

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (error) {
		if (open_done) {
			(void) fop_close(vp, filemode, 1, (offset_t)0, CRED(),
			    NULL);
			open_done = 0;
			shrlock_done = 0;
		}
		if (shrlock_done) {
			(void) fop_shrlock(vp, F_UNSHARE, &shr, 0, CRED(),
			    NULL);
			shrlock_done = 0;
		}

		/*
		 * The following clause was added to handle a problem
		 * with NFS consistency.  It is possible that a lookup
		 * of the file to be opened succeeded, but the file
		 * itself doesn't actually exist on the server.  This
		 * is chiefly due to the DNLC containing an entry for
		 * the file which has been removed on the server.  In
		 * this case, we just start over.  If there was some
		 * other cause for the ESTALE error, then the lookup
		 * of the file will fail and the error will be returned
		 * above instead of looping around from here.
		 */
		VN_RELE(vp);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
	} else
		*vpp = vp;
	return (error);
}
/*
 * The following two accessor functions are for the NFSv4 server.  Since there
 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
 * vnode open counts correct when a client "upgrades" an open or does an
 * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
 * open mode (add or subtract read or write), but also change the share/deny
 * modes.  However, share reservations are not integrated with OPEN, yet, so
 * we need to handle each separately.  These functions are cleaner than having
 * the NFS server manipulate the counts directly, however, nobody else should
 * use these functions.
 */
void
vn_open_upgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD)
		atomic_inc_32(&vp->v_rdcnt);
	if (filemode & FWRITE)
		atomic_inc_32(&vp->v_wrcnt);
}
void
vn_open_downgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD) {
		ASSERT(vp->v_rdcnt > 0);
		atomic_dec_32(&vp->v_rdcnt);
	}
	if (filemode & FWRITE) {
		ASSERT(vp->v_wrcnt > 0);
		atomic_dec_32(&vp->v_wrcnt);
	}
}
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}
/*
 * Create a vnode (makenode).
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	if (auditing)
		audit_vncreate_start();
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {
		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = fop_getsecattr(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * fop_getsecattr() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *	it is being open with write access &&
	 *	the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (fop_realvp(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = AT_MODE|AT_SIZE;
			if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & AT_SIZE) && in_crit) {
				uoff_t offset;
				ssize_t length;

				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply fop_create to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The call to fop_create() is necessary to ensure
		 * that the appropriate permission checks are made,
		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
		 * exists since we are in the else condition where this
		 * was checked.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = fop_create(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created a
			 * new reference on a new vnode (*vpp) in the child
			 * file system, so we want to drop our reference on
			 * the old (vp) upon exit.
			 */
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = fop_getattr(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (uoff_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to fop_mkdir().  fop_create()
			 * will already get it via "flag"
			 */
			error = fop_mkdir(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = fop_create(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (auditing)
		audit_vncreate_finish(*vpp, error);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;

	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = fop_getattr(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = fop_getattr(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = fop_link(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = fop_getattr(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = fop_getattr(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = fop_rename(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Remove a file or directory.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling fop_remove.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = fop_rmdir(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = fop_remove(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Utility function to compare equality of vnodes.
 * Compare the underlying real vnodes, if there are underlying vnodes.
 * This is a more thorough comparison than the VN_CMP() macro provides.
 */
int
vn_compare(vnode_t *vp1, vnode_t *vp2)
{
	vnode_t *realvp;

	if (vp1 != NULL && fop_realvp(vp1, &realvp, NULL) == 0)
		vp1 = realvp;
	if (vp2 != NULL && fop_realvp(vp2, &realvp, NULL) == 0)
		vp2 = realvp;
	return (VN_CMP(vp1, vp2));
}
/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 */
#define	NUM_BUCKETS	1023

struct vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1.
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct	vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
/*
 * vn_vfslocks_getlock() uses a hash scheme to generate an
 * rwstlock from the vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * hash table which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed at a later
 * stage when the refcount drops to zero.
 */

vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (bp->vb_list == vep)
					bp->vb_list = vep->ve_next;
				else {
					/* LINTED */
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2109 * vn_vfswlock_wait is used to implement a lock which is logically a writers
2110 * lock protecting the v_vfsmountedhere field.
2111 * vn_vfswlock_wait is similar to vn_vfswlock,
2112 * except that it blocks to acquire the lock VVFSLOCK.
2114 * traverse() and routines re-implementing part of traverse (e.g. autofs)
2115 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2116 * need the non-blocking version of the writers lock, i.e. vn_vfswlock.
2119 vn_vfswlock_wait(vnode_t *vp)
2121 int retval;
2122 vn_vfslocks_entry_t *vpvfsentry;
2123 ASSERT(vp != NULL);
2125 vpvfsentry = vn_vfslocks_getlock(vp);
2126 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2128 if (retval == EINTR) {
2129 vn_vfslocks_rele(vpvfsentry);
2130 return (EINTR);
2132 return (retval);
2136 vn_vfsrlock_wait(vnode_t *vp)
2138 int retval;
2139 vn_vfslocks_entry_t *vpvfsentry;
2140 ASSERT(vp != NULL);
2142 vpvfsentry = vn_vfslocks_getlock(vp);
2143 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2145 if (retval == EINTR) {
2146 vn_vfslocks_rele(vpvfsentry);
2147 return (EINTR);
2150 return (retval);
2155 * vn_vfswlock is used to implement a lock which is logically a writers lock
2156 * protecting the v_vfsmountedhere field.
2159 vn_vfswlock(vnode_t *vp)
2161 vn_vfslocks_entry_t *vpvfsentry;
2164 * If vp is NULL then somebody is trying to lock the covered vnode
2165 * of /. (vfs_vnodecovered is NULL for /). This situation will
2166 * only happen when unmounting /. Since that operation will fail
2167 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2169 if (vp == NULL)
2170 return (EBUSY);
2172 vpvfsentry = vn_vfslocks_getlock(vp);
2174 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2175 return (0);
2177 vn_vfslocks_rele(vpvfsentry);
2178 return (EBUSY);
2182 vn_vfsrlock(vnode_t *vp)
2184 vn_vfslocks_entry_t *vpvfsentry;
2187 * If vp is NULL then somebody is trying to lock the covered vnode
2188 * of /. (vfs_vnodecovered is NULL for /). This situation will
2189 * only happen when unmounting /. Since that operation will fail
2190 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2192 if (vp == NULL)
2193 return (EBUSY);
2195 vpvfsentry = vn_vfslocks_getlock(vp);
2197 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2198 return (0);
2200 vn_vfslocks_rele(vpvfsentry);
2201 return (EBUSY);
2204 void
2205 vn_vfsunlock(vnode_t *vp)
2207 vn_vfslocks_entry_t *vpvfsentry;
2210 * ve_refcnt needs to be decremented twice:
2211 * 1. To release the reference taken by the call to vn_vfslocks_getlock()
2212 * 2. To release the reference taken by the locking routines like
2213 * vn_vfsrlock/vn_vfswlock etc.
2215 vpvfsentry = vn_vfslocks_getlock(vp);
2216 vn_vfslocks_rele(vpvfsentry);
2218 rwst_exit(&vpvfsentry->ve_lock);
2219 vn_vfslocks_rele(vpvfsentry);
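/*
 * Example (illustrative sketch, not part of this file): the blocking
 * reader/writer routines above are used in matched pairs.  Note that
 * vn_vfsunlock() performs the second vn_vfslocks_rele(), balancing the
 * reference taken when the lock was acquired.  The function name is
 * hypothetical.
 */
#if 0	/* usage sketch only */
static int
example_read_locked(vnode_t *vp)
{
	int error;

	if ((error = vn_vfsrlock_wait(vp)) != 0)
		return (error);		/* interrupted by a signal */
	/* ... v_vfsmountedhere is stable while the lock is held ... */
	vn_vfsunlock(vp);
	return (0);
}
#endif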
2223 vn_vfswlock_held(vnode_t *vp)
2225 int held;
2226 vn_vfslocks_entry_t *vpvfsentry;
2228 ASSERT(vp != NULL);
2230 vpvfsentry = vn_vfslocks_getlock(vp);
2231 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2233 vn_vfslocks_rele(vpvfsentry);
2234 return (held);
2239 vn_make_ops(
2240 const char *name, /* Name of file system */
2241 const fs_operation_def_t *templ, /* Operation specification */
2242 vnodeops_t **actual) /* Return the vnodeops */
2244 int unused_ops;
2245 int error;
2247 *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2249 (*actual)->vnop_name = name;
2251 error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2252 if (error) {
2253 kmem_free(*actual, sizeof (vnodeops_t));
2256 #if DEBUG
2257 if (unused_ops != 0)
2258 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2259 "but not used", name, unused_ops);
2260 #endif
2262 return (error);
2266 * Free the vnodeops created as a result of vn_make_ops()
2268 void
2269 vn_freevnodeops(vnodeops_t *vnops)
2271 kmem_free(vnops, sizeof (vnodeops_t));
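/*
 * Example (illustrative sketch, not part of this file): a file system
 * typically builds its vnodeops from an fs_operation_def_t template at
 * module init time and tears them down at fini time.  "myfs" and its
 * functions are hypothetical, and the template entry syntax follows the
 * usual illumos convention; the exact set of union members may differ
 * in this tree.
 */
#if 0	/* usage sketch only */
static vnodeops_t *myfs_vnodeops;

static const fs_operation_def_t myfs_vnodeops_template[] = {
	{ VOPNAME_OPEN,		{ .vop_open = myfs_open } },
	{ VOPNAME_CLOSE,	{ .vop_close = myfs_close } },
	{ VOPNAME_READ,		{ .vop_read = myfs_read } },
	{ NULL,			{ NULL } }
};

static int
myfs_init_vnodeops(void)
{
	/* vn_make_ops() warns (in DEBUG kernels) about unused entries */
	return (vn_make_ops("myfs", myfs_vnodeops_template, &myfs_vnodeops));
}

static void
myfs_fini_vnodeops(void)
{
	vn_freevnodeops(myfs_vnodeops);
}
#endif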
2275 * Vnode cache.
2278 /* ARGSUSED */
2279 static int
2280 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2282 struct vnode *vp;
2284 vp = buf;
2286 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2287 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2288 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2289 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2290 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2291 vp->v_path = NULL;
2292 vp->v_mpssdata = NULL;
2293 vp->v_vsd = NULL;
2294 vp->v_fopdata = NULL;
2296 vmobject_init(&vp->v_object, vp);
2298 return (0);
2301 /* ARGSUSED */
2302 static void
2303 vn_cache_destructor(void *buf, void *cdrarg)
2305 struct vnode *vp;
2307 vp = buf;
2309 vmobject_fini(&vp->v_object);
2311 rw_destroy(&vp->v_nbllock);
2312 cv_destroy(&vp->v_cv);
2313 mutex_destroy(&vp->v_vsd_lock);
2314 mutex_destroy(&vp->v_lock);
2317 void
2318 vn_create_cache(void)
2320 /* LINTED */
2321 ASSERT((1 << VNODE_ALIGN_LOG2) ==
2322 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2323 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2324 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2325 NULL, 0);
2328 void
2329 vn_destroy_cache(void)
2331 kmem_cache_destroy(vn_cache);
2335 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2336 * cached by the file system and vnodes remain associated.
2338 void
2339 vn_recycle(vnode_t *vp)
2341 ASSERT(!vn_has_cached_data(vp));
2344 * XXX - This really belongs in vn_reinit(), but we have some issues
2345 * with the counts. Best to have it here for clean initialization.
2347 vp->v_rdcnt = 0;
2348 vp->v_wrcnt = 0;
2349 vp->v_mmap_read = 0;
2350 vp->v_mmap_write = 0;
2353 * If FEM was in use, make sure everything gets cleaned up
2354 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2355 * constructor.
2357 if (vp->v_femhead) {
2358 /* XXX - There should be a free_femhead() that does all this */
2359 ASSERT(vp->v_femhead->femh_list == NULL);
2360 mutex_destroy(&vp->v_femhead->femh_lock);
2361 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2362 vp->v_femhead = NULL;
2364 if (vp->v_path) {
2365 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2366 vp->v_path = NULL;
2369 if (vp->v_fopdata != NULL) {
2370 free_fopdata(vp);
2372 vp->v_mpssdata = NULL;
2373 vsd_free(vp);
2377 * Used to reset the vnode fields including those that are directly accessible
2378 * as well as those which require an accessor function.
2380 * Does not initialize:
2381 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2382 * v_data (since FS-nodes and vnodes point to each other and should
2383 * be updated simultaneously)
2384 * v_op (in case someone needs to make a VOP call on this object)
2386 void
2387 vn_reinit(vnode_t *vp)
2389 vp->v_count = 1;
2390 vp->v_count_dnlc = 0;
2391 vp->v_vfsp = NULL;
2392 vp->v_stream = NULL;
2393 vp->v_vfsmountedhere = NULL;
2394 vp->v_flag = 0;
2395 vp->v_type = VNON;
2396 vp->v_rdev = NODEV;
2398 vp->v_filocks = NULL;
2399 vp->v_shrlocks = NULL;
2400 VERIFY(!vn_has_cached_data(vp));
2402 vp->v_locality = NULL;
2403 vp->v_xattrdir = NULL;
2405 /* Handles v_femhead, v_path, and the r/w/map counts */
2406 vn_recycle(vp);
2409 vnode_t *
2410 vn_alloc(int kmflag)
2412 vnode_t *vp;
2414 vp = kmem_cache_alloc(vn_cache, kmflag);
2416 if (vp != NULL) {
2417 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2418 vp->v_fopdata = NULL;
2419 vn_reinit(vp);
2422 return (vp);
2425 void
2426 vn_free(vnode_t *vp)
2428 ASSERT(vp->v_shrlocks == NULL);
2429 ASSERT(vp->v_filocks == NULL);
2432 * Some file systems call vn_free() with v_count of zero,
2433 * some with v_count of 1. In any case, the value should
2434 * never be anything else.
2436 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2437 ASSERT(vp->v_count_dnlc == 0);
2438 if (vp->v_path != NULL) {
2439 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2440 vp->v_path = NULL;
2443 /* If FEM was in use, make sure everything gets cleaned up */
2444 if (vp->v_femhead) {
2445 /* XXX - There should be a free_femhead() that does all this */
2446 ASSERT(vp->v_femhead->femh_list == NULL);
2447 mutex_destroy(&vp->v_femhead->femh_lock);
2448 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2449 vp->v_femhead = NULL;
2452 if (vp->v_fopdata != NULL) {
2453 free_fopdata(vp);
2455 vp->v_mpssdata = NULL;
2456 vsd_free(vp);
2457 kmem_cache_free(vn_cache, vp);
2461 * Vnode status changes; we should define better states than 1 and 0.
2463 void
2464 vn_reclaim(vnode_t *vp)
2466 vfs_t *vfsp = vp->v_vfsp;
2468 if (vfsp == NULL ||
2469 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2470 return;
2472 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2475 void
2476 vn_idle(vnode_t *vp)
2478 vfs_t *vfsp = vp->v_vfsp;
2480 if (vfsp == NULL ||
2481 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2482 return;
2484 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2486 void
2487 vn_exists(vnode_t *vp)
2489 vfs_t *vfsp = vp->v_vfsp;
2491 if (vfsp == NULL ||
2492 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2493 return;
2495 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2498 void
2499 vn_invalid(vnode_t *vp)
2501 vfs_t *vfsp = vp->v_vfsp;
2503 if (vfsp == NULL ||
2504 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2505 return;
2507 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2510 /* Vnode event notification */
2513 vnevent_support(vnode_t *vp, caller_context_t *ct)
2515 if (vp == NULL)
2516 return (EINVAL);
2518 return (fop_vnevent(vp, VE_SUPPORT, NULL, NULL, ct));
2521 void
2522 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2524 if (vp == NULL || vp->v_femhead == NULL) {
2525 return;
2527 (void) fop_vnevent(vp, VE_RENAME_SRC, dvp, name, ct);
2530 void
2531 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2532 caller_context_t *ct)
2534 if (vp == NULL || vp->v_femhead == NULL) {
2535 return;
2537 (void) fop_vnevent(vp, VE_RENAME_DEST, dvp, name, ct);
2540 void
2541 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2543 if (vp == NULL || vp->v_femhead == NULL) {
2544 return;
2546 (void) fop_vnevent(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2549 void
2550 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2552 if (vp == NULL || vp->v_femhead == NULL) {
2553 return;
2555 (void) fop_vnevent(vp, VE_REMOVE, dvp, name, ct);
2558 void
2559 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2561 if (vp == NULL || vp->v_femhead == NULL) {
2562 return;
2564 (void) fop_vnevent(vp, VE_RMDIR, dvp, name, ct);
2567 void
2568 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2569 caller_context_t *ct)
2571 if (vp == NULL || vp->v_femhead == NULL) {
2572 return;
2574 (void) fop_vnevent(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2577 void
2578 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2579 caller_context_t *ct)
2581 if (vp == NULL || vp->v_femhead == NULL) {
2582 return;
2584 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2587 void
2588 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2589 caller_context_t *ct)
2591 if (vp == NULL || vp->v_femhead == NULL) {
2592 return;
2594 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2597 void
2598 vnevent_create(vnode_t *vp, caller_context_t *ct)
2600 if (vp == NULL || vp->v_femhead == NULL) {
2601 return;
2603 (void) fop_vnevent(vp, VE_CREATE, NULL, NULL, ct);
2606 void
2607 vnevent_link(vnode_t *vp, caller_context_t *ct)
2609 if (vp == NULL || vp->v_femhead == NULL) {
2610 return;
2612 (void) fop_vnevent(vp, VE_LINK, NULL, NULL, ct);
2615 void
2616 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2618 if (vp == NULL || vp->v_femhead == NULL) {
2619 return;
2621 (void) fop_vnevent(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2624 void
2625 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2627 if (vp == NULL || vp->v_femhead == NULL) {
2628 return;
2630 (void) fop_vnevent(vp, VE_TRUNCATE, NULL, NULL, ct);
2634 * Vnode accessors.
2638 vn_is_readonly(vnode_t *vp)
2640 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2644 vn_has_flocks(vnode_t *vp)
2646 return (vp->v_filocks != NULL);
2650 vn_has_mandatory_locks(vnode_t *vp, int mode)
2652 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2656 vn_has_cached_data(vnode_t *vp)
2658 return (!list_is_empty(&vp->v_object.list));
2662 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2663 * zone_enter(2).
2666 vn_can_change_zones(vnode_t *vp)
2668 struct vfssw *vswp;
2669 int allow = 1;
2670 vnode_t *rvp;
2672 if (nfs_global_client_only != 0)
2673 return (1);
2676 * We always want to look at the underlying vnode if there is one.
2678 if (fop_realvp(vp, &rvp, NULL) != 0)
2679 rvp = vp;
2681 * Some pseudo filesystems (including doorfs) don't actually register
2682 * their vfsops_t, so the following may return NULL; we happily let
2683 * such vnodes switch zones.
2685 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2686 if (vswp != NULL) {
2687 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2688 allow = 0;
2689 vfs_unrefvfssw(vswp);
2691 return (allow);
2695 * Return nonzero if the vnode is a mount point, zero if not.
2698 vn_ismntpt(vnode_t *vp)
2700 return (vp->v_vfsmountedhere != NULL);
2703 /* Retrieve the vfs (if any) mounted on this vnode */
2704 vfs_t *
2705 vn_mountedvfs(vnode_t *vp)
2707 return (vp->v_vfsmountedhere);
2711 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2714 vn_in_dnlc(vnode_t *vp)
2716 return (vp->v_count_dnlc > 0);
2720 * vn_has_other_opens() checks whether a particular file is opened by more than
2721 * just the caller and whether the open is for read and/or write.
2722 * This routine is meant to be called after the caller has already called
2723 * fop_open() and wishes to know if they are the only one with it open for
2724 * the mode(s) specified.
2726 * Vnode counts are only kept on regular files (v_type=VREG).
2729 vn_has_other_opens(
2730 vnode_t *vp,
2731 v_mode_t mode)
2734 ASSERT(vp != NULL);
2736 switch (mode) {
2737 case V_WRITE:
2738 if (vp->v_wrcnt > 1)
2739 return (V_TRUE);
2740 break;
2741 case V_RDORWR:
2742 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2743 return (V_TRUE);
2744 break;
2745 case V_RDANDWR:
2746 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2747 return (V_TRUE);
2748 break;
2749 case V_READ:
2750 if (vp->v_rdcnt > 1)
2751 return (V_TRUE);
2752 break;
2755 return (V_FALSE);
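/*
 * Example (illustrative sketch, not part of this file): after a
 * successful fop_open() for writing, a caller such as the NFS server's
 * delegation logic can check whether it holds the only write open.
 */
#if 0	/* usage sketch only */
	if (vp->v_type == VREG && vn_has_other_opens(vp, V_WRITE) == V_FALSE) {
		/* we are the only writer; a write delegation may be safe */
	}
#endif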
2759 * vn_is_opened() checks whether a particular file is opened and
2760 * whether the open is for read and/or write.
2762 * Vnode counts are only kept on regular files (v_type=VREG).
2765 vn_is_opened(
2766 vnode_t *vp,
2767 v_mode_t mode)
2770 ASSERT(vp != NULL);
2772 switch (mode) {
2773 case V_WRITE:
2774 if (vp->v_wrcnt)
2775 return (V_TRUE);
2776 break;
2777 case V_RDANDWR:
2778 if (vp->v_rdcnt && vp->v_wrcnt)
2779 return (V_TRUE);
2780 break;
2781 case V_RDORWR:
2782 if (vp->v_rdcnt || vp->v_wrcnt)
2783 return (V_TRUE);
2784 break;
2785 case V_READ:
2786 if (vp->v_rdcnt)
2787 return (V_TRUE);
2788 break;
2791 return (V_FALSE);
2795 * vn_is_mapped() checks whether a particular file is mapped and whether
2796 * the file is mapped read and/or write.
2799 vn_is_mapped(
2800 vnode_t *vp,
2801 v_mode_t mode)
2804 ASSERT(vp != NULL);
2806 #if !defined(_LP64)
2807 switch (mode) {
2809 * The atomic_add_64_nv functions force atomicity in the
2810 * case of 32 bit architectures. Otherwise the 64 bit values
2811 * require two fetches, and the value of the fields may be
2812 * changed between the first fetch and the
2813 * second.
2815 case V_WRITE:
2816 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2817 return (V_TRUE);
2818 break;
2819 case V_RDANDWR:
2820 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2821 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2822 return (V_TRUE);
2823 break;
2824 case V_RDORWR:
2825 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2826 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2827 return (V_TRUE);
2828 break;
2829 case V_READ:
2830 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2831 return (V_TRUE);
2832 break;
2834 #else
2835 switch (mode) {
2836 case V_WRITE:
2837 if (vp->v_mmap_write)
2838 return (V_TRUE);
2839 break;
2840 case V_RDANDWR:
2841 if (vp->v_mmap_read && vp->v_mmap_write)
2842 return (V_TRUE);
2843 break;
2844 case V_RDORWR:
2845 if (vp->v_mmap_read || vp->v_mmap_write)
2846 return (V_TRUE);
2847 break;
2848 case V_READ:
2849 if (vp->v_mmap_read)
2850 return (V_TRUE);
2851 break;
2853 #endif
2855 return (V_FALSE);
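/*
 * Example (illustrative sketch, not part of this file): the same
 * add-zero trick can be used anywhere a 64 bit counter must be read
 * atomically on a 32 bit kernel.
 */
#if 0	/* usage sketch only */
	u_longlong_t reads;

	reads = atomic_add_64_nv((uint64_t *)&vp->v_mmap_read, 0);
#endif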
2859 * Set the operations vector for a vnode.
2861 * FEM ensures that the v_femhead pointer is filled in before the
2862 * v_op pointer is changed. This means that if the v_femhead pointer
2863 * is NULL, and the v_op field hasn't changed since we checked
2864 * the v_femhead pointer, then our update is ok - we are not racing with
2865 * FEM.
2867 void
2868 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2870 vnodeops_t *op;
2872 ASSERT(vp != NULL);
2873 ASSERT(vnodeops != NULL);
2875 op = vp->v_op;
2876 membar_consumer();
2878 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
2879 * the compare-and-swap on vp->v_op. If either test fails, then FEM is
2880 * in effect on the vnode and we need to have FEM deal with it.
2882 if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
2883 op) {
2884 fem_setvnops(vp, vnodeops);
2889 * Retrieve the operations vector for a vnode
2890 * As with vn_setops() above, make sure we aren't racing with FEM.
2891 * FEM sets the v_op to a special, internal vnodeops that wouldn't
2892 * make sense to the callers of this routine.
2894 vnodeops_t *
2895 vn_getops(vnode_t *vp)
2897 vnodeops_t *op;
2899 ASSERT(vp != NULL);
2901 op = vp->v_op;
2902 membar_consumer();
2903 if (vp->v_femhead == NULL && op == vp->v_op) {
2904 return (op);
2905 } else {
2906 return (fem_getvnops(vp));
2911 * Returns non-zero (1) if the vnodeops matches that of the vnode.
2912 * Returns zero (0) if not.
2915 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2917 return (vn_getops(vp) == vnodeops);
2921 * Returns non-zero (1) if the specified operation matches the
2922 * corresponding operation for the vnode.
2923 * Returns zero (0) if not.
2926 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2929 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2931 const fs_operation_trans_def_t *otdp;
2932 fs_generic_func_p *loc = NULL;
2933 vnodeops_t *vop = vn_getops(vp);
2935 ASSERT(vopname != NULL);
2937 for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2938 if (MATCHNAME(otdp->name, vopname)) {
2939 loc = (fs_generic_func_p *)
2940 ((char *)(vop) + otdp->offset);
2941 break;
2945 return ((loc != NULL) && (*loc == funcp));
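/*
 * Example (illustrative sketch, not part of this file): a caller can
 * test whether a vnode's read entry point is a particular function.
 * myfs_read is hypothetical.
 */
#if 0	/* usage sketch only */
	if (vn_matchopval(vp, VOPNAME_READ, (fs_generic_func_p)myfs_read)) {
		/* vp is serviced by myfs's read routine */
	}
#endif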
2949 * fs_new_caller_id() needs to return a unique ID on a given local system.
2950 * The IDs do not need to survive across reboots. These are primarily
2951 * used so that (FEM) monitors can detect particular callers (such as
2952 * the NFS server) to a given vnode/vfs operation.
2954 u_longlong_t
2955 fs_new_caller_id()
2957 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2959 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
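/*
 * Example (illustrative sketch, not part of this file): a subsystem
 * obtains one caller ID at startup and stamps it into the
 * caller_context_t it passes to vnode operations, so that FEM monitors
 * can recognize its calls.  The names are hypothetical.
 */
#if 0	/* usage sketch only */
static u_longlong_t mysrv_caller_id;

static void
mysrv_init(void)
{
	mysrv_caller_id = fs_new_caller_id();
}

static void
mysrv_fill_context(caller_context_t *ct, pid_t pid)
{
	ct->cc_pid = pid;
	ct->cc_sysid = 0;
	ct->cc_caller_id = mysrv_caller_id;
	ct->cc_flags = 0;
}
#endif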
2963 * Given a starting vnode and a path, updates the path in the target vnode in
2964 * a safe manner. If the vnode already has path information embedded, then the
2965 * cached path is left untouched.
2968 size_t max_vnode_path = 4 * MAXPATHLEN;
2970 void
2971 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2972 const char *path, size_t plen)
2974 char *rpath;
2975 vnode_t *base;
2976 size_t rpathlen, rpathalloc;
2977 int doslash = 1;
2979 if (*path == '/') {
2980 base = rootvp;
2981 path++;
2982 plen--;
2983 } else {
2984 base = startvp;
2988 * We cannot grab base->v_lock while we hold vp->v_lock because of
2989 * the potential for deadlock.
2991 mutex_enter(&base->v_lock);
2992 if (base->v_path == NULL) {
2993 mutex_exit(&base->v_lock);
2994 return;
2997 rpathlen = strlen(base->v_path);
2998 rpathalloc = rpathlen + plen + 1;
2999 /* Avoid adding a slash if there's already one there */
3000 if (base->v_path[rpathlen-1] == '/')
3001 doslash = 0;
3002 else
3003 rpathalloc++;
3006 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
3007 * so we must do this dance. If, by chance, something changes the path,
3008 * just give up since there is no real harm.
3010 mutex_exit(&base->v_lock);
3012 /* Paths should stay within reason */
3013 if (rpathalloc > max_vnode_path)
3014 return;
3016 rpath = kmem_alloc(rpathalloc, KM_SLEEP);
3018 mutex_enter(&base->v_lock);
3019 if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
3020 mutex_exit(&base->v_lock);
3021 kmem_free(rpath, rpathalloc);
3022 return;
3024 bcopy(base->v_path, rpath, rpathlen);
3025 mutex_exit(&base->v_lock);
3027 if (doslash)
3028 rpath[rpathlen++] = '/';
3029 bcopy(path, rpath + rpathlen, plen);
3030 rpath[rpathlen + plen] = '\0';
3032 mutex_enter(&vp->v_lock);
3033 if (vp->v_path != NULL) {
3034 mutex_exit(&vp->v_lock);
3035 kmem_free(rpath, rpathalloc);
3036 } else {
3037 vp->v_path = rpath;
3038 mutex_exit(&vp->v_lock);
3043 * Sets the path to the vnode to be the given string, regardless of current
3044 * context. The string must be a complete path from rootdir. This is only used
3045 * by fsop_root() for setting the path based on the mountpoint.
3047 void
3048 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3050 char *buf = kmem_alloc(len + 1, KM_SLEEP);
3052 mutex_enter(&vp->v_lock);
3053 if (vp->v_path != NULL) {
3054 mutex_exit(&vp->v_lock);
3055 kmem_free(buf, len + 1);
3056 return;
3059 vp->v_path = buf;
3060 bcopy(str, vp->v_path, len);
3061 vp->v_path[len] = '\0';
3063 mutex_exit(&vp->v_lock);
3067 * Called from within filesystem's vop_rename() to handle renames once the
3068 * target vnode is available.
3070 void
3071 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3073 char *tmp;
3075 mutex_enter(&vp->v_lock);
3076 tmp = vp->v_path;
3077 vp->v_path = NULL;
3078 mutex_exit(&vp->v_lock);
3079 vn_setpath(rootdir, dvp, vp, nm, len);
3080 if (tmp != NULL)
3081 kmem_free(tmp, strlen(tmp) + 1);
3085 * Similar to vn_setpath_str(), this function sets the path of the destination
3086 * vnode to be the same as that of the source vnode.
3088 void
3089 vn_copypath(struct vnode *src, struct vnode *dst)
3091 char *buf;
3092 int alloc;
3094 mutex_enter(&src->v_lock);
3095 if (src->v_path == NULL) {
3096 mutex_exit(&src->v_lock);
3097 return;
3099 alloc = strlen(src->v_path) + 1;
3101 /* avoid kmem_alloc() with lock held */
3102 mutex_exit(&src->v_lock);
3103 buf = kmem_alloc(alloc, KM_SLEEP);
3104 mutex_enter(&src->v_lock);
3105 if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3106 mutex_exit(&src->v_lock);
3107 kmem_free(buf, alloc);
3108 return;
3110 bcopy(src->v_path, buf, alloc);
3111 mutex_exit(&src->v_lock);
3113 mutex_enter(&dst->v_lock);
3114 if (dst->v_path != NULL) {
3115 mutex_exit(&dst->v_lock);
3116 kmem_free(buf, alloc);
3117 return;
3119 dst->v_path = buf;
3120 mutex_exit(&dst->v_lock);
3124 * XXX Private interface for segvn routines that handle vnode
3125 * large page segments.
3127 * return 1 if vp's file system fop_pageio() implementation
3128 * can be safely used instead of fop_getpage() for handling
3129 * pagefaults against regular non-swap files. The fop_pageio()
3130 * interface is considered safe here if its implementation
3131 * is very close to the fop_getpage() implementation,
3132 * e.g. it zeroes out the part of the page beyond EOF, doesn't
3133 * panic if there are file holes but instead returns an error,
3134 * and doesn't assume the file won't be changed by user writes, etc.
3136 * return 0 otherwise.
3138 * For now allow segvn to only use fop_pageio() with ufs and nfs.
3141 vn_vmpss_usepageio(vnode_t *vp)
3143 vfs_t *vfsp = vp->v_vfsp;
3144 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3145 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3146 char **fsok = pageio_ok_fss;
3148 if (fsname == NULL) {
3149 return (0);
3152 for (; *fsok; fsok++) {
3153 if (strcmp(*fsok, fsname) == 0) {
3154 return (1);
3157 return (0);
3160 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3163 fop_open(
3164 vnode_t **vpp,
3165 int mode,
3166 cred_t *cr,
3167 caller_context_t *ct)
3169 int ret;
3170 vnode_t *vp = *vpp;
3172 VN_HOLD(vp);
3174 * Adding to the vnode counts before calling open
3175 * avoids the need for a mutex. It circumvents a race
3176 * condition where a query made on the vnode counts results in a
3177 * false negative: the inquirer goes away believing the file is
3178 * not open when an open on the file is already under way.
3180 * The counts are meant to prevent NFS from granting a delegation
3181 * when it would be dangerous to do so.
3183 * The vnode counts are only kept on regular files.
3185 if ((*vpp)->v_type == VREG) {
3186 if (mode & FREAD)
3187 atomic_inc_32(&(*vpp)->v_rdcnt);
3188 if (mode & FWRITE)
3189 atomic_inc_32(&(*vpp)->v_wrcnt);
3192 VOPXID_MAP_CR(vp, cr);
3194 ret = fop_open_dispatch(vpp, mode, cr, ct);
3196 if (ret) {
3198 * Use the saved vp just in case the vnode ptr got trashed
3199 * by the error.
3201 VOPSTATS_UPDATE(vp, open);
3202 if ((vp->v_type == VREG) && (mode & FREAD))
3203 atomic_dec_32(&vp->v_rdcnt);
3204 if ((vp->v_type == VREG) && (mode & FWRITE))
3205 atomic_dec_32(&vp->v_wrcnt);
3206 } else {
3208 * Some filesystems will return a different vnode,
3209 * but the same path was still used to open it.
3210 * So if we do change the vnode and need to
3211 * copy over the path, do so here, rather than special
3212 * casing each filesystem. Adjust the vnode counts to
3213 * reflect the vnode switch.
3215 VOPSTATS_UPDATE(*vpp, open);
3216 if (*vpp != vp && *vpp != NULL) {
3217 vn_copypath(vp, *vpp);
3218 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3219 atomic_inc_32(&(*vpp)->v_rdcnt);
3220 if ((vp->v_type == VREG) && (mode & FREAD))
3221 atomic_dec_32(&vp->v_rdcnt);
3222 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3223 atomic_inc_32(&(*vpp)->v_wrcnt);
3224 if ((vp->v_type == VREG) && (mode & FWRITE))
3225 atomic_dec_32(&vp->v_wrcnt);
3228 VN_RELE(vp);
3229 return (ret);
3233 fop_close(
3234 vnode_t *vp,
3235 int flag,
3236 int count,
3237 offset_t offset,
3238 cred_t *cr,
3239 caller_context_t *ct)
3241 int err;
3243 VOPXID_MAP_CR(vp, cr);
3245 err = fop_close_dispatch(vp, flag, count, offset, cr, ct);
3247 VOPSTATS_UPDATE(vp, close);
3249 * Check passed in count to handle possible dups. Vnode counts are only
3250 * kept on regular files
3252 if ((vp->v_type == VREG) && (count == 1)) {
3253 if (flag & FREAD) {
3254 ASSERT(vp->v_rdcnt > 0);
3255 atomic_dec_32(&vp->v_rdcnt);
3257 if (flag & FWRITE) {
3258 ASSERT(vp->v_wrcnt > 0);
3259 atomic_dec_32(&vp->v_wrcnt);
3262 return (err);
3266 fop_read(
3267 vnode_t *vp,
3268 uio_t *uiop,
3269 int ioflag,
3270 cred_t *cr,
3271 caller_context_t *ct)
3273 int err;
3274 ssize_t resid_start = uiop->uio_resid;
3276 VOPXID_MAP_CR(vp, cr);
3278 err = fop_read_dispatch(vp, uiop, ioflag, cr, ct);
3280 VOPSTATS_UPDATE_IO(vp, read,
3281 read_bytes, (resid_start - uiop->uio_resid));
3282 return (err);
3286 fop_write(
3287 vnode_t *vp,
3288 uio_t *uiop,
3289 int ioflag,
3290 cred_t *cr,
3291 caller_context_t *ct)
3293 int err;
3294 ssize_t resid_start = uiop->uio_resid;
3296 VOPXID_MAP_CR(vp, cr);
3298 err = fop_write_dispatch(vp, uiop, ioflag, cr, ct);
3300 VOPSTATS_UPDATE_IO(vp, write,
3301 write_bytes, (resid_start - uiop->uio_resid));
3302 return (err);
3306 fop_ioctl(
3307 vnode_t *vp,
3308 int cmd,
3309 intptr_t arg,
3310 int flag,
3311 cred_t *cr,
3312 int *rvalp,
3313 caller_context_t *ct)
3315 int err;
3317 VOPXID_MAP_CR(vp, cr);
3319 err = fop_ioctl_dispatch(vp, cmd, arg, flag, cr, rvalp, ct);
3321 VOPSTATS_UPDATE(vp, ioctl);
3322 return (err);
3326 fop_setfl(
3327 vnode_t *vp,
3328 int oflags,
3329 int nflags,
3330 cred_t *cr,
3331 caller_context_t *ct)
3333 int err;
3335 VOPXID_MAP_CR(vp, cr);
3337 if (vp->v_op->vop_setfl == NULL)
3338 err = fs_setfl(vp, oflags, nflags, cr, ct);
3339 else
3340 err = vp->v_op->vop_setfl(vp, oflags, nflags, cr, ct);
3342 VOPSTATS_UPDATE(vp, setfl);
3343 return (err);
3347 fop_getattr(
3348 vnode_t *vp,
3349 vattr_t *vap,
3350 int flags,
3351 cred_t *cr,
3352 caller_context_t *ct)
3354 int err;
3356 VOPXID_MAP_CR(vp, cr);
3359 * If this file system doesn't understand the xvattr extensions
3360 * then turn off the xvattr bit.
3362 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3363 vap->va_mask &= ~AT_XVATTR;
3367 * We're only allowed to skip the ACL check iff we used a 32 bit
3368 * ACE mask with fop_access() to determine permissions.
3370 if ((flags & ATTR_NOACLCHECK) &&
3371 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3372 return (EINVAL);
3374 err = fop_getattr_dispatch(vp, vap, flags, cr, ct);
3376 VOPSTATS_UPDATE(vp, getattr);
3377 return (err);
3381 fop_setattr(
3382 vnode_t *vp,
3383 vattr_t *vap,
3384 int flags,
3385 cred_t *cr,
3386 caller_context_t *ct)
3388 int err;
3390 VOPXID_MAP_CR(vp, cr);
3393 * If this file system doesn't understand the xvattr extensions
3394 * then turn off the xvattr bit.
3396 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3397 vap->va_mask &= ~AT_XVATTR;
3401 * We're only allowed to skip the ACL check iff we used a 32 bit
3402 * ACE mask with fop_access() to determine permissions.
3404 if ((flags & ATTR_NOACLCHECK) &&
3405 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3406 return (EINVAL);
3408 err = fop_setattr_dispatch(vp, vap, flags, cr, ct);
3410 VOPSTATS_UPDATE(vp, setattr);
3411 return (err);
3415 fop_access(
3416 vnode_t *vp,
3417 int mode,
3418 int flags,
3419 cred_t *cr,
3420 caller_context_t *ct)
3422 int err;
3424 if ((flags & V_ACE_MASK) &&
3425 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3426 return (EINVAL);
3429 VOPXID_MAP_CR(vp, cr);
3431 err = fop_access_dispatch(vp, mode, flags, cr, ct);
3433 VOPSTATS_UPDATE(vp, access);
3434 return (err);
3438 fop_lookup(
3439 vnode_t *dvp,
3440 char *nm,
3441 vnode_t **vpp,
3442 pathname_t *pnp,
3443 int flags,
3444 vnode_t *rdir,
3445 cred_t *cr,
3446 caller_context_t *ct,
3447 int *deflags, /* Returned per-dirent flags */
3448 pathname_t *ppnp) /* Returned case-preserved name in directory */
3450 int ret;
3453 * If this file system doesn't support case-insensitive access
3454 * and said access is requested, fail quickly. It is required
3455 * that if the vfs supports case-insensitive lookup, it also
3456 * supports extended dirent flags.
3458 if (flags & FIGNORECASE &&
3459 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3460 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3461 return (EINVAL);
3463 VOPXID_MAP_CR(dvp, cr);
3465 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3466 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3467 } else if (dvp->v_op->vop_lookup == NULL) {
3468 ret = ENOSYS;
3469 } else {
3470 ret = dvp->v_op->vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
3471 cr, ct, deflags, ppnp);
3474 if (ret == 0 && *vpp) {
3475 VOPSTATS_UPDATE(*vpp, lookup);
3476 if ((*vpp)->v_path == NULL) {
3477 vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3481 return (ret);
3485 fop_create(
3486 vnode_t *dvp,
3487 char *name,
3488 vattr_t *vap,
3489 vcexcl_t excl,
3490 int mode,
3491 vnode_t **vpp,
3492 cred_t *cr,
3493 int flags,
3494 caller_context_t *ct,
3495 vsecattr_t *vsecp) /* ACL to set during create */
3497 int ret;
3499 if (vsecp != NULL &&
3500 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3501 return (EINVAL);
3504 * If this file system doesn't support case-insensitive access
3505 * and said access is requested, fail quickly.
3507 if (flags & FIGNORECASE &&
3508 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3509 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3510 return (EINVAL);
3512 VOPXID_MAP_CR(dvp, cr);
3514 if (dvp->v_op->vop_create == NULL)
3515 ret = ENOSYS;
3516 else
3517 ret = dvp->v_op->vop_create(dvp, name, vap, excl, mode, vpp,
3518 cr, flags, ct, vsecp);
3520 if (ret == 0 && *vpp) {
3521 VOPSTATS_UPDATE(*vpp, create);
3522 if ((*vpp)->v_path == NULL) {
3523 vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3527 return (ret);
3531 fop_remove(
3532 vnode_t *dvp,
3533 char *nm,
3534 cred_t *cr,
3535 caller_context_t *ct,
3536 int flags)
3538 int err;
3541 * If this file system doesn't support case-insensitive access
3542 * and said access is requested, fail quickly.
3544 if (flags & FIGNORECASE &&
3545 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3546 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3547 return (EINVAL);
3549 VOPXID_MAP_CR(dvp, cr);
3551 if (dvp->v_op->vop_remove == NULL)
3552 err = ENOSYS;
3553 else
3554 err = dvp->v_op->vop_remove(dvp, nm, cr, ct, flags);
3556 VOPSTATS_UPDATE(dvp, remove);
3557 return (err);
3561 fop_link(
3562 vnode_t *tdvp,
3563 vnode_t *svp,
3564 char *tnm,
3565 cred_t *cr,
3566 caller_context_t *ct,
3567 int flags)
3569 int err;
3572 * If the target file system doesn't support case-insensitive access
3573 * and said access is requested, fail quickly.
3575 if (flags & FIGNORECASE &&
3576 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3577 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3578 return (EINVAL);
3580 VOPXID_MAP_CR(tdvp, cr);
3582 if (tdvp->v_op->vop_link == NULL)
3583 err = ENOSYS;
3584 else
3585 err = tdvp->v_op->vop_link(tdvp, svp, tnm, cr, ct, flags);
3587 VOPSTATS_UPDATE(tdvp, link);
3588 return (err);
3592 fop_rename(
3593 vnode_t *sdvp,
3594 char *snm,
3595 vnode_t *tdvp,
3596 char *tnm,
3597 cred_t *cr,
3598 caller_context_t *ct,
3599 int flags)
3601 int err;
3604 * If the file system involved does not support
3605 * case-insensitive access and said access is requested, fail
3606 * quickly.
3608 if (flags & FIGNORECASE &&
3609 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3610 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3611 return (EINVAL);
3613 VOPXID_MAP_CR(tdvp, cr);
3615 if (sdvp->v_op->vop_rename == NULL)
3616 err = ENOSYS;
3617 else
3618 err = sdvp->v_op->vop_rename(sdvp, snm, tdvp, tnm, cr, ct,
3619 flags);
3621 VOPSTATS_UPDATE(sdvp, rename);
3622 return (err);
3626 fop_mkdir(
3627 vnode_t *dvp,
3628 char *dirname,
3629 vattr_t *vap,
3630 vnode_t **vpp,
3631 cred_t *cr,
3632 caller_context_t *ct,
3633 int flags,
3634 vsecattr_t *vsecp) /* ACL to set during create */
3636 int ret;
3638 if (vsecp != NULL &&
3639 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3640 return (EINVAL);
3643 * If this file system doesn't support case-insensitive access
3644 * and said access is requested, fail quickly.
3646 if (flags & FIGNORECASE &&
3647 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3648 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3649 return (EINVAL);
3651 VOPXID_MAP_CR(dvp, cr);
3653 if (dvp->v_op->vop_mkdir == NULL)
3654 ret = ENOSYS;
3655 else
3656 ret = dvp->v_op->vop_mkdir(dvp, dirname, vap, vpp, cr, ct,
3657 flags, vsecp);
3659 if (ret == 0 && *vpp) {
3660 VOPSTATS_UPDATE(*vpp, mkdir);
3661 if ((*vpp)->v_path == NULL) {
3662 vn_setpath(rootdir, dvp, *vpp, dirname,
3663 strlen(dirname));
3667 return (ret);
3671 fop_rmdir(
3672 vnode_t *dvp,
3673 char *nm,
3674 vnode_t *cdir,
3675 cred_t *cr,
3676 caller_context_t *ct,
3677 int flags)
3679 int err;
3682 * If this file system doesn't support case-insensitive access
3683 * and said access is requested, fail quickly.
3685 if (flags & FIGNORECASE &&
3686 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3687 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3688 return (EINVAL);
3690 VOPXID_MAP_CR(dvp, cr);
3692 if (dvp->v_op->vop_rmdir == NULL)
3693 err = ENOSYS;
3694 else
3695 err = dvp->v_op->vop_rmdir(dvp, nm, cdir, cr, ct, flags);
3697 VOPSTATS_UPDATE(dvp, rmdir);
3698 return (err);
3702 fop_readdir(
3703 vnode_t *vp,
3704 uio_t *uiop,
3705 cred_t *cr,
3706 int *eofp,
3707 caller_context_t *ct,
3708 int flags)
3710 int err;
3711 ssize_t resid_start = uiop->uio_resid;
3714 * If this file system doesn't support retrieving directory
3715 * entry flags and said access is requested, fail quickly.
3717 if (flags & V_RDDIR_ENTFLAGS &&
3718 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3719 return (EINVAL);
3721 VOPXID_MAP_CR(vp, cr);
3723 err = fop_readdir_dispatch(vp, uiop, cr, eofp, ct, flags);
3725 VOPSTATS_UPDATE_IO(vp, readdir,
3726 readdir_bytes, (resid_start - uiop->uio_resid));
3727 return (err);
3731 fop_symlink(
3732 vnode_t *dvp,
3733 char *linkname,
3734 vattr_t *vap,
3735 char *target,
3736 cred_t *cr,
3737 caller_context_t *ct,
3738 int flags)
3740 int err;
3741 xvattr_t xvattr;
3744 * If this file system doesn't support case-insensitive access
3745 * and said access is requested, fail quickly.
3747 if (flags & FIGNORECASE &&
3748 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3749 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3750 return (EINVAL);
3752 VOPXID_MAP_CR(dvp, cr);
3754 /* check for reparse point */
3755 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3756 (strncmp(target, FS_REPARSE_TAG_STR,
3757 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3758 if (!fs_reparse_mark(target, vap, &xvattr))
3759 vap = (vattr_t *)&xvattr;
3762 if (dvp->v_op->vop_symlink == NULL)
3763 err = ENOSYS;
3764 else
3765 err = dvp->v_op->vop_symlink(dvp, linkname, vap, target, cr,
3766 ct, flags);
3768 VOPSTATS_UPDATE(dvp, symlink);
3769 return (err);
3773 fop_readlink(
3774 vnode_t *vp,
3775 uio_t *uiop,
3776 cred_t *cr,
3777 caller_context_t *ct)
3779 int err;
3781 VOPXID_MAP_CR(vp, cr);
3783 err = fop_readlink_dispatch(vp, uiop, cr, ct);
3785 VOPSTATS_UPDATE(vp, readlink);
3786 return (err);
3790 fop_fsync(
3791 vnode_t *vp,
3792 int syncflag,
3793 cred_t *cr,
3794 caller_context_t *ct)
3796 int err;
3798 VOPXID_MAP_CR(vp, cr);
3800 err = fop_fsync_dispatch(vp, syncflag, cr, ct);
3802 VOPSTATS_UPDATE(vp, fsync);
3803 return (err);
3806 void
3807 fop_inactive(
3808 vnode_t *vp,
3809 cred_t *cr,
3810 caller_context_t *ct)
3812 /* Need to update stats before vop call since we may lose the vnode */
3813 VOPSTATS_UPDATE(vp, inactive);
3815 VOPXID_MAP_CR(vp, cr);
3817 if (vp->v_op->vop_inactive != NULL)
3818 vp->v_op->vop_inactive(vp, cr, ct);
3822 fop_fid(
3823 vnode_t *vp,
3824 fid_t *fidp,
3825 caller_context_t *ct)
3827 int err;
3829 err = fop_fid_dispatch(vp, fidp, ct);
3831 VOPSTATS_UPDATE(vp, fid);
3832 return (err);
3836 fop_rwlock(
3837 vnode_t *vp,
3838 int write_lock,
3839 caller_context_t *ct)
3841 int ret;
3843 if (vp->v_op->vop_rwlock == NULL)
3844 ret = fs_rwlock(vp, write_lock, ct);
3845 else
3846 ret = vp->v_op->vop_rwlock(vp, write_lock, ct);
3848 VOPSTATS_UPDATE(vp, rwlock);
3849 return (ret);
3852 void
3853 fop_rwunlock(
3854 vnode_t *vp,
3855 int write_lock,
3856 caller_context_t *ct)
3858 if (vp->v_op->vop_rwunlock == NULL)
3859 fs_rwunlock(vp, write_lock, ct);
3860 else
3861 vp->v_op->vop_rwunlock(vp, write_lock, ct);
3863 VOPSTATS_UPDATE(vp, rwunlock);
3867 fop_seek(
3868 vnode_t *vp,
3869 offset_t ooff,
3870 offset_t *noffp,
3871 caller_context_t *ct)
3873 int err;
3875 err = fop_seek_dispatch(vp, ooff, noffp, ct);
3877 VOPSTATS_UPDATE(vp, seek);
3878 return (err);
3882 fop_cmp(
3883 vnode_t *vp1,
3884 vnode_t *vp2,
3885 caller_context_t *ct)
3887 int err;
3889 if (vp1->v_op->vop_cmp == NULL)
3890 err = fs_cmp(vp1, vp2, ct);
3891 else
3892 err = vp1->v_op->vop_cmp(vp1, vp2, ct);
3894 VOPSTATS_UPDATE(vp1, cmp);
3895 return (err);
3899 fop_frlock(
3900 vnode_t *vp,
3901 int cmd,
3902 flock64_t *bfp,
3903 int flag,
3904 offset_t offset,
3905 struct flk_callback *flk_cbp,
3906 cred_t *cr,
3907 caller_context_t *ct)
3909 int err;
3911 VOPXID_MAP_CR(vp, cr);
3913 if (vp->v_op->vop_frlock == NULL)
3914 err = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3915 else
3916 err = vp->v_op->vop_frlock(vp, cmd, bfp, flag, offset,
3917 flk_cbp, cr, ct);
3919 VOPSTATS_UPDATE(vp, frlock);
3920 return (err);
3924 fop_space(
3925 vnode_t *vp,
3926 int cmd,
3927 flock64_t *bfp,
3928 int flag,
3929 offset_t offset,
3930 cred_t *cr,
3931 caller_context_t *ct)
3933 int err;
3935 VOPXID_MAP_CR(vp, cr);
3937 err = fop_space_dispatch(vp, cmd, bfp, flag, offset, cr, ct);
3939 VOPSTATS_UPDATE(vp, space);
3940 return (err);
3944 fop_realvp(
3945 vnode_t *vp,
3946 vnode_t **vpp,
3947 caller_context_t *ct)
3949 int err;
3951 err = fop_realvp_dispatch(vp, vpp, ct);
3953 VOPSTATS_UPDATE(vp, realvp);
3954 return (err);
3958 fop_getpage(
3959 vnode_t *vp,
3960 offset_t off,
3961 size_t len,
3962 uint_t *protp,
3963 page_t **plarr,
3964 size_t plsz,
3965 struct seg *seg,
3966 caddr_t addr,
3967 enum seg_rw rw,
3968 cred_t *cr,
3969 caller_context_t *ct)
3971 int err;
3973 VOPXID_MAP_CR(vp, cr);
3975 err = fop_getpage_dispatch(vp, off, len, protp, plarr, plsz, seg,
3976 addr, rw, cr, ct);
3978 VOPSTATS_UPDATE(vp, getpage);
3979 return (err);
3983 fop_putpage(
3984 vnode_t *vp,
3985 offset_t off,
3986 size_t len,
3987 int flags,
3988 cred_t *cr,
3989 caller_context_t *ct)
3991 int err;
3993 VOPXID_MAP_CR(vp, cr);
3995 err = fop_putpage_dispatch(vp, off, len, flags, cr, ct);
3997 VOPSTATS_UPDATE(vp, putpage);
3998 return (err);
4002 fop_map(
4003 vnode_t *vp,
4004 offset_t off,
4005 struct as *as,
4006 caddr_t *addrp,
4007 size_t len,
4008 uchar_t prot,
4009 uchar_t maxprot,
4010 uint_t flags,
4011 cred_t *cr,
4012 caller_context_t *ct)
4014 int err;
4016 VOPXID_MAP_CR(vp, cr);
4018 err = fop_map_dispatch(vp, off, as, addrp, len, prot, maxprot,
4019 flags, cr, ct);
4021 VOPSTATS_UPDATE(vp, map);
4022 return (err);
4026 fop_addmap(
4027 vnode_t *vp,
4028 offset_t off,
4029 struct as *as,
4030 caddr_t addr,
4031 size_t len,
4032 uchar_t prot,
4033 uchar_t maxprot,
4034 uint_t flags,
4035 cred_t *cr,
4036 caller_context_t *ct)
4038 int error;
4039 u_longlong_t delta;
4041 VOPXID_MAP_CR(vp, cr);
4043 error = fop_addmap_dispatch(vp, off, as, addr, len, prot, maxprot,
4044 flags, cr, ct);
4046 if ((!error) && (vp->v_type == VREG)) {
4047 delta = (u_longlong_t)btopr(len);
4049 * If the file is mapped MAP_PRIVATE, it can't be written back
4050 * even if open for write. Handle as read.
4052 if (flags & MAP_PRIVATE) {
4053 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4054 (int64_t)delta);
4055 } else {
4057 * atomic_add_64 forces the fetch of a 64 bit value to
4058 * be atomic on 32 bit machines
4060 if (maxprot & PROT_WRITE)
4061 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4062 (int64_t)delta);
4063 if (maxprot & PROT_READ)
4064 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4065 (int64_t)delta);
4066 if (maxprot & PROT_EXEC)
4067 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4068 (int64_t)delta);
4071 VOPSTATS_UPDATE(vp, addmap);
4072 return (error);
4076 fop_delmap(
4077 vnode_t *vp,
4078 offset_t off,
4079 struct as *as,
4080 caddr_t addr,
4081 size_t len,
4082 uint_t prot,
4083 uint_t maxprot,
4084 uint_t flags,
4085 cred_t *cr,
4086 caller_context_t *ct)
4088 int error;
4089 u_longlong_t delta;
4091 VOPXID_MAP_CR(vp, cr);
4093 error = fop_delmap_dispatch(vp, off, as, addr, len, prot, maxprot,
4094 flags, cr, ct);
4097 * NFS calls into delmap twice: the first time
4098 * it simply establishes a callback mechanism and returns EAGAIN,
4099 * while the real work is done upon the second invocation.
4100 * We have to detect this here and only decrement the counts upon
4101 * the second delmap request.
4103 if ((error != EAGAIN) && (vp->v_type == VREG)) {
4105 delta = (u_longlong_t)btopr(len);
4107 if (flags & MAP_PRIVATE) {
4108 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4109 (int64_t)(-delta));
4110 } else {
4112 * atomic_add_64 forces the fetch of a 64 bit value
4113 * to be atomic on 32 bit machines
4115 if (maxprot & PROT_WRITE)
4116 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4117 (int64_t)(-delta));
4118 if (maxprot & PROT_READ)
4119 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4120 (int64_t)(-delta));
4121 if (maxprot & PROT_EXEC)
4122 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4123 (int64_t)(-delta));
4126 VOPSTATS_UPDATE(vp, delmap);
4127 return (error);
4132 fop_poll(
4133 vnode_t *vp,
4134 short events,
4135 int anyyet,
4136 short *reventsp,
4137 struct pollhead **phpp,
4138 caller_context_t *ct)
4140 int err;
4142 if (vp->v_op->vop_poll == NULL)
4143 err = fs_poll(vp, events, anyyet, reventsp, phpp, ct);
4144 else
4145 err = vp->v_op->vop_poll(vp, events, anyyet, reventsp, phpp,
4146 ct);
4148 VOPSTATS_UPDATE(vp, poll);
4149 return (err);
4153 fop_dump(
4154 vnode_t *vp,
4155 caddr_t addr,
4156 offset_t lbdn,
4157 offset_t dblks,
4158 caller_context_t *ct)
4160 int err;
4162 /* ensure lbdn and dblks can be passed safely to bdev_dump */
4163 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4164 return (EIO);
4166 err = fop_dump_dispatch(vp, addr, lbdn, dblks, ct);
4168 VOPSTATS_UPDATE(vp, dump);
4169 return (err);
4173 fop_pathconf(
4174 vnode_t *vp,
4175 int cmd,
4176 ulong_t *valp,
4177 cred_t *cr,
4178 caller_context_t *ct)
4180 int err;
4182 VOPXID_MAP_CR(vp, cr);
4184 if (vp->v_op->vop_pathconf == NULL)
4185 err = fs_pathconf(vp, cmd, valp, cr, ct);
4186 else
4187 err = vp->v_op->vop_pathconf(vp, cmd, valp, cr, ct);
4189 VOPSTATS_UPDATE(vp, pathconf);
4190 return (err);
4194 fop_pageio(
4195 vnode_t *vp,
4196 struct page *pp,
4197 uoff_t io_off,
4198 size_t io_len,
4199 int flags,
4200 cred_t *cr,
4201 caller_context_t *ct)
4203 int err;
4205 VOPXID_MAP_CR(vp, cr);
4207 err = fop_pageio_dispatch(vp, pp, io_off, io_len, flags, cr, ct);
4209 VOPSTATS_UPDATE(vp, pageio);
4210 return (err);
4214 fop_dumpctl(
4215 vnode_t *vp,
4216 int action,
4217 offset_t *blkp,
4218 caller_context_t *ct)
4220 int err;
4222 err = fop_dumpctl_dispatch(vp, action, blkp, ct);
4224 VOPSTATS_UPDATE(vp, dumpctl);
4225 return (err);
4228 void
4229 fop_dispose(
4230 vnode_t *vp,
4231 page_t *pp,
4232 int flag,
4233 int dn,
4234 cred_t *cr,
4235 caller_context_t *ct)
4237 /* Must do stats first since it's possible to lose the vnode */
4238 VOPSTATS_UPDATE(vp, dispose);
4240 VOPXID_MAP_CR(vp, cr);
4242 if (vp->v_op->vop_dispose == NULL)
4243 fs_dispose(vp, pp, flag, dn, cr, ct);
4244 else
4245 vp->v_op->vop_dispose(vp, pp, flag, dn, cr, ct);
4249 fop_setsecattr(
4250 vnode_t *vp,
4251 vsecattr_t *vsap,
4252 int flag,
4253 cred_t *cr,
4254 caller_context_t *ct)
4256 int err;
4258 VOPXID_MAP_CR(vp, cr);
4261 * We're only allowed to skip the ACL check iff we used a 32 bit
4262 * ACE mask with fop_access() to determine permissions.
4264 if ((flag & ATTR_NOACLCHECK) &&
4265 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4266 return (EINVAL);
4269 err = fop_setsecattr_dispatch(vp, vsap, flag, cr, ct);
4271 VOPSTATS_UPDATE(vp, setsecattr);
4272 return (err);
4276 fop_getsecattr(
4277 vnode_t *vp,
4278 vsecattr_t *vsap,
4279 int flag,
4280 cred_t *cr,
4281 caller_context_t *ct)
4283 int err;
4286 * We're only allowed to skip the ACL check iff we used a 32 bit
4287 * ACE mask with fop_access() to determine permissions.
4289 if ((flag & ATTR_NOACLCHECK) &&
4290 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4291 return (EINVAL);
4294 VOPXID_MAP_CR(vp, cr);
4296 if (vp->v_op->vop_getsecattr == NULL)
4297 err = fs_fab_acl(vp, vsap, flag, cr, ct);
4298 else
4299 err = vp->v_op->vop_getsecattr(vp, vsap, flag, cr, ct);
4301 VOPSTATS_UPDATE(vp, getsecattr);
4302 return (err);
4306 fop_shrlock(
4307 vnode_t *vp,
4308 int cmd,
4309 struct shrlock *shr,
4310 int flag,
4311 cred_t *cr,
4312 caller_context_t *ct)
4314 int err;
4316 VOPXID_MAP_CR(vp, cr);
4318 if (vp->v_op->vop_shrlock == NULL)
4319 err = fs_shrlock(vp, cmd, shr, flag, cr, ct);
4320 else
4321 err = vp->v_op->vop_shrlock(vp, cmd, shr, flag, cr, ct);
4323 VOPSTATS_UPDATE(vp, shrlock);
4324 return (err);
4328 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4329 caller_context_t *ct)
4331 int err;
4333 err = fop_vnevent_dispatch(vp, vnevent, dvp, fnm, ct);
4335 VOPSTATS_UPDATE(vp, vnevent);
4336 return (err);
4340 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4341 caller_context_t *ct)
4343 int err;
4345 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4346 return (ENOTSUP);
4348 err = fop_reqzcbuf_dispatch(vp, ioflag, uiop, cr, ct);
4350 VOPSTATS_UPDATE(vp, reqzcbuf);
4351 return (err);
4355 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4357 int err;
4359 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4360 return (ENOTSUP);
4362 err = fop_retzcbuf_dispatch(vp, uiop, cr, ct);
4364 VOPSTATS_UPDATE(vp, retzcbuf);
4365 return (err);
4369 * Default destructor.
4370 * Needed because a NULL destructor means that the key is unused.
4372 /* ARGSUSED */
4373 void
4374 vsd_defaultdestructor(void *value)
4378 * Create a key (index into per vnode array)
4379 * Locks out vsd_create, vsd_destroy, and vsd_free
4380 * May allocate memory with lock held
4382 void
4383 vsd_create(uint_t *keyp, void (*destructor)(void *))
4385 int i;
4386 uint_t nkeys;
4389 * if key is allocated, do nothing
4391 mutex_enter(&vsd_lock);
4392 if (*keyp) {
4393 mutex_exit(&vsd_lock);
4394 return;
4397 * find an unused key
4399 if (destructor == NULL)
4400 destructor = vsd_defaultdestructor;
4402 for (i = 0; i < vsd_nkeys; ++i)
4403 if (vsd_destructor[i] == NULL)
4404 break;
4407 * if no unused keys, increase the size of the destructor array
4409 if (i == vsd_nkeys) {
4410 if ((nkeys = (vsd_nkeys << 1)) == 0)
4411 nkeys = 1;
4412 vsd_destructor =
4413 (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4414 (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4415 (size_t)(nkeys * sizeof (void (*)(void *))));
4416 vsd_nkeys = nkeys;
4420 * allocate the next available unused key
4422 vsd_destructor[i] = destructor;
4423 *keyp = i + 1;
4425 /* create vsd_list, if it doesn't exist */
4426 if (vsd_list == NULL) {
4427 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4428 list_create(vsd_list, sizeof (struct vsd_node),
4429 offsetof(struct vsd_node, vs_nodes));
4432 mutex_exit(&vsd_lock);
4436 * Destroy a key
4438 * Assumes that the caller is preventing vsd_set and vsd_get
4439 * Locks out vsd_create, vsd_destroy, and vsd_free
4440 * May free memory with lock held
4442 void
4443 vsd_destroy(uint_t *keyp)
4445 uint_t key;
4446 struct vsd_node *vsd;
4449 * protect the key namespace and our destructor lists
4451 mutex_enter(&vsd_lock);
4452 key = *keyp;
4453 *keyp = 0;
4455 ASSERT(key <= vsd_nkeys);
4458 * if the key is valid
4460 if (key != 0) {
4461 uint_t k = key - 1;
4463 * for every vnode with VSD, call key's destructor
4465 for (vsd = list_head(vsd_list); vsd != NULL;
4466 vsd = list_next(vsd_list, vsd)) {
4468 * no VSD for key in this vnode
4470 if (key > vsd->vs_nkeys)
4471 continue;
4473 * call destructor for key
4475 if (vsd->vs_value[k] && vsd_destructor[k])
4476 (*vsd_destructor[k])(vsd->vs_value[k]);
4478 * reset value for key
4480 vsd->vs_value[k] = NULL;
4483 * actually free the key (NULL destructor == unused)
4485 vsd_destructor[k] = NULL;
4488 mutex_exit(&vsd_lock);
4492 * Quickly return the per vnode value that was stored with the specified key
4493 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4494 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4496 void *
4497 vsd_get(vnode_t *vp, uint_t key)
4499 struct vsd_node *vsd;
4501 ASSERT(vp != NULL);
4502 ASSERT(mutex_owned(&vp->v_vsd_lock));
4504 vsd = vp->v_vsd;
4506 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4507 return (vsd->vs_value[key - 1]);
4508 return (NULL);
4512 * Set a per vnode value indexed with the specified key
4513 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4516 vsd_set(vnode_t *vp, uint_t key, void *value)
4518 struct vsd_node *vsd;
4520 ASSERT(vp != NULL);
4521 ASSERT(mutex_owned(&vp->v_vsd_lock));
4523 if (key == 0)
4524 return (EINVAL);
4526 vsd = vp->v_vsd;
4527 if (vsd == NULL)
4528 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4531 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4532 * code won't execute and we will continue down and allocate space for
4533 * the vs_value array.
4534 * If the caller is replacing one value with another, then it is up
4535 * to the caller to free/rele/destroy the previous value (if needed).
4537 if (key <= vsd->vs_nkeys) {
4538 vsd->vs_value[key - 1] = value;
4539 return (0);
4542 ASSERT(key <= vsd_nkeys);
4544 if (vsd->vs_nkeys == 0) {
4545 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4547 * Link onto list of all VSD nodes.
4549 list_insert_head(vsd_list, vsd);
4550 mutex_exit(&vsd_lock);
4554 * Allocate vnode local storage and set the value for key
4556 vsd->vs_value = vsd_realloc(vsd->vs_value,
4557 vsd->vs_nkeys * sizeof (void *),
4558 key * sizeof (void *));
4559 vsd->vs_nkeys = key;
4560 vsd->vs_value[key - 1] = value;
4562 return (0);
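/*
 * Example (illustrative sketch, not part of this file): a module
 * allocates a key once with vsd_create(), then stores and retrieves
 * per-vnode data while holding v_vsd_lock, as vsd_set()/vsd_get()
 * require.  The names are hypothetical; passing a NULL destructor to
 * vsd_create() selects vsd_defaultdestructor.
 */
#if 0	/* usage sketch only */
static uint_t mymod_vsd_key;

static void
mymod_init(void)
{
	vsd_create(&mymod_vsd_key, NULL);
}

static void
mymod_tag_vnode(vnode_t *vp, void *data)
{
	mutex_enter(&vp->v_vsd_lock);
	(void) vsd_set(vp, mymod_vsd_key, data);
	mutex_exit(&vp->v_vsd_lock);
}

static void *
mymod_get_tag(vnode_t *vp)
{
	void *data;

	mutex_enter(&vp->v_vsd_lock);
	data = vsd_get(vp, mymod_vsd_key);
	mutex_exit(&vp->v_vsd_lock);
	return (data);
}
#endif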
4566 * Called from vn_free() to run the destructor function for each vsd
4567 * Locks out vsd_create and vsd_destroy
4568 * Assumes that the destructor *DOES NOT* use vsd
4570 void
4571 vsd_free(vnode_t *vp)
4573 int i;
4574 struct vsd_node *vsd = vp->v_vsd;
4576 if (vsd == NULL)
4577 return;
4579 if (vsd->vs_nkeys == 0) {
4580 kmem_free(vsd, sizeof (*vsd));
4581 vp->v_vsd = NULL;
4582 return;
4586 * lock out vsd_create and vsd_destroy, call
4587 * the destructor, and mark the value as destroyed.
4589 mutex_enter(&vsd_lock);
4591 for (i = 0; i < vsd->vs_nkeys; i++) {
4592 if (vsd->vs_value[i] && vsd_destructor[i])
4593 (*vsd_destructor[i])(vsd->vs_value[i]);
4594 vsd->vs_value[i] = NULL;
4598 * remove from linked list of VSD nodes
4600 list_remove(vsd_list, vsd);
4602 mutex_exit(&vsd_lock);
4605 * free up the VSD
4607 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4608 kmem_free(vsd, sizeof (struct vsd_node));
4609 vp->v_vsd = NULL;
4613 * realloc
4615 static void *
4616 vsd_realloc(void *old, size_t osize, size_t nsize)
4618 void *new;
4620 new = kmem_zalloc(nsize, KM_SLEEP);
4621 if (old) {
4622 bcopy(old, new, osize);
4623 kmem_free(old, osize);
4625 return (new);
4629 * Set up the extensible system attribute for creating a reparse point.
4630 * The symlink data 'target' is validated for proper format of a reparse
4631 * string and a check is also made to make sure the symlink data does not
4632 * point to an existing file.
4634 * Return 0 if ok, else -1.
4636 static int
4637 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4639 xoptattr_t *xoap;
4641 if ((!target) || (!vap) || (!xvattr))
4642 return (-1);
4644 /* validate reparse string */
4645 if (reparse_validate((const char *)target))
4646 return (-1);
4648 xva_init(xvattr);
4649 xvattr->xva_vattr = *vap;
4650 xvattr->xva_vattr.va_mask |= AT_XVATTR;
4651 xoap = xva_getxoptattr(xvattr);
4652 ASSERT(xoap);
4653 XVA_SET_REQ(xvattr, XAT_REPARSE);
4654 xoap->xoa_reparse = 1;
4656 return (0);
4660 * Function to check whether a symlink is a reparse point.
4661 * Return B_TRUE if it is a reparse point, else return B_FALSE.
4663 boolean_t
4664 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4666 xvattr_t xvattr;
4667 xoptattr_t *xoap;
4669 if ((vp->v_type != VLNK) ||
4670 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4671 return (B_FALSE);
4673 xva_init(&xvattr);
4674 xoap = xva_getxoptattr(&xvattr);
4675 ASSERT(xoap);
4676 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4678 if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
4679 return (B_FALSE);
4681 if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4682 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4683 return (B_FALSE);
4685 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
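/*
 * Example (illustrative sketch, not part of this file): code that
 * resolves symlinks can use vn_is_reparse() to divert reparse points to
 * a reparse service instead of following the link text literally.
 */
#if 0	/* usage sketch only */
	if (vp->v_type == VLNK && vn_is_reparse(vp, cr, ct)) {
		/* hand the vnode off to reparse point processing */
	}
#endif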