drop net-snmp dep
[unleashed.git] / kernel / fs / vnode.c
blob97ba98e76c810e6d38324f233063c663c90ad11c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * University Copyright- Copyright (c) 1982, 1986, 1988
34 * The Regents of the University of California
35 * All Rights Reserved
37 * University Acknowledgment- Portions of this document are derived from
38 * software developed by the University of California, Berkeley, and its
39 * contributors.
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/t_lock.h>
45 #include <sys/errno.h>
46 #include <sys/cred.h>
47 #include <sys/user.h>
48 #include <sys/uio.h>
49 #include <sys/file.h>
50 #include <sys/pathname.h>
51 #include <sys/atomic.h>
52 #include <sys/vfs.h>
53 #include <sys/vnode.h>
54 #include <sys/vnode_dispatch.h>
55 #include <sys/rwstlock.h>
56 #include <sys/fem.h>
57 #include <sys/stat.h>
58 #include <sys/mode.h>
59 #include <sys/conf.h>
60 #include <sys/sysmacros.h>
61 #include <sys/cmn_err.h>
62 #include <sys/systm.h>
63 #include <sys/kmem.h>
64 #include <sys/debug.h>
65 #include <sys/acl.h>
66 #include <sys/nbmlock.h>
67 #include <sys/fcntl.h>
68 #include <sys/fs_subr.h>
69 #include <sys/taskq.h>
70 #include <sys/fs_reparse.h>
71 #include <sys/time.h>
72 #include <sys/sdt.h>
/*
 * True if this vnode is an ordinary file on a read-only filesystem,
 * i.e. not a device/fifo special node and vn_is_readonly() holds.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
79 /* Tunable via /etc/system; used only by admin/install */
80 int nfs_global_client_only;
83 * Array of vopstats_t for per-FS-type vopstats. This array has the same
84 * number of entries as and parallel to the vfssw table. (Arguably, it could
85 * be part of the vfssw table.) Once it's initialized, it's accessed using
86 * the same fstype index that is used to index into the vfssw table.
88 vopstats_t **vopstats_fstype;
90 /* vopstats initialization template used for fast initialization via bcopy() */
91 static vopstats_t *vs_templatep;
93 /* Kmem cache handle for vsk_anchor_t allocations */
94 kmem_cache_t *vsk_anchor_cache;
96 /* file events cleanup routine */
97 extern void free_fopdata(vnode_t *);
100 * Root of AVL tree for the kstats associated with vopstats. Lock protects
101 * updates to vsktat_tree.
103 avl_tree_t vskstat_tree;
104 kmutex_t vskstat_tree_lock;
106 /* Global variable which enables/disables the vopstats collection */
107 int vopstats_enabled = 1;
109 /* Global used for empty/invalid v_path */
110 char *vn_vpath_empty = "";
113 * forward declarations for internal vnode specific data (vsd)
115 static void *vsd_realloc(void *, size_t, size_t);
118 * forward declarations for reparse point functions
120 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
123 * VSD -- VNODE SPECIFIC DATA
124 * The v_data pointer is typically used by a file system to store a
125 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
126 * However, there are times when additional project private data needs
127 * to be stored separately from the data (node) pointed to by v_data.
128 * This additional data could be stored by the file system itself or
129 * by a completely different kernel entity. VSD provides a way for
130 * callers to obtain a key and store a pointer to private data associated
131 * with a vnode.
133 * Callers are responsible for protecting the vsd by holding v_vsd_lock
134 * for calls to vsd_set() and vsd_get().
138 * vsd_lock protects:
139 * vsd_nkeys - creation and deletion of vsd keys
140 * vsd_list - insertion and deletion of vsd_node in the vsd_list
141 * vsd_destructor - adding and removing destructors to the list
143 static kmutex_t vsd_lock;
144 static uint_t vsd_nkeys; /* size of destructor array */
145 /* list of vsd_node's */
146 static list_t *vsd_list = NULL;
147 /* per-key destructor funcs */
148 static void (**vsd_destructor)(void *);
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 */
#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}
/* As VOPSTATS_UPDATE(), but also accumulates the bytes transferred. */
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
/*
 * If the filesystem does not support XIDs map credential.
 * If the vfsp is NULL, perhaps we should also map?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
}
211 * Convert stat(2) formats to vnode types and vice versa. (Knows about
212 * numerical order of S_IFMT and vnode types.)
214 enum vtype iftovt_tab[] = {
215 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
216 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
219 ushort_t vttoif_tab[] = {
220 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
221 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
225 * The system vnode cache.
228 kmem_cache_t *vn_cache;
231 /* Extensible attribute (xva) routines. */
234 * Zero out the structure, set the size of the requested/returned bitmaps,
235 * set VATTR_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
236 * to the returned attributes array.
238 void
239 xva_init(xvattr_t *xvap)
241 bzero(xvap, sizeof (xvattr_t));
242 xvap->xva_mapsize = XVA_MAPSIZE;
243 xvap->xva_magic = XVA_MAGIC;
244 xvap->xva_vattr.va_mask = VATTR_XVATTR;
245 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
249 * If VATTR_XVATTR is set, returns a pointer to the embedded xoptattr_t
250 * structure. Otherwise, returns NULL.
252 xoptattr_t *
253 xva_getxoptattr(xvattr_t *xvap)
255 xoptattr_t *xoap = NULL;
256 if (xvap->xva_vattr.va_mask & VATTR_XVATTR)
257 xoap = &xvap->xva_xoptattrs;
258 return (xoap);
262 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
263 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
264 * kstat name.
266 static int
267 vska_compar(const void *n1, const void *n2)
269 int ret;
270 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
271 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
273 if (p1 < p2) {
274 ret = -1;
275 } else if (p1 > p2) {
276 ret = 1;
277 } else {
278 ret = 0;
281 return (ret);
285 * Used to create a single template which will be bcopy()ed to a newly
286 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
288 static vopstats_t *
289 create_vopstats_template()
291 vopstats_t *vsp;
293 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
294 bzero(vsp, sizeof (*vsp)); /* Start fresh */
296 /* fop_open */
297 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
298 /* fop_close */
299 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
300 /* fop_read I/O */
301 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
302 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
303 /* fop_write I/O */
304 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
305 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
306 /* fop_ioctl */
307 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
308 /* fop_setfl */
309 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
310 /* fop_getattr */
311 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
312 /* fop_setattr */
313 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
314 /* fop_access */
315 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
316 /* fop_lookup */
317 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
318 /* fop_create */
319 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
320 /* fop_remove */
321 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
322 /* fop_link */
323 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
324 /* fop_rename */
325 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
326 /* fop_mkdir */
327 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
328 /* fop_rmdir */
329 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
330 /* fop_readdir I/O */
331 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
332 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
333 KSTAT_DATA_UINT64);
334 /* fop_symlink */
335 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
336 /* fop_readlink */
337 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
338 /* fop_fsync */
339 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
340 /* fop_inactive */
341 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
342 /* fop_fid */
343 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
344 /* fop_rwlock */
345 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
346 /* fop_rwunlock */
347 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
348 /* fop_seek */
349 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
350 /* fop_cmp */
351 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
352 /* fop_frlock */
353 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
354 /* fop_space */
355 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
356 /* fop_realvp */
357 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
358 /* fop_getpage */
359 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
360 /* fop_putpage */
361 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
362 /* fop_map */
363 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
364 /* fop_addmap */
365 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
366 /* fop_delmap */
367 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
368 /* fop_poll */
369 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
370 /* fop_dump */
371 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
372 /* fop_pathconf */
373 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
374 /* fop_pageio */
375 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
376 /* fop_dumpctl */
377 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
378 /* fop_dispose */
379 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
380 /* fop_setsecattr */
381 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
382 /* fop_getsecattr */
383 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
384 /* fop_shrlock */
385 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
386 /* fop_vnevent */
387 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
388 /* fop_reqzcbuf */
389 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
390 /* fop_retzcbuf */
391 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
393 return (vsp);
397 * Creates a kstat structure associated with a vopstats structure.
399 kstat_t *
400 new_vskstat(char *ksname, vopstats_t *vsp)
402 kstat_t *ksp;
404 if (!vopstats_enabled) {
405 return (NULL);
408 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
409 sizeof (vopstats_t)/sizeof (kstat_named_t),
410 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
411 if (ksp) {
412 ksp->ks_data = vsp;
413 kstat_install(ksp);
416 return (ksp);
420 * Called from vfsinit() to initialize the support mechanisms for vopstats
422 void
423 vopstats_startup()
425 if (!vopstats_enabled)
426 return;
429 * Creates the AVL tree which holds per-vfs vopstat anchors. This
430 * is necessary since we need to check if a kstat exists before we
431 * attempt to create it. Also, initialize its lock.
433 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
434 offsetof(vsk_anchor_t, vsk_node));
435 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
437 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
438 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
439 NULL, NULL, 0);
442 * Set up the array of pointers for the vopstats-by-FS-type.
443 * The entries will be allocated/initialized as each file system
444 * goes through modload/mod_installfs.
446 vopstats_fstype = (vopstats_t **)kmem_zalloc(
447 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
449 /* Set up the global vopstats initialization template */
450 vs_templatep = create_vopstats_template();
454 * We need to have the all of the counters zeroed.
455 * The initialization of the vopstats_t includes on the order of
456 * 50 calls to kstat_named_init(). Rather that do that on every call,
457 * we do it once in a template (vs_templatep) then bcopy it over.
459 void
460 initialize_vopstats(vopstats_t *vsp)
462 if (vsp == NULL)
463 return;
465 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
469 * If possible, determine which vopstats by fstype to use and
470 * return a pointer to the caller.
472 vopstats_t *
473 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
475 int fstype = 0; /* Index into vfssw[] */
476 vopstats_t *vsp = NULL;
478 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
479 !vopstats_enabled)
480 return (NULL);
482 * Set up the fstype. We go to so much trouble because all versions
483 * of NFS use the same fstype in their vfs even though they have
484 * distinct entries in the vfssw[] table.
485 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
487 if (vswp) {
488 fstype = vswp - vfssw; /* Gets us the index */
489 } else {
490 fstype = vfsp->vfs_fstype;
494 * Point to the per-fstype vopstats. The only valid values are
495 * non-zero positive values less than the number of vfssw[] table
496 * entries.
498 if (fstype > 0 && fstype < nfstype) {
499 vsp = vopstats_fstype[fstype];
502 return (vsp);
506 * Generate a kstat name, create the kstat structure, and allocate a
507 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
508 * to the caller. This must only be called from a mount.
510 vsk_anchor_t *
511 get_vskstat_anchor(vfs_t *vfsp)
513 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
514 statvfs64_t statvfsbuf; /* Needed to find f_fsid */
515 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
516 kstat_t *ksp; /* Ptr to new kstat */
517 avl_index_t where; /* Location in the AVL tree */
519 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
520 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
521 return (NULL);
523 /* Need to get the fsid to build a kstat name */
524 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
525 /* Create a name for our kstats based on fsid */
526 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
527 VOPSTATS_STR, statvfsbuf.f_fsid);
529 /* Allocate and initialize the vsk_anchor_t */
530 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
531 bzero(vskp, sizeof (*vskp));
532 vskp->vsk_fsid = statvfsbuf.f_fsid;
534 mutex_enter(&vskstat_tree_lock);
535 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
536 avl_insert(&vskstat_tree, vskp, where);
537 mutex_exit(&vskstat_tree_lock);
540 * Now that we've got the anchor in the AVL
541 * tree, we can create the kstat.
543 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
544 if (ksp) {
545 vskp->vsk_ksp = ksp;
547 } else {
548 /* Oops, found one! Release memory and lock. */
549 mutex_exit(&vskstat_tree_lock);
550 kmem_cache_free(vsk_anchor_cache, vskp);
551 vskp = NULL;
554 return (vskp);
558 * We're in the process of tearing down the vfs and need to cleanup
559 * the data structures associated with the vopstats. Must only be called
560 * from dounmount().
562 void
563 teardown_vopstats(vfs_t *vfsp)
565 vsk_anchor_t *vskap;
566 avl_index_t where;
568 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
569 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
570 return;
572 /* This is a safe check since VFS_STATS must be set (see above) */
573 if ((vskap = vfsp->vfs_vskap) == NULL)
574 return;
576 /* Whack the pointer right away */
577 vfsp->vfs_vskap = NULL;
579 /* Lock the tree, remove the node, and delete the kstat */
580 mutex_enter(&vskstat_tree_lock);
581 if (avl_find(&vskstat_tree, vskap, &where)) {
582 avl_remove(&vskstat_tree, vskap);
585 if (vskap->vsk_ksp) {
586 kstat_delete(vskap->vsk_ksp);
588 mutex_exit(&vskstat_tree_lock);
590 kmem_cache_free(vsk_anchor_cache, vskap);
594 * Read or write a vnode. Called from kernel code.
597 vn_rdwr(
598 enum uio_rw rw,
599 struct vnode *vp,
600 caddr_t base,
601 ssize_t len,
602 offset_t offset,
603 enum uio_seg seg,
604 int ioflag,
605 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */
606 cred_t *cr,
607 ssize_t *residp)
609 struct uio uio;
610 struct iovec iov;
611 int error;
612 int in_crit = 0;
614 if (rw == UIO_WRITE && ISROFILE(vp))
615 return (EROFS);
617 if (len < 0)
618 return (EIO);
620 VOPXID_MAP_CR(vp, cr);
622 iov.iov_base = base;
623 iov.iov_len = len;
624 uio.uio_iov = &iov;
625 uio.uio_iovcnt = 1;
626 uio.uio_loffset = offset;
627 uio.uio_segflg = (short)seg;
628 uio.uio_resid = len;
629 uio.uio_llimit = ulimit;
632 * We have to enter the critical region before calling fop_rwlock
633 * to avoid a deadlock with ufs.
635 if (nbl_need_check(vp)) {
636 int svmand;
638 nbl_start_crit(vp, RW_READER);
639 in_crit = 1;
640 error = nbl_svmand(vp, cr, &svmand);
641 if (error != 0)
642 goto done;
643 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
644 uio.uio_offset, uio.uio_resid, svmand, NULL)) {
645 error = EACCES;
646 goto done;
650 (void) fop_rwlock(vp,
651 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
652 if (rw == UIO_WRITE) {
653 uio.uio_fmode = FWRITE;
654 uio.uio_extflg = UIO_COPY_DEFAULT;
655 error = fop_write(vp, &uio, ioflag, cr, NULL);
656 } else {
657 uio.uio_fmode = FREAD;
658 uio.uio_extflg = UIO_COPY_CACHED;
659 error = fop_read(vp, &uio, ioflag, cr, NULL);
661 fop_rwunlock(vp,
662 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
663 if (residp)
664 *residp = uio.uio_resid;
665 else if (uio.uio_resid)
666 error = EIO;
668 done:
669 if (in_crit)
670 nbl_end_crit(vp);
671 return (error);
675 * Release a vnode. Call fop_inactive on last reference or
676 * decrement reference count.
678 * To avoid race conditions, the v_count is left at 1 for
679 * the call to fop_inactive. This prevents another thread
680 * from reclaiming and releasing the vnode *before* the
681 * fop_inactive routine has a chance to destroy the vnode.
682 * We can't have more than 1 thread calling fop_inactive
683 * on a vnode.
685 void
686 vn_rele(vnode_t *vp)
688 VERIFY(vp->v_count > 0);
689 mutex_enter(&vp->v_lock);
690 if (vp->v_count == 1) {
691 mutex_exit(&vp->v_lock);
692 fop_inactive(vp, CRED(), NULL);
693 return;
695 VN_RELE_LOCKED(vp);
696 mutex_exit(&vp->v_lock);
700 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
701 * as a single reference, so v_count is not decremented until the last DNLC hold
702 * is released. This makes it possible to distinguish vnodes that are referenced
703 * only by the DNLC.
705 void
706 vn_rele_dnlc(vnode_t *vp)
708 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
709 mutex_enter(&vp->v_lock);
710 if (--vp->v_count_dnlc == 0) {
711 if (vp->v_count == 1) {
712 mutex_exit(&vp->v_lock);
713 fop_inactive(vp, CRED(), NULL);
714 return;
716 VN_RELE_LOCKED(vp);
718 mutex_exit(&vp->v_lock);
722 * Like vn_rele() except that it clears v_stream under v_lock.
723 * This is used by sockfs when it dismantles the association between
724 * the sockfs node and the vnode in the underlying file system.
725 * v_lock has to be held to prevent a thread coming through the lookupname
726 * path from accessing a stream head that is going away.
728 void
729 vn_rele_stream(vnode_t *vp)
731 VERIFY(vp->v_count > 0);
732 mutex_enter(&vp->v_lock);
733 vp->v_stream = NULL;
734 if (vp->v_count == 1) {
735 mutex_exit(&vp->v_lock);
736 fop_inactive(vp, CRED(), NULL);
737 return;
739 VN_RELE_LOCKED(vp);
740 mutex_exit(&vp->v_lock);
743 static void
744 vn_rele_inactive(vnode_t *vp)
746 fop_inactive(vp, CRED(), NULL);
750 * Like vn_rele() except if we are going to call fop_inactive() then do it
751 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
752 * the file system as a result of releasing the vnode. Note, file systems
753 * already have to handle the race where the vnode is incremented before the
754 * inactive routine is called and does its locking.
756 * Warning: Excessive use of this routine can lead to performance problems.
757 * This is because taskqs throttle back allocation if too many are created.
759 void
760 vn_rele_async(vnode_t *vp, taskq_t *taskq)
762 VERIFY(vp->v_count > 0);
763 mutex_enter(&vp->v_lock);
764 if (vp->v_count == 1) {
765 mutex_exit(&vp->v_lock);
766 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
767 vp, TQ_SLEEP) != (uintptr_t)NULL);
768 return;
770 VN_RELE_LOCKED(vp);
771 mutex_exit(&vp->v_lock);
775 vn_open(
776 char *pnamep,
777 enum uio_seg seg,
778 int filemode,
779 int createmode,
780 struct vnode **vpp,
781 enum create crwhy,
782 mode_t umask)
784 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
785 umask, NULL, -1));
790 * Open/create a vnode.
791 * This may be callable by the kernel, the only known use
792 * of user context being that the current user credentials
793 * are used for permissions. crwhy is defined iff filemode & FCREAT.
796 vn_openat(
797 char *pnamep,
798 enum uio_seg seg,
799 int filemode,
800 int createmode,
801 struct vnode **vpp,
802 enum create crwhy,
803 mode_t umask,
804 struct vnode *startvp,
805 int fd)
807 struct vnode *vp;
808 int mode;
809 int accessflags;
810 int error;
811 int in_crit = 0;
812 int open_done = 0;
813 int shrlock_done = 0;
814 struct vattr vattr;
815 enum symfollow follow;
816 int estale_retry = 0;
817 struct shrlock shr;
818 struct shr_locowner shr_own;
820 if (filemode & FSEARCH)
821 filemode |= FDIRECTORY;
823 mode = 0;
824 accessflags = 0;
825 if (filemode & FREAD)
826 mode |= VREAD;
827 if (filemode & (FWRITE|FTRUNC))
828 mode |= VWRITE;
829 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
830 mode |= VEXEC;
832 /* symlink interpretation */
833 if (filemode & FNOFOLLOW)
834 follow = NO_FOLLOW;
835 else
836 follow = FOLLOW;
838 if (filemode & FAPPEND)
839 accessflags |= V_APPEND;
841 top:
842 if (filemode & FCREAT && !(filemode & FDIRECTORY)) {
843 enum vcexcl excl;
845 /* Wish to create a file. */
846 vattr.va_type = VREG;
847 vattr.va_mode = createmode;
848 vattr.va_mask = VATTR_TYPE|VATTR_MODE;
849 if (filemode & FTRUNC) {
850 vattr.va_size = 0;
851 vattr.va_mask |= VATTR_SIZE;
853 if (filemode & FEXCL)
854 excl = EXCL;
855 else
856 excl = NONEXCL;
858 if (error =
859 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
860 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
861 return (error);
862 } else {
863 /* Wish to open a file. Just look it up. */
864 if (error = lookupnameat(pnamep, seg, follow,
865 NULLVPP, &vp, startvp)) {
866 if ((error == ESTALE) &&
867 fs_need_estale_retry(estale_retry++))
868 goto top;
869 return (error);
873 * Get the attributes to check whether file is large.
874 * We do this only if the FOFFMAX flag is not set and
875 * only for regular files.
878 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
879 vattr.va_mask = VATTR_SIZE;
880 if ((error = fop_getattr(vp, &vattr, 0,
881 CRED(), NULL))) {
882 goto out;
884 if (vattr.va_size > (uoff_t)MAXOFF32_T) {
886 * Large File API - regular open fails
887 * if FOFFMAX flag is set in file mode
889 error = EOVERFLOW;
890 goto out;
894 * Can't write directories, active texts, or
895 * read-only filesystems. Can't truncate files
896 * on which mandatory locking is in effect.
898 if (filemode & (FWRITE|FTRUNC)) {
900 * Allow writable directory if VDIROPEN flag is set.
902 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
903 error = EISDIR;
904 goto out;
906 if (ISROFILE(vp)) {
907 error = EROFS;
908 goto out;
911 * Can't truncate files on which
912 * sysv mandatory locking is in effect.
914 if (filemode & FTRUNC) {
915 vnode_t *rvp;
917 if (fop_realvp(vp, &rvp, NULL) != 0)
918 rvp = vp;
919 if (rvp->v_filocks != NULL) {
920 vattr.va_mask = VATTR_MODE;
921 if ((error = fop_getattr(vp,
922 &vattr, 0, CRED(), NULL)) == 0 &&
923 MANDLOCK(vp, vattr.va_mode))
924 error = EAGAIN;
927 if (error)
928 goto out;
931 * Check permissions.
933 if (error = fop_access(vp, mode, accessflags, CRED(), NULL))
934 goto out;
936 * Require FDIRECTORY to return a directory.
937 * Require FEXEC to return a regular file.
939 if ((filemode & FDIRECTORY) && vp->v_type != VDIR) {
940 error = ENOTDIR;
941 goto out;
943 if ((filemode & FEXEC) && vp->v_type != VREG) {
944 error = ENOEXEC; /* XXX: error code? */
945 goto out;
950 * Do remaining checks for FNOFOLLOW and FNOLINKS.
952 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
953 error = ELOOP;
954 goto out;
956 if (filemode & FNOLINKS) {
957 vattr.va_mask = VATTR_NLINK;
958 if ((error = fop_getattr(vp, &vattr, 0, CRED(), NULL))) {
959 goto out;
961 if (vattr.va_nlink != 1) {
962 error = EMLINK;
963 goto out;
968 * Opening a socket corresponding to the AF_UNIX pathname
969 * in the filesystem name space is not supported.
970 * However, VSOCK nodes in namefs are supported in order
971 * to make fattach work for sockets.
973 * XXX This uses fop_realvp to distinguish between
974 * an unopened namefs node (where fop_realvp returns a
975 * different VSOCK vnode) and a VSOCK created by vn_create
976 * in some file system (where fop_realvp would never return
977 * a different vnode).
979 if (vp->v_type == VSOCK) {
980 struct vnode *nvp;
982 error = fop_realvp(vp, &nvp, NULL);
983 if (error != 0 || nvp == NULL || nvp == vp ||
984 nvp->v_type != VSOCK) {
985 error = EOPNOTSUPP;
986 goto out;
990 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
991 /* get share reservation */
992 shr.s_access = 0;
993 if (filemode & FWRITE)
994 shr.s_access |= F_WRACC;
995 if (filemode & FREAD)
996 shr.s_access |= F_RDACC;
997 shr.s_deny = 0;
998 shr.s_sysid = 0;
999 shr.s_pid = ttoproc(curthread)->p_pid;
1000 shr_own.sl_pid = shr.s_pid;
1001 shr_own.sl_id = fd;
1002 shr.s_own_len = sizeof (shr_own);
1003 shr.s_owner = (caddr_t)&shr_own;
1004 error = fop_shrlock(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1005 NULL);
1006 if (error)
1007 goto out;
1008 shrlock_done = 1;
1010 /* nbmand conflict check if truncating file */
1011 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1012 nbl_start_crit(vp, RW_READER);
1013 in_crit = 1;
1015 vattr.va_mask = VATTR_SIZE;
1016 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
1017 goto out;
1018 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1019 NULL)) {
1020 error = EACCES;
1021 goto out;
1027 * Do opening protocol.
1029 error = fop_open(&vp, filemode, CRED(), NULL);
1030 if (error)
1031 goto out;
1032 open_done = 1;
1035 * Truncate if required.
1037 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1038 vattr.va_size = 0;
1039 vattr.va_mask = VATTR_SIZE;
1040 if ((error = fop_setattr(vp, &vattr, 0, CRED(), NULL)) != 0)
1041 goto out;
1043 out:
1044 ASSERT(vp->v_count > 0);
1046 if (in_crit) {
1047 nbl_end_crit(vp);
1048 in_crit = 0;
1050 if (error) {
1051 if (open_done) {
1052 (void) fop_close(vp, filemode, 1, 0, CRED(),
1053 NULL);
1054 open_done = 0;
1055 shrlock_done = 0;
1057 if (shrlock_done) {
1058 (void) fop_shrlock(vp, F_UNSHARE, &shr, 0, CRED(),
1059 NULL);
1060 shrlock_done = 0;
1064 * The following clause was added to handle a problem
1065 * with NFS consistency. It is possible that a lookup
1066 * of the file to be opened succeeded, but the file
1067 * itself doesn't actually exist on the server. This
1068 * is chiefly due to the DNLC containing an entry for
1069 * the file which has been removed on the server. In
1070 * this case, we just start over. If there was some
1071 * other cause for the ESTALE error, then the lookup
1072 * of the file will fail and the error will be returned
1073 * above instead of looping around from here.
1075 VN_RELE(vp);
1076 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1077 goto top;
1078 } else
1079 *vpp = vp;
1080 return (error);
1084 * The following two accessor functions are for the NFSv4 server. Since there
1085 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
1086 * vnode open counts correct when a client "upgrades" an open or does an
1087 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1088 * open mode (add or subtract read or write), but also change the share/deny
1089 * modes. However, share reservations are not integrated with OPEN, yet, so
1090 * we need to handle each separately. These functions are cleaner than having
1091 * the NFS server manipulate the counts directly, however, nobody else should
1092 * use these functions.
1094 void
1095 vn_open_upgrade(
1096 vnode_t *vp,
1097 int filemode)
1099 ASSERT(vp->v_type == VREG);
1101 if (filemode & FREAD)
1102 atomic_inc_32(&vp->v_rdcnt);
1103 if (filemode & FWRITE)
1104 atomic_inc_32(&vp->v_wrcnt);
1108 void
1109 vn_open_downgrade(
1110 vnode_t *vp,
1111 int filemode)
1113 ASSERT(vp->v_type == VREG);
1115 if (filemode & FREAD) {
1116 ASSERT(vp->v_rdcnt > 0);
1117 atomic_dec_32(&vp->v_rdcnt);
1119 if (filemode & FWRITE) {
1120 ASSERT(vp->v_wrcnt > 0);
1121 atomic_dec_32(&vp->v_wrcnt);
/*
 * Create a vnode; thin wrapper around vn_createat() with no start
 * vnode (so relative paths resolve from the current directory).
 */
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}
/*
 * Create a vnode (makenode).
 *
 * Looks up the parent directory of pnamep (relative to startvp for
 * non-absolute paths), applies the umask unless the directory carries
 * default ACLs, performs read-only/mandatory-lock/large-file checks,
 * and finally calls fop_mkdir() or fop_create() as directed by "why".
 * On success *vpp holds a reference to the resulting vnode.  The whole
 * operation is retried on ESTALE, bounded by fs_need_estale_retry().
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;

	ASSERT((vap->va_mask & (VATTR_TYPE|VATTR_MODE)) == (VATTR_TYPE|VATTR_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	/* only mknod(2) may create an object with the sticky bit set */
	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {

		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = fop_getsecattr(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * fop_getsecattr() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *	it is being open with write access &&
	 *	the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (fop_realvp(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & VATTR_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = VATTR_MODE|VATTR_SIZE;
			if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & VATTR_SIZE) && in_crit) {
				uoff_t offset;
				ssize_t length;

				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply fop_create to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The call to fop_create() is necessary to ensure
		 * that the appropriate permission checks are made,
		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
		 * exists since we are in the else condition where this
		 * was checked.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = fop_create(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created a
			 * new reference on a new vnode (*vpp) in the child
			 * file system, so we want to drop our reference on
			 * the old (vp) upon exit.
			 */
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = VATTR_SIZE;
			if ((error = fop_getattr(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (uoff_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to fop_mkdir().  fop_create()
			 * will already get it via "flag"
			 */
			error = fop_mkdir(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = fop_create(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Hard-link "from" to the new name "to"; wrapper around vn_linkat()
 * with no start vnodes and symlinks in "from" not followed.
 */
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}
/*
 * Create a hard link: look up the source vnode and the target's parent
 * directory, verify both live in the same file system (by fsid, so
 * loopback mounts work) and that the target fs is writable, then call
 * fop_link().  Retries on ESTALE, bounded by fs_need_estale_retry().
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = VATTR_FSID;
	if (error = fop_getattr(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = VATTR_FSID;
	if (error = fop_getattr(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = fop_link(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Rename "from" to "to"; wrapper around vn_renameat() with no start
 * vnodes (paths resolve from the current directory).
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}
/*
 * Rename fname to tname, resolving each relative to its start vnode.
 * Verifies both parents share an fsid, the target fs is writable, the
 * source is not a mount-point directory, and that no non-blocking
 * mandatory (nbmand) locks conflict, before calling fop_rename().
 * Retries on ESTALE, bounded by fs_need_estale_retry().
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = VATTR_FSID;
		if (error = fop_getattr(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = VATTR_FSID;
		if (error = fop_getattr(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/* an existing rename target may not be busy with nbmand activity */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = fop_rename(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Remove a file or directory; wrapper around vn_removeat() with no
 * start vnode.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}
/*
 * Remove the file or directory named by fnamep (relative to startvp
 * for non-absolute paths).  Handles the namefs special case where the
 * target is an unlinkable mounted root (unmount first, then remove the
 * covered vnode), checks for a writable parent fs and nbmand lock
 * conflicts, then dispatches to fop_rmdir() or fop_remove() based on
 * dirflag.  Retries on ESTALE, bounded by fs_need_estale_retry().
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail to operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * noone has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling fop_remove.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = fop_rmdir(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = fop_remove(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1818 * Utility function to compare equality of vnodes.
1819 * Compare the underlying real vnodes, if there are underlying vnodes.
1820 * This is a more thorough comparison than the VN_CMP() macro provides.
1823 vn_compare(vnode_t *vp1, vnode_t *vp2)
1825 vnode_t *realvp;
1827 if (vp1 != NULL && fop_realvp(vp1, &realvp, NULL) == 0)
1828 vp1 = realvp;
1829 if (vp2 != NULL && fop_realvp(vp2, &realvp, NULL) == 0)
1830 vp2 = realvp;
1831 return (VN_CMP(vp1, vp2));
/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 * NOTE(review): 1023 = 3 * 11 * 31, which is not prime; the "prime"
 * suggestion above is unmet -- confirm whether that matters here.
 */
#define	NUM_BUCKETS	1023

/* One hash bucket, padded out to 64 bytes (see the pad computation). */
struct vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1 .
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct	vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
/*
 * vn_vfslocks_getlock() uses an HASH scheme to generate
 * rwstlock using vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * HASH table which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed at a later
 * stage when the refcount drops to zero.
 */

vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	/* fast path: an entry for this vfs/vnode already exists */
	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	mutex_exit(&bp->vb_lock);

	/*
	 * Allocate with the bucket lock dropped (kmem_alloc may sleep),
	 * then re-scan under the lock in case another thread inserted an
	 * entry for the same pointer in the meantime.
	 */
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
/*
 * Drop one reference on a hash entry returned by vn_vfslocks_getlock();
 * when the refcount hits zero the entry is unlinked from its bucket and
 * freed.  Panics on a negative refcount or a missing entry.
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (bp->vb_list == vep)
					bp->vb_list = vep->ve_next;
				else {
					/*
					 * LINTED: pvep is always set here --
					 * this branch is unreachable on the
					 * first iteration.
					 */
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
/*
 * vn_vfswlock_wait is used to implement a lock which is logically a writers
 * lock protecting the v_vfsmountedhere field.
 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
 * except that it blocks to acquire the lock VVFSLOCK.
 *
 * traverse() and routines re-implementing part of traverse (e.g. autofs)
 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
 * need the non-blocking version of the writers lock i.e. vn_vfswlock
 */
int
vn_vfswlock_wait(vnode_t *vp)
{
	int retval;
	vn_vfslocks_entry_t *vpvfsentry;
	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);

	/*
	 * Interrupted by a signal: the lock was not taken, so drop the
	 * hash reference acquired above.  On success the reference is
	 * kept; vn_vfsunlock() releases it.
	 */
	if (retval == EINTR) {
		vn_vfslocks_rele(vpvfsentry);
		return (EINTR);
	}
	return (retval);
}
1979 vn_vfsrlock_wait(vnode_t *vp)
1981 int retval;
1982 vn_vfslocks_entry_t *vpvfsentry;
1983 ASSERT(vp != NULL);
1985 vpvfsentry = vn_vfslocks_getlock(vp);
1986 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
1988 if (retval == EINTR) {
1989 vn_vfslocks_rele(vpvfsentry);
1990 return (EINTR);
1993 return (retval);
1998 * vn_vfswlock is used to implement a lock which is logically a writers lock
1999 * protecting the v_vfsmountedhere field.
2002 vn_vfswlock(vnode_t *vp)
2004 vn_vfslocks_entry_t *vpvfsentry;
2007 * If vp is NULL then somebody is trying to lock the covered vnode
2008 * of /. (vfs_vnodecovered is NULL for /). This situation will
2009 * only happen when unmounting /. Since that operation will fail
2010 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2012 if (vp == NULL)
2013 return (EBUSY);
2015 vpvfsentry = vn_vfslocks_getlock(vp);
2017 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2018 return (0);
2020 vn_vfslocks_rele(vpvfsentry);
2021 return (EBUSY);
2025 vn_vfsrlock(vnode_t *vp)
2027 vn_vfslocks_entry_t *vpvfsentry;
2030 * If vp is NULL then somebody is trying to lock the covered vnode
2031 * of /. (vfs_vnodecovered is NULL for /). This situation will
2032 * only happen when unmounting /. Since that operation will fail
2033 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2035 if (vp == NULL)
2036 return (EBUSY);
2038 vpvfsentry = vn_vfslocks_getlock(vp);
2040 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2041 return (0);
2043 vn_vfslocks_rele(vpvfsentry);
2044 return (EBUSY);
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release reference after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc,.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2066 vn_vfswlock_held(vnode_t *vp)
2068 int held;
2069 vn_vfslocks_entry_t *vpvfsentry;
2071 ASSERT(vp != NULL);
2073 vpvfsentry = vn_vfslocks_getlock(vp);
2074 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2076 vn_vfslocks_rele(vpvfsentry);
2077 return (held);
/*
 * Vnode cache.
 */

/*
 * kmem cache constructor: set up the synchronization primitives and
 * baseline field values for a freshly allocated vnode.
 */
/* ARGSUSED */
static int
vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct vnode *vp;

	vp = buf;

	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
	vp->v_path = vn_vpath_empty;
	vp->v_path_stamp = 0;
	vp->v_mpssdata = NULL;
	vp->v_vsd = NULL;
	vp->v_fopdata = NULL;

	vmobject_init(&vp->v_object, vp);

	return (0);
}
/*
 * kmem cache destructor: tear down everything set up by
 * vn_cache_constructor().
 */
/* ARGSUSED */
static void
vn_cache_destructor(void *buf, void *cdrarg)
{
	struct vnode *vp;

	vp = buf;

	vmobject_fini(&vp->v_object);

	rw_destroy(&vp->v_nbllock);
	cv_destroy(&vp->v_cv);
	mutex_destroy(&vp->v_vsd_lock);
	mutex_destroy(&vp->v_lock);
}
/*
 * Create the global vnode kmem cache ("vn_cache").  The assert checks
 * that the configured alignment covers the rounded-up vnode size.
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
/* Destroy the vnode kmem cache created by vn_create_cache(). */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
/*
 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
 * cached by the file system and vnodes remain associated.
 *
 * Resets the open/mmap counts, frees any FEM head, cached v_path,
 * fop data and vnode-specific data so the vnode can be reused.
 */
void
vn_recycle(vnode_t *vp)
{
	ASSERT(!vn_has_cached_data(vp));
	VERIFY(vp->v_path != NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}
	vp->v_path_stamp = 0;

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
}
/*
 * Used to reset the vnode fields including those that are directly accessible
 * as well as those which require an accessor function.
 *
 * Does not initialize:
 *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
 *	v_data (since FS-nodes and vnodes point to each other and should
 *		be updated simultaneously)
 *	v_op (in case someone needs to make a VOP call on this object)
 */
void
vn_reinit(vnode_t *vp)
{
	vp->v_count = 1;
	vp->v_count_dnlc = 0;
	vp->v_vfsp = NULL;
	vp->v_stream = NULL;
	vp->v_vfsmountedhere = NULL;
	vp->v_flag = 0;
	vp->v_type = VNON;
	vp->v_rdev = NODEV;

	vp->v_filocks = NULL;
	vp->v_shrlocks = NULL;
	VERIFY(!vn_has_cached_data(vp));

	vp->v_locality = NULL;
	vp->v_xattrdir = NULL;

	/*
	 * In a few specific instances, vn_reinit() is used to initialize
	 * locally defined vnode_t instances.  Lacking the construction offered
	 * by vn_alloc(), these vnodes require v_path initialization.
	 */
	if (vp->v_path == NULL) {
		vp->v_path = vn_vpath_empty;
	}

	/* Handles v_femhead, v_path, and the r/w/map counts */
	vn_recycle(vp);
}
2228 vnode_t *
2229 vn_alloc(int kmflag)
2231 vnode_t *vp;
2233 vp = kmem_cache_alloc(vn_cache, kmflag);
2235 if (vp != NULL) {
2236 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2237 vp->v_fopdata = NULL;
2238 vn_reinit(vp);
2241 return (vp);
/*
 * Release a vnode back to the cache, freeing any cached v_path, FEM
 * head, fop data and vnode-specific data first.
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	VERIFY(vp->v_path != NULL);
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
2281 * vnode status changes, should define better states than 1, 0.
2283 void
2284 vn_reclaim(vnode_t *vp)
2286 vfs_t *vfsp = vp->v_vfsp;
2288 if (vfsp == NULL ||
2289 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2290 return;
2292 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2295 void
2296 vn_idle(vnode_t *vp)
2298 vfs_t *vfsp = vp->v_vfsp;
2300 if (vfsp == NULL ||
2301 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2302 return;
2304 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2306 void
2307 vn_exists(vnode_t *vp)
2309 vfs_t *vfsp = vp->v_vfsp;
2311 if (vfsp == NULL ||
2312 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2313 return;
2315 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2318 void
2319 vn_invalid(vnode_t *vp)
2321 vfs_t *vfsp = vp->v_vfsp;
2323 if (vfsp == NULL ||
2324 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2325 return;
2327 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2330 /* Vnode event notification */
2333 vnevent_support(vnode_t *vp, caller_context_t *ct)
2335 if (vp == NULL)
2336 return (EINVAL);
2338 return (fop_vnevent(vp, VE_SUPPORT, NULL, NULL, ct));
2341 void
2342 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2344 if (vp == NULL || vp->v_femhead == NULL) {
2345 return;
2347 (void) fop_vnevent(vp, VE_RENAME_SRC, dvp, name, ct);
2350 void
2351 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2352 caller_context_t *ct)
2354 if (vp == NULL || vp->v_femhead == NULL) {
2355 return;
2357 (void) fop_vnevent(vp, VE_RENAME_DEST, dvp, name, ct);
2360 void
2361 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2363 if (vp == NULL || vp->v_femhead == NULL) {
2364 return;
2366 (void) fop_vnevent(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2369 void
2370 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2372 if (vp == NULL || vp->v_femhead == NULL) {
2373 return;
2375 (void) fop_vnevent(vp, VE_REMOVE, dvp, name, ct);
2378 void
2379 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2381 if (vp == NULL || vp->v_femhead == NULL) {
2382 return;
2384 (void) fop_vnevent(vp, VE_RMDIR, dvp, name, ct);
2387 void
2388 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2389 caller_context_t *ct)
2391 if (vp == NULL || vp->v_femhead == NULL) {
2392 return;
2394 (void) fop_vnevent(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2397 void
2398 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2399 caller_context_t *ct)
2401 if (vp == NULL || vp->v_femhead == NULL) {
2402 return;
2404 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2407 void
2408 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2409 caller_context_t *ct)
2411 if (vp == NULL || vp->v_femhead == NULL) {
2412 return;
2414 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2417 void
2418 vnevent_create(vnode_t *vp, caller_context_t *ct)
2420 if (vp == NULL || vp->v_femhead == NULL) {
2421 return;
2423 (void) fop_vnevent(vp, VE_CREATE, NULL, NULL, ct);
2426 void
2427 vnevent_link(vnode_t *vp, caller_context_t *ct)
2429 if (vp == NULL || vp->v_femhead == NULL) {
2430 return;
2432 (void) fop_vnevent(vp, VE_LINK, NULL, NULL, ct);
2435 void
2436 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2438 if (vp == NULL || vp->v_femhead == NULL) {
2439 return;
2441 (void) fop_vnevent(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2444 void
2445 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2447 if (vp == NULL || vp->v_femhead == NULL) {
2448 return;
2450 (void) fop_vnevent(vp, VE_TRUNCATE, NULL, NULL, ct);
2454 * Vnode accessors.
2458 vn_is_readonly(vnode_t *vp)
2460 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2464 vn_has_flocks(vnode_t *vp)
2466 return (vp->v_filocks != NULL);
2470 vn_has_mandatory_locks(vnode_t *vp, int mode)
2472 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2476 vn_has_cached_data(vnode_t *vp)
2478 return (!list_is_empty(&vp->v_object.list));
2482 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2483 * zone_enter(2).
2486 vn_can_change_zones(vnode_t *vp)
2488 struct vfssw *vswp;
2489 int allow = 1;
2490 vnode_t *rvp;
2492 if (nfs_global_client_only != 0)
2493 return (1);
2496 * We always want to look at the underlying vnode if there is one.
2498 if (fop_realvp(vp, &rvp, NULL) != 0)
2499 rvp = vp;
2501 * Some pseudo filesystems (including doorfs) don't actually register
2502 * their vfsops_t, so the following may return NULL; we happily let
2503 * such vnodes switch zones.
2505 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2506 if (vswp != NULL) {
2507 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2508 allow = 0;
2509 vfs_unrefvfssw(vswp);
2511 return (allow);
2515 * Return nonzero if the vnode is a mount point, zero if not.
2518 vn_ismntpt(vnode_t *vp)
2520 return (vp->v_vfsmountedhere != NULL);
2523 /* Retrieve the vfs (if any) mounted on this vnode */
2524 vfs_t *
2525 vn_mountedvfs(vnode_t *vp)
2527 return (vp->v_vfsmountedhere);
2531 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2534 vn_in_dnlc(vnode_t *vp)
2536 return (vp->v_count_dnlc > 0);
2540 * vn_has_other_opens() checks whether a particular file is opened by more than
2541 * just the caller and whether the open is for read and/or write.
2542 * This routine is for calling after the caller has already called fop_open()
2543 * and the caller wishes to know if they are the only one with it open for
2544 * the mode(s) specified.
2546 * Vnode counts are only kept on regular files (v_type=VREG).
2548 bool
2549 vn_has_other_opens(struct vnode *vp, v_mode_t mode)
2551 ASSERT(vp != NULL);
2553 switch (mode) {
2554 case V_WRITE:
2555 if (vp->v_wrcnt > 1)
2556 return true;
2557 break;
2558 case V_RDORWR:
2559 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2560 return true;
2561 break;
2562 case V_RDANDWR:
2563 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2564 return true;
2565 break;
2566 case V_READ:
2567 if (vp->v_rdcnt > 1)
2568 return true;
2569 break;
2572 return false;
2576 * vn_is_opened() checks whether a particular file is opened and
2577 * whether the open is for read and/or write.
2579 * Vnode counts are only kept on regular files (v_type=VREG).
2581 bool vn_is_opened(struct vnode *vp, v_mode_t mode)
2583 ASSERT(vp != NULL);
2585 switch (mode) {
2586 case V_WRITE:
2587 if (vp->v_wrcnt)
2588 return true;
2589 break;
2590 case V_RDANDWR:
2591 if (vp->v_rdcnt && vp->v_wrcnt)
2592 return true;
2593 break;
2594 case V_RDORWR:
2595 if (vp->v_rdcnt || vp->v_wrcnt)
2596 return true;
2597 break;
2598 case V_READ:
2599 if (vp->v_rdcnt)
2600 return true;
2601 break;
2604 return false;
2608 * vn_is_mapped() checks whether a particular file is mapped and whether
2609 * the file is mapped read and/or write.
2611 bool vn_is_mapped(struct vnode *vp, v_mode_t mode)
2613 ASSERT(vp != NULL);
2615 #if !defined(_LP64)
2616 switch (mode) {
2618 * The atomic_add_64_nv functions force atomicity in the
2619 * case of 32 bit architectures. Otherwise the 64 bit values
2620 * require two fetches. The value of the fields may be
2621 * (potentially) changed between the first fetch and the
2622 * second
2624 case V_WRITE:
2625 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2626 return true;
2627 break;
2628 case V_RDANDWR:
2629 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2630 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2631 return true;
2632 break;
2633 case V_RDORWR:
2634 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2635 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2636 return true;
2637 break;
2638 case V_READ:
2639 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2640 return true;
2641 break;
2643 #else
2644 switch (mode) {
2645 case V_WRITE:
2646 if (vp->v_mmap_write)
2647 return true;
2648 break;
2649 case V_RDANDWR:
2650 if (vp->v_mmap_read && vp->v_mmap_write)
2651 return true;
2652 break;
2653 case V_RDORWR:
2654 if (vp->v_mmap_read || vp->v_mmap_write)
2655 return true;
2656 break;
2657 case V_READ:
2658 if (vp->v_mmap_read)
2659 return true;
2660 break;
2662 #endif
2664 return false;
2668 * Set the operations vector for a vnode.
2670 void
2671 vn_setops(struct vnode *vnode, const struct vnodeops *ops)
2673 vnode->v_op = ops;
2677 * Retrieve the operations vector for a vnode
2679 const struct vnodeops *
2680 vn_getops(struct vnode *vnode)
2682 return vnode->v_op;
/*
 * Returns non-zero (1) if the vnodeops matches that of the vnode.
 * Returns zero (0) if not.
 */
int
vn_matchops(struct vnode *vp, const struct vnodeops *vnodeops)
{
	return (vn_getops(vp) == vnodeops);
}
2696 * fs_new_caller_id() needs to return a unique ID on a given local system.
2697 * The IDs do not need to survive across reboots. These are primarily
2698 * used so that (FEM) monitors can detect particular callers (such as
2699 * the NFS server) to a given vnode/vfs operation.
2701 u_longlong_t
2702 fs_new_caller_id()
2704 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2706 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2710 * The value stored in v_path is relative to rootdir, located in the global
2711 * zone. Zones or chroot environments which reside deeper inside the VFS
2712 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
2713 * what lies below their perceived root. In order to keep v_path usable for
2714 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
2716 * An upper bound of max_vnode_path is placed upon v_path allocations to
2717 * prevent the system from going too wild at the behest of pathological
2718 * behavior from the operator.
2720 size_t max_vnode_path = 4 * MAXPATHLEN;
2723 void
2724 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
2726 char *buf;
2728 mutex_enter(&vp->v_lock);
2730 * If the snapshot of v_path_stamp passed in via compare_stamp does not
2731 * match the present value on the vnode, it indicates that subsequent
2732 * changes have occurred. The v_path value is not cleared in this case
2733 * since the new value may be valid.
2735 if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
2736 mutex_exit(&vp->v_lock);
2737 return;
2739 buf = vp->v_path;
2740 vp->v_path = vn_vpath_empty;
2741 vp->v_path_stamp = 0;
2742 mutex_exit(&vp->v_lock);
2743 if (buf != vn_vpath_empty) {
2744 kmem_free(buf, strlen(buf) + 1);
2748 static void
2749 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
2750 boolean_t is_rename)
2752 char *buf, *oldbuf;
2753 hrtime_t pstamp;
2754 size_t baselen, buflen = 0;
2756 /* Handle the vn_setpath_str case. */
2757 if (pvp == NULL) {
2758 if (len + 1 > max_vnode_path) {
2759 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
2760 vnode_t *, vp, char *, name, size_t, len + 1);
2761 return;
2763 buf = kmem_alloc(len + 1, KM_SLEEP);
2764 bcopy(name, buf, len);
2765 buf[len] = '\0';
2767 mutex_enter(&vp->v_lock);
2768 oldbuf = vp->v_path;
2769 vp->v_path = buf;
2770 vp->v_path_stamp = gethrtime();
2771 mutex_exit(&vp->v_lock);
2772 if (oldbuf != vn_vpath_empty) {
2773 kmem_free(oldbuf, strlen(oldbuf) + 1);
2775 return;
2778 /* Take snapshot of parent dir */
2779 mutex_enter(&pvp->v_lock);
2781 if ((pvp->v_flag & VTRAVERSE) != 0) {
2783 * When the parent vnode has VTRAVERSE set in its flags, normal
2784 * assumptions about v_path calculation no longer apply. The
2785 * primary situation where this occurs is via the VFS tricks
2786 * which procfs plays in order to allow /proc/PID/(root|cwd) to
2787 * yield meaningful results.
2789 * When this flag is set, v_path on the child must not be
2790 * updated since the calculated value is likely to be
2791 * incorrect, given the current context.
2793 mutex_exit(&pvp->v_lock);
2794 return;
2797 retrybuf:
2798 if (pvp->v_path == vn_vpath_empty) {
2800 * Without v_path from the parent directory, generating a child
2801 * path from the name is impossible.
2803 if (len > 0) {
2804 pstamp = pvp->v_path_stamp;
2805 mutex_exit(&pvp->v_lock);
2806 vn_clearpath(vp, pstamp);
2807 return;
2811 * The only feasible case here is where a NUL lookup is being
2812 * performed on rootdir prior to its v_path being populated.
2814 ASSERT(pvp->v_path_stamp == 0);
2815 baselen = 0;
2816 pstamp = 0;
2817 } else {
2818 pstamp = pvp->v_path_stamp;
2819 baselen = strlen(pvp->v_path);
2820 /* ignore a trailing slash if present */
2821 if (pvp->v_path[baselen - 1] == '/') {
2822 /* This should only the be case for rootdir */
2823 ASSERT(baselen == 1 && pvp == rootdir);
2824 baselen--;
2827 mutex_exit(&pvp->v_lock);
2829 if (buflen != 0) {
2830 /* Free the existing (mis-sized) buffer in case of retry */
2831 kmem_free(buf, buflen);
2833 /* base, '/', name and trailing NUL */
2834 buflen = baselen + len + 2;
2835 if (buflen > max_vnode_path) {
2836 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
2837 vnode_t *, vp, char *, name, size_t, buflen);
2838 return;
2840 buf = kmem_alloc(buflen, KM_SLEEP);
2842 mutex_enter(&pvp->v_lock);
2843 if (pvp->v_path_stamp != pstamp) {
2844 size_t vlen;
2847 * Since v_path_stamp changed on the parent, it is likely that
2848 * v_path has been altered as well. If the length does not
2849 * exactly match what was previously measured, the buffer
2850 * allocation must be repeated for proper sizing.
2852 if (pvp->v_path == vn_vpath_empty) {
2853 /* Give up if parent lack v_path */
2854 mutex_exit(&pvp->v_lock);
2855 kmem_free(buf, buflen);
2856 return;
2858 vlen = strlen(pvp->v_path);
2859 if (pvp->v_path[vlen - 1] == '/') {
2860 vlen--;
2862 if (vlen != baselen) {
2863 goto retrybuf;
2866 bcopy(pvp->v_path, buf, baselen);
2867 mutex_exit(&pvp->v_lock);
2869 buf[baselen] = '/';
2870 baselen++;
2871 bcopy(name, &buf[baselen], len + 1);
2873 mutex_enter(&vp->v_lock);
2874 if (vp->v_path_stamp == 0) {
2875 /* never-visited vnode can inherit stamp from parent */
2876 ASSERT(vp->v_path == vn_vpath_empty);
2877 vp->v_path_stamp = pstamp;
2878 vp->v_path = buf;
2879 mutex_exit(&vp->v_lock);
2880 } else if (vp->v_path_stamp < pstamp || is_rename) {
2882 * Install the updated path and stamp, ensuring that the v_path
2883 * pointer is valid at all times for dtrace.
2885 oldbuf = vp->v_path;
2886 vp->v_path = buf;
2887 vp->v_path_stamp = gethrtime();
2888 mutex_exit(&vp->v_lock);
2889 kmem_free(oldbuf, strlen(oldbuf) + 1);
2890 } else {
2892 * If the timestamp matches or is greater, it means another
2893 * thread performed the update first while locks were dropped
2894 * here to make the allocation. We defer to the newer value.
2896 mutex_exit(&vp->v_lock);
2897 kmem_free(buf, buflen);
2899 ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
2902 void
2903 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
2905 size_t len;
2908 * If the parent is older or empty, there's nothing further to do.
2910 if (pvp->v_path == vn_vpath_empty ||
2911 pvp->v_path_stamp <= vp->v_path_stamp) {
2912 return;
2916 * Given the lack of appropriate context, meaningful updates to v_path
2917 * cannot be made for during lookups for the '.' or '..' entries.
2919 len = strlen(name);
2920 if (len == 0 || (len == 1 && name[0] == '.') ||
2921 (len == 2 && name[0] == '.' && name[1] == '.')) {
2922 return;
2925 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2929 * Given a starting vnode and a path, updates the path in the target vnode in
2930 * a safe manner. If the vnode already has path information embedded, then the
2931 * cached path is left untouched.
2933 /* ARGSUSED */
2934 void
2935 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
2936 size_t len)
2938 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2942 * Sets the path to the vnode to be the given string, regardless of current
2943 * context. The string must be a complete path from rootdir. This is only used
2944 * by fsop_root() for setting the path based on the mountpoint.
2946 void
2947 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
2949 vn_setpath_common(NULL, vp, str, len, B_FALSE);
2953 * Called from within filesystem's vop_rename() to handle renames once the
2954 * target vnode is available.
2956 void
2957 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
2959 vn_setpath_common(pvp, vp, name, len, B_TRUE);
2963 * Similar to vn_setpath_str(), this function sets the path of the destination
2964 * vnode to the be the same as the source vnode.
2966 void
2967 vn_copypath(struct vnode *src, struct vnode *dst)
2969 char *buf;
2970 hrtime_t stamp;
2971 size_t buflen;
2973 mutex_enter(&src->v_lock);
2974 if (src->v_path == vn_vpath_empty) {
2975 mutex_exit(&src->v_lock);
2976 return;
2978 buflen = strlen(src->v_path) + 1;
2979 mutex_exit(&src->v_lock);
2981 buf = kmem_alloc(buflen, KM_SLEEP);
2983 mutex_enter(&src->v_lock);
2984 if (src->v_path == vn_vpath_empty ||
2985 strlen(src->v_path) + 1 != buflen) {
2986 mutex_exit(&src->v_lock);
2987 kmem_free(buf, buflen);
2988 return;
2990 bcopy(src->v_path, buf, buflen);
2991 stamp = src->v_path_stamp;
2992 mutex_exit(&src->v_lock);
2994 mutex_enter(&dst->v_lock);
2995 if (dst->v_path != vn_vpath_empty) {
2996 mutex_exit(&dst->v_lock);
2997 kmem_free(buf, buflen);
2998 return;
3000 dst->v_path = buf;
3001 dst->v_path_stamp = stamp;
3002 mutex_exit(&dst->v_lock);
3007 * XXX Private interface for segvn routines that handle vnode
3008 * large page segments.
3010 * return 1 if vp's file system fop_pageio() implementation
3011 * can be safely used instead of fop_getpage() for handling
3012 * pagefaults against regular non swap files. fop_pageio()
3013 * interface is considered safe here if its implementation
3014 * is very close to fop_getpage() implementation.
3015 * e.g. It zero's out the part of the page beyond EOF. Doesn't
3016 * panic if there're file holes but instead returns an error.
3017 * Doesn't assume file won't be changed by user writes, etc.
3019 * return 0 otherwise.
3021 * For now allow segvn to only use fop_pageio() with ufs and nfs.
3024 vn_vmpss_usepageio(vnode_t *vp)
3026 vfs_t *vfsp = vp->v_vfsp;
3027 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3028 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3029 char **fsok = pageio_ok_fss;
3031 if (fsname == NULL) {
3032 return (0);
3035 for (; *fsok; fsok++) {
3036 if (strcmp(*fsok, fsname) == 0) {
3037 return (1);
3040 return (0);
3043 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3046 fop_open(
3047 vnode_t **vpp,
3048 int mode,
3049 cred_t *cr,
3050 caller_context_t *ct)
3052 int ret;
3053 vnode_t *vp = *vpp;
3055 VN_HOLD(vp);
3057 * Adding to the vnode counts before calling open
3058 * avoids the need for a mutex. It circumvents a race
3059 * condition where a query made on the vnode counts results in a
3060 * false negative. The inquirer goes away believing the file is
3061 * not open when there is an open on the file already under way.
3063 * The counts are meant to prevent NFS from granting a delegation
3064 * when it would be dangerous to do so.
3066 * The vnode counts are only kept on regular files
3068 if ((*vpp)->v_type == VREG) {
3069 if (mode & FREAD)
3070 atomic_inc_32(&(*vpp)->v_rdcnt);
3071 if (mode & FWRITE)
3072 atomic_inc_32(&(*vpp)->v_wrcnt);
3075 VOPXID_MAP_CR(vp, cr);
3077 ret = fop_open_dispatch(vpp, mode, cr, ct, true);
3079 if (ret) {
3081 * Use the saved vp just in case the vnode ptr got trashed
3082 * by the error.
3084 VOPSTATS_UPDATE(vp, open);
3085 if ((vp->v_type == VREG) && (mode & FREAD))
3086 atomic_dec_32(&vp->v_rdcnt);
3087 if ((vp->v_type == VREG) && (mode & FWRITE))
3088 atomic_dec_32(&vp->v_wrcnt);
3089 } else {
3091 * Some filesystems will return a different vnode,
3092 * but the same path was still used to open it.
3093 * So if we do change the vnode and need to
3094 * copy over the path, do so here, rather than special
3095 * casing each filesystem. Adjust the vnode counts to
3096 * reflect the vnode switch.
3098 VOPSTATS_UPDATE(*vpp, open);
3099 if (*vpp != vp && *vpp != NULL) {
3100 vn_copypath(vp, *vpp);
3101 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3102 atomic_inc_32(&(*vpp)->v_rdcnt);
3103 if ((vp->v_type == VREG) && (mode & FREAD))
3104 atomic_dec_32(&vp->v_rdcnt);
3105 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3106 atomic_inc_32(&(*vpp)->v_wrcnt);
3107 if ((vp->v_type == VREG) && (mode & FWRITE))
3108 atomic_dec_32(&vp->v_wrcnt);
3111 VN_RELE(vp);
3112 return (ret);
3116 fop_close(
3117 vnode_t *vp,
3118 int flag,
3119 int count,
3120 offset_t offset,
3121 cred_t *cr,
3122 caller_context_t *ct)
3124 int err;
3126 VOPXID_MAP_CR(vp, cr);
3128 err = fop_close_dispatch(vp, flag, count, offset, cr, ct, true);
3130 VOPSTATS_UPDATE(vp, close);
3132 * Check passed in count to handle possible dups. Vnode counts are only
3133 * kept on regular files
3135 if ((vp->v_type == VREG) && (count == 1)) {
3136 if (flag & FREAD) {
3137 ASSERT(vp->v_rdcnt > 0);
3138 atomic_dec_32(&vp->v_rdcnt);
3140 if (flag & FWRITE) {
3141 ASSERT(vp->v_wrcnt > 0);
3142 atomic_dec_32(&vp->v_wrcnt);
3145 return (err);
3149 fop_read(
3150 vnode_t *vp,
3151 uio_t *uiop,
3152 int ioflag,
3153 cred_t *cr,
3154 caller_context_t *ct)
3156 int err;
3157 ssize_t resid_start = uiop->uio_resid;
3159 VOPXID_MAP_CR(vp, cr);
3161 err = fop_read_dispatch(vp, uiop, ioflag, cr, ct, true);
3163 VOPSTATS_UPDATE_IO(vp, read,
3164 read_bytes, (resid_start - uiop->uio_resid));
3165 return (err);
3169 fop_write(
3170 vnode_t *vp,
3171 uio_t *uiop,
3172 int ioflag,
3173 cred_t *cr,
3174 caller_context_t *ct)
3176 int err;
3177 ssize_t resid_start = uiop->uio_resid;
3179 VOPXID_MAP_CR(vp, cr);
3181 err = fop_write_dispatch(vp, uiop, ioflag, cr, ct, true);
3183 VOPSTATS_UPDATE_IO(vp, write,
3184 write_bytes, (resid_start - uiop->uio_resid));
3185 return (err);
3189 fop_ioctl(
3190 vnode_t *vp,
3191 int cmd,
3192 intptr_t arg,
3193 int flag,
3194 cred_t *cr,
3195 int *rvalp,
3196 caller_context_t *ct)
3198 int err;
3200 VOPXID_MAP_CR(vp, cr);
3202 err = fop_ioctl_dispatch(vp, cmd, arg, flag, cr, rvalp, ct, true);
3204 VOPSTATS_UPDATE(vp, ioctl);
3205 return (err);
3209 fop_setfl(
3210 vnode_t *vp,
3211 int oflags,
3212 int nflags,
3213 cred_t *cr,
3214 caller_context_t *ct)
3216 int err;
3218 VOPXID_MAP_CR(vp, cr);
3220 err = fop_setfl_dispatch(vp, oflags, nflags, cr, ct, true);
3222 VOPSTATS_UPDATE(vp, setfl);
3223 return (err);
3227 fop_getattr(
3228 vnode_t *vp,
3229 vattr_t *vap,
3230 int flags,
3231 cred_t *cr,
3232 caller_context_t *ct)
3234 int err;
3236 VOPXID_MAP_CR(vp, cr);
3239 * If this file system doesn't understand the xvattr extensions
3240 * then turn off the xvattr bit.
3242 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3243 vap->va_mask &= ~VATTR_XVATTR;
3247 * We're only allowed to skip the ACL check iff we used a 32 bit
3248 * ACE mask with fop_access() to determine permissions.
3250 if ((flags & ATTR_NOACLCHECK) &&
3251 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3252 return (EINVAL);
3254 err = fop_getattr_dispatch(vp, vap, flags, cr, ct, true);
3256 VOPSTATS_UPDATE(vp, getattr);
3257 return (err);
3261 fop_setattr(
3262 vnode_t *vp,
3263 vattr_t *vap,
3264 int flags,
3265 cred_t *cr,
3266 caller_context_t *ct)
3268 int err;
3270 VOPXID_MAP_CR(vp, cr);
3273 * If this file system doesn't understand the xvattr extensions
3274 * then turn off the xvattr bit.
3276 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3277 vap->va_mask &= ~VATTR_XVATTR;
3281 * We're only allowed to skip the ACL check iff we used a 32 bit
3282 * ACE mask with fop_access() to determine permissions.
3284 if ((flags & ATTR_NOACLCHECK) &&
3285 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3286 return (EINVAL);
3288 err = fop_setattr_dispatch(vp, vap, flags, cr, ct, true);
3290 VOPSTATS_UPDATE(vp, setattr);
3291 return (err);
3295 fop_access(
3296 vnode_t *vp,
3297 int mode,
3298 int flags,
3299 cred_t *cr,
3300 caller_context_t *ct)
3302 int err;
3304 if ((flags & V_ACE_MASK) &&
3305 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3306 return (EINVAL);
3309 VOPXID_MAP_CR(vp, cr);
3311 err = fop_access_dispatch(vp, mode, flags, cr, ct, true);
3313 VOPSTATS_UPDATE(vp, access);
3314 return (err);
3318 fop_lookup(
3319 vnode_t *dvp,
3320 char *nm,
3321 vnode_t **vpp,
3322 pathname_t *pnp,
3323 int flags,
3324 vnode_t *rdir,
3325 cred_t *cr,
3326 caller_context_t *ct,
3327 int *deflags, /* Returned per-dirent flags */
3328 pathname_t *ppnp) /* Returned case-preserved name in directory */
3330 int ret;
3333 * If this file system doesn't support case-insensitive access
3334 * and said access is requested, fail quickly. It is required
3335 * that if the vfs supports case-insensitive lookup, it also
3336 * supports extended dirent flags.
3338 if (flags & FIGNORECASE &&
3339 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3340 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3341 return (EINVAL);
3343 VOPXID_MAP_CR(dvp, cr);
3345 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3346 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3347 } else {
3348 ret = fop_lookup_dispatch(dvp, nm, vpp, pnp, flags, rdir, cr,
3349 ct, deflags, ppnp, true);
3352 if (ret == 0 && *vpp) {
3353 VOPSTATS_UPDATE(*vpp, lookup);
3354 vn_updatepath(dvp, *vpp, nm);
3357 return (ret);
3361 fop_create(
3362 vnode_t *dvp,
3363 char *name,
3364 vattr_t *vap,
3365 vcexcl_t excl,
3366 int mode,
3367 vnode_t **vpp,
3368 cred_t *cr,
3369 int flags,
3370 caller_context_t *ct,
3371 vsecattr_t *vsecp) /* ACL to set during create */
3373 int ret;
3375 if (vsecp != NULL &&
3376 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3377 return (EINVAL);
3380 * If this file system doesn't support case-insensitive access
3381 * and said access is requested, fail quickly.
3383 if (flags & FIGNORECASE &&
3384 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3385 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3386 return (EINVAL);
3388 VOPXID_MAP_CR(dvp, cr);
3390 ret = fop_create_dispatch(dvp, name, vap, excl, mode, vpp, cr, flags,
3391 ct, vsecp, true);
3393 if (ret == 0 && *vpp) {
3394 VOPSTATS_UPDATE(*vpp, create);
3395 vn_updatepath(dvp, *vpp, name);
3398 return (ret);
3402 fop_remove(
3403 vnode_t *dvp,
3404 char *nm,
3405 cred_t *cr,
3406 caller_context_t *ct,
3407 int flags)
3409 int err;
3412 * If this file system doesn't support case-insensitive access
3413 * and said access is requested, fail quickly.
3415 if (flags & FIGNORECASE &&
3416 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3417 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3418 return (EINVAL);
3420 VOPXID_MAP_CR(dvp, cr);
3422 err = fop_remove_dispatch(dvp, nm, cr, ct, flags, true);
3424 VOPSTATS_UPDATE(dvp, remove);
3425 return (err);
3429 fop_link(
3430 vnode_t *tdvp,
3431 vnode_t *svp,
3432 char *tnm,
3433 cred_t *cr,
3434 caller_context_t *ct,
3435 int flags)
3437 int err;
3440 * If the target file system doesn't support case-insensitive access
3441 * and said access is requested, fail quickly.
3443 if (flags & FIGNORECASE &&
3444 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3445 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3446 return (EINVAL);
3448 VOPXID_MAP_CR(tdvp, cr);
3450 err = fop_link_dispatch(tdvp, svp, tnm, cr, ct, flags, true);
3452 VOPSTATS_UPDATE(tdvp, link);
3453 return (err);
3457 fop_rename(
3458 vnode_t *sdvp,
3459 char *snm,
3460 vnode_t *tdvp,
3461 char *tnm,
3462 cred_t *cr,
3463 caller_context_t *ct,
3464 int flags)
3466 int err;
3469 * If the file system involved does not support
3470 * case-insensitive access and said access is requested, fail
3471 * quickly.
3473 if (flags & FIGNORECASE &&
3474 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3475 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3476 return (EINVAL);
3478 VOPXID_MAP_CR(tdvp, cr);
3480 err = fop_rename_dispatch(sdvp, snm, tdvp, tnm, cr, ct, flags, true);
3482 VOPSTATS_UPDATE(sdvp, rename);
3483 return (err);
3487 fop_mkdir(
3488 vnode_t *dvp,
3489 char *dirname,
3490 vattr_t *vap,
3491 vnode_t **vpp,
3492 cred_t *cr,
3493 caller_context_t *ct,
3494 int flags,
3495 vsecattr_t *vsecp) /* ACL to set during create */
3497 int ret;
3499 if (vsecp != NULL &&
3500 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3501 return (EINVAL);
3504 * If this file system doesn't support case-insensitive access
3505 * and said access is requested, fail quickly.
3507 if (flags & FIGNORECASE &&
3508 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3509 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3510 return (EINVAL);
3512 VOPXID_MAP_CR(dvp, cr);
3514 ret = fop_mkdir_dispatch(dvp, dirname, vap, vpp, cr, ct, flags, vsecp,
3515 true);
3517 if (ret == 0 && *vpp) {
3518 VOPSTATS_UPDATE(*vpp, mkdir);
3519 vn_updatepath(dvp, *vpp, dirname);
3522 return (ret);
3526 fop_rmdir(
3527 vnode_t *dvp,
3528 char *nm,
3529 vnode_t *cdir,
3530 cred_t *cr,
3531 caller_context_t *ct,
3532 int flags)
3534 int err;
3537 * If this file system doesn't support case-insensitive access
3538 * and said access is requested, fail quickly.
3540 if (flags & FIGNORECASE &&
3541 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3542 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3543 return (EINVAL);
3545 VOPXID_MAP_CR(dvp, cr);
3547 err = fop_rmdir_dispatch(dvp, nm, cdir, cr, ct, flags, true);
3549 VOPSTATS_UPDATE(dvp, rmdir);
3550 return (err);
3554 fop_readdir(
3555 vnode_t *vp,
3556 uio_t *uiop,
3557 cred_t *cr,
3558 int *eofp,
3559 caller_context_t *ct,
3560 int flags)
3562 int err;
3563 ssize_t resid_start = uiop->uio_resid;
3566 * If this file system doesn't support retrieving directory
3567 * entry flags and said access is requested, fail quickly.
3569 if (flags & V_RDDIR_ENTFLAGS &&
3570 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3571 return (EINVAL);
3573 VOPXID_MAP_CR(vp, cr);
3575 err = fop_readdir_dispatch(vp, uiop, cr, eofp, ct, flags, true);
3577 VOPSTATS_UPDATE_IO(vp, readdir,
3578 readdir_bytes, (resid_start - uiop->uio_resid));
3579 return (err);
3583 fop_symlink(
3584 vnode_t *dvp,
3585 char *linkname,
3586 vattr_t *vap,
3587 char *target,
3588 cred_t *cr,
3589 caller_context_t *ct,
3590 int flags)
3592 int err;
3593 xvattr_t xvattr;
3596 * If this file system doesn't support case-insensitive access
3597 * and said access is requested, fail quickly.
3599 if (flags & FIGNORECASE &&
3600 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3601 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3602 return (EINVAL);
3604 VOPXID_MAP_CR(dvp, cr);
3606 /* check for reparse point */
3607 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3608 (strncmp(target, FS_REPARSE_TAG_STR,
3609 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3610 if (!fs_reparse_mark(target, vap, &xvattr))
3611 vap = (vattr_t *)&xvattr;
3614 err = fop_symlink_dispatch(dvp, linkname, vap, target, cr, ct, flags,
3615 true);
3617 VOPSTATS_UPDATE(dvp, symlink);
3618 return (err);
3622 fop_readlink(
3623 vnode_t *vp,
3624 uio_t *uiop,
3625 cred_t *cr,
3626 caller_context_t *ct)
3628 int err;
3630 VOPXID_MAP_CR(vp, cr);
3632 err = fop_readlink_dispatch(vp, uiop, cr, ct, true);
3634 VOPSTATS_UPDATE(vp, readlink);
3635 return (err);
3639 fop_fsync(
3640 vnode_t *vp,
3641 int syncflag,
3642 cred_t *cr,
3643 caller_context_t *ct)
3645 int err;
3647 VOPXID_MAP_CR(vp, cr);
3649 err = fop_fsync_dispatch(vp, syncflag, cr, ct, true);
3651 VOPSTATS_UPDATE(vp, fsync);
3652 return (err);
3655 void
3656 fop_inactive(
3657 vnode_t *vp,
3658 cred_t *cr,
3659 caller_context_t *ct)
3661 /* Need to update stats before vop call since we may lose the vnode */
3662 VOPSTATS_UPDATE(vp, inactive);
3664 VOPXID_MAP_CR(vp, cr);
3666 fop_inactive_dispatch(vp, cr, ct, true);
3670 fop_fid(
3671 vnode_t *vp,
3672 fid_t *fidp,
3673 caller_context_t *ct)
3675 int err;
3677 err = fop_fid_dispatch(vp, fidp, ct, true);
3679 VOPSTATS_UPDATE(vp, fid);
3680 return (err);
3684 fop_rwlock(
3685 vnode_t *vp,
3686 int write_lock,
3687 caller_context_t *ct)
3689 int ret;
3691 ret = fop_rwlock_dispatch(vp, write_lock, ct, true);
3693 VOPSTATS_UPDATE(vp, rwlock);
3694 return (ret);
3697 void
3698 fop_rwunlock(
3699 vnode_t *vp,
3700 int write_lock,
3701 caller_context_t *ct)
3703 fop_rwunlock_dispatch(vp, write_lock, ct, true);
3705 VOPSTATS_UPDATE(vp, rwunlock);
3709 fop_seek(
3710 vnode_t *vp,
3711 offset_t ooff,
3712 offset_t *noffp,
3713 caller_context_t *ct)
3715 int err;
3717 err = fop_seek_dispatch(vp, ooff, noffp, ct, true);
3719 VOPSTATS_UPDATE(vp, seek);
3720 return (err);
3724 fop_cmp(
3725 vnode_t *vp1,
3726 vnode_t *vp2,
3727 caller_context_t *ct)
3729 int err;
3731 err = fop_cmp_dispatch(vp1, vp2, ct, true);
3733 VOPSTATS_UPDATE(vp1, cmp);
3734 return (err);
3738 fop_frlock(
3739 vnode_t *vp,
3740 int cmd,
3741 flock64_t *bfp,
3742 int flag,
3743 offset_t offset,
3744 struct flk_callback *flk_cbp,
3745 cred_t *cr,
3746 caller_context_t *ct)
3748 int err;
3750 VOPXID_MAP_CR(vp, cr);
3752 err = fop_frlock_dispatch(vp, cmd, bfp, flag, offset, flk_cbp, cr,
3753 ct, true);
3755 VOPSTATS_UPDATE(vp, frlock);
3756 return (err);
3760 fop_space(
3761 vnode_t *vp,
3762 int cmd,
3763 flock64_t *bfp,
3764 int flag,
3765 offset_t offset,
3766 cred_t *cr,
3767 caller_context_t *ct)
3769 int err;
3771 VOPXID_MAP_CR(vp, cr);
3773 err = fop_space_dispatch(vp, cmd, bfp, flag, offset, cr, ct, true);
3775 VOPSTATS_UPDATE(vp, space);
3776 return (err);
3780 fop_realvp(
3781 vnode_t *vp,
3782 vnode_t **vpp,
3783 caller_context_t *ct)
3785 int err;
3787 err = fop_realvp_dispatch(vp, vpp, ct, true);
3789 VOPSTATS_UPDATE(vp, realvp);
3790 return (err);
3794 fop_getpage(
3795 vnode_t *vp,
3796 offset_t off,
3797 size_t len,
3798 uint_t *protp,
3799 page_t **plarr,
3800 size_t plsz,
3801 struct seg *seg,
3802 caddr_t addr,
3803 enum seg_rw rw,
3804 cred_t *cr,
3805 caller_context_t *ct)
3807 int err;
3809 VOPXID_MAP_CR(vp, cr);
3811 err = fop_getpage_dispatch(vp, off, len, protp, plarr, plsz, seg,
3812 addr, rw, cr, ct, true);
3814 VOPSTATS_UPDATE(vp, getpage);
3815 return (err);
3819 fop_putpage(
3820 vnode_t *vp,
3821 offset_t off,
3822 size_t len,
3823 int flags,
3824 cred_t *cr,
3825 caller_context_t *ct)
3827 int err;
3829 VOPXID_MAP_CR(vp, cr);
3831 err = fop_putpage_dispatch(vp, off, len, flags, cr, ct, true);
3833 VOPSTATS_UPDATE(vp, putpage);
3834 return (err);
3838 fop_map(
3839 vnode_t *vp,
3840 offset_t off,
3841 struct as *as,
3842 caddr_t *addrp,
3843 size_t len,
3844 uchar_t prot,
3845 uchar_t maxprot,
3846 uint_t flags,
3847 cred_t *cr,
3848 caller_context_t *ct)
3850 int err;
3852 VOPXID_MAP_CR(vp, cr);
3854 err = fop_map_dispatch(vp, off, as, addrp, len, prot, maxprot,
3855 flags, cr, ct, true);
3857 VOPSTATS_UPDATE(vp, map);
3858 return (err);
3862 fop_addmap(
3863 vnode_t *vp,
3864 offset_t off,
3865 struct as *as,
3866 caddr_t addr,
3867 size_t len,
3868 uchar_t prot,
3869 uchar_t maxprot,
3870 uint_t flags,
3871 cred_t *cr,
3872 caller_context_t *ct)
3874 int error;
3875 u_longlong_t delta;
3877 VOPXID_MAP_CR(vp, cr);
3879 error = fop_addmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3880 flags, cr, ct, true);
3882 if ((!error) && (vp->v_type == VREG)) {
3883 delta = (u_longlong_t)btopr(len);
3885 * If file is declared MAP_PRIVATE, it can't be written back
3886 * even if open for write. Handle as read.
3888 if (flags & MAP_PRIVATE) {
3889 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3890 (int64_t)delta);
3891 } else {
3893 * atomic_add_64 forces the fetch of a 64 bit value to
3894 * be atomic on 32 bit machines
3896 if (maxprot & PROT_WRITE)
3897 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3898 (int64_t)delta);
3899 if (maxprot & PROT_READ)
3900 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3901 (int64_t)delta);
3902 if (maxprot & PROT_EXEC)
3903 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3904 (int64_t)delta);
3907 VOPSTATS_UPDATE(vp, addmap);
3908 return (error);
3912 fop_delmap(
3913 vnode_t *vp,
3914 offset_t off,
3915 struct as *as,
3916 caddr_t addr,
3917 size_t len,
3918 uint_t prot,
3919 uint_t maxprot,
3920 uint_t flags,
3921 cred_t *cr,
3922 caller_context_t *ct)
3924 int error;
3925 u_longlong_t delta;
3927 VOPXID_MAP_CR(vp, cr);
3929 error = fop_delmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3930 flags, cr, ct, true);
3933 * NFS calls into delmap twice, the first time
3934 * it simply establishes a callback mechanism and returns EAGAIN
3935 * while the real work is being done upon the second invocation.
3936 * We have to detect this here and only decrement the counts upon
3937 * the second delmap request.
3939 if ((error != EAGAIN) && (vp->v_type == VREG)) {
3941 delta = (u_longlong_t)btopr(len);
3943 if (flags & MAP_PRIVATE) {
3944 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3945 (int64_t)(-delta));
3946 } else {
3948 * atomic_add_64 forces the fetch of a 64 bit value
3949 * to be atomic on 32 bit machines
3951 if (maxprot & PROT_WRITE)
3952 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3953 (int64_t)(-delta));
3954 if (maxprot & PROT_READ)
3955 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3956 (int64_t)(-delta));
3957 if (maxprot & PROT_EXEC)
3958 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3959 (int64_t)(-delta));
3962 VOPSTATS_UPDATE(vp, delmap);
3963 return (error);
3968 fop_poll(
3969 vnode_t *vp,
3970 short events,
3971 int anyyet,
3972 short *reventsp,
3973 struct pollhead **phpp,
3974 caller_context_t *ct)
3976 int err;
3978 err = fop_poll_dispatch(vp, events, anyyet, reventsp, phpp, ct, true);
3980 VOPSTATS_UPDATE(vp, poll);
3981 return (err);
3985 fop_dump(
3986 vnode_t *vp,
3987 caddr_t addr,
3988 offset_t lbdn,
3989 offset_t dblks,
3990 caller_context_t *ct)
3992 int err;
3994 /* ensure lbdn and dblks can be passed safely to bdev_dump */
3995 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
3996 return (EIO);
3998 err = fop_dump_dispatch(vp, addr, lbdn, dblks, ct, true);
4000 VOPSTATS_UPDATE(vp, dump);
4001 return (err);
4005 fop_pathconf(
4006 vnode_t *vp,
4007 int cmd,
4008 ulong_t *valp,
4009 cred_t *cr,
4010 caller_context_t *ct)
4012 int err;
4014 VOPXID_MAP_CR(vp, cr);
4016 err = fop_pathconf_dispatch(vp, cmd, valp, cr, ct, true);
4018 VOPSTATS_UPDATE(vp, pathconf);
4019 return (err);
4023 fop_pageio(
4024 vnode_t *vp,
4025 struct page *pp,
4026 uoff_t io_off,
4027 size_t io_len,
4028 int flags,
4029 cred_t *cr,
4030 caller_context_t *ct)
4032 int err;
4034 VOPXID_MAP_CR(vp, cr);
4036 err = fop_pageio_dispatch(vp, pp, io_off, io_len, flags, cr, ct, true);
4038 VOPSTATS_UPDATE(vp, pageio);
4039 return (err);
4043 fop_dumpctl(
4044 vnode_t *vp,
4045 int action,
4046 offset_t *blkp,
4047 caller_context_t *ct)
4049 int err;
4051 err = fop_dumpctl_dispatch(vp, action, blkp, ct, true);
4053 VOPSTATS_UPDATE(vp, dumpctl);
4054 return (err);
4057 void
4058 fop_dispose(
4059 vnode_t *vp,
4060 page_t *pp,
4061 int flag,
4062 int dn,
4063 cred_t *cr,
4064 caller_context_t *ct)
4066 /* Must do stats first since it's possible to lose the vnode */
4067 VOPSTATS_UPDATE(vp, dispose);
4069 VOPXID_MAP_CR(vp, cr);
4071 fop_dispose_dispatch(vp, pp, flag, dn, cr, ct, true);
4075 fop_setsecattr(
4076 vnode_t *vp,
4077 vsecattr_t *vsap,
4078 int flag,
4079 cred_t *cr,
4080 caller_context_t *ct)
4082 int err;
4084 VOPXID_MAP_CR(vp, cr);
4087 * We're only allowed to skip the ACL check iff we used a 32 bit
4088 * ACE mask with fop_access() to determine permissions.
4090 if ((flag & ATTR_NOACLCHECK) &&
4091 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4092 return (EINVAL);
4095 err = fop_setsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4097 VOPSTATS_UPDATE(vp, setsecattr);
4098 return (err);
4102 fop_getsecattr(
4103 vnode_t *vp,
4104 vsecattr_t *vsap,
4105 int flag,
4106 cred_t *cr,
4107 caller_context_t *ct)
4109 int err;
4112 * We're only allowed to skip the ACL check iff we used a 32 bit
4113 * ACE mask with fop_access() to determine permissions.
4115 if ((flag & ATTR_NOACLCHECK) &&
4116 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4117 return (EINVAL);
4120 VOPXID_MAP_CR(vp, cr);
4122 err = fop_getsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4124 VOPSTATS_UPDATE(vp, getsecattr);
4125 return (err);
4129 fop_shrlock(
4130 vnode_t *vp,
4131 int cmd,
4132 struct shrlock *shr,
4133 int flag,
4134 cred_t *cr,
4135 caller_context_t *ct)
4137 int err;
4139 VOPXID_MAP_CR(vp, cr);
4141 err = fop_shrlock_dispatch(vp, cmd, shr, flag, cr, ct, true);
4143 VOPSTATS_UPDATE(vp, shrlock);
4144 return (err);
4148 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4149 caller_context_t *ct)
4151 int err;
4153 err = fop_vnevent_dispatch(vp, vnevent, dvp, fnm, ct, true);
4155 VOPSTATS_UPDATE(vp, vnevent);
4156 return (err);
4160 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4161 caller_context_t *ct)
4163 int err;
4165 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4166 return (ENOTSUP);
4168 err = fop_reqzcbuf_dispatch(vp, ioflag, uiop, cr, ct, true);
4170 VOPSTATS_UPDATE(vp, reqzcbuf);
4171 return (err);
4175 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4177 int err;
4179 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4180 return (ENOTSUP);
4182 err = fop_retzcbuf_dispatch(vp, uiop, cr, ct, true);
4184 VOPSTATS_UPDATE(vp, retzcbuf);
4185 return (err);
/*
 * Default destructor
 * Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
	/*
	 * Intentionally empty: only the non-NULL function pointer matters,
	 * since vsd_create()/vsd_destroy() use NULL to mean "slot unused".
	 */
}
4198 * Create a key (index into per vnode array)
4199 * Locks out vsd_create, vsd_destroy, and vsd_free
4200 * May allocate memory with lock held
4202 void
4203 vsd_create(uint_t *keyp, void (*destructor)(void *))
4205 int i;
4206 uint_t nkeys;
4209 * if key is allocated, do nothing
4211 mutex_enter(&vsd_lock);
4212 if (*keyp) {
4213 mutex_exit(&vsd_lock);
4214 return;
4217 * find an unused key
4219 if (destructor == NULL)
4220 destructor = vsd_defaultdestructor;
4222 for (i = 0; i < vsd_nkeys; ++i)
4223 if (vsd_destructor[i] == NULL)
4224 break;
4227 * if no unused keys, increase the size of the destructor array
4229 if (i == vsd_nkeys) {
4230 if ((nkeys = (vsd_nkeys << 1)) == 0)
4231 nkeys = 1;
4232 vsd_destructor =
4233 (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4234 (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4235 (size_t)(nkeys * sizeof (void (*)(void *))));
4236 vsd_nkeys = nkeys;
4240 * allocate the next available unused key
4242 vsd_destructor[i] = destructor;
4243 *keyp = i + 1;
4245 /* create vsd_list, if it doesn't exist */
4246 if (vsd_list == NULL) {
4247 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4248 list_create(vsd_list, sizeof (struct vsd_node),
4249 offsetof(struct vsd_node, vs_nodes));
4252 mutex_exit(&vsd_lock);
4256 * Destroy a key
4258 * Assumes that the caller is preventing vsd_set and vsd_get
4259 * Locks out vsd_create, vsd_destroy, and vsd_free
4260 * May free memory with lock held
4262 void
4263 vsd_destroy(uint_t *keyp)
4265 uint_t key;
4266 struct vsd_node *vsd;
4269 * protect the key namespace and our destructor lists
4271 mutex_enter(&vsd_lock);
4272 key = *keyp;
4273 *keyp = 0;
4275 ASSERT(key <= vsd_nkeys);
4278 * if the key is valid
4280 if (key != 0) {
4281 uint_t k = key - 1;
4283 * for every vnode with VSD, call key's destructor
4285 for (vsd = list_head(vsd_list); vsd != NULL;
4286 vsd = list_next(vsd_list, vsd)) {
4288 * no VSD for key in this vnode
4290 if (key > vsd->vs_nkeys)
4291 continue;
4293 * call destructor for key
4295 if (vsd->vs_value[k] && vsd_destructor[k])
4296 (*vsd_destructor[k])(vsd->vs_value[k]);
4298 * reset value for key
4300 vsd->vs_value[k] = NULL;
4303 * actually free the key (NULL destructor == unused)
4305 vsd_destructor[k] = NULL;
4308 mutex_exit(&vsd_lock);
4312 * Quickly return the per vnode value that was stored with the specified key
4313 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4314 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4316 void *
4317 vsd_get(vnode_t *vp, uint_t key)
4319 struct vsd_node *vsd;
4321 ASSERT(vp != NULL);
4322 ASSERT(mutex_owned(&vp->v_vsd_lock));
4324 vsd = vp->v_vsd;
4326 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4327 return (vsd->vs_value[key - 1]);
4328 return (NULL);
4332 * Set a per vnode value indexed with the specified key
4333 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4336 vsd_set(vnode_t *vp, uint_t key, void *value)
4338 struct vsd_node *vsd;
4340 ASSERT(vp != NULL);
4341 ASSERT(mutex_owned(&vp->v_vsd_lock));
4343 if (key == 0)
4344 return (EINVAL);
4346 vsd = vp->v_vsd;
4347 if (vsd == NULL)
4348 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4351 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4352 * code won't happen and we will continue down and allocate space for
4353 * the vs_value array.
4354 * If the caller is replacing one value with another, then it is up
4355 * to the caller to free/rele/destroy the previous value (if needed).
4357 if (key <= vsd->vs_nkeys) {
4358 vsd->vs_value[key - 1] = value;
4359 return (0);
4362 ASSERT(key <= vsd_nkeys);
4364 if (vsd->vs_nkeys == 0) {
4365 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4367 * Link onto list of all VSD nodes.
4369 list_insert_head(vsd_list, vsd);
4370 mutex_exit(&vsd_lock);
4374 * Allocate vnode local storage and set the value for key
4376 vsd->vs_value = vsd_realloc(vsd->vs_value,
4377 vsd->vs_nkeys * sizeof (void *),
4378 key * sizeof (void *));
4379 vsd->vs_nkeys = key;
4380 vsd->vs_value[key - 1] = value;
4382 return (0);
4386 * Called from vn_free() to run the destructor function for each vsd
4387 * Locks out vsd_create and vsd_destroy
4388 * Assumes that the destructor *DOES NOT* use vsd
4390 void
4391 vsd_free(vnode_t *vp)
4393 int i;
4394 struct vsd_node *vsd = vp->v_vsd;
4396 if (vsd == NULL)
4397 return;
4399 if (vsd->vs_nkeys == 0) {
4400 kmem_free(vsd, sizeof (*vsd));
4401 vp->v_vsd = NULL;
4402 return;
4406 * lock out vsd_create and vsd_destroy, call
4407 * the destructor, and mark the value as destroyed.
4409 mutex_enter(&vsd_lock);
4411 for (i = 0; i < vsd->vs_nkeys; i++) {
4412 if (vsd->vs_value[i] && vsd_destructor[i])
4413 (*vsd_destructor[i])(vsd->vs_value[i]);
4414 vsd->vs_value[i] = NULL;
4418 * remove from linked list of VSD nodes
4420 list_remove(vsd_list, vsd);
4422 mutex_exit(&vsd_lock);
4425 * free up the VSD
4427 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4428 kmem_free(vsd, sizeof (struct vsd_node));
4429 vp->v_vsd = NULL;
4433 * realloc
4435 static void *
4436 vsd_realloc(void *old, size_t osize, size_t nsize)
4438 void *new;
4440 new = kmem_zalloc(nsize, KM_SLEEP);
4441 if (old) {
4442 bcopy(old, new, osize);
4443 kmem_free(old, osize);
4445 return (new);
4449 * Setup the extensible system attribute for creating a reparse point.
4450 * The symlink data 'target' is validated for proper format of a reparse
4451 * string and a check also made to make sure the symlink data does not
4452 * point to an existing file.
4454 * return 0 if ok else -1.
4456 static int
4457 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4459 xoptattr_t *xoap;
4461 if ((!target) || (!vap) || (!xvattr))
4462 return (-1);
4464 /* validate reparse string */
4465 if (reparse_validate((const char *)target))
4466 return (-1);
4468 xva_init(xvattr);
4469 xvattr->xva_vattr = *vap;
4470 xvattr->xva_vattr.va_mask |= VATTR_XVATTR;
4471 xoap = xva_getxoptattr(xvattr);
4472 ASSERT(xoap);
4473 XVA_SET_REQ(xvattr, XAT_REPARSE);
4474 xoap->xoa_reparse = 1;
4476 return (0);
4480 * Function to check whether a symlink is a reparse point.
4481 * Return B_TRUE if it is a reparse point, else return B_FALSE
4483 boolean_t
4484 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4486 xvattr_t xvattr;
4487 xoptattr_t *xoap;
4489 if ((vp->v_type != VLNK) ||
4490 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4491 return (B_FALSE);
4493 xva_init(&xvattr);
4494 xoap = xva_getxoptattr(&xvattr);
4495 ASSERT(xoap);
4496 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4498 if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
4499 return (B_FALSE);
4501 if ((!(xvattr.xva_vattr.va_mask & VATTR_XVATTR)) ||
4502 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4503 return (B_FALSE);
4505 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);