/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2017, Joyent, Inc.
 * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/pathname.h>
#include <sys/vnode.h>
#include <sys/vnode_dispatch.h>
#include <sys/rwstlock.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/debug.h>
#include <sys/nbmlock.h>
#include <sys/fcntl.h>
#include <sys/fs_subr.h>
#include <sys/taskq.h>
#include <sys/fs_reparse.h>
/* Determine if this vnode is a file that is read-only */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))

/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;
/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 */
vopstats_t **vopstats_fstype;

/* vopstats initialization template used for fast initialization via bcopy() */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);
/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;

/* Global used for empty/invalid v_path */
char *vn_vpath_empty = "";
/*
 * forward declarations for internal vnode specific data (vsd)
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * forward declarations for reparse point functions
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
/*
 * VSD -- VNODE SPECIFIC DATA
 * The v_data pointer is typically used by a file system to store a
 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 * However, there are times when additional project private data needs
 * to be stored separately from the data (node) pointed to by v_data.
 * This additional data could be stored by the file system itself or
 * by a completely different kernel entity.  VSD provides a way for
 * callers to obtain a key and store a pointer to private data associated
 * with a vnode.
 *
 * Callers are responsible for protecting the vsd by holding v_vsd_lock
 * for calls to vsd_set() and vsd_get().
 *
 * vsd_lock protects:
 *	vsd_nkeys - creation and deletion of vsd keys
 *	vsd_list - insertion and deletion of vsd_node in the vsd_list
 *	vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	/* size of destructor array */
/* list of vsd_node's */
static list_t		*vsd_list = NULL;
/* per-key destructor funcs */
static void		(**vsd_destructor)(void *);
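/*
 * Illustrative sketch only (not part of the original file): the intended
 * VSD calling pattern, assuming the standard vsd_create()/vsd_set()/
 * vsd_get() interfaces; "my_key", "my_destructor" and "my_data" are
 * hypothetical names:
 *
 *	static uint_t my_key;
 *	vsd_create(&my_key, my_destructor);	// once, at init time
 *
 *	mutex_enter(&vp->v_vsd_lock);
 *	(void) vsd_set(vp, my_key, my_data);	// attach private data
 *	data = vsd_get(vp, my_key);		// retrieve it later
 *	mutex_exit(&vp->v_vsd_lock);
 */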
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 */
#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}

#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
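/*
 * Illustrative sketch only: inside a vnode-op wrapper with a byte payload,
 * the pattern is to snapshot the residual count, call the underlying op,
 * then account for what actually moved (direct v_op dispatch assumed):
 *
 *	ssize_t resid_start = uiop->uio_resid;
 *	err = (*vp->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
 *	VOPSTATS_UPDATE_IO(vp, read, read_bytes,
 *	    (resid_start - uiop->uio_resid));
 *
 * Ops with no byte payload use VOPSTATS_UPDATE() alone.
 */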
/*
 * If the filesystem does not support XIDs, map the credential.
 * If the vfsp is NULL, perhaps we should also map?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
}
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
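/*
 * Example of the indexing these tables rely on (the IFTOVT()/VTTOIF()
 * macros in <sys/vnode.h> encode the same thing): the file-type bits of a
 * stat(2) mode occupy the top nibble, so
 *
 *	iftovt_tab[((mode) & S_IFMT) >> 12]	maps S_IFDIR to VDIR
 *	vttoif_tab[VREG]			maps back to S_IFREG
 */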
/*
 * The system vnode cache.
 */
kmem_cache_t *vn_cache;


/* Extensible attribute (xva) routines. */

/*
 * Zero out the structure, set the size of the requested/returned bitmaps,
 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 * to the returned attributes array.
 */
void
xva_init(xvattr_t *xvap)
{
	bzero(xvap, sizeof (xvattr_t));
	xvap->xva_mapsize = XVA_MAPSIZE;
	xvap->xva_magic = XVA_MAGIC;
	xvap->xva_vattr.va_mask = AT_XVATTR;
	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
}

/*
 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 * structure.  Otherwise, returns NULL.
 */
xoptattr_t *
xva_getxoptattr(xvattr_t *xvap)
{
	xoptattr_t *xoap = NULL;

	if (xvap->xva_vattr.va_mask & AT_XVATTR)
		xoap = &xvap->xva_xoptattrs;
	return (xoap);
}
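/*
 * Hedged usage sketch (attribute request macro and field names assumed
 * from <sys/vnode.h>): a caller asking for one optional attribute might do
 *
 *	xvattr_t xva;
 *	xoptattr_t *xoap;
 *
 *	xva_init(&xva);
 *	XVA_SET_REQ(&xva, XAT_READONLY);
 *	if (fop_getattr(vp, &xva.xva_vattr, 0, cr, NULL) == 0 &&
 *	    (xoap = xva_getxoptattr(&xva)) != NULL)
 *		... inspect xoap->xoa_readonly ...
 */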
/*
 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 * kstat name.
 */
static int
vska_compar(const void *n1, const void *n2)
{
	int ret;
	ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
	ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;

	if (p1 < p2) {
		ret = -1;
	} else if (p1 > p2) {
		ret = 1;
	} else {
		ret = 0;
	}

	return (ret);
}
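/*
 * Note: avl_create() expects exactly this -1/0/+1 contract from its
 * comparator; comparing the fsids explicitly (rather than returning their
 * difference) avoids any overflow or truncation concerns.
 */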
/*
 * Used to create a single template which will be bcopy()ed to a newly
 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 */
static vopstats_t *
create_vopstats_template()
{
	vopstats_t *vsp;

	vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
	bzero(vsp, sizeof (*vsp));	/* Start fresh */

	kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
	/* fop_readdir I/O */
	kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);

	return (vsp);
}
/*
 * Creates a kstat structure associated with a vopstats structure.
 */
static kstat_t *
new_vskstat(char *ksname, vopstats_t *vsp)
{
	kstat_t *ksp;

	if (!vopstats_enabled) {
		return (NULL);
	}

	ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
	    sizeof (vopstats_t)/sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
	if (ksp) {
		ksp->ks_data = vsp;
		kstat_install(ksp);
	}

	return (ksp);
}
/*
 * Called from vfsinit() to initialize the support mechanisms for vopstats
 */
void
vopstats_startup()
{
	if (!vopstats_enabled)
		return;

	/*
	 * Creates the AVL tree which holds per-vfs vopstat anchors.  This
	 * is necessary since we need to check if a kstat exists before we
	 * attempt to create it.  Also, initialize its lock.
	 */
	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
	    offsetof(vsk_anchor_t, vsk_node));
	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);

	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
	    NULL, NULL, 0);

	/*
	 * Set up the array of pointers for the vopstats-by-FS-type.
	 * The entries will be allocated/initialized as each file system
	 * goes through modload/mod_installfs.
	 */
	vopstats_fstype = (vopstats_t **)kmem_zalloc(
	    (sizeof (vopstats_t *) * nfstype), KM_SLEEP);

	/* Set up the global vopstats initialization template */
	vs_templatep = create_vopstats_template();
}

/*
 * We need to have all of the counters zeroed.
 * The initialization of the vopstats_t includes on the order of
 * 50 calls to kstat_named_init().  Rather than do that on every call,
 * we do it once in a template (vs_templatep) then bcopy it over.
 */
void
initialize_vopstats(vopstats_t *vsp)
{
	if (vsp == NULL)
		return;

	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
}
/*
 * If possible, determine which vopstats by fstype to use and
 * return a pointer to the caller.
 */
vopstats_t *
get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
{
	int fstype = 0;	/* Index into vfssw[] */
	vopstats_t *vsp = NULL;

	if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
	    !vopstats_enabled)
		return (NULL);
	/*
	 * Set up the fstype.  We go to so much trouble because all versions
	 * of NFS use the same fstype in their vfs even though they have
	 * distinct entries in the vfssw[] table.
	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
	 */
	if (vswp) {
		fstype = vswp - vfssw;	/* Gets us the index */
	} else {
		fstype = vfsp->vfs_fstype;
	}

	/*
	 * Point to the per-fstype vopstats. The only valid values are
	 * non-zero positive values less than the number of vfssw[] table
	 * entries.
	 */
	if (fstype > 0 && fstype < nfstype) {
		vsp = vopstats_fstype[fstype];
	}

	return (vsp);
}
/*
 * Generate a kstat name, create the kstat structure, and allocate a
 * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 * to the caller.  This must only be called from a mount.
 */
vsk_anchor_t *
get_vskstat_anchor(vfs_t *vfsp)
{
	char kstatstr[KSTAT_STRLEN];	/* kstat name for vopstats */
	statvfs64_t statvfsbuf;		/* Needed to find f_fsid */
	vsk_anchor_t *vskp = NULL;	/* vfs <--> kstat anchor */
	kstat_t *ksp;			/* Ptr to new kstat */
	avl_index_t where;		/* Location in the AVL tree */

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return (NULL);

	/* Need to get the fsid to build a kstat name */
	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
		/* Create a name for our kstats based on fsid */
		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
		    VOPSTATS_STR, statvfsbuf.f_fsid);

		/* Allocate and initialize the vsk_anchor_t */
		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
		bzero(vskp, sizeof (*vskp));
		vskp->vsk_fsid = statvfsbuf.f_fsid;

		mutex_enter(&vskstat_tree_lock);
		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
			avl_insert(&vskstat_tree, vskp, where);
			mutex_exit(&vskstat_tree_lock);

			/*
			 * Now that we've got the anchor in the AVL
			 * tree, we can create the kstat.
			 */
			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
			if (ksp) {
				vskp->vsk_ksp = ksp;
			}
		} else {
			/* Oops, found one! Release memory and lock. */
			mutex_exit(&vskstat_tree_lock);
			kmem_cache_free(vsk_anchor_cache, vskp);
			vskp = NULL;
		}
	}
	return (vskp);
}
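/*
 * Observability note (inferred from the naming above): the kstat lands in
 * module "unix" with a name of VOPSTATS_STR followed by the fsid in hex,
 * so a given mount's per-mount stats should be readable from userland
 * with something like: kstat -m unix -n vopstats_<fsid>.
 */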
/*
 * We're in the process of tearing down the vfs and need to cleanup
 * the data structures associated with the vopstats.  Must only be called
 * from an unmount.
 */
void
teardown_vopstats(vfs_t *vfsp)
{
	vsk_anchor_t	*vskap;
	avl_index_t	where;

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return;

	/* This is a safe check since VFS_STATS must be set (see above) */
	if ((vskap = vfsp->vfs_vskap) == NULL)
		return;

	/* Whack the pointer right away */
	vfsp->vfs_vskap = NULL;

	/* Lock the tree, remove the node, and delete the kstat */
	mutex_enter(&vskstat_tree_lock);
	if (avl_find(&vskstat_tree, vskap, &where)) {
		avl_remove(&vskstat_tree, vskap);
	}

	if (vskap->vsk_ksp) {
		kstat_delete(vskap->vsk_ksp);
	}
	mutex_exit(&vskstat_tree_lock);

	kmem_cache_free(vsk_anchor_cache, vskap);
}
/*
 * Read or write a vnode.  Called from kernel code.
 */
int
vn_rdwr(
	enum uio_rw rw,
	struct vnode *vp,
	caddr_t base,
	ssize_t len,
	offset_t offset,
	enum uio_seg seg,
	int ioflag,
	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp)
{
	struct uio uio;
	struct iovec iov;
	int error;
	int in_crit = 0;

	if (rw == UIO_WRITE && ISROFILE(vp))
		return (EROFS);

	if (len < 0)
		return (EIO);

	VOPXID_MAP_CR(vp, cr);

	iov.iov_base = base;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = offset;
	uio.uio_segflg = (short)seg;
	uio.uio_resid = len;
	uio.uio_llimit = ulimit;

	/*
	 * We have to enter the critical region before calling fop_rwlock
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, cr, &svmand);
		if (error != 0)
			goto done;
		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	(void) fop_rwlock(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (rw == UIO_WRITE) {
		uio.uio_fmode = FWRITE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		error = fop_write(vp, &uio, ioflag, cr, NULL);
	} else {
		uio.uio_fmode = FREAD;
		uio.uio_extflg = UIO_COPY_CACHED;
		error = fop_read(vp, &uio, ioflag, cr, NULL);
	}
	fop_rwunlock(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);

	if (residp)
		*residp = uio.uio_resid;
	else if (uio.uio_resid)
		error = EIO;

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
/*
 * Release a vnode.  Call fop_inactive on last reference or
 * decrement reference count.
 *
 * To avoid race conditions, the v_count is left at 1 for
 * the call to fop_inactive. This prevents another thread
 * from reclaiming and releasing the vnode *before* the
 * fop_inactive routine has a chance to destroy the vnode.
 * We can't have more than 1 thread calling fop_inactive
 * on a vnode.
 */
void
vn_rele(vnode_t *vp)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		fop_inactive(vp, CRED(), NULL);
		return;
	}
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}
/*
 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 * as a single reference, so v_count is not decremented until the last DNLC hold
 * is released. This makes it possible to distinguish vnodes that are referenced
 * only by the DNLC.
 */
void
vn_rele_dnlc(vnode_t *vp)
{
	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
	mutex_enter(&vp->v_lock);
	if (--vp->v_count_dnlc == 0) {
		if (vp->v_count == 1) {
			mutex_exit(&vp->v_lock);
			fop_inactive(vp, CRED(), NULL);
			return;
		}
		vp->v_count--;
	}
	mutex_exit(&vp->v_lock);
}
/*
 * Like vn_rele() except that it clears v_stream under v_lock.
 * This is used by sockfs when it dismantles the association between
 * the sockfs node and the vnode in the underlying file system.
 * v_lock has to be held to prevent a thread coming through the lookupname
 * path from accessing a stream head that is going away.
 */
void
vn_rele_stream(vnode_t *vp)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	vp->v_stream = NULL;
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		fop_inactive(vp, CRED(), NULL);
		return;
	}
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}
static void
vn_rele_inactive(vnode_t *vp)
{
	fop_inactive(vp, CRED(), NULL);
}
/*
 * Like vn_rele() except if we are going to call fop_inactive() then do it
 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 * the file system as a result of releasing the vnode. Note, file systems
 * already have to handle the race where the vnode is incremented before the
 * inactive routine is called and does its locking.
 *
 * Warning: Excessive use of this routine can lead to performance problems.
 * This is because taskqs throttle back allocation if too many are created.
 */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != (uintptr_t)NULL);
		return;
	}
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}
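/*
 * Hedged example: a file system whose inactive path must not re-enter the
 * file system itself could keep a private taskq and release vnodes via
 * vn_rele_async(vp, my_fs_taskq) (my_fs_taskq being hypothetical),
 * accepting the taskq-throttling caveat noted above.
 */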
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}
/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 */
int
vn_openat(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask,
	struct vnode *startvp,
	int fd)
{
	struct vnode *vp;
	int mode;
	int accessflags;
	int error;
	int in_crit = 0;
	int open_done = 0;
	int shrlock_done = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	struct shr shr;
	struct shr_locowner shr_own;

	if (filemode & FSEARCH)
		filemode |= FDIRECTORY;

	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

top:
	if (filemode & FCREAT && !(filemode & FDIRECTORY)) {
		enum vcexcl excl;

		/* Wish to create a file. */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			excl = EXCL;
		else
			excl = NONEXCL;

		if (error =
		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
			return (error);
	} else {
		/* Wish to open a file.  Just look it up. */
		if (error = lookupnameat(pnamep, seg, follow,
		    NULLVPP, &vp, startvp)) {
			if ((error == ESTALE) &&
			    fs_need_estale_retry(estale_retry++))
				goto top;
			return (error);
		}

		/*
		 * Get the attributes to check whether file is large.
		 * We do this only if the FOFFMAX flag is not set and
		 * only for regular files.
		 */
		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = fop_getattr(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if (vattr.va_size > (uoff_t)MAXOFF32_T) {
				/*
				 * Large File API - regular open fails
				 * if FOFFMAX flag is set in file mode
				 */
				error = EOVERFLOW;
				goto out;
			}
		}
		/*
		 * Can't write directories, active texts, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			/*
			 * Allow writable directory if VDIROPEN flag is set.
			 */
			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
				error = EISDIR;
				goto out;
			}
			if (ISROFILE(vp)) {
				error = EROFS;
				goto out;
			}
			/*
			 * Can't truncate files on which
			 * sysv mandatory locking is in effect.
			 */
			if (filemode & FTRUNC) {
				vnode_t *rvp;

				if (fop_realvp(vp, &rvp, NULL) != 0)
					rvp = vp;
				if (rvp->v_filocks != NULL) {
					vattr.va_mask = AT_MODE;
					if ((error = fop_getattr(vp,
					    &vattr, 0, CRED(), NULL)) == 0 &&
					    MANDLOCK(vp, vattr.va_mode))
						error = EAGAIN;
				}
			}
			if (error)
				goto out;
		}
		/*
		 * Check permissions.
		 */
		if (error = fop_access(vp, mode, accessflags, CRED(), NULL))
			goto out;
		/*
		 * Require FDIRECTORY to return a directory.
		 * Require FEXEC to return a regular file.
		 */
		if ((filemode & FDIRECTORY) && vp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		}
		if ((filemode & FEXEC) && vp->v_type != VREG) {
			error = ENOEXEC;	/* XXX: error code? */
			goto out;
		}
	}

	/*
	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
	 */
	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
		error = ELOOP;
		goto out;
	}
	if (filemode & FNOLINKS) {
		vattr.va_mask = AT_NLINK;
		if ((error = fop_getattr(vp, &vattr, 0, CRED(), NULL))) {
			goto out;
		}
		if (vattr.va_nlink != 1) {
			error = EMLINK;
			goto out;
		}
	}

	/*
	 * Opening a socket corresponding to the AF_UNIX pathname
	 * in the filesystem name space is not supported.
	 * However, VSOCK nodes in namefs are supported in order
	 * to make fattach work for sockets.
	 *
	 * XXX This uses fop_realvp to distinguish between
	 * an unopened namefs node (where fop_realvp returns a
	 * different VSOCK vnode) and a VSOCK created by vn_create
	 * in some file system (where fop_realvp would never return
	 * a different vnode).
	 */
	if (vp->v_type == VSOCK) {
		struct vnode *nvp;

		error = fop_realvp(vp, &nvp, NULL);
		if (error != 0 || nvp == NULL || nvp == vp ||
		    nvp->v_type != VSOCK) {
			error = EOPNOTSUPP;
			goto out;
		}
	}

	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
		/* get share reservation */
		shr.s_access = 0;
		if (filemode & FWRITE)
			shr.s_access |= F_WRACC;
		if (filemode & FREAD)
			shr.s_access |= F_RDACC;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = ttoproc(curthread)->p_pid;
		shr_own.sl_pid = shr.s_pid;
		shr_own.sl_id = fd;
		shr.s_own_len = sizeof (shr_own);
		shr.s_owner = (caddr_t)&shr_own;
		error = fop_shrlock(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
		    NULL);
		if (error)
			goto out;
		shrlock_done = 1;

		/* nbmand conflict check if truncating file */
		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;

			vattr.va_mask = AT_SIZE;
			if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
				goto out;
			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
			    NULL)) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Do opening protocol.
	 */
	error = fop_open(&vp, filemode, CRED(), NULL);
	if (error)
		goto out;
	open_done = 1;

	/*
	 * Truncate if required.
	 */
	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
		vattr.va_size = 0;
		vattr.va_mask = AT_SIZE;
		if ((error = fop_setattr(vp, &vattr, 0, CRED(), NULL)) != 0)
			goto out;
	}
out:
	ASSERT(vp->v_count > 0);

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (error) {
		if (open_done) {
			(void) fop_close(vp, filemode, 1, 0, CRED(),
			    NULL);
			open_done = 0;
			shrlock_done = 0;
		}
		if (shrlock_done) {
			(void) fop_shrlock(vp, F_UNSHARE, &shr, 0, CRED(),
			    NULL);
			shrlock_done = 0;
		}

		/*
		 * The following clause was added to handle a problem
		 * with NFS consistency.  It is possible that a lookup
		 * of the file to be opened succeeded, but the file
		 * itself doesn't actually exist on the server.  This
		 * is chiefly due to the DNLC containing an entry for
		 * the file which has been removed on the server.  In
		 * this case, we just start over.  If there was some
		 * other cause for the ESTALE error, then the lookup
		 * of the file will fail and the error will be returned
		 * above instead of looping around from here.
		 */
		VN_RELE(vp);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
	} else
		*vpp = vp;
	return (error);
}
/*
 * The following two accessor functions are for the NFSv4 server.  Since there
 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
 * vnode open counts correct when a client "upgrades" an open or does an
 * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
 * open mode (add or subtract read or write), but also change the share/deny
 * modes.  However, share reservations are not integrated with OPEN, yet, so
 * we need to handle each separately.  These functions are cleaner than having
 * the NFS server manipulate the counts directly, however, nobody else should
 * use these functions.
 */
void
vn_open_upgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD)
		atomic_inc_32(&vp->v_rdcnt);
	if (filemode & FWRITE)
		atomic_inc_32(&vp->v_wrcnt);
}

void
vn_open_downgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD) {
		ASSERT(vp->v_rdcnt > 0);
		atomic_dec_32(&vp->v_rdcnt);
	}
	if (filemode & FWRITE) {
		ASSERT(vp->v_wrcnt > 0);
		atomic_dec_32(&vp->v_wrcnt);
	}
}
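/*
 * Hedged example: when an NFSv4 client upgrades a read-only open to
 * read/write, the server would call vn_open_upgrade(vp, FWRITE) and later
 * balance it with vn_open_downgrade(vp, FWRITE), keeping v_rdcnt and
 * v_wrcnt consistent with the real open modes.
 */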
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}
/*
 * Create a vnode (makenode).
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is a directory, call the lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	if (auditing)
		audit_vncreate_start();
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {
		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = fop_getsecattr(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * fop_getsecattr() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *	it is being open with write access &&
	 *	the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (fop_realvp(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = AT_MODE|AT_SIZE;
			if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & AT_SIZE) && in_crit) {
				uoff_t offset;
				ssize_t length;

				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply fop_create to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The call to fop_create() is necessary to ensure
		 * that the appropriate permission checks are made,
		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
		 * exists since we are in the else condition where this
		 * was checked.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = fop_create(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created a
			 * new reference on a new vnode (*vpp) in the child
			 * file system, so we want to drop our reference on
			 * the old (vp) upon exit.
			 */
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = fop_getattr(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (uoff_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to fop_mkdir().  fop_create()
			 * will already get it via "flag"
			 */
			error = fop_mkdir(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = fop_create(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (auditing)
		audit_vncreate_finish(*vpp, error);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}

int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	struct vattr vattr;
	dev_t fsid;
	int error;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;

	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = fop_getattr(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = fop_getattr(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}

	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = fop_link(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}

int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = fop_getattr(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = fop_getattr(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = fop_rename(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Remove a file or directory.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}

int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling fop_remove.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = fop_rmdir(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = fop_remove(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
/*
 * Utility function to compare equality of vnodes.
 * Compare the underlying real vnodes, if there are underlying vnodes.
 * This is a more thorough comparison than the VN_CMP() macro provides.
 */
int
vn_compare(vnode_t *vp1, vnode_t *vp2)
{
	vnode_t *realvp;

	if (vp1 != NULL && fop_realvp(vp1, &realvp, NULL) == 0)
		vp1 = realvp;
	if (vp2 != NULL && fop_realvp(vp2, &realvp, NULL) == 0)
		vp2 = realvp;
	return (VN_CMP(vp1, vp2));
}
/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 */
#define	NUM_BUCKETS	1023

struct vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1.
 */

#pragma align 64(vn_vfslocks_buckets)
static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];

#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
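/*
 * Hedged note on the hash above: vnode and vfs structures are at least
 * cache-line aligned, so the low-order address bits carry little entropy;
 * shifting right by VN_VFSLOCKS_SHIFT before masking with NUM_BUCKETS
 * spreads allocations across all 1024 padded buckets, e.g.
 *
 *	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vp)];
 */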
/*
 * vn_vfslocks_getlock() uses a hash scheme to generate an
 * rwstlock using the vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * hash table which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed at a later
 * stage when the refcount drops to zero.
 */
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (bp->vb_list == vep)
					bp->vb_list = vep->ve_next;
				else {
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
/*
 * vn_vfswlock_wait is used to implement a lock which is logically a writers
 * lock protecting the v_vfsmountedhere field.
 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
 * except that it blocks to acquire the lock VVFSLOCK.
 *
 * traverse() and routines re-implementing part of traverse (e.g. autofs)
 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
 * need the non-blocking version of the writers lock i.e. vn_vfswlock
 */
int
vn_vfswlock_wait(vnode_t *vp)
{
	int retval;
	vn_vfslocks_entry_t *vpvfsentry;
	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);

	if (retval == EINTR) {
		vn_vfslocks_rele(vpvfsentry);
		return (EINTR);
	}
	return (retval);
}

int
vn_vfsrlock_wait(vnode_t *vp)
{
	int retval;
	vn_vfslocks_entry_t *vpvfsentry;
	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);

	if (retval == EINTR) {
		vn_vfslocks_rele(vpvfsentry);
		return (EINTR);
	}

	return (retval);
}
/*
 * vn_vfswlock is used to implement a lock which is logically a writers lock
 * protecting the v_vfsmountedhere field.
 */
int
vn_vfswlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * If vp is NULL then somebody is trying to lock the covered vnode
	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
	 * only happen when unmounting /.  Since that operation will fail
	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
	 */
	if (vp == NULL)
		return (EBUSY);

	vpvfsentry = vn_vfslocks_getlock(vp);

	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
		return (0);

	vn_vfslocks_rele(vpvfsentry);
	return (EBUSY);
}

int
vn_vfsrlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * If vp is NULL then somebody is trying to lock the covered vnode
	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
	 * only happen when unmounting /.  Since that operation will fail
	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
	 */
	if (vp == NULL)
		return (EBUSY);

	vpvfsentry = vn_vfslocks_getlock(vp);

	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
		return (0);

	vn_vfslocks_rele(vpvfsentry);
	return (EBUSY);
}
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release reference after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
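/*
 * Hedged usage sketch of the pairing implied above: a caller serializing
 * with mounts over vp would do
 *
 *	if (vn_vfswlock(vp) == 0) {
 *		... examine or update vp->v_vfsmountedhere ...
 *		vn_vfsunlock(vp);
 *	}
 *
 * vn_vfsunlock() must drop both the rwstlock and the hash-entry hold
 * taken by the lock routine, hence its two vn_vfslocks_rele() calls.
 */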
int
vn_vfswlock_held(vnode_t *vp)
{
	int held;
	vn_vfslocks_entry_t *vpvfsentry;

	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);

	vn_vfslocks_rele(vpvfsentry);
	return (held);
}
/* ARGSUSED */
static int
vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	vnode_t *vp;

	vp = buf;

	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
	vp->v_path = vn_vpath_empty;
	vp->v_path_stamp = 0;
	vp->v_mpssdata = NULL;
	vp->v_vsd = NULL;
	vp->v_fopdata = NULL;

	vmobject_init(&vp->v_object, vp);

	return (0);
}

/* ARGSUSED */
static void
vn_cache_destructor(void *buf, void *cdrarg)
{
	vnode_t *vp;

	vp = buf;

	vmobject_fini(&vp->v_object);

	rw_destroy(&vp->v_nbllock);
	cv_destroy(&vp->v_cv);
	mutex_destroy(&vp->v_vsd_lock);
	mutex_destroy(&vp->v_lock);
}

void
vn_create_cache(void)
{
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}

void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
/*
 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
 * cached by the file system and vnodes remain associated.
 */
void
vn_recycle(vnode_t *vp)
{
	ASSERT(!vn_has_cached_data(vp));
	VERIFY(vp->v_path != NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}
	vp->v_path_stamp = 0;

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
}
/*
 * Used to reset the vnode fields including those that are directly accessible
 * as well as those which require an accessor function.
 *
 * Does not initialize:
 *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
 *	v_data (since FS-nodes and vnodes point to each other and should
 *		be updated simultaneously)
 *	v_op (in case someone needs to make a VOP call on this object)
 */
void
vn_reinit(vnode_t *vp)
{
	vp->v_count = 1;
	vp->v_count_dnlc = 0;
	vp->v_vfsp = NULL;
	vp->v_stream = NULL;
	vp->v_vfsmountedhere = NULL;
	vp->v_flag = 0;
	vp->v_type = VNON;
	vp->v_rdev = NODEV;

	vp->v_filocks = NULL;
	vp->v_shrlocks = NULL;
	VERIFY(!vn_has_cached_data(vp));

	vp->v_locality = NULL;
	vp->v_xattrdir = NULL;

	/*
	 * In a few specific instances, vn_reinit() is used to initialize
	 * locally defined vnode_t instances.  Lacking the construction offered
	 * by vn_alloc(), these vnodes require v_path initialization.
	 */
	if (vp->v_path == NULL) {
		vp->v_path = vn_vpath_empty;
	}

	/* Handles v_femhead, v_path, and the r/w/map counts */
	vn_recycle(vp);
}
vnode_t *
vn_alloc(int kmflag)
{
	vnode_t *vp;

	vp = kmem_cache_alloc(vn_cache, kmflag);

	if (vp != NULL) {
		vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
		vp->v_fopdata = NULL;
		vn_reinit(vp);
	}

	return (vp);
}

void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	VERIFY(vp->v_path != NULL);
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
/*
 * vnode status changes, should define better states than 1, 0.
 */
void
vn_reclaim(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
}

void
vn_idle(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
}

void
vn_exists(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
}

void
vn_invalid(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
}
2348 vnevent_support(vnode_t
*vp
, caller_context_t
*ct
)
2353 return (fop_vnevent(vp
, VE_SUPPORT
, NULL
, NULL
, ct
));
2357 vnevent_rename_src(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2359 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2362 (void) fop_vnevent(vp
, VE_RENAME_SRC
, dvp
, name
, ct
);
2366 vnevent_rename_dest(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2367 caller_context_t
*ct
)
2369 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2372 (void) fop_vnevent(vp
, VE_RENAME_DEST
, dvp
, name
, ct
);
2376 vnevent_rename_dest_dir(vnode_t
*vp
, caller_context_t
*ct
)
2378 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2381 (void) fop_vnevent(vp
, VE_RENAME_DEST_DIR
, NULL
, NULL
, ct
);
2385 vnevent_remove(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2387 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2390 (void) fop_vnevent(vp
, VE_REMOVE
, dvp
, name
, ct
);
2394 vnevent_rmdir(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2396 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2399 (void) fop_vnevent(vp
, VE_RMDIR
, dvp
, name
, ct
);
2403 vnevent_pre_rename_src(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2404 caller_context_t
*ct
)
2406 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2409 (void) fop_vnevent(vp
, VE_PRE_RENAME_SRC
, dvp
, name
, ct
);
2413 vnevent_pre_rename_dest(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2414 caller_context_t
*ct
)
2416 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2419 (void) fop_vnevent(vp
, VE_PRE_RENAME_DEST
, dvp
, name
, ct
);
2423 vnevent_pre_rename_dest_dir(vnode_t
*vp
, vnode_t
*nvp
, char *name
,
2424 caller_context_t
*ct
)
2426 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2429 (void) fop_vnevent(vp
, VE_PRE_RENAME_DEST_DIR
, nvp
, name
, ct
);
2433 vnevent_create(vnode_t
*vp
, caller_context_t
*ct
)
2435 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2438 (void) fop_vnevent(vp
, VE_CREATE
, NULL
, NULL
, ct
);
2442 vnevent_link(vnode_t
*vp
, caller_context_t
*ct
)
2444 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2447 (void) fop_vnevent(vp
, VE_LINK
, NULL
, NULL
, ct
);
2451 vnevent_mountedover(vnode_t
*vp
, caller_context_t
*ct
)
2453 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2456 (void) fop_vnevent(vp
, VE_MOUNTEDOVER
, NULL
, NULL
, ct
);
2460 vnevent_truncate(vnode_t
*vp
, caller_context_t
*ct
)
2462 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2465 (void) fop_vnevent(vp
, VE_TRUNCATE
, NULL
, NULL
, ct
);
int
vn_is_readonly(vnode_t *vp)
{
	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
}

int
vn_has_flocks(vnode_t *vp)
{
	return (vp->v_filocks != NULL);
}

int
vn_has_mandatory_locks(vnode_t *vp, int mode)
{
	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
}

int
vn_has_cached_data(vnode_t *vp)
{
	return (!list_is_empty(&vp->v_object.list));
}
/*
 * Return 0 if the vnode in question shouldn't be permitted into a zone via
 * zone_enter(2).
 */
int
vn_can_change_zones(vnode_t *vp)
{
	struct vfssw *vswp;
	int allow = 1;
	vnode_t *rvp;

	if (nfs_global_client_only != 0)
		return (1);

	/*
	 * We always want to look at the underlying vnode if there is one.
	 */
	if (fop_realvp(vp, &rvp, NULL) != 0)
		rvp = vp;
	/*
	 * Some pseudo filesystems (including doorfs) don't actually register
	 * their vfsops_t, so the following may return NULL; we happily let
	 * such vnodes switch zones.
	 */
	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
	if (vswp != NULL) {
		if (vswp->vsw_flag & VSW_NOTZONESAFE)
			allow = 0;
		vfs_unrefvfssw(vswp);
	}
	return (allow);
}
/*
 * Return nonzero if the vnode is a mount point, zero if not.
 */
int
vn_ismntpt(vnode_t *vp)
{
	return (vp->v_vfsmountedhere != NULL);
}

/* Retrieve the vfs (if any) mounted on this vnode */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	return (vp->v_vfsmountedhere);
}

/*
 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
 */
int
vn_in_dnlc(vnode_t *vp)
{
	return (vp->v_count_dnlc > 0);
}
/*
 * vn_has_other_opens() checks whether a particular file is opened by more than
 * just the caller and whether the open is for read and/or write.
 * This routine is for calling after the caller has already called fop_open()
 * and the caller wishes to know if they are the only one with it open for
 * the mode(s) specified.
 *
 * Vnode counts are only kept on regular files (v_type=VREG).
 */
int
vn_has_other_opens(
	vnode_t *vp,
	v_mode_t mode)
{
	ASSERT(vp != NULL);

	switch (mode) {
	case V_WRITE:
		if (vp->v_wrcnt > 1)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_rdcnt > 1)
			return (V_TRUE);
		break;
	}

	return (V_FALSE);
}
2594 * vn_is_opened() checks whether a particular file is opened and
2595 * whether the open is for read and/or write.
2597 * Vnode counts are only kept on regular files (v_type=VREG).
2613 if (vp
->v_rdcnt
&& vp
->v_wrcnt
)
2617 if (vp
->v_rdcnt
|| vp
->v_wrcnt
)
2630 * vn_is_mapped() checks whether a particular file is mapped and whether
2631 * the file is mapped read and/or write.
2644 * The atomic_add_64_nv functions force atomicity in the
2645 * case of 32 bit architectures. Otherwise the 64 bit values
2646 * require two fetches. The value of the fields may be
2647 * (potentially) changed between the first fetch and the
2651 if (atomic_add_64_nv((&(vp
->v_mmap_write
)), 0))
2655 if ((atomic_add_64_nv((&(vp
->v_mmap_read
)), 0)) &&
2656 (atomic_add_64_nv((&(vp
->v_mmap_write
)), 0)))
2660 if ((atomic_add_64_nv((&(vp
->v_mmap_read
)), 0)) ||
2661 (atomic_add_64_nv((&(vp
->v_mmap_write
)), 0)))
2665 if (atomic_add_64_nv((&(vp
->v_mmap_read
)), 0))
2672 if (vp
->v_mmap_write
)
2676 if (vp
->v_mmap_read
&& vp
->v_mmap_write
)
2680 if (vp
->v_mmap_read
|| vp
->v_mmap_write
)
2684 if (vp
->v_mmap_read
)
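/*
 * Illustrative sketch (an assumption, not from this file): a
 * delegation-style check would combine the predicates above, e.g. before
 * handing out a write delegation one could require that no other opens or
 * mappings exist:
 *
 *	if (!vn_has_other_opens(vp, V_RDORWR) &&
 *	    !vn_is_mapped(vp, V_RDORWR)) {
 *		... safe to grant ...
 *	}
 *
 * Note these are point-in-time snapshots; callers needing a stable answer
 * must prevent new opens/mappings for the duration of the decision.
 */
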
/*
 * Set the operations vector for a vnode.
 */
void
vn_setops(struct vnode *vnode, const struct vnodeops *ops)
{
	vnode->v_op = ops;
}

/*
 * Retrieve the operations vector for a vnode
 */
const struct vnodeops *
vn_getops(struct vnode *vnode)
{
	return (vnode->v_op);
}

/*
 * Returns non-zero (1) if the vnodeops matches that of the vnode.
 * Returns zero (0) if not.
 */
int
vn_matchops(struct vnode *vp, const struct vnodeops *vnodeops)
{
	return (vn_getops(vp) == vnodeops);
}

/*
 * fs_new_caller_id() needs to return a unique ID on a given local system.
 * The IDs do not need to survive across reboots.  These are primarily
 * used so that (FEM) monitors can detect particular callers (such as
 * the NFS server) to a given vnode/vfs operation.
 */
u_longlong_t
fs_new_caller_id()
{
	static uint64_t next_caller_id = 0LL; /* First call returns 1 */

	return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
}

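/*
 * Illustrative sketch (hypothetical, not from this file): a FEM monitor
 * wishing to recognize its own re-entrant calls could stamp a
 * caller_context_t once with a private ID and compare on later entries:
 *
 *	static u_longlong_t my_caller_id;   (assigned once from
 *					     fs_new_caller_id())
 *
 *	caller_context_t ct = { 0 };
 *	ct.cc_caller_id = my_caller_id;
 *	... pass &ct to fop_*() so the monitor can detect this caller ...
 */
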
/*
 * The value stored in v_path is relative to rootdir, located in the global
 * zone.  Zones or chroot environments which reside deeper inside the VFS
 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
 * what lies below their perceived root.  In order to keep v_path usable for
 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
 *
 * An upper bound of max_vnode_path is placed upon v_path allocations to
 * prevent the system from going too wild at the behest of pathological
 * behavior from the operator.
 */
size_t max_vnode_path = 4 * MAXPATHLEN;

static void
vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
{
	char *buf;

	mutex_enter(&vp->v_lock);
	/*
	 * If the snapshot of v_path_stamp passed in via compare_stamp does not
	 * match the present value on the vnode, it indicates that subsequent
	 * changes have occurred.  The v_path value is not cleared in this case
	 * since the new value may be valid.
	 */
	if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
		mutex_exit(&vp->v_lock);
		return;
	}
	buf = vp->v_path;
	vp->v_path = vn_vpath_empty;
	vp->v_path_stamp = 0;
	mutex_exit(&vp->v_lock);
	if (buf != vn_vpath_empty) {
		kmem_free(buf, strlen(buf) + 1);
	}
}

static void
vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
    boolean_t is_rename)
{
	char *buf, *oldbuf;
	hrtime_t pstamp;
	size_t baselen, buflen = 0;

	/* Handle the vn_setpath_str case. */
	if (pvp == NULL) {
		if (len + 1 > max_vnode_path) {
			DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
			    vnode_t *, vp, char *, name, size_t, len + 1);
			return;
		}
		buf = kmem_alloc(len + 1, KM_SLEEP);
		bcopy(name, buf, len);
		buf[len] = '\0';

		mutex_enter(&vp->v_lock);
		oldbuf = vp->v_path;
		vp->v_path = buf;
		vp->v_path_stamp = gethrtime();
		mutex_exit(&vp->v_lock);
		if (oldbuf != vn_vpath_empty) {
			kmem_free(oldbuf, strlen(oldbuf) + 1);
		}
		return;
	}

	/* Take snapshot of parent dir */
	mutex_enter(&pvp->v_lock);

	if ((pvp->v_flag & VTRAVERSE) != 0) {
		/*
		 * When the parent vnode has VTRAVERSE set in its flags, normal
		 * assumptions about v_path calculation no longer apply.  The
		 * primary situation where this occurs is via the VFS tricks
		 * which procfs plays in order to allow /proc/PID/(root|cwd) to
		 * yield meaningful results.
		 *
		 * When this flag is set, v_path on the child must not be
		 * updated since the calculated value is likely to be
		 * incorrect, given the current context.
		 */
		mutex_exit(&pvp->v_lock);
		return;
	}

retrybuf:
	if (pvp->v_path == vn_vpath_empty) {
		/*
		 * Without v_path from the parent directory, generating a child
		 * path from the name is impossible.
		 */
		if (len > 0) {
			pstamp = pvp->v_path_stamp;
			mutex_exit(&pvp->v_lock);
			vn_clearpath(vp, pstamp);
			return;
		}

		/*
		 * The only feasible case here is where a NUL lookup is being
		 * performed on rootdir prior to its v_path being populated.
		 */
		ASSERT(pvp->v_path_stamp == 0);
		baselen = 0;
		pstamp = 0;
	} else {
		pstamp = pvp->v_path_stamp;
		baselen = strlen(pvp->v_path);
		/* ignore a trailing slash if present */
		if (pvp->v_path[baselen - 1] == '/') {
			/* This should only the be case for rootdir */
			ASSERT(baselen == 1 && pvp == rootdir);
			baselen--;
		}
	}
	mutex_exit(&pvp->v_lock);

	if (buflen != 0) {
		/* Free the existing (mis-sized) buffer in case of retry */
		kmem_free(buf, buflen);
	}
	/* base, '/', name and trailing NUL */
	buflen = baselen + len + 2;
	if (buflen > max_vnode_path) {
		DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
		    vnode_t *, vp, char *, name, size_t, buflen);
		return;
	}
	buf = kmem_alloc(buflen, KM_SLEEP);

	mutex_enter(&pvp->v_lock);
	if (pvp->v_path_stamp != pstamp) {
		size_t vlen;

		/*
		 * Since v_path_stamp changed on the parent, it is likely that
		 * v_path has been altered as well.  If the length does not
		 * exactly match what was previously measured, the buffer
		 * allocation must be repeated for proper sizing.
		 */
		if (pvp->v_path == vn_vpath_empty) {
			/* Give up if parent lacks v_path */
			mutex_exit(&pvp->v_lock);
			kmem_free(buf, buflen);
			return;
		}
		vlen = strlen(pvp->v_path);
		if (pvp->v_path[vlen - 1] == '/') {
			vlen--;
		}
		if (vlen != baselen) {
			goto retrybuf;
		}
	}
	bcopy(pvp->v_path, buf, baselen);
	mutex_exit(&pvp->v_lock);

	buf[baselen] = '/';
	baselen++;
	bcopy(name, &buf[baselen], len + 1);

	mutex_enter(&vp->v_lock);
	if (vp->v_path_stamp == 0) {
		/* never-visited vnode can inherit stamp from parent */
		ASSERT(vp->v_path == vn_vpath_empty);
		vp->v_path_stamp = pstamp;
		vp->v_path = buf;
		mutex_exit(&vp->v_lock);
	} else if (vp->v_path_stamp < pstamp || is_rename) {
		/*
		 * Install the updated path and stamp, ensuring that the v_path
		 * pointer is valid at all times for dtrace.
		 */
		oldbuf = vp->v_path;
		vp->v_path = buf;
		vp->v_path_stamp = gethrtime();
		mutex_exit(&vp->v_lock);
		kmem_free(oldbuf, strlen(oldbuf) + 1);
	} else {
		/*
		 * If the timestamp matches or is greater, it means another
		 * thread performed the update first while locks were dropped
		 * here to make the allocation.  We defer to the newer value.
		 */
		mutex_exit(&vp->v_lock);
		kmem_free(buf, buflen);
	}
	ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
}

void
vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
{
	size_t len;

	/*
	 * If the parent is older or empty, there's nothing further to do.
	 */
	if (pvp->v_path == vn_vpath_empty ||
	    pvp->v_path_stamp <= vp->v_path_stamp) {
		return;
	}

	/*
	 * Given the lack of appropriate context, meaningful updates to v_path
	 * cannot be made during lookups for the '.' or '..' entries.
	 */
	len = strlen(name);
	if (len == 0 || (len == 1 && name[0] == '.') ||
	    (len == 2 && name[0] == '.' && name[1] == '.')) {
		return;
	}

	vn_setpath_common(pvp, vp, name, len, B_FALSE);
}

/*
 * Given a starting vnode and a path, updates the path in the target vnode in
 * a safe manner.  If the vnode already has path information embedded, then the
 * cached path is left untouched.
 */
/* ARGSUSED */
void
vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
    size_t len)
{
	vn_setpath_common(pvp, vp, name, len, B_FALSE);
}

/*
 * Sets the path to the vnode to be the given string, regardless of current
 * context.  The string must be a complete path from rootdir.  This is only
 * used by fsop_root() for setting the path based on the mountpoint.
 */
void
vn_setpath_str(vnode_t *vp, const char *str, size_t len)
{
	vn_setpath_common(NULL, vp, str, len, B_FALSE);
}

/*
 * Called from within filesystem's vop_rename() to handle renames once the
 * target vnode is available.
 */
void
vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
{
	vn_setpath_common(pvp, vp, name, len, B_TRUE);
}

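/*
 * Illustrative sketch (an assumption about a typical caller, not from this
 * file): a filesystem's vop_rename() would invoke vn_renamepath() once the
 * renamed vnode has been resolved, so the cached v_path tracks the new name:
 *
 *	... inside a hypothetical myfs_rename(), after the rename succeeds:
 *	vn_renamepath(tdvp, renamed_vp, tnm, strlen(tnm));
 */
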
/*
 * Similar to vn_setpath_str(), this function sets the path of the destination
 * vnode to be the same as the source vnode.
 */
void
vn_copypath(struct vnode *src, struct vnode *dst)
{
	char *buf;
	hrtime_t stamp;
	size_t buflen;

	mutex_enter(&src->v_lock);
	if (src->v_path == vn_vpath_empty) {
		mutex_exit(&src->v_lock);
		return;
	}
	buflen = strlen(src->v_path) + 1;
	mutex_exit(&src->v_lock);

	buf = kmem_alloc(buflen, KM_SLEEP);

	mutex_enter(&src->v_lock);
	if (src->v_path == vn_vpath_empty ||
	    strlen(src->v_path) + 1 != buflen) {
		mutex_exit(&src->v_lock);
		kmem_free(buf, buflen);
		return;
	}
	bcopy(src->v_path, buf, buflen);
	stamp = src->v_path_stamp;
	mutex_exit(&src->v_lock);

	mutex_enter(&dst->v_lock);
	if (dst->v_path != vn_vpath_empty) {
		mutex_exit(&dst->v_lock);
		kmem_free(buf, buflen);
		return;
	}
	dst->v_path = buf;
	dst->v_path_stamp = stamp;
	mutex_exit(&dst->v_lock);
}

/*
 * XXX Private interface for segvn routines that handle vnode
 * large page segments.
 *
 * return 1 if vp's file system fop_pageio() implementation
 * can be safely used instead of fop_getpage() for handling
 * pagefaults against regular non swap files. fop_pageio()
 * interface is considered safe here if its implementation
 * is very close to fop_getpage() implementation.
 * e.g. it zeroes out the part of the page beyond EOF, doesn't
 * panic if there are file holes but instead returns an error,
 * and doesn't assume the file won't be changed by user writes.
 *
 * return 0 otherwise.
 *
 * For now allow segvn to only use fop_pageio() with ufs and nfs.
 */
int
vn_vmpss_usepageio(vnode_t *vp)
{
	vfs_t	*vfsp = vp->v_vfsp;
	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
	char **fsok = pageio_ok_fss;

	if (fsname == NULL) {
		return (0);
	}

	for (; *fsok; fsok++) {
		if (strcmp(*fsok, fsname) == 0) {
			return (1);
		}
	}
	return (0);
}

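/*
 * Illustrative note: the whitelist above is intentionally conservative.  A
 * filesystem whose fop_pageio() honors the constraints described in the
 * block comment could be permitted by extending pageio_ok_fss, e.g.
 * {"ufs", "nfs", "myfs", NULL} (hypothetical entry), with no other changes.
 */
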
/* VOP_XXX() macros call the corresponding fop_xxx() function */

int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = fop_open_dispatch(vpp, mode, cr, ct, true);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		VOPSTATS_UPDATE(vp, open);
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp && *vpp != NULL) {
			vn_copypath(vp, *vpp);
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}

int
fop_close(
	vnode_t *vp,
	int flag,
	int count,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_close_dispatch(vp, flag, count, offset, cr, ct, true);

	VOPSTATS_UPDATE(vp, close);
	/*
	 * Check passed in count to handle possible dups. Vnode counts are only
	 * kept on regular files
	 */
	if ((vp->v_type == VREG) && (count == 1)) {
		if (flag & FREAD) {
			ASSERT(vp->v_rdcnt > 0);
			atomic_dec_32(&vp->v_rdcnt);
		}
		if (flag & FWRITE) {
			ASSERT(vp->v_wrcnt > 0);
			atomic_dec_32(&vp->v_wrcnt);
		}
	}
	return (err);
}

int
fop_read(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;
	ssize_t resid_start = uiop->uio_resid;

	VOPXID_MAP_CR(vp, cr);

	err = fop_read_dispatch(vp, uiop, ioflag, cr, ct, true);

	VOPSTATS_UPDATE_IO(vp, read,
	    read_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_write(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;
	ssize_t resid_start = uiop->uio_resid;

	VOPXID_MAP_CR(vp, cr);

	err = fop_write_dispatch(vp, uiop, ioflag, cr, ct, true);

	VOPSTATS_UPDATE_IO(vp, write,
	    write_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_ioctl(
	vnode_t *vp,
	int cmd,
	intptr_t arg,
	int flag,
	cred_t *cr,
	int *rvalp,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_ioctl_dispatch(vp, cmd, arg, flag, cr, rvalp, ct, true);

	VOPSTATS_UPDATE(vp, ioctl);
	return (err);
}

int
fop_setfl(
	vnode_t *vp,
	int oflags,
	int nflags,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_setfl_dispatch(vp, oflags, nflags, cr, ct, true);

	VOPSTATS_UPDATE(vp, setfl);
	return (err);
}

int
fop_getattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with fop_access() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
		return (EINVAL);

	err = fop_getattr_dispatch(vp, vap, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, getattr);
	return (err);
}

int
fop_setattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with fop_access() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
		return (EINVAL);

	err = fop_setattr_dispatch(vp, vap, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, setattr);
	return (err);
}

int
fop_access(
	vnode_t *vp,
	int mode,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	if ((flags & V_ACE_MASK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = fop_access_dispatch(vp, mode, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, access);
	return (err);
}

int
fop_lookup(
	vnode_t *dvp,
	char *nm,
	vnode_t **vpp,
	pathname_t *pnp,
	int flags,
	vnode_t *rdir,
	cred_t *cr,
	caller_context_t *ct,
	int *deflags,		/* Returned per-dirent flags */
	pathname_t *ppnp)	/* Returned case-preserved name in directory */
{
	int ret;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.  It is required
	 * that if the vfs supports case-insensitive lookup, it also
	 * supports extended dirent flags.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
	} else {
		ret = fop_lookup_dispatch(dvp, nm, vpp, pnp, flags, rdir, cr,
		    ct, deflags, ppnp, true);
	}

	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, lookup);
		vn_updatepath(dvp, *vpp, nm);
	}

	return (ret);
}

int
fop_create(
	vnode_t *dvp,
	char *name,
	vattr_t *vap,
	vcexcl_t excl,
	int mode,
	vnode_t **vpp,
	cred_t *cr,
	int flags,
	caller_context_t *ct,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = fop_create_dispatch(dvp, name, vap, excl, mode, vpp, cr, flags,
	    ct, vsecp, true);

	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, create);
		vn_updatepath(dvp, *vpp, name);
	}

	return (ret);
}

int
fop_remove(
	vnode_t *dvp,
	char *nm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = fop_remove_dispatch(dvp, nm, cr, ct, flags, true);

	VOPSTATS_UPDATE(dvp, remove);
	return (err);
}

int
fop_link(
	vnode_t *tdvp,
	vnode_t *svp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int err;

	/*
	 * If the target file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = fop_link_dispatch(tdvp, svp, tnm, cr, ct, flags, true);

	VOPSTATS_UPDATE(tdvp, link);
	return (err);
}

int
fop_rename(
	vnode_t *sdvp,
	char *snm,
	vnode_t *tdvp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int err;

	/*
	 * If the file system involved does not support
	 * case-insensitive access and said access is requested, fail
	 * quickly.
	 */
	if (flags & FIGNORECASE &&
	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = fop_rename_dispatch(sdvp, snm, tdvp, tnm, cr, ct, flags, true);

	VOPSTATS_UPDATE(sdvp, rename);
	return (err);
}

int
fop_mkdir(
	vnode_t *dvp,
	char *dirname,
	vattr_t *vap,
	vnode_t **vpp,
	cred_t *cr,
	caller_context_t *ct,
	int flags,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = fop_mkdir_dispatch(dvp, dirname, vap, vpp, cr, ct, flags, vsecp,
	    true);

	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, mkdir);
		vn_updatepath(dvp, *vpp, dirname);
	}

	return (ret);
}

int
fop_rmdir(
	vnode_t *dvp,
	char *nm,
	vnode_t *cdir,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = fop_rmdir_dispatch(dvp, nm, cdir, cr, ct, flags, true);

	VOPSTATS_UPDATE(dvp, rmdir);
	return (err);
}

int
fop_readdir(
	vnode_t *vp,
	uio_t *uiop,
	cred_t *cr,
	int *eofp,
	caller_context_t *ct,
	int flags)
{
	int err;
	ssize_t resid_start = uiop->uio_resid;

	/*
	 * If this file system doesn't support retrieving directory
	 * entry flags and said access is requested, fail quickly.
	 */
	if (flags & V_RDDIR_ENTFLAGS &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
		return (EINVAL);

	VOPXID_MAP_CR(vp, cr);

	err = fop_readdir_dispatch(vp, uiop, cr, eofp, ct, flags, true);

	VOPSTATS_UPDATE_IO(vp, readdir,
	    readdir_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_symlink(
	vnode_t *dvp,
	char *linkname,
	vattr_t *vap,
	char *target,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int err;
	xvattr_t xvattr;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/* check for reparse point */
	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
	    (strncmp(target, FS_REPARSE_TAG_STR,
	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
		if (!fs_reparse_mark(target, vap, &xvattr))
			vap = (vattr_t *)&xvattr;
	}

	err = fop_symlink_dispatch(dvp, linkname, vap, target, cr, ct, flags,
	    true);

	VOPSTATS_UPDATE(dvp, symlink);
	return (err);
}

int
fop_readlink(
	vnode_t *vp,
	uio_t *uiop,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_readlink_dispatch(vp, uiop, cr, ct, true);

	VOPSTATS_UPDATE(vp, readlink);
	return (err);
}

int
fop_fsync(
	vnode_t *vp,
	int syncflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_fsync_dispatch(vp, syncflag, cr, ct, true);

	VOPSTATS_UPDATE(vp, fsync);
	return (err);
}

void
fop_inactive(
	vnode_t *vp,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Need to update stats before vop call since we may lose the vnode */
	VOPSTATS_UPDATE(vp, inactive);

	VOPXID_MAP_CR(vp, cr);

	fop_inactive_dispatch(vp, cr, ct, true);
}

int
fop_fid(
	vnode_t *vp,
	fid_t *fidp,
	caller_context_t *ct)
{
	int err;

	err = fop_fid_dispatch(vp, fidp, ct, true);

	VOPSTATS_UPDATE(vp, fid);
	return (err);
}

int
fop_rwlock(
	vnode_t *vp,
	int write_lock,
	caller_context_t *ct)
{
	int ret;

	ret = fop_rwlock_dispatch(vp, write_lock, ct, true);

	VOPSTATS_UPDATE(vp, rwlock);
	return (ret);
}

void
fop_rwunlock(
	vnode_t *vp,
	int write_lock,
	caller_context_t *ct)
{
	fop_rwunlock_dispatch(vp, write_lock, ct, true);

	VOPSTATS_UPDATE(vp, rwunlock);
}

int
fop_seek(
	vnode_t *vp,
	offset_t ooff,
	offset_t *noffp,
	caller_context_t *ct)
{
	int err;

	err = fop_seek_dispatch(vp, ooff, noffp, ct, true);

	VOPSTATS_UPDATE(vp, seek);
	return (err);
}

int
fop_cmp(
	vnode_t *vp1,
	vnode_t *vp2,
	caller_context_t *ct)
{
	int err;

	err = fop_cmp_dispatch(vp1, vp2, ct, true);

	VOPSTATS_UPDATE(vp1, cmp);
	return (err);
}

int
fop_frlock(
	vnode_t *vp,
	int cmd,
	flock64_t *bfp,
	int flag,
	offset_t offset,
	struct flk_callback *flk_cbp,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_frlock_dispatch(vp, cmd, bfp, flag, offset, flk_cbp, cr,
	    ct, true);

	VOPSTATS_UPDATE(vp, frlock);
	return (err);
}

int
fop_space(
	vnode_t *vp,
	int cmd,
	flock64_t *bfp,
	int flag,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_space_dispatch(vp, cmd, bfp, flag, offset, cr, ct, true);

	VOPSTATS_UPDATE(vp, space);
	return (err);
}

int
fop_realvp(
	vnode_t *vp,
	vnode_t **vpp,
	caller_context_t *ct)
{
	int err;

	err = fop_realvp_dispatch(vp, vpp, ct, true);

	VOPSTATS_UPDATE(vp, realvp);
	return (err);
}

int
fop_getpage(
	vnode_t *vp,
	offset_t off,
	size_t len,
	uint_t *protp,
	page_t **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_getpage_dispatch(vp, off, len, protp, plarr, plsz, seg,
	    addr, rw, cr, ct, true);

	VOPSTATS_UPDATE(vp, getpage);
	return (err);
}

int
fop_putpage(
	vnode_t *vp,
	offset_t off,
	size_t len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_putpage_dispatch(vp, off, len, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, putpage);
	return (err);
}

int
fop_map(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t *addrp,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_map_dispatch(vp, off, as, addrp, len, prot, maxprot,
	    flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, map);
	return (err);
}

int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = fop_addmap_dispatch(vp, off, as, addr, len, prot, maxprot,
	    flags, cr, ct, true);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}

int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = fop_delmap_dispatch(vp, off, as, addr, len, prot, maxprot,
	    flags, cr, ct, true);

	/*
	 * NFS calls into delmap twice: the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);

		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}

int
fop_poll(
	vnode_t *vp,
	short events,
	int anyyet,
	short *reventsp,
	struct pollhead **phpp,
	caller_context_t *ct)
{
	int err;

	err = fop_poll_dispatch(vp, events, anyyet, reventsp, phpp, ct, true);

	VOPSTATS_UPDATE(vp, poll);
	return (err);
}

int
fop_dump(
	vnode_t *vp,
	caddr_t addr,
	offset_t lbdn,
	offset_t dblks,
	caller_context_t *ct)
{
	int err;

	/* ensure lbdn and dblks can be passed safely to bdev_dump */
	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
		return (EIO);

	err = fop_dump_dispatch(vp, addr, lbdn, dblks, ct, true);

	VOPSTATS_UPDATE(vp, dump);
	return (err);
}

int
fop_pathconf(
	vnode_t *vp,
	int cmd,
	ulong_t *valp,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_pathconf_dispatch(vp, cmd, valp, cr, ct, true);

	VOPSTATS_UPDATE(vp, pathconf);
	return (err);
}

int
fop_pageio(
	vnode_t *vp,
	struct page *pp,
	u_offset_t io_off,
	size_t io_len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_pageio_dispatch(vp, pp, io_off, io_len, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, pageio);
	return (err);
}

int
fop_dumpctl(
	vnode_t *vp,
	int action,
	offset_t *blkp,
	caller_context_t *ct)
{
	int err;

	err = fop_dumpctl_dispatch(vp, action, blkp, ct, true);

	VOPSTATS_UPDATE(vp, dumpctl);
	return (err);
}

void
fop_dispose(
	vnode_t *vp,
	page_t *pp,
	int flag,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Must do stats first since it's possible to lose the vnode */
	VOPSTATS_UPDATE(vp, dispose);

	VOPXID_MAP_CR(vp, cr);

	fop_dispose_dispatch(vp, pp, flag, dn, cr, ct, true);
}

int
fop_setsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with fop_access() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = fop_setsecattr_dispatch(vp, vsap, flag, cr, ct, true);

	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}

int
fop_getsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with fop_access() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = fop_getsecattr_dispatch(vp, vsap, flag, cr, ct, true);

	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}

int
fop_shrlock(
	vnode_t *vp,
	int cmd,
	struct shrlock *shr,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_shrlock_dispatch(vp, cmd, shr, flag, cr, ct, true);

	VOPSTATS_UPDATE(vp, shrlock);
	return (err);
}

int
fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
    caller_context_t *ct)
{
	int err;

	err = fop_vnevent_dispatch(vp, vnevent, dvp, fnm, ct, true);

	VOPSTATS_UPDATE(vp, vnevent);
	return (err);
}

int
fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);

	err = fop_reqzcbuf_dispatch(vp, ioflag, uiop, cr, ct, true);

	VOPSTATS_UPDATE(vp, reqzcbuf);
	return (err);
}

int
fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);

	err = fop_retzcbuf_dispatch(vp, uiop, cr, ct, true);

	VOPSTATS_UPDATE(vp, retzcbuf);
	return (err);
}

/*
 * Default destructor
 * Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}

/*
 * Create a key (index into per vnode array)
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May allocate memory with lock held
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int i;
	uint_t nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}

/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}

/*
 * Quickly return the per vnode value that was stored with the specified key
 * Assumes the caller is protecting key from vsd_create and vsd_destroy
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
void *
vsd_get(vnode_t *vp, uint_t key)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	vsd = vp->v_vsd;

	if (key && vsd != NULL && key <= vsd->vs_nkeys)
		return (vsd->vs_value[key - 1]);
	return (NULL);
}

/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);

	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
	 * code won't happen and we will continue down and allocate space for
	 * the vs_value array.
	 * If the caller is replacing one value with another, then it is up
	 * to the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}

/*
 * Called from vn_free() to run the destructor function for each vsd
 * Locks out vsd_create and vsd_destroy
 * Assumes that the destructor *DOES NOT* use vsd
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}

/*
 * realloc
 */
static void *
vsd_realloc(void *old, size_t osize, size_t nsize)
{
	void *new;

	new = kmem_zalloc(nsize, KM_SLEEP);
	if (old) {
		bcopy(old, new, osize);
		kmem_free(old, osize);
	}
	return (new);
}

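/*
 * Illustrative sketch of the VSD life cycle (hypothetical key/value, not
 * from this file):
 *
 *	static uint_t my_key;
 *
 *	vsd_create(&my_key, my_destructor);	once, e.g. at module load
 *
 *	mutex_enter(&vp->v_vsd_lock);
 *	(void) vsd_set(vp, my_key, my_data);	attach per-vnode data
 *	data = vsd_get(vp, my_key);		retrieve it later
 *	mutex_exit(&vp->v_vsd_lock);
 *
 *	vsd_destroy(&my_key);			at unload; runs destructors
 */
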
/*
 * Setup the extensible system attribute for creating a reparse point.
 * The symlink data 'target' is validated for proper format of a reparse
 * string and a check is also made to ensure the symlink data does not
 * point to an existing file.
 *
 * return 0 if ok else -1.
 */
static int
fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
{
	xoptattr_t *xoap;

	if ((!target) || (!vap) || (!xvattr))
		return (-1);

	/* validate reparse string */
	if (reparse_validate((const char *)target))
		return (-1);

	xva_init(xvattr);
	xvattr->xva_vattr = *vap;
	xvattr->xva_vattr.va_mask |= AT_XVATTR;
	xoap = xva_getxoptattr(xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(xvattr, XAT_REPARSE);
	xoap->xoa_reparse = 1;

	return (0);
}

/*
 * Function to check whether a symlink is a reparse point.
 * Return B_TRUE if it is a reparse point, else return B_FALSE
 */
boolean_t
vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	xvattr_t xvattr;
	xoptattr_t *xoap;

	if ((vp->v_type != VLNK) ||
	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
		return (B_FALSE);

	xva_init(&xvattr);
	xoap = xva_getxoptattr(&xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(&xvattr, XAT_REPARSE);

	if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
		return (B_FALSE);

	if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
		return (B_FALSE);

	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
}
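
/*
 * Illustrative note (an assumption about usage, not from this file): a
 * reparse point is created by symlinking to a specially formatted target
 * beginning with FS_REPARSE_TAG_STR; fop_symlink() above detects that
 * prefix and, via fs_reparse_mark(), requests the XAT_REPARSE attribute:
 *
 *	target must pass reparse_validate(); the payload format is
 *	defined in sys/fs_reparse.h
 *
 *	err = fop_symlink(dvp, "mylink", &va, reparse_target, cr, NULL, 0);
 *
 * vn_is_reparse() can then be used to distinguish such links from plain
 * symlinks.
 */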