4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * University Copyright- Copyright (c) 1982, 1986, 1988
34 * The Regents of the University of California
37 * University Acknowledgment- Portions of this document are derived from
38 * software developed by the University of California, Berkeley, and its
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/t_lock.h>
45 #include <sys/errno.h>
50 #include <sys/pathname.h>
51 #include <sys/atomic.h>
53 #include <sys/vnode.h>
54 #include <sys/vnode_dispatch.h>
55 #include <sys/rwstlock.h>
60 #include <sys/sysmacros.h>
61 #include <sys/cmn_err.h>
62 #include <sys/systm.h>
64 #include <sys/debug.h>
66 #include <sys/nbmlock.h>
67 #include <sys/fcntl.h>
68 #include <sys/fs_subr.h>
69 #include <sys/taskq.h>
70 #include <sys/fs_reparse.h>
/*
 * Determine if this vnode is a file that is read-only.  Device special
 * files (VCHR/VBLK) and FIFOs are excluded: they remain usable for
 * writing even when they reside on a read-only filesystem.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	(vp)->v_type != VFIFO && vn_is_readonly(vp))
79 /* Tunable via /etc/system; used only by admin/install */
80 int nfs_global_client_only
;
83 * Array of vopstats_t for per-FS-type vopstats. This array has the same
84 * number of entries as and parallel to the vfssw table. (Arguably, it could
85 * be part of the vfssw table.) Once it's initialized, it's accessed using
86 * the same fstype index that is used to index into the vfssw table.
88 vopstats_t
**vopstats_fstype
;
90 /* vopstats initialization template used for fast initialization via bcopy() */
91 static vopstats_t
*vs_templatep
;
93 /* Kmem cache handle for vsk_anchor_t allocations */
94 kmem_cache_t
*vsk_anchor_cache
;
96 /* file events cleanup routine */
97 extern void free_fopdata(vnode_t
*);
100 * Root of AVL tree for the kstats associated with vopstats. Lock protects
101 * updates to vsktat_tree.
103 avl_tree_t vskstat_tree
;
104 kmutex_t vskstat_tree_lock
;
106 /* Global variable which enables/disables the vopstats collection */
107 int vopstats_enabled
= 1;
109 /* Global used for empty/invalid v_path */
110 char *vn_vpath_empty
= "";
113 * forward declarations for internal vnode specific data (vsd)
115 static void *vsd_realloc(void *, size_t, size_t);
118 * forward declarations for reparse point functions
120 static int fs_reparse_mark(char *target
, vattr_t
*vap
, xvattr_t
*xvattr
);
123 * VSD -- VNODE SPECIFIC DATA
124 * The v_data pointer is typically used by a file system to store a
125 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
126 * However, there are times when additional project private data needs
127 * to be stored separately from the data (node) pointed to by v_data.
128 * This additional data could be stored by the file system itself or
129 * by a completely different kernel entity. VSD provides a way for
130 * callers to obtain a key and store a pointer to private data associated
133 * Callers are responsible for protecting the vsd by holding v_vsd_lock
134 * for calls to vsd_set() and vsd_get().
139 * vsd_nkeys - creation and deletion of vsd keys
140 * vsd_list - insertion and deletion of vsd_node in the vsd_list
141 * vsd_destructor - adding and removing destructors to the list
143 static kmutex_t vsd_lock
;
144 static uint_t vsd_nkeys
; /* size of destructor array */
145 /* list of vsd_node's */
146 static list_t
*vsd_list
= NULL
;
147 /* per-key destructor funcs */
148 static void (**vsd_destructor
)(void *);
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 */
#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}
/*
 * Same as VOPSTATS_UPDATE() above, but also accumulates the number of
 * bytes transferred (bytesval) into the given bytecounter member, both
 * in the per-vfs vopstats and in the per-fstype vopstats.
 */
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
/*
 * If the filesystem does not support XIDs map credential
 * If the vfsp is NULL, perhaps we should also map?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
	}
211 * Convert stat(2) formats to vnode types and vice versa. (Knows about
212 * numerical order of S_IFMT and vnode types.)
214 enum vtype iftovt_tab
[] = {
215 VNON
, VFIFO
, VCHR
, VNON
, VDIR
, VNON
, VBLK
, VNON
,
216 VREG
, VNON
, VLNK
, VNON
, VSOCK
, VNON
, VNON
, VNON
219 ushort_t vttoif_tab
[] = {
220 0, S_IFREG
, S_IFDIR
, S_IFBLK
, S_IFCHR
, S_IFLNK
, S_IFIFO
,
221 S_IFDOOR
, 0, S_IFSOCK
, S_IFPORT
, 0
225 * The system vnode cache.
228 kmem_cache_t
*vn_cache
;
231 /* Extensible attribute (xva) routines. */
234 * Zero out the structure, set the size of the requested/returned bitmaps,
235 * set VATTR_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
236 * to the returned attributes array.
239 xva_init(xvattr_t
*xvap
)
241 bzero(xvap
, sizeof (xvattr_t
));
242 xvap
->xva_mapsize
= XVA_MAPSIZE
;
243 xvap
->xva_magic
= XVA_MAGIC
;
244 xvap
->xva_vattr
.va_mask
= VATTR_XVATTR
;
245 xvap
->xva_rtnattrmapp
= &(xvap
->xva_rtnattrmap
)[0];
249 * If VATTR_XVATTR is set, returns a pointer to the embedded xoptattr_t
250 * structure. Otherwise, returns NULL.
253 xva_getxoptattr(xvattr_t
*xvap
)
255 xoptattr_t
*xoap
= NULL
;
256 if (xvap
->xva_vattr
.va_mask
& VATTR_XVATTR
)
257 xoap
= &xvap
->xva_xoptattrs
;
262 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
263 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
267 vska_compar(const void *n1
, const void *n2
)
270 ulong_t p1
= ((vsk_anchor_t
*)n1
)->vsk_fsid
;
271 ulong_t p2
= ((vsk_anchor_t
*)n2
)->vsk_fsid
;
275 } else if (p1
> p2
) {
285 * Used to create a single template which will be bcopy()ed to a newly
286 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
289 create_vopstats_template()
293 vsp
= kmem_alloc(sizeof (vopstats_t
), KM_SLEEP
);
294 bzero(vsp
, sizeof (*vsp
)); /* Start fresh */
297 kstat_named_init(&vsp
->nopen
, "nopen", KSTAT_DATA_UINT64
);
299 kstat_named_init(&vsp
->nclose
, "nclose", KSTAT_DATA_UINT64
);
301 kstat_named_init(&vsp
->nread
, "nread", KSTAT_DATA_UINT64
);
302 kstat_named_init(&vsp
->read_bytes
, "read_bytes", KSTAT_DATA_UINT64
);
304 kstat_named_init(&vsp
->nwrite
, "nwrite", KSTAT_DATA_UINT64
);
305 kstat_named_init(&vsp
->write_bytes
, "write_bytes", KSTAT_DATA_UINT64
);
307 kstat_named_init(&vsp
->nioctl
, "nioctl", KSTAT_DATA_UINT64
);
309 kstat_named_init(&vsp
->nsetfl
, "nsetfl", KSTAT_DATA_UINT64
);
311 kstat_named_init(&vsp
->ngetattr
, "ngetattr", KSTAT_DATA_UINT64
);
313 kstat_named_init(&vsp
->nsetattr
, "nsetattr", KSTAT_DATA_UINT64
);
315 kstat_named_init(&vsp
->naccess
, "naccess", KSTAT_DATA_UINT64
);
317 kstat_named_init(&vsp
->nlookup
, "nlookup", KSTAT_DATA_UINT64
);
319 kstat_named_init(&vsp
->ncreate
, "ncreate", KSTAT_DATA_UINT64
);
321 kstat_named_init(&vsp
->nremove
, "nremove", KSTAT_DATA_UINT64
);
323 kstat_named_init(&vsp
->nlink
, "nlink", KSTAT_DATA_UINT64
);
325 kstat_named_init(&vsp
->nrename
, "nrename", KSTAT_DATA_UINT64
);
327 kstat_named_init(&vsp
->nmkdir
, "nmkdir", KSTAT_DATA_UINT64
);
329 kstat_named_init(&vsp
->nrmdir
, "nrmdir", KSTAT_DATA_UINT64
);
330 /* fop_readdir I/O */
331 kstat_named_init(&vsp
->nreaddir
, "nreaddir", KSTAT_DATA_UINT64
);
332 kstat_named_init(&vsp
->readdir_bytes
, "readdir_bytes",
335 kstat_named_init(&vsp
->nsymlink
, "nsymlink", KSTAT_DATA_UINT64
);
337 kstat_named_init(&vsp
->nreadlink
, "nreadlink", KSTAT_DATA_UINT64
);
339 kstat_named_init(&vsp
->nfsync
, "nfsync", KSTAT_DATA_UINT64
);
341 kstat_named_init(&vsp
->ninactive
, "ninactive", KSTAT_DATA_UINT64
);
343 kstat_named_init(&vsp
->nfid
, "nfid", KSTAT_DATA_UINT64
);
345 kstat_named_init(&vsp
->nrwlock
, "nrwlock", KSTAT_DATA_UINT64
);
347 kstat_named_init(&vsp
->nrwunlock
, "nrwunlock", KSTAT_DATA_UINT64
);
349 kstat_named_init(&vsp
->nseek
, "nseek", KSTAT_DATA_UINT64
);
351 kstat_named_init(&vsp
->ncmp
, "ncmp", KSTAT_DATA_UINT64
);
353 kstat_named_init(&vsp
->nfrlock
, "nfrlock", KSTAT_DATA_UINT64
);
355 kstat_named_init(&vsp
->nspace
, "nspace", KSTAT_DATA_UINT64
);
357 kstat_named_init(&vsp
->nrealvp
, "nrealvp", KSTAT_DATA_UINT64
);
359 kstat_named_init(&vsp
->ngetpage
, "ngetpage", KSTAT_DATA_UINT64
);
361 kstat_named_init(&vsp
->nputpage
, "nputpage", KSTAT_DATA_UINT64
);
363 kstat_named_init(&vsp
->nmap
, "nmap", KSTAT_DATA_UINT64
);
365 kstat_named_init(&vsp
->naddmap
, "naddmap", KSTAT_DATA_UINT64
);
367 kstat_named_init(&vsp
->ndelmap
, "ndelmap", KSTAT_DATA_UINT64
);
369 kstat_named_init(&vsp
->npoll
, "npoll", KSTAT_DATA_UINT64
);
371 kstat_named_init(&vsp
->ndump
, "ndump", KSTAT_DATA_UINT64
);
373 kstat_named_init(&vsp
->npathconf
, "npathconf", KSTAT_DATA_UINT64
);
375 kstat_named_init(&vsp
->npageio
, "npageio", KSTAT_DATA_UINT64
);
377 kstat_named_init(&vsp
->ndumpctl
, "ndumpctl", KSTAT_DATA_UINT64
);
379 kstat_named_init(&vsp
->ndispose
, "ndispose", KSTAT_DATA_UINT64
);
381 kstat_named_init(&vsp
->nsetsecattr
, "nsetsecattr", KSTAT_DATA_UINT64
);
383 kstat_named_init(&vsp
->ngetsecattr
, "ngetsecattr", KSTAT_DATA_UINT64
);
385 kstat_named_init(&vsp
->nshrlock
, "nshrlock", KSTAT_DATA_UINT64
);
387 kstat_named_init(&vsp
->nvnevent
, "nvnevent", KSTAT_DATA_UINT64
);
389 kstat_named_init(&vsp
->nreqzcbuf
, "nreqzcbuf", KSTAT_DATA_UINT64
);
391 kstat_named_init(&vsp
->nretzcbuf
, "nretzcbuf", KSTAT_DATA_UINT64
);
397 * Creates a kstat structure associated with a vopstats structure.
400 new_vskstat(char *ksname
, vopstats_t
*vsp
)
404 if (!vopstats_enabled
) {
408 ksp
= kstat_create("unix", 0, ksname
, "misc", KSTAT_TYPE_NAMED
,
409 sizeof (vopstats_t
)/sizeof (kstat_named_t
),
410 KSTAT_FLAG_VIRTUAL
|KSTAT_FLAG_WRITABLE
);
420 * Called from vfsinit() to initialize the support mechanisms for vopstats
425 if (!vopstats_enabled
)
429 * Creates the AVL tree which holds per-vfs vopstat anchors. This
430 * is necessary since we need to check if a kstat exists before we
431 * attempt to create it. Also, initialize its lock.
433 avl_create(&vskstat_tree
, vska_compar
, sizeof (vsk_anchor_t
),
434 offsetof(vsk_anchor_t
, vsk_node
));
435 mutex_init(&vskstat_tree_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
437 vsk_anchor_cache
= kmem_cache_create("vsk_anchor_cache",
438 sizeof (vsk_anchor_t
), sizeof (uintptr_t), NULL
, NULL
, NULL
,
442 * Set up the array of pointers for the vopstats-by-FS-type.
443 * The entries will be allocated/initialized as each file system
444 * goes through modload/mod_installfs.
446 vopstats_fstype
= (vopstats_t
**)kmem_zalloc(
447 (sizeof (vopstats_t
*) * nfstype
), KM_SLEEP
);
449 /* Set up the global vopstats initialization template */
450 vs_templatep
= create_vopstats_template();
454 * We need to have the all of the counters zeroed.
455 * The initialization of the vopstats_t includes on the order of
456 * 50 calls to kstat_named_init(). Rather that do that on every call,
457 * we do it once in a template (vs_templatep) then bcopy it over.
460 initialize_vopstats(vopstats_t
*vsp
)
465 bcopy(vs_templatep
, vsp
, sizeof (vopstats_t
));
469 * If possible, determine which vopstats by fstype to use and
470 * return a pointer to the caller.
473 get_fstype_vopstats(vfs_t
*vfsp
, struct vfssw
*vswp
)
475 int fstype
= 0; /* Index into vfssw[] */
476 vopstats_t
*vsp
= NULL
;
478 if (vfsp
== NULL
|| (vfsp
->vfs_flag
& VFS_STATS
) == 0 ||
482 * Set up the fstype. We go to so much trouble because all versions
483 * of NFS use the same fstype in their vfs even though they have
484 * distinct entries in the vfssw[] table.
485 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
488 fstype
= vswp
- vfssw
; /* Gets us the index */
490 fstype
= vfsp
->vfs_fstype
;
494 * Point to the per-fstype vopstats. The only valid values are
495 * non-zero positive values less than the number of vfssw[] table
498 if (fstype
> 0 && fstype
< nfstype
) {
499 vsp
= vopstats_fstype
[fstype
];
506 * Generate a kstat name, create the kstat structure, and allocate a
507 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
508 * to the caller. This must only be called from a mount.
511 get_vskstat_anchor(vfs_t
*vfsp
)
513 char kstatstr
[KSTAT_STRLEN
]; /* kstat name for vopstats */
514 statvfs64_t statvfsbuf
; /* Needed to find f_fsid */
515 vsk_anchor_t
*vskp
= NULL
; /* vfs <--> kstat anchor */
516 kstat_t
*ksp
; /* Ptr to new kstat */
517 avl_index_t where
; /* Location in the AVL tree */
519 if (vfsp
== NULL
|| vfsp
->vfs_implp
== NULL
||
520 (vfsp
->vfs_flag
& VFS_STATS
) == 0 || !vopstats_enabled
)
523 /* Need to get the fsid to build a kstat name */
524 if (VFS_STATVFS(vfsp
, &statvfsbuf
) == 0) {
525 /* Create a name for our kstats based on fsid */
526 (void) snprintf(kstatstr
, KSTAT_STRLEN
, "%s%lx",
527 VOPSTATS_STR
, statvfsbuf
.f_fsid
);
529 /* Allocate and initialize the vsk_anchor_t */
530 vskp
= kmem_cache_alloc(vsk_anchor_cache
, KM_SLEEP
);
531 bzero(vskp
, sizeof (*vskp
));
532 vskp
->vsk_fsid
= statvfsbuf
.f_fsid
;
534 mutex_enter(&vskstat_tree_lock
);
535 if (avl_find(&vskstat_tree
, vskp
, &where
) == NULL
) {
536 avl_insert(&vskstat_tree
, vskp
, where
);
537 mutex_exit(&vskstat_tree_lock
);
540 * Now that we've got the anchor in the AVL
541 * tree, we can create the kstat.
543 ksp
= new_vskstat(kstatstr
, &vfsp
->vfs_vopstats
);
548 /* Oops, found one! Release memory and lock. */
549 mutex_exit(&vskstat_tree_lock
);
550 kmem_cache_free(vsk_anchor_cache
, vskp
);
558 * We're in the process of tearing down the vfs and need to cleanup
559 * the data structures associated with the vopstats. Must only be called
563 teardown_vopstats(vfs_t
*vfsp
)
568 if (vfsp
== NULL
|| vfsp
->vfs_implp
== NULL
||
569 (vfsp
->vfs_flag
& VFS_STATS
) == 0 || !vopstats_enabled
)
572 /* This is a safe check since VFS_STATS must be set (see above) */
573 if ((vskap
= vfsp
->vfs_vskap
) == NULL
)
576 /* Whack the pointer right away */
577 vfsp
->vfs_vskap
= NULL
;
579 /* Lock the tree, remove the node, and delete the kstat */
580 mutex_enter(&vskstat_tree_lock
);
581 if (avl_find(&vskstat_tree
, vskap
, &where
)) {
582 avl_remove(&vskstat_tree
, vskap
);
585 if (vskap
->vsk_ksp
) {
586 kstat_delete(vskap
->vsk_ksp
);
588 mutex_exit(&vskstat_tree_lock
);
590 kmem_cache_free(vsk_anchor_cache
, vskap
);
594 * Read or write a vnode. Called from kernel code.
605 rlim64_t ulimit
, /* meaningful only if rw is UIO_WRITE */
614 if (rw
== UIO_WRITE
&& ISROFILE(vp
))
620 VOPXID_MAP_CR(vp
, cr
);
626 uio
.uio_loffset
= offset
;
627 uio
.uio_segflg
= (short)seg
;
629 uio
.uio_llimit
= ulimit
;
632 * We have to enter the critical region before calling fop_rwlock
633 * to avoid a deadlock with ufs.
635 if (nbl_need_check(vp
)) {
638 nbl_start_crit(vp
, RW_READER
);
640 error
= nbl_svmand(vp
, cr
, &svmand
);
643 if (nbl_conflict(vp
, rw
== UIO_WRITE
? NBL_WRITE
: NBL_READ
,
644 uio
.uio_offset
, uio
.uio_resid
, svmand
, NULL
)) {
650 (void) fop_rwlock(vp
,
651 rw
== UIO_WRITE
? V_WRITELOCK_TRUE
: V_WRITELOCK_FALSE
, NULL
);
652 if (rw
== UIO_WRITE
) {
653 uio
.uio_fmode
= FWRITE
;
654 uio
.uio_extflg
= UIO_COPY_DEFAULT
;
655 error
= fop_write(vp
, &uio
, ioflag
, cr
, NULL
);
657 uio
.uio_fmode
= FREAD
;
658 uio
.uio_extflg
= UIO_COPY_CACHED
;
659 error
= fop_read(vp
, &uio
, ioflag
, cr
, NULL
);
662 rw
== UIO_WRITE
? V_WRITELOCK_TRUE
: V_WRITELOCK_FALSE
, NULL
);
664 *residp
= uio
.uio_resid
;
665 else if (uio
.uio_resid
)
675 * Release a vnode. Call fop_inactive on last reference or
676 * decrement reference count.
678 * To avoid race conditions, the v_count is left at 1 for
679 * the call to fop_inactive. This prevents another thread
680 * from reclaiming and releasing the vnode *before* the
681 * fop_inactive routine has a chance to destroy the vnode.
682 * We can't have more than 1 thread calling fop_inactive
688 VERIFY(vp
->v_count
> 0);
689 mutex_enter(&vp
->v_lock
);
690 if (vp
->v_count
== 1) {
691 mutex_exit(&vp
->v_lock
);
692 fop_inactive(vp
, CRED(), NULL
);
696 mutex_exit(&vp
->v_lock
);
700 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
701 * as a single reference, so v_count is not decremented until the last DNLC hold
702 * is released. This makes it possible to distinguish vnodes that are referenced
706 vn_rele_dnlc(vnode_t
*vp
)
708 VERIFY((vp
->v_count
> 0) && (vp
->v_count_dnlc
> 0));
709 mutex_enter(&vp
->v_lock
);
710 if (--vp
->v_count_dnlc
== 0) {
711 if (vp
->v_count
== 1) {
712 mutex_exit(&vp
->v_lock
);
713 fop_inactive(vp
, CRED(), NULL
);
718 mutex_exit(&vp
->v_lock
);
722 * Like vn_rele() except that it clears v_stream under v_lock.
723 * This is used by sockfs when it dismantles the association between
724 * the sockfs node and the vnode in the underlying file system.
725 * v_lock has to be held to prevent a thread coming through the lookupname
726 * path from accessing a stream head that is going away.
729 vn_rele_stream(vnode_t
*vp
)
731 VERIFY(vp
->v_count
> 0);
732 mutex_enter(&vp
->v_lock
);
734 if (vp
->v_count
== 1) {
735 mutex_exit(&vp
->v_lock
);
736 fop_inactive(vp
, CRED(), NULL
);
740 mutex_exit(&vp
->v_lock
);
744 vn_rele_inactive(vnode_t
*vp
)
746 fop_inactive(vp
, CRED(), NULL
);
750 * Like vn_rele() except if we are going to call fop_inactive() then do it
751 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
752 * the file system as a result of releasing the vnode. Note, file systems
753 * already have to handle the race where the vnode is incremented before the
754 * inactive routine is called and does its locking.
756 * Warning: Excessive use of this routine can lead to performance problems.
757 * This is because taskqs throttle back allocation if too many are created.
760 vn_rele_async(vnode_t
*vp
, taskq_t
*taskq
)
762 VERIFY(vp
->v_count
> 0);
763 mutex_enter(&vp
->v_lock
);
764 if (vp
->v_count
== 1) {
765 mutex_exit(&vp
->v_lock
);
766 VERIFY(taskq_dispatch(taskq
, (task_func_t
*)vn_rele_inactive
,
767 vp
, TQ_SLEEP
) != (uintptr_t)NULL
);
771 mutex_exit(&vp
->v_lock
);
784 return (vn_openat(pnamep
, seg
, filemode
, createmode
, vpp
, crwhy
,
790 * Open/create a vnode.
791 * This may be callable by the kernel, the only known use
792 * of user context being that the current user credentials
793 * are used for permissions. crwhy is defined iff filemode & FCREAT.
804 struct vnode
*startvp
,
813 int shrlock_done
= 0;
815 enum symfollow follow
;
816 int estale_retry
= 0;
818 struct shr_locowner shr_own
;
820 if (filemode
& FSEARCH
)
821 filemode
|= FDIRECTORY
;
825 if (filemode
& FREAD
)
827 if (filemode
& (FWRITE
|FTRUNC
))
829 if (filemode
& (FSEARCH
|FEXEC
|FXATTRDIROPEN
))
832 /* symlink interpretation */
833 if (filemode
& FNOFOLLOW
)
838 if (filemode
& FAPPEND
)
839 accessflags
|= V_APPEND
;
842 if (filemode
& FCREAT
&& !(filemode
& FDIRECTORY
)) {
845 /* Wish to create a file. */
846 vattr
.va_type
= VREG
;
847 vattr
.va_mode
= createmode
;
848 vattr
.va_mask
= VATTR_TYPE
|VATTR_MODE
;
849 if (filemode
& FTRUNC
) {
851 vattr
.va_mask
|= VATTR_SIZE
;
853 if (filemode
& FEXCL
)
859 vn_createat(pnamep
, seg
, &vattr
, excl
, mode
, &vp
, crwhy
,
860 (filemode
& ~(FTRUNC
|FEXCL
)), umask
, startvp
))
863 /* Wish to open a file. Just look it up. */
864 if (error
= lookupnameat(pnamep
, seg
, follow
,
865 NULLVPP
, &vp
, startvp
)) {
866 if ((error
== ESTALE
) &&
867 fs_need_estale_retry(estale_retry
++))
873 * Get the attributes to check whether file is large.
874 * We do this only if the FOFFMAX flag is not set and
875 * only for regular files.
878 if (!(filemode
& FOFFMAX
) && (vp
->v_type
== VREG
)) {
879 vattr
.va_mask
= VATTR_SIZE
;
880 if ((error
= fop_getattr(vp
, &vattr
, 0,
884 if (vattr
.va_size
> (uoff_t
)MAXOFF32_T
) {
886 * Large File API - regular open fails
887 * if FOFFMAX flag is set in file mode
894 * Can't write directories, active texts, or
895 * read-only filesystems. Can't truncate files
896 * on which mandatory locking is in effect.
898 if (filemode
& (FWRITE
|FTRUNC
)) {
900 * Allow writable directory if VDIROPEN flag is set.
902 if (vp
->v_type
== VDIR
&& !(vp
->v_flag
& VDIROPEN
)) {
911 * Can't truncate files on which
912 * sysv mandatory locking is in effect.
914 if (filemode
& FTRUNC
) {
917 if (fop_realvp(vp
, &rvp
, NULL
) != 0)
919 if (rvp
->v_filocks
!= NULL
) {
920 vattr
.va_mask
= VATTR_MODE
;
921 if ((error
= fop_getattr(vp
,
922 &vattr
, 0, CRED(), NULL
)) == 0 &&
923 MANDLOCK(vp
, vattr
.va_mode
))
933 if (error
= fop_access(vp
, mode
, accessflags
, CRED(), NULL
))
936 * Require FDIRECTORY to return a directory.
937 * Require FEXEC to return a regular file.
939 if ((filemode
& FDIRECTORY
) && vp
->v_type
!= VDIR
) {
943 if ((filemode
& FEXEC
) && vp
->v_type
!= VREG
) {
944 error
= ENOEXEC
; /* XXX: error code? */
950 * Do remaining checks for FNOFOLLOW and FNOLINKS.
952 if ((filemode
& FNOFOLLOW
) && vp
->v_type
== VLNK
) {
956 if (filemode
& FNOLINKS
) {
957 vattr
.va_mask
= VATTR_NLINK
;
958 if ((error
= fop_getattr(vp
, &vattr
, 0, CRED(), NULL
))) {
961 if (vattr
.va_nlink
!= 1) {
968 * Opening a socket corresponding to the AF_UNIX pathname
969 * in the filesystem name space is not supported.
970 * However, VSOCK nodes in namefs are supported in order
971 * to make fattach work for sockets.
973 * XXX This uses fop_realvp to distinguish between
974 * an unopened namefs node (where fop_realvp returns a
975 * different VSOCK vnode) and a VSOCK created by vn_create
976 * in some file system (where fop_realvp would never return
977 * a different vnode).
979 if (vp
->v_type
== VSOCK
) {
982 error
= fop_realvp(vp
, &nvp
, NULL
);
983 if (error
!= 0 || nvp
== NULL
|| nvp
== vp
||
984 nvp
->v_type
!= VSOCK
) {
990 if ((vp
->v_type
== VREG
) && nbl_need_check(vp
)) {
991 /* get share reservation */
993 if (filemode
& FWRITE
)
994 shr
.s_access
|= F_WRACC
;
995 if (filemode
& FREAD
)
996 shr
.s_access
|= F_RDACC
;
999 shr
.s_pid
= ttoproc(curthread
)->p_pid
;
1000 shr_own
.sl_pid
= shr
.s_pid
;
1002 shr
.s_own_len
= sizeof (shr_own
);
1003 shr
.s_owner
= (caddr_t
)&shr_own
;
1004 error
= fop_shrlock(vp
, F_SHARE_NBMAND
, &shr
, filemode
, CRED(),
1010 /* nbmand conflict check if truncating file */
1011 if ((filemode
& FTRUNC
) && !(filemode
& FCREAT
)) {
1012 nbl_start_crit(vp
, RW_READER
);
1015 vattr
.va_mask
= VATTR_SIZE
;
1016 if (error
= fop_getattr(vp
, &vattr
, 0, CRED(), NULL
))
1018 if (nbl_conflict(vp
, NBL_WRITE
, 0, vattr
.va_size
, 0,
1027 * Do opening protocol.
1029 error
= fop_open(&vp
, filemode
, CRED(), NULL
);
1035 * Truncate if required.
1037 if ((filemode
& FTRUNC
) && !(filemode
& FCREAT
)) {
1039 vattr
.va_mask
= VATTR_SIZE
;
1040 if ((error
= fop_setattr(vp
, &vattr
, 0, CRED(), NULL
)) != 0)
1044 ASSERT(vp
->v_count
> 0);
1052 (void) fop_close(vp
, filemode
, 1, 0, CRED(),
1058 (void) fop_shrlock(vp
, F_UNSHARE
, &shr
, 0, CRED(),
1064 * The following clause was added to handle a problem
1065 * with NFS consistency. It is possible that a lookup
1066 * of the file to be opened succeeded, but the file
1067 * itself doesn't actually exist on the server. This
1068 * is chiefly due to the DNLC containing an entry for
1069 * the file which has been removed on the server. In
1070 * this case, we just start over. If there was some
1071 * other cause for the ESTALE error, then the lookup
1072 * of the file will fail and the error will be returned
1073 * above instead of looping around from here.
1076 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1084 * The following two accessor functions are for the NFSv4 server. Since there
1085 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
1086 * vnode open counts correct when a client "upgrades" an open or does an
1087 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1088 * open mode (add or subtract read or write), but also change the share/deny
1089 * modes. However, share reservations are not integrated with OPEN, yet, so
1090 * we need to handle each separately. These functions are cleaner than having
1091 * the NFS server manipulate the counts directly, however, nobody else should
1092 * use these functions.
1099 ASSERT(vp
->v_type
== VREG
);
1101 if (filemode
& FREAD
)
1102 atomic_inc_32(&vp
->v_rdcnt
);
1103 if (filemode
& FWRITE
)
1104 atomic_inc_32(&vp
->v_wrcnt
);
1113 ASSERT(vp
->v_type
== VREG
);
1115 if (filemode
& FREAD
) {
1116 ASSERT(vp
->v_rdcnt
> 0);
1117 atomic_dec_32(&vp
->v_rdcnt
);
1119 if (filemode
& FWRITE
) {
1120 ASSERT(vp
->v_wrcnt
> 0);
1121 atomic_dec_32(&vp
->v_wrcnt
);
1138 return (vn_createat(pnamep
, seg
, vap
, excl
, mode
, vpp
, why
, flag
,
1143 * Create a vnode (makenode).
1156 struct vnode
*startvp
)
1158 struct vnode
*dvp
; /* ptr to parent dir vnode */
1159 struct vnode
*vp
= NULL
;
1164 enum symfollow follow
;
1165 int estale_retry
= 0;
1167 ASSERT((vap
->va_mask
& (VATTR_TYPE
|VATTR_MODE
)) == (VATTR_TYPE
|VATTR_MODE
));
1169 /* symlink interpretation */
1170 if ((flag
& FNOFOLLOW
) || excl
== EXCL
)
1174 flag
&= ~(FNOFOLLOW
|FNOLINKS
);
1179 * If new object is a file, call lower level to create it.
1180 * Note that it is up to the lower level to enforce exclusive
1181 * creation, if the file is already there.
1182 * This allows the lower level to do whatever
1183 * locking or protocol that is needed to prevent races.
1184 * If the new object is directory call lower level to make
1185 * the new directory, with "." and "..".
1187 if (error
= pn_get(pnamep
, seg
, &pn
))
1192 * lookup will find the parent directory for the vnode.
1193 * When it is done the pn holds the name of the entry
1195 * If this is a non-exclusive create we also find the node itself.
1197 error
= lookuppnat(&pn
, NULL
, follow
, &dvp
,
1198 (excl
== EXCL
) ? NULLVPP
: vpp
, startvp
);
1201 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1203 if (why
== CRMKDIR
&& error
== EINVAL
)
1204 error
= EEXIST
; /* SVID */
1209 vap
->va_mode
&= ~VSVTX
;
1212 * If default ACLs are defined for the directory don't apply the
1213 * umask if umask is passed.
1220 vsec
.vsa_aclcnt
= 0;
1221 vsec
.vsa_aclentp
= NULL
;
1222 vsec
.vsa_dfaclcnt
= 0;
1223 vsec
.vsa_dfaclentp
= NULL
;
1224 vsec
.vsa_mask
= VSA_DFACLCNT
;
1225 error
= fop_getsecattr(dvp
, &vsec
, 0, CRED(), NULL
);
1227 * If error is ENOSYS then treat it as no error
1228 * Don't want to force all file systems to support
1229 * aclent_t style of ACL's.
1231 if (error
== ENOSYS
)
1239 * Apply the umask if no default ACLs.
1241 if (vsec
.vsa_dfaclcnt
== 0)
1242 vap
->va_mode
&= ~umask
;
1245 * fop_getsecattr() may have allocated memory for
1246 * ACLs we didn't request, so double-check and
1247 * free it if necessary.
1249 if (vsec
.vsa_aclcnt
&& vsec
.vsa_aclentp
!= NULL
)
1250 kmem_free((caddr_t
)vsec
.vsa_aclentp
,
1251 vsec
.vsa_aclcnt
* sizeof (aclent_t
));
1252 if (vsec
.vsa_dfaclcnt
&& vsec
.vsa_dfaclentp
!= NULL
)
1253 kmem_free((caddr_t
)vsec
.vsa_dfaclentp
,
1254 vsec
.vsa_dfaclcnt
* sizeof (aclent_t
));
1259 * In general we want to generate EROFS if the file system is
1260 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1261 * documents the open system call, and it says that O_CREAT has no
1262 * effect if the file already exists. Bug 1119649 states
1263 * that open(path, O_CREAT, ...) fails when attempting to open an
1264 * existing file on a read only file system. Thus, the first part
1265 * of the following if statement has 3 checks:
1266 * if the file exists &&
1267 * it is being open with write access &&
1268 * the file system is read only
1269 * then generate EROFS
1271 if ((*vpp
!= NULL
&& (mode
& VWRITE
) && ISROFILE(*vpp
)) ||
1272 (*vpp
== NULL
&& dvp
->v_vfsp
->vfs_flag
& VFS_RDONLY
)) {
1276 } else if (excl
== NONEXCL
&& *vpp
!= NULL
) {
1280 * File already exists. If a mandatory lock has been
1281 * applied, return error.
1284 if (fop_realvp(vp
, &rvp
, NULL
) != 0)
1286 if ((vap
->va_mask
& VATTR_SIZE
) && nbl_need_check(vp
)) {
1287 nbl_start_crit(vp
, RW_READER
);
1290 if (rvp
->v_filocks
!= NULL
|| rvp
->v_shrlocks
!= NULL
) {
1291 vattr
.va_mask
= VATTR_MODE
|VATTR_SIZE
;
1292 if (error
= fop_getattr(vp
, &vattr
, 0, CRED(), NULL
)) {
1295 if (MANDLOCK(vp
, vattr
.va_mode
)) {
1300 * File cannot be truncated if non-blocking mandatory
1301 * locks are currently on the file.
1303 if ((vap
->va_mask
& VATTR_SIZE
) && in_crit
) {
1307 offset
= vap
->va_size
> vattr
.va_size
?
1308 vattr
.va_size
: vap
->va_size
;
1309 length
= vap
->va_size
> vattr
.va_size
?
1310 vap
->va_size
- vattr
.va_size
:
1311 vattr
.va_size
- vap
->va_size
;
1312 if (nbl_conflict(vp
, NBL_WRITE
, offset
,
1321 * If the file is the root of a VFS, we've crossed a
1322 * mount point and the "containing" directory that we
1323 * acquired above (dvp) is irrelevant because it's in
1324 * a different file system. We apply fop_create to the
1325 * target itself instead of to the containing directory
1326 * and supply a null path name to indicate (conventionally)
1327 * the node itself as the "component" of interest.
1329 * The call to fop_create() is necessary to ensure
1330 * that the appropriate permission checks are made,
1331 * i.e. EISDIR, EACCES, etc. We already know that vpp
1332 * exists since we are in the else condition where this
1335 if (vp
->v_flag
& VROOT
) {
1336 ASSERT(why
!= CRMKDIR
);
1337 error
= fop_create(vp
, "", vap
, excl
, mode
, vpp
,
1338 CRED(), flag
, NULL
, NULL
);
1340 * If the create succeeded, it will have created a
1341 * new reference on a new vnode (*vpp) in the child
1342 * file system, so we want to drop our reference on
1343 * the old (vp) upon exit.
1349 * Large File API - non-large open (FOFFMAX flag not set)
1350 * of regular file fails if the file size exceeds MAXOFF32_T.
1352 if (why
!= CRMKDIR
&&
1353 !(flag
& FOFFMAX
) &&
1354 (vp
->v_type
== VREG
)) {
1355 vattr
.va_mask
= VATTR_SIZE
;
1356 if ((error
= fop_getattr(vp
, &vattr
, 0,
1360 if ((vattr
.va_size
> (uoff_t
)MAXOFF32_T
)) {
1369 * Call mkdir() if specified, otherwise create().
1371 int must_be_dir
= pn_fixslash(&pn
); /* trailing '/'? */
1375 * N.B., if vn_createat() ever requests
1376 * case-insensitive behavior then it will need
1377 * to be passed to fop_mkdir(). fop_create()
1378 * will already get it via "flag"
1380 error
= fop_mkdir(dvp
, pn
.pn_path
, vap
, vpp
, CRED(),
1382 else if (!must_be_dir
)
1383 error
= fop_create(dvp
, pn
.pn_path
, vap
,
1384 excl
, mode
, vpp
, CRED(), flag
, NULL
, NULL
);
1402 * The following clause was added to handle a problem
1403 * with NFS consistency. It is possible that a lookup
1404 * of the file to be created succeeded, but the file
1405 * itself doesn't actually exist on the server. This
1406 * is chiefly due to the DNLC containing an entry for
1407 * the file which has been removed on the server. In
1408 * this case, we just start over. If there was some
1409 * other cause for the ESTALE error, then the lookup
1410 * of the file will fail and the error will be returned
1411 * above instead of looping around from here.
1413 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1419 vn_link(char *from
, char *to
, enum uio_seg seg
)
1421 return (vn_linkat(NULL
, from
, NO_FOLLOW
, NULL
, to
, seg
));
1425 vn_linkat(vnode_t
*fstartvp
, char *from
, enum symfollow follow
,
1426 vnode_t
*tstartvp
, char *to
, enum uio_seg seg
)
1428 struct vnode
*fvp
; /* from vnode ptr */
1429 struct vnode
*tdvp
; /* to directory vnode ptr */
1434 int estale_retry
= 0;
1438 if (error
= pn_get(to
, seg
, &pn
))
1440 if (error
= lookupnameat(from
, seg
, follow
, NULLVPP
, &fvp
, fstartvp
))
1442 if (error
= lookuppnat(&pn
, NULL
, NO_FOLLOW
, &tdvp
, NULLVPP
, tstartvp
))
1445 * Make sure both source vnode and target directory vnode are
1446 * in the same vfs and that it is writeable.
1448 vattr
.va_mask
= VATTR_FSID
;
1449 if (error
= fop_getattr(fvp
, &vattr
, 0, CRED(), NULL
))
1451 fsid
= vattr
.va_fsid
;
1452 vattr
.va_mask
= VATTR_FSID
;
1453 if (error
= fop_getattr(tdvp
, &vattr
, 0, CRED(), NULL
))
1455 if (fsid
!= vattr
.va_fsid
) {
1459 if (tdvp
->v_vfsp
->vfs_flag
& VFS_RDONLY
) {
1466 (void) pn_fixslash(&pn
);
1467 error
= fop_link(tdvp
, fvp
, pn
.pn_path
, CRED(), NULL
, 0);
1474 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1480 vn_rename(char *from
, char *to
, enum uio_seg seg
)
1482 return (vn_renameat(NULL
, from
, NULL
, to
, seg
));
1486 vn_renameat(vnode_t
*fdvp
, char *fname
, vnode_t
*tdvp
,
1487 char *tname
, enum uio_seg seg
)
1491 struct pathname fpn
; /* from pathname */
1492 struct pathname tpn
; /* to pathname */
1494 int in_crit_src
, in_crit_targ
;
1495 vnode_t
*fromvp
, *fvp
;
1496 vnode_t
*tovp
, *targvp
;
1497 int estale_retry
= 0;
1500 fvp
= fromvp
= tovp
= targvp
= NULL
;
1501 in_crit_src
= in_crit_targ
= 0;
1503 * Get to and from pathnames.
1505 if (error
= pn_get(fname
, seg
, &fpn
))
1507 if (error
= pn_get(tname
, seg
, &tpn
)) {
1513 * First we need to resolve the correct directories
1514 * The passed in directories may only be a starting point,
1515 * but we need the real directories the file(s) live in.
1516 * For example the fname may be something like usr/lib/sparc
1517 * and we were passed in the / directory, but we need to
1518 * use the lib directory for the rename.
1522 * Lookup to and from directories.
1524 if (error
= lookuppnat(&fpn
, NULL
, NO_FOLLOW
, &fromvp
, &fvp
, fdvp
)) {
1529 * Make sure there is an entry.
1536 if (error
= lookuppnat(&tpn
, NULL
, NO_FOLLOW
, &tovp
, &targvp
, tdvp
)) {
1541 * Make sure both the from vnode directory and the to directory
1542 * are in the same vfs and the to directory is writable.
1543 * We check fsid's, not vfs pointers, so loopback fs works.
1545 if (fromvp
!= tovp
) {
1546 vattr
.va_mask
= VATTR_FSID
;
1547 if (error
= fop_getattr(fromvp
, &vattr
, 0, CRED(), NULL
))
1549 fsid
= vattr
.va_fsid
;
1550 vattr
.va_mask
= VATTR_FSID
;
1551 if (error
= fop_getattr(tovp
, &vattr
, 0, CRED(), NULL
))
1553 if (fsid
!= vattr
.va_fsid
) {
1559 if (tovp
->v_vfsp
->vfs_flag
& VFS_RDONLY
) {
1565 * Make sure "from" vp is not a mount point.
1566 * Note, lookup did traverse() already, so
1567 * we'll be looking at the mounted FS root.
1568 * (but allow files like mnttab)
1570 if ((fvp
->v_flag
& VROOT
) != 0 && fvp
->v_type
== VDIR
) {
1575 if (targvp
&& (fvp
!= targvp
)) {
1576 nbl_start_crit(targvp
, RW_READER
);
1578 if (nbl_conflict(targvp
, NBL_REMOVE
, 0, 0, 0, NULL
)) {
1584 if (nbl_need_check(fvp
)) {
1585 nbl_start_crit(fvp
, RW_READER
);
1587 if (nbl_conflict(fvp
, NBL_RENAME
, 0, 0, 0, NULL
)) {
1596 (void) pn_fixslash(&tpn
);
1597 error
= fop_rename(fromvp
, fpn
.pn_path
, tovp
, tpn
.pn_path
, CRED(),
1606 nbl_end_crit(targvp
);
1615 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1621 * Remove a file or directory.
1624 vn_remove(char *fnamep
, enum uio_seg seg
, enum rm dirflag
)
1626 return (vn_removeat(NULL
, fnamep
, seg
, dirflag
));
1630 vn_removeat(vnode_t
*startvp
, char *fnamep
, enum uio_seg seg
, enum rm dirflag
)
1632 struct vnode
*vp
; /* entry vnode */
1633 struct vnode
*dvp
; /* ptr to parent dir vnode */
1634 struct vnode
*coveredvp
;
1635 struct pathname pn
; /* name of entry */
1639 struct vfs
*dvfsp
; /* ptr to parent dir vfs */
1641 int estale_retry
= 0;
1644 if (error
= pn_get(fnamep
, seg
, &pn
))
1647 if (error
= lookuppnat(&pn
, NULL
, NO_FOLLOW
, &dvp
, &vp
, startvp
)) {
1649 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1655 * Make sure there is an entry.
1663 dvfsp
= dvp
->v_vfsp
;
1666 * If the named file is the root of a mounted filesystem, fail,
1667 * unless it's marked unlinkable. In that case, unmount the
1668 * filesystem and proceed to unlink the covered vnode. (If the
1669 * covered vnode is a directory, use rmdir instead of unlink,
1670 * to avoid file system corruption.)
1672 if (vp
->v_flag
& VROOT
) {
1673 if ((vfsp
->vfs_flag
& VFS_UNLINKABLE
) == 0) {
1679 * Namefs specific code starts here.
1682 if (dirflag
== RMDIRECTORY
) {
1684 * User called rmdir(2) on a file that has
1685 * been namefs mounted on top of. Since
1686 * namefs doesn't allow directories to
1687 * be mounted on other files we know
1688 * vp is not of type VDIR so fail to operation.
1695 * If VROOT is still set after grabbing vp->v_lock,
1696 * noone has finished nm_unmount so far and coveredvp
1698 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1699 * vp->v_lock, any race window is eliminated.
1702 mutex_enter(&vp
->v_lock
);
1703 if ((vp
->v_flag
& VROOT
) == 0) {
1704 /* Someone beat us to the unmount */
1705 mutex_exit(&vp
->v_lock
);
1710 coveredvp
= vfsp
->vfs_vnodecovered
;
1713 * Note: Implementation of vn_vfswlock shows that ordering of
1714 * v_lock / vn_vfswlock is not an issue here.
1716 error
= vn_vfswlock(coveredvp
);
1717 mutex_exit(&vp
->v_lock
);
1724 error
= dounmount(vfsp
, 0, CRED());
1727 * Unmounted the namefs file system; now get
1728 * the object it was mounted over.
1732 * If namefs was mounted over a directory, then
1733 * we want to use rmdir() instead of unlink().
1735 if (vp
->v_type
== VDIR
)
1736 dirflag
= RMDIRECTORY
;
1743 * Make sure filesystem is writeable.
1744 * We check the parent directory's vfs in case this is an lofs vnode.
1746 if (dvfsp
&& dvfsp
->vfs_flag
& VFS_RDONLY
) {
1754 * If there is the possibility of an nbmand share reservation, make
1755 * sure it's okay to remove the file. Keep a reference to the
1756 * vnode, so that we can exit the nbl critical region after
1757 * calling fop_remove.
1758 * If there is no possibility of an nbmand share reservation,
1759 * release the vnode reference now. Filesystems like NFS may
1760 * behave differently if there is an extra reference, so get rid of
1761 * this one. Fortunately, we can't have nbmand mounts on NFS
1764 if (nbl_need_check(vp
)) {
1765 nbl_start_crit(vp
, RW_READER
);
1767 if (nbl_conflict(vp
, NBL_REMOVE
, 0, 0, 0, NULL
)) {
1776 if (dirflag
== RMDIRECTORY
) {
1778 * Caller is using rmdir(2), which can only be applied to
1781 if (vtype
!= VDIR
) {
1785 proc_t
*pp
= curproc
;
1787 mutex_enter(&pp
->p_lock
);
1788 cwd
= PTOU(pp
)->u_cdir
;
1790 mutex_exit(&pp
->p_lock
);
1791 error
= fop_rmdir(dvp
, pn
.pn_path
, cwd
, CRED(),
1797 * Unlink(2) can be applied to anything.
1799 error
= fop_remove(dvp
, pn
.pn_path
, CRED(), NULL
, 0);
1812 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1818 * Utility function to compare equality of vnodes.
1819 * Compare the underlying real vnodes, if there are underlying vnodes.
1820 * This is a more thorough comparison than the VN_CMP() macro provides.
1823 vn_compare(vnode_t
*vp1
, vnode_t
*vp2
)
1827 if (vp1
!= NULL
&& fop_realvp(vp1
, &realvp
, NULL
) == 0)
1829 if (vp2
!= NULL
&& fop_realvp(vp2
, &realvp
, NULL
) == 0)
1831 return (VN_CMP(vp1
, vp2
));
1835 * The number of locks to hash into. This value must be a power
1836 * of 2 minus 1 and should probably also be prime.
1838 #define NUM_BUCKETS 1023
1840 struct vn_vfslocks_bucket
{
1842 vn_vfslocks_entry_t
*vb_list
;
1843 char pad
[64 - sizeof (kmutex_t
) - sizeof (void *)];
1847 * Total number of buckets will be NUM_BUCKETS + 1 .
1850 #pragma align 64(vn_vfslocks_buckets)
1851 static struct vn_vfslocks_bucket vn_vfslocks_buckets
[NUM_BUCKETS
+ 1];
1853 #define VN_VFSLOCKS_SHIFT 9
1855 #define VN_VFSLOCKS_HASH(vfsvpptr) \
1856 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
1859 * vn_vfslocks_getlock() uses an HASH scheme to generate
1860 * rwstlock using vfs/vnode pointer passed to it.
1862 * vn_vfslocks_rele() releases a reference in the
1863 * HASH table which allows the entry allocated by
1864 * vn_vfslocks_getlock() to be freed at a later
1865 * stage when the refcount drops to zero.
1868 vn_vfslocks_entry_t
*
1869 vn_vfslocks_getlock(void *vfsvpptr
)
1871 struct vn_vfslocks_bucket
*bp
;
1872 vn_vfslocks_entry_t
*vep
;
1873 vn_vfslocks_entry_t
*tvep
;
1875 ASSERT(vfsvpptr
!= NULL
);
1876 bp
= &vn_vfslocks_buckets
[VN_VFSLOCKS_HASH(vfsvpptr
)];
1878 mutex_enter(&bp
->vb_lock
);
1879 for (vep
= bp
->vb_list
; vep
!= NULL
; vep
= vep
->ve_next
) {
1880 if (vep
->ve_vpvfs
== vfsvpptr
) {
1882 mutex_exit(&bp
->vb_lock
);
1886 mutex_exit(&bp
->vb_lock
);
1887 vep
= kmem_alloc(sizeof (*vep
), KM_SLEEP
);
1888 rwst_init(&vep
->ve_lock
, NULL
, RW_DEFAULT
, NULL
);
1889 vep
->ve_vpvfs
= (char *)vfsvpptr
;
1891 mutex_enter(&bp
->vb_lock
);
1892 for (tvep
= bp
->vb_list
; tvep
!= NULL
; tvep
= tvep
->ve_next
) {
1893 if (tvep
->ve_vpvfs
== vfsvpptr
) {
1895 mutex_exit(&bp
->vb_lock
);
1898 * There is already an entry in the hash
1899 * destroy what we just allocated.
1901 rwst_destroy(&vep
->ve_lock
);
1902 kmem_free(vep
, sizeof (*vep
));
1906 vep
->ve_next
= bp
->vb_list
;
1908 mutex_exit(&bp
->vb_lock
);
1913 vn_vfslocks_rele(vn_vfslocks_entry_t
*vepent
)
1915 struct vn_vfslocks_bucket
*bp
;
1916 vn_vfslocks_entry_t
*vep
;
1917 vn_vfslocks_entry_t
*pvep
;
1919 ASSERT(vepent
!= NULL
);
1920 ASSERT(vepent
->ve_vpvfs
!= NULL
);
1922 bp
= &vn_vfslocks_buckets
[VN_VFSLOCKS_HASH(vepent
->ve_vpvfs
)];
1924 mutex_enter(&bp
->vb_lock
);
1925 vepent
->ve_refcnt
--;
1927 if ((int32_t)vepent
->ve_refcnt
< 0)
1928 cmn_err(CE_PANIC
, "vn_vfslocks_rele: refcount negative");
1930 if (vepent
->ve_refcnt
== 0) {
1931 for (vep
= bp
->vb_list
; vep
!= NULL
; vep
= vep
->ve_next
) {
1932 if (vep
->ve_vpvfs
== vepent
->ve_vpvfs
) {
1933 if (bp
->vb_list
== vep
)
1934 bp
->vb_list
= vep
->ve_next
;
1937 pvep
->ve_next
= vep
->ve_next
;
1939 mutex_exit(&bp
->vb_lock
);
1940 rwst_destroy(&vep
->ve_lock
);
1941 kmem_free(vep
, sizeof (*vep
));
1946 cmn_err(CE_PANIC
, "vn_vfslocks_rele: vp/vfs not found");
1948 mutex_exit(&bp
->vb_lock
);
1952 * vn_vfswlock_wait is used to implement a lock which is logically a writers
1953 * lock protecting the v_vfsmountedhere field.
1954 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
1955 * except that it blocks to acquire the lock VVFSLOCK.
1957 * traverse() and routines re-implementing part of traverse (e.g. autofs)
1958 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
1959 * need the non-blocking version of the writers lock i.e. vn_vfswlock
1962 vn_vfswlock_wait(vnode_t
*vp
)
1965 vn_vfslocks_entry_t
*vpvfsentry
;
1968 vpvfsentry
= vn_vfslocks_getlock(vp
);
1969 retval
= rwst_enter_sig(&vpvfsentry
->ve_lock
, RW_WRITER
);
1971 if (retval
== EINTR
) {
1972 vn_vfslocks_rele(vpvfsentry
);
1979 vn_vfsrlock_wait(vnode_t
*vp
)
1982 vn_vfslocks_entry_t
*vpvfsentry
;
1985 vpvfsentry
= vn_vfslocks_getlock(vp
);
1986 retval
= rwst_enter_sig(&vpvfsentry
->ve_lock
, RW_READER
);
1988 if (retval
== EINTR
) {
1989 vn_vfslocks_rele(vpvfsentry
);
1998 * vn_vfswlock is used to implement a lock which is logically a writers lock
1999 * protecting the v_vfsmountedhere field.
2002 vn_vfswlock(vnode_t
*vp
)
2004 vn_vfslocks_entry_t
*vpvfsentry
;
2007 * If vp is NULL then somebody is trying to lock the covered vnode
2008 * of /. (vfs_vnodecovered is NULL for /). This situation will
2009 * only happen when unmounting /. Since that operation will fail
2010 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2015 vpvfsentry
= vn_vfslocks_getlock(vp
);
2017 if (rwst_tryenter(&vpvfsentry
->ve_lock
, RW_WRITER
))
2020 vn_vfslocks_rele(vpvfsentry
);
2025 vn_vfsrlock(vnode_t
*vp
)
2027 vn_vfslocks_entry_t
*vpvfsentry
;
2030 * If vp is NULL then somebody is trying to lock the covered vnode
2031 * of /. (vfs_vnodecovered is NULL for /). This situation will
2032 * only happen when unmounting /. Since that operation will fail
2033 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2038 vpvfsentry
= vn_vfslocks_getlock(vp
);
2040 if (rwst_tryenter(&vpvfsentry
->ve_lock
, RW_READER
))
2043 vn_vfslocks_rele(vpvfsentry
);
2048 vn_vfsunlock(vnode_t
*vp
)
2050 vn_vfslocks_entry_t
*vpvfsentry
;
2053 * ve_refcnt needs to be decremented twice.
2054 * 1. To release refernce after a call to vn_vfslocks_getlock()
2055 * 2. To release the reference from the locking routines like
2056 * vn_vfsrlock/vn_vfswlock etc,.
2058 vpvfsentry
= vn_vfslocks_getlock(vp
);
2059 vn_vfslocks_rele(vpvfsentry
);
2061 rwst_exit(&vpvfsentry
->ve_lock
);
2062 vn_vfslocks_rele(vpvfsentry
);
2066 vn_vfswlock_held(vnode_t
*vp
)
2069 vn_vfslocks_entry_t
*vpvfsentry
;
2073 vpvfsentry
= vn_vfslocks_getlock(vp
);
2074 held
= rwst_lock_held(&vpvfsentry
->ve_lock
, RW_WRITER
);
2076 vn_vfslocks_rele(vpvfsentry
);
2087 vn_cache_constructor(void *buf
, void *cdrarg
, int kmflags
)
2093 mutex_init(&vp
->v_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
2094 mutex_init(&vp
->v_vsd_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
2095 cv_init(&vp
->v_cv
, NULL
, CV_DEFAULT
, NULL
);
2096 rw_init(&vp
->v_nbllock
, NULL
, RW_DEFAULT
, NULL
);
2097 vp
->v_femhead
= NULL
; /* Must be done before vn_reinit() */
2098 vp
->v_path
= vn_vpath_empty
;
2099 vp
->v_path_stamp
= 0;
2100 vp
->v_mpssdata
= NULL
;
2102 vp
->v_fopdata
= NULL
;
2104 vmobject_init(&vp
->v_object
, vp
);
2111 vn_cache_destructor(void *buf
, void *cdrarg
)
2117 vmobject_fini(&vp
->v_object
);
2119 rw_destroy(&vp
->v_nbllock
);
2120 cv_destroy(&vp
->v_cv
);
2121 mutex_destroy(&vp
->v_vsd_lock
);
2122 mutex_destroy(&vp
->v_lock
);
2126 vn_create_cache(void)
2129 ASSERT((1 << VNODE_ALIGN_LOG2
) ==
2130 P2ROUNDUP(sizeof (struct vnode
), VNODE_ALIGN
));
2131 vn_cache
= kmem_cache_create("vn_cache", sizeof (struct vnode
),
2132 VNODE_ALIGN
, vn_cache_constructor
, vn_cache_destructor
, NULL
, NULL
,
2137 vn_destroy_cache(void)
2139 kmem_cache_destroy(vn_cache
);
2143 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2144 * cached by the file system and vnodes remain associated.
2147 vn_recycle(vnode_t
*vp
)
2149 ASSERT(!vn_has_cached_data(vp
));
2150 VERIFY(vp
->v_path
!= NULL
);
2153 * XXX - This really belongs in vn_reinit(), but we have some issues
2154 * with the counts. Best to have it here for clean initialization.
2158 vp
->v_mmap_read
= 0;
2159 vp
->v_mmap_write
= 0;
2162 * If FEM was in use, make sure everything gets cleaned up
2163 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2166 if (vp
->v_femhead
) {
2167 /* XXX - There should be a free_femhead() that does all this */
2168 ASSERT(vp
->v_femhead
->femh_list
== NULL
);
2169 mutex_destroy(&vp
->v_femhead
->femh_lock
);
2170 kmem_free(vp
->v_femhead
, sizeof (*(vp
->v_femhead
)));
2171 vp
->v_femhead
= NULL
;
2173 if (vp
->v_path
!= vn_vpath_empty
) {
2174 kmem_free(vp
->v_path
, strlen(vp
->v_path
) + 1);
2175 vp
->v_path
= vn_vpath_empty
;
2177 vp
->v_path_stamp
= 0;
2179 if (vp
->v_fopdata
!= NULL
) {
2182 vp
->v_mpssdata
= NULL
;
2187 * Used to reset the vnode fields including those that are directly accessible
2188 * as well as those which require an accessor function.
2190 * Does not initialize:
2191 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2192 * v_data (since FS-nodes and vnodes point to each other and should
2193 * be updated simultaneously)
2194 * v_op (in case someone needs to make a VOP call on this object)
2197 vn_reinit(vnode_t
*vp
)
2200 vp
->v_count_dnlc
= 0;
2202 vp
->v_stream
= NULL
;
2203 vp
->v_vfsmountedhere
= NULL
;
2208 vp
->v_filocks
= NULL
;
2209 vp
->v_shrlocks
= NULL
;
2210 VERIFY(!vn_has_cached_data(vp
));
2212 vp
->v_locality
= NULL
;
2213 vp
->v_xattrdir
= NULL
;
2216 * In a few specific instances, vn_reinit() is used to initialize
2217 * locally defined vnode_t instances. Lacking the construction offered
2218 * by vn_alloc(), these vnodes require v_path initialization.
2220 if (vp
->v_path
== NULL
) {
2221 vp
->v_path
= vn_vpath_empty
;
2224 /* Handles v_femhead, v_path, and the r/w/map counts */
2229 vn_alloc(int kmflag
)
2233 vp
= kmem_cache_alloc(vn_cache
, kmflag
);
2236 vp
->v_femhead
= NULL
; /* Must be done before vn_reinit() */
2237 vp
->v_fopdata
= NULL
;
2245 vn_free(vnode_t
*vp
)
2247 ASSERT(vp
->v_shrlocks
== NULL
);
2248 ASSERT(vp
->v_filocks
== NULL
);
2251 * Some file systems call vn_free() with v_count of zero,
2252 * some with v_count of 1. In any case, the value should
2253 * never be anything else.
2255 ASSERT((vp
->v_count
== 0) || (vp
->v_count
== 1));
2256 ASSERT(vp
->v_count_dnlc
== 0);
2257 VERIFY(vp
->v_path
!= NULL
);
2258 if (vp
->v_path
!= vn_vpath_empty
) {
2259 kmem_free(vp
->v_path
, strlen(vp
->v_path
) + 1);
2260 vp
->v_path
= vn_vpath_empty
;
2263 /* If FEM was in use, make sure everything gets cleaned up */
2264 if (vp
->v_femhead
) {
2265 /* XXX - There should be a free_femhead() that does all this */
2266 ASSERT(vp
->v_femhead
->femh_list
== NULL
);
2267 mutex_destroy(&vp
->v_femhead
->femh_lock
);
2268 kmem_free(vp
->v_femhead
, sizeof (*(vp
->v_femhead
)));
2269 vp
->v_femhead
= NULL
;
2272 if (vp
->v_fopdata
!= NULL
) {
2275 vp
->v_mpssdata
= NULL
;
2277 kmem_cache_free(vn_cache
, vp
);
2281 * vnode status changes, should define better states than 1, 0.
2284 vn_reclaim(vnode_t
*vp
)
2286 vfs_t
*vfsp
= vp
->v_vfsp
;
2289 vfsp
->vfs_implp
== NULL
|| vfsp
->vfs_femhead
== NULL
) {
2292 (void) VFS_VNSTATE(vfsp
, vp
, VNTRANS_RECLAIMED
);
2296 vn_idle(vnode_t
*vp
)
2298 vfs_t
*vfsp
= vp
->v_vfsp
;
2301 vfsp
->vfs_implp
== NULL
|| vfsp
->vfs_femhead
== NULL
) {
2304 (void) VFS_VNSTATE(vfsp
, vp
, VNTRANS_IDLED
);
2307 vn_exists(vnode_t
*vp
)
2309 vfs_t
*vfsp
= vp
->v_vfsp
;
2312 vfsp
->vfs_implp
== NULL
|| vfsp
->vfs_femhead
== NULL
) {
2315 (void) VFS_VNSTATE(vfsp
, vp
, VNTRANS_EXISTS
);
2319 vn_invalid(vnode_t
*vp
)
2321 vfs_t
*vfsp
= vp
->v_vfsp
;
2324 vfsp
->vfs_implp
== NULL
|| vfsp
->vfs_femhead
== NULL
) {
2327 (void) VFS_VNSTATE(vfsp
, vp
, VNTRANS_DESTROYED
);
2330 /* Vnode event notification */
2333 vnevent_support(vnode_t
*vp
, caller_context_t
*ct
)
2338 return (fop_vnevent(vp
, VE_SUPPORT
, NULL
, NULL
, ct
));
2342 vnevent_rename_src(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2344 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2347 (void) fop_vnevent(vp
, VE_RENAME_SRC
, dvp
, name
, ct
);
2351 vnevent_rename_dest(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2352 caller_context_t
*ct
)
2354 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2357 (void) fop_vnevent(vp
, VE_RENAME_DEST
, dvp
, name
, ct
);
2361 vnevent_rename_dest_dir(vnode_t
*vp
, caller_context_t
*ct
)
2363 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2366 (void) fop_vnevent(vp
, VE_RENAME_DEST_DIR
, NULL
, NULL
, ct
);
2370 vnevent_remove(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2372 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2375 (void) fop_vnevent(vp
, VE_REMOVE
, dvp
, name
, ct
);
2379 vnevent_rmdir(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2381 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2384 (void) fop_vnevent(vp
, VE_RMDIR
, dvp
, name
, ct
);
2388 vnevent_pre_rename_src(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2389 caller_context_t
*ct
)
2391 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2394 (void) fop_vnevent(vp
, VE_PRE_RENAME_SRC
, dvp
, name
, ct
);
2398 vnevent_pre_rename_dest(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2399 caller_context_t
*ct
)
2401 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2404 (void) fop_vnevent(vp
, VE_PRE_RENAME_DEST
, dvp
, name
, ct
);
2408 vnevent_pre_rename_dest_dir(vnode_t
*vp
, vnode_t
*nvp
, char *name
,
2409 caller_context_t
*ct
)
2411 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2414 (void) fop_vnevent(vp
, VE_PRE_RENAME_DEST_DIR
, nvp
, name
, ct
);
2418 vnevent_create(vnode_t
*vp
, caller_context_t
*ct
)
2420 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2423 (void) fop_vnevent(vp
, VE_CREATE
, NULL
, NULL
, ct
);
2427 vnevent_link(vnode_t
*vp
, caller_context_t
*ct
)
2429 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2432 (void) fop_vnevent(vp
, VE_LINK
, NULL
, NULL
, ct
);
2436 vnevent_mountedover(vnode_t
*vp
, caller_context_t
*ct
)
2438 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2441 (void) fop_vnevent(vp
, VE_MOUNTEDOVER
, NULL
, NULL
, ct
);
2445 vnevent_truncate(vnode_t
*vp
, caller_context_t
*ct
)
2447 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2450 (void) fop_vnevent(vp
, VE_TRUNCATE
, NULL
, NULL
, ct
);
2458 vn_is_readonly(vnode_t
*vp
)
2460 return (vp
->v_vfsp
->vfs_flag
& VFS_RDONLY
);
2464 vn_has_flocks(vnode_t
*vp
)
2466 return (vp
->v_filocks
!= NULL
);
2470 vn_has_mandatory_locks(vnode_t
*vp
, int mode
)
2472 return ((vp
->v_filocks
!= NULL
) && (MANDLOCK(vp
, mode
)));
2476 vn_has_cached_data(vnode_t
*vp
)
2478 return (!list_is_empty(&vp
->v_object
.list
));
2482 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2486 vn_can_change_zones(vnode_t
*vp
)
2492 if (nfs_global_client_only
!= 0)
2496 * We always want to look at the underlying vnode if there is one.
2498 if (fop_realvp(vp
, &rvp
, NULL
) != 0)
2501 * Some pseudo filesystems (including doorfs) don't actually register
2502 * their vfsops_t, so the following may return NULL; we happily let
2503 * such vnodes switch zones.
2505 vswp
= vfs_getvfsswbyvfsops(vfs_getops(rvp
->v_vfsp
));
2507 if (vswp
->vsw_flag
& VSW_NOTZONESAFE
)
2509 vfs_unrefvfssw(vswp
);
2515 * Return nonzero if the vnode is a mount point, zero if not.
2518 vn_ismntpt(vnode_t
*vp
)
2520 return (vp
->v_vfsmountedhere
!= NULL
);
2523 /* Retrieve the vfs (if any) mounted on this vnode */
2525 vn_mountedvfs(vnode_t
*vp
)
2527 return (vp
->v_vfsmountedhere
);
2531 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2534 vn_in_dnlc(vnode_t
*vp
)
2536 return (vp
->v_count_dnlc
> 0);
2540 * vn_has_other_opens() checks whether a particular file is opened by more than
2541 * just the caller and whether the open is for read and/or write.
2542 * This routine is for calling after the caller has already called fop_open()
2543 * and the caller wishes to know if they are the only one with it open for
2544 * the mode(s) specified.
2546 * Vnode counts are only kept on regular files (v_type=VREG).
2549 vn_has_other_opens(struct vnode
*vp
, v_mode_t mode
)
2555 if (vp
->v_wrcnt
> 1)
2559 if ((vp
->v_rdcnt
> 1) || (vp
->v_wrcnt
> 1))
2563 if ((vp
->v_rdcnt
> 1) && (vp
->v_wrcnt
> 1))
2567 if (vp
->v_rdcnt
> 1)
2576 * vn_is_opened() checks whether a particular file is opened and
2577 * whether the open is for read and/or write.
2579 * Vnode counts are only kept on regular files (v_type=VREG).
2581 bool vn_is_opened(struct vnode
*vp
, v_mode_t mode
)
2591 if (vp
->v_rdcnt
&& vp
->v_wrcnt
)
2595 if (vp
->v_rdcnt
|| vp
->v_wrcnt
)
2608 * vn_is_mapped() checks whether a particular file is mapped and whether
2609 * the file is mapped read and/or write.
2611 bool vn_is_mapped(struct vnode
*vp
, v_mode_t mode
)
2618 * The atomic_add_64_nv functions force atomicity in the
2619 * case of 32 bit architectures. Otherwise the 64 bit values
2620 * require two fetches. The value of the fields may be
2621 * (potentially) changed between the first fetch and the
2625 if (atomic_add_64_nv((&(vp
->v_mmap_write
)), 0))
2629 if ((atomic_add_64_nv((&(vp
->v_mmap_read
)), 0)) &&
2630 (atomic_add_64_nv((&(vp
->v_mmap_write
)), 0)))
2634 if ((atomic_add_64_nv((&(vp
->v_mmap_read
)), 0)) ||
2635 (atomic_add_64_nv((&(vp
->v_mmap_write
)), 0)))
2639 if (atomic_add_64_nv((&(vp
->v_mmap_read
)), 0))
2646 if (vp
->v_mmap_write
)
2650 if (vp
->v_mmap_read
&& vp
->v_mmap_write
)
2654 if (vp
->v_mmap_read
|| vp
->v_mmap_write
)
2658 if (vp
->v_mmap_read
)
2668 * Set the operations vector for a vnode.
2671 vn_setops(struct vnode
*vnode
, const struct vnodeops
*ops
)
2677 * Retrieve the operations vector for a vnode
2679 const struct vnodeops
*
2680 vn_getops(struct vnode
*vnode
)
/*
 * Returns non-zero (1) if the vnodeops matches that of the vnode.
 * Returns zero (0) if not.
 */
int
vn_matchops(struct vnode *vp, const struct vnodeops *vnodeops)
{
	return (vn_getops(vp) == vnodeops);
}
2696 * fs_new_caller_id() needs to return a unique ID on a given local system.
2697 * The IDs do not need to survive across reboots. These are primarily
2698 * used so that (FEM) monitors can detect particular callers (such as
2699 * the NFS server) to a given vnode/vfs operation.
2704 static uint64_t next_caller_id
= 0LL; /* First call returns 1 */
2706 return ((u_longlong_t
)atomic_inc_64_nv(&next_caller_id
));
2710 * The value stored in v_path is relative to rootdir, located in the global
2711 * zone. Zones or chroot environments which reside deeper inside the VFS
2712 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
2713 * what lies below their perceived root. In order to keep v_path usable for
2714 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
2716 * An upper bound of max_vnode_path is placed upon v_path allocations to
2717 * prevent the system from going too wild at the behest of pathological
2718 * behavior from the operator.
2720 size_t max_vnode_path
= 4 * MAXPATHLEN
;
2724 vn_clearpath(vnode_t
*vp
, hrtime_t compare_stamp
)
2728 mutex_enter(&vp
->v_lock
);
2730 * If the snapshot of v_path_stamp passed in via compare_stamp does not
2731 * match the present value on the vnode, it indicates that subsequent
2732 * changes have occurred. The v_path value is not cleared in this case
2733 * since the new value may be valid.
2735 if (compare_stamp
!= 0 && vp
->v_path_stamp
!= compare_stamp
) {
2736 mutex_exit(&vp
->v_lock
);
2740 vp
->v_path
= vn_vpath_empty
;
2741 vp
->v_path_stamp
= 0;
2742 mutex_exit(&vp
->v_lock
);
2743 if (buf
!= vn_vpath_empty
) {
2744 kmem_free(buf
, strlen(buf
) + 1);
2749 vn_setpath_common(vnode_t
*pvp
, vnode_t
*vp
, const char *name
, size_t len
,
2750 boolean_t is_rename
)
2754 size_t baselen
, buflen
= 0;
2756 /* Handle the vn_setpath_str case. */
2758 if (len
+ 1 > max_vnode_path
) {
2759 DTRACE_PROBE4(vn__setpath__too__long
, vnode_t
*, pvp
,
2760 vnode_t
*, vp
, char *, name
, size_t, len
+ 1);
2763 buf
= kmem_alloc(len
+ 1, KM_SLEEP
);
2764 bcopy(name
, buf
, len
);
2767 mutex_enter(&vp
->v_lock
);
2768 oldbuf
= vp
->v_path
;
2770 vp
->v_path_stamp
= gethrtime();
2771 mutex_exit(&vp
->v_lock
);
2772 if (oldbuf
!= vn_vpath_empty
) {
2773 kmem_free(oldbuf
, strlen(oldbuf
) + 1);
2778 /* Take snapshot of parent dir */
2779 mutex_enter(&pvp
->v_lock
);
2781 if ((pvp
->v_flag
& VTRAVERSE
) != 0) {
2783 * When the parent vnode has VTRAVERSE set in its flags, normal
2784 * assumptions about v_path calculation no longer apply. The
2785 * primary situation where this occurs is via the VFS tricks
2786 * which procfs plays in order to allow /proc/PID/(root|cwd) to
2787 * yield meaningful results.
2789 * When this flag is set, v_path on the child must not be
2790 * updated since the calculated value is likely to be
2791 * incorrect, given the current context.
2793 mutex_exit(&pvp
->v_lock
);
2798 if (pvp
->v_path
== vn_vpath_empty
) {
2800 * Without v_path from the parent directory, generating a child
2801 * path from the name is impossible.
2804 pstamp
= pvp
->v_path_stamp
;
2805 mutex_exit(&pvp
->v_lock
);
2806 vn_clearpath(vp
, pstamp
);
2811 * The only feasible case here is where a NUL lookup is being
2812 * performed on rootdir prior to its v_path being populated.
2814 ASSERT(pvp
->v_path_stamp
== 0);
2818 pstamp
= pvp
->v_path_stamp
;
2819 baselen
= strlen(pvp
->v_path
);
2820 /* ignore a trailing slash if present */
2821 if (pvp
->v_path
[baselen
- 1] == '/') {
2822 /* This should only the be case for rootdir */
2823 ASSERT(baselen
== 1 && pvp
== rootdir
);
2827 mutex_exit(&pvp
->v_lock
);
2830 /* Free the existing (mis-sized) buffer in case of retry */
2831 kmem_free(buf
, buflen
);
2833 /* base, '/', name and trailing NUL */
2834 buflen
= baselen
+ len
+ 2;
2835 if (buflen
> max_vnode_path
) {
2836 DTRACE_PROBE4(vn__setpath_too__long
, vnode_t
*, pvp
,
2837 vnode_t
*, vp
, char *, name
, size_t, buflen
);
2840 buf
= kmem_alloc(buflen
, KM_SLEEP
);
2842 mutex_enter(&pvp
->v_lock
);
2843 if (pvp
->v_path_stamp
!= pstamp
) {
2847 * Since v_path_stamp changed on the parent, it is likely that
2848 * v_path has been altered as well. If the length does not
2849 * exactly match what was previously measured, the buffer
2850 * allocation must be repeated for proper sizing.
2852 if (pvp
->v_path
== vn_vpath_empty
) {
2853 /* Give up if parent lack v_path */
2854 mutex_exit(&pvp
->v_lock
);
2855 kmem_free(buf
, buflen
);
2858 vlen
= strlen(pvp
->v_path
);
2859 if (pvp
->v_path
[vlen
- 1] == '/') {
2862 if (vlen
!= baselen
) {
2866 bcopy(pvp
->v_path
, buf
, baselen
);
2867 mutex_exit(&pvp
->v_lock
);
2871 bcopy(name
, &buf
[baselen
], len
+ 1);
2873 mutex_enter(&vp
->v_lock
);
2874 if (vp
->v_path_stamp
== 0) {
2875 /* never-visited vnode can inherit stamp from parent */
2876 ASSERT(vp
->v_path
== vn_vpath_empty
);
2877 vp
->v_path_stamp
= pstamp
;
2879 mutex_exit(&vp
->v_lock
);
2880 } else if (vp
->v_path_stamp
< pstamp
|| is_rename
) {
2882 * Install the updated path and stamp, ensuring that the v_path
2883 * pointer is valid at all times for dtrace.
2885 oldbuf
= vp
->v_path
;
2887 vp
->v_path_stamp
= gethrtime();
2888 mutex_exit(&vp
->v_lock
);
2889 kmem_free(oldbuf
, strlen(oldbuf
) + 1);
2892 * If the timestamp matches or is greater, it means another
2893 * thread performed the update first while locks were dropped
2894 * here to make the allocation. We defer to the newer value.
2896 mutex_exit(&vp
->v_lock
);
2897 kmem_free(buf
, buflen
);
2899 ASSERT(MUTEX_NOT_HELD(&vp
->v_lock
));
2903 vn_updatepath(vnode_t
*pvp
, vnode_t
*vp
, const char *name
)
2908 * If the parent is older or empty, there's nothing further to do.
2910 if (pvp
->v_path
== vn_vpath_empty
||
2911 pvp
->v_path_stamp
<= vp
->v_path_stamp
) {
2916 * Given the lack of appropriate context, meaningful updates to v_path
2917 * cannot be made for during lookups for the '.' or '..' entries.
2920 if (len
== 0 || (len
== 1 && name
[0] == '.') ||
2921 (len
== 2 && name
[0] == '.' && name
[1] == '.')) {
2925 vn_setpath_common(pvp
, vp
, name
, len
, B_FALSE
);
2929 * Given a starting vnode and a path, updates the path in the target vnode in
2930 * a safe manner. If the vnode already has path information embedded, then the
2931 * cached path is left untouched.
2935 vn_setpath(vnode_t
*rootvp
, vnode_t
*pvp
, vnode_t
*vp
, const char *name
,
2938 vn_setpath_common(pvp
, vp
, name
, len
, B_FALSE
);
2942 * Sets the path to the vnode to be the given string, regardless of current
2943 * context. The string must be a complete path from rootdir. This is only used
2944 * by fsop_root() for setting the path based on the mountpoint.
2947 vn_setpath_str(vnode_t
*vp
, const char *str
, size_t len
)
2949 vn_setpath_common(NULL
, vp
, str
, len
, B_FALSE
);
2953 * Called from within filesystem's vop_rename() to handle renames once the
2954 * target vnode is available.
2957 vn_renamepath(vnode_t
*pvp
, vnode_t
*vp
, const char *name
, size_t len
)
2959 vn_setpath_common(pvp
, vp
, name
, len
, B_TRUE
);
2963 * Similar to vn_setpath_str(), this function sets the path of the destination
2964 * vnode to the be the same as the source vnode.
2967 vn_copypath(struct vnode
*src
, struct vnode
*dst
)
2973 mutex_enter(&src
->v_lock
);
2974 if (src
->v_path
== vn_vpath_empty
) {
2975 mutex_exit(&src
->v_lock
);
2978 buflen
= strlen(src
->v_path
) + 1;
2979 mutex_exit(&src
->v_lock
);
2981 buf
= kmem_alloc(buflen
, KM_SLEEP
);
2983 mutex_enter(&src
->v_lock
);
2984 if (src
->v_path
== vn_vpath_empty
||
2985 strlen(src
->v_path
) + 1 != buflen
) {
2986 mutex_exit(&src
->v_lock
);
2987 kmem_free(buf
, buflen
);
2990 bcopy(src
->v_path
, buf
, buflen
);
2991 stamp
= src
->v_path_stamp
;
2992 mutex_exit(&src
->v_lock
);
2994 mutex_enter(&dst
->v_lock
);
2995 if (dst
->v_path
!= vn_vpath_empty
) {
2996 mutex_exit(&dst
->v_lock
);
2997 kmem_free(buf
, buflen
);
3001 dst
->v_path_stamp
= stamp
;
3002 mutex_exit(&dst
->v_lock
);
3007 * XXX Private interface for segvn routines that handle vnode
3008 * large page segments.
3010 * return 1 if vp's file system fop_pageio() implementation
3011 * can be safely used instead of fop_getpage() for handling
3012 * pagefaults against regular non swap files. fop_pageio()
3013 * interface is considered safe here if its implementation
3014 * is very close to fop_getpage() implementation.
3015 * e.g. It zero's out the part of the page beyond EOF. Doesn't
3016 * panic if there're file holes but instead returns an error.
3017 * Doesn't assume file won't be changed by user writes, etc.
3019 * return 0 otherwise.
3021 * For now allow segvn to only use fop_pageio() with ufs and nfs.
3024 vn_vmpss_usepageio(vnode_t
*vp
)
3026 vfs_t
*vfsp
= vp
->v_vfsp
;
3027 char *fsname
= vfssw
[vfsp
->vfs_fstype
].vsw_name
;
3028 char *pageio_ok_fss
[] = {"ufs", "nfs", NULL
};
3029 char **fsok
= pageio_ok_fss
;
3031 if (fsname
== NULL
) {
3035 for (; *fsok
; fsok
++) {
3036 if (strcmp(*fsok
, fsname
) == 0) {
3043 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3050 caller_context_t
*ct
)
3057 * Adding to the vnode counts before calling open
3058 * avoids the need for a mutex. It circumvents a race
3059 * condition where a query made on the vnode counts results in a
3060 * false negative. The inquirer goes away believing the file is
3061 * not open when there is an open on the file already under way.
3063 * The counts are meant to prevent NFS from granting a delegation
3064 * when it would be dangerous to do so.
3066 * The vnode counts are only kept on regular files
3068 if ((*vpp
)->v_type
== VREG
) {
3070 atomic_inc_32(&(*vpp
)->v_rdcnt
);
3072 atomic_inc_32(&(*vpp
)->v_wrcnt
);
3075 VOPXID_MAP_CR(vp
, cr
);
3077 ret
= fop_open_dispatch(vpp
, mode
, cr
, ct
, true);
3081 * Use the saved vp just in case the vnode ptr got trashed
3084 VOPSTATS_UPDATE(vp
, open
);
3085 if ((vp
->v_type
== VREG
) && (mode
& FREAD
))
3086 atomic_dec_32(&vp
->v_rdcnt
);
3087 if ((vp
->v_type
== VREG
) && (mode
& FWRITE
))
3088 atomic_dec_32(&vp
->v_wrcnt
);
3091 * Some filesystems will return a different vnode,
3092 * but the same path was still used to open it.
3093 * So if we do change the vnode and need to
3094 * copy over the path, do so here, rather than special
3095 * casing each filesystem. Adjust the vnode counts to
3096 * reflect the vnode switch.
3098 VOPSTATS_UPDATE(*vpp
, open
);
3099 if (*vpp
!= vp
&& *vpp
!= NULL
) {
3100 vn_copypath(vp
, *vpp
);
3101 if (((*vpp
)->v_type
== VREG
) && (mode
& FREAD
))
3102 atomic_inc_32(&(*vpp
)->v_rdcnt
);
3103 if ((vp
->v_type
== VREG
) && (mode
& FREAD
))
3104 atomic_dec_32(&vp
->v_rdcnt
);
3105 if (((*vpp
)->v_type
== VREG
) && (mode
& FWRITE
))
3106 atomic_inc_32(&(*vpp
)->v_wrcnt
);
3107 if ((vp
->v_type
== VREG
) && (mode
& FWRITE
))
3108 atomic_dec_32(&vp
->v_wrcnt
);
3122 caller_context_t
*ct
)
3126 VOPXID_MAP_CR(vp
, cr
);
3128 err
= fop_close_dispatch(vp
, flag
, count
, offset
, cr
, ct
, true);
3130 VOPSTATS_UPDATE(vp
, close
);
3132 * Check passed in count to handle possible dups. Vnode counts are only
3133 * kept on regular files
3135 if ((vp
->v_type
== VREG
) && (count
== 1)) {
3137 ASSERT(vp
->v_rdcnt
> 0);
3138 atomic_dec_32(&vp
->v_rdcnt
);
3140 if (flag
& FWRITE
) {
3141 ASSERT(vp
->v_wrcnt
> 0);
3142 atomic_dec_32(&vp
->v_wrcnt
);
3154 caller_context_t
*ct
)
3157 ssize_t resid_start
= uiop
->uio_resid
;
3159 VOPXID_MAP_CR(vp
, cr
);
3161 err
= fop_read_dispatch(vp
, uiop
, ioflag
, cr
, ct
, true);
3163 VOPSTATS_UPDATE_IO(vp
, read
,
3164 read_bytes
, (resid_start
- uiop
->uio_resid
));
3174 caller_context_t
*ct
)
3177 ssize_t resid_start
= uiop
->uio_resid
;
3179 VOPXID_MAP_CR(vp
, cr
);
3181 err
= fop_write_dispatch(vp
, uiop
, ioflag
, cr
, ct
, true);
3183 VOPSTATS_UPDATE_IO(vp
, write
,
3184 write_bytes
, (resid_start
- uiop
->uio_resid
));
3196 caller_context_t
*ct
)
3200 VOPXID_MAP_CR(vp
, cr
);
3202 err
= fop_ioctl_dispatch(vp
, cmd
, arg
, flag
, cr
, rvalp
, ct
, true);
3204 VOPSTATS_UPDATE(vp
, ioctl
);
3214 caller_context_t
*ct
)
3218 VOPXID_MAP_CR(vp
, cr
);
3220 err
= fop_setfl_dispatch(vp
, oflags
, nflags
, cr
, ct
, true);
3222 VOPSTATS_UPDATE(vp
, setfl
);
3232 caller_context_t
*ct
)
3236 VOPXID_MAP_CR(vp
, cr
);
3239 * If this file system doesn't understand the xvattr extensions
3240 * then turn off the xvattr bit.
3242 if (vfs_has_feature(vp
->v_vfsp
, VFSFT_XVATTR
) == 0) {
3243 vap
->va_mask
&= ~VATTR_XVATTR
;
3247 * We're only allowed to skip the ACL check iff we used a 32 bit
3248 * ACE mask with fop_access() to determine permissions.
3250 if ((flags
& ATTR_NOACLCHECK
) &&
3251 vfs_has_feature(vp
->v_vfsp
, VFSFT_ACEMASKONACCESS
) == 0)
3254 err
= fop_getattr_dispatch(vp
, vap
, flags
, cr
, ct
, true);
3256 VOPSTATS_UPDATE(vp
, getattr
);
3266 caller_context_t
*ct
)
3270 VOPXID_MAP_CR(vp
, cr
);
3273 * If this file system doesn't understand the xvattr extensions
3274 * then turn off the xvattr bit.
3276 if (vfs_has_feature(vp
->v_vfsp
, VFSFT_XVATTR
) == 0) {
3277 vap
->va_mask
&= ~VATTR_XVATTR
;
3281 * We're only allowed to skip the ACL check iff we used a 32 bit
3282 * ACE mask with fop_access() to determine permissions.
3284 if ((flags
& ATTR_NOACLCHECK
) &&
3285 vfs_has_feature(vp
->v_vfsp
, VFSFT_ACEMASKONACCESS
) == 0)
3288 err
= fop_setattr_dispatch(vp
, vap
, flags
, cr
, ct
, true);
3290 VOPSTATS_UPDATE(vp
, setattr
);
3300 caller_context_t
*ct
)
3304 if ((flags
& V_ACE_MASK
) &&
3305 vfs_has_feature(vp
->v_vfsp
, VFSFT_ACEMASKONACCESS
) == 0) {
3309 VOPXID_MAP_CR(vp
, cr
);
3311 err
= fop_access_dispatch(vp
, mode
, flags
, cr
, ct
, true);
3313 VOPSTATS_UPDATE(vp
, access
);
3326 caller_context_t
*ct
,
3327 int *deflags
, /* Returned per-dirent flags */
3328 pathname_t
*ppnp
) /* Returned case-preserved name in directory */
3333 * If this file system doesn't support case-insensitive access
3334 * and said access is requested, fail quickly. It is required
3335 * that if the vfs supports case-insensitive lookup, it also
3336 * supports extended dirent flags.
3338 if (flags
& FIGNORECASE
&&
3339 (vfs_has_feature(dvp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 0 &&
3340 vfs_has_feature(dvp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 0))
3343 VOPXID_MAP_CR(dvp
, cr
);
3345 if ((flags
& LOOKUP_XATTR
) && (flags
& LOOKUP_HAVE_SYSATTR_DIR
) == 0) {
3346 ret
= xattr_dir_lookup(dvp
, vpp
, flags
, cr
);
3348 ret
= fop_lookup_dispatch(dvp
, nm
, vpp
, pnp
, flags
, rdir
, cr
,
3349 ct
, deflags
, ppnp
, true);
3352 if (ret
== 0 && *vpp
) {
3353 VOPSTATS_UPDATE(*vpp
, lookup
);
3354 vn_updatepath(dvp
, *vpp
, nm
);
3370 caller_context_t
*ct
,
3371 vsecattr_t
*vsecp
) /* ACL to set during create */
3375 if (vsecp
!= NULL
&&
3376 vfs_has_feature(dvp
->v_vfsp
, VFSFT_ACLONCREATE
) == 0) {
3380 * If this file system doesn't support case-insensitive access
3381 * and said access is requested, fail quickly.
3383 if (flags
& FIGNORECASE
&&
3384 (vfs_has_feature(dvp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 0 &&
3385 vfs_has_feature(dvp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 0))
3388 VOPXID_MAP_CR(dvp
, cr
);
3390 ret
= fop_create_dispatch(dvp
, name
, vap
, excl
, mode
, vpp
, cr
, flags
,
3393 if (ret
== 0 && *vpp
) {
3394 VOPSTATS_UPDATE(*vpp
, create
);
3395 vn_updatepath(dvp
, *vpp
, name
);
3406 caller_context_t
*ct
,
3412 * If this file system doesn't support case-insensitive access
3413 * and said access is requested, fail quickly.
3415 if (flags
& FIGNORECASE
&&
3416 (vfs_has_feature(dvp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 0 &&
3417 vfs_has_feature(dvp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 0))
3420 VOPXID_MAP_CR(dvp
, cr
);
3422 err
= fop_remove_dispatch(dvp
, nm
, cr
, ct
, flags
, true);
3424 VOPSTATS_UPDATE(dvp
, remove
);
3434 caller_context_t
*ct
,
3440 * If the target file system doesn't support case-insensitive access
3441 * and said access is requested, fail quickly.
3443 if (flags
& FIGNORECASE
&&
3444 (vfs_has_feature(tdvp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 0 &&
3445 vfs_has_feature(tdvp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 0))
3448 VOPXID_MAP_CR(tdvp
, cr
);
3450 err
= fop_link_dispatch(tdvp
, svp
, tnm
, cr
, ct
, flags
, true);
3452 VOPSTATS_UPDATE(tdvp
, link
);
3463 caller_context_t
*ct
,
3469 * If the file system involved does not support
3470 * case-insensitive access and said access is requested, fail
3473 if (flags
& FIGNORECASE
&&
3474 ((vfs_has_feature(sdvp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 0 &&
3475 vfs_has_feature(sdvp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 0)))
3478 VOPXID_MAP_CR(tdvp
, cr
);
3480 err
= fop_rename_dispatch(sdvp
, snm
, tdvp
, tnm
, cr
, ct
, flags
, true);
3482 VOPSTATS_UPDATE(sdvp
, rename
);
3493 caller_context_t
*ct
,
3495 vsecattr_t
*vsecp
) /* ACL to set during create */
3499 if (vsecp
!= NULL
&&
3500 vfs_has_feature(dvp
->v_vfsp
, VFSFT_ACLONCREATE
) == 0) {
3504 * If this file system doesn't support case-insensitive access
3505 * and said access is requested, fail quickly.
3507 if (flags
& FIGNORECASE
&&
3508 (vfs_has_feature(dvp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 0 &&
3509 vfs_has_feature(dvp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 0))
3512 VOPXID_MAP_CR(dvp
, cr
);
3514 ret
= fop_mkdir_dispatch(dvp
, dirname
, vap
, vpp
, cr
, ct
, flags
, vsecp
,
3517 if (ret
== 0 && *vpp
) {
3518 VOPSTATS_UPDATE(*vpp
, mkdir
);
3519 vn_updatepath(dvp
, *vpp
, dirname
);
3531 caller_context_t
*ct
,
3537 * If this file system doesn't support case-insensitive access
3538 * and said access is requested, fail quickly.
3540 if (flags
& FIGNORECASE
&&
3541 (vfs_has_feature(dvp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 0 &&
3542 vfs_has_feature(dvp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 0))
3545 VOPXID_MAP_CR(dvp
, cr
);
3547 err
= fop_rmdir_dispatch(dvp
, nm
, cdir
, cr
, ct
, flags
, true);
3549 VOPSTATS_UPDATE(dvp
, rmdir
);
3559 caller_context_t
*ct
,
3563 ssize_t resid_start
= uiop
->uio_resid
;
3566 * If this file system doesn't support retrieving directory
3567 * entry flags and said access is requested, fail quickly.
3569 if (flags
& V_RDDIR_ENTFLAGS
&&
3570 vfs_has_feature(vp
->v_vfsp
, VFSFT_DIRENTFLAGS
) == 0)
3573 VOPXID_MAP_CR(vp
, cr
);
3575 err
= fop_readdir_dispatch(vp
, uiop
, cr
, eofp
, ct
, flags
, true);
3577 VOPSTATS_UPDATE_IO(vp
, readdir
,
3578 readdir_bytes
, (resid_start
- uiop
->uio_resid
));
3589 caller_context_t
*ct
,
3596 * If this file system doesn't support case-insensitive access
3597 * and said access is requested, fail quickly.
3599 if (flags
& FIGNORECASE
&&
3600 (vfs_has_feature(dvp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 0 &&
3601 vfs_has_feature(dvp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 0))
3604 VOPXID_MAP_CR(dvp
, cr
);
3606 /* check for reparse point */
3607 if ((vfs_has_feature(dvp
->v_vfsp
, VFSFT_REPARSE
)) &&
3608 (strncmp(target
, FS_REPARSE_TAG_STR
,
3609 strlen(FS_REPARSE_TAG_STR
)) == 0)) {
3610 if (!fs_reparse_mark(target
, vap
, &xvattr
))
3611 vap
= (vattr_t
*)&xvattr
;
3614 err
= fop_symlink_dispatch(dvp
, linkname
, vap
, target
, cr
, ct
, flags
,
3617 VOPSTATS_UPDATE(dvp
, symlink
);
3626 caller_context_t
*ct
)
3630 VOPXID_MAP_CR(vp
, cr
);
3632 err
= fop_readlink_dispatch(vp
, uiop
, cr
, ct
, true);
3634 VOPSTATS_UPDATE(vp
, readlink
);
3643 caller_context_t
*ct
)
3647 VOPXID_MAP_CR(vp
, cr
);
3649 err
= fop_fsync_dispatch(vp
, syncflag
, cr
, ct
, true);
3651 VOPSTATS_UPDATE(vp
, fsync
);
3659 caller_context_t
*ct
)
3661 /* Need to update stats before vop call since we may lose the vnode */
3662 VOPSTATS_UPDATE(vp
, inactive
);
3664 VOPXID_MAP_CR(vp
, cr
);
3666 fop_inactive_dispatch(vp
, cr
, ct
, true);
3673 caller_context_t
*ct
)
3677 err
= fop_fid_dispatch(vp
, fidp
, ct
, true);
3679 VOPSTATS_UPDATE(vp
, fid
);
3687 caller_context_t
*ct
)
3691 ret
= fop_rwlock_dispatch(vp
, write_lock
, ct
, true);
3693 VOPSTATS_UPDATE(vp
, rwlock
);
3701 caller_context_t
*ct
)
3703 fop_rwunlock_dispatch(vp
, write_lock
, ct
, true);
3705 VOPSTATS_UPDATE(vp
, rwunlock
);
3713 caller_context_t
*ct
)
3717 err
= fop_seek_dispatch(vp
, ooff
, noffp
, ct
, true);
3719 VOPSTATS_UPDATE(vp
, seek
);
3727 caller_context_t
*ct
)
3731 err
= fop_cmp_dispatch(vp1
, vp2
, ct
, true);
3733 VOPSTATS_UPDATE(vp1
, cmp
);
3744 struct flk_callback
*flk_cbp
,
3746 caller_context_t
*ct
)
3750 VOPXID_MAP_CR(vp
, cr
);
3752 err
= fop_frlock_dispatch(vp
, cmd
, bfp
, flag
, offset
, flk_cbp
, cr
,
3755 VOPSTATS_UPDATE(vp
, frlock
);
3767 caller_context_t
*ct
)
3771 VOPXID_MAP_CR(vp
, cr
);
3773 err
= fop_space_dispatch(vp
, cmd
, bfp
, flag
, offset
, cr
, ct
, true);
3775 VOPSTATS_UPDATE(vp
, space
);
3783 caller_context_t
*ct
)
3787 err
= fop_realvp_dispatch(vp
, vpp
, ct
, true);
3789 VOPSTATS_UPDATE(vp
, realvp
);
3805 caller_context_t
*ct
)
3809 VOPXID_MAP_CR(vp
, cr
);
3811 err
= fop_getpage_dispatch(vp
, off
, len
, protp
, plarr
, plsz
, seg
,
3812 addr
, rw
, cr
, ct
, true);
3814 VOPSTATS_UPDATE(vp
, getpage
);
3825 caller_context_t
*ct
)
3829 VOPXID_MAP_CR(vp
, cr
);
3831 err
= fop_putpage_dispatch(vp
, off
, len
, flags
, cr
, ct
, true);
3833 VOPSTATS_UPDATE(vp
, putpage
);
3848 caller_context_t
*ct
)
3852 VOPXID_MAP_CR(vp
, cr
);
3854 err
= fop_map_dispatch(vp
, off
, as
, addrp
, len
, prot
, maxprot
,
3855 flags
, cr
, ct
, true);
3857 VOPSTATS_UPDATE(vp
, map
);
3872 caller_context_t
*ct
)
3877 VOPXID_MAP_CR(vp
, cr
);
3879 error
= fop_addmap_dispatch(vp
, off
, as
, addr
, len
, prot
, maxprot
,
3880 flags
, cr
, ct
, true);
3882 if ((!error
) && (vp
->v_type
== VREG
)) {
3883 delta
= (u_longlong_t
)btopr(len
);
3885 * If file is declared MAP_PRIVATE, it can't be written back
3886 * even if open for write. Handle as read.
3888 if (flags
& MAP_PRIVATE
) {
3889 atomic_add_64((uint64_t *)(&(vp
->v_mmap_read
)),
3893 * atomic_add_64 forces the fetch of a 64 bit value to
3894 * be atomic on 32 bit machines
3896 if (maxprot
& PROT_WRITE
)
3897 atomic_add_64((uint64_t *)(&(vp
->v_mmap_write
)),
3899 if (maxprot
& PROT_READ
)
3900 atomic_add_64((uint64_t *)(&(vp
->v_mmap_read
)),
3902 if (maxprot
& PROT_EXEC
)
3903 atomic_add_64((uint64_t *)(&(vp
->v_mmap_read
)),
3907 VOPSTATS_UPDATE(vp
, addmap
);
3922 caller_context_t
*ct
)
3927 VOPXID_MAP_CR(vp
, cr
);
3929 error
= fop_delmap_dispatch(vp
, off
, as
, addr
, len
, prot
, maxprot
,
3930 flags
, cr
, ct
, true);
3933 * NFS calls into delmap twice, the first time
3934 * it simply establishes a callback mechanism and returns EAGAIN
3935 * while the real work is being done upon the second invocation.
3936 * We have to detect this here and only decrement the counts upon
3937 * the second delmap request.
3939 if ((error
!= EAGAIN
) && (vp
->v_type
== VREG
)) {
3941 delta
= (u_longlong_t
)btopr(len
);
3943 if (flags
& MAP_PRIVATE
) {
3944 atomic_add_64((uint64_t *)(&(vp
->v_mmap_read
)),
3948 * atomic_add_64 forces the fetch of a 64 bit value
3949 * to be atomic on 32 bit machines
3951 if (maxprot
& PROT_WRITE
)
3952 atomic_add_64((uint64_t *)(&(vp
->v_mmap_write
)),
3954 if (maxprot
& PROT_READ
)
3955 atomic_add_64((uint64_t *)(&(vp
->v_mmap_read
)),
3957 if (maxprot
& PROT_EXEC
)
3958 atomic_add_64((uint64_t *)(&(vp
->v_mmap_read
)),
3962 VOPSTATS_UPDATE(vp
, delmap
);
3973 struct pollhead
**phpp
,
3974 caller_context_t
*ct
)
3978 err
= fop_poll_dispatch(vp
, events
, anyyet
, reventsp
, phpp
, ct
, true);
3980 VOPSTATS_UPDATE(vp
, poll
);
3990 caller_context_t
*ct
)
3994 /* ensure lbdn and dblks can be passed safely to bdev_dump */
3995 if ((lbdn
!= (daddr_t
)lbdn
) || (dblks
!= (int)dblks
))
3998 err
= fop_dump_dispatch(vp
, addr
, lbdn
, dblks
, ct
, true);
4000 VOPSTATS_UPDATE(vp
, dump
);
4010 caller_context_t
*ct
)
4014 VOPXID_MAP_CR(vp
, cr
);
4016 err
= fop_pathconf_dispatch(vp
, cmd
, valp
, cr
, ct
, true);
4018 VOPSTATS_UPDATE(vp
, pathconf
);
4030 caller_context_t
*ct
)
4034 VOPXID_MAP_CR(vp
, cr
);
4036 err
= fop_pageio_dispatch(vp
, pp
, io_off
, io_len
, flags
, cr
, ct
, true);
4038 VOPSTATS_UPDATE(vp
, pageio
);
4047 caller_context_t
*ct
)
4051 err
= fop_dumpctl_dispatch(vp
, action
, blkp
, ct
, true);
4053 VOPSTATS_UPDATE(vp
, dumpctl
);
4064 caller_context_t
*ct
)
4066 /* Must do stats first since it's possible to lose the vnode */
4067 VOPSTATS_UPDATE(vp
, dispose
);
4069 VOPXID_MAP_CR(vp
, cr
);
4071 fop_dispose_dispatch(vp
, pp
, flag
, dn
, cr
, ct
, true);
4080 caller_context_t
*ct
)
4084 VOPXID_MAP_CR(vp
, cr
);
4087 * We're only allowed to skip the ACL check iff we used a 32 bit
4088 * ACE mask with fop_access() to determine permissions.
4090 if ((flag
& ATTR_NOACLCHECK
) &&
4091 vfs_has_feature(vp
->v_vfsp
, VFSFT_ACEMASKONACCESS
) == 0) {
4095 err
= fop_setsecattr_dispatch(vp
, vsap
, flag
, cr
, ct
, true);
4097 VOPSTATS_UPDATE(vp
, setsecattr
);
4107 caller_context_t
*ct
)
4112 * We're only allowed to skip the ACL check iff we used a 32 bit
4113 * ACE mask with fop_access() to determine permissions.
4115 if ((flag
& ATTR_NOACLCHECK
) &&
4116 vfs_has_feature(vp
->v_vfsp
, VFSFT_ACEMASKONACCESS
) == 0) {
4120 VOPXID_MAP_CR(vp
, cr
);
4122 err
= fop_getsecattr_dispatch(vp
, vsap
, flag
, cr
, ct
, true);
4124 VOPSTATS_UPDATE(vp
, getsecattr
);
4132 struct shrlock
*shr
,
4135 caller_context_t
*ct
)
4139 VOPXID_MAP_CR(vp
, cr
);
4141 err
= fop_shrlock_dispatch(vp
, cmd
, shr
, flag
, cr
, ct
, true);
4143 VOPSTATS_UPDATE(vp
, shrlock
);
4148 fop_vnevent(vnode_t
*vp
, vnevent_t vnevent
, vnode_t
*dvp
, char *fnm
,
4149 caller_context_t
*ct
)
4153 err
= fop_vnevent_dispatch(vp
, vnevent
, dvp
, fnm
, ct
, true);
4155 VOPSTATS_UPDATE(vp
, vnevent
);
4160 fop_reqzcbuf(vnode_t
*vp
, enum uio_rw ioflag
, xuio_t
*uiop
, cred_t
*cr
,
4161 caller_context_t
*ct
)
4165 if (vfs_has_feature(vp
->v_vfsp
, VFSFT_ZEROCOPY_SUPPORTED
) == 0)
4168 err
= fop_reqzcbuf_dispatch(vp
, ioflag
, uiop
, cr
, ct
, true);
4170 VOPSTATS_UPDATE(vp
, reqzcbuf
);
4175 fop_retzcbuf(vnode_t
*vp
, xuio_t
*uiop
, cred_t
*cr
, caller_context_t
*ct
)
4179 if (vfs_has_feature(vp
->v_vfsp
, VFSFT_ZEROCOPY_SUPPORTED
) == 0)
4182 err
= fop_retzcbuf_dispatch(vp
, uiop
, cr
, ct
, true);
4184 VOPSTATS_UPDATE(vp
, retzcbuf
);
/*
 * Default destructor
 *	Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
	/* deliberately empty: a non-NULL destructor marks the key in use */
}
4198 * Create a key (index into per vnode array)
4199 * Locks out vsd_create, vsd_destroy, and vsd_free
4200 * May allocate memory with lock held
4203 vsd_create(uint_t
*keyp
, void (*destructor
)(void *))
4209 * if key is allocated, do nothing
4211 mutex_enter(&vsd_lock
);
4213 mutex_exit(&vsd_lock
);
4217 * find an unused key
4219 if (destructor
== NULL
)
4220 destructor
= vsd_defaultdestructor
;
4222 for (i
= 0; i
< vsd_nkeys
; ++i
)
4223 if (vsd_destructor
[i
] == NULL
)
4227 * if no unused keys, increase the size of the destructor array
4229 if (i
== vsd_nkeys
) {
4230 if ((nkeys
= (vsd_nkeys
<< 1)) == 0)
4233 (void (**)(void *))vsd_realloc((void *)vsd_destructor
,
4234 (size_t)(vsd_nkeys
* sizeof (void (*)(void *))),
4235 (size_t)(nkeys
* sizeof (void (*)(void *))));
4240 * allocate the next available unused key
4242 vsd_destructor
[i
] = destructor
;
4245 /* create vsd_list, if it doesn't exist */
4246 if (vsd_list
== NULL
) {
4247 vsd_list
= kmem_alloc(sizeof (list_t
), KM_SLEEP
);
4248 list_create(vsd_list
, sizeof (struct vsd_node
),
4249 offsetof(struct vsd_node
, vs_nodes
));
4252 mutex_exit(&vsd_lock
);
4258 * Assumes that the caller is preventing vsd_set and vsd_get
4259 * Locks out vsd_create, vsd_destroy, and vsd_free
4260 * May free memory with lock held
4263 vsd_destroy(uint_t
*keyp
)
4266 struct vsd_node
*vsd
;
4269 * protect the key namespace and our destructor lists
4271 mutex_enter(&vsd_lock
);
4275 ASSERT(key
<= vsd_nkeys
);
4278 * if the key is valid
4283 * for every vnode with VSD, call key's destructor
4285 for (vsd
= list_head(vsd_list
); vsd
!= NULL
;
4286 vsd
= list_next(vsd_list
, vsd
)) {
4288 * no VSD for key in this vnode
4290 if (key
> vsd
->vs_nkeys
)
4293 * call destructor for key
4295 if (vsd
->vs_value
[k
] && vsd_destructor
[k
])
4296 (*vsd_destructor
[k
])(vsd
->vs_value
[k
]);
4298 * reset value for key
4300 vsd
->vs_value
[k
] = NULL
;
4303 * actually free the key (NULL destructor == unused)
4305 vsd_destructor
[k
] = NULL
;
4308 mutex_exit(&vsd_lock
);
4312 * Quickly return the per vnode value that was stored with the specified key
4313 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4314 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4317 vsd_get(vnode_t
*vp
, uint_t key
)
4319 struct vsd_node
*vsd
;
4322 ASSERT(mutex_owned(&vp
->v_vsd_lock
));
4326 if (key
&& vsd
!= NULL
&& key
<= vsd
->vs_nkeys
)
4327 return (vsd
->vs_value
[key
- 1]);
4332 * Set a per vnode value indexed with the specified key
4333 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4336 vsd_set(vnode_t
*vp
, uint_t key
, void *value
)
4338 struct vsd_node
*vsd
;
4341 ASSERT(mutex_owned(&vp
->v_vsd_lock
));
4348 vsd
= vp
->v_vsd
= kmem_zalloc(sizeof (*vsd
), KM_SLEEP
);
4351 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4352 * code won't happen and we will continue down and allocate space for
4353 * the vs_value array.
4354 * If the caller is replacing one value with another, then it is up
4355 * to the caller to free/rele/destroy the previous value (if needed).
4357 if (key
<= vsd
->vs_nkeys
) {
4358 vsd
->vs_value
[key
- 1] = value
;
4362 ASSERT(key
<= vsd_nkeys
);
4364 if (vsd
->vs_nkeys
== 0) {
4365 mutex_enter(&vsd_lock
); /* lock out vsd_destroy() */
4367 * Link onto list of all VSD nodes.
4369 list_insert_head(vsd_list
, vsd
);
4370 mutex_exit(&vsd_lock
);
4374 * Allocate vnode local storage and set the value for key
4376 vsd
->vs_value
= vsd_realloc(vsd
->vs_value
,
4377 vsd
->vs_nkeys
* sizeof (void *),
4378 key
* sizeof (void *));
4379 vsd
->vs_nkeys
= key
;
4380 vsd
->vs_value
[key
- 1] = value
;
4386 * Called from vn_free() to run the destructor function for each vsd
4387 * Locks out vsd_create and vsd_destroy
4388 * Assumes that the destructor *DOES NOT* use vsd
4391 vsd_free(vnode_t
*vp
)
4394 struct vsd_node
*vsd
= vp
->v_vsd
;
4399 if (vsd
->vs_nkeys
== 0) {
4400 kmem_free(vsd
, sizeof (*vsd
));
4406 * lock out vsd_create and vsd_destroy, call
4407 * the destructor, and mark the value as destroyed.
4409 mutex_enter(&vsd_lock
);
4411 for (i
= 0; i
< vsd
->vs_nkeys
; i
++) {
4412 if (vsd
->vs_value
[i
] && vsd_destructor
[i
])
4413 (*vsd_destructor
[i
])(vsd
->vs_value
[i
]);
4414 vsd
->vs_value
[i
] = NULL
;
4418 * remove from linked list of VSD nodes
4420 list_remove(vsd_list
, vsd
);
4422 mutex_exit(&vsd_lock
);
4427 kmem_free(vsd
->vs_value
, vsd
->vs_nkeys
* sizeof (void *));
4428 kmem_free(vsd
, sizeof (struct vsd_node
));
4436 vsd_realloc(void *old
, size_t osize
, size_t nsize
)
4440 new = kmem_zalloc(nsize
, KM_SLEEP
);
4442 bcopy(old
, new, osize
);
4443 kmem_free(old
, osize
);
4449 * Setup the extensible system attribute for creating a reparse point.
4450 * The symlink data 'target' is validated for proper format of a reparse
4451 * string and a check also made to make sure the symlink data does not
4452 * point to an existing file.
4454 * return 0 if ok else -1.
4457 fs_reparse_mark(char *target
, vattr_t
*vap
, xvattr_t
*xvattr
)
4461 if ((!target
) || (!vap
) || (!xvattr
))
4464 /* validate reparse string */
4465 if (reparse_validate((const char *)target
))
4469 xvattr
->xva_vattr
= *vap
;
4470 xvattr
->xva_vattr
.va_mask
|= VATTR_XVATTR
;
4471 xoap
= xva_getxoptattr(xvattr
);
4473 XVA_SET_REQ(xvattr
, XAT_REPARSE
);
4474 xoap
->xoa_reparse
= 1;
4480 * Function to check whether a symlink is a reparse point.
4481 * Return B_TRUE if it is a reparse point, else return B_FALSE
4484 vn_is_reparse(vnode_t
*vp
, cred_t
*cr
, caller_context_t
*ct
)
4489 if ((vp
->v_type
!= VLNK
) ||
4490 !(vfs_has_feature(vp
->v_vfsp
, VFSFT_XVATTR
)))
4494 xoap
= xva_getxoptattr(&xvattr
);
4496 XVA_SET_REQ(&xvattr
, XAT_REPARSE
);
4498 if (fop_getattr(vp
, &xvattr
.xva_vattr
, 0, cr
, ct
))
4501 if ((!(xvattr
.xva_vattr
.va_mask
& VATTR_XVATTR
)) ||
4502 (!(XVA_ISSET_RTN(&xvattr
, XAT_REPARSE
))))
4505 return (xoap
->xoa_reparse
? B_TRUE
: B_FALSE
);