4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
21 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
22 /* All Rights Reserved */
26 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 * Copyright 2015 Joyent, Inc.
32 * Generic vnode operations.
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/errno.h>
38 #include <sys/fcntl.h>
39 #include <sys/flock.h>
40 #include <sys/statvfs.h>
42 #include <sys/vnode.h>
45 #include <sys/unistd.h>
48 #include <sys/debug.h>
49 #include <sys/cmn_err.h>
50 #include <sys/stream.h>
51 #include <sys/fs_subr.h>
52 #include <sys/fs_reparse.h>
55 #include <sys/share.h>
59 #include <sys/nbmlock.h>
60 #include <acl/acl_common.h>
61 #include <sys/pathname.h>
63 static callb_cpr_t
*frlock_serialize_blocked(flk_cb_when_t
, void *);
66 * Tunable to limit the number of retries when recovering from an ESTALE error.
68 int fs_estale_retry
= 5;
71 * Support for reparse point door upcalls.
73 static door_handle_t reparsed_door
;
74 static kmutex_t reparsed_door_lock
;
77 * The associated operation is not supported by the file system.
86 * The associated operation is invalid (on this vnode).
95 * The associated operation is valid only for directories.
104 * Free the file system specific resources. For the file systems that
105 * do not support the forced unmount, it will be a nop function.
110 fs_freevfs(vfs_t
*vfsp
)
116 fs_nosys_map(struct vnode
*vp
, offset_t off
, struct as
*as
, caddr_t
*addrp
,
117 size_t len
, uchar_t prot
, uchar_t maxprot
, uint_t flags
, struct cred
*cr
,
118 caller_context_t
*ct
)
125 fs_nosys_addmap(struct vnode
*vp
, offset_t off
, struct as
*as
, caddr_t addr
,
126 size_t len
, uchar_t prot
, uchar_t maxprot
, uint_t flags
, struct cred
*cr
,
127 caller_context_t
*ct
)
134 fs_nosys_poll(vnode_t
*vp
, short events
, int anyyet
, short *reventsp
,
135 struct pollhead
**phpp
, caller_context_t
*ct
)
142 * The file system has nothing to sync to disk. However, the
143 * VFS_SYNC operation must not fail.
147 fs_sync(struct vfs
*vfspp
, short flag
, cred_t
*cr
)
153 * Does nothing, but fop_fsync must not fail.
157 fs_fsync(vnode_t
*vp
, int syncflag
, cred_t
*cr
, caller_context_t
*ct
)
163 * Does nothing, but fop_putpage must not fail.
167 fs_putpage(vnode_t
*vp
, offset_t off
, size_t len
, int flags
, cred_t
*cr
,
168 caller_context_t
*ctp
)
174 * Does nothing, but fop_ioctl must not fail.
178 fs_ioctl(vnode_t
*vp
, int com
, intptr_t data
, int flag
, cred_t
*cred
,
185 * Read/write lock/unlock. Does nothing.
189 fs_rwlock(vnode_t
*vp
, int write_lock
, caller_context_t
*ctp
)
196 fs_rwunlock(vnode_t
*vp
, int write_lock
, caller_context_t
*ctp
)
201 * Compare two vnodes.
205 fs_cmp(vnode_t
*vp1
, vnode_t
*vp2
, caller_context_t
*ct
)
211 * No-op seek operation.
215 fs_seek(vnode_t
*vp
, offset_t ooff
, offset_t
*noffp
, caller_context_t
*ct
)
217 return ((*noffp
< 0 || *noffp
> MAXOFFSET_T
) ? EINVAL
: 0);
221 * File and record locking.
225 fs_frlock(vnode_t
*vp
, int cmd
, struct flock64
*bfp
, int flag
, offset_t offset
,
226 flk_callback_t
*flk_cbp
, cred_t
*cr
, caller_context_t
*ct
)
230 boolean_t skip_lock
= B_FALSE
;
231 flk_callback_t serialize_callback
;
239 if (flag
& F_REMOTELOCK
) {
243 bfp
->l_pid
= ttoproc(curthread
)->p_pid
;
250 * TBD we do not support remote OFD locks at this time.
252 if (flag
& F_REMOTELOCK
) {
261 * Are NBMAND locks allowed on this file?
264 !(vp
->v_vfsp
->vfs_flag
& VFS_NBMAND
)) {
268 if (vp
->v_type
!= VREG
) {
275 if (flag
& F_REMOTELOCK
) {
276 frcmd
= SETFLCK
|RCMDLCK
;
279 bfp
->l_pid
= ttoproc(curthread
)->p_pid
;
282 if (cmd
== F_SETLK_NBMAND
&&
283 (bfp
->l_type
== F_RDLCK
|| bfp
->l_type
== F_WRLCK
)) {
287 if (nbl_need_check(vp
)) {
288 nbl_start_crit(vp
, RW_WRITER
);
290 if (frcmd
& NBMLCK
) {
291 mode
= (bfp
->l_type
== F_RDLCK
) ?
293 if (vn_is_mapped(vp
, mode
)) {
302 if (flag
& F_REMOTELOCK
) {
303 frcmd
= SETFLCK
|SLPFLCK
|RCMDLCK
;
305 frcmd
= SETFLCK
|SLPFLCK
;
306 bfp
->l_pid
= ttoproc(curthread
)->p_pid
;
310 if (nbl_need_check(vp
)) {
311 nbl_start_crit(vp
, RW_WRITER
);
321 * TBD we do not support remote OFD locks at this time.
323 if (flag
& F_REMOTELOCK
) {
330 case F_HASREMOTELOCKS
:
331 l_has_rmt(bfp
) = flk_has_remote_locks(vp
);
340 * If this is a blocking lock request and we're serializing lock
341 * requests, modify the callback list to leave the critical region
342 * while we're waiting for the lock.
345 if (serialize
&& (frcmd
& SLPFLCK
) != 0) {
346 flk_add_callback(&serialize_callback
,
347 frlock_serialize_blocked
, vp
, flk_cbp
);
348 flk_cbp
= &serialize_callback
;
352 error
= reclock(vp
, bfp
, frcmd
, flag
, offset
, flk_cbp
);
354 if (serialize
&& (frcmd
& SLPFLCK
) != 0)
355 flk_del_callback(&serialize_callback
);
365 * Callback invoked when a lock request blocks while we are serializing
366 * requests. When called before sleeping, leave the critical region; when
367 * called after wakeup, reenter the critical region.
371 frlock_serialize_blocked(flk_cb_when_t when
, void *infop
)
373 vnode_t
*vp
= (vnode_t
*)infop
;
375 if (when
== FLK_BEFORE_SLEEP
)
378 nbl_start_crit(vp
, RW_WRITER
);
389 fs_setfl(vnode_t
*vp
, int oflags
, int nflags
, cred_t
*cr
, caller_context_t
*ct
)
395 * Return the answer requested to poll() for non-device files.
396 * POLLIN, POLLRDNORM, POLLRDBAND, POLLOUT, and POLLWRBAND are recognized.
398 struct pollhead fs_pollhd
;
402 fs_poll(vnode_t
*vp
, short events
, int anyyet
, short *reventsp
,
403 struct pollhead
**phpp
, caller_context_t
*ct
)
408 if (events
& POLLRDNORM
)
409 *reventsp
|= POLLRDNORM
;
410 if (events
& POLLRDBAND
)
411 *reventsp
|= POLLRDBAND
;
412 if (events
& POLLOUT
)
413 *reventsp
|= POLLOUT
;
414 if (events
& POLLWRBAND
)
415 *reventsp
|= POLLWRBAND
;
416 *phpp
= !anyyet
&& !*reventsp
? &fs_pollhd
: NULL
;
421 * POSIX pathconf() support.
425 fs_pathconf(vnode_t
*vp
, int cmd
, ulong_t
*valp
, cred_t
*cr
,
426 caller_context_t
*ct
)
430 struct statvfs64 vfsbuf
;
447 bzero(&vfsbuf
, sizeof (vfsbuf
));
448 if (error
= VFS_STATVFS(vp
->v_vfsp
, &vfsbuf
))
450 val
= vfsbuf
.f_namemax
;
454 case _PC_SYMLINK_MAX
:
463 if (vp
->v_vfsp
->vfs_flag
& VFS_NOTRUNC
)
464 val
= 1; /* NOTRUNC is enabled for vp */
470 val
= _POSIX_VDISABLE
;
473 case _PC_CHOWN_RESTRICTED
:
475 val
= rstchown
; /* chown restricted enabled */
480 case _PC_FILESIZEBITS
:
483 * If we ever get here, the underlying file system does not
484 * recognise the command, and therefore this configurable
485 * limit cannot be determined. We return -1 and don't
486 * change errno.
489 val
= (ulong_t
)-1; /* large file support */
492 case _PC_ACL_ENABLED
:
496 case _PC_CASE_BEHAVIOR
:
497 val
= _CASE_SENSITIVE
;
498 if (vfs_has_feature(vp
->v_vfsp
, VFSFT_CASEINSENSITIVE
) == 1)
499 val
|= _CASE_INSENSITIVE
;
500 if (vfs_has_feature(vp
->v_vfsp
, VFSFT_NOCASESENSITIVE
) == 1)
501 val
&= ~_CASE_SENSITIVE
;
504 case _PC_SATTR_ENABLED
:
505 case _PC_SATTR_EXISTS
:
509 case _PC_ACCESS_FILTERING
:
528 fs_dispose(struct vnode
*vp
, page_t
*pp
, int fl
, int dn
, struct cred
*cr
,
529 caller_context_t
*ct
)
532 ASSERT(fl
== B_FREE
|| fl
== B_INVAL
);
537 page_destroy(pp
, dn
);
542 fs_nodispose(struct vnode
*vp
, page_t
*pp
, int fl
, int dn
, struct cred
*cr
,
543 caller_context_t
*ct
)
545 cmn_err(CE_PANIC
, "fs_nodispose invoked");
549 * Fabricate ACLs for file systems that do not support ACLs.
553 fs_fab_acl(vnode_t
*vp
, vsecattr_t
*vsecattr
, int flag
, cred_t
*cr
,
554 caller_context_t
*ct
)
561 vsecattr
->vsa_aclcnt
= 0;
562 vsecattr
->vsa_aclentsz
= 0;
563 vsecattr
->vsa_aclentp
= NULL
;
564 vsecattr
->vsa_dfaclcnt
= 0; /* Default ACLs are not fabricated */
565 vsecattr
->vsa_dfaclentp
= NULL
;
567 vattr
.va_mask
= AT_MODE
| AT_UID
| AT_GID
;
568 if (error
= fop_getattr(vp
, &vattr
, 0, cr
, ct
))
571 if (vsecattr
->vsa_mask
& (VSA_ACLCNT
| VSA_ACL
)) {
572 aclsize
= 4 * sizeof (aclent_t
);
573 vsecattr
->vsa_aclcnt
= 4; /* USER, GROUP, OTHER, and CLASS */
574 vsecattr
->vsa_aclentp
= kmem_zalloc(aclsize
, KM_SLEEP
);
575 aclentp
= vsecattr
->vsa_aclentp
;
577 aclentp
->a_type
= USER_OBJ
; /* Owner */
578 aclentp
->a_perm
= ((ushort_t
)(vattr
.va_mode
& 0700)) >> 6;
579 aclentp
->a_id
= vattr
.va_uid
; /* Really undefined */
582 aclentp
->a_type
= GROUP_OBJ
; /* Group */
583 aclentp
->a_perm
= ((ushort_t
)(vattr
.va_mode
& 0070)) >> 3;
584 aclentp
->a_id
= vattr
.va_gid
; /* Really undefined */
587 aclentp
->a_type
= OTHER_OBJ
; /* Other */
588 aclentp
->a_perm
= vattr
.va_mode
& 0007;
589 aclentp
->a_id
= (gid_t
)-1; /* Really undefined */
592 aclentp
->a_type
= CLASS_OBJ
; /* Class */
593 aclentp
->a_perm
= (ushort_t
)(0007);
594 aclentp
->a_id
= (gid_t
)-1; /* Really undefined */
595 } else if (vsecattr
->vsa_mask
& (VSA_ACECNT
| VSA_ACE
)) {
596 VERIFY(0 == acl_trivial_create(vattr
.va_mode
,
597 (vp
->v_type
== VDIR
), (ace_t
**)&vsecattr
->vsa_aclentp
,
598 &vsecattr
->vsa_aclcnt
));
599 vsecattr
->vsa_aclentsz
= vsecattr
->vsa_aclcnt
* sizeof (ace_t
);
606 * Common code for implementing DOS share reservations
610 fs_shrlock(struct vnode
*vp
, int cmd
, struct shrlock
*shr
, int flag
, cred_t
*cr
,
611 caller_context_t
*ct
)
616 * Make sure that the file was opened with permissions appropriate
617 * for the request, and make sure the caller isn't trying to sneak
618 * in an NBMAND request.
620 if (cmd
== F_SHARE
) {
621 if (((shr
->s_access
& F_RDACC
) && (flag
& FREAD
) == 0) ||
622 ((shr
->s_access
& F_WRACC
) && (flag
& FWRITE
) == 0))
624 if (shr
->s_access
& (F_RMACC
| F_MDACC
))
626 if (shr
->s_deny
& (F_MANDDNY
| F_RMDNY
))
629 if (cmd
== F_SHARE_NBMAND
) {
630 /* make sure nbmand is allowed on the file */
632 !(vp
->v_vfsp
->vfs_flag
& VFS_NBMAND
)) {
635 if (vp
->v_type
!= VREG
) {
640 nbl_start_crit(vp
, RW_WRITER
);
645 shr
->s_deny
|= F_MANDDNY
;
648 error
= add_share(vp
, shr
);
652 error
= del_share(vp
, shr
);
655 case F_HASREMOTELOCKS
:
657 * We are overloading this command to refer to remote
658 * shares as well as remote locks, despite its name.
660 shr
->s_access
= shr_has_remote_shares(vp
, shr
->s_sysid
);
675 fs_vnevent_nosupport(vnode_t
*vp
, vnevent_t e
, vnode_t
*dvp
, char *fnm
,
676 caller_context_t
*ct
)
684 fs_vnevent_support(vnode_t
*vp
, vnevent_t e
, vnode_t
*dvp
, char *fnm
,
685 caller_context_t
*ct
)
692 * return 1 for non-trivial ACL.
694 * NB: It is not necessary for the caller to fop_rwlock since
695 * we only issue fop_getsecattr.
697 * Returns 0 == trivial
699 * <0 could not determine.
702 fs_acl_nontrivial(vnode_t
*vp
, cred_t
*cr
)
710 /* determine the forms of ACLs maintained */
711 error
= fop_pathconf(vp
, _PC_ACL_ENABLED
, &acl_styles
, cr
, NULL
);
713 /* clear bits we don't understand and establish default acl_style */
714 acl_styles
&= (_ACL_ACLENT_ENABLED
| _ACL_ACE_ENABLED
);
715 if (error
|| (acl_styles
== 0))
716 acl_styles
= _ACL_ACLENT_ENABLED
;
718 vsecattr
.vsa_aclentp
= NULL
;
719 vsecattr
.vsa_dfaclentp
= NULL
;
720 vsecattr
.vsa_aclcnt
= 0;
721 vsecattr
.vsa_dfaclcnt
= 0;
724 /* select one of the styles as current flavor */
726 if (acl_styles
& _ACL_ACLENT_ENABLED
) {
727 acl_flavor
= _ACL_ACLENT_ENABLED
;
728 vsecattr
.vsa_mask
= VSA_ACLCNT
| VSA_DFACLCNT
;
729 } else if (acl_styles
& _ACL_ACE_ENABLED
) {
730 acl_flavor
= _ACL_ACE_ENABLED
;
731 vsecattr
.vsa_mask
= VSA_ACECNT
| VSA_ACE
;
734 ASSERT(vsecattr
.vsa_mask
&& acl_flavor
);
735 error
= fop_getsecattr(vp
, &vsecattr
, 0, cr
, NULL
);
739 /* that flavor failed */
740 acl_styles
&= ~acl_flavor
;
743 /* if all styles fail then assume trivial */
747 /* process the flavor that worked */
749 if (acl_flavor
& _ACL_ACLENT_ENABLED
) {
750 if (vsecattr
.vsa_aclcnt
> MIN_ACL_ENTRIES
)
752 if (vsecattr
.vsa_aclcnt
&& vsecattr
.vsa_aclentp
!= NULL
)
753 kmem_free(vsecattr
.vsa_aclentp
,
754 vsecattr
.vsa_aclcnt
* sizeof (aclent_t
));
755 if (vsecattr
.vsa_dfaclcnt
&& vsecattr
.vsa_dfaclentp
!= NULL
)
756 kmem_free(vsecattr
.vsa_dfaclentp
,
757 vsecattr
.vsa_dfaclcnt
* sizeof (aclent_t
));
759 if (acl_flavor
& _ACL_ACE_ENABLED
) {
760 isnontrivial
= ace_trivial(vsecattr
.vsa_aclentp
,
761 vsecattr
.vsa_aclcnt
);
763 if (vsecattr
.vsa_aclcnt
&& vsecattr
.vsa_aclentp
!= NULL
)
764 kmem_free(vsecattr
.vsa_aclentp
,
765 vsecattr
.vsa_aclcnt
* sizeof (ace_t
));
766 /* ACE has no vsecattr.vsa_dfaclcnt */
768 return (isnontrivial
);
772 * Check whether we need a retry to recover from an ESTALE error.
775 fs_need_estale_retry(int retry_count
)
777 if (retry_count
< fs_estale_retry
)
784 static int (*fs_av_scan
)(vnode_t
*, cred_t
*, int) = NULL
;
787 * Routine for anti-virus scanner to call to register its scanning routine.
790 fs_vscan_register(int (*av_scan
)(vnode_t
*, cred_t
*, int))
792 fs_av_scan
= av_scan
;
796 * Routine for file systems to call to initiate anti-virus scanning.
797 * Scanning will only be done on REGular files (currently).
800 fs_vscan(vnode_t
*vp
, cred_t
*cr
, int async
)
804 if (fs_av_scan
&& vp
->v_type
== VREG
)
805 ret
= (*fs_av_scan
)(vp
, cr
, async
);
811 * Support functions for reparse points.
814 * reparse_vnode_parse
816 * Read the symlink data of a reparse point specified by the vnode
817 * and return the reparse data as name-value pair in the nvlist.
820 reparse_vnode_parse(vnode_t
*vp
, nvlist_t
*nvl
)
827 if (vp
== NULL
|| nvl
== NULL
)
830 lkdata
= kmem_alloc(MAXREPARSELEN
, KM_SLEEP
);
833 * Set up io vector to read sym link data
835 iov
.iov_base
= lkdata
;
836 iov
.iov_len
= MAXREPARSELEN
;
839 uio
.uio_segflg
= UIO_SYSSPACE
;
840 uio
.uio_extflg
= UIO_COPY_CACHED
;
841 uio
.uio_loffset
= (offset_t
)0;
842 uio
.uio_resid
= MAXREPARSELEN
;
844 if ((err
= fop_readlink(vp
, &uio
, kcred
, NULL
)) == 0) {
845 *(lkdata
+ MAXREPARSELEN
- uio
.uio_resid
) = '\0';
846 err
= reparse_parse(lkdata
, nvl
);
848 kmem_free(lkdata
, MAXREPARSELEN
); /* done with lkdata */
856 mutex_init(&reparsed_door_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
860 reparse_door_get_handle()
864 mutex_enter(&reparsed_door_lock
);
865 if ((dh
= reparsed_door
) == NULL
) {
866 if (door_ki_open(REPARSED_DOOR
, &reparsed_door
) != 0) {
867 reparsed_door
= NULL
;
872 mutex_exit(&reparsed_door_lock
);
877 reparse_door_reset_handle()
879 mutex_enter(&reparsed_door_lock
);
880 reparsed_door
= NULL
;
881 mutex_exit(&reparsed_door_lock
);
887 * Accepts the service-specific item from the reparse point and returns
888 * the service-specific data requested. The caller specifies the size of
889 * the buffer provided via *bufsize; the routine will fail with EOVERFLOW
890 * if the results will not fit in the buffer, in which case *bufsize will
891 * contain the number of bytes needed to hold the results.
893 * On success, return 0 and update *bufsize with the length of the actual
894 * result; otherwise return an error code.
897 reparse_kderef(const char *svc_type
, const char *svc_data
, char *buf
,
900 int err
, retries
, need_free
, retried_doorhd
;
901 size_t dlen
, res_len
;
903 door_arg_t door_args
;
904 reparsed_door_res_t
*resp
;
905 door_handle_t rp_door
;
907 if (svc_type
== NULL
|| svc_data
== NULL
|| buf
== NULL
||
911 /* get reparsed's door handle */
912 if ((rp_door
= reparse_door_get_handle()) == NULL
)
915 /* setup buffer for door_call args and results */
916 dlen
= strlen(svc_type
) + strlen(svc_data
) + 2;
917 if (*bufsize
< dlen
) {
918 darg
= kmem_alloc(dlen
, KM_SLEEP
);
921 darg
= buf
; /* use same buffer for door's args & results */
925 /* build argument string of door call */
926 (void) snprintf(darg
, dlen
, "%s:%s", svc_type
, svc_data
);
928 /* setup args for door call */
929 door_args
.data_ptr
= darg
;
930 door_args
.data_size
= dlen
;
931 door_args
.desc_ptr
= NULL
;
932 door_args
.desc_num
= 0;
933 door_args
.rbuf
= buf
;
934 door_args
.rsize
= *bufsize
;
936 /* do the door_call */
939 door_ki_hold(rp_door
);
940 while ((err
= door_ki_upcall_limited(rp_door
, &door_args
,
941 NULL
, SIZE_MAX
, 0)) != 0) {
942 if (err
== EAGAIN
|| err
== EINTR
) {
943 if (++retries
< REPARSED_DOORCALL_MAX_RETRY
) {
944 delay(SEC_TO_TICK(1));
947 } else if (err
== EBADF
) {
948 /* door server goes away... */
949 reparse_door_reset_handle();
951 if (retried_doorhd
== 0) {
952 door_ki_rele(rp_door
);
954 rp_door
= reparse_door_get_handle();
955 if (rp_door
!= NULL
) {
956 door_ki_hold(rp_door
);
965 door_ki_rele(rp_door
);
968 kmem_free(darg
, dlen
); /* done with args buffer */
973 resp
= (reparsed_door_res_t
*)door_args
.rbuf
;
974 if ((err
= resp
->res_status
) == 0) {
976 * We have to save the length of the results before the
977 * bcopy below, since it can be an overlapping copy that
978 * overwrites the reparsed_door_res_t structure at
979 * the beginning of the buffer.
981 res_len
= (size_t)resp
->res_len
;
983 /* deref call is ok */
984 if (res_len
> *bufsize
)
987 bcopy(resp
->res_data
, buf
, res_len
);
990 if (door_args
.rbuf
!= buf
)
991 kmem_free(door_args
.rbuf
, door_args
.rsize
);