/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

#include <security/mac/mac_framework.h>
static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
        .fo_read = vn_read,
        .fo_write = vn_write,
        .fo_truncate = vn_truncate,
        .fo_ioctl = vn_ioctl,
        .fo_poll = vn_poll,
        .fo_kqfilter = vn_kqfilter,
        .fo_stat = vn_statfile,
        .fo_close = vn_closefile,
        .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
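/*
 * Illustrative sketch (not part of the original file): descriptor-level
 * system calls reach these handlers through the fileops table rather
 * than by calling the vn_*() routines directly.  Assuming a held
 * struct file *fp backed by a vnode, something like
 *
 *	error = fo_read(fp, &auio, td->td_ucred, 0, td);
 *
 * dispatches through fp->f_ops to vn_read() declared above.
 */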
int
vn_open(ndp, flagp, cmode, fp)
        struct nameidata *ndp;
        int *flagp, cmode;
        struct file *fp;
{
        struct thread *td = ndp->ni_cnd.cn_thread;

        return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fp));
}
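/*
 * Illustrative sketch (not from this file): in-kernel opens pair
 * NDINIT() with vn_open(), here assuming a kernel-space path string
 * `path' and the current thread `td':
 *
 *	struct nameidata nd;
 *	int flags, error;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_SYSSPACE,
 *	    path, td);
 *	flags = FREAD;
 *	error = vn_open(&nd, &flags, 0, NULL);
 *	if (error == 0) {
 *		NDFREE(&nd, NDF_ONLY_PNBUF);
 *		... use the locked nd.ni_vp, then vn_close() it ...
 *	}
 */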
/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(ndp, flagp, cmode, cred, fp)
        struct nameidata *ndp;
        int *flagp, cmode;
        struct ucred *cred;
        struct file *fp;
{
        struct vnode *vp;
        struct mount *mp;
        struct thread *td = ndp->ni_cnd.cn_thread;
        struct vattr vat;
        struct vattr *vap = &vat;
        int mode, fmode, error;
        int vfslocked, mpsafe;

        mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
        fmode = *flagp;
        if (fmode & O_CREAT) {
                ndp->ni_cnd.cn_nameiop = CREATE;
                ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
                    MPSAFE | AUDITVNODE1;
                if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
                        ndp->ni_cnd.cn_flags |= FOLLOW;
                if ((error = namei(ndp)) != 0)
                        return (error);
                vfslocked = NDHASGIANT(ndp);
                if (!mpsafe)
                        ndp->ni_cnd.cn_flags &= ~MPSAFE;
                if (ndp->ni_vp == NULL) {
                        VATTR_NULL(vap);
                        vap->va_mode = cmode;
                        if (fmode & O_EXCL)
                                vap->va_vaflags |= VA_EXCLUSIVE;
                        if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
                                NDFREE(ndp, NDF_ONLY_PNBUF);
                                vput(ndp->ni_dvp);
                                VFS_UNLOCK_GIANT(vfslocked);
                                if ((error = vn_start_write(NULL, &mp,
                                    V_XSLEEP | PCATCH)) != 0)
                                        return (error);
                                goto restart;
                        }
                        error = mac_vnode_check_create(cred, ndp->ni_dvp,
                            &ndp->ni_cnd, vap);
                        if (error == 0) {
                                VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
                                error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
                                    &ndp->ni_cnd, vap);
                        }
                        vput(ndp->ni_dvp);
                        vn_finished_write(mp);
                        if (error) {
                                VFS_UNLOCK_GIANT(vfslocked);
                                NDFREE(ndp, NDF_ONLY_PNBUF);
                                return (error);
                        }
                        vp = ndp->ni_vp;
                } else {
                        if (ndp->ni_dvp == ndp->ni_vp)
                                vrele(ndp->ni_dvp);
                        else
                                vput(ndp->ni_dvp);
                        vp = ndp->ni_vp;
                        if (fmode & O_EXCL) {
                                error = EEXIST;
                                goto bad;
                        }
                }
        } else {
                ndp->ni_cnd.cn_nameiop = LOOKUP;
                ndp->ni_cnd.cn_flags = ISOPEN |
                    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
                    LOCKLEAF | MPSAFE | AUDITVNODE1;
                if ((error = namei(ndp)) != 0)
                        return (error);
                if (!mpsafe)
                        ndp->ni_cnd.cn_flags &= ~MPSAFE;
                vfslocked = NDHASGIANT(ndp);
                vp = ndp->ni_vp;
        }
        if (vp->v_type == VLNK) {
                error = EMLINK;
                goto bad;
        }
        if (vp->v_type == VSOCK) {
                error = EOPNOTSUPP;
                goto bad;
        }
        mode = 0;
        if (fmode & (FWRITE | O_TRUNC)) {
                if (vp->v_type == VDIR) {
                        error = EISDIR;
                        goto bad;
                }
                mode |= VWRITE;
        }
        if (fmode & FREAD)
                mode |= VREAD;
        if (fmode & O_APPEND)
                mode |= VAPPEND;
        error = mac_vnode_check_open(cred, vp, mode);
        if (error)
                goto bad;
        if ((fmode & O_CREAT) == 0) {
                if (mode & VWRITE) {
                        error = vn_writechk(vp);
                        if (error)
                                goto bad;
                }
                if (mode) {
                        error = VOP_ACCESS(vp, mode, cred, td);
                        if (error)
                                goto bad;
                }
        }
        if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
                goto bad;

        if (fmode & FWRITE)
                vp->v_writecount++;
        *flagp = fmode;
        ASSERT_VOP_ELOCKED(vp, "vn_open_cred");
        if (!mpsafe)
                VFS_UNLOCK_GIANT(vfslocked);
        return (0);
bad:
        NDFREE(ndp, NDF_ONLY_PNBUF);
        vput(vp);
        VFS_UNLOCK_GIANT(vfslocked);
        *flagp = fmode;
        ndp->ni_vp = NULL;
        return (error);
}
/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
        register struct vnode *vp;
{

        ASSERT_VOP_LOCKED(vp, "vn_writechk");
        /*
         * If there's shared text associated with
         * the vnode, try to free it up once.  If
         * we fail, we can't allow writing.
         */
        if (vp->v_vflag & VV_TEXT)
                return (ETXTBSY);

        return (0);
}
/*
 * Vnode close call.
 */
int
vn_close(vp, flags, file_cred, td)
        register struct vnode *vp;
        int flags;
        struct ucred *file_cred;
        struct thread *td;
{
        struct mount *mp;
        int error;

        VFS_ASSERT_GIANT(vp->v_mount);

        vn_start_write(vp, &mp, V_WAIT);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        if (flags & FWRITE) {
                VNASSERT(vp->v_writecount > 0, vp,
                    ("vn_close: negative writecount"));
                vp->v_writecount--;
        }
        error = VOP_CLOSE(vp, flags, file_cred, td);
        vput(vp);
        vn_finished_write(mp);
        return (error);
}
/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{

        /*
         * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
         * that the first I/O is normally considered to be slightly
         * sequential.  Seeking to offset 0 doesn't change sequentiality
         * unless previous seeks have reduced f_seqcount to 0, in which
         * case offset 0 is not special.
         */
        if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
            uio->uio_offset == fp->f_nextoff) {
                /*
                 * f_seqcount is in units of fixed-size blocks so that it
                 * depends mainly on the amount of sequential I/O and not
                 * much on the number of sequential I/O's.  The fixed size
                 * of 16384 is hard-coded here since it is (not quite) just
                 * a magic size that works well here.  This size is more
                 * closely related to the best I/O size for real disks than
                 * to any block size used by software.
                 */
                fp->f_seqcount += howmany(uio->uio_resid, 16384);
                if (fp->f_seqcount > IO_SEQMAX)
                        fp->f_seqcount = IO_SEQMAX;
                return (fp->f_seqcount << IO_SEQSHIFT);
        }

        /* Not sequential.  Quickly draw-down sequentiality. */
        if (fp->f_seqcount > 1)
                fp->f_seqcount = 1;
        else
                fp->f_seqcount = 0;
        return (0);
}
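/*
 * Worked example (illustrative): open() starts f_seqcount at 1, so a
 * first sequential read of 65536 bytes adds howmany(65536, 16384) == 4
 * and that read is itself tagged with (5 << IO_SEQSHIFT) in its ioflag;
 * continued sequential I/O saturates at IO_SEQMAX.
 */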
/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
        enum uio_rw rw;
        struct vnode *vp;
        void *base;
        int len;
        off_t offset;
        enum uio_seg segflg;
        int ioflg;
        struct ucred *active_cred;
        struct ucred *file_cred;
        int *aresid;
        struct thread *td;
{
        struct uio auio;
        struct iovec aiov;
        struct mount *mp;
        struct ucred *cred;
        int error;

        VFS_ASSERT_GIANT(vp->v_mount);

        if ((ioflg & IO_NODELOCKED) == 0) {
                mp = NULL;
                if (rw == UIO_WRITE) {
                        if (vp->v_type != VCHR &&
                            (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
                            != 0)
                                return (error);
                        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                } else {
                        /*
                         * XXX This should be LK_SHARED but I don't trust VFS
                         * enough to leave it like that until it has been
                         * reviewed further.
                         */
                        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                }
        }
        ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        aiov.iov_base = base;
        aiov.iov_len = len;
        auio.uio_resid = len;
        auio.uio_offset = offset;
        auio.uio_segflg = segflg;
        auio.uio_rw = rw;
        auio.uio_td = td;
        error = 0;
        if ((ioflg & IO_NOMACCHECK) == 0) {
                if (rw == UIO_READ)
                        error = mac_vnode_check_read(active_cred, file_cred,
                            vp);
                else
                        error = mac_vnode_check_write(active_cred, file_cred,
                            vp);
        }
        if (error == 0) {
                if (file_cred)
                        cred = file_cred;
                else
                        cred = active_cred;
                if (rw == UIO_READ)
                        error = VOP_READ(vp, &auio, ioflg, cred);
                else
                        error = VOP_WRITE(vp, &auio, ioflg, cred);
        }
        if (aresid)
                *aresid = auio.uio_resid;
        else
                if (auio.uio_resid && error == 0)
                        error = EIO;
        if ((ioflg & IO_NODELOCKED) == 0) {
                if (rw == UIO_WRITE && vp->v_type != VCHR)
                        vn_finished_write(mp);
                VOP_UNLOCK(vp, 0);
        }
        return (error);
}
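/*
 * Illustrative sketch (not from this file): a typical caller reads a
 * header from an already-locked vnode, with `buf', `vp', `cred' and
 * `td' supplied by its own context:
 *
 *	int resid;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
 *	    UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &resid, td);
 *	if (error == 0 && resid != 0)
 *		error = EIO;
 */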
/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
        enum uio_rw rw;
        struct vnode *vp;
        void *base;
        size_t len;
        off_t offset;
        enum uio_seg segflg;
        int ioflg;
        struct ucred *active_cred;
        struct ucred *file_cred;
        size_t *aresid;
        struct thread *td;
{
        int error = 0;
        int iaresid;

        VFS_ASSERT_GIANT(vp->v_mount);

        do {
                int chunk;

                /*
                 * Force `offset' to a multiple of MAXBSIZE except possibly
                 * for the first chunk, so that filesystems only need to
                 * write full blocks except possibly for the first and last
                 * chunks.
                 */
                chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

                if (chunk > len)
                        chunk = len;
                if (rw != UIO_READ && vp->v_type == VREG)
                        bwillwrite();
                iaresid = 0;
                error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
                    ioflg, active_cred, file_cred, &iaresid, td);
                len -= chunk;	/* aresid calc already includes length */
                if (error || iaresid)
                        break;
                offset += chunk;
                base = (char *)base + chunk;
                uio_yield();
        } while (len);
        if (aresid)
                *aresid = len + iaresid;
        return (error);
}
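/*
 * Worked example (illustrative): with MAXBSIZE == 65536, writing
 * 200000 bytes at offset 1000 issues chunks of 64536, 65536, 65536 and
 * 4392 bytes, so every chunk after the first starts on a MAXBSIZE
 * boundary.
 */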
/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
        struct file *fp;
        struct uio *uio;
        struct ucred *active_cred;
        int flags;
        struct thread *td;
{
        struct vnode *vp;
        struct mtx *mtxp;
        int error, ioflag;
        int vfslocked;

        KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
            uio->uio_td, td));
        mtxp = NULL;
        vp = fp->f_vnode;
        ioflag = 0;
        if (fp->f_flag & FNONBLOCK)
                ioflag |= IO_NDELAY;
        if (fp->f_flag & O_DIRECT)
                ioflag |= IO_DIRECT;
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
        /*
         * According to McKusick the vn lock was protecting f_offset here.
         * It is now protected by the FOFFSET_LOCKED flag.
         */
        if ((flags & FOF_OFFSET) == 0) {
                mtxp = mtx_pool_find(mtxpool_sleep, fp);
                mtx_lock(mtxp);
                while (fp->f_vnread_flags & FOFFSET_LOCKED) {
                        fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
                        msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
                            "vnread offlock", 0);
                }
                fp->f_vnread_flags |= FOFFSET_LOCKED;
                mtx_unlock(mtxp);
                vn_lock(vp, LK_SHARED | LK_RETRY);
                uio->uio_offset = fp->f_offset;
        } else
                vn_lock(vp, LK_SHARED | LK_RETRY);

        ioflag |= sequential_heuristic(uio, fp);
        error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
        if (error == 0)
                error = VOP_READ(vp, uio, ioflag, fp->f_cred);
        if ((flags & FOF_OFFSET) == 0) {
                fp->f_offset = uio->uio_offset;
                mtx_lock(mtxp);
                if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
                        wakeup(&fp->f_vnread_flags);
                fp->f_vnread_flags = 0;
                mtx_unlock(mtxp);
        }
        fp->f_nextoff = uio->uio_offset;
        VOP_UNLOCK(vp, 0);
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}
/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
        struct file *fp;
        struct uio *uio;
        struct ucred *active_cred;
        int flags;
        struct thread *td;
{
        struct vnode *vp;
        struct mount *mp;
        int error, ioflag;
        int vfslocked;

        KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
            uio->uio_td, td));
        vp = fp->f_vnode;
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        if (vp->v_type == VREG)
                bwillwrite();
        ioflag = IO_UNIT;
        if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
                ioflag |= IO_APPEND;
        if (fp->f_flag & FNONBLOCK)
                ioflag |= IO_NDELAY;
        if (fp->f_flag & O_DIRECT)
                ioflag |= IO_DIRECT;
        if ((fp->f_flag & O_FSYNC) ||
            (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
                ioflag |= IO_SYNC;
        mp = NULL;
        if (vp->v_type != VCHR &&
            (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
                goto unlock;
        VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        if ((flags & FOF_OFFSET) == 0)
                uio->uio_offset = fp->f_offset;
        ioflag |= sequential_heuristic(uio, fp);
        error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
        if (error == 0)
                error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
        if ((flags & FOF_OFFSET) == 0)
                fp->f_offset = uio->uio_offset;
        fp->f_nextoff = uio->uio_offset;
        VOP_UNLOCK(vp, 0);
        if (vp->v_type != VCHR)
                vn_finished_write(mp);
unlock:
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}
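/*
 * Illustrative note: a file opened with O_APPEND | O_FSYNC therefore
 * reaches VOP_WRITE() with IO_APPEND | IO_SYNC in ioflag, and IO_SYNC
 * is likewise forced for any file on an MNT_SYNCHRONOUS mount.
 */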
/*
 * File table truncate routine.
 */
static int
vn_truncate(fp, length, active_cred, td)
        struct file *fp;
        off_t length;
        struct ucred *active_cred;
        struct thread *td;
{
        struct vattr vattr;
        struct mount *mp;
        struct vnode *vp;
        int vfslocked;
        int error;

        vp = fp->f_vnode;
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
        if (error) {
                VFS_UNLOCK_GIANT(vfslocked);
                return (error);
        }
        VOP_LEASE(vp, td, active_cred, LEASE_WRITE);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        if (vp->v_type == VDIR) {
                error = EISDIR;
                goto out;
        }
        error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
        if (error)
                goto out;
        error = vn_writechk(vp);
        if (error == 0) {
                VATTR_NULL(&vattr);
                vattr.va_size = length;
                error = VOP_SETATTR(vp, &vattr, fp->f_cred);
        }
out:
        VOP_UNLOCK(vp, 0);
        vn_finished_write(mp);
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}
/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
        struct file *fp;
        struct stat *sb;
        struct ucred *active_cred;
        struct thread *td;
{
        struct vnode *vp = fp->f_vnode;
        int vfslocked;
        int error;

        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
        VOP_UNLOCK(vp, 0);
        VFS_UNLOCK_GIANT(vfslocked);

        return (error);
}
/*
 * Stat a vnode; implementation for the stat syscall.
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
        struct vnode *vp;
        register struct stat *sb;
        struct ucred *active_cred;
        struct ucred *file_cred;
        struct thread *td;
{
        struct vattr vattr;
        register struct vattr *vap;
        int error;
        u_short mode;

        error = mac_vnode_check_stat(active_cred, file_cred, vp);
        if (error)
                return (error);

        vap = &vattr;
        error = VOP_GETATTR(vp, vap, active_cred);
        if (error)
                return (error);

        /*
         * Zero the spare stat fields
         */
        bzero(sb, sizeof *sb);

        /*
         * Copy from vattr table
         */
        if (vap->va_fsid != VNOVAL)
                sb->st_dev = vap->va_fsid;
        else
                sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
        sb->st_ino = vap->va_fileid;
        mode = vap->va_mode;
        switch (vap->va_type) {
        case VREG:
                mode |= S_IFREG;
                break;
        case VDIR:
                mode |= S_IFDIR;
                break;
        case VBLK:
                mode |= S_IFBLK;
                break;
        case VCHR:
                mode |= S_IFCHR;
                break;
        case VLNK:
                mode |= S_IFLNK;
                break;
        case VSOCK:
                mode |= S_IFSOCK;
                break;
        case VFIFO:
                mode |= S_IFIFO;
                break;
        default:
                return (EBADF);
        }
        sb->st_mode = mode;
        sb->st_nlink = vap->va_nlink;
        sb->st_uid = vap->va_uid;
        sb->st_gid = vap->va_gid;
        sb->st_rdev = vap->va_rdev;
        if (vap->va_size > OFF_MAX)
                return (EOVERFLOW);
        sb->st_size = vap->va_size;
        sb->st_atimespec = vap->va_atime;
        sb->st_mtimespec = vap->va_mtime;
        sb->st_ctimespec = vap->va_ctime;
        sb->st_birthtimespec = vap->va_birthtime;

        /*
         * According to www.opengroup.org, the meaning of st_blksize is
         *   "a filesystem-specific preferred I/O block size for this
         *    object.  In some filesystem types, this may vary from file
         *    to file"
         * Default to PAGE_SIZE after much discussion.
         * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
         */
        sb->st_blksize = PAGE_SIZE;

        sb->st_flags = vap->va_flags;
        if (priv_check(td, PRIV_VFS_GENERATION))
                sb->st_gen = 0;
        else
                sb->st_gen = vap->va_gen;

        sb->st_blocks = vap->va_bytes / S_BLKSIZE;
        return (0);
}
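/*
 * Worked example (illustrative): a file occupying 8192 bytes of
 * storage reports st_blocks == 8192 / S_BLKSIZE == 16, S_BLKSIZE being
 * the historic 512-byte unit in which st_blocks is expressed.
 */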
/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
        struct file *fp;
        u_long com;
        void *data;
        struct ucred *active_cred;
        struct thread *td;
{
        struct vnode *vp = fp->f_vnode;
        struct vattr vattr;
        int vfslocked;
        int error;

        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        error = ENOTTY;
        switch (vp->v_type) {
        case VREG:
        case VDIR:
                if (com == FIONREAD) {
                        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                        error = VOP_GETATTR(vp, &vattr, active_cred);
                        VOP_UNLOCK(vp, 0);
                        if (!error)
                                *(int *)data = vattr.va_size - fp->f_offset;
                }
                if (com == FIONBIO || com == FIOASYNC)	/* XXX */
                        error = 0;
                else
                        error = VOP_IOCTL(vp, com, data, fp->f_flag,
                            active_cred, td);
                break;
        default:
                break;
        }
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}
/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
        struct file *fp;
        int events;
        struct ucred *active_cred;
        struct thread *td;
{
        struct vnode *vp;
        int vfslocked;
        int error;

        vp = fp->f_vnode;
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
        VOP_UNLOCK(vp, 0);
        if (error == 0)
                error = VOP_POLL(vp, events, fp->f_cred, td);
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}
/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 */
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
        int error;

        VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
            ("vn_lock called with no locktype."));
        do {
                error = VOP_LOCK1(vp, flags, file, line);
                flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
                KASSERT((flags & LK_RETRY) == 0 || error == 0,
                    ("LK_RETRY set with incompatible flags %d\n", flags));
                /*
                 * Callers specify LK_RETRY if they wish to get dead vnodes.
                 * If RETRY is not set, we return ENOENT instead.
                 */
                if (error == 0 && vp->v_iflag & VI_DOOMED &&
                    (flags & LK_RETRY) == 0) {
                        VOP_UNLOCK(vp, 0);
                        error = ENOENT;
                        break;
                }
        } while (flags & LK_RETRY && error != 0);
        return (error);
}
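/*
 * Illustrative sketch (not from this file): without LK_RETRY a caller
 * must be prepared for ENOENT from a doomed vnode:
 *
 *	if ((error = vn_lock(vp, LK_SHARED)) == 0) {
 *		... the vnode is locked and not doomed ...
 *		VOP_UNLOCK(vp, 0);
 *	}
 */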
/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
        struct file *fp;
        struct thread *td;
{
        struct vnode *vp;
        struct flock lf;
        int vfslocked;
        int error;

        vp = fp->f_vnode;
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
                lf.l_whence = SEEK_SET;
                lf.l_start = 0;
                lf.l_len = 0;
                lf.l_type = F_UNLCK;
                (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
        }

        fp->f_ops = &badfileops;

        error = vn_close(vp, fp->f_flag, fp->f_cred, td);
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}
/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
        struct vnode *vp;
        struct mount **mpp;
        int flags;
{
        struct mount *mp;
        int error;

        error = 0;
        /*
         * If a vnode is provided, get and return the mount point to
         * which it will write.
         */
        if (vp != NULL) {
                if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
                        *mpp = NULL;
                        if (error != EOPNOTSUPP)
                                return (error);
                        return (0);
                }
        }
        if ((mp = *mpp) == NULL)
                return (0);
        MNT_ILOCK(mp);
        /*
         * Check on status of suspension.
         */
        while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
                if (flags & V_NOWAIT) {
                        error = EWOULDBLOCK;
                        goto unlock;
                }
                error = msleep(&mp->mnt_flag, MNT_MTX(mp),
                    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
                if (error)
                        goto unlock;
        }
        if (flags & V_XSLEEP)
                goto unlock;
        mp->mnt_writeopcount++;
unlock:
        MNT_IUNLOCK(mp);
        return (error);
}
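/*
 * Illustrative sketch (not from this file): modifying operations
 * bracket their work with this accounting, as vn_truncate() above
 * does:
 *
 *	struct mount *mp;
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	... VOP_SETATTR() or similar ...
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 */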
/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
        struct vnode *vp;
        struct mount *mp;
        int flags;
{
        int error;

        if (vp != NULL) {
                if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
                        if (error != EOPNOTSUPP)
                                return (error);
                        return (0);
                }
        }
        /*
         * If we are not suspended or have not yet reached suspended
         * mode, then let the operation proceed.
         */
        if (mp == NULL)
                return (0);
        MNT_ILOCK(mp);
        if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
                MNT_IUNLOCK(mp);
                return (0);
        }
        if (flags & V_NOWAIT) {
                MNT_IUNLOCK(mp);
                return (EWOULDBLOCK);
        }
        /*
         * Wait for the suspension to finish.
         */
        error = msleep(&mp->mnt_flag, MNT_MTX(mp),
            (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
        return (error);
}
/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
        struct vnode *vp;
        struct mount **mpp;
        int flags;
{
        struct mount *mp;
        int error;

retry:
        if (vp != NULL) {
                if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
                        *mpp = NULL;
                        if (error != EOPNOTSUPP)
                                return (error);
                        return (0);
                }
        }
        /*
         * If we are not suspended or have not yet reached suspended
         * mode, then let the operation proceed.
         */
        if ((mp = *mpp) == NULL)
                return (0);
        MNT_ILOCK(mp);
        if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
                mp->mnt_secondary_writes++;
                mp->mnt_secondary_accwrites++;
                MNT_IUNLOCK(mp);
                return (0);
        }
        if (flags & V_NOWAIT) {
                MNT_IUNLOCK(mp);
                return (EWOULDBLOCK);
        }
        /*
         * Wait for the suspension to finish.
         */
        error = msleep(&mp->mnt_flag, MNT_MTX(mp),
            (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
        if (error == 0)
                goto retry;
        return (error);
}
/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
        struct mount *mp;
{
        if (mp == NULL)
                return;
        MNT_ILOCK(mp);
        mp->mnt_writeopcount--;
        if (mp->mnt_writeopcount < 0)
                panic("vn_finished_write: neg cnt");
        if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
            mp->mnt_writeopcount <= 0)
                wakeup(&mp->mnt_writeopcount);
        MNT_IUNLOCK(mp);
}
/*
 * Filesystem secondary write operation has completed. If we are
 * suspending and this operation is the last one, notify the suspender
 * that the suspension is now in effect.
 */
void
vn_finished_secondary_write(mp)
        struct mount *mp;
{
        if (mp == NULL)
                return;
        MNT_ILOCK(mp);
        mp->mnt_secondary_writes--;
        if (mp->mnt_secondary_writes < 0)
                panic("vn_finished_secondary_write: neg cnt");
        if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
            mp->mnt_secondary_writes <= 0)
                wakeup(&mp->mnt_secondary_writes);
        MNT_IUNLOCK(mp);
}
/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
        struct mount *mp;
{
        struct thread *td = curthread;
        int error;

        MNT_ILOCK(mp);
        if (mp->mnt_kern_flag & MNTK_SUSPEND) {
                MNT_IUNLOCK(mp);
                return (0);
        }
        mp->mnt_kern_flag |= MNTK_SUSPEND;
        if (mp->mnt_writeopcount > 0)
                (void) msleep(&mp->mnt_writeopcount,
                    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
        else
                MNT_IUNLOCK(mp);
        if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0)
                vfs_write_resume(mp);
        return (error);
}
/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
        struct mount *mp;
{

        MNT_ILOCK(mp);
        if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
                mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
                    MNTK_SUSPENDED);
                wakeup(&mp->mnt_writeopcount);
                wakeup(&mp->mnt_flag);
        }
        MNT_IUNLOCK(mp);
}
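/*
 * Illustrative sketch (not from this file): snapshot-style code is the
 * expected consumer, quiescing the filesystem around a consistency
 * point:
 *
 *	if ((error = vfs_write_suspend(mp)) == 0) {
 *		... no writes are in progress; capture state ...
 *		vfs_write_resume(mp);
 *	}
 */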
/*
 * Implement kqueues for files by translating it to vnode operation.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
        int vfslocked;
        int error;

        vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
        error = VOP_KQFILTER(fp->f_vnode, kn);
        VFS_UNLOCK_GIANT(vfslocked);

        return (error);
}
/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
        struct uio auio;
        struct iovec iov;
        int error;

        iov.iov_len = *buflen;
        iov.iov_base = buf;

        auio.uio_iov = &iov;
        auio.uio_iovcnt = 1;
        auio.uio_rw = UIO_READ;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = td;
        auio.uio_offset = 0;
        auio.uio_resid = *buflen;

        if ((ioflg & IO_NODELOCKED) == 0)
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

        ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

        /* authorize attribute retrieval as kernel */
        error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
            td);

        if ((ioflg & IO_NODELOCKED) == 0)
                VOP_UNLOCK(vp, 0);

        if (error == 0)
                *buflen = *buflen - auio.uio_resid;

        return (error);
}
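/*
 * Illustrative sketch (not from this file): fetching a small attribute
 * into a stack buffer; the attribute name here is only an example:
 *
 *	char buf[64];
 *	int buflen = sizeof(buf);
 *
 *	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "posix1e.acl_access", &buflen, buf, td);
 *
 * On success, buflen has been updated to the number of bytes returned.
 */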
/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
        struct uio auio;
        struct iovec iov;
        struct mount *mp;
        int error;

        iov.iov_len = buflen;
        iov.iov_base = buf;

        auio.uio_iov = &iov;
        auio.uio_iovcnt = 1;
        auio.uio_rw = UIO_WRITE;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = td;
        auio.uio_offset = 0;
        auio.uio_resid = buflen;

        if ((ioflg & IO_NODELOCKED) == 0) {
                if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
                        return (error);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        }

        ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

        /* authorize attribute setting as kernel */
        error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

        if ((ioflg & IO_NODELOCKED) == 0) {
                vn_finished_write(mp);
                VOP_UNLOCK(vp, 0);
        }

        return (error);
}
int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
        struct mount *mp;
        int error;

        if ((ioflg & IO_NODELOCKED) == 0) {
                if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
                        return (error);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        }

        ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

        /* authorize attribute removal as kernel */
        error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
        if (error == EOPNOTSUPP)
                error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
                    NULL, td);

        if ((ioflg & IO_NODELOCKED) == 0) {
                vn_finished_write(mp);
                VOP_UNLOCK(vp, 0);
        }

        return (error);
}