4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
28 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
29 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/t_lock.h>
35 #include <sys/systm.h>
36 #include <sys/sysmacros.h>
40 #include <sys/vfs_opreg.h>
41 #include <sys/vnode.h>
43 #include <sys/fcntl.h>
44 #include <sys/flock.h>
47 #include <sys/errno.h>
50 #include <sys/dirent.h>
51 #include <sys/pathname.h>
52 #include <sys/vmsystm.h>
53 #include <sys/fs/tmp.h>
54 #include <sys/fs/tmpnode.h>
57 #include <vm/seg_vn.h>
58 #include <vm/seg_map.h>
64 #include <sys/cmn_err.h>
65 #include <sys/debug.h>
69 #include <sys/vtrace.h>
70 #include <sys/policy.h>
71 #include <fs/fs_subr.h>
73 static int tmp_getapage(struct vnode
*, u_offset_t
, size_t, uint_t
*,
74 page_t
**, size_t, struct seg
*, caddr_t
, enum seg_rw
, struct cred
*);
75 static int tmp_putapage(struct vnode
*, page_t
*, u_offset_t
*, size_t *,
80 tmp_open(struct vnode
**vpp
, int flag
, struct cred
*cred
, caller_context_t
*ct
)
83 * swapon to a tmpfs file is not supported so access
84 * is denied on open if VISSWAP is set.
86 if ((*vpp
)->v_flag
& VISSWAP
)
101 cleanlocks(vp
, ttoproc(curthread
)->p_pid
, 0);
102 cleanshares(vp
, ttoproc(curthread
)->p_pid
);
107 * wrtmp does the real work of write requests for tmpfs.
115 struct caller_context
*ct
)
117 pgcnt_t pageoffset
; /* offset in pages */
118 ulong_t segmap_offset
; /* pagesize byte offset into segmap */
119 caddr_t base
; /* base of segmap */
120 ssize_t bytes
; /* bytes to uiomove */
121 pfn_t pagenumber
; /* offset in pages into tmp file */
124 int pagecreate
; /* == 1 if we allocated a page */
126 rlim64_t limit
= uio
->uio_llimit
;
127 long oresid
= uio
->uio_resid
;
130 long tn_size_changed
= 0;
135 ASSERT(vp
->v_type
== VREG
);
137 TRACE_1(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_START
,
138 "tmp_wrtmp_start:vp %p", vp
);
140 ASSERT(RW_WRITE_HELD(&tp
->tn_contents
));
141 ASSERT(RW_WRITE_HELD(&tp
->tn_rwlock
));
143 if (MANDLOCK(vp
, tp
->tn_mode
)) {
144 rw_exit(&tp
->tn_contents
);
146 * tmp_getattr ends up being called by chklock
148 error
= chklock(vp
, FWRITE
, uio
->uio_loffset
, uio
->uio_resid
,
150 rw_enter(&tp
->tn_contents
, RW_WRITER
);
152 TRACE_2(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_END
,
153 "tmp_wrtmp_end:vp %p error %d", vp
, error
);
158 if (uio
->uio_loffset
< 0)
161 if (limit
== RLIM64_INFINITY
|| limit
> MAXOFFSET_T
)
164 if (uio
->uio_loffset
>= limit
) {
165 proc_t
*p
= ttoproc(curthread
);
167 mutex_enter(&p
->p_lock
);
168 (void) rctl_action(rctlproc_legacy
[RLIMIT_FSIZE
], p
->p_rctls
,
169 p
, RCA_UNSAFE_SIGINFO
);
170 mutex_exit(&p
->p_lock
);
174 if (uio
->uio_loffset
>= MAXOFF_T
) {
175 TRACE_2(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_END
,
176 "tmp_wrtmp_end:vp %p error %d", vp
, EINVAL
);
180 if (uio
->uio_resid
== 0) {
181 TRACE_2(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_END
,
182 "tmp_wrtmp_end:vp %p error %d", vp
, 0);
186 if (limit
> MAXOFF_T
)
193 offset
= (long)uio
->uio_offset
;
194 pageoffset
= offset
& PAGEOFFSET
;
196 * A maximum of PAGESIZE bytes of data is transferred
197 * each pass through this loop
199 bytes
= MIN(PAGESIZE
- pageoffset
, uio
->uio_resid
);
201 if (offset
+ bytes
>= limit
) {
202 if (offset
>= limit
) {
206 bytes
= limit
- offset
;
208 pagenumber
= btop(offset
);
211 * delta is the amount of anonymous memory
212 * to reserve for the file.
213 * We always reserve in pagesize increments so
214 * unless we're extending the file into a new page,
215 * we don't need to call tmp_resv.
217 delta
= offset
+ bytes
-
218 P2ROUNDUP_TYPED(tp
->tn_size
, PAGESIZE
, u_offset_t
);
221 if (tmp_resv(tm
, tp
, delta
, pagecreate
)) {
223 * Log file system full in the zone that owns
224 * the tmpfs mount, as well as in the global
227 zcmn_err(tm
->tm_vfsp
->vfs_zone
->zone_id
,
228 CE_WARN
, "%s: File system full, "
229 "swap space limit exceeded",
232 if (tm
->tm_vfsp
->vfs_zone
->zone_id
!=
235 vfs_t
*vfs
= tm
->tm_vfsp
;
237 zcmn_err(GLOBAL_ZONEID
,
238 CE_WARN
, "%s: File system full, "
239 "swap space limit exceeded",
240 vfs
->vfs_vnodecovered
->v_path
);
245 tmpnode_growmap(tp
, (ulong_t
)offset
+ bytes
);
247 /* grow the file to the new length */
248 if (offset
+ bytes
> tp
->tn_size
) {
250 old_tn_size
= tp
->tn_size
;
252 * Postpone updating tp->tn_size until uiomove() is
255 new_tn_size
= offset
+ bytes
;
257 if (bytes
== PAGESIZE
) {
259 * Writing whole page so reading from disk
267 * If writing past EOF or filling in a hole
268 * we need to allocate an anon slot.
270 if (anon_get_ptr(tp
->tn_anon
, pagenumber
) == NULL
) {
271 (void) anon_set_ptr(tp
->tn_anon
, pagenumber
,
272 anon_alloc(vp
, ptob(pagenumber
)), ANON_SLEEP
);
278 * We have to drop the contents lock to allow the VM
279 * system to reacquire it in tmp_getpage()
281 rw_exit(&tp
->tn_contents
);
284 * Touch the page and fault it in if it is not in core
285 * before segmap_getmapflt or vpm_data_copy can lock it.
286 * This is to avoid the deadlock if the buffer is mapped
287 * to the same file through mmap which we want to write.
289 uio_prefaultpages((long)bytes
, uio
);
294 * Copy data. If new pages are created, part of
295 * the page that is not written will be initizliazed
298 error
= vpm_data_copy(vp
, offset
, bytes
, uio
,
299 !pagecreate
, &newpage
, 1, S_WRITE
);
301 /* Get offset within the segmap mapping */
302 segmap_offset
= (offset
& PAGEMASK
) & MAXBOFFSET
;
303 base
= segmap_getmapflt(segkmap
, vp
,
304 (offset
& MAXBMASK
), PAGESIZE
, !pagecreate
,
309 if (!vpm_enable
&& pagecreate
) {
311 * segmap_pagecreate() returns 1 if it calls
312 * page_create_va() to allocate any pages.
314 newpage
= segmap_pagecreate(segkmap
,
315 base
+ segmap_offset
, (size_t)PAGESIZE
, 0);
317 * Clear from the beginning of the page to the starting
318 * offset of the data.
321 (void) kzero(base
+ segmap_offset
,
326 error
= uiomove(base
+ segmap_offset
+ pageoffset
,
327 (long)bytes
, UIO_WRITE
, uio
);
330 if (!vpm_enable
&& pagecreate
&&
331 uio
->uio_offset
< P2ROUNDUP(offset
+ bytes
, PAGESIZE
)) {
332 long zoffset
; /* zero from offset into page */
334 * We created pages w/o initializing them completely,
335 * thus we need to zero the part that wasn't set up.
336 * This happens on most EOF write cases and if
337 * we had some sort of error during the uiomove.
341 nmoved
= uio
->uio_offset
- offset
;
342 ASSERT((nmoved
+ pageoffset
) <= PAGESIZE
);
345 * Zero from the end of data in the page to the
348 if ((zoffset
= pageoffset
+ nmoved
) < PAGESIZE
)
349 (void) kzero(base
+ segmap_offset
+ zoffset
,
350 (size_t)PAGESIZE
- zoffset
);
354 * Unlock the pages which have been allocated by
355 * page_create_va() in segmap_pagecreate()
357 if (!vpm_enable
&& newpage
) {
358 segmap_pageunlock(segkmap
, base
+ segmap_offset
,
359 (size_t)PAGESIZE
, S_WRITE
);
364 * If we failed on a write, we must
365 * be sure to invalidate any pages that may have
369 (void) vpm_sync_pages(vp
, offset
, PAGESIZE
,
372 (void) segmap_release(segkmap
, base
, SM_INVAL
);
376 error
= vpm_sync_pages(vp
, offset
, PAGESIZE
,
379 error
= segmap_release(segkmap
, base
, 0);
384 * Re-acquire contents lock.
386 rw_enter(&tp
->tn_contents
, RW_WRITER
);
392 tp
->tn_size
= new_tn_size
;
395 * If the uiomove failed, fix up tn_size.
398 if (tn_size_changed
) {
400 * The uiomove failed, and we
401 * allocated blocks,so get rid
404 (void) tmpnode_trunc(tm
, tp
,
405 (ulong_t
)old_tn_size
);
409 * XXX - Can this be out of the loop?
411 if ((tp
->tn_mode
& (S_IXUSR
| S_IXGRP
| S_IXOTH
)) &&
412 (tp
->tn_mode
& (S_ISUID
| S_ISGID
)) &&
413 secpolicy_vnode_setid_retain(cr
,
414 (tp
->tn_mode
& S_ISUID
) != 0 && tp
->tn_uid
== 0)) {
416 * Clear Set-UID & Set-GID bits on
417 * successful write if not privileged
418 * and at least one of the execute bits
419 * is set. If we always clear Set-GID,
420 * mandatory file and record locking is
423 tp
->tn_mode
&= ~(S_ISUID
| S_ISGID
);
429 } while (error
== 0 && uio
->uio_resid
> 0 && bytes
!= 0);
433 * If we've already done a partial-write, terminate
434 * the write but return no error.
436 if (oresid
!= uio
->uio_resid
)
438 TRACE_2(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_END
,
439 "tmp_wrtmp_end:vp %p error %d", vp
, error
);
444 * rdtmp does the real work of read requests for tmpfs.
451 struct caller_context
*ct
)
453 ulong_t pageoffset
; /* offset in tmpfs file (uio_offset) */
454 ulong_t segmap_offset
; /* pagesize byte offset into segmap */
455 caddr_t base
; /* base of segmap */
456 ssize_t bytes
; /* bytes to uiomove */
459 long oresid
= uio
->uio_resid
;
466 TRACE_1(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_START
, "tmp_rdtmp_start:vp %p",
469 ASSERT(RW_LOCK_HELD(&tp
->tn_contents
));
471 if (MANDLOCK(vp
, tp
->tn_mode
)) {
472 rw_exit(&tp
->tn_contents
);
474 * tmp_getattr ends up being called by chklock
476 error
= chklock(vp
, FREAD
, uio
->uio_loffset
, uio
->uio_resid
,
478 rw_enter(&tp
->tn_contents
, RW_READER
);
480 TRACE_2(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_END
,
481 "tmp_rdtmp_end:vp %p error %d", vp
, error
);
485 ASSERT(tp
->tn_type
== VREG
);
487 if (uio
->uio_loffset
>= MAXOFF_T
) {
488 TRACE_2(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_END
,
489 "tmp_rdtmp_end:vp %p error %d", vp
, EINVAL
);
492 if (uio
->uio_loffset
< 0)
494 if (uio
->uio_resid
== 0) {
495 TRACE_2(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_END
,
496 "tmp_rdtmp_end:vp %p error %d", vp
, 0);
506 offset
= uio
->uio_offset
;
507 pageoffset
= offset
& PAGEOFFSET
;
508 bytes
= MIN(PAGESIZE
- pageoffset
, uio
->uio_resid
);
510 diff
= tp
->tn_size
- offset
;
520 * We have to drop the contents lock to allow the VM system
521 * to reacquire it in tmp_getpage() should the uiomove cause a
524 rw_exit(&tp
->tn_contents
);
530 error
= vpm_data_copy(vp
, offset
, bytes
, uio
, 1, NULL
,
533 segmap_offset
= (offset
& PAGEMASK
) & MAXBOFFSET
;
534 base
= segmap_getmapflt(segkmap
, vp
, offset
& MAXBMASK
,
537 error
= uiomove(base
+ segmap_offset
+ pageoffset
,
538 (long)bytes
, UIO_READ
, uio
);
543 (void) vpm_sync_pages(vp
, offset
, PAGESIZE
, 0);
545 (void) segmap_release(segkmap
, base
, 0);
549 error
= vpm_sync_pages(vp
, offset
, PAGESIZE
,
552 error
= segmap_release(segkmap
, base
, 0);
557 * Re-acquire contents lock.
559 rw_enter(&tp
->tn_contents
, RW_READER
);
561 } while (error
== 0 && uio
->uio_resid
> 0);
564 gethrestime(&tp
->tn_atime
);
567 * If we've already done a partial read, terminate
568 * the read but return no error.
570 if (oresid
!= uio
->uio_resid
)
573 TRACE_2(TR_FAC_TMPFS
, TR_TMPFS_RWTMP_END
,
574 "tmp_rdtmp_end:vp %x error %d", vp
, error
);
580 tmp_read(struct vnode
*vp
, struct uio
*uiop
, int ioflag
, cred_t
*cred
,
581 struct caller_context
*ct
)
583 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
584 struct tmount
*tm
= (struct tmount
*)VTOTM(vp
);
588 * We don't currently support reading non-regular files
590 if (vp
->v_type
== VDIR
)
592 if (vp
->v_type
!= VREG
)
595 * tmp_rwlock should have already been called from layers above
597 ASSERT(RW_READ_HELD(&tp
->tn_rwlock
));
599 rw_enter(&tp
->tn_contents
, RW_READER
);
601 error
= rdtmp(tm
, tp
, uiop
, ct
);
603 rw_exit(&tp
->tn_contents
);
609 tmp_write(struct vnode
*vp
, struct uio
*uiop
, int ioflag
, struct cred
*cred
,
610 struct caller_context
*ct
)
612 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
613 struct tmount
*tm
= (struct tmount
*)VTOTM(vp
);
617 * We don't currently support writing to non-regular files
619 if (vp
->v_type
!= VREG
)
620 return (EINVAL
); /* XXX EISDIR? */
623 * tmp_rwlock should have already been called from layers above
625 ASSERT(RW_WRITE_HELD(&tp
->tn_rwlock
));
627 rw_enter(&tp
->tn_contents
, RW_WRITER
);
629 if (ioflag
& FAPPEND
) {
631 * In append mode start at end of file.
633 uiop
->uio_loffset
= tp
->tn_size
;
636 error
= wrtmp(tm
, tp
, uiop
, cred
, ct
);
638 rw_exit(&tp
->tn_contents
);
652 caller_context_t
*ct
)
664 caller_context_t
*ct
)
666 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
672 * A special case to handle the root tnode on a diskless nfs
673 * client who may have had its uid and gid inherited
674 * from an nfs vnode with nobody ownership. Likely the
675 * root filesystem. After nfs is fully functional the uid/gid
676 * may be mapable so ask again.
677 * vfsp can't get unmounted because we hold vp.
679 if (vp
->v_flag
& VROOT
&&
680 (mvp
= vp
->v_vfsp
->vfs_vnodecovered
) != NULL
) {
681 mutex_enter(&tp
->tn_tlock
);
682 if (tp
->tn_uid
== UID_NOBODY
|| tp
->tn_gid
== GID_NOBODY
) {
683 mutex_exit(&tp
->tn_tlock
);
684 bzero(&va
, sizeof (struct vattr
));
685 va
.va_mask
= AT_UID
|AT_GID
;
686 attrs
= VOP_GETATTR(mvp
, &va
, 0, cred
, ct
);
688 mutex_exit(&tp
->tn_tlock
);
691 mutex_enter(&tp
->tn_tlock
);
693 tp
->tn_uid
= va
.va_uid
;
694 tp
->tn_gid
= va
.va_gid
;
696 vap
->va_type
= vp
->v_type
;
697 vap
->va_mode
= tp
->tn_mode
& MODEMASK
;
698 vap
->va_uid
= tp
->tn_uid
;
699 vap
->va_gid
= tp
->tn_gid
;
700 vap
->va_fsid
= tp
->tn_fsid
;
701 vap
->va_nodeid
= (ino64_t
)tp
->tn_nodeid
;
702 vap
->va_nlink
= tp
->tn_nlink
;
703 vap
->va_size
= (u_offset_t
)tp
->tn_size
;
704 vap
->va_atime
= tp
->tn_atime
;
705 vap
->va_mtime
= tp
->tn_mtime
;
706 vap
->va_ctime
= tp
->tn_ctime
;
707 vap
->va_blksize
= PAGESIZE
;
708 vap
->va_rdev
= tp
->tn_rdev
;
709 vap
->va_seq
= tp
->tn_seq
;
712 * XXX Holes are not taken into account. We could take the time to
713 * run through the anon array looking for allocated slots...
715 vap
->va_nblocks
= (fsblkcnt64_t
)btodb(ptob(btopr(vap
->va_size
)));
716 mutex_exit(&tp
->tn_tlock
);
727 caller_context_t
*ct
)
729 struct tmount
*tm
= (struct tmount
*)VTOTM(vp
);
730 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
736 * Cannot set these attributes
738 if ((vap
->va_mask
& AT_NOSET
) || (vap
->va_mask
& AT_XVATTR
))
741 mutex_enter(&tp
->tn_tlock
);
745 * Change file access modes. Must be owner or have sufficient
748 error
= secpolicy_vnode_setattr(cred
, vp
, vap
, get
, flags
, tmp_taccess
,
756 if (mask
& AT_MODE
) {
757 get
->va_mode
&= S_IFMT
;
758 get
->va_mode
|= vap
->va_mode
& ~S_IFMT
;
762 get
->va_uid
= vap
->va_uid
;
764 get
->va_gid
= vap
->va_gid
;
766 get
->va_atime
= vap
->va_atime
;
768 get
->va_mtime
= vap
->va_mtime
;
770 if (mask
& (AT_UID
| AT_GID
| AT_MODE
| AT_MTIME
))
771 gethrestime(&tp
->tn_ctime
);
773 if (mask
& AT_SIZE
) {
774 ASSERT(vp
->v_type
!= VDIR
);
776 /* Don't support large files. */
777 if (vap
->va_size
> MAXOFF_T
) {
781 mutex_exit(&tp
->tn_tlock
);
783 rw_enter(&tp
->tn_rwlock
, RW_WRITER
);
784 rw_enter(&tp
->tn_contents
, RW_WRITER
);
785 error
= tmpnode_trunc(tm
, tp
, (ulong_t
)vap
->va_size
);
786 rw_exit(&tp
->tn_contents
);
787 rw_exit(&tp
->tn_rwlock
);
789 if (error
== 0 && vap
->va_size
== 0)
790 vnevent_truncate(vp
, ct
);
795 mutex_exit(&tp
->tn_tlock
);
807 caller_context_t
*ct
)
809 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
812 mutex_enter(&tp
->tn_tlock
);
813 error
= tmp_taccess(tp
, mode
, cred
);
814 mutex_exit(&tp
->tn_tlock
);
824 struct pathname
*pnp
,
828 caller_context_t
*ct
,
832 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(dvp
);
833 struct tmpnode
*ntp
= NULL
;
837 /* allow cd into @ dir */
838 if (flags
& LOOKUP_XATTR
) {
843 * don't allow attributes if not mounted XATTR support
845 if (!(dvp
->v_vfsp
->vfs_flag
& VFS_XATTR
))
848 if (tp
->tn_flags
& ISXATTR
)
849 /* No attributes on attributes */
852 rw_enter(&tp
->tn_rwlock
, RW_WRITER
);
853 if (tp
->tn_xattrdp
== NULL
) {
854 if (!(flags
& CREATE_XATTR_DIR
)) {
855 rw_exit(&tp
->tn_rwlock
);
860 * No attribute directory exists for this
861 * node - create the attr dir as a side effect
866 * Make sure we have adequate permission...
869 if ((error
= tmp_taccess(tp
, VWRITE
, cred
)) != 0) {
870 rw_exit(&tp
->tn_rwlock
);
874 xdp
= tmp_memalloc(sizeof (struct tmpnode
),
877 tmpnode_init(tm
, xdp
, &tp
->tn_attr
, NULL
);
879 * Fix-up fields unique to attribute directories.
881 xdp
->tn_flags
= ISXATTR
;
883 if (tp
->tn_type
== VDIR
) {
884 xdp
->tn_mode
= tp
->tn_attr
.va_mode
;
887 if (tp
->tn_attr
.va_mode
& 0040)
888 xdp
->tn_mode
|= 0750;
889 if (tp
->tn_attr
.va_mode
& 0004)
890 xdp
->tn_mode
|= 0705;
892 xdp
->tn_vnode
->v_type
= VDIR
;
893 xdp
->tn_vnode
->v_flag
|= V_XATTRDIR
;
895 tp
->tn_xattrdp
= xdp
;
897 VN_HOLD(tp
->tn_xattrdp
->tn_vnode
);
899 *vpp
= TNTOV(tp
->tn_xattrdp
);
900 rw_exit(&tp
->tn_rwlock
);
905 * Null component name is a synonym for directory being searched.
914 error
= tdirlookup(tp
, nm
, &ntp
, cred
);
920 * If vnode is a device return special vnode instead
922 if (IS_DEVVP(*vpp
)) {
925 newvp
= specvp(*vpp
, (*vpp
)->v_rdev
, (*vpp
)->v_type
,
931 TRACE_4(TR_FAC_TMPFS
, TR_TMPFS_LOOKUP
,
932 "tmpfs lookup:vp %p name %s vpp %p error %d",
933 dvp
, nm
, vpp
, error
);
943 enum vcexcl exclusive
,
948 caller_context_t
*ct
,
951 struct tmpnode
*parent
;
953 struct tmpnode
*self
;
955 struct tmpnode
*oldtp
;
958 parent
= (struct tmpnode
*)VTOTN(dvp
);
959 tm
= (struct tmount
*)VTOTM(dvp
);
964 /* device files not allowed in ext. attr dirs */
965 if ((parent
->tn_flags
& ISXATTR
) &&
966 (vap
->va_type
== VBLK
|| vap
->va_type
== VCHR
||
967 vap
->va_type
== VFIFO
|| vap
->va_type
== VDOOR
||
968 vap
->va_type
== VSOCK
|| vap
->va_type
== VPORT
))
971 if (vap
->va_type
== VREG
&& (vap
->va_mode
& VSVTX
)) {
972 /* Must be privileged to set sticky bit */
973 if (secpolicy_vnode_stky_modify(cred
))
974 vap
->va_mode
&= ~VSVTX
;
975 } else if (vap
->va_type
== VNON
) {
980 * Null component name is a synonym for directory being searched.
986 error
= tdirlookup(parent
, nm
, &oldtp
, cred
);
989 if (error
== 0) { /* name found */
990 boolean_t trunc
= B_FALSE
;
994 rw_enter(&oldtp
->tn_rwlock
, RW_WRITER
);
997 * if create/read-only an existing
998 * directory, allow it
1000 if (exclusive
== EXCL
)
1002 else if ((oldtp
->tn_type
== VDIR
) && (mode
& VWRITE
))
1005 error
= tmp_taccess(oldtp
, mode
, cred
);
1009 rw_exit(&oldtp
->tn_rwlock
);
1010 tmpnode_rele(oldtp
);
1013 *vpp
= TNTOV(oldtp
);
1014 if ((*vpp
)->v_type
== VREG
&& (vap
->va_mask
& AT_SIZE
) &&
1015 vap
->va_size
== 0) {
1016 rw_enter(&oldtp
->tn_contents
, RW_WRITER
);
1017 (void) tmpnode_trunc(tm
, oldtp
, 0);
1018 rw_exit(&oldtp
->tn_contents
);
1021 rw_exit(&oldtp
->tn_rwlock
);
1022 if (IS_DEVVP(*vpp
)) {
1023 struct vnode
*newvp
;
1025 newvp
= specvp(*vpp
, (*vpp
)->v_rdev
, (*vpp
)->v_type
,
1028 if (newvp
== NULL
) {
1035 vnevent_create(*vpp
, ct
);
1040 if (error
!= ENOENT
)
1043 rw_enter(&parent
->tn_rwlock
, RW_WRITER
);
1044 error
= tdirenter(tm
, parent
, nm
, DE_CREATE
,
1045 (struct tmpnode
*)NULL
, (struct tmpnode
*)NULL
,
1046 vap
, &self
, cred
, ct
);
1047 rw_exit(&parent
->tn_rwlock
);
1053 if (error
== EEXIST
) {
1055 * This means that the file was created sometime
1056 * after we checked and did not find it and when
1057 * we went to create it.
1058 * Since creat() is supposed to truncate a file
1059 * that already exits go back to the begining
1060 * of the function. This time we will find it
1061 * and go down the tmp_trunc() path
1070 if (!error
&& IS_DEVVP(*vpp
)) {
1071 struct vnode
*newvp
;
1073 newvp
= specvp(*vpp
, (*vpp
)->v_rdev
, (*vpp
)->v_type
, cred
);
1079 TRACE_3(TR_FAC_TMPFS
, TR_TMPFS_CREATE
,
1080 "tmpfs create:dvp %p nm %s vpp %p", dvp
, nm
, vpp
);
1090 caller_context_t
*ct
,
1093 struct tmpnode
*parent
= (struct tmpnode
*)VTOTN(dvp
);
1095 struct tmpnode
*tp
= NULL
;
1097 error
= tdirlookup(parent
, nm
, &tp
, cred
);
1102 rw_enter(&parent
->tn_rwlock
, RW_WRITER
);
1103 rw_enter(&tp
->tn_rwlock
, RW_WRITER
);
1105 if (tp
->tn_type
!= VDIR
||
1106 (error
= secpolicy_fs_linkdir(cred
, dvp
->v_vfsp
)) == 0)
1107 error
= tdirdelete(parent
, tp
, nm
, DR_REMOVE
, cred
);
1109 rw_exit(&tp
->tn_rwlock
);
1110 rw_exit(&parent
->tn_rwlock
);
1111 vnevent_remove(TNTOV(tp
), dvp
, nm
, ct
);
1114 TRACE_3(TR_FAC_TMPFS
, TR_TMPFS_REMOVE
,
1115 "tmpfs remove:dvp %p nm %s error %d", dvp
, nm
, error
);
1123 struct vnode
*srcvp
,
1126 caller_context_t
*ct
,
1129 struct tmpnode
*parent
;
1130 struct tmpnode
*from
;
1131 struct tmount
*tm
= (struct tmount
*)VTOTM(dvp
);
1133 struct tmpnode
*found
= NULL
;
1134 struct vnode
*realvp
;
1136 if (VOP_REALVP(srcvp
, &realvp
, ct
) == 0)
1139 parent
= (struct tmpnode
*)VTOTN(dvp
);
1140 from
= (struct tmpnode
*)VTOTN(srcvp
);
1142 if ((srcvp
->v_type
== VDIR
&&
1143 secpolicy_fs_linkdir(cred
, dvp
->v_vfsp
)) ||
1144 (from
->tn_uid
!= crgetuid(cred
) && secpolicy_basic_link(cred
)))
1148 * Make sure link for extended attributes is valid
1149 * We only support hard linking of xattr's in xattrdir to an xattrdir
1151 if ((from
->tn_flags
& ISXATTR
) != (parent
->tn_flags
& ISXATTR
))
1154 error
= tdirlookup(parent
, tnm
, &found
, cred
);
1157 tmpnode_rele(found
);
1161 if (error
!= ENOENT
)
1164 rw_enter(&parent
->tn_rwlock
, RW_WRITER
);
1165 error
= tdirenter(tm
, parent
, tnm
, DE_LINK
, (struct tmpnode
*)NULL
,
1166 from
, NULL
, (struct tmpnode
**)NULL
, cred
, ct
);
1167 rw_exit(&parent
->tn_rwlock
);
1169 vnevent_link(srcvp
, ct
);
1177 struct vnode
*odvp
, /* source parent vnode */
1178 char *onm
, /* source name */
1179 struct vnode
*ndvp
, /* destination parent vnode */
1180 char *nnm
, /* destination name */
1182 caller_context_t
*ct
,
1185 struct tmpnode
*fromparent
;
1186 struct tmpnode
*toparent
;
1187 struct tmpnode
*fromtp
= NULL
; /* source tmpnode */
1188 struct tmount
*tm
= (struct tmount
*)VTOTM(odvp
);
1190 int samedir
= 0; /* set if odvp == ndvp */
1191 struct vnode
*realvp
;
1193 if (VOP_REALVP(ndvp
, &realvp
, ct
) == 0)
1196 fromparent
= (struct tmpnode
*)VTOTN(odvp
);
1197 toparent
= (struct tmpnode
*)VTOTN(ndvp
);
1199 if ((fromparent
->tn_flags
& ISXATTR
) != (toparent
->tn_flags
& ISXATTR
))
1202 mutex_enter(&tm
->tm_renamelck
);
1205 * Look up tmpnode of file we're supposed to rename.
1207 error
= tdirlookup(fromparent
, onm
, &fromtp
, cred
);
1209 mutex_exit(&tm
->tm_renamelck
);
1214 * Make sure we can delete the old (source) entry. This
1215 * requires write permission on the containing directory. If
1216 * that directory is "sticky" it requires further checks.
1218 if (((error
= tmp_taccess(fromparent
, VWRITE
, cred
)) != 0) ||
1219 (error
= tmp_sticky_remove_access(fromparent
, fromtp
, cred
)) != 0)
1223 * Check for renaming to or from '.' or '..' or that
1224 * fromtp == fromparent
1226 if ((onm
[0] == '.' &&
1227 (onm
[1] == '\0' || (onm
[1] == '.' && onm
[2] == '\0'))) ||
1229 (nnm
[1] == '\0' || (nnm
[1] == '.' && nnm
[2] == '\0'))) ||
1230 (fromparent
== fromtp
)) {
1235 samedir
= (fromparent
== toparent
);
1237 * Make sure we can search and rename into the new
1238 * (destination) directory.
1241 error
= tmp_taccess(toparent
, VEXEC
|VWRITE
, cred
);
1247 * Link source to new target
1249 rw_enter(&toparent
->tn_rwlock
, RW_WRITER
);
1250 error
= tdirenter(tm
, toparent
, nnm
, DE_RENAME
,
1251 fromparent
, fromtp
, (struct vattr
*)NULL
,
1252 (struct tmpnode
**)NULL
, cred
, ct
);
1253 rw_exit(&toparent
->tn_rwlock
);
1257 * ESAME isn't really an error; it indicates that the
1258 * operation should not be done because the source and target
1259 * are the same file, but that no error should be reported.
1265 vnevent_rename_src(TNTOV(fromtp
), odvp
, onm
, ct
);
1268 * Notify the target directory if not same as
1272 vnevent_rename_dest_dir(ndvp
, ct
);
1276 * Unlink from source.
1278 rw_enter(&fromparent
->tn_rwlock
, RW_WRITER
);
1279 rw_enter(&fromtp
->tn_rwlock
, RW_WRITER
);
1281 error
= tdirdelete(fromparent
, fromtp
, onm
, DR_RENAME
, cred
);
1284 * The following handles the case where our source tmpnode was
1285 * removed before we got to it.
1287 * XXX We should also cleanup properly in the case where tdirdelete
1288 * fails for some other reason. Currently this case shouldn't happen.
1291 if (error
== ENOENT
)
1294 rw_exit(&fromtp
->tn_rwlock
);
1295 rw_exit(&fromparent
->tn_rwlock
);
1297 tmpnode_rele(fromtp
);
1298 mutex_exit(&tm
->tm_renamelck
);
1300 TRACE_5(TR_FAC_TMPFS
, TR_TMPFS_RENAME
,
1301 "tmpfs rename:ovp %p onm %s nvp %p nnm %s error %d", odvp
, onm
,
1314 caller_context_t
*ct
,
1318 struct tmpnode
*parent
= (struct tmpnode
*)VTOTN(dvp
);
1319 struct tmpnode
*self
= NULL
;
1320 struct tmount
*tm
= (struct tmount
*)VTOTM(dvp
);
1323 /* no new dirs allowed in xattr dirs */
1324 if (parent
->tn_flags
& ISXATTR
)
1328 * Might be dangling directory. Catch it here,
1329 * because a ENOENT return from tdirlookup() is
1332 if (parent
->tn_nlink
== 0)
1335 error
= tdirlookup(parent
, nm
, &self
, cred
);
1341 if (error
!= ENOENT
)
1344 rw_enter(&parent
->tn_rwlock
, RW_WRITER
);
1345 error
= tdirenter(tm
, parent
, nm
, DE_MKDIR
, (struct tmpnode
*)NULL
,
1346 (struct tmpnode
*)NULL
, va
, &self
, cred
, ct
);
1348 rw_exit(&parent
->tn_rwlock
);
1353 rw_exit(&parent
->tn_rwlock
);
1365 caller_context_t
*ct
,
1368 struct tmpnode
*parent
= (struct tmpnode
*)VTOTN(dvp
);
1369 struct tmpnode
*self
= NULL
;
1374 * Return error when removing . and ..
1376 if (strcmp(nm
, ".") == 0)
1378 if (strcmp(nm
, "..") == 0)
1379 return (EEXIST
); /* Should be ENOTEMPTY */
1380 error
= tdirlookup(parent
, nm
, &self
, cred
);
1384 rw_enter(&parent
->tn_rwlock
, RW_WRITER
);
1385 rw_enter(&self
->tn_rwlock
, RW_WRITER
);
1388 if (vp
== dvp
|| vp
== cdir
) {
1392 if (self
->tn_type
!= VDIR
) {
1397 mutex_enter(&self
->tn_tlock
);
1398 if (self
->tn_nlink
> 2) {
1399 mutex_exit(&self
->tn_tlock
);
1403 mutex_exit(&self
->tn_tlock
);
1405 if (vn_vfswlock(vp
)) {
1409 if (vn_mountedvfs(vp
) != NULL
) {
1415 * Check for an empty directory
1416 * i.e. only includes entries for "." and ".."
1418 if (self
->tn_dirents
> 2) {
1419 error
= EEXIST
; /* SIGH should be ENOTEMPTY */
1421 * Update atime because checking tn_dirents is logically
1422 * equivalent to reading the directory
1424 gethrestime(&self
->tn_atime
);
1428 error
= tdirdelete(parent
, self
, nm
, DR_RMDIR
, cred
);
1432 rw_exit(&self
->tn_rwlock
);
1433 rw_exit(&parent
->tn_rwlock
);
1434 vnevent_rmdir(TNTOV(self
), dvp
, nm
, ct
);
1447 caller_context_t
*ct
,
1450 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
1451 struct tdirent
*tdp
;
1454 struct dirent64
*dp
;
1456 ulong_t total_bytes_wanted
;
1462 if (uiop
->uio_loffset
>= MAXOFF_T
) {
1468 * assuming system call has already called tmp_rwlock
1470 ASSERT(RW_READ_HELD(&tp
->tn_rwlock
));
1472 if (uiop
->uio_iovcnt
!= 1)
1475 if (vp
->v_type
!= VDIR
)
1479 * There's a window here where someone could have removed
1480 * all the entries in the directory after we put a hold on the
1481 * vnode but before we grabbed the rwlock. Just return.
1483 if (tp
->tn_dir
== NULL
) {
1485 panic("empty directory 0x%p", (void *)tp
);
1492 * Get space for multiple directory entries
1494 total_bytes_wanted
= uiop
->uio_iov
->iov_len
;
1495 bufsize
= total_bytes_wanted
+ sizeof (struct dirent64
);
1496 outbuf
= kmem_alloc(bufsize
, KM_SLEEP
);
1498 dp
= (struct dirent64
*)outbuf
;
1504 namelen
= strlen(tdp
->td_name
); /* no +1 needed */
1505 offset
= tdp
->td_offset
;
1506 if (offset
>= uiop
->uio_offset
) {
1507 reclen
= (int)DIRENT64_RECLEN(namelen
);
1508 if (outcount
+ reclen
> total_bytes_wanted
) {
1511 * Buffer too small for any entries.
1516 ASSERT(tdp
->td_tmpnode
!= NULL
);
1518 /* use strncpy(9f) to zero out uninitialized bytes */
1520 (void) strncpy(dp
->d_name
, tdp
->td_name
,
1521 DIRENT64_NAMELEN(reclen
));
1522 dp
->d_reclen
= (ushort_t
)reclen
;
1523 dp
->d_ino
= (ino64_t
)tdp
->td_tmpnode
->tn_nodeid
;
1524 dp
->d_off
= (offset_t
)tdp
->td_offset
+ 1;
1525 dp
= (struct dirent64
*)
1526 ((uintptr_t)dp
+ dp
->d_reclen
);
1528 ASSERT(outcount
<= bufsize
);
1534 error
= uiomove(outbuf
, outcount
, UIO_READ
, uiop
);
1537 /* If we reached the end of the list our offset */
1538 /* should now be just past the end. */
1545 uiop
->uio_offset
= offset
;
1547 gethrestime(&tp
->tn_atime
);
1548 kmem_free(outbuf
, bufsize
);
1560 caller_context_t
*ct
,
1563 struct tmpnode
*parent
= (struct tmpnode
*)VTOTN(dvp
);
1564 struct tmpnode
*self
= (struct tmpnode
*)NULL
;
1565 struct tmount
*tm
= (struct tmount
*)VTOTM(dvp
);
1570 /* no symlinks allowed to files in xattr dirs */
1571 if (parent
->tn_flags
& ISXATTR
)
1574 error
= tdirlookup(parent
, lnm
, &self
, cred
);
1577 * The entry already exists
1580 return (EEXIST
); /* was 0 */
1583 if (error
!= ENOENT
) {
1589 rw_enter(&parent
->tn_rwlock
, RW_WRITER
);
1590 error
= tdirenter(tm
, parent
, lnm
, DE_CREATE
, (struct tmpnode
*)NULL
,
1591 (struct tmpnode
*)NULL
, tva
, &self
, cred
, ct
);
1592 rw_exit(&parent
->tn_rwlock
);
1599 len
= strlen(tnm
) + 1;
1600 cp
= tmp_memalloc(len
, 0);
1605 (void) strcpy(cp
, tnm
);
1607 self
->tn_symlink
= cp
;
1608 self
->tn_size
= len
- 1;
1619 caller_context_t
*ct
)
1621 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
1624 if (vp
->v_type
!= VLNK
)
1627 rw_enter(&tp
->tn_rwlock
, RW_READER
);
1628 rw_enter(&tp
->tn_contents
, RW_READER
);
1629 error
= uiomove(tp
->tn_symlink
, tp
->tn_size
, UIO_READ
, uiop
);
1630 gethrestime(&tp
->tn_atime
);
1631 rw_exit(&tp
->tn_contents
);
1632 rw_exit(&tp
->tn_rwlock
);
1642 caller_context_t
*ct
)
1649 tmp_inactive(struct vnode
*vp
, struct cred
*cred
, caller_context_t
*ct
)
1651 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
1652 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vp
->v_vfsp
);
1654 rw_enter(&tp
->tn_rwlock
, RW_WRITER
);
1656 mutex_enter(&tp
->tn_tlock
);
1657 mutex_enter(&vp
->v_lock
);
1658 ASSERT(vp
->v_count
>= 1);
1661 * If we don't have the last hold or the link count is non-zero,
1662 * there's little to do -- just drop our hold.
1664 if (vp
->v_count
> 1 || tp
->tn_nlink
!= 0) {
1666 mutex_exit(&vp
->v_lock
);
1667 mutex_exit(&tp
->tn_tlock
);
1668 rw_exit(&tp
->tn_rwlock
);
1673 * We have the last hold *and* the link count is zero, so this
1674 * tmpnode is dead from the filesystem's viewpoint. However,
1675 * if the tmpnode has any pages associated with it (i.e. if it's
1676 * a normal file with non-zero size), the tmpnode can still be
1677 * discovered by pageout or fsflush via the page vnode pointers.
1678 * In this case we must drop all our locks, truncate the tmpnode,
1679 * and try the whole dance again.
1681 if (tp
->tn_size
!= 0) {
1682 if (tp
->tn_type
== VREG
) {
1683 mutex_exit(&vp
->v_lock
);
1684 mutex_exit(&tp
->tn_tlock
);
1685 rw_enter(&tp
->tn_contents
, RW_WRITER
);
1686 (void) tmpnode_trunc(tm
, tp
, 0);
1687 rw_exit(&tp
->tn_contents
);
1688 ASSERT(tp
->tn_size
== 0);
1689 ASSERT(tp
->tn_nblocks
== 0);
1692 if (tp
->tn_type
== VLNK
)
1693 tmp_memfree(tp
->tn_symlink
, tp
->tn_size
+ 1);
1697 * Remove normal file/dir's xattr dir and xattrs.
1699 if (tp
->tn_xattrdp
) {
1700 struct tmpnode
*xtp
= tp
->tn_xattrdp
;
1702 ASSERT(xtp
->tn_flags
& ISXATTR
);
1704 rw_enter(&xtp
->tn_rwlock
, RW_WRITER
);
1706 DECR_COUNT(&xtp
->tn_nlink
, &xtp
->tn_tlock
);
1707 tp
->tn_xattrdp
= NULL
;
1708 rw_exit(&xtp
->tn_rwlock
);
1712 mutex_exit(&vp
->v_lock
);
1713 mutex_exit(&tp
->tn_tlock
);
1714 /* Here's our chance to send invalid event while we're between locks */
1715 vn_invalid(TNTOV(tp
));
1716 mutex_enter(&tm
->tm_contents
);
1717 if (tp
->tn_forw
== NULL
)
1718 tm
->tm_rootnode
->tn_back
= tp
->tn_back
;
1720 tp
->tn_forw
->tn_back
= tp
->tn_back
;
1721 tp
->tn_back
->tn_forw
= tp
->tn_forw
;
1722 mutex_exit(&tm
->tm_contents
);
1723 rw_exit(&tp
->tn_rwlock
);
1724 rw_destroy(&tp
->tn_rwlock
);
1725 mutex_destroy(&tp
->tn_tlock
);
1727 tmp_memfree(tp
, sizeof (struct tmpnode
));
1732 tmp_fid(struct vnode
*vp
, struct fid
*fidp
, caller_context_t
*ct
)
1734 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
1737 if (fidp
->fid_len
< (sizeof (struct tfid
) - sizeof (ushort_t
))) {
1738 fidp
->fid_len
= sizeof (struct tfid
) - sizeof (ushort_t
);
1742 tfid
= (struct tfid
*)fidp
;
1743 bzero(tfid
, sizeof (struct tfid
));
1744 tfid
->tfid_len
= (int)sizeof (struct tfid
) - sizeof (ushort_t
);
1746 tfid
->tfid_ino
= tp
->tn_nodeid
;
1747 tfid
->tfid_gen
= tp
->tn_gen
;
1754 * Return all the pages from [off..off+len] in given file
1769 caller_context_t
*ct
)
1772 struct tmpnode
*tp
= VTOTN(vp
);
1773 anoff_t toff
= (anoff_t
)off
;
1778 rw_enter(&tp
->tn_contents
, RW_READER
);
1780 if (off
+ len
> tp
->tn_size
+ PAGEOFFSET
) {
1785 * Look for holes (no anon slot) in faulting range. If there are
1786 * holes we have to switch to a write lock and fill them in. Swap
1787 * space for holes was already reserved when the file was grown.
1790 if (non_anon(tp
->tn_anon
, btop(off
), &tmpoff
, &tlen
)) {
1791 if (!rw_tryupgrade(&tp
->tn_contents
)) {
1792 rw_exit(&tp
->tn_contents
);
1793 rw_enter(&tp
->tn_contents
, RW_WRITER
);
1794 /* Size may have changed when lock was dropped */
1795 if (off
+ len
> tp
->tn_size
+ PAGEOFFSET
) {
1800 for (toff
= (anoff_t
)off
; toff
< (anoff_t
)off
+ len
;
1802 if (anon_get_ptr(tp
->tn_anon
, btop(toff
)) == NULL
) {
1803 /* XXX - may allocate mem w. write lock held */
1804 (void) anon_set_ptr(tp
->tn_anon
, btop(toff
),
1805 anon_alloc(vp
, toff
), ANON_SLEEP
);
1809 rw_downgrade(&tp
->tn_contents
);
1813 err
= pvn_getpages(tmp_getapage
, vp
, (u_offset_t
)off
, len
, protp
,
1814 pl
, plsz
, seg
, addr
, rw
, cr
);
1822 rw_exit(&tp
->tn_contents
);
1827 * Called from pvn_getpages to get a particular page.
1852 if (pp
= page_lookup(vp
, off
, rw
== S_CREATE
? SE_EXCL
: SE_SHARED
)) {
1860 pp
= page_create_va(vp
, off
, PAGESIZE
,
1861 PG_WAIT
| PG_EXCL
, seg
, addr
);
1863 * Someone raced in and created the page after we did the
1864 * lookup but before we did the create, so go back and
1865 * try to look it up again.
1870 * Fill page from backing store, if any. If none, then
1871 * either this is a newly filled hole or page must have
1872 * been unmodified and freed so just zero it out.
1874 err
= swap_getphysname(vp
, off
, &pvp
, &poff
);
1876 panic("tmp_getapage: no anon slot vp %p "
1877 "off %llx pp %p\n", (void *)vp
, off
, (void *)pp
);
1880 flags
= (pl
== NULL
? B_ASYNC
|B_READ
: B_READ
);
1881 err
= VOP_PAGEIO(pvp
, pp
, (u_offset_t
)poff
, PAGESIZE
,
1883 if (flags
& B_ASYNC
)
1885 } else if (rw
!= S_CREATE
) {
1886 pagezero(pp
, 0, PAGESIZE
);
1889 pvn_read_done(pp
, B_ERROR
);
1892 pvn_plist_init(pp
, pl
, plsz
, off
, PAGESIZE
, rw
);
1902 * Flags are composed of {B_INVAL, B_DIRTY B_FREE, B_DONTNEED}.
1903 * If len == 0, do from off to EOF.
1905 static int tmp_nopage
= 0; /* Don't do tmp_putpage's if set */
1910 register struct vnode
*vp
,
1915 caller_context_t
*ct
)
1917 register page_t
*pp
;
1921 struct tmpnode
*tp
= VTOTN(vp
);
1927 ASSERT(vp
->v_count
!= 0);
1929 if (vp
->v_flag
& VNOMAP
)
1933 * This being tmpfs, we don't ever do i/o unless we really
1934 * have to (when we're low on memory and pageout calls us
1935 * with B_ASYNC | B_FREE or the user explicitly asks for it with
1937 * XXX to approximately track the mod time like ufs we should
1938 * update the times here. The problem is, once someone does a
1939 * store we never clear the mod bit and do i/o, thus fsflush
1940 * will keep calling us every 30 seconds to do the i/o and we'll
1941 * continually update the mod time. At least we update the mod
1942 * time on the first store because this results in a call to getpage.
1944 if (flags
!= (B_ASYNC
| B_FREE
) && (flags
& B_INVAL
) == 0 &&
1945 (flags
& B_DONTNEED
) == 0)
1948 * If this thread owns the lock, i.e., this thread grabbed it
1949 * as writer somewhere above, then we don't need to grab the
1950 * lock as reader in this routine.
1952 dolock
= (rw_owner(&tp
->tn_contents
) != curthread
);
1955 * If this is pageout don't block on the lock as you could deadlock
1956 * when freemem == 0 (another thread has the read lock and is blocked
1957 * creating a page, and a third thread is waiting to get the writers
1958 * lock - waiting writers priority blocks us from getting the read
1959 * lock). Of course, if the only freeable pages are on this tmpnode
1960 * we're hosed anyways. A better solution might be a new lock type.
1961 * Note: ufs has the same problem.
1963 if (curproc
== proc_pageout
) {
1964 if (!rw_tryenter(&tp
->tn_contents
, RW_READER
))
1967 rw_enter(&tp
->tn_contents
, RW_READER
);
1969 if (!vn_has_cached_data(vp
))
1973 if (curproc
== proc_pageout
) {
1974 panic("tmp: pageout can't block");
1978 /* Search the entire vp list for pages >= off. */
1979 err
= pvn_vplist_dirty(vp
, (u_offset_t
)off
, tmp_putapage
,
1985 * Loop over all offsets in the range [off...off + len]
1986 * looking for pages to deal with.
1988 eoff
= MIN(off
+ len
, tp
->tn_size
);
1989 for (io_off
= off
; io_off
< eoff
; io_off
+= io_len
) {
1991 * If we are not invalidating, synchronously
1992 * freeing or writing pages use the routine
1993 * page_lookup_nowait() to prevent reclaiming
1994 * them from the free list.
1996 if ((flags
& B_INVAL
) || ((flags
& B_ASYNC
) == 0)) {
1997 pp
= page_lookup(vp
, io_off
,
1998 (flags
& (B_INVAL
| B_FREE
)) ?
1999 SE_EXCL
: SE_SHARED
);
2001 pp
= page_lookup_nowait(vp
, io_off
,
2002 (flags
& B_FREE
) ? SE_EXCL
: SE_SHARED
);
2005 if (pp
== NULL
|| pvn_getdirty(pp
, flags
) == 0)
2008 err
= tmp_putapage(vp
, pp
, &io_off
, &io_len
,
2015 /* If invalidating, verify all pages on vnode list are gone. */
2016 if (err
== 0 && off
== 0 && len
== 0 &&
2017 (flags
& B_INVAL
) && vn_has_cached_data(vp
)) {
2018 panic("tmp_putpage: B_INVAL, pages not gone");
2022 if ((curproc
== proc_pageout
) || dolock
)
2023 rw_exit(&tp
->tn_contents
);
2025 * Only reason putapage is going to give us SE_NOSWAP as error
2026 * is when we ask a page to be written to physical backing store
2027 * and there is none. Ignore this because we might be dealing
2028 * with a swap page which does not have any backing store
2029 * on disk. In any other case we won't get this error over here.
2031 if (err
== SE_NOSWAP
)
/* Statistics: successful putapage calls and total pages pushed to swap. */
long tmp_putpagecnt, tmp_pagespushed;
2039 * Write out a single page.
2040 * For tmpfs this means choose a physical swap slot and write the page
2041 * out using VOP_PAGEIO. For performance, we attempt to kluster; i.e.,
2042 * we try to find a bunch of other dirty pages adjacent in the file
2043 * and a bunch of contiguous swap slots, and then write all the pages
2044 * out in a single i/o.
2057 ulong_t klstart
, kllen
;
2058 page_t
*pplist
, *npplist
;
2059 extern int klustsize
;
2062 size_t pp_off
, pp_len
;
2070 ASSERT(PAGE_LOCKED(pp
));
2072 /* Kluster in tmp_klustsize chunks */
2074 tmp_klustsize
= klustsize
;
2075 offset
= pp
->p_offset
;
2076 klstart
= (offset
/ tmp_klustsize
) * tmp_klustsize
;
2077 kllen
= MIN(tmp_klustsize
, tp
->tn_size
- klstart
);
2079 /* Get a kluster of pages */
2081 pvn_write_kluster(vp
, pp
, &tmpoff
, &pp_len
, klstart
, kllen
, flags
);
2083 pp_off
= (size_t)tmpoff
;
2086 * Get a cluster of physical offsets for the pages; the amount we
2087 * get may be some subrange of what we ask for (io_off, io_len).
2091 err
= swap_newphysname(vp
, offset
, &io_off
, &io_len
, &pvp
, &pstart
);
2092 ASSERT(err
!= SE_NOANON
); /* anon slot must have been filled */
2094 pvn_write_done(pplist
, B_ERROR
| B_WRITE
| flags
);
2096 * If this routine is called as a result of segvn_sync
2097 * operation and we have no physical swap then we can get an
2098 * error here. In such case we would return SE_NOSWAP as error.
2099 * At this point, we expect only SE_NOSWAP.
2101 ASSERT(err
== SE_NOSWAP
);
2102 if (flags
& B_INVAL
)
2106 ASSERT(pp_off
<= io_off
&& io_off
+ io_len
<= pp_off
+ pp_len
);
2107 ASSERT(io_off
<= offset
&& offset
< io_off
+ io_len
);
2109 /* Toss pages at front/rear that we couldn't get physical backing for */
2110 if (io_off
!= pp_off
) {
2112 page_list_break(&pplist
, &npplist
, btop(io_off
- pp_off
));
2113 ASSERT(pplist
->p_offset
== pp_off
);
2114 ASSERT(pplist
->p_prev
->p_offset
== io_off
- PAGESIZE
);
2115 pvn_write_done(pplist
, B_ERROR
| B_WRITE
| flags
);
2118 if (io_off
+ io_len
< pp_off
+ pp_len
) {
2120 page_list_break(&pplist
, &npplist
, btop(io_len
));
2121 ASSERT(npplist
->p_offset
== io_off
+ io_len
);
2122 ASSERT(npplist
->p_prev
->p_offset
== pp_off
+ pp_len
- PAGESIZE
);
2123 pvn_write_done(npplist
, B_ERROR
| B_WRITE
| flags
);
2126 ASSERT(pplist
->p_offset
== io_off
);
2127 ASSERT(pplist
->p_prev
->p_offset
== io_off
+ io_len
- PAGESIZE
);
2128 ASSERT(btopr(io_len
) <= btopr(kllen
));
2130 /* Do i/o on the remaining kluster */
2131 err
= VOP_PAGEIO(pvp
, pplist
, (u_offset_t
)pstart
, io_len
,
2132 B_WRITE
| flags
, cr
, NULL
);
2134 if ((flags
& B_ASYNC
) == 0) {
2135 pvn_write_done(pplist
, ((err
) ? B_ERROR
: 0) | B_WRITE
| flags
);
2144 tmp_pagespushed
+= btop(io_len
);
2146 if (err
&& err
!= ENOMEM
&& err
!= SE_NOSWAP
)
2147 cmn_err(CE_WARN
, "tmp_putapage: err %d\n", err
);
2163 caller_context_t
*ct
)
2165 struct segvn_crargs vn_a
;
2166 struct tmpnode
*tp
= (struct tmpnode
*)VTOTN(vp
);
2174 if (vp
->v_flag
& VNOMAP
)
2177 if (off
< 0 || (offset_t
)(off
+ len
) < 0 ||
2178 off
> MAXOFF_T
|| (off
+ len
) > MAXOFF_T
)
2181 if (vp
->v_type
!= VREG
)
2185 * Don't allow mapping to locked file
2187 if (vn_has_mandatory_locks(vp
, tp
->tn_mode
)) {
2192 error
= choose_addr(as
, addrp
, len
, off
, ADDR_VACALIGN
, flags
);
2199 vn_a
.offset
= (u_offset_t
)off
;
2200 vn_a
.type
= flags
& MAP_TYPE
;
2202 vn_a
.maxprot
= maxprot
;
2203 vn_a
.flags
= flags
& ~MAP_TYPE
;
2207 vn_a
.lgrp_mem_policy_flags
= 0;
2209 error
= as_map(as
, *addrp
, len
, segvn_create
, &vn_a
);
2215 * tmp_addmap and tmp_delmap can't be called since the vp
2216 * maintained in the segvn mapping is NULL.
2230 caller_context_t
*ct
)
2247 caller_context_t
*ct
)
2253 tmp_freesp(struct vnode
*vp
, struct flock64
*lp
, int flag
)
2256 register struct tmpnode
*tp
= VTOTN(vp
);
2259 ASSERT(vp
->v_type
== VREG
);
2260 ASSERT(lp
->l_start
>= 0);
2265 rw_enter(&tp
->tn_rwlock
, RW_WRITER
);
2266 if (tp
->tn_size
== lp
->l_start
) {
2267 rw_exit(&tp
->tn_rwlock
);
2272 * Check for any mandatory locks on the range
2274 if (MANDLOCK(vp
, tp
->tn_mode
)) {
2277 save_start
= lp
->l_start
;
2279 if (tp
->tn_size
< lp
->l_start
) {
2281 * "Truncate up" case: need to make sure there
2282 * is no lock beyond current end-of-file. To
2283 * do so, we need to set l_start to the size
2284 * of the file temporarily.
2286 lp
->l_start
= tp
->tn_size
;
2288 lp
->l_type
= F_WRLCK
;
2290 lp
->l_pid
= ttoproc(curthread
)->p_pid
;
2291 i
= (flag
& (FNDELAY
|FNONBLOCK
)) ? 0 : SLPFLCK
;
2292 if ((i
= reclock(vp
, lp
, i
, 0, lp
->l_start
, NULL
)) != 0 ||
2293 lp
->l_type
!= F_UNLCK
) {
2294 rw_exit(&tp
->tn_rwlock
);
2295 return (i
? i
: EAGAIN
);
2298 lp
->l_start
= save_start
;
2300 VFSTOTM(vp
->v_vfsp
);
2302 rw_enter(&tp
->tn_contents
, RW_WRITER
);
2303 error
= tmpnode_trunc((struct tmount
*)VFSTOTM(vp
->v_vfsp
),
2304 tp
, (ulong_t
)lp
->l_start
);
2305 rw_exit(&tp
->tn_contents
);
2306 rw_exit(&tp
->tn_rwlock
);
2315 struct flock64
*bfp
,
2319 caller_context_t
*ct
)
2323 if (cmd
!= F_FREESP
)
2325 if ((error
= convoff(vp
, bfp
, 0, (offset_t
)offset
)) == 0) {
2326 if ((bfp
->l_start
> MAXOFF_T
) || (bfp
->l_len
> MAXOFF_T
))
2328 error
= tmp_freesp(vp
, bfp
, flag
);
2330 if (error
== 0 && bfp
->l_start
== 0)
2331 vnevent_truncate(vp
, ct
);
2342 caller_context_t
*ct
)
2344 return ((*noffp
< 0 || *noffp
> MAXOFFSET_T
) ? EINVAL
: 0);
2349 tmp_rwlock(struct vnode
*vp
, int write_lock
, caller_context_t
*ctp
)
2351 struct tmpnode
*tp
= VTOTN(vp
);
2354 rw_enter(&tp
->tn_rwlock
, RW_WRITER
);
2356 rw_enter(&tp
->tn_rwlock
, RW_READER
);
2358 return (write_lock
);
2363 tmp_rwunlock(struct vnode
*vp
, int write_lock
, caller_context_t
*ctp
)
2365 struct tmpnode
*tp
= VTOTN(vp
);
2367 rw_exit(&tp
->tn_rwlock
);
2376 caller_context_t
*ct
)
2378 struct tmpnode
*tp
= NULL
;
2382 case _PC_XATTR_EXISTS
:
2383 if (vp
->v_vfsp
->vfs_flag
& VFS_XATTR
) {
2384 *valp
= 0; /* assume no attributes */
2385 error
= 0; /* okay to ask */
2387 rw_enter(&tp
->tn_rwlock
, RW_READER
);
2388 if (tp
->tn_xattrdp
) {
2389 rw_enter(&tp
->tn_xattrdp
->tn_rwlock
, RW_READER
);
2390 /* do not count "." and ".." */
2391 if (tp
->tn_xattrdp
->tn_dirents
> 2)
2393 rw_exit(&tp
->tn_xattrdp
->tn_rwlock
);
2395 rw_exit(&tp
->tn_rwlock
);
2400 case _PC_SATTR_ENABLED
:
2401 case _PC_SATTR_EXISTS
:
2402 *valp
= vfs_has_feature(vp
->v_vfsp
, VFSFT_SYSATTR_VIEWS
) &&
2403 (vp
->v_type
== VREG
|| vp
->v_type
== VDIR
);
2406 case _PC_TIMESTAMP_RESOLUTION
:
2407 /* nanosecond timestamp resolution */
2412 error
= fs_pathconf(vp
, cmd
, valp
, cr
, ct
);
/* Operations vector for tmpfs vnodes; built from the template below. */
struct vnodeops *tmp_vnodeops;
2420 const fs_operation_def_t tmp_vnodeops_template
[] = {
2421 VOPNAME_OPEN
, { .vop_open
= tmp_open
},
2422 VOPNAME_CLOSE
, { .vop_close
= tmp_close
},
2423 VOPNAME_READ
, { .vop_read
= tmp_read
},
2424 VOPNAME_WRITE
, { .vop_write
= tmp_write
},
2425 VOPNAME_IOCTL
, { .vop_ioctl
= tmp_ioctl
},
2426 VOPNAME_GETATTR
, { .vop_getattr
= tmp_getattr
},
2427 VOPNAME_SETATTR
, { .vop_setattr
= tmp_setattr
},
2428 VOPNAME_ACCESS
, { .vop_access
= tmp_access
},
2429 VOPNAME_LOOKUP
, { .vop_lookup
= tmp_lookup
},
2430 VOPNAME_CREATE
, { .vop_create
= tmp_create
},
2431 VOPNAME_REMOVE
, { .vop_remove
= tmp_remove
},
2432 VOPNAME_LINK
, { .vop_link
= tmp_link
},
2433 VOPNAME_RENAME
, { .vop_rename
= tmp_rename
},
2434 VOPNAME_MKDIR
, { .vop_mkdir
= tmp_mkdir
},
2435 VOPNAME_RMDIR
, { .vop_rmdir
= tmp_rmdir
},
2436 VOPNAME_READDIR
, { .vop_readdir
= tmp_readdir
},
2437 VOPNAME_SYMLINK
, { .vop_symlink
= tmp_symlink
},
2438 VOPNAME_READLINK
, { .vop_readlink
= tmp_readlink
},
2439 VOPNAME_FSYNC
, { .vop_fsync
= tmp_fsync
},
2440 VOPNAME_INACTIVE
, { .vop_inactive
= tmp_inactive
},
2441 VOPNAME_FID
, { .vop_fid
= tmp_fid
},
2442 VOPNAME_RWLOCK
, { .vop_rwlock
= tmp_rwlock
},
2443 VOPNAME_RWUNLOCK
, { .vop_rwunlock
= tmp_rwunlock
},
2444 VOPNAME_SEEK
, { .vop_seek
= tmp_seek
},
2445 VOPNAME_SPACE
, { .vop_space
= tmp_space
},
2446 VOPNAME_GETPAGE
, { .vop_getpage
= tmp_getpage
},
2447 VOPNAME_PUTPAGE
, { .vop_putpage
= tmp_putpage
},
2448 VOPNAME_MAP
, { .vop_map
= tmp_map
},
2449 VOPNAME_ADDMAP
, { .vop_addmap
= tmp_addmap
},
2450 VOPNAME_DELMAP
, { .vop_delmap
= tmp_delmap
},
2451 VOPNAME_PATHCONF
, { .vop_pathconf
= tmp_pathconf
},
2452 VOPNAME_VNEVENT
, { .vop_vnevent
= fs_vnevent_support
},