4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, Joyent, Inc. All rights reserved.
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
31 #include <sys/pathname.h>
33 #include <sys/vfs_opreg.h>
34 #include <sys/vnode.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
41 #include <sys/statvfs.h>
42 #include <sys/mount.h>
43 #include <sys/debug.h>
44 #include <sys/systm.h>
45 #include <sys/mntent.h>
46 #include <fs/fs_subr.h>
49 #include <sys/model.h>
50 #include <sys/policy.h>
52 #include <sys/fs/swapnode.h>
53 #include <sys/fs/tmp.h>
54 #include <sys/fs/tmpnode.h>
/*
 * File system type number for tmpfs.  tmpfsinit() stores its fstype
 * argument here, and the value is later used when filling in each
 * mounted vfs and when tearing down the vfs operations vector.
 */
static int tmpfsfstype;

/*
 * tmpfs vfs operations.
 */
static int	tmpfsinit(int fstype, char *name);
static int	tmp_mount(struct vfs *vfsp, struct vnode *mvp,
		    struct mounta *uap, struct cred *cr);
static int	tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr);
static int	tmp_root(struct vfs *vfsp, struct vnode **vpp);
static int	tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp);
static int	tmp_vget(struct vfs *vfsp, struct vnode **vpp,
		    struct fid *fidp);
70 * Loadable module wrapper
72 #include <sys/modctl.h>
74 static mntopts_t tmpfs_proto_opttbl
;
76 static vfsdef_t vfw
= {
80 VSW_HASPROTO
|VSW_CANREMOUNT
|VSW_STATS
|VSW_ZMOUNT
,
85 * in-kernel mnttab options
87 static char *xattr_cancel
[] = { MNTOPT_NOXATTR
, NULL
};
88 static char *noxattr_cancel
[] = { MNTOPT_XATTR
, NULL
};
90 static mntopt_t tmpfs_options
[] = {
91 /* Option name Cancel Opt Arg Flags Data */
92 { MNTOPT_XATTR
, xattr_cancel
, NULL
, MO_DEFAULT
, NULL
},
93 { MNTOPT_NOXATTR
, noxattr_cancel
, NULL
, NULL
, NULL
},
94 { "size", NULL
, "0", MO_HASVALUE
, NULL
}
98 static mntopts_t tmpfs_proto_opttbl
= {
99 sizeof (tmpfs_options
) / sizeof (mntopt_t
),
104 * Module linkage information
106 static struct modlfs modlfs
= {
107 &mod_fsops
, "filesystem for tmpfs", &vfw
110 static struct modlinkage modlinkage
= {
111 MODREV_1
, &modlfs
, NULL
117 return (mod_install(&modlinkage
));
125 error
= mod_remove(&modlinkage
);
129 * Tear down the operations vectors
131 (void) vfs_freevfsops_by_type(tmpfsfstype
);
132 vn_freevnodeops(tmp_vnodeops
);
137 _info(struct modinfo
*modinfop
)
139 return (mod_info(&modlinkage
, modinfop
));
143 * The following are patchable variables limiting the amount of system
144 * resources tmpfs can use.
146 * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
147 * tmpfs can use for its data structures (e.g. tmpnodes, directory entries)
148 * It is not determined by setting a hard limit but rather as a percentage of
149 * physical memory which is determined when tmpfs is first used in the system.
151 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
152 * the rest of the system. In other words, if the amount of free swap space
153 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
154 * anon allocations will fail.
156 * There is also a per mount limit on the amount of swap space
157 * (tmount.tm_anonmax) settable via a mount option.
159 size_t tmpfs_maxkmem
= 0;
160 size_t tmpfs_minfree
= 0;
161 size_t tmp_kmemspace
; /* bytes of kernel heap used by all tmpfs */
163 static major_t tmpfs_major
;
164 static minor_t tmpfs_minor
;
165 static kmutex_t tmpfs_minor_lock
;
168 * initialize global tmpfs locks and such
169 * called when loading tmpfs module
172 tmpfsinit(int fstype
, char *name
)
174 static const fs_operation_def_t tmp_vfsops_template
[] = {
175 VFSNAME_MOUNT
, { .vfs_mount
= tmp_mount
},
176 VFSNAME_UNMOUNT
, { .vfs_unmount
= tmp_unmount
},
177 VFSNAME_ROOT
, { .vfs_root
= tmp_root
},
178 VFSNAME_STATVFS
, { .vfs_statvfs
= tmp_statvfs
},
179 VFSNAME_VGET
, { .vfs_vget
= tmp_vget
},
183 extern void tmpfs_hash_init();
186 tmpfsfstype
= fstype
;
187 ASSERT(tmpfsfstype
!= 0);
189 error
= vfs_setfsops(fstype
, tmp_vfsops_template
, NULL
);
191 cmn_err(CE_WARN
, "tmpfsinit: bad vfs ops template");
195 error
= vn_make_ops(name
, tmp_vnodeops_template
, &tmp_vnodeops
);
197 (void) vfs_freevfsops_by_type(fstype
);
198 cmn_err(CE_WARN
, "tmpfsinit: bad vnode ops template");
203 * tmpfs_minfree doesn't need to be some function of configured
204 * swap space since it really is an absolute limit of swap space
205 * which still allows other processes to execute.
207 if (tmpfs_minfree
== 0) {
211 tmpfs_minfree
= btopr(TMPMINFREE
);
215 * Unless already set, the maximum amount of space tmpfs can allocate
216 * is kmem_maxavail() / TMPMAXFRACKMEM, but never less than PAGESIZE
218 if (tmpfs_maxkmem
== 0)
219 tmpfs_maxkmem
= MAX(PAGESIZE
, kmem_maxavail() / TMPMAXFRACKMEM
);
221 if ((tmpfs_major
= getudev()) == (major_t
)-1) {
222 cmn_err(CE_WARN
, "tmpfsinit: Can't get unique device number.");
225 mutex_init(&tmpfs_minor_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
236 struct tmount
*tm
= NULL
;
246 if ((error
= secpolicy_fs_mount(cr
, mvp
, vfsp
)) != 0)
249 if (mvp
->v_type
!= VDIR
)
252 mutex_enter(&mvp
->v_lock
);
253 if ((uap
->flags
& MS_REMOUNT
) == 0 && (uap
->flags
& MS_OVERLAY
) == 0 &&
254 (mvp
->v_count
!= 1 || (mvp
->v_flag
& VROOT
))) {
255 mutex_exit(&mvp
->v_lock
);
258 mutex_exit(&mvp
->v_lock
);
261 * Having the resource be anything but "swap" doesn't make sense.
263 vfs_setresource(vfsp
, "swap", 0);
266 * now look for options we understand...
269 /* tmpfs doesn't support read-only mounts */
270 if (vfs_optionisset(vfsp
, MNTOPT_RO
, NULL
)) {
276 * tm_anonmax is set according to the mount arguments
277 * if any. Otherwise, it is set to a maximum value.
279 if (vfs_optionisset(vfsp
, "size", &sizestr
)) {
280 if ((error
= tmp_convnum(sizestr
, &anonmax
)) != 0)
286 if (error
= pn_get(uap
->dir
,
287 (uap
->flags
& MS_SYSSPACE
) ? UIO_SYSSPACE
: UIO_USERSPACE
, &dpn
))
290 if (uap
->flags
& MS_REMOUNT
) {
291 tm
= (struct tmount
*)VFSTOTM(vfsp
);
294 * If we change the size so it's less than what is currently
295 * being used, we allow that. The file system will simply be
296 * full until enough files have been removed to get below the
299 mutex_enter(&tm
->tm_contents
);
300 tm
->tm_anonmax
= anonmax
;
301 mutex_exit(&tm
->tm_contents
);
305 if ((tm
= tmp_memalloc(sizeof (struct tmount
), 0)) == NULL
) {
312 * find an available minor device number for this mount
314 mutex_enter(&tmpfs_minor_lock
);
316 tmpfs_minor
= (tmpfs_minor
+ 1) & L_MAXMIN32
;
317 tm
->tm_dev
= makedevice(tmpfs_major
, tmpfs_minor
);
318 } while (vfs_devismounted(tm
->tm_dev
));
319 mutex_exit(&tmpfs_minor_lock
);
322 * Set but don't bother entering the mutex
323 * (tmount not on mount list yet)
325 mutex_init(&tm
->tm_contents
, NULL
, MUTEX_DEFAULT
, NULL
);
326 mutex_init(&tm
->tm_renamelck
, NULL
, MUTEX_DEFAULT
, NULL
);
329 tm
->tm_anonmax
= anonmax
;
331 vfsp
->vfs_data
= (caddr_t
)tm
;
332 vfsp
->vfs_fstype
= tmpfsfstype
;
333 vfsp
->vfs_dev
= tm
->tm_dev
;
334 vfsp
->vfs_bsize
= PAGESIZE
;
335 vfsp
->vfs_flag
|= VFS_NOTRUNC
;
336 vfs_make_fsid(&vfsp
->vfs_fsid
, tm
->tm_dev
, tmpfsfstype
);
337 tm
->tm_mntpath
= tmp_memalloc(dpn
.pn_pathlen
+ 1, TMP_MUSTHAVE
);
338 (void) strcpy(tm
->tm_mntpath
, dpn
.pn_path
);
341 * allocate and initialize root tmpnode structure
343 bzero(&rattr
, sizeof (struct vattr
));
344 rattr
.va_mode
= (mode_t
)(S_IFDIR
| 0777); /* XXX modes */
345 rattr
.va_type
= VDIR
;
347 tp
= tmp_memalloc(sizeof (struct tmpnode
), TMP_MUSTHAVE
);
348 tmpnode_init(tm
, tp
, &rattr
, cr
);
351 * Get the mode, uid, and gid from the underlying mount point.
353 rattr
.va_mask
= AT_MODE
|AT_UID
|AT_GID
; /* Hint to getattr */
354 got_attrs
= VOP_GETATTR(mvp
, &rattr
, 0, cr
, NULL
);
356 rw_enter(&tp
->tn_rwlock
, RW_WRITER
);
357 TNTOV(tp
)->v_flag
|= VROOT
;
360 * If the getattr succeeded, use its results. Otherwise allow
361 * the previously set hardwired defaults to prevail.
363 if (got_attrs
== 0) {
364 tp
->tn_mode
= rattr
.va_mode
;
365 tp
->tn_uid
= rattr
.va_uid
;
366 tp
->tn_gid
= rattr
.va_gid
;
370 * initialize linked list of tmpnodes so that the back pointer of
371 * the root tmpnode always points to the last one on the list
372 * and the forward pointer of the last node is null
377 tm
->tm_rootnode
= tp
;
381 rw_exit(&tp
->tn_rwlock
);
388 vfs_set_feature(vfsp
, VFSFT_SYSATTR_VIEWS
);
394 tmp_unmount(struct vfs
*vfsp
, int flag
, struct cred
*cr
)
396 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vfsp
);
397 struct tmpnode
*tnp
, *cancel
;
401 if ((error
= secpolicy_fs_unmount(cr
, vfsp
)) != 0)
405 * forced unmount is not supported by this file system
406 * and thus, ENOTSUP, is being returned.
411 mutex_enter(&tm
->tm_contents
);
414 * If there are no open files, only the root node should have
416 * With tm_contents held, nothing can be added or removed.
417 * There may be some dirty pages. To prevent fsflush from
418 * disrupting the unmount, put a hold on each node while scanning.
419 * If we find a previously referenced node, undo the holds we have
420 * placed and fail EBUSY.
422 tnp
= tm
->tm_rootnode
;
423 if (TNTOV(tnp
)->v_count
> 1) {
424 mutex_exit(&tm
->tm_contents
);
428 for (tnp
= tnp
->tn_forw
; tnp
; tnp
= tnp
->tn_forw
) {
429 if ((vp
= TNTOV(tnp
))->v_count
> 0) {
430 cancel
= tm
->tm_rootnode
->tn_forw
;
431 while (cancel
!= tnp
) {
433 ASSERT(vp
->v_count
> 0);
435 cancel
= cancel
->tn_forw
;
437 mutex_exit(&tm
->tm_contents
);
444 * We can drop the mutex now because no one can find this mount
446 mutex_exit(&tm
->tm_contents
);
449 * Free all kmemalloc'd and anonalloc'd memory associated with
450 * this filesystem. To do this, we go through the file list twice,
451 * once to remove all the directory entries, and then to remove
452 * all the files. We do this because there is useful code in
453 * tmpnode_free which assumes that the directory entry has been
454 * removed before the file.
457 * Remove all directory entries
459 for (tnp
= tm
->tm_rootnode
; tnp
; tnp
= tnp
->tn_forw
) {
460 rw_enter(&tnp
->tn_rwlock
, RW_WRITER
);
461 if (tnp
->tn_type
== VDIR
)
463 if (tnp
->tn_vnode
->v_flag
& V_XATTRDIR
) {
465 * Account for implicit attrdir reference.
467 ASSERT(tnp
->tn_nlink
> 0);
468 DECR_COUNT(&tnp
->tn_nlink
, &tnp
->tn_tlock
);
470 rw_exit(&tnp
->tn_rwlock
);
473 ASSERT(tm
->tm_rootnode
);
476 * All links are gone, v_count is keeping nodes in place.
477 * VN_RELE should make the node disappear, unless somebody
478 * is holding pages against it. Nap and retry until it disappears.
480 * We re-acquire the lock to prevent others who have a HOLD on
481 * a tmpnode via its pages or anon slots from blowing it away
482 * (in tmp_inactive) while we're trying to get to it here. Once
483 * we have a HOLD on it we know it'll stick around.
486 mutex_enter(&tm
->tm_contents
);
488 * Remove all the files (except the rootnode) backwards.
490 while ((tnp
= tm
->tm_rootnode
->tn_back
) != tm
->tm_rootnode
) {
491 mutex_exit(&tm
->tm_contents
);
493 * Inhibit tmp_inactive from touching attribute directory
494 * as all nodes will be released here.
495 * Note we handled the link count in pass 2 above.
497 rw_enter(&tnp
->tn_rwlock
, RW_WRITER
);
498 tnp
->tn_xattrdp
= NULL
;
499 rw_exit(&tnp
->tn_rwlock
);
502 mutex_enter(&tm
->tm_contents
);
504 * It's still there after the RELE. Someone else like pageout
505 * has a hold on it so wait a bit and then try again - we know
506 * they'll give it up soon.
508 if (tnp
== tm
->tm_rootnode
->tn_back
) {
510 mutex_exit(&tm
->tm_contents
);
512 mutex_enter(&tm
->tm_contents
);
515 mutex_exit(&tm
->tm_contents
);
517 tm
->tm_rootnode
->tn_xattrdp
= NULL
;
518 VN_RELE(TNTOV(tm
->tm_rootnode
));
520 ASSERT(tm
->tm_mntpath
);
522 tmp_memfree(tm
->tm_mntpath
, strlen(tm
->tm_mntpath
) + 1);
524 ASSERT(tm
->tm_anonmem
== 0);
526 mutex_destroy(&tm
->tm_contents
);
527 mutex_destroy(&tm
->tm_renamelck
);
528 tmp_memfree(tm
, sizeof (struct tmount
));
534 * return root tmpnode for given vfs
537 tmp_root(struct vfs
*vfsp
, struct vnode
**vpp
)
539 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vfsp
);
540 struct tmpnode
*tp
= tm
->tm_rootnode
;
552 tmp_statvfs(struct vfs
*vfsp
, struct statvfs64
*sbp
)
554 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vfsp
);
561 * The file system may have been mounted by the global zone on
562 * behalf of the non-global zone. In that case, the tmount zone_id
563 * will be the global zone. We still want to show the swap cap inside
564 * the zone in this case, even though the file system was mounted by
567 if (curproc
->p_zone
->zone_id
!= GLOBAL_ZONEUNIQID
)
568 zp
= curproc
->p_zone
;
570 zp
= tm
->tm_vfsp
->vfs_zone
;
573 eff_zid
= GLOBAL_ZONEUNIQID
;
575 eff_zid
= zp
->zone_id
;
577 sbp
->f_bsize
= PAGESIZE
;
578 sbp
->f_frsize
= PAGESIZE
;
581 * Find the amount of available physical and memory swap
583 mutex_enter(&anoninfo_lock
);
584 ASSERT(k_anoninfo
.ani_max
>= k_anoninfo
.ani_phys_resv
);
585 blocks
= (ulong_t
)CURRENT_TOTAL_AVAILABLE_SWAP
;
586 mutex_exit(&anoninfo_lock
);
589 * If tm_anonmax for this mount is less than the available swap space
590 * (minus the amount tmpfs can't use), use that instead
592 if (blocks
> tmpfs_minfree
)
593 sbp
->f_bfree
= MIN(blocks
- tmpfs_minfree
,
594 tm
->tm_anonmax
- tm
->tm_anonmem
);
598 sbp
->f_bavail
= sbp
->f_bfree
;
601 * Total number of blocks is what's available plus what's been used
603 sbp
->f_blocks
= (fsblkcnt64_t
)(sbp
->f_bfree
+ tm
->tm_anonmem
);
605 if (eff_zid
!= GLOBAL_ZONEUNIQID
&&
606 zp
->zone_max_swap_ctl
!= UINT64_MAX
) {
608 * If the fs is used by a non-global zone with a swap cap,
609 * then report the capped size.
611 rctl_qty_t cap
, used
;
612 pgcnt_t pgcap
, pgused
;
614 mutex_enter(&zp
->zone_mem_lock
);
615 cap
= zp
->zone_max_swap_ctl
;
616 used
= zp
->zone_max_swap
;
617 mutex_exit(&zp
->zone_mem_lock
);
622 sbp
->f_bfree
= MIN(pgcap
- pgused
, sbp
->f_bfree
);
623 sbp
->f_bavail
= sbp
->f_bfree
;
624 sbp
->f_blocks
= MIN(pgcap
, sbp
->f_blocks
);
628 * The maximum number of files available is approximately the number
629 * of tmpnodes we can allocate from the remaining kernel memory
630 * available to tmpfs. This is fairly inaccurate since it doesn't
631 * take into account the names stored in the directory entries.
633 if (tmpfs_maxkmem
> tmp_kmemspace
)
634 sbp
->f_ffree
= (tmpfs_maxkmem
- tmp_kmemspace
) /
635 (sizeof (struct tmpnode
) + sizeof (struct tdirent
));
639 sbp
->f_files
= tmpfs_maxkmem
/
640 (sizeof (struct tmpnode
) + sizeof (struct tdirent
));
641 sbp
->f_favail
= (fsfilcnt64_t
)(sbp
->f_ffree
);
642 (void) cmpldev(&d32
, vfsp
->vfs_dev
);
644 (void) strcpy(sbp
->f_basetype
, vfssw
[tmpfsfstype
].vsw_name
);
645 (void) strncpy(sbp
->f_fstr
, tm
->tm_mntpath
, sizeof (sbp
->f_fstr
));
647 * ensure null termination
649 sbp
->f_fstr
[sizeof (sbp
->f_fstr
) - 1] = '\0';
650 sbp
->f_flag
= vf_to_stf(vfsp
->vfs_flag
);
651 sbp
->f_namemax
= MAXNAMELEN
- 1;
656 tmp_vget(struct vfs
*vfsp
, struct vnode
**vpp
, struct fid
*fidp
)
659 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vfsp
);
660 struct tmpnode
*tp
= NULL
;
662 tfid
= (struct tfid
*)fidp
;
665 mutex_enter(&tm
->tm_contents
);
666 for (tp
= tm
->tm_rootnode
; tp
; tp
= tp
->tn_forw
) {
667 mutex_enter(&tp
->tn_tlock
);
668 if (tp
->tn_nodeid
== tfid
->tfid_ino
) {
670 * If the gen numbers don't match we know the
671 * file won't be found since only one tmpnode
672 * can have this number at a time.
674 if (tp
->tn_gen
!= tfid
->tfid_gen
|| tp
->tn_nlink
== 0) {
675 mutex_exit(&tp
->tn_tlock
);
676 mutex_exit(&tm
->tm_contents
);
679 *vpp
= (struct vnode
*)TNTOV(tp
);
683 if ((tp
->tn_mode
& S_ISVTX
) &&
684 !(tp
->tn_mode
& (S_IXUSR
| S_IFDIR
))) {
685 mutex_enter(&(*vpp
)->v_lock
);
686 (*vpp
)->v_flag
|= VISSWAP
;
687 mutex_exit(&(*vpp
)->v_lock
);
689 mutex_exit(&tp
->tn_tlock
);
690 mutex_exit(&tm
->tm_contents
);
693 mutex_exit(&tp
->tn_tlock
);
695 mutex_exit(&tm
->tm_contents
);