7068 need a way to change the tmpfs size on a mounted fs
[unleashed.git] / usr / src / uts / common / fs / tmpfs / tmp_vfsops.c
blobf22cc3ecf055841268511dd92f6b60d7fd76a10c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, Joyent, Inc. All rights reserved.
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/kmem.h>
30 #include <sys/time.h>
31 #include <sys/pathname.h>
32 #include <sys/vfs.h>
33 #include <sys/vfs_opreg.h>
34 #include <sys/vnode.h>
35 #include <sys/stat.h>
36 #include <sys/uio.h>
37 #include <sys/stat.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/cred.h>
41 #include <sys/statvfs.h>
42 #include <sys/mount.h>
43 #include <sys/debug.h>
44 #include <sys/systm.h>
45 #include <sys/mntent.h>
46 #include <fs/fs_subr.h>
47 #include <vm/page.h>
48 #include <vm/anon.h>
49 #include <sys/model.h>
50 #include <sys/policy.h>
52 #include <sys/fs/swapnode.h>
53 #include <sys/fs/tmp.h>
54 #include <sys/fs/tmpnode.h>
/* File system type index handed to tmpfsinit() when the module is set up. */
static int tmpfsfstype;

/*
 * tmpfs vfs operations.
 */
static int	tmpfsinit(int fstype, char *name);
static int	tmp_mount(struct vfs *vfsp, struct vnode *mvp,
		    struct mounta *uap, struct cred *cr);
static int	tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr);
static int	tmp_root(struct vfs *vfsp, struct vnode **vpp);
static int	tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp);
static int	tmp_vget(struct vfs *vfsp, struct vnode **vpp,
		    struct fid *fidp);
70 * Loadable module wrapper
72 #include <sys/modctl.h>
74 static mntopts_t tmpfs_proto_opttbl;
76 static vfsdef_t vfw = {
77 VFSDEF_VERSION,
78 "tmpfs",
79 tmpfsinit,
80 VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
81 &tmpfs_proto_opttbl
85 * in-kernel mnttab options
87 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
88 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
90 static mntopt_t tmpfs_options[] = {
91 /* Option name Cancel Opt Arg Flags Data */
92 { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL},
93 { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL},
94 { "size", NULL, "0", MO_HASVALUE, NULL}
98 static mntopts_t tmpfs_proto_opttbl = {
99 sizeof (tmpfs_options) / sizeof (mntopt_t),
100 tmpfs_options
104 * Module linkage information
106 static struct modlfs modlfs = {
107 &mod_fsops, "filesystem for tmpfs", &vfw
110 static struct modlinkage modlinkage = {
111 MODREV_1, &modlfs, NULL
115 _init()
117 return (mod_install(&modlinkage));
121 _fini()
123 int error;
125 error = mod_remove(&modlinkage);
126 if (error)
127 return (error);
129 * Tear down the operations vectors
131 (void) vfs_freevfsops_by_type(tmpfsfstype);
132 vn_freevnodeops(tmp_vnodeops);
133 return (0);
137 _info(struct modinfo *modinfop)
139 return (mod_info(&modlinkage, modinfop));
143 * The following are patchable variables limiting the amount of system
144 * resources tmpfs can use.
146 * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
147 * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries)
148 * It is not determined by setting a hard limit but rather as a percentage of
149 * physical memory which is determined when tmpfs is first used in the system.
151 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
152 * the rest of the system. In other words, if the amount of free swap space
153 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
154 * anon allocations will fail.
156 * There is also a per mount limit on the amount of swap space
157 * (tmount.tm_anonmax) settable via a mount option.
159 size_t tmpfs_maxkmem = 0;
160 size_t tmpfs_minfree = 0;
161 size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */
163 static major_t tmpfs_major;
164 static minor_t tmpfs_minor;
165 static kmutex_t tmpfs_minor_lock;
168 * initialize global tmpfs locks and such
169 * called when loading tmpfs module
171 static int
172 tmpfsinit(int fstype, char *name)
174 static const fs_operation_def_t tmp_vfsops_template[] = {
175 VFSNAME_MOUNT, { .vfs_mount = tmp_mount },
176 VFSNAME_UNMOUNT, { .vfs_unmount = tmp_unmount },
177 VFSNAME_ROOT, { .vfs_root = tmp_root },
178 VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs },
179 VFSNAME_VGET, { .vfs_vget = tmp_vget },
180 NULL, NULL
182 int error;
183 extern void tmpfs_hash_init();
185 tmpfs_hash_init();
186 tmpfsfstype = fstype;
187 ASSERT(tmpfsfstype != 0);
189 error = vfs_setfsops(fstype, tmp_vfsops_template, NULL);
190 if (error != 0) {
191 cmn_err(CE_WARN, "tmpfsinit: bad vfs ops template");
192 return (error);
195 error = vn_make_ops(name, tmp_vnodeops_template, &tmp_vnodeops);
196 if (error != 0) {
197 (void) vfs_freevfsops_by_type(fstype);
198 cmn_err(CE_WARN, "tmpfsinit: bad vnode ops template");
199 return (error);
203 * tmpfs_minfree doesn't need to be some function of configured
204 * swap space since it really is an absolute limit of swap space
205 * which still allows other processes to execute.
207 if (tmpfs_minfree == 0) {
209 * Set if not patched
211 tmpfs_minfree = btopr(TMPMINFREE);
215 * The maximum amount of space tmpfs can allocate is
216 * TMPMAXPROCKMEM percent of kernel memory
218 if (tmpfs_maxkmem == 0)
219 tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM);
221 if ((tmpfs_major = getudev()) == (major_t)-1) {
222 cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
223 tmpfs_major = 0;
225 mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
226 return (0);
229 static int
230 tmp_mount(
231 struct vfs *vfsp,
232 struct vnode *mvp,
233 struct mounta *uap,
234 struct cred *cr)
236 struct tmount *tm = NULL;
237 struct tmpnode *tp;
238 struct pathname dpn;
239 int error;
240 pgcnt_t anonmax;
241 struct vattr rattr;
242 int got_attrs;
244 char *sizestr;
246 if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
247 return (error);
249 if (mvp->v_type != VDIR)
250 return (ENOTDIR);
252 mutex_enter(&mvp->v_lock);
253 if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 &&
254 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
255 mutex_exit(&mvp->v_lock);
256 return (EBUSY);
258 mutex_exit(&mvp->v_lock);
261 * Having the resource be anything but "swap" doesn't make sense.
263 vfs_setresource(vfsp, "swap", 0);
266 * now look for options we understand...
269 /* tmpfs doesn't support read-only mounts */
270 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
271 error = EINVAL;
272 goto out;
276 * tm_anonmax is set according to the mount arguments
277 * if any. Otherwise, it is set to a maximum value.
279 if (vfs_optionisset(vfsp, "size", &sizestr)) {
280 if ((error = tmp_convnum(sizestr, &anonmax)) != 0)
281 goto out;
282 } else {
283 anonmax = ULONG_MAX;
286 if (error = pn_get(uap->dir,
287 (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn))
288 goto out;
290 if (uap->flags & MS_REMOUNT) {
291 tm = (struct tmount *)VFSTOTM(vfsp);
294 * If we change the size so its less than what is currently
295 * being used, we allow that. The file system will simply be
296 * full until enough files have been removed to get below the
297 * new max.
299 mutex_enter(&tm->tm_contents);
300 tm->tm_anonmax = anonmax;
301 mutex_exit(&tm->tm_contents);
302 goto out;
305 if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) {
306 pn_free(&dpn);
307 error = ENOMEM;
308 goto out;
312 * find an available minor device number for this mount
314 mutex_enter(&tmpfs_minor_lock);
315 do {
316 tmpfs_minor = (tmpfs_minor + 1) & L_MAXMIN32;
317 tm->tm_dev = makedevice(tmpfs_major, tmpfs_minor);
318 } while (vfs_devismounted(tm->tm_dev));
319 mutex_exit(&tmpfs_minor_lock);
322 * Set but don't bother entering the mutex
323 * (tmount not on mount list yet)
325 mutex_init(&tm->tm_contents, NULL, MUTEX_DEFAULT, NULL);
326 mutex_init(&tm->tm_renamelck, NULL, MUTEX_DEFAULT, NULL);
328 tm->tm_vfsp = vfsp;
329 tm->tm_anonmax = anonmax;
331 vfsp->vfs_data = (caddr_t)tm;
332 vfsp->vfs_fstype = tmpfsfstype;
333 vfsp->vfs_dev = tm->tm_dev;
334 vfsp->vfs_bsize = PAGESIZE;
335 vfsp->vfs_flag |= VFS_NOTRUNC;
336 vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
337 tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE);
338 (void) strcpy(tm->tm_mntpath, dpn.pn_path);
341 * allocate and initialize root tmpnode structure
343 bzero(&rattr, sizeof (struct vattr));
344 rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */
345 rattr.va_type = VDIR;
346 rattr.va_rdev = 0;
347 tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
348 tmpnode_init(tm, tp, &rattr, cr);
351 * Get the mode, uid, and gid from the underlying mount point.
353 rattr.va_mask = AT_MODE|AT_UID|AT_GID; /* Hint to getattr */
354 got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);
356 rw_enter(&tp->tn_rwlock, RW_WRITER);
357 TNTOV(tp)->v_flag |= VROOT;
360 * If the getattr succeeded, use its results. Otherwise allow
361 * the previously set hardwired defaults to prevail.
363 if (got_attrs == 0) {
364 tp->tn_mode = rattr.va_mode;
365 tp->tn_uid = rattr.va_uid;
366 tp->tn_gid = rattr.va_gid;
370 * initialize linked list of tmpnodes so that the back pointer of
371 * the root tmpnode always points to the last one on the list
372 * and the forward pointer of the last node is null
374 tp->tn_back = tp;
375 tp->tn_forw = NULL;
376 tp->tn_nlink = 0;
377 tm->tm_rootnode = tp;
379 tdirinit(tp, tp);
381 rw_exit(&tp->tn_rwlock);
383 pn_free(&dpn);
384 error = 0;
386 out:
387 if (error == 0)
388 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
390 return (error);
393 static int
394 tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
396 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
397 struct tmpnode *tnp, *cancel;
398 struct vnode *vp;
399 int error;
401 if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
402 return (error);
405 * forced unmount is not supported by this file system
406 * and thus, ENOTSUP, is being returned.
408 if (flag & MS_FORCE)
409 return (ENOTSUP);
411 mutex_enter(&tm->tm_contents);
414 * If there are no open files, only the root node should have
415 * a reference count.
416 * With tm_contents held, nothing can be added or removed.
417 * There may be some dirty pages. To prevent fsflush from
418 * disrupting the unmount, put a hold on each node while scanning.
419 * If we find a previously referenced node, undo the holds we have
420 * placed and fail EBUSY.
422 tnp = tm->tm_rootnode;
423 if (TNTOV(tnp)->v_count > 1) {
424 mutex_exit(&tm->tm_contents);
425 return (EBUSY);
428 for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
429 if ((vp = TNTOV(tnp))->v_count > 0) {
430 cancel = tm->tm_rootnode->tn_forw;
431 while (cancel != tnp) {
432 vp = TNTOV(cancel);
433 ASSERT(vp->v_count > 0);
434 VN_RELE(vp);
435 cancel = cancel->tn_forw;
437 mutex_exit(&tm->tm_contents);
438 return (EBUSY);
440 VN_HOLD(vp);
444 * We can drop the mutex now because no one can find this mount
446 mutex_exit(&tm->tm_contents);
449 * Free all kmemalloc'd and anonalloc'd memory associated with
450 * this filesystem. To do this, we go through the file list twice,
451 * once to remove all the directory entries, and then to remove
452 * all the files. We do this because there is useful code in
453 * tmpnode_free which assumes that the directory entry has been
454 * removed before the file.
457 * Remove all directory entries
459 for (tnp = tm->tm_rootnode; tnp; tnp = tnp->tn_forw) {
460 rw_enter(&tnp->tn_rwlock, RW_WRITER);
461 if (tnp->tn_type == VDIR)
462 tdirtrunc(tnp);
463 if (tnp->tn_vnode->v_flag & V_XATTRDIR) {
465 * Account for implicit attrdir reference.
467 ASSERT(tnp->tn_nlink > 0);
468 DECR_COUNT(&tnp->tn_nlink, &tnp->tn_tlock);
470 rw_exit(&tnp->tn_rwlock);
473 ASSERT(tm->tm_rootnode);
476 * All links are gone, v_count is keeping nodes in place.
477 * VN_RELE should make the node disappear, unless somebody
478 * is holding pages against it. Nap and retry until it disappears.
480 * We re-acquire the lock to prevent others who have a HOLD on
481 * a tmpnode via its pages or anon slots from blowing it away
482 * (in tmp_inactive) while we're trying to get to it here. Once
483 * we have a HOLD on it we know it'll stick around.
486 mutex_enter(&tm->tm_contents);
488 * Remove all the files (except the rootnode) backwards.
490 while ((tnp = tm->tm_rootnode->tn_back) != tm->tm_rootnode) {
491 mutex_exit(&tm->tm_contents);
493 * Inhibit tmp_inactive from touching attribute directory
494 * as all nodes will be released here.
495 * Note we handled the link count in pass 2 above.
497 rw_enter(&tnp->tn_rwlock, RW_WRITER);
498 tnp->tn_xattrdp = NULL;
499 rw_exit(&tnp->tn_rwlock);
500 vp = TNTOV(tnp);
501 VN_RELE(vp);
502 mutex_enter(&tm->tm_contents);
504 * It's still there after the RELE. Someone else like pageout
505 * has a hold on it so wait a bit and then try again - we know
506 * they'll give it up soon.
508 if (tnp == tm->tm_rootnode->tn_back) {
509 VN_HOLD(vp);
510 mutex_exit(&tm->tm_contents);
511 delay(hz / 4);
512 mutex_enter(&tm->tm_contents);
515 mutex_exit(&tm->tm_contents);
517 tm->tm_rootnode->tn_xattrdp = NULL;
518 VN_RELE(TNTOV(tm->tm_rootnode));
520 ASSERT(tm->tm_mntpath);
522 tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
524 ASSERT(tm->tm_anonmem == 0);
526 mutex_destroy(&tm->tm_contents);
527 mutex_destroy(&tm->tm_renamelck);
528 tmp_memfree(tm, sizeof (struct tmount));
530 return (0);
534 * return root tmpnode for given vnode
536 static int
537 tmp_root(struct vfs *vfsp, struct vnode **vpp)
539 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
540 struct tmpnode *tp = tm->tm_rootnode;
541 struct vnode *vp;
543 ASSERT(tp);
545 vp = TNTOV(tp);
546 VN_HOLD(vp);
547 *vpp = vp;
548 return (0);
551 static int
552 tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
554 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
555 ulong_t blocks;
556 dev32_t d32;
557 zoneid_t eff_zid;
558 struct zone *zp;
561 * The file system may have been mounted by the global zone on
562 * behalf of the non-global zone. In that case, the tmount zone_id
563 * will be the global zone. We still want to show the swap cap inside
564 * the zone in this case, even though the file system was mounted by
565 * the global zone.
567 if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID)
568 zp = curproc->p_zone;
569 else
570 zp = tm->tm_vfsp->vfs_zone;
572 if (zp == NULL)
573 eff_zid = GLOBAL_ZONEUNIQID;
574 else
575 eff_zid = zp->zone_id;
577 sbp->f_bsize = PAGESIZE;
578 sbp->f_frsize = PAGESIZE;
581 * Find the amount of available physical and memory swap
583 mutex_enter(&anoninfo_lock);
584 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
585 blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
586 mutex_exit(&anoninfo_lock);
589 * If tm_anonmax for this mount is less than the available swap space
590 * (minus the amount tmpfs can't use), use that instead
592 if (blocks > tmpfs_minfree)
593 sbp->f_bfree = MIN(blocks - tmpfs_minfree,
594 tm->tm_anonmax - tm->tm_anonmem);
595 else
596 sbp->f_bfree = 0;
598 sbp->f_bavail = sbp->f_bfree;
601 * Total number of blocks is what's available plus what's been used
603 sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem);
605 if (eff_zid != GLOBAL_ZONEUNIQID &&
606 zp->zone_max_swap_ctl != UINT64_MAX) {
608 * If the fs is used by a non-global zone with a swap cap,
609 * then report the capped size.
611 rctl_qty_t cap, used;
612 pgcnt_t pgcap, pgused;
614 mutex_enter(&zp->zone_mem_lock);
615 cap = zp->zone_max_swap_ctl;
616 used = zp->zone_max_swap;
617 mutex_exit(&zp->zone_mem_lock);
619 pgcap = btop(cap);
620 pgused = btop(used);
622 sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
623 sbp->f_bavail = sbp->f_bfree;
624 sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
628 * The maximum number of files available is approximately the number
629 * of tmpnodes we can allocate from the remaining kernel memory
630 * available to tmpfs. This is fairly inaccurate since it doesn't
631 * take into account the names stored in the directory entries.
633 if (tmpfs_maxkmem > tmp_kmemspace)
634 sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) /
635 (sizeof (struct tmpnode) + sizeof (struct tdirent));
636 else
637 sbp->f_ffree = 0;
639 sbp->f_files = tmpfs_maxkmem /
640 (sizeof (struct tmpnode) + sizeof (struct tdirent));
641 sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
642 (void) cmpldev(&d32, vfsp->vfs_dev);
643 sbp->f_fsid = d32;
644 (void) strcpy(sbp->f_basetype, vfssw[tmpfsfstype].vsw_name);
645 (void) strncpy(sbp->f_fstr, tm->tm_mntpath, sizeof (sbp->f_fstr));
647 * ensure null termination
649 sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
650 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
651 sbp->f_namemax = MAXNAMELEN - 1;
652 return (0);
655 static int
656 tmp_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
658 struct tfid *tfid;
659 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
660 struct tmpnode *tp = NULL;
662 tfid = (struct tfid *)fidp;
663 *vpp = NULL;
665 mutex_enter(&tm->tm_contents);
666 for (tp = tm->tm_rootnode; tp; tp = tp->tn_forw) {
667 mutex_enter(&tp->tn_tlock);
668 if (tp->tn_nodeid == tfid->tfid_ino) {
670 * If the gen numbers don't match we know the
671 * file won't be found since only one tmpnode
672 * can have this number at a time.
674 if (tp->tn_gen != tfid->tfid_gen || tp->tn_nlink == 0) {
675 mutex_exit(&tp->tn_tlock);
676 mutex_exit(&tm->tm_contents);
677 return (0);
679 *vpp = (struct vnode *)TNTOV(tp);
681 VN_HOLD(*vpp);
683 if ((tp->tn_mode & S_ISVTX) &&
684 !(tp->tn_mode & (S_IXUSR | S_IFDIR))) {
685 mutex_enter(&(*vpp)->v_lock);
686 (*vpp)->v_flag |= VISSWAP;
687 mutex_exit(&(*vpp)->v_lock);
689 mutex_exit(&tp->tn_tlock);
690 mutex_exit(&tm->tm_contents);
691 return (0);
693 mutex_exit(&tp->tn_tlock);
695 mutex_exit(&tm->tm_contents);
696 return (0);