6327 devfsadm and bootadm taking long time for configs with large number of zvols
[unleashed.git] / usr / src / uts / common / fs / dev / sdev_zvolops.c
blobf75d5c3c4e270ae5abe08fa4677f0afdae7062ad
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2013, 2016 Joyent, Inc. All rights reserved.
25 * Copyright (c) 2014 by Delphix. All rights reserved.
28 /* vnode ops for the /dev/zvol directory */
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/ddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/sunldi.h>
36 #include <fs/fs_subr.h>
37 #include <sys/fs/dv_node.h>
38 #include <sys/fs/sdev_impl.h>
39 #include <sys/zfs_ioctl.h>
40 #include <sys/policy.h>
41 #include <sys/stat.h>
42 #include <sys/vfs_opreg.h>
/* NOTE(review): presumably the vnodeops vector built from devzvol_vnodeops_tbl; not set in this file's visible code. */
struct vnodeops *devzvol_vnodeops;
/* Major number of the opened /dev/zfs device; set by devzvol_open_zfs(). */
static major_t devzvol_major;
/* Taskq entry handed to taskq_dispatch_ent() by devzvol_update_zclist(). */
static taskq_ent_t devzvol_zclist_task;

static kmutex_t devzvol_mtx;
/* Below are protected by devzvol_mtx */
static boolean_t devzvol_isopen;	/* devzvol_open_zfs() has succeeded */
static boolean_t devzvol_zclist_task_running = B_FALSE;
static uint64_t devzvol_gen = 0;	/* zc_cookie of last POOL_CONFIGS */
static uint64_t devzvol_zclist;		/* address of packed pool nvlist */
static size_t devzvol_zclist_size;	/* allocated size of that buffer */
static ldi_ident_t devzvol_li;
static ldi_handle_t devzvol_lh;

/*
 * we need to use ddi_mod* since fs/dev gets loaded early on in
 * startup(), and linking fs/dev to fs/zfs would drag in a lot of
 * other stuff (like drv/random) before the rest of the system is
 * ready to go
 */
ddi_modhandle_t zfs_mod;
int (*szcm)(char *);			/* resolved "zvol_create_minor" */
int (*szn2m)(char *, minor_t *);	/* resolved "zvol_name2minor" */

/*
 * Enable/disable snapshots from being created in /dev/zvol. By default,
 * they are enabled, preserving the historic behavior.
 */
boolean_t devzvol_snaps_allowed = B_TRUE;
75 int
76 sdev_zvol_create_minor(char *dsname)
78 if (szcm == NULL)
79 return (-1);
80 return ((*szcm)(dsname));
83 int
84 sdev_zvol_name2minor(char *dsname, minor_t *minor)
86 if (szn2m == NULL)
87 return (-1);
88 return ((*szn2m)(dsname, minor));
91 int
92 devzvol_open_zfs()
94 int rc;
95 dev_t dv;
97 devzvol_li = ldi_ident_from_anon();
98 if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
99 &devzvol_lh, devzvol_li))
100 return (-1);
101 if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
102 KRTLD_MODE_FIRST, &rc)) == NULL)) {
103 return (rc);
105 ASSERT(szcm == NULL && szn2m == NULL);
106 if ((szcm = (int (*)(char *))
107 ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
108 cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
109 return (rc);
111 if ((szn2m = (int(*)(char *, minor_t *))
112 ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
113 cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
114 return (rc);
116 if (ldi_get_dev(devzvol_lh, &dv))
117 return (-1);
118 devzvol_major = getmajor(dv);
119 return (0);
122 void
123 devzvol_close_zfs()
125 szcm = NULL;
126 szn2m = NULL;
127 (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
128 ldi_ident_release(devzvol_li);
129 if (zfs_mod != NULL) {
130 (void) ddi_modclose(zfs_mod);
131 zfs_mod = NULL;
136 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
138 uint64_t cookie;
139 int size = 8000;
140 int unused;
141 int rc;
143 if (cmd != ZFS_IOC_POOL_CONFIGS)
144 mutex_enter(&devzvol_mtx);
145 if (!devzvol_isopen) {
146 if ((rc = devzvol_open_zfs()) == 0) {
147 devzvol_isopen = B_TRUE;
148 } else {
149 if (cmd != ZFS_IOC_POOL_CONFIGS)
150 mutex_exit(&devzvol_mtx);
151 return (ENXIO);
154 cookie = zc->zc_cookie;
155 again:
156 zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
157 KM_SLEEP);
158 zc->zc_nvlist_dst_size = size;
159 rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
160 &unused);
161 if (rc == ENOMEM) {
162 int newsize;
163 newsize = zc->zc_nvlist_dst_size;
164 ASSERT(newsize > size);
165 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
166 size = newsize;
167 zc->zc_cookie = cookie;
168 goto again;
170 if (alloc_size == NULL)
171 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
172 else
173 *alloc_size = size;
174 if (cmd != ZFS_IOC_POOL_CONFIGS)
175 mutex_exit(&devzvol_mtx);
176 return (rc);
179 /* figures out if the objset exists and returns its type */
181 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
183 boolean_t ispool, is_snapshot;
184 zfs_cmd_t *zc;
185 int rc;
186 nvlist_t *nvl;
187 size_t nvsz;
189 ispool = (strchr(dsname, '/') == NULL);
190 is_snapshot = (strchr(dsname, '@') != NULL);
192 if (is_snapshot && !devzvol_snaps_allowed)
193 return (ENOTSUP);
195 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
196 (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
198 nvl = fnvlist_alloc();
199 fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE);
200 zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz);
201 zc->zc_nvlist_src_size = nvsz;
202 fnvlist_free(nvl);
204 rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
205 ZFS_IOC_OBJSET_STATS, zc, NULL);
206 if (type && rc == 0)
207 *type = (ispool) ? DMU_OST_ZFS :
208 zc->zc_objset_stats.dds_type;
209 fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz);
210 kmem_free(zc, sizeof (zfs_cmd_t));
211 return (rc);
215 * Returns what the zfs dataset name should be, given the /dev/zvol
216 * path and an optional name (can be NULL).
218 * Note that if the name param is NULL, then path must be an
219 * actual dataset's directory and not one of the top-level
220 * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a
221 * specific dataset.
223 char *
224 devzvol_make_dsname(const char *path, const char *name)
226 char *dsname;
227 const char *ptr;
228 int dslen;
230 if (strcmp(path, ZVOL_DIR) == 0)
231 return (NULL);
232 if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
233 return (NULL);
234 ptr = path + strlen(ZVOL_DIR);
235 if (strncmp(ptr, "/dsk", 4) == 0)
236 ptr += strlen("/dsk");
237 else if (strncmp(ptr, "/rdsk", 5) == 0)
238 ptr += strlen("/rdsk");
239 else
240 return (NULL);
242 if (*ptr == '/')
243 ptr++;
244 else if (name == NULL)
245 return (NULL);
247 dslen = strlen(ptr);
248 if (dslen)
249 dslen++; /* plus null */
250 if (name)
251 dslen += strlen(name) + 1; /* plus slash */
252 dsname = kmem_zalloc(dslen, KM_SLEEP);
253 if (*ptr) {
254 (void) strlcpy(dsname, ptr, dslen);
255 if (name)
256 (void) strlcat(dsname, "/", dslen);
258 if (name)
259 (void) strlcat(dsname, name, dslen);
260 return (dsname);
264 * check if the zvol's sdev_node is still valid, which means make
265 * sure the zvol is still valid. zvol minors aren't proactively
266 * destroyed when the zvol is destroyed, so we use a validator to clean
267 * these up (in other words, when such nodes are encountered during
268 * subsequent lookup() and readdir() operations) so that only valid
269 * nodes are returned. The ordering between devname_lookup_func and
270 * devzvol_validate is a little inefficient in the case of invalid
271 * or stale nodes because devname_lookup_func calls
272 * devzvol_create_{dir, link}, then the validator says it's invalid,
273 * and then the node gets cleaned up.
276 devzvol_validate(struct sdev_node *dv)
278 vnode_t *vn = SDEVTOV(dv);
279 dmu_objset_type_t do_type;
280 char *dsname;
281 char *nm = dv->sdev_name;
282 int rc;
284 sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
286 * validate only READY nodes; if someone is sitting on the
287 * directory of a dataset that just got destroyed we could
288 * get a zombie node which we just skip.
290 if (dv->sdev_state != SDEV_READY) {
291 sdcmn_err13(("skipping '%s'", nm));
292 return (SDEV_VTOR_SKIP);
295 if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
296 (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
297 return (SDEV_VTOR_VALID);
298 dsname = devzvol_make_dsname(dv->sdev_path, NULL);
299 if (dsname == NULL)
300 return (SDEV_VTOR_INVALID);
303 * Leave any nodes alone that have been explicitly created by
304 * sdev profiles.
306 if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) {
307 kmem_free(dsname, strlen(dsname) + 1);
308 return (SDEV_VTOR_VALID);
311 rc = devzvol_objset_check(dsname, &do_type);
312 sdcmn_err13((" '%s' rc %d", dsname, rc));
313 if (rc != 0) {
314 sdev_node_t *parent = dv->sdev_dotdot;
316 * Explicitly passed-through zvols in our sdev profile can't
317 * be created as prof_* shadow nodes, because in the GZ they
318 * are symlinks, but in the NGZ they are actual device files.
320 * The objset_check will fail on these as they are outside
321 * any delegated dataset (zfs will not allow ioctl access to
322 * them from this zone). We still want them to work, though.
324 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
325 parent->sdev_origin != NULL &&
326 !(dv->sdev_flags & SDEV_GLOBAL) &&
327 (vn->v_type == VBLK || vn->v_type == VCHR) &&
328 prof_name_matched(nm, parent)) {
329 do_type = DMU_OST_ZVOL;
330 } else {
331 kmem_free(dsname, strlen(dsname) + 1);
332 return (SDEV_VTOR_INVALID);
336 sdcmn_err13((" v_type %d do_type %d",
337 vn->v_type, do_type));
338 if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
339 ((vn->v_type == VBLK || vn->v_type == VCHR) &&
340 do_type != DMU_OST_ZVOL) ||
341 (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
342 kmem_free(dsname, strlen(dsname) + 1);
343 return (SDEV_VTOR_STALE);
345 if (vn->v_type == VLNK) {
346 char *ptr, *link;
347 long val = 0;
348 minor_t lminor, ominor;
350 rc = sdev_getlink(vn, &link);
351 ASSERT(rc == 0);
353 ptr = strrchr(link, ':') + 1;
354 rc = ddi_strtol(ptr, NULL, 10, &val);
355 kmem_free(link, strlen(link) + 1);
356 ASSERT(rc == 0 && val != 0);
357 lminor = (minor_t)val;
358 if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
359 ominor != lminor) {
360 kmem_free(dsname, strlen(dsname) + 1);
361 return (SDEV_VTOR_STALE);
364 kmem_free(dsname, strlen(dsname) + 1);
365 return (SDEV_VTOR_VALID);
369 * Taskq callback to update the devzvol_zclist.
371 * We need to defer this to the taskq to avoid it running with a user
372 * context that might be associated with some non-global zone, and thus
373 * not being able to list all of the pools on the entire system.
375 /*ARGSUSED*/
376 static void
377 devzvol_update_zclist_cb(void *arg)
379 zfs_cmd_t *zc;
380 int rc;
381 size_t size;
383 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
384 mutex_enter(&devzvol_mtx);
385 zc->zc_cookie = devzvol_gen;
387 rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
388 switch (rc) {
389 case 0:
390 /* new generation */
391 ASSERT(devzvol_gen != zc->zc_cookie);
392 devzvol_gen = zc->zc_cookie;
393 if (devzvol_zclist)
394 kmem_free((void *)(uintptr_t)devzvol_zclist,
395 devzvol_zclist_size);
396 devzvol_zclist = zc->zc_nvlist_dst;
397 /* Keep the alloc'd size, not the nvlist size. */
398 devzvol_zclist_size = size;
399 break;
400 default:
402 * Either there was no change in pool configuration
403 * since we last asked (rc == EEXIST) or we got a
404 * catastrophic error.
406 * Give up memory and exit.
408 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
409 size);
410 break;
413 VERIFY(devzvol_zclist_task_running == B_TRUE);
414 devzvol_zclist_task_running = B_FALSE;
415 mutex_exit(&devzvol_mtx);
417 kmem_free(zc, sizeof (zfs_cmd_t));
420 static void
421 devzvol_update_zclist(void)
423 mutex_enter(&devzvol_mtx);
424 if (devzvol_zclist_task_running == B_TRUE) {
425 mutex_exit(&devzvol_mtx);
426 goto wait;
429 devzvol_zclist_task_running = B_TRUE;
431 taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
432 &devzvol_zclist_task);
434 mutex_exit(&devzvol_mtx);
436 wait:
437 taskq_wait(sdev_taskq);
441 * Creates sub-directories for each zpool as needed in response to a
442 * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
444 void
445 devzvol_create_pool_dirs(struct vnode *dvp)
447 nvlist_t *nv = NULL;
448 nvpair_t *elem = NULL;
449 int pools = 0;
450 int rc;
452 sdcmn_err13(("devzvol_create_pool_dirs"));
454 devzvol_update_zclist();
456 mutex_enter(&devzvol_mtx);
458 rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
459 devzvol_zclist_size, &nv, 0);
460 if (rc) {
461 ASSERT(rc == 0);
462 kmem_free((void *)(uintptr_t)devzvol_zclist,
463 devzvol_zclist_size);
464 devzvol_gen = 0;
465 devzvol_zclist = NULL;
466 devzvol_zclist_size = 0;
467 goto out;
469 mutex_exit(&devzvol_mtx);
470 while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
471 struct vnode *vp;
472 ASSERT(dvp->v_count > 0);
473 rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
474 NULL, kcred, NULL, 0, NULL);
475 /* should either work, or not be visible from a zone */
476 ASSERT(rc == 0 || rc == ENOENT);
477 if (rc == 0)
478 VN_RELE(vp);
479 pools++;
481 nvlist_free(nv);
482 mutex_enter(&devzvol_mtx);
483 if (devzvol_isopen && pools == 0) {
484 /* clean up so zfs can be unloaded */
485 devzvol_close_zfs();
486 devzvol_isopen = B_FALSE;
488 out:
489 mutex_exit(&devzvol_mtx);
492 /*ARGSUSED3*/
493 static int
494 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
495 cred_t *cred, void *whatever, char *whichever)
497 timestruc_t now;
498 struct vattr *vap = (struct vattr *)arg;
500 sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
501 ddv->sdev_path, nm));
502 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
503 strlen(ZVOL_DIR)) == 0);
504 *vap = *sdev_getdefault_attr(VDIR);
505 gethrestime(&now);
506 vap->va_atime = now;
507 vap->va_mtime = now;
508 vap->va_ctime = now;
509 return (0);
512 /*ARGSUSED3*/
513 static int
514 devzvol_create_link(struct sdev_node *ddv, char *nm,
515 void **arg, cred_t *cred, void *whatever, char *whichever)
517 minor_t minor;
518 char *pathname = (char *)*arg;
519 int rc;
520 char *dsname;
521 char *x;
522 char str[MAXNAMELEN];
523 sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
524 ddv->sdev_path, nm));
525 dsname = devzvol_make_dsname(ddv->sdev_path, nm);
526 rc = sdev_zvol_create_minor(dsname);
527 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
528 sdev_zvol_name2minor(dsname, &minor)) {
529 sdcmn_err13(("devzvol_create_link %d", rc));
530 kmem_free(dsname, strlen(dsname) + 1);
531 return (-1);
533 kmem_free(dsname, strlen(dsname) + 1);
536 * This is a valid zvol; create a symlink that points to the
537 * minor which was created under /devices/pseudo/zfs@0
539 *pathname = '\0';
540 for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
541 (void) strcat(pathname, "../");
542 (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
543 (void) strncat(pathname, str, MAXPATHLEN);
544 if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
545 strlen(ZVOL_FULL_RDEV_DIR)) == 0)
546 (void) strcat(pathname, ",raw");
547 return (0);
550 /* Clean zvol sdev_nodes that are no longer valid. */
551 static void
552 devzvol_prunedir(struct sdev_node *ddv)
554 struct sdev_node *dv;
556 ASSERT(RW_READ_HELD(&ddv->sdev_contents));
558 sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
559 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
560 if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
561 rw_exit(&ddv->sdev_contents);
562 rw_enter(&ddv->sdev_contents, RW_WRITER);
565 dv = SDEV_FIRST_ENTRY(ddv);
566 while (dv) {
567 sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
569 switch (devzvol_validate(dv)) {
570 case SDEV_VTOR_VALID:
571 case SDEV_VTOR_SKIP:
572 dv = SDEV_NEXT_ENTRY(ddv, dv);
573 continue;
574 case SDEV_VTOR_INVALID:
575 sdcmn_err7(("prunedir: destroy invalid "
576 "node: %s\n", dv->sdev_name));
577 break;
580 if ((SDEVTOV(dv)->v_type == VDIR) &&
581 (sdev_cleandir(dv, NULL, 0) != 0)) {
582 dv = SDEV_NEXT_ENTRY(ddv, dv);
583 continue;
585 SDEV_HOLD(dv);
586 /* remove the cache node */
587 sdev_cache_update(ddv, &dv, dv->sdev_name,
588 SDEV_CACHE_DELETE);
589 SDEV_RELE(dv);
590 dv = SDEV_FIRST_ENTRY(ddv);
592 rw_downgrade(&ddv->sdev_contents);
596 * This function is used to create a dir or dev inside a zone's /dev when the
597 * zone has a zvol that is dynamically created within the zone (i.e. inside
598 * of a delegated dataset. Since there is no /devices tree within a zone,
599 * we create the chr/blk devices directly inside the zone's /dev instead of
600 * making symlinks.
602 static int
603 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
605 struct vattr vattr;
606 timestruc_t now;
607 enum vtype expected_type = VDIR;
608 dmu_objset_type_t do_type;
609 struct sdev_node *dv = NULL;
610 int res;
611 char *dsname;
613 bzero(&vattr, sizeof (vattr));
614 gethrestime(&now);
615 vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
616 vattr.va_uid = SDEV_UID_DEFAULT;
617 vattr.va_gid = SDEV_GID_DEFAULT;
618 vattr.va_type = VNON;
619 vattr.va_atime = now;
620 vattr.va_mtime = now;
621 vattr.va_ctime = now;
623 if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
624 return (ENOENT);
626 if (devzvol_objset_check(dsname, &do_type) != 0) {
628 * objset_check will succeed on any valid objset in the global
629 * zone, and any valid delegated dataset. It will fail, however,
630 * in non-global zones on explicitly whitelisted zvol devices
631 * that are outside any delegated dataset.
633 * The directories leading up to the zvol device itself will be
634 * created by prof for us in advance (and will always validate
635 * because of the matching check in devzvol_validate). The zvol
636 * device itself can't be created by prof though because in the
637 * GZ it's a symlink, and in the NGZ it is not. So, we create
638 * such zvol device files here.
640 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
641 parent->sdev_origin != NULL &&
642 prof_name_matched(nm, parent)) {
643 do_type = DMU_OST_ZVOL;
644 } else {
645 kmem_free(dsname, strlen(dsname) + 1);
646 return (ENOENT);
650 if (do_type == DMU_OST_ZVOL)
651 expected_type = VBLK;
653 if (expected_type == VDIR) {
654 vattr.va_type = VDIR;
655 vattr.va_mode = SDEV_DIRMODE_DEFAULT;
656 } else {
657 minor_t minor;
658 dev_t devnum;
659 int rc;
661 rc = sdev_zvol_create_minor(dsname);
662 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
663 sdev_zvol_name2minor(dsname, &minor)) {
664 kmem_free(dsname, strlen(dsname) + 1);
665 return (ENOENT);
668 devnum = makedevice(devzvol_major, minor);
669 vattr.va_rdev = devnum;
671 if (strstr(parent->sdev_path, "/rdsk/") != NULL)
672 vattr.va_type = VCHR;
673 else
674 vattr.va_type = VBLK;
675 vattr.va_mode = SDEV_DEVMODE_DEFAULT;
677 kmem_free(dsname, strlen(dsname) + 1);
679 rw_enter(&parent->sdev_contents, RW_WRITER);
681 res = sdev_mknode(parent, nm, &dv, &vattr,
682 NULL, NULL, kcred, SDEV_READY);
683 rw_exit(&parent->sdev_contents);
684 if (res != 0)
685 return (ENOENT);
687 SDEV_RELE(dv);
688 return (0);
691 /*ARGSUSED*/
692 static int
693 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
694 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
695 caller_context_t *ct, int *direntflags, pathname_t *realpnp)
697 enum vtype expected_type = VDIR;
698 struct sdev_node *parent = VTOSDEV(dvp);
699 char *dsname;
700 dmu_objset_type_t do_type;
701 int error;
703 sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
704 *vpp = NULL;
705 /* execute access is required to search the directory */
706 if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
707 return (error);
709 rw_enter(&parent->sdev_contents, RW_READER);
710 if (!SDEV_IS_GLOBAL(parent)) {
711 int res;
713 rw_exit(&parent->sdev_contents);
716 * If we're in the global zone and reach down into a non-global
717 * zone's /dev/zvol then this action could trigger the creation
718 * of all of the zvol devices for every zone into the non-global
719 * zone's /dev tree. This could be a big security hole. To
720 * prevent this, disallow the global zone from looking inside
721 * a non-global zones /dev/zvol. This behavior is similar to
722 * delegated datasets, which cannot be used by the global zone.
724 if (getzoneid() == GLOBAL_ZONEID)
725 return (EPERM);
727 res = prof_lookup(dvp, nm, vpp, cred);
730 * We won't find a zvol that was dynamically created inside
731 * a NGZ, within a delegated dataset, in the zone's dev profile
732 * but prof_lookup will also find it via sdev_cache_lookup.
734 if (res == ENOENT) {
736 * We have to create the sdev node for the dymamically
737 * created zvol.
739 if (devzvol_mk_ngz_node(parent, nm) != 0)
740 return (ENOENT);
741 res = prof_lookup(dvp, nm, vpp, cred);
744 return (res);
748 * Don't let the global-zone style lookup succeed here when we're not
749 * running in the global zone. This can happen because prof calls into
750 * us (in prof_filldir) trying to create an explicitly passed-through
751 * zvol device outside any delegated dataset.
753 * We have to stop this here or else we will create prof shadows of
754 * the global zone symlink, which will make no sense at all in the
755 * non-global zone (it has no /devices for the symlink to point at).
757 * These zvols will be created later (at access time) by mk_ngz_node
758 * instead. The dirs leading up to them will be created by prof
759 * internally.
761 * We have to return EPERM here, because ENOENT is given special
762 * meaning by prof in this context.
764 if (getzoneid() != GLOBAL_ZONEID) {
765 rw_exit(&parent->sdev_contents);
766 return (EPERM);
769 dsname = devzvol_make_dsname(parent->sdev_path, nm);
770 rw_exit(&parent->sdev_contents);
771 sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
772 if (dsname) {
773 error = devzvol_objset_check(dsname, &do_type);
774 if (error != 0) {
775 error = ENOENT;
776 goto out;
778 if (do_type == DMU_OST_ZVOL)
779 expected_type = VLNK;
782 * the callbacks expect:
784 * parent->sdev_path nm
785 * /dev/zvol {r}dsk
786 * /dev/zvol/{r}dsk <pool name>
787 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
789 * sdev_name is always last path component of sdev_path
791 if (expected_type == VDIR) {
792 error = devname_lookup_func(parent, nm, vpp, cred,
793 devzvol_create_dir, SDEV_VATTR);
794 } else {
795 error = devname_lookup_func(parent, nm, vpp, cred,
796 devzvol_create_link, SDEV_VLINK);
798 sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
799 ASSERT(error || ((*vpp)->v_type == expected_type));
800 out:
801 if (dsname)
802 kmem_free(dsname, strlen(dsname) + 1);
803 sdcmn_err13(("devzvol_lookup %d", error));
804 return (error);
808 * We allow create to find existing nodes
809 * - if the node doesn't exist - EROFS
810 * - creating an existing dir read-only succeeds, otherwise EISDIR
811 * - exclusive creates fail - EEXIST
813 /*ARGSUSED2*/
814 static int
815 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
816 int mode, struct vnode **vpp, struct cred *cred, int flag,
817 caller_context_t *ct, vsecattr_t *vsecp)
819 int error;
820 struct vnode *vp;
822 *vpp = NULL;
824 error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
825 NULL);
826 if (error == 0) {
827 if (excl == EXCL)
828 error = EEXIST;
829 else if (vp->v_type == VDIR && (mode & VWRITE))
830 error = EISDIR;
831 else
832 error = VOP_ACCESS(vp, mode, 0, cred, ct);
834 if (error) {
835 VN_RELE(vp);
836 } else
837 *vpp = vp;
838 } else if (error == ENOENT) {
839 error = EROFS;
842 return (error);
845 void sdev_iter_snapshots(struct vnode *dvp, char *name);
847 void
848 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
850 zfs_cmd_t *zc;
851 int rc;
853 sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
854 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
855 (void) strcpy(zc->zc_name, name);
857 while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
858 struct vnode *vpp;
859 char *ptr;
861 sdcmn_err13((" name %s", zc->zc_name));
862 if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
863 goto skip;
864 ptr = strrchr(zc->zc_name, '/') + 1;
865 rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
866 kcred, NULL, NULL, NULL);
867 if (rc == 0) {
868 VN_RELE(vpp);
869 } else if (rc == ENOENT) {
870 goto skip;
871 } else {
873 * EBUSY == problem with zvols's dmu holds?
874 * EPERM when in a NGZ and traversing up and out.
876 goto skip;
878 if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
879 zc->zc_objset_stats.dds_type == DMU_OST_ZVOL &&
880 devzvol_snaps_allowed)
881 sdev_iter_snapshots(dvp, zc->zc_name);
882 skip:
883 (void) strcpy(zc->zc_name, name);
885 kmem_free(zc, sizeof (zfs_cmd_t));
888 void
889 sdev_iter_snapshots(struct vnode *dvp, char *name)
891 sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
894 /*ARGSUSED4*/
895 static int
896 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
897 int *eofp, caller_context_t *ct_unused, int flags_unused)
899 struct sdev_node *sdvp = VTOSDEV(dvp);
900 char *ptr;
902 sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
903 sdvp->sdev_name));
905 if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
906 struct vnode *vp;
908 rw_exit(&sdvp->sdev_contents);
909 (void) devname_lookup_func(sdvp, "dsk", &vp, cred,
910 devzvol_create_dir, SDEV_VATTR);
911 VN_RELE(vp);
912 (void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
913 devzvol_create_dir, SDEV_VATTR);
914 VN_RELE(vp);
915 rw_enter(&sdvp->sdev_contents, RW_READER);
916 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
918 if (uiop->uio_offset == 0)
919 devzvol_prunedir(sdvp);
920 ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
921 if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
922 rw_exit(&sdvp->sdev_contents);
923 devzvol_create_pool_dirs(dvp);
924 rw_enter(&sdvp->sdev_contents, RW_READER);
925 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
928 ptr = strchr(ptr + 1, '/');
929 if (ptr == NULL)
930 return (ENOENT);
931 ptr++;
932 rw_exit(&sdvp->sdev_contents);
933 sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
934 rw_enter(&sdvp->sdev_contents, RW_READER);
935 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
938 const fs_operation_def_t devzvol_vnodeops_tbl[] = {
939 VOPNAME_READDIR, { .vop_readdir = devzvol_readdir },
940 VOPNAME_LOOKUP, { .vop_lookup = devzvol_lookup },
941 VOPNAME_CREATE, { .vop_create = devzvol_create },
942 VOPNAME_RENAME, { .error = fs_nosys },
943 VOPNAME_MKDIR, { .error = fs_nosys },
944 VOPNAME_RMDIR, { .error = fs_nosys },
945 VOPNAME_REMOVE, { .error = fs_nosys },
946 VOPNAME_SYMLINK, { .error = fs_nosys },
947 NULL, NULL