3329 spa_sync() spends 10-20% of its time in spa_free_sync_cb()
[unleashed.git] / usr / src / uts / common / fs / lookup.c
blob6819509d004e73b5969c554f85c662cb574094df
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
39 #include <sys/types.h>
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/cpuvar.h>
43 #include <sys/errno.h>
44 #include <sys/cred.h>
45 #include <sys/user.h>
46 #include <sys/uio.h>
47 #include <sys/vfs.h>
48 #include <sys/vnode.h>
49 #include <sys/pathname.h>
50 #include <sys/proc.h>
51 #include <sys/vtrace.h>
52 #include <sys/sysmacros.h>
53 #include <sys/debug.h>
54 #include <sys/dirent.h>
55 #include <c2/audit.h>
56 #include <sys/zone.h>
57 #include <sys/dnlc.h>
58 #include <sys/fs/snode.h>
/*
 * Controls whether paths are stored with vnodes.  Initialized to 1.
 * NOTE(review): presumably a non-zero value enables path storage; no
 * consumer of this tunable is visible in this part of the file -- confirm
 * against its users before relying on that.
 */
int vfs_vnode_path = 1;
63 int
64 lookupname(
65 char *fnamep,
66 enum uio_seg seg,
67 int followlink,
68 vnode_t **dirvpp,
69 vnode_t **compvpp)
71 return (lookupnameatcred(fnamep, seg, followlink, dirvpp, compvpp, NULL,
72 CRED()));
76 * Lookup the user file name,
77 * Handle allocation and freeing of pathname buffer, return error.
79 int
80 lookupnameatcred(
81 char *fnamep, /* user pathname */
82 enum uio_seg seg, /* addr space that name is in */
83 int followlink, /* follow sym links */
84 vnode_t **dirvpp, /* ret for ptr to parent dir vnode */
85 vnode_t **compvpp, /* ret for ptr to component vnode */
86 vnode_t *startvp, /* start path search from vp */
87 cred_t *cr) /* credential */
89 char namebuf[TYPICALMAXPATHLEN];
90 struct pathname lookpn;
91 int error;
93 error = pn_get_buf(fnamep, seg, &lookpn, namebuf, sizeof (namebuf));
94 if (error == 0) {
95 error = lookuppnatcred(&lookpn, NULL, followlink,
96 dirvpp, compvpp, startvp, cr);
98 if (error == ENAMETOOLONG) {
100 * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
102 if (error = pn_get(fnamep, seg, &lookpn))
103 return (error);
104 error = lookuppnatcred(&lookpn, NULL, followlink,
105 dirvpp, compvpp, startvp, cr);
106 pn_free(&lookpn);
109 return (error);
113 lookupnameat(char *fnamep, enum uio_seg seg, int followlink,
114 vnode_t **dirvpp, vnode_t **compvpp, vnode_t *startvp)
116 return (lookupnameatcred(fnamep, seg, followlink, dirvpp, compvpp,
117 startvp, CRED()));
121 lookuppn(
122 struct pathname *pnp,
123 struct pathname *rpnp,
124 int followlink,
125 vnode_t **dirvpp,
126 vnode_t **compvpp)
128 return (lookuppnatcred(pnp, rpnp, followlink, dirvpp, compvpp, NULL,
129 CRED()));
133 * Lookup the user file name from a given vp, using a specific credential.
136 lookuppnatcred(
137 struct pathname *pnp, /* pathname to lookup */
138 struct pathname *rpnp, /* if non-NULL, return resolved path */
139 int followlink, /* (don't) follow sym links */
140 vnode_t **dirvpp, /* ptr for parent vnode */
141 vnode_t **compvpp, /* ptr for entry vnode */
142 vnode_t *startvp, /* start search from this vp */
143 cred_t *cr) /* user credential */
145 vnode_t *vp; /* current directory vp */
146 vnode_t *rootvp;
147 proc_t *p = curproc;
149 if (pnp->pn_pathlen == 0)
150 return (ENOENT);
152 mutex_enter(&p->p_lock); /* for u_rdir and u_cdir */
153 if ((rootvp = PTOU(p)->u_rdir) == NULL)
154 rootvp = rootdir;
155 else if (rootvp != rootdir) /* no need to VN_HOLD rootdir */
156 VN_HOLD(rootvp);
158 if (pnp->pn_path[0] == '/') {
159 vp = rootvp;
160 } else {
161 vp = (startvp == NULL) ? PTOU(p)->u_cdir : startvp;
163 VN_HOLD(vp);
164 mutex_exit(&p->p_lock);
167 * Skip over leading slashes
169 if (pnp->pn_path[0] == '/') {
170 do {
171 pnp->pn_path++;
172 pnp->pn_pathlen--;
173 } while (pnp->pn_path[0] == '/');
176 return (lookuppnvp(pnp, rpnp, followlink, dirvpp,
177 compvpp, rootvp, vp, cr));
181 lookuppnat(struct pathname *pnp, struct pathname *rpnp,
182 int followlink, vnode_t **dirvpp, vnode_t **compvpp,
183 vnode_t *startvp)
185 return (lookuppnatcred(pnp, rpnp, followlink, dirvpp, compvpp, startvp,
186 CRED()));
/*
 * Private flag to do our getcwd() dirty work.
 *
 * When LOOKUP_CHECKREAD is set in the flags passed to lookuppnvp(), each
 * directory searched must also pass a VREAD access check; dirtopath()
 * likewise asks for VREAD | VEXEC instead of just VEXEC on each parent.
 * LOOKUP_MASK is the complement of the private flag.
 * NOTE(review): no user of LOOKUP_MASK is visible in this part of the
 * file; presumably it strips the private bit from caller-supplied flags.
 */
#define LOOKUP_CHECKREAD 0x10
#define LOOKUP_MASK (~LOOKUP_CHECKREAD)
194 * Starting at current directory, translate pathname pnp to end.
195 * Leave pathname of final component in pnp, return the vnode
196 * for the final component in *compvpp, and return the vnode
197 * for the parent of the final component in dirvpp.
199 * This is the central routine in pathname translation and handles
200 * multiple components in pathnames, separating them at /'s. It also
201 * implements mounted file systems and processes symbolic links.
203 * vp is the vnode where the directory search should start.
205 * Reference counts: vp must be held prior to calling this function. rootvp
206 * should only be held if rootvp != rootdir.
209 lookuppnvp(
210 struct pathname *pnp, /* pathname to lookup */
211 struct pathname *rpnp, /* if non-NULL, return resolved path */
212 int flags, /* follow symlinks */
213 vnode_t **dirvpp, /* ptr for parent vnode */
214 vnode_t **compvpp, /* ptr for entry vnode */
215 vnode_t *rootvp, /* rootvp */
216 vnode_t *vp, /* directory to start search at */
217 cred_t *cr) /* user's credential */
219 vnode_t *cvp; /* current component vp */
220 vnode_t *tvp; /* addressable temp ptr */
221 char component[MAXNAMELEN]; /* buffer for component (incl null) */
222 int error;
223 int nlink;
224 int lookup_flags;
225 struct pathname presrvd; /* case preserved name */
226 struct pathname *pp = NULL;
227 vnode_t *startvp;
228 vnode_t *zonevp = curproc->p_zone->zone_rootvp; /* zone root */
229 int must_be_directory = 0;
230 boolean_t retry_with_kcred;
231 uint32_t auditing = AU_AUDITING();
233 CPU_STATS_ADDQ(CPU, sys, namei, 1);
234 nlink = 0;
235 cvp = NULL;
236 if (rpnp)
237 rpnp->pn_pathlen = 0;
239 lookup_flags = dirvpp ? LOOKUP_DIR : 0;
240 if (flags & FIGNORECASE) {
241 lookup_flags |= FIGNORECASE;
242 pn_alloc(&presrvd);
243 pp = &presrvd;
246 if (auditing)
247 audit_anchorpath(pnp, vp == rootvp);
250 * Eliminate any trailing slashes in the pathname.
251 * If there are any, we must follow all symlinks.
252 * Also, we must guarantee that the last component is a directory.
254 if (pn_fixslash(pnp)) {
255 flags |= FOLLOW;
256 must_be_directory = 1;
259 startvp = vp;
260 next:
261 retry_with_kcred = B_FALSE;
264 * Make sure we have a directory.
266 if (vp->v_type != VDIR) {
267 error = ENOTDIR;
268 goto bad;
271 if (rpnp && VN_CMP(vp, rootvp))
272 (void) pn_set(rpnp, "/");
275 * Process the next component of the pathname.
277 if (error = pn_getcomponent(pnp, component)) {
278 goto bad;
282 * Handle "..": two special cases.
283 * 1. If we're at the root directory (e.g. after chroot or
284 * zone_enter) then change ".." to "." so we can't get
285 * out of this subtree.
286 * 2. If this vnode is the root of a mounted file system,
287 * then replace it with the vnode that was mounted on
288 * so that we take the ".." in the other file system.
290 if (component[0] == '.' && component[1] == '.' && component[2] == 0) {
291 checkforroot:
292 if (VN_CMP(vp, rootvp) || VN_CMP(vp, zonevp)) {
293 component[1] = '\0';
294 } else if (vp->v_flag & VROOT) {
295 vfs_t *vfsp;
296 cvp = vp;
299 * While we deal with the vfs pointer from the vnode
300 * the filesystem could have been forcefully unmounted
301 * and the vnode's v_vfsp could have been invalidated
302 * by VFS_UNMOUNT. Hence, we cache v_vfsp and use it
303 * with vfs_rlock_wait/vfs_unlock.
304 * It is safe to use the v_vfsp even it is freed by
305 * VFS_UNMOUNT because vfs_rlock_wait/vfs_unlock
306 * do not dereference v_vfsp. It is just used as a
307 * magic cookie.
308 * One more corner case here is the memory getting
309 * reused for another vfs structure. In this case
310 * lookuppnvp's vfs_rlock_wait will succeed, domount's
311 * vfs_lock will fail and domount will bail out with an
312 * error (EBUSY).
314 vfsp = cvp->v_vfsp;
317 * This lock is used to synchronize
318 * mounts/unmounts and lookups.
319 * Threads doing mounts/unmounts hold the
320 * writers version vfs_lock_wait().
323 vfs_rlock_wait(vfsp);
326 * If this vnode is on a file system that
327 * has been forcibly unmounted,
328 * we can't proceed. Cancel this operation
329 * and return EIO.
331 * vfs_vnodecovered is NULL if unmounted.
332 * Currently, nfs uses VFS_UNMOUNTED to
333 * check if it's a forced-umount. Keep the
334 * same checking here as well even though it
335 * may not be needed.
337 if (((vp = cvp->v_vfsp->vfs_vnodecovered) == NULL) ||
338 (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
339 vfs_unlock(vfsp);
340 VN_RELE(cvp);
341 if (pp)
342 pn_free(pp);
343 return (EIO);
345 VN_HOLD(vp);
346 vfs_unlock(vfsp);
347 VN_RELE(cvp);
348 cvp = NULL;
350 * Crossing mount points. For eg: We are doing
351 * a lookup of ".." for file systems root vnode
352 * mounted here, and VOP_LOOKUP() (with covered vnode)
353 * will be on underlying file systems mount point
354 * vnode. Set retry_with_kcred flag as we might end
355 * up doing VOP_LOOKUP() with kcred if required.
357 retry_with_kcred = B_TRUE;
358 goto checkforroot;
363 * LOOKUP_CHECKREAD is a private flag used by vnodetopath() to indicate
364 * that we need to have read permission on every directory in the entire
365 * path. This is used to ensure that a forward-lookup of a cached value
366 * has the same effect as a reverse-lookup when the cached value cannot
367 * be found.
369 if ((flags & LOOKUP_CHECKREAD) &&
370 (error = VOP_ACCESS(vp, VREAD, 0, cr, NULL)) != 0)
371 goto bad;
374 * Perform a lookup in the current directory.
376 error = VOP_LOOKUP(vp, component, &tvp, pnp, lookup_flags,
377 rootvp, cr, NULL, NULL, pp);
380 * Retry with kcred - If crossing mount points & error is EACCES.
382 * If we are crossing mount points here and doing ".." lookup,
383 * VOP_LOOKUP() might fail if the underlying file systems
384 * mount point has no execute permission. In cases like these,
385 * we retry VOP_LOOKUP() by giving as much privilage as possible
386 * by passing kcred credentials.
388 * In case of hierarchical file systems, passing kcred still may
389 * or may not work.
390 * For eg: UFS FS --> Mount NFS FS --> Again mount UFS on some
391 * directory inside NFS FS.
393 if ((error == EACCES) && retry_with_kcred)
394 error = VOP_LOOKUP(vp, component, &tvp, pnp, lookup_flags,
395 rootvp, zone_kcred(), NULL, NULL, pp);
397 cvp = tvp;
398 if (error) {
399 cvp = NULL;
401 * On error, return hard error if
402 * (a) we're not at the end of the pathname yet, or
403 * (b) the caller didn't want the parent directory, or
404 * (c) we failed for some reason other than a missing entry.
406 if (pn_pathleft(pnp) || dirvpp == NULL || error != ENOENT)
407 goto bad;
408 if (auditing) { /* directory access */
409 if (error = audit_savepath(pnp, vp, vp, error, cr))
410 goto bad_noaudit;
413 pn_setlast(pnp);
415 * We inform the caller that the desired entry must be
416 * a directory by adding a '/' to the component name.
418 if (must_be_directory && (error = pn_addslash(pnp)) != 0)
419 goto bad;
420 *dirvpp = vp;
421 if (compvpp != NULL)
422 *compvpp = NULL;
423 if (rootvp != rootdir)
424 VN_RELE(rootvp);
425 if (pp)
426 pn_free(pp);
427 return (0);
431 * Traverse mount points.
432 * XXX why don't we need to hold a read lock here (call vn_vfsrlock)?
433 * What prevents a concurrent update to v_vfsmountedhere?
434 * Possible answer: if mounting, we might not see the mount
435 * if it is concurrently coming into existence, but that's
436 * really not much different from the thread running a bit slower.
437 * If unmounting, we may get into traverse() when we shouldn't,
438 * but traverse() will catch this case for us.
439 * (For this to work, fetching v_vfsmountedhere had better
440 * be atomic!)
442 if (vn_mountedvfs(cvp) != NULL) {
443 tvp = cvp;
444 if ((error = traverse(&tvp)) != 0) {
446 * It is required to assign cvp here, because
447 * traverse() will return a held vnode which
448 * may different than the vnode that was passed
449 * in (even in the error case). If traverse()
450 * changes the vnode it releases the original,
451 * and holds the new one.
453 cvp = tvp;
454 goto bad;
456 cvp = tvp;
460 * If we hit a symbolic link and there is more path to be
461 * translated or this operation does not wish to apply
462 * to a link, then place the contents of the link at the
463 * front of the remaining pathname.
465 if (cvp->v_type == VLNK && ((flags & FOLLOW) || pn_pathleft(pnp))) {
466 struct pathname linkpath;
468 if (++nlink > MAXSYMLINKS) {
469 error = ELOOP;
470 goto bad;
472 pn_alloc(&linkpath);
473 if (error = pn_getsymlink(cvp, &linkpath, cr)) {
474 pn_free(&linkpath);
475 goto bad;
478 if (auditing)
479 audit_symlink(pnp, &linkpath);
481 if (pn_pathleft(&linkpath) == 0)
482 (void) pn_set(&linkpath, ".");
483 error = pn_insert(pnp, &linkpath, strlen(component));
484 pn_free(&linkpath);
485 if (error)
486 goto bad;
487 VN_RELE(cvp);
488 cvp = NULL;
489 if (pnp->pn_pathlen == 0) {
490 error = ENOENT;
491 goto bad;
493 if (pnp->pn_path[0] == '/') {
494 do {
495 pnp->pn_path++;
496 pnp->pn_pathlen--;
497 } while (pnp->pn_path[0] == '/');
498 VN_RELE(vp);
499 vp = rootvp;
500 VN_HOLD(vp);
502 if (auditing)
503 audit_anchorpath(pnp, vp == rootvp);
504 if (pn_fixslash(pnp)) {
505 flags |= FOLLOW;
506 must_be_directory = 1;
508 goto next;
512 * If rpnp is non-NULL, remember the resolved path name therein.
513 * Do not include "." components. Collapse occurrences of
514 * "previous/..", so long as "previous" is not itself "..".
515 * Exhausting rpnp results in error ENAMETOOLONG.
517 if (rpnp && strcmp(component, ".") != 0) {
518 size_t len;
520 if (strcmp(component, "..") == 0 &&
521 rpnp->pn_pathlen != 0 &&
522 !((rpnp->pn_pathlen > 2 &&
523 strncmp(rpnp->pn_path+rpnp->pn_pathlen-3, "/..", 3) == 0) ||
524 (rpnp->pn_pathlen == 2 &&
525 strncmp(rpnp->pn_path, "..", 2) == 0))) {
526 while (rpnp->pn_pathlen &&
527 rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
528 rpnp->pn_pathlen--;
529 if (rpnp->pn_pathlen > 1)
530 rpnp->pn_pathlen--;
531 rpnp->pn_path[rpnp->pn_pathlen] = '\0';
532 } else {
533 if (rpnp->pn_pathlen != 0 &&
534 rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
535 rpnp->pn_path[rpnp->pn_pathlen++] = '/';
536 if (flags & FIGNORECASE) {
538 * Return the case-preserved name
539 * within the resolved path.
541 error = copystr(pp->pn_buf,
542 rpnp->pn_path + rpnp->pn_pathlen,
543 rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
544 } else {
545 error = copystr(component,
546 rpnp->pn_path + rpnp->pn_pathlen,
547 rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
549 if (error) /* copystr() returns ENAMETOOLONG */
550 goto bad;
551 rpnp->pn_pathlen += (len - 1);
552 ASSERT(rpnp->pn_bufsize > rpnp->pn_pathlen);
557 * If no more components, return last directory (if wanted) and
558 * last component (if wanted).
560 if (pn_pathleft(pnp) == 0) {
562 * If there was a trailing slash in the pathname,
563 * make sure the last component is a directory.
565 if (must_be_directory && cvp->v_type != VDIR) {
566 error = ENOTDIR;
567 goto bad;
569 if (dirvpp != NULL) {
571 * Check that we have the real parent and not
572 * an alias of the last component.
574 if (vn_compare(vp, cvp)) {
575 if (auditing)
576 (void) audit_savepath(pnp, cvp, vp,
577 EINVAL, cr);
578 pn_setlast(pnp);
579 VN_RELE(vp);
580 VN_RELE(cvp);
581 if (rootvp != rootdir)
582 VN_RELE(rootvp);
583 if (pp)
584 pn_free(pp);
585 return (EINVAL);
587 *dirvpp = vp;
588 } else
589 VN_RELE(vp);
590 if (auditing)
591 (void) audit_savepath(pnp, cvp, vp, 0, cr);
592 if (pnp->pn_path == pnp->pn_buf)
593 (void) pn_set(pnp, ".");
594 else
595 pn_setlast(pnp);
596 if (rpnp) {
597 if (VN_CMP(cvp, rootvp))
598 (void) pn_set(rpnp, "/");
599 else if (rpnp->pn_pathlen == 0)
600 (void) pn_set(rpnp, ".");
603 if (compvpp != NULL)
604 *compvpp = cvp;
605 else
606 VN_RELE(cvp);
607 if (rootvp != rootdir)
608 VN_RELE(rootvp);
609 if (pp)
610 pn_free(pp);
611 return (0);
615 * Skip over slashes from end of last component.
617 while (pnp->pn_path[0] == '/') {
618 pnp->pn_path++;
619 pnp->pn_pathlen--;
623 * Searched through another level of directory:
624 * release previous directory handle and save new (result
625 * of lookup) as current directory.
627 VN_RELE(vp);
628 vp = cvp;
629 cvp = NULL;
630 goto next;
632 bad:
633 if (auditing) /* reached end of path */
634 (void) audit_savepath(pnp, cvp, vp, error, cr);
635 bad_noaudit:
637 * Error. Release vnodes and return.
639 if (cvp)
640 VN_RELE(cvp);
642 * If the error was ESTALE and the current directory to look in
643 * was the root for this lookup, the root for a mounted file
644 * system, or the starting directory for lookups, then
645 * return ENOENT instead of ESTALE. In this case, no recovery
646 * is possible by the higher level. If ESTALE was returned for
647 * some intermediate directory along the path, then recovery
648 * is potentially possible and retrying from the higher level
649 * will either correct the situation by purging stale cache
650 * entries or eventually get back to the point where no recovery
651 * is possible.
653 if (error == ESTALE &&
654 (VN_CMP(vp, rootvp) || (vp->v_flag & VROOT) || vp == startvp))
655 error = ENOENT;
656 VN_RELE(vp);
657 if (rootvp != rootdir)
658 VN_RELE(rootvp);
659 if (pp)
660 pn_free(pp);
661 return (error);
665 * Traverse a mount point. Routine accepts a vnode pointer as a reference
666 * parameter and performs the indirection, releasing the original vnode.
669 traverse(vnode_t **cvpp)
671 int error = 0;
672 vnode_t *cvp;
673 vnode_t *tvp;
674 vfs_t *vfsp;
676 cvp = *cvpp;
679 * If this vnode is mounted on, then we transparently indirect
680 * to the vnode which is the root of the mounted file system.
681 * Before we do this we must check that an unmount is not in
682 * progress on this vnode.
685 for (;;) {
687 * Try to read lock the vnode. If this fails because
688 * the vnode is already write locked, then check to
689 * see whether it is the current thread which locked
690 * the vnode. If it is not, then read lock the vnode
691 * by waiting to acquire the lock.
693 * The code path in domount() is an example of support
694 * which needs to look up two pathnames and locks one
695 * of them in between the two lookups.
697 error = vn_vfsrlock(cvp);
698 if (error) {
699 if (!vn_vfswlock_held(cvp))
700 error = vn_vfsrlock_wait(cvp);
701 if (error != 0) {
703 * lookuppn() expects a held vnode to be
704 * returned because it promptly calls
705 * VN_RELE after the error return
707 *cvpp = cvp;
708 return (error);
713 * Reached the end of the mount chain?
715 vfsp = vn_mountedvfs(cvp);
716 if (vfsp == NULL) {
717 vn_vfsunlock(cvp);
718 break;
722 * The read lock must be held across the call to VFS_ROOT() to
723 * prevent a concurrent unmount from destroying the vfs.
725 error = VFS_ROOT(vfsp, &tvp);
726 vn_vfsunlock(cvp);
728 if (error)
729 break;
731 VN_RELE(cvp);
733 cvp = tvp;
736 *cvpp = cvp;
737 return (error);
741 * Return the lowermost vnode if this is a mountpoint.
743 static vnode_t *
744 vn_under(vnode_t *vp)
746 vnode_t *uvp;
747 vfs_t *vfsp;
749 while (vp->v_flag & VROOT) {
751 vfsp = vp->v_vfsp;
752 vfs_rlock_wait(vfsp);
753 if ((uvp = vfsp->vfs_vnodecovered) == NULL ||
754 (vfsp->vfs_flag & VFS_UNMOUNTED)) {
755 vfs_unlock(vfsp);
756 break;
758 VN_HOLD(uvp);
759 vfs_unlock(vfsp);
760 VN_RELE(vp);
761 vp = uvp;
764 return (vp);
767 static int
768 vnode_match(vnode_t *v1, vnode_t *v2, cred_t *cr)
770 vattr_t v1attr, v2attr;
773 * If we have a device file, check to see if is a cloned open of the
774 * same device. For self-cloning devices, the major numbers will match.
775 * For devices cloned through the 'clone' driver, the minor number of
776 * the source device will be the same as the major number of the cloned
777 * device.
779 if ((v1->v_type == VCHR || v1->v_type == VBLK) &&
780 v1->v_type == v2->v_type) {
781 if ((spec_is_selfclone(v1) || spec_is_selfclone(v2)) &&
782 getmajor(v1->v_rdev) == getmajor(v2->v_rdev))
783 return (1);
785 if (spec_is_clone(v1) &&
786 getmajor(v1->v_rdev) == getminor(v2->v_rdev))
787 return (1);
789 if (spec_is_clone(v2) &&
790 getmajor(v2->v_rdev) == getminor(v1->v_rdev))
791 return (1);
794 v1attr.va_mask = v2attr.va_mask = AT_TYPE;
797 * This check for symbolic links handles the pseudo-symlinks in procfs.
798 * These particular links have v_type of VDIR, but the attributes have a
799 * type of VLNK. We need to avoid these links because otherwise if we
800 * are currently in '/proc/self/fd', then '/proc/self/cwd' will compare
801 * as the same vnode.
803 if (VOP_GETATTR(v1, &v1attr, 0, cr, NULL) != 0 ||
804 VOP_GETATTR(v2, &v2attr, 0, cr, NULL) != 0 ||
805 v1attr.va_type == VLNK || v2attr.va_type == VLNK)
806 return (0);
808 v1attr.va_mask = v2attr.va_mask = AT_TYPE | AT_FSID | AT_NODEID;
810 if (VOP_GETATTR(v1, &v1attr, ATTR_REAL, cr, NULL) != 0 ||
811 VOP_GETATTR(v2, &v2attr, ATTR_REAL, cr, NULL) != 0)
812 return (0);
814 return (v1attr.va_fsid == v2attr.va_fsid &&
815 v1attr.va_nodeid == v2attr.va_nodeid);
820 * Find the entry in the directory corresponding to the target vnode.
823 dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
824 size_t dlen, dirent64_t **rdp)
826 size_t dbuflen;
827 struct iovec iov;
828 struct uio uio;
829 int error;
830 int eof;
831 vnode_t *cmpvp;
832 struct dirent64 *dp;
833 pathname_t pnp;
835 ASSERT(dvp->v_type == VDIR);
838 * This is necessary because of the strange semantics of VOP_LOOKUP().
840 bzero(&pnp, sizeof (pnp));
842 eof = 0;
844 uio.uio_iov = &iov;
845 uio.uio_iovcnt = 1;
846 uio.uio_segflg = UIO_SYSSPACE;
847 uio.uio_fmode = 0;
848 uio.uio_extflg = UIO_COPY_CACHED;
849 uio.uio_loffset = 0;
851 if ((error = VOP_ACCESS(dvp, VREAD, 0, cr, NULL)) != 0)
852 return (error);
854 while (!eof) {
855 uio.uio_resid = dlen;
856 iov.iov_base = dbuf;
857 iov.iov_len = dlen;
859 (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
860 error = VOP_READDIR(dvp, &uio, cr, &eof, NULL, 0);
861 VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
863 dbuflen = dlen - uio.uio_resid;
865 if (error || dbuflen == 0)
866 break;
868 dp = (dirent64_t *)dbuf;
869 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
871 * Ignore '.' and '..' entries
873 if (strcmp(dp->d_name, ".") == 0 ||
874 strcmp(dp->d_name, "..") == 0) {
875 dp = (dirent64_t *)((intptr_t)dp +
876 dp->d_reclen);
877 continue;
880 error = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0,
881 vrootp, cr, NULL, NULL, NULL);
884 * We only want to bail out if there was an error other
885 * than ENOENT. Otherwise, it could be that someone
886 * just removed an entry since the readdir() call, and
887 * the entry we want is further on in the directory.
889 if (error == 0) {
890 if (vnode_match(tvp, cmpvp, cr)) {
891 VN_RELE(cmpvp);
892 *rdp = dp;
893 return (0);
896 VN_RELE(cmpvp);
897 } else if (error != ENOENT) {
898 return (error);
901 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
906 * Something strange has happened, this directory does not contain the
907 * specified vnode. This should never happen in the normal case, since
908 * we ensured that dvp is the parent of vp. This is possible in some
909 * rare conditions (races and the special .zfs directory).
911 if (error == 0) {
912 error = VOP_LOOKUP(dvp, ".zfs", &cmpvp, &pnp, 0, vrootp, cr,
913 NULL, NULL, NULL);
914 if (error == 0) {
915 if (vnode_match(tvp, cmpvp, cr)) {
916 (void) strcpy(dp->d_name, ".zfs");
917 dp->d_reclen = strlen(".zfs");
918 dp->d_off = 2;
919 dp->d_ino = 1;
920 *rdp = dp;
921 } else {
922 error = ENOENT;
924 VN_RELE(cmpvp);
928 return (error);
932 * Given a global path (from rootdir), and a vnode that is the current root,
933 * return the portion of the path that is beneath the current root or NULL on
934 * failure. The path MUST be a resolved path (no '..' entries or symlinks),
935 * otherwise this function will fail.
937 static char *
938 localpath(char *path, struct vnode *vrootp, cred_t *cr)
940 vnode_t *vp;
941 vnode_t *cvp;
942 char component[MAXNAMELEN];
943 char *ret = NULL;
944 pathname_t pn;
947 * We use vn_compare() instead of VN_CMP() in order to detect lofs
948 * mounts and stacked vnodes.
950 if (vn_compare(vrootp, rootdir))
951 return (path);
953 if (pn_get(path, UIO_SYSSPACE, &pn) != 0)
954 return (NULL);
956 vp = rootdir;
957 VN_HOLD(vp);
959 if (vn_ismntpt(vp) && traverse(&vp) != 0) {
960 VN_RELE(vp);
961 pn_free(&pn);
962 return (NULL);
965 while (pn_pathleft(&pn)) {
966 pn_skipslash(&pn);
968 if (pn_getcomponent(&pn, component) != 0)
969 break;
971 if (VOP_LOOKUP(vp, component, &cvp, &pn, 0, rootdir, cr,
972 NULL, NULL, NULL) != 0)
973 break;
974 VN_RELE(vp);
975 vp = cvp;
977 if (vn_ismntpt(vp) && traverse(&vp) != 0)
978 break;
980 if (vn_compare(vp, vrootp)) {
981 ret = path + (pn.pn_path - pn.pn_buf);
982 break;
986 VN_RELE(vp);
987 pn_free(&pn);
989 return (ret);
993 * Given a directory, return the full, resolved path. This looks up "..",
994 * searches for the given vnode in the parent, appends the component, etc. It
995 * is used to implement vnodetopath() and getcwd() when the cached path fails.
997 static int
998 dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
999 cred_t *cr)
1001 pathname_t pn, rpn, emptypn;
1002 vnode_t *cmpvp, *pvp = NULL;
1003 vnode_t *startvp = vp;
1004 int err = 0, vprivs;
1005 size_t complen;
1006 char *dbuf;
1007 dirent64_t *dp;
1008 char *bufloc;
1009 size_t dlen = DIRENT64_RECLEN(MAXPATHLEN);
1010 refstr_t *mntpt;
1012 /* Operation only allowed on directories */
1013 ASSERT(vp->v_type == VDIR);
1015 /* We must have at least enough space for "/" */
1016 if (buflen < 2)
1017 return (ENAMETOOLONG);
1019 /* Start at end of string with terminating null */
1020 bufloc = &buf[buflen - 1];
1021 *bufloc = '\0';
1023 pn_alloc(&pn);
1024 pn_alloc(&rpn);
1025 dbuf = kmem_alloc(dlen, KM_SLEEP);
1026 bzero(&emptypn, sizeof (emptypn));
1029 * Begin with an additional reference on vp. This will be decremented
1030 * during the loop.
1032 VN_HOLD(vp);
1034 for (;;) {
1036 * Return if we've reached the root. If the buffer is empty,
1037 * return '/'. We explicitly don't use vn_compare(), since it
1038 * compares the real vnodes. A lofs mount of '/' would produce
1039 * incorrect results otherwise.
1041 if (VN_CMP(vrootp, vp)) {
1042 if (*bufloc == '\0')
1043 *--bufloc = '/';
1044 break;
1048 * If we've reached the VFS root, something has gone wrong. We
1049 * should have reached the root in the above check. The only
1050 * explantation is that 'vp' is not contained withing the given
1051 * root, in which case we return EPERM.
1053 if (VN_CMP(rootdir, vp)) {
1054 err = EPERM;
1055 goto out;
1059 * Shortcut: see if this vnode is a mountpoint. If so,
1060 * grab the path information from the vfs_t.
1062 if (vp->v_flag & VROOT) {
1064 mntpt = vfs_getmntpoint(vp->v_vfsp);
1065 if ((err = pn_set(&pn, (char *)refstr_value(mntpt)))
1066 == 0) {
1067 refstr_rele(mntpt);
1068 rpn.pn_path = rpn.pn_buf;
1071 * Ensure the mountpoint still exists.
1073 VN_HOLD(vrootp);
1074 if (vrootp != rootdir)
1075 VN_HOLD(vrootp);
1076 if (lookuppnvp(&pn, &rpn, flags, NULL,
1077 &cmpvp, vrootp, vrootp, cr) == 0) {
1079 if (VN_CMP(vp, cmpvp)) {
1080 VN_RELE(cmpvp);
1082 complen = strlen(rpn.pn_path);
1083 bufloc -= complen;
1084 if (bufloc < buf) {
1085 err = ERANGE;
1086 goto out;
1088 bcopy(rpn.pn_path, bufloc,
1089 complen);
1090 break;
1091 } else {
1092 VN_RELE(cmpvp);
1095 } else {
1096 refstr_rele(mntpt);
1101 * Shortcut: see if this vnode has correct v_path. If so,
1102 * we have the work done.
1104 mutex_enter(&vp->v_lock);
1105 if (vp->v_path != NULL) {
1107 if ((err = pn_set(&pn, vp->v_path)) == 0) {
1108 mutex_exit(&vp->v_lock);
1109 rpn.pn_path = rpn.pn_buf;
1112 * Ensure the v_path pointing to correct vnode
1114 VN_HOLD(vrootp);
1115 if (vrootp != rootdir)
1116 VN_HOLD(vrootp);
1117 if (lookuppnvp(&pn, &rpn, flags, NULL,
1118 &cmpvp, vrootp, vrootp, cr) == 0) {
1120 if (VN_CMP(vp, cmpvp)) {
1121 VN_RELE(cmpvp);
1123 complen = strlen(rpn.pn_path);
1124 bufloc -= complen;
1125 if (bufloc < buf) {
1126 err = ERANGE;
1127 goto out;
1129 bcopy(rpn.pn_path, bufloc,
1130 complen);
1131 break;
1132 } else {
1133 VN_RELE(cmpvp);
1136 } else {
1137 mutex_exit(&vp->v_lock);
1139 } else {
1140 mutex_exit(&vp->v_lock);
1144 * Shortcuts failed, search for this vnode in its parent. If
1145 * this is a mountpoint, then get the vnode underneath.
1147 if (vp->v_flag & VROOT)
1148 vp = vn_under(vp);
1149 if ((err = VOP_LOOKUP(vp, "..", &pvp, &emptypn, 0, vrootp, cr,
1150 NULL, NULL, NULL)) != 0)
1151 goto out;
1154 * With extended attributes, it's possible for a directory to
1155 * have a parent that is a regular file. Check for that here.
1157 if (pvp->v_type != VDIR) {
1158 err = ENOTDIR;
1159 goto out;
1163 * If this is true, something strange has happened. This is
1164 * only true if we are the root of a filesystem, which should
1165 * have been caught by the check above.
1167 if (VN_CMP(pvp, vp)) {
1168 err = ENOENT;
1169 goto out;
1173 * Check if we have read and search privilege so, that
1174 * we can lookup the path in the directory
1176 vprivs = (flags & LOOKUP_CHECKREAD) ? VREAD | VEXEC : VEXEC;
1177 if ((err = VOP_ACCESS(pvp, vprivs, 0, cr, NULL)) != 0) {
1178 goto out;
1182 * Try to obtain the path component from dnlc cache
1183 * before searching through the directory.
1185 if ((cmpvp = dnlc_reverse_lookup(vp, dbuf, dlen)) != NULL) {
1187 * If we got parent vnode as a result,
1188 * then the answered path is correct.
1190 if (VN_CMP(cmpvp, pvp)) {
1191 VN_RELE(cmpvp);
1192 complen = strlen(dbuf);
1193 bufloc -= complen;
1194 if (bufloc <= buf) {
1195 err = ENAMETOOLONG;
1196 goto out;
1198 bcopy(dbuf, bufloc, complen);
1200 /* Prepend a slash to the current path */
1201 *--bufloc = '/';
1203 /* And continue with the next component */
1204 VN_RELE(vp);
1205 vp = pvp;
1206 pvp = NULL;
1207 continue;
1208 } else {
1209 VN_RELE(cmpvp);
1214 * Search the parent directory for the entry corresponding to
1215 * this vnode.
1217 if ((err = dirfindvp(vrootp, pvp, vp, cr, dbuf, dlen, &dp))
1218 != 0)
1219 goto out;
1220 complen = strlen(dp->d_name);
1221 bufloc -= complen;
1222 if (bufloc <= buf) {
1223 err = ENAMETOOLONG;
1224 goto out;
1226 bcopy(dp->d_name, bufloc, complen);
1228 /* Prepend a slash to the current path. */
1229 *--bufloc = '/';
1231 /* And continue with the next component */
1232 VN_RELE(vp);
1233 vp = pvp;
1234 pvp = NULL;
1238 * Place the path at the beginning of the buffer.
1240 if (bufloc != buf)
1241 ovbcopy(bufloc, buf, buflen - (bufloc - buf));
1243 out:
1245 * If the error was ESTALE and the current directory to look in
1246 * was the root for this lookup, the root for a mounted file
1247 * system, or the starting directory for lookups, then
1248 * return ENOENT instead of ESTALE. In this case, no recovery
1249 * is possible by the higher level. If ESTALE was returned for
1250 * some intermediate directory along the path, then recovery
1251 * is potentially possible and retrying from the higher level
1252 * will either correct the situation by purging stale cache
1253 * entries or eventually get back to the point where no recovery
1254 * is possible.
1256 if (err == ESTALE &&
1257 (VN_CMP(vp, vrootp) || (vp->v_flag & VROOT) || vp == startvp))
1258 err = ENOENT;
1260 kmem_free(dbuf, dlen);
1261 VN_RELE(vp);
1262 if (pvp)
1263 VN_RELE(pvp);
1264 pn_free(&pn);
1265 pn_free(&rpn);
1267 return (err);
1271 * The additional flag, LOOKUP_CHECKREAD, is used to enforce artificial
1272 * constraints in order to be standards compliant. For example, if we have
1273 * the cached path of '/foo/bar', and '/foo' has permissions 100 (execute
1274 * only), then we can legitimately look up the path to the current working
1275 * directory without needing read permission. Existing standards tests,
1276 * however, assume that we are determining the path by repeatedly looking up
1277 * "..". We need to keep this behavior in order to maintain backwards
1278 * compatibility.
1280 static int
1281 vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen,
1282 cred_t *cr, int flags)
1284 pathname_t pn, rpn;
1285 int ret, len;
1286 vnode_t *compvp, *pvp, *realvp;
1287 proc_t *p = curproc;
1288 char path[MAXNAMELEN];
1289 int doclose = 0;
1292 * If vrootp is NULL, get the root for curproc. Callers with any other
1293 * requirements should pass in a different vrootp.
1295 if (vrootp == NULL) {
1296 mutex_enter(&p->p_lock);
1297 if ((vrootp = PTOU(p)->u_rdir) == NULL)
1298 vrootp = rootdir;
1299 VN_HOLD(vrootp);
1300 mutex_exit(&p->p_lock);
1301 } else {
1302 VN_HOLD(vrootp);
1306 * This is to get around an annoying artifact of the /proc filesystem,
1307 * which is the behavior of {cwd/root}. Trying to resolve this path
1308 * will result in /proc/pid/cwd instead of whatever the real working
1309 * directory is. We can't rely on VOP_REALVP(), since that will break
1310 * lofs. The only difference between procfs and lofs is that opening
1311 * the file will return the underling vnode in the case of procfs.
1313 if (vp->v_type == VDIR && VOP_REALVP(vp, &realvp, NULL) == 0 &&
1314 realvp != vp) {
1315 VN_HOLD(vp);
1316 if (VOP_OPEN(&vp, FREAD, cr, NULL) == 0)
1317 doclose = 1;
1318 else
1319 VN_RELE(vp);
1322 pn_alloc(&pn);
1325 * Check to see if we have a cached path in the vnode.
1327 mutex_enter(&vp->v_lock);
1328 if (vp->v_path != NULL) {
1329 (void) pn_set(&pn, vp->v_path);
1330 mutex_exit(&vp->v_lock);
1332 pn_alloc(&rpn);
1334 /* We should only cache absolute paths */
1335 ASSERT(pn.pn_buf[0] == '/');
1338 * If we are in a zone or a chroot environment, then we have to
1339 * take additional steps, since the path to the root might not
1340 * be readable with the current credentials, even though the
1341 * process can legitmately access the file. In this case, we
1342 * do the following:
1344 * lookuppnvp() with all privileges to get the resolved path.
1345 * call localpath() to get the local portion of the path, and
1346 * continue as normal.
1348 * If the the conversion to a local path fails, then we continue
1349 * as normal. This is a heuristic to make process object file
1350 * paths available from within a zone. Because lofs doesn't
1351 * support page operations, the vnode stored in the seg_t is
1352 * actually the underlying real vnode, not the lofs node itself.
1353 * Most of the time, the lofs path is the same as the underlying
1354 * vnode (for example, /usr/lib/libc.so.1).
1356 if (vrootp != rootdir) {
1357 char *local = NULL;
1358 VN_HOLD(rootdir);
1359 if (lookuppnvp(&pn, &rpn, FOLLOW,
1360 NULL, &compvp, rootdir, rootdir, kcred) == 0) {
1361 local = localpath(rpn.pn_path, vrootp,
1362 kcred);
1363 VN_RELE(compvp);
1367 * The original pn was changed through lookuppnvp().
1368 * Set it to local for next validation attempt.
1370 if (local) {
1371 (void) pn_set(&pn, local);
1372 } else {
1373 goto notcached;
1378 * We should have a local path at this point, so start the
1379 * search from the root of the current process.
1381 VN_HOLD(vrootp);
1382 if (vrootp != rootdir)
1383 VN_HOLD(vrootp);
1384 ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL,
1385 &compvp, vrootp, vrootp, cr);
1386 if (ret == 0) {
1388 * Check to see if the returned vnode is the same as
1389 * the one we expect. If not, give up.
1391 if (!vn_compare(vp, compvp) &&
1392 !vnode_match(vp, compvp, cr)) {
1393 VN_RELE(compvp);
1394 goto notcached;
1397 VN_RELE(compvp);
1400 * Return the result.
1402 if (buflen <= rpn.pn_pathlen)
1403 goto notcached;
1405 bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1);
1406 pn_free(&pn);
1407 pn_free(&rpn);
1408 VN_RELE(vrootp);
1409 if (doclose) {
1410 (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL);
1411 VN_RELE(vp);
1413 return (0);
1416 notcached:
1417 pn_free(&rpn);
1418 } else {
1419 mutex_exit(&vp->v_lock);
1422 pn_free(&pn);
1424 if (vp->v_type != VDIR) {
1426 * If we don't have a directory, try to find it in the dnlc via
1427 * reverse lookup. Once this is found, we can use the regular
1428 * directory search to find the full path.
1430 if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) {
1432 * Check if we have read privilege so, that
1433 * we can lookup the path in the directory
1435 ret = 0;
1436 if ((flags & LOOKUP_CHECKREAD)) {
1437 ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL);
1439 if (ret == 0) {
1440 ret = dirtopath(vrootp, pvp, buf, buflen,
1441 flags, cr);
1443 if (ret == 0) {
1444 len = strlen(buf);
1445 if (len + strlen(path) + 1 >= buflen) {
1446 ret = ENAMETOOLONG;
1447 } else {
1448 if (buf[len - 1] != '/')
1449 buf[len++] = '/';
1450 bcopy(path, buf + len,
1451 strlen(path) + 1);
1455 VN_RELE(pvp);
1456 } else
1457 ret = ENOENT;
1458 } else
1459 ret = dirtopath(vrootp, vp, buf, buflen, flags, cr);
1461 VN_RELE(vrootp);
1462 if (doclose) {
1463 (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL);
1464 VN_RELE(vp);
1467 return (ret);
1471 vnodetopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, cred_t *cr)
1473 return (vnodetopath_common(vrootp, vp, buf, buflen, cr, 0));
1477 dogetcwd(char *buf, size_t buflen)
1479 int ret;
1480 vnode_t *vp;
1481 vnode_t *compvp;
1482 refstr_t *cwd, *oldcwd;
1483 const char *value;
1484 pathname_t rpnp, pnp;
1485 proc_t *p = curproc;
1488 * Check to see if there is a cached version of the cwd. If so, lookup
1489 * the cached value and make sure it is the same vnode.
1491 mutex_enter(&p->p_lock);
1492 if ((cwd = PTOU(p)->u_cwd) != NULL)
1493 refstr_hold(cwd);
1494 vp = PTOU(p)->u_cdir;
1495 VN_HOLD(vp);
1496 mutex_exit(&p->p_lock);
1499 * Make sure we have permission to access the current directory.
1501 if ((ret = VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) != 0) {
1502 if (cwd != NULL)
1503 refstr_rele(cwd);
1504 VN_RELE(vp);
1505 return (ret);
1508 if (cwd) {
1509 value = refstr_value(cwd);
1510 if ((ret = pn_get((char *)value, UIO_SYSSPACE, &pnp)) != 0) {
1511 refstr_rele(cwd);
1512 VN_RELE(vp);
1513 return (ret);
1516 pn_alloc(&rpnp);
1518 if (lookuppn(&pnp, &rpnp, NO_FOLLOW, NULL, &compvp) == 0) {
1520 if (VN_CMP(vp, compvp) &&
1521 strcmp(value, rpnp.pn_path) == 0) {
1522 VN_RELE(compvp);
1523 VN_RELE(vp);
1524 pn_free(&pnp);
1525 pn_free(&rpnp);
1526 if (strlen(value) + 1 > buflen) {
1527 refstr_rele(cwd);
1528 return (ENAMETOOLONG);
1530 bcopy(value, buf, strlen(value) + 1);
1531 refstr_rele(cwd);
1532 return (0);
1535 VN_RELE(compvp);
1538 pn_free(&rpnp);
1539 pn_free(&pnp);
1541 refstr_rele(cwd);
1544 ret = vnodetopath_common(NULL, vp, buf, buflen, CRED(),
1545 LOOKUP_CHECKREAD);
1547 VN_RELE(vp);
1550 * Store the new cwd and replace the existing cached copy.
1552 if (ret == 0)
1553 cwd = refstr_alloc(buf);
1554 else
1555 cwd = NULL;
1557 mutex_enter(&p->p_lock);
1558 oldcwd = PTOU(p)->u_cwd;
1559 PTOU(p)->u_cwd = cwd;
1560 mutex_exit(&p->p_lock);
1562 if (oldcwd)
1563 refstr_rele(oldcwd);
1565 return (ret);