kernel/fs/lookup.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  25  * Copyright 2016 Joyent, Inc.
  26  */
  27
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  29 /*        All Rights Reserved   */
  30
  31 /*
  32  * University Copyright- Copyright (c) 1982, 1986, 1988
  33  * The Regents of the University of California
  34  * All Rights Reserved
  35  *
  36  * University Acknowledgment- Portions of this document are derived from
  37  * software developed by the University of California, Berkeley, and its
  38  * contributors.
  39  */
  40
  41 #include <sys/types.h>
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/cpuvar.h>
  45 #include <sys/errno.h>
  46 #include <sys/cred.h>
  47 #include <sys/user.h>
  48 #include <sys/uio.h>
  49 #include <sys/vfs.h>
  50 #include <sys/vnode.h>
  51 #include <sys/pathname.h>
  52 #include <sys/proc.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/sysmacros.h>
  55 #include <sys/debug.h>
  56 #include <sys/dirent.h>
  57 #include <sys/zone.h>
  58 #include <sys/dnlc.h>
  59 #include <sys/fs/snode.h>
  60
/*
 * Controls whether paths are stored with vnodes.  The default is 1.
 * NOTE(review): presumably any non-zero value enables path storage; the
 * code that reads this tunable is not in this part of the file -- confirm.
 */
int vfs_vnode_path = 1;
  63
  64 int
  65 lookupname(
  66         char *fnamep,
  67         enum uio_seg seg,
  68         int followlink,
  69         vnode_t **dirvpp,
  70         vnode_t **compvpp)
  71 {
  72         return (lookupnameatcred(fnamep, seg, followlink, dirvpp, compvpp, NULL,
  73             CRED()));
  74 }
  75
  76 /*
  77  * Lookup the user file name,
  78  * Handle allocation and freeing of pathname buffer, return error.
  79  */
  80 int
  81 lookupnameatcred(
  82         char *fnamep,                   /* user pathname */
  83         enum uio_seg seg,               /* addr space that name is in */
  84         int followlink,                 /* follow sym links */
  85         vnode_t **dirvpp,               /* ret for ptr to parent dir vnode */
  86         vnode_t **compvpp,              /* ret for ptr to component vnode */
  87         vnode_t *startvp,               /* start path search from vp */
  88         cred_t *cr)                     /* credential */
  89 {
  90         char namebuf[TYPICALMAXPATHLEN];
  91         struct pathname lookpn;
  92         int error;
  93
  94         error = pn_get_buf(fnamep, seg, &lookpn, namebuf, sizeof (namebuf));
  95         if (error == 0) {
  96                 error = lookuppnatcred(&lookpn, NULL, followlink,
  97                     dirvpp, compvpp, startvp, cr);
  98         }
  99         if (error == ENAMETOOLONG) {
 100                 /*
 101                  * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
 102                  */
 103                 if (error = pn_get(fnamep, seg, &lookpn))
 104                         return (error);
 105                 error = lookuppnatcred(&lookpn, NULL, followlink,
 106                     dirvpp, compvpp, startvp, cr);
 107                 pn_free(&lookpn);
 108         }
 109
 110         return (error);
 111 }
 112
 113 int
 114 lookupnameat(char *fnamep, enum uio_seg seg, int followlink,
 115     vnode_t **dirvpp, vnode_t **compvpp, vnode_t *startvp)
 116 {
 117         return (lookupnameatcred(fnamep, seg, followlink, dirvpp, compvpp,
 118             startvp, CRED()));
 119 }
 120
 121 int
 122 lookuppn(
 123         struct pathname *pnp,
 124         struct pathname *rpnp,
 125         int followlink,
 126         vnode_t **dirvpp,
 127         vnode_t **compvpp)
 128 {
 129         return (lookuppnatcred(pnp, rpnp, followlink, dirvpp, compvpp, NULL,
 130             CRED()));
 131 }
 132
 133 /*
 134  * Lookup the user file name from a given vp, using a specific credential.
 135  */
 136 int
 137 lookuppnatcred(
 138         struct pathname *pnp,           /* pathname to lookup */
 139         struct pathname *rpnp,          /* if non-NULL, return resolved path */
 140         int followlink,                 /* (don't) follow sym links */
 141         vnode_t **dirvpp,               /* ptr for parent vnode */
 142         vnode_t **compvpp,              /* ptr for entry vnode */
 143         vnode_t *startvp,               /* start search from this vp */
 144         cred_t *cr)                     /* user credential */
 145 {
 146         vnode_t *vp;    /* current directory vp */
 147         vnode_t *rootvp;
 148         proc_t *p = curproc;
 149
 150         if (pnp->pn_pathlen == 0)
 151                 return (ENOENT);
 152
 153         mutex_enter(&p->p_lock);        /* for u_rdir and u_cdir */
 154         if ((rootvp = PTOU(p)->u_rdir) == NULL)
 155                 rootvp = rootdir;
 156         else if (rootvp != rootdir)     /* no need to VN_HOLD rootdir */
 157                 VN_HOLD(rootvp);
 158
 159         if (pnp->pn_path[0] == '/') {
 160                 vp = rootvp;
 161         } else {
 162                 vp = (startvp == NULL) ? PTOU(p)->u_cdir : startvp;
 163         }
 164         VN_HOLD(vp);
 165         mutex_exit(&p->p_lock);
 166
 167         /*
 168          * Skip over leading slashes
 169          */
 170         if (pnp->pn_path[0] == '/') {
 171                 do {
 172                         pnp->pn_path++;
 173                         pnp->pn_pathlen--;
 174                 } while (pnp->pn_path[0] == '/');
 175         }
 176
 177         return (lookuppnvp(pnp, rpnp, followlink, dirvpp,
 178             compvpp, rootvp, vp, cr));
 179 }
 180
 181 int
 182 lookuppnat(struct pathname *pnp, struct pathname *rpnp,
 183     int followlink, vnode_t **dirvpp, vnode_t **compvpp,
 184     vnode_t *startvp)
 185 {
 186         return (lookuppnatcred(pnp, rpnp, followlink, dirvpp, compvpp, startvp,
 187             CRED()));
 188 }
 189
/*
 * Private flag to do our getcwd() dirty work.  When it is set in the flags
 * given to lookuppnvp(), read permission is checked on every directory
 * that is searched.  LOOKUP_MASK is the complement of the private flag,
 * i.e. it selects every flag bit except LOOKUP_CHECKREAD.
 */
#define LOOKUP_CHECKREAD        0x10
#define LOOKUP_MASK             (~LOOKUP_CHECKREAD)
 193
 194 /*
 195  * Starting at current directory, translate pathname pnp to end.
 196  * Leave pathname of final component in pnp, return the vnode
 197  * for the final component in *compvpp, and return the vnode
 198  * for the parent of the final component in dirvpp.
 199  *
 200  * This is the central routine in pathname translation and handles
 201  * multiple components in pathnames, separating them at /'s.  It also
 202  * implements mounted file systems and processes symbolic links.
 203  *
 204  * vp is the vnode where the directory search should start.
 205  *
 206  * Reference counts: vp must be held prior to calling this function.  rootvp
 207  * should only be held if rootvp != rootdir.
 208  */
 209 int
 210 lookuppnvp(
 211         struct pathname *pnp,           /* pathname to lookup */
 212         struct pathname *rpnp,          /* if non-NULL, return resolved path */
 213         int flags,                      /* follow symlinks */
 214         vnode_t **dirvpp,               /* ptr for parent vnode */
 215         vnode_t **compvpp,              /* ptr for entry vnode */
 216         vnode_t *rootvp,                /* rootvp */
 217         vnode_t *vp,                    /* directory to start search at */
 218         cred_t *cr)                     /* user's credential */
 219 {
 220         vnode_t *cvp;   /* current component vp */
 221         char component[MAXNAMELEN];     /* buffer for component (incl null) */
 222         int error;
 223         int nlink;
 224         int lookup_flags;
 225         struct pathname presrvd; /* case preserved name */
 226         struct pathname *pp = NULL;
 227         vnode_t *startvp;
 228         vnode_t *zonevp = curproc->p_zone->zone_rootvp;         /* zone root */
 229         int must_be_directory = 0;
 230         boolean_t retry_with_kcred;
 231
 232         CPU_STATS_ADDQ(CPU, sys, namei, 1);
 233         nlink = 0;
 234         cvp = NULL;
 235         if (rpnp)
 236                 rpnp->pn_pathlen = 0;
 237
 238         lookup_flags = dirvpp ? LOOKUP_DIR : 0;
 239         if (flags & FIGNORECASE) {
 240                 lookup_flags |= FIGNORECASE;
 241                 pn_alloc(&presrvd);
 242                 pp = &presrvd;
 243         }
 244
 245         /*
 246          * Eliminate any trailing slashes in the pathname.
 247          * If there are any, we must follow all symlinks.
 248          * Also, we must guarantee that the last component is a directory.
 249          */
 250         if (pn_fixslash(pnp)) {
 251                 flags |= FOLLOW;
 252                 must_be_directory = 1;
 253         }
 254
 255         startvp = vp;
 256 next:
 257         retry_with_kcred = B_FALSE;
 258
 259         /*
 260          * Make sure we have a directory.
 261          */
 262         if (vp->v_type != VDIR) {
 263                 error = ENOTDIR;
 264                 goto bad;
 265         }
 266
 267         if (rpnp && VN_CMP(vp, rootvp))
 268                 (void) pn_set(rpnp, "/");
 269
 270         /*
 271          * Process the next component of the pathname.
 272          */
 273         if (error = pn_getcomponent(pnp, component)) {
 274                 goto bad;
 275         }
 276
 277         /*
 278          * Handle "..": two special cases.
 279          * 1. If we're at the root directory (e.g. after chroot or
 280          *    zone_enter) then change ".." to "." so we can't get
 281          *    out of this subtree.
 282          * 2. If this vnode is the root of a mounted file system,
 283          *    then replace it with the vnode that was mounted on
 284          *    so that we take the ".." in the other file system.
 285          */
 286         if (component[0] == '.' && component[1] == '.' && component[2] == 0) {
 287 checkforroot:
 288                 if (VN_CMP(vp, rootvp) || VN_CMP(vp, zonevp)) {
 289                         component[1] = '\0';
 290                 } else if (vp->v_flag & VROOT) {
 291                         vfs_t *vfsp;
 292                         cvp = vp;
 293
 294                         /*
 295                          * While we deal with the vfs pointer from the vnode
 296                          * the filesystem could have been forcefully unmounted
 297                          * and the vnode's v_vfsp could have been invalidated
 298                          * by VFS_UNMOUNT. Hence, we cache v_vfsp and use it
 299                          * with vfs_rlock_wait/vfs_unlock.
 300                          * It is safe to use the v_vfsp even it is freed by
 301                          * VFS_UNMOUNT because vfs_rlock_wait/vfs_unlock
 302                          * do not dereference v_vfsp. It is just used as a
 303                          * magic cookie.
 304                          * One more corner case here is the memory getting
 305                          * reused for another vfs structure. In this case
 306                          * lookuppnvp's vfs_rlock_wait will succeed, domount's
 307                          * vfs_lock will fail and domount will bail out with an
 308                          * error (EBUSY).
 309                          */
 310                         vfsp = cvp->v_vfsp;
 311
 312                         /*
 313                          * This lock is used to synchronize
 314                          * mounts/unmounts and lookups.
 315                          * Threads doing mounts/unmounts hold the
 316                          * writers version vfs_lock_wait().
 317                          */
 318
 319                         vfs_rlock_wait(vfsp);
 320
 321                         /*
 322                          * If this vnode is on a file system that
 323                          * has been forcibly unmounted,
 324                          * we can't proceed. Cancel this operation
 325                          * and return EIO.
 326                          *
 327                          * vfs_vnodecovered is NULL if unmounted.
 328                          * Currently, nfs uses VFS_UNMOUNTED to
 329                          * check if it's a forced-umount. Keep the
 330                          * same checking here as well even though it
 331                          * may not be needed.
 332                          */
 333                         if (((vp = cvp->v_vfsp->vfs_vnodecovered) == NULL) ||
 334                             (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
 335                                 vfs_unlock(vfsp);
 336                                 VN_RELE(cvp);
 337                                 if (pp)
 338                                         pn_free(pp);
 339                                 return (EIO);
 340                         }
 341                         VN_HOLD(vp);
 342                         vfs_unlock(vfsp);
 343                         VN_RELE(cvp);
 344                         cvp = NULL;
 345                         /*
 346                          * Crossing mount points. For eg: We are doing
 347                          * a lookup of ".." for file systems root vnode
 348                          * mounted here, and fop_lookup() (with covered vnode)
 349                          * will be on underlying file systems mount point
 350                          * vnode. Set retry_with_kcred flag as we might end
 351                          * up doing fop_lookup() with kcred if required.
 352                          */
 353                         retry_with_kcred = B_TRUE;
 354                         goto checkforroot;
 355                 }
 356         }
 357
 358         /*
 359          * LOOKUP_CHECKREAD is a private flag used by vnodetopath() to indicate
 360          * that we need to have read permission on every directory in the entire
 361          * path.  This is used to ensure that a forward-lookup of a cached value
 362          * has the same effect as a reverse-lookup when the cached value cannot
 363          * be found.
 364          */
 365         if ((flags & LOOKUP_CHECKREAD) &&
 366             (error = fop_access(vp, VREAD, 0, cr, NULL)) != 0)
 367                 goto bad;
 368
 369         /*
 370          * Perform a lookup in the current directory.
 371          */
 372         error = fop_lookup(vp, component, &cvp, pnp, lookup_flags,
 373             rootvp, cr, NULL, NULL, pp);
 374
 375         /*
 376          * Retry with kcred - If crossing mount points & error is EACCES.
 377          *
 378          * If we are crossing mount points here and doing ".." lookup,
 379          * fop_lookup() might fail if the underlying file systems
 380          * mount point has no execute permission. In cases like these,
 381          * we retry fop_lookup() by giving as much privilage as possible
 382          * by passing kcred credentials.
 383          *
 384          * In case of hierarchical file systems, passing kcred still may
 385          * or may not work.
 386          * For eg: UFS FS --> Mount NFS FS --> Again mount UFS on some
 387          *                      directory inside NFS FS.
 388          */
 389         if ((error == EACCES) && retry_with_kcred)
 390                 error = fop_lookup(vp, component, &cvp, pnp, lookup_flags,
 391                     rootvp, zone_kcred(), NULL, NULL, pp);
 392
 393         if (error) {
 394                 cvp = NULL;
 395                 /*
 396                  * On error, return hard error if
 397                  * (a) we're not at the end of the pathname yet, or
 398                  * (b) the caller didn't want the parent directory, or
 399                  * (c) we failed for some reason other than a missing entry.
 400                  */
 401                 if (pn_pathleft(pnp) || dirvpp == NULL || error != ENOENT)
 402                         goto bad;
 403
 404                 pn_setlast(pnp);
 405                 /*
 406                  * We inform the caller that the desired entry must be
 407                  * a directory by adding a '/' to the component name.
 408                  */
 409                 if (must_be_directory && (error = pn_addslash(pnp)) != 0)
 410                         goto bad;
 411                 *dirvpp = vp;
 412                 if (compvpp != NULL)
 413                         *compvpp = NULL;
 414                 if (rootvp != rootdir)
 415                         VN_RELE(rootvp);
 416                 if (pp)
 417                         pn_free(pp);
 418                 return (0);
 419         }
 420
 421         /*
 422          * Traverse mount points.
 423          * XXX why don't we need to hold a read lock here (call vn_vfsrlock)?
 424          * What prevents a concurrent update to v_vfsmountedhere?
 425          *      Possible answer: if mounting, we might not see the mount
 426          *      if it is concurrently coming into existence, but that's
 427          *      really not much different from the thread running a bit slower.
 428          *      If unmounting, we may get into traverse() when we shouldn't,
 429          *      but traverse() will catch this case for us.
 430          *      (For this to work, fetching v_vfsmountedhere had better
 431          *      be atomic!)
 432          */
 433         if (vn_mountedvfs(cvp) != NULL) {
 434                 if ((error = traverse(&cvp)) != 0)
 435                         goto bad;
 436         }
 437
 438         /*
 439          * If we hit a symbolic link and there is more path to be
 440          * translated or this operation does not wish to apply
 441          * to a link, then place the contents of the link at the
 442          * front of the remaining pathname.
 443          */
 444         if (cvp->v_type == VLNK && ((flags & FOLLOW) || pn_pathleft(pnp))) {
 445                 struct pathname linkpath;
 446
 447                 if (++nlink > MAXSYMLINKS) {
 448                         error = ELOOP;
 449                         goto bad;
 450                 }
 451                 pn_alloc(&linkpath);
 452                 if (error = pn_getsymlink(cvp, &linkpath, cr)) {
 453                         pn_free(&linkpath);
 454                         goto bad;
 455                 }
 456
 457                 if (pn_pathleft(&linkpath) == 0)
 458                         (void) pn_set(&linkpath, ".");
 459                 error = pn_insert(pnp, &linkpath, strlen(component));
 460                 pn_free(&linkpath);
 461                 if (error)
 462                         goto bad;
 463                 VN_RELE(cvp);
 464                 cvp = NULL;
 465                 if (pnp->pn_pathlen == 0) {
 466                         error = ENOENT;
 467                         goto bad;
 468                 }
 469                 if (pnp->pn_path[0] == '/') {
 470                         do {
 471                                 pnp->pn_path++;
 472                                 pnp->pn_pathlen--;
 473                         } while (pnp->pn_path[0] == '/');
 474                         VN_RELE(vp);
 475                         vp = rootvp;
 476                         VN_HOLD(vp);
 477                 }
 478                 if (pn_fixslash(pnp)) {
 479                         flags |= FOLLOW;
 480                         must_be_directory = 1;
 481                 }
 482                 goto next;
 483         }
 484
 485         /*
 486          * If rpnp is non-NULL, remember the resolved path name therein.
 487          * Do not include "." components.  Collapse occurrences of
 488          * "previous/..", so long as "previous" is not itself "..".
 489          * Exhausting rpnp results in error ENAMETOOLONG.
 490          */
 491         if (rpnp && strcmp(component, ".") != 0) {
 492                 size_t len;
 493
 494                 if (strcmp(component, "..") == 0 &&
 495                     rpnp->pn_pathlen != 0 &&
 496                     !((rpnp->pn_pathlen > 2 &&
 497                     strncmp(rpnp->pn_path+rpnp->pn_pathlen-3, "/..", 3) == 0) ||
 498                     (rpnp->pn_pathlen == 2 &&
 499                     strncmp(rpnp->pn_path, "..", 2) == 0))) {
 500                         while (rpnp->pn_pathlen &&
 501                             rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
 502                                 rpnp->pn_pathlen--;
 503                         if (rpnp->pn_pathlen > 1)
 504                                 rpnp->pn_pathlen--;
 505                         rpnp->pn_path[rpnp->pn_pathlen] = '\0';
 506                 } else {
 507                         if (rpnp->pn_pathlen != 0 &&
 508                             rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
 509                                 rpnp->pn_path[rpnp->pn_pathlen++] = '/';
 510                         if (flags & FIGNORECASE) {
 511                                 /*
 512                                  * Return the case-preserved name
 513                                  * within the resolved path.
 514                                  */
 515                                 error = copystr(pp->pn_buf,
 516                                     rpnp->pn_path + rpnp->pn_pathlen,
 517                                     rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
 518                         } else {
 519                                 error = copystr(component,
 520                                     rpnp->pn_path + rpnp->pn_pathlen,
 521                                     rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
 522                         }
 523                         if (error)      /* copystr() returns ENAMETOOLONG */
 524                                 goto bad;
 525                         rpnp->pn_pathlen += (len - 1);
 526                         ASSERT(rpnp->pn_bufsize > rpnp->pn_pathlen);
 527                 }
 528         }
 529
 530         /*
 531          * If no more components, return last directory (if wanted) and
 532          * last component (if wanted).
 533          */
 534         if (pn_pathleft(pnp) == 0) {
 535                 /*
 536                  * If there was a trailing slash in the pathname,
 537                  * make sure the last component is a directory.
 538                  */
 539                 if (must_be_directory && cvp->v_type != VDIR) {
 540                         error = ENOTDIR;
 541                         goto bad;
 542                 }
 543                 if (dirvpp != NULL) {
 544                         /*
 545                          * Check that we have the real parent and not
 546                          * an alias of the last component.
 547                          */
 548                         if (vn_compare(vp, cvp)) {
 549                                 pn_setlast(pnp);
 550                                 VN_RELE(vp);
 551                                 VN_RELE(cvp);
 552                                 if (rootvp != rootdir)
 553                                         VN_RELE(rootvp);
 554                                 if (pp)
 555                                         pn_free(pp);
 556                                 return (EINVAL);
 557                         }
 558                         *dirvpp = vp;
 559                 } else
 560                         VN_RELE(vp);
 561                 if (pnp->pn_path == pnp->pn_buf)
 562                         (void) pn_set(pnp, ".");
 563                 else
 564                         pn_setlast(pnp);
 565                 if (rpnp) {
 566                         if (VN_CMP(cvp, rootvp))
 567                                 (void) pn_set(rpnp, "/");
 568                         else if (rpnp->pn_pathlen == 0)
 569                                 (void) pn_set(rpnp, ".");
 570                 }
 571
 572                 if (compvpp != NULL)
 573                         *compvpp = cvp;
 574                 else
 575                         VN_RELE(cvp);
 576                 if (rootvp != rootdir)
 577                         VN_RELE(rootvp);
 578                 if (pp)
 579                         pn_free(pp);
 580                 return (0);
 581         }
 582
 583         /*
 584          * Skip over slashes from end of last component.
 585          */
 586         while (pnp->pn_path[0] == '/') {
 587                 pnp->pn_path++;
 588                 pnp->pn_pathlen--;
 589         }
 590
 591         /*
 592          * Searched through another level of directory:
 593          * release previous directory handle and save new (result
 594          * of lookup) as current directory.
 595          */
 596         VN_RELE(vp);
 597         vp = cvp;
 598         cvp = NULL;
 599         goto next;
 600
 601 bad:
 602         /*
 603          * Error.  Release vnodes and return.
 604          */
 605         if (cvp)
 606                 VN_RELE(cvp);
 607         /*
 608          * If the error was ESTALE and the current directory to look in
 609          * was the root for this lookup, the root for a mounted file
 610          * system, or the starting directory for lookups, then
 611          * return ENOENT instead of ESTALE.  In this case, no recovery
 612          * is possible by the higher level.  If ESTALE was returned for
 613          * some intermediate directory along the path, then recovery
 614          * is potentially possible and retrying from the higher level
 615          * will either correct the situation by purging stale cache
 616          * entries or eventually get back to the point where no recovery
 617          * is possible.
 618          */
 619         if (error == ESTALE &&
 620             (VN_CMP(vp, rootvp) || (vp->v_flag & VROOT) || vp == startvp))
 621                 error = ENOENT;
 622         VN_RELE(vp);
 623         if (rootvp != rootdir)
 624                 VN_RELE(rootvp);
 625         if (pp)
 626                 pn_free(pp);
 627         return (error);
 628 }
 629
 630 /*
 631  * Traverse a mount point.  Routine accepts a vnode pointer as a reference
 632  * parameter and performs the indirection, releasing the original vnode.
 633  */
 634 int
 635 traverse(vnode_t **cvpp)
 636 {
 637         int error = 0;
 638         vnode_t *cvp;
 639         vnode_t *tvp;
 640         vfs_t *vfsp;
 641
 642         cvp = *cvpp;
 643
 644         /*
 645          * If this vnode is mounted on, then we transparently indirect
 646          * to the vnode which is the root of the mounted file system.
 647          * Before we do this we must check that an unmount is not in
 648          * progress on this vnode.
 649          */
 650
 651         for (;;) {
 652                 /*
 653                  * Try to read lock the vnode.  If this fails because
 654                  * the vnode is already write locked, then check to
 655                  * see whether it is the current thread which locked
 656                  * the vnode.  If it is not, then read lock the vnode
 657                  * by waiting to acquire the lock.
 658                  *
 659                  * The code path in domount() is an example of support
 660                  * which needs to look up two pathnames and locks one
 661                  * of them in between the two lookups.
 662                  */
 663                 error = vn_vfsrlock(cvp);
 664                 if (error) {
 665                         if (!vn_vfswlock_held(cvp))
 666                                 error = vn_vfsrlock_wait(cvp);
 667                         if (error != 0) {
 668                                 /*
 669                                  * lookuppn() expects a held vnode to be
 670                                  * returned because it promptly calls
 671                                  * VN_RELE after the error return
 672                                  */
 673                                 *cvpp = cvp;
 674                                 return (error);
 675                         }
 676                 }
 677
 678                 /*
 679                  * Reached the end of the mount chain?
 680                  */
 681                 vfsp = vn_mountedvfs(cvp);
 682                 if (vfsp == NULL) {
 683                         vn_vfsunlock(cvp);
 684                         break;
 685                 }
 686
 687                 /*
 688                  * The read lock must be held across the call to VFS_ROOT() to
 689                  * prevent a concurrent unmount from destroying the vfs.
 690                  */
 691                 error = VFS_ROOT(vfsp, &tvp);
 692                 vn_vfsunlock(cvp);
 693
 694                 if (error)
 695                         break;
 696
 697                 VN_RELE(cvp);
 698
 699                 cvp = tvp;
 700         }
 701
 702         *cvpp = cvp;
 703         return (error);
 704 }
 705
 706 /*
 707  * Return the lowermost vnode if this is a mountpoint.
 708  */
 709 static vnode_t *
 710 vn_under(vnode_t *vp)
 711 {
 712         vnode_t *uvp;
 713         vfs_t *vfsp;
 714
 715         while (vp->v_flag & VROOT) {
 716
 717                 vfsp = vp->v_vfsp;
 718                 vfs_rlock_wait(vfsp);
 719                 if ((uvp = vfsp->vfs_vnodecovered) == NULL ||
 720                     (vfsp->vfs_flag & VFS_UNMOUNTED)) {
 721                         vfs_unlock(vfsp);
 722                         break;
 723                 }
 724                 VN_HOLD(uvp);
 725                 vfs_unlock(vfsp);
 726                 VN_RELE(vp);
 727                 vp = uvp;
 728         }
 729
 730         return (vp);
 731 }
 732
 733 static int
 734 vnode_match(vnode_t *v1, vnode_t *v2, cred_t *cr)
 735 {
 736         vattr_t v1attr, v2attr;
 737
 738         /*
 739          * If we have a device file, check to see if is a cloned open of the
 740          * same device.  For self-cloning devices, the major numbers will match.
 741          * For devices cloned through the 'clone' driver, the minor number of
 742          * the source device will be the same as the major number of the cloned
 743          * device.
 744          */
 745         if ((v1->v_type == VCHR || v1->v_type == VBLK) &&
 746             v1->v_type == v2->v_type) {
 747                 if ((spec_is_selfclone(v1) || spec_is_selfclone(v2)) &&
 748                     getmajor(v1->v_rdev) == getmajor(v2->v_rdev))
 749                         return (1);
 750
 751                 if (spec_is_clone(v1) &&
 752                     getmajor(v1->v_rdev) == getminor(v2->v_rdev))
 753                         return (1);
 754
 755                 if (spec_is_clone(v2) &&
 756                     getmajor(v2->v_rdev) == getminor(v1->v_rdev))
 757                         return (1);
 758         }
 759
 760         v1attr.va_mask = v2attr.va_mask = VATTR_TYPE;
 761
 762         /*
 763          * This check for symbolic links handles the pseudo-symlinks in procfs.
 764          * These particular links have v_type of VDIR, but the attributes have a
 765          * type of VLNK.  We need to avoid these links because otherwise if we
 766          * are currently in '/proc/self/fd', then '/proc/self/cwd' will compare
 767          * as the same vnode.
 768          */
 769         if (fop_getattr(v1, &v1attr, 0, cr, NULL) != 0 ||
 770             fop_getattr(v2, &v2attr, 0, cr, NULL) != 0 ||
 771             v1attr.va_type == VLNK || v2attr.va_type == VLNK)
 772                 return (0);
 773
 774         v1attr.va_mask = v2attr.va_mask = VATTR_TYPE | VATTR_FSID | VATTR_NODEID;
 775
 776         if (fop_getattr(v1, &v1attr, ATTR_REAL, cr, NULL) != 0 ||
 777             fop_getattr(v2, &v2attr, ATTR_REAL, cr, NULL) != 0)
 778                 return (0);
 779
 780         return (v1attr.va_fsid == v2attr.va_fsid &&
 781             v1attr.va_nodeid == v2attr.va_nodeid);
 782 }
 783
 784
 785 /*
 786  * Find the entry in the directory corresponding to the target vnode.
 787  */
 788 int
 789 dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 790     size_t dlen, dirent64_t **rdp)
 791 {
 792         size_t dbuflen;
 793         struct iovec iov;
 794         struct uio uio;
 795         int error;
 796         int eof;
 797         vnode_t *cmpvp;
 798         struct dirent64 *dp;
 799         pathname_t pnp;
 800
 801         ASSERT(dvp->v_type == VDIR);
 802
 803         /*
 804          * This is necessary because of the strange semantics of fop_lookup().
 805          */
 806         bzero(&pnp, sizeof (pnp));
 807
 808         eof = 0;
 809
 810         uio.uio_iov = &iov;
 811         uio.uio_iovcnt = 1;
 812         uio.uio_segflg = UIO_SYSSPACE;
 813         uio.uio_fmode = 0;
 814         uio.uio_extflg = UIO_COPY_CACHED;
 815         uio.uio_loffset = 0;
 816
 817         if ((error = fop_access(dvp, VREAD, 0, cr, NULL)) != 0)
 818                 return (error);
 819
 820         while (!eof) {
 821                 uio.uio_resid = dlen;
 822                 iov.iov_base = dbuf;
 823                 iov.iov_len = dlen;
 824
 825                 (void) fop_rwlock(dvp, V_WRITELOCK_FALSE, NULL);
 826                 error = fop_readdir(dvp, &uio, cr, &eof, NULL, 0);
 827                 fop_rwunlock(dvp, V_WRITELOCK_FALSE, NULL);
 828
 829                 dbuflen = dlen - uio.uio_resid;
 830
 831                 if (error || dbuflen == 0)
 832                         break;
 833
 834                 dp = (dirent64_t *)dbuf;
 835                 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
 836                         /*
 837                          * Ignore '.' and '..' entries
 838                          */
 839                         if (strcmp(dp->d_name, ".") == 0 ||
 840                             strcmp(dp->d_name, "..") == 0) {
 841                                 dp = (dirent64_t *)((intptr_t)dp +
 842                                     dp->d_reclen);
 843                                 continue;
 844                         }
 845
 846                         error = fop_lookup(dvp, dp->d_name, &cmpvp, &pnp, 0,
 847                             vrootp, cr, NULL, NULL, NULL);
 848
 849                         /*
 850                          * We only want to bail out if there was an error other
 851                          * than ENOENT.  Otherwise, it could be that someone
 852                          * just removed an entry since the readdir() call, and
 853                          * the entry we want is further on in the directory.
 854                          */
 855                         if (error == 0) {
 856                                 if (vnode_match(tvp, cmpvp, cr)) {
 857                                         VN_RELE(cmpvp);
 858                                         *rdp = dp;
 859                                         return (0);
 860                                 }
 861
 862                                 VN_RELE(cmpvp);
 863                         } else if (error != ENOENT) {
 864                                 return (error);
 865                         }
 866
 867                         dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
 868                 }
 869         }
 870
 871         /*
 872          * Something strange has happened, this directory does not contain the
 873          * specified vnode.  This should never happen in the normal case, since
 874          * we ensured that dvp is the parent of vp.  This is possible in some
 875          * rare conditions (races and the special .zfs directory).
 876          */
 877         if (error == 0) {
 878                 error = fop_lookup(dvp, ".zfs", &cmpvp, &pnp, 0, vrootp, cr,
 879                     NULL, NULL, NULL);
 880                 if (error == 0) {
 881                         if (vnode_match(tvp, cmpvp, cr)) {
 882                                 (void) strcpy(dp->d_name, ".zfs");
 883                                 dp->d_reclen = strlen(".zfs");
 884                                 dp->d_off = 2;
 885                                 dp->d_ino = 1;
 886                                 *rdp = dp;
 887                         } else {
 888                                 error = ENOENT;
 889                         }
 890                         VN_RELE(cmpvp);
 891                 }
 892         }
 893
 894         return (error);
 895 }
 896
 897 /*
 898  * Given a global path (from rootdir), and a vnode that is the current root,
 899  * return the portion of the path that is beneath the current root or NULL on
 900  * failure.  The path MUST be a resolved path (no '..' entries or symlinks),
 901  * otherwise this function will fail.
 902  */
 903 static char *
 904 localpath(char *path, struct vnode *vrootp, cred_t *cr)
 905 {
 906         vnode_t *vp;
 907         vnode_t *cvp;
 908         char component[MAXNAMELEN];
 909         char *ret = NULL;
 910         pathname_t pn;
 911
 912         /*
 913          * We use vn_compare() instead of VN_CMP() in order to detect lofs
 914          * mounts and stacked vnodes.
 915          */
 916         if (vn_compare(vrootp, rootdir))
 917                 return (path);
 918
 919         if (pn_get(path, UIO_SYSSPACE, &pn) != 0)
 920                 return (NULL);
 921
 922         vp = rootdir;
 923         VN_HOLD(vp);
 924
 925         if (vn_ismntpt(vp) && traverse(&vp) != 0) {
 926                 VN_RELE(vp);
 927                 pn_free(&pn);
 928                 return (NULL);
 929         }
 930
 931         while (pn_pathleft(&pn)) {
 932                 pn_skipslash(&pn);
 933
 934                 if (pn_getcomponent(&pn, component) != 0)
 935                         break;
 936
 937                 if (fop_lookup(vp, component, &cvp, &pn, 0, rootdir, cr,
 938                     NULL, NULL, NULL) != 0)
 939                         break;
 940                 VN_RELE(vp);
 941                 vp = cvp;
 942
 943                 if (vn_ismntpt(vp) && traverse(&vp) != 0)
 944                         break;
 945
 946                 if (vn_compare(vp, vrootp)) {
 947                         ret = path + (pn.pn_path - pn.pn_buf);
 948                         break;
 949                 }
 950         }
 951
 952         VN_RELE(vp);
 953         pn_free(&pn);
 954
 955         return (ret);
 956 }
 957
 958 /*
 959  * Clean a stale v_path from a vnode.  This is only performed if the v_path has
 960  * not been altered since it was found to be stale
 961  */
 962 static void
 963 vnode_clear_vpath(vnode_t *vp, char *vpath_old)
 964 {
 965         mutex_enter(&vp->v_lock);
 966         if (vp->v_path != vn_vpath_empty && vp->v_path == vpath_old) {
 967                 vp->v_path = vn_vpath_empty;
 968                 mutex_exit(&vp->v_lock);
 969                 kmem_free(vpath_old, strlen(vpath_old) + 1);
 970         } else {
 971                 mutex_exit(&vp->v_lock);
 972         }
 973 }
 974
 975 /*
 976  * Validate that a pathname refers to a given vnode.
 977  */
 978 static int
 979 vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn,
 980     int flags, cred_t *cr)
 981 {
 982         vnode_t *compvp;
 983         /*
 984          * If we are in a zone or a chroot environment, then we have to
 985          * take additional steps, since the path to the root might not
 986          * be readable with the current credentials, even though the
 987          * process can legitmately access the file.  In this case, we
 988          * do the following:
 989          *
 990          * lookuppnvp() with all privileges to get the resolved path.
 991          * call localpath() to get the local portion of the path, and
 992          * continue as normal.
 993          *
 994          * If the the conversion to a local path fails, then we continue
 995          * as normal.  This is a heuristic to make process object file
 996          * paths available from within a zone.  Because lofs doesn't
 997          * support page operations, the vnode stored in the seg_t is
 998          * actually the underlying real vnode, not the lofs node itself.
 999          * Most of the time, the lofs path is the same as the underlying
1000          * vnode (for example, /usr/lib/libc.so.1).
1001          */
1002         if (vrootp != rootdir) {
1003                 char *local = NULL;
1004
1005                 VN_HOLD(rootdir);
1006                 if (lookuppnvp(pn, rpn, FOLLOW, NULL, &compvp, rootdir,
1007                     rootdir, kcred) == 0) {
1008                         local = localpath(rpn->pn_path, vrootp, kcred);
1009                         VN_RELE(compvp);
1010                 }
1011
1012                 /*
1013                  * The original pn was changed through lookuppnvp().
1014                  * Set it to local for next validation attempt.
1015                  */
1016                 if (local) {
1017                         (void) pn_set(pn, local);
1018                 } else {
1019                         return (1);
1020                 }
1021         }
1022
1023         /*
1024          * We should have a local path at this point, so start the search from
1025          * the root of the current process.
1026          */
1027         VN_HOLD(vrootp);
1028         if (vrootp != rootdir)
1029                 VN_HOLD(vrootp);
1030         if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp,
1031             cr) == 0) {
1032                 /*
1033                  * Check to see if the returned vnode is the same as the one we
1034                  * expect.
1035                  */
1036                 if (vn_compare(vp, compvp) ||
1037                     vnode_match(vp, compvp, cr)) {
1038                         VN_RELE(compvp);
1039                         return (0);
1040                 } else {
1041                         VN_RELE(compvp);
1042                 }
1043         }
1044
1045         return (1);
1046 }
1047
1048 /*
1049  * Struct for tracking vnodes with invalidated v_path entries during a
1050  * dirtopath reverse lookup.  By keeping adequate state, those vnodes can be
1051  * revisted to populate v_path.
1052  */
1053 struct dirpath_walk {
1054         struct dirpath_walk     *dw_next;
1055         vnode_t                 *dw_vnode;
1056         vnode_t                 *dw_pvnode;
1057         size_t                  dw_len;
1058         char                    *dw_name;
1059 };
1060
1061 /*
1062  * Given a directory, return the full, resolved path.  This looks up "..",
1063  * searches for the given vnode in the parent, appends the component, etc.  It
1064  * is used to implement vnodetopath() and getcwd() when the cached path fails.
1065  */
1066 static int
1067 dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
1068     cred_t *cr)
1069 {
1070         pathname_t      pn, rpn, emptypn;
1071         vnode_t         *pvp = NULL, *startvp = vp;
1072         int             err = 0;
1073         size_t          complen;
1074         dirent64_t      *dp;
1075         char            *bufloc, *dbuf;
1076         const size_t    dlen = DIRENT64_RECLEN(MAXPATHLEN);
1077         struct dirpath_walk *dw_chain = NULL, *dw_entry;
1078
1079         /* Operation only allowed on directories */
1080         ASSERT(vp->v_type == VDIR);
1081
1082         /* We must have at least enough space for "/" */
1083         if (buflen < 2)
1084                 return (ENAMETOOLONG);
1085
1086         /* Start at end of string with terminating null */
1087         bufloc = &buf[buflen - 1];
1088         *bufloc = '\0';
1089
1090         pn_alloc(&pn);
1091         pn_alloc(&rpn);
1092         dbuf = kmem_alloc(dlen, KM_SLEEP);
1093         bzero(&emptypn, sizeof (emptypn));
1094
1095         /*
1096          * Begin with an additional reference on vp.  This will be decremented
1097          * during the loop.
1098          */
1099         VN_HOLD(vp);
1100
1101         for (;;) {
1102                 int vprivs;
1103                 hrtime_t cached_stamp;
1104
1105                 /*
1106                  * Return if we've reached the root.  If the buffer is empty,
1107                  * return '/'.  We explicitly don't use vn_compare(), since it
1108                  * compares the real vnodes.  A lofs mount of '/' would produce
1109                  * incorrect results otherwise.
1110                  */
1111                 if (VN_CMP(vrootp, vp)) {
1112                         if (*bufloc == '\0')
1113                                 *--bufloc = '/';
1114                         break;
1115                 }
1116
1117                 /*
1118                  * If we've reached the VFS root, something has gone wrong.  We
1119                  * should have reached the root in the above check.  The only
1120                  * explantation is that 'vp' is not contained withing the given
1121                  * root, in which case we return EPERM.
1122                  */
1123                 if (VN_CMP(rootdir, vp)) {
1124                         err = EPERM;
1125                         goto out;
1126                 }
1127
1128                 /*
1129                  * Shortcut: see if this vnode has correct v_path. If so,
1130                  * we have the work done.
1131                  */
1132                 mutex_enter(&vp->v_lock);
1133                 if (vp->v_path != vn_vpath_empty &&
1134                     pn_set(&pn, vp->v_path) == 0) {
1135                         cached_stamp = vp->v_path_stamp;
1136                         mutex_exit(&vp->v_lock);
1137                         rpn.pn_path = rpn.pn_buf;
1138
1139                         /* Ensure the v_path pointing to correct vnode */
1140                         if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags,
1141                             cr) == 0) {
1142                                 complen = strlen(rpn.pn_path);
1143                                 bufloc -= complen;
1144                                 if (bufloc < buf) {
1145                                         err = ERANGE;
1146                                         goto out;
1147                                 }
1148                                 bcopy(rpn.pn_path, bufloc, complen);
1149                                 break;
1150                         } else {
1151                                 /*
1152                                  * Immediately nuke cached v_path entries known
1153                                  * to be invalid.
1154                                  */
1155                                 vn_clearpath(vp, cached_stamp);
1156                         }
1157                 } else {
1158                         mutex_exit(&vp->v_lock);
1159                 }
1160
1161                 /*
1162                  * Shortcuts failed, search for this vnode in its parent.  If
1163                  * this is a mountpoint, then get the vnode underneath.
1164                  */
1165                 if (vp->v_flag & VROOT)
1166                         vp = vn_under(vp);
1167                 if ((err = fop_lookup(vp, "..", &pvp, &emptypn, 0, vrootp, cr,
1168                     NULL, NULL, NULL)) != 0)
1169                         goto out;
1170
1171                 /*
1172                  * With extended attributes, it's possible for a directory to
1173                  * have a parent that is a regular file.  Check for that here.
1174                  */
1175                 if (pvp->v_type != VDIR) {
1176                         err = ENOTDIR;
1177                         goto out;
1178                 }
1179
1180                 /*
1181                  * If this is true, something strange has happened.  This is
1182                  * only true if we are the root of a filesystem, which should
1183                  * have been caught by the check above.
1184                  */
1185                 if (VN_CMP(pvp, vp)) {
1186                         err = ENOENT;
1187                         goto out;
1188                 }
1189
1190                 /*
1191                  * Check if we have read and search privilege so, that
1192                  * we can lookup the path in the directory
1193                  */
1194                 vprivs = (flags & LOOKUP_CHECKREAD) ? VREAD | VEXEC : VEXEC;
1195                 if ((err = fop_access(pvp, vprivs, 0, cr, NULL)) != 0) {
1196                         goto out;
1197                 }
1198
1199                 /*
1200                  * Search the parent directory for the entry corresponding to
1201                  * this vnode.
1202                  */
1203                 if ((err = dirfindvp(vrootp, pvp, vp, cr, dbuf, dlen, &dp))
1204                     != 0)
1205                         goto out;
1206                 complen = strlen(dp->d_name);
1207                 bufloc -= complen;
1208                 if (bufloc <= buf) {
1209                         err = ENAMETOOLONG;
1210                         goto out;
1211                 }
1212                 bcopy(dp->d_name, bufloc, complen);
1213
1214                 /* Prepend a slash to the current path.  */
1215                 *--bufloc = '/';
1216
1217                 /*
1218                  * Record the name and directory for later reconstruction and
1219                  * link it up with the others.
1220                  */
1221                 dw_entry = kmem_alloc(sizeof (*dw_entry), KM_SLEEP);
1222                 dw_entry->dw_name = kmem_alloc(complen + 1, KM_SLEEP);
1223                 VN_HOLD(dw_entry->dw_vnode = vp);
1224                 VN_HOLD(dw_entry->dw_pvnode = pvp);
1225                 bcopy(dp->d_name, dw_entry->dw_name, complen + 1);
1226                 dw_entry->dw_len = complen;
1227                 dw_entry->dw_next = dw_chain;
1228                 dw_chain = dw_entry;
1229
1230                 /* And continue with the next component */
1231                 VN_RELE(vp);
1232                 vp = pvp;
1233                 pvp = NULL;
1234         }
1235
1236         /*
1237          * Place the path at the beginning of the buffer.
1238          */
1239         if (bufloc != buf)
1240                 ovbcopy(bufloc, buf, buflen - (bufloc - buf));
1241
1242 out:
1243         /*
1244          * Walk over encountered directory entries which were afflicted with a
1245          * stale or absent v_path.  If the dirtopath was successful, we should
1246          * possess the necessary information to populate all of them with a
1247          * valid v_path.
1248          *
1249          * While processing this list, it is safe to call vn_setpath despite
1250          * the fact that racing vnode actions may have altered v_path entries
1251          * while the above loopwas still executing.  Any updated entries will
1252          * have a newer v_path_stamp value which prevents an invalid overwrite.
1253          *
1254          * If an error was encountered during the search, freeing the chain is
1255          * still required.
1256          */
1257         dw_entry = dw_chain;
1258         while (dw_entry != NULL) {
1259                 struct dirpath_walk *next = dw_entry->dw_next;
1260
1261                 if (err == 0) {
1262                         vn_setpath(NULL, dw_entry->dw_pvnode,
1263                             dw_entry->dw_vnode, dw_entry->dw_name,
1264                             dw_entry->dw_len);
1265                 }
1266
1267                 VN_RELE(dw_entry->dw_vnode);
1268                 VN_RELE(dw_entry->dw_pvnode);
1269                 kmem_free(dw_entry->dw_name, dw_entry->dw_len + 1);
1270                 kmem_free(dw_entry, sizeof (*dw_entry));
1271                 dw_entry = next;
1272         }
1273
1274         /*
1275          * If the error was ESTALE and the current directory to look in
1276          * was the root for this lookup, the root for a mounted file
1277          * system, or the starting directory for lookups, then
1278          * return ENOENT instead of ESTALE.  In this case, no recovery
1279          * is possible by the higher level.  If ESTALE was returned for
1280          * some intermediate directory along the path, then recovery
1281          * is potentially possible and retrying from the higher level
1282          * will either correct the situation by purging stale cache
1283          * entries or eventually get back to the point where no recovery
1284          * is possible.
1285          */
1286         if (err == ESTALE &&
1287             (VN_CMP(vp, vrootp) || (vp->v_flag & VROOT) || vp == startvp))
1288                 err = ENOENT;
1289
1290         kmem_free(dbuf, dlen);
1291         VN_RELE(vp);
1292         if (pvp)
1293                 VN_RELE(pvp);
1294         pn_free(&pn);
1295         pn_free(&rpn);
1296
1297         return (err);
1298 }
1299
1300 /*
1301  * The additional flag, LOOKUP_CHECKREAD, is used to enforce artificial
1302  * constraints in order to be standards compliant.  For example, if we have
1303  * the cached path of '/foo/bar', and '/foo' has permissions 100 (execute
1304  * only), then we can legitimately look up the path to the current working
1305  * directory without needing read permission.  Existing standards tests,
1306  * however, assume that we are determining the path by repeatedly looking up
1307  * "..".  We need to keep this behavior in order to maintain backwards
1308  * compatibility.
1309  */
1310 static int
1311 vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen,
1312     cred_t *cr, int flags)
1313 {
1314         pathname_t pn;
1315         int ret = 0;
1316         vnode_t *realvp;
1317         boolean_t doclose = B_FALSE;
1318
1319         /*
1320          * If vrootp is NULL, get the root for curproc.  Callers with any other
1321          * requirements should pass in a different vrootp.
1322          */
1323         if (vrootp == NULL) {
1324                 proc_t *p = curproc;
1325
1326                 mutex_enter(&p->p_lock);
1327                 if ((vrootp = PTOU(p)->u_rdir) == NULL)
1328                         vrootp = rootdir;
1329                 VN_HOLD(vrootp);
1330                 mutex_exit(&p->p_lock);
1331         } else {
1332                 VN_HOLD(vrootp);
1333         }
1334
1335         /*
1336          * This is to get around an annoying artifact of the /proc filesystem,
1337          * which is the behavior of {cwd/root}.  Trying to resolve this path
1338          * will result in /proc/pid/cwd instead of whatever the real working
1339          * directory is.  We can't rely on fop_realvp(), since that will break
1340          * lofs.  The only difference between procfs and lofs is that opening
1341          * the file will return the underling vnode in the case of procfs.
1342          */
1343         if (vp->v_type == VDIR && fop_realvp(vp, &realvp, NULL) == 0 &&
1344             realvp != vp) {
1345                 VN_HOLD(vp);
1346                 if (fop_open(&vp, FREAD, cr, NULL) == 0)
1347                         doclose = B_TRUE;
1348                 else
1349                         VN_RELE(vp);
1350         }
1351
1352         /*
1353          * Check to see if we have a valid cached path in the vnode.
1354          */
1355         pn_alloc(&pn);
1356         mutex_enter(&vp->v_lock);
1357         if (vp->v_path != vn_vpath_empty) {
1358                 hrtime_t cached_stamp;
1359                 pathname_t rpn;
1360
1361                 cached_stamp = vp->v_path_stamp;
1362                 (void) pn_set(&pn, vp->v_path);
1363                 mutex_exit(&vp->v_lock);
1364
1365                 /* We should only cache absolute paths */
1366                 ASSERT(pn.pn_buf[0] == '/');
1367
1368                 pn_alloc(&rpn);
1369                 if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) {
1370                         /* Return the result, if we're able. */
1371                         if (buflen > rpn.pn_pathlen) {
1372                                 bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1);
1373                         } else {
1374                                 ret = ENAMETOOLONG;
1375                         }
1376                         pn_free(&pn);
1377                         pn_free(&rpn);
1378                         goto out;
1379                 }
1380                 pn_free(&rpn);
1381                 vn_clearpath(vp, cached_stamp);
1382         } else {
1383                 mutex_exit(&vp->v_lock);
1384         }
1385         pn_free(&pn);
1386
1387         if (vp->v_type != VDIR) {
1388                 /*
1389                  * The reverse lookup tricks used by dirtopath aren't possible
1390                  * for non-directory entries.  The best which can be done is
1391                  * clearing any stale v_path so later lookups can potentially
1392                  * repopulate it with a valid path.
1393                  */
1394                 ret = ENOENT;
1395         } else {
1396                 ret = dirtopath(vrootp, vp, buf, buflen, flags, cr);
1397         }
1398
1399 out:
1400         VN_RELE(vrootp);
1401         if (doclose) {
1402                 (void) fop_close(vp, FREAD, 1, 0, cr, NULL);
1403                 VN_RELE(vp);
1404         }
1405
1406         return (ret);
1407 }
1408
1409 int
1410 vnodetopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, cred_t *cr)
1411 {
1412         return (vnodetopath_common(vrootp, vp, buf, buflen, cr, 0));
1413 }
1414
1415 int
1416 dogetcwd(char *buf, size_t buflen)
1417 {
1418         int ret;
1419         vnode_t *vp;
1420         vnode_t *compvp;
1421         refstr_t *cwd, *oldcwd;
1422         const char *value;
1423         pathname_t rpnp, pnp;
1424         proc_t *p = curproc;
1425
1426         /*
1427          * Check to see if there is a cached version of the cwd.  If so, lookup
1428          * the cached value and make sure it is the same vnode.
1429          */
1430         mutex_enter(&p->p_lock);
1431         if ((cwd = PTOU(p)->u_cwd) != NULL)
1432                 refstr_hold(cwd);
1433         vp = PTOU(p)->u_cdir;
1434         VN_HOLD(vp);
1435         mutex_exit(&p->p_lock);
1436
1437         /*
1438          * Make sure we have permission to access the current directory.
1439          */
1440         if ((ret = fop_access(vp, VEXEC, 0, CRED(), NULL)) != 0) {
1441                 if (cwd != NULL)
1442                         refstr_rele(cwd);
1443                 VN_RELE(vp);
1444                 return (ret);
1445         }
1446
1447         if (cwd) {
1448                 value = refstr_value(cwd);
1449                 if ((ret = pn_get((char *)value, UIO_SYSSPACE, &pnp)) != 0) {
1450                         refstr_rele(cwd);
1451                         VN_RELE(vp);
1452                         return (ret);
1453                 }
1454
1455                 pn_alloc(&rpnp);
1456
1457                 if (lookuppn(&pnp, &rpnp, NO_FOLLOW, NULL, &compvp) == 0) {
1458
1459                         if (VN_CMP(vp, compvp) &&
1460                             strcmp(value, rpnp.pn_path) == 0) {
1461                                 VN_RELE(compvp);
1462                                 VN_RELE(vp);
1463                                 pn_free(&pnp);
1464                                 pn_free(&rpnp);
1465                                 if (strlen(value) + 1 > buflen) {
1466                                         refstr_rele(cwd);
1467                                         return (ENAMETOOLONG);
1468                                 }
1469                                 bcopy(value, buf, strlen(value) + 1);
1470                                 refstr_rele(cwd);
1471                                 return (0);
1472                         }
1473
1474                         VN_RELE(compvp);
1475                 }
1476
1477                 pn_free(&rpnp);
1478                 pn_free(&pnp);
1479
1480                 refstr_rele(cwd);
1481         }
1482
1483         ret = vnodetopath_common(NULL, vp, buf, buflen, CRED(),
1484             LOOKUP_CHECKREAD);
1485
1486         VN_RELE(vp);
1487
1488         /*
1489          * Store the new cwd and replace the existing cached copy.
1490          */
1491         if (ret == 0)
1492                 cwd = refstr_alloc(buf);
1493         else
1494                 cwd = NULL;
1495
1496         mutex_enter(&p->p_lock);
1497         oldcwd = PTOU(p)->u_cwd;
1498         PTOU(p)->u_cwd = cwd;
1499         mutex_exit(&p->p_lock);
1500
1501         if (oldcwd)
1502                 refstr_rele(oldcwd);
1503
1504         return (ret);
1505 }