kernel/fs/lookup.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  25  * Copyright 2016 Joyent, Inc.
  26  */
  27
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  29 /*        All Rights Reserved   */
  30
  31 /*
  32  * University Copyright- Copyright (c) 1982, 1986, 1988
  33  * The Regents of the University of California
  34  * All Rights Reserved
  35  *
  36  * University Acknowledgment- Portions of this document are derived from
  37  * software developed by the University of California, Berkeley, and its
  38  * contributors.
  39  */
  40
  41 #include <sys/types.h>
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/cpuvar.h>
  45 #include <sys/errno.h>
  46 #include <sys/cred.h>
  47 #include <sys/user.h>
  48 #include <sys/uio.h>
  49 #include <sys/vfs.h>
  50 #include <sys/vnode.h>
  51 #include <sys/pathname.h>
  52 #include <sys/proc.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/sysmacros.h>
  55 #include <sys/debug.h>
  56 #include <sys/dirent.h>
  57 #include <c2/audit.h>
  58 #include <sys/zone.h>
  59 #include <sys/dnlc.h>
  60 #include <sys/fs/snode.h>
  61
/*
 * Controls whether paths are stored with vnodes.
 * Initialized to 1.  NOTE(review): presumably non-zero means "enabled";
 * the consumers of this tunable are not in this part of the file, so
 * confirm against them.
 */
int vfs_vnode_path = 1;
  64
  65 int
  66 lookupname(
  67         char *fnamep,
  68         enum uio_seg seg,
  69         int followlink,
  70         vnode_t **dirvpp,
  71         vnode_t **compvpp)
  72 {
  73         return (lookupnameatcred(fnamep, seg, followlink, dirvpp, compvpp, NULL,
  74             CRED()));
  75 }
  76
  77 /*
  78  * Lookup the user file name,
  79  * Handle allocation and freeing of pathname buffer, return error.
  80  */
  81 int
  82 lookupnameatcred(
  83         char *fnamep,                   /* user pathname */
  84         enum uio_seg seg,               /* addr space that name is in */
  85         int followlink,                 /* follow sym links */
  86         vnode_t **dirvpp,               /* ret for ptr to parent dir vnode */
  87         vnode_t **compvpp,              /* ret for ptr to component vnode */
  88         vnode_t *startvp,               /* start path search from vp */
  89         cred_t *cr)                     /* credential */
  90 {
  91         char namebuf[TYPICALMAXPATHLEN];
  92         struct pathname lookpn;
  93         int error;
  94
  95         error = pn_get_buf(fnamep, seg, &lookpn, namebuf, sizeof (namebuf));
  96         if (error == 0) {
  97                 error = lookuppnatcred(&lookpn, NULL, followlink,
  98                     dirvpp, compvpp, startvp, cr);
  99         }
 100         if (error == ENAMETOOLONG) {
 101                 /*
 102                  * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
 103                  */
 104                 if (error = pn_get(fnamep, seg, &lookpn))
 105                         return (error);
 106                 error = lookuppnatcred(&lookpn, NULL, followlink,
 107                     dirvpp, compvpp, startvp, cr);
 108                 pn_free(&lookpn);
 109         }
 110
 111         return (error);
 112 }
 113
 114 int
 115 lookupnameat(char *fnamep, enum uio_seg seg, int followlink,
 116     vnode_t **dirvpp, vnode_t **compvpp, vnode_t *startvp)
 117 {
 118         return (lookupnameatcred(fnamep, seg, followlink, dirvpp, compvpp,
 119             startvp, CRED()));
 120 }
 121
 122 int
 123 lookuppn(
 124         struct pathname *pnp,
 125         struct pathname *rpnp,
 126         int followlink,
 127         vnode_t **dirvpp,
 128         vnode_t **compvpp)
 129 {
 130         return (lookuppnatcred(pnp, rpnp, followlink, dirvpp, compvpp, NULL,
 131             CRED()));
 132 }
 133
 134 /*
 135  * Lookup the user file name from a given vp, using a specific credential.
 136  */
 137 int
 138 lookuppnatcred(
 139         struct pathname *pnp,           /* pathname to lookup */
 140         struct pathname *rpnp,          /* if non-NULL, return resolved path */
 141         int followlink,                 /* (don't) follow sym links */
 142         vnode_t **dirvpp,               /* ptr for parent vnode */
 143         vnode_t **compvpp,              /* ptr for entry vnode */
 144         vnode_t *startvp,               /* start search from this vp */
 145         cred_t *cr)                     /* user credential */
 146 {
 147         vnode_t *vp;    /* current directory vp */
 148         vnode_t *rootvp;
 149         proc_t *p = curproc;
 150
 151         if (pnp->pn_pathlen == 0)
 152                 return (ENOENT);
 153
 154         mutex_enter(&p->p_lock);        /* for u_rdir and u_cdir */
 155         if ((rootvp = PTOU(p)->u_rdir) == NULL)
 156                 rootvp = rootdir;
 157         else if (rootvp != rootdir)     /* no need to VN_HOLD rootdir */
 158                 VN_HOLD(rootvp);
 159
 160         if (pnp->pn_path[0] == '/') {
 161                 vp = rootvp;
 162         } else {
 163                 vp = (startvp == NULL) ? PTOU(p)->u_cdir : startvp;
 164         }
 165         VN_HOLD(vp);
 166         mutex_exit(&p->p_lock);
 167
 168         /*
 169          * Skip over leading slashes
 170          */
 171         if (pnp->pn_path[0] == '/') {
 172                 do {
 173                         pnp->pn_path++;
 174                         pnp->pn_pathlen--;
 175                 } while (pnp->pn_path[0] == '/');
 176         }
 177
 178         return (lookuppnvp(pnp, rpnp, followlink, dirvpp,
 179             compvpp, rootvp, vp, cr));
 180 }
 181
 182 int
 183 lookuppnat(struct pathname *pnp, struct pathname *rpnp,
 184     int followlink, vnode_t **dirvpp, vnode_t **compvpp,
 185     vnode_t *startvp)
 186 {
 187         return (lookuppnatcred(pnp, rpnp, followlink, dirvpp, compvpp, startvp,
 188             CRED()));
 189 }
 190
/*
 * Private flag to do our getcwd() dirty work.
 *
 * LOOKUP_CHECKREAD is tested by lookuppnvp(), which then requires VREAD
 * access on each directory it searches.  LOOKUP_MASK is the complement of
 * that bit, i.e. ANDing a flags word with it clears LOOKUP_CHECKREAD.
 */
#define LOOKUP_CHECKREAD        0x10
#define LOOKUP_MASK             (~LOOKUP_CHECKREAD)
 194
 195 /*
 196  * Starting at current directory, translate pathname pnp to end.
 197  * Leave pathname of final component in pnp, return the vnode
 198  * for the final component in *compvpp, and return the vnode
 199  * for the parent of the final component in dirvpp.
 200  *
 201  * This is the central routine in pathname translation and handles
 202  * multiple components in pathnames, separating them at /'s.  It also
 203  * implements mounted file systems and processes symbolic links.
 204  *
 205  * vp is the vnode where the directory search should start.
 206  *
 207  * Reference counts: vp must be held prior to calling this function.  rootvp
 208  * should only be held if rootvp != rootdir.
 209  */
 210 int
 211 lookuppnvp(
 212         struct pathname *pnp,           /* pathname to lookup */
 213         struct pathname *rpnp,          /* if non-NULL, return resolved path */
 214         int flags,                      /* follow symlinks */
 215         vnode_t **dirvpp,               /* ptr for parent vnode */
 216         vnode_t **compvpp,              /* ptr for entry vnode */
 217         vnode_t *rootvp,                /* rootvp */
 218         vnode_t *vp,                    /* directory to start search at */
 219         cred_t *cr)                     /* user's credential */
 220 {
 221         vnode_t *cvp;   /* current component vp */
 222         char component[MAXNAMELEN];     /* buffer for component (incl null) */
 223         int error;
 224         int nlink;
 225         int lookup_flags;
 226         struct pathname presrvd; /* case preserved name */
 227         struct pathname *pp = NULL;
 228         vnode_t *startvp;
 229         vnode_t *zonevp = curproc->p_zone->zone_rootvp;         /* zone root */
 230         int must_be_directory = 0;
 231         boolean_t retry_with_kcred;
 232         uint32_t auditing = AU_AUDITING();
 233
 234         CPU_STATS_ADDQ(CPU, sys, namei, 1);
 235         nlink = 0;
 236         cvp = NULL;
 237         if (rpnp)
 238                 rpnp->pn_pathlen = 0;
 239
 240         lookup_flags = dirvpp ? LOOKUP_DIR : 0;
 241         if (flags & FIGNORECASE) {
 242                 lookup_flags |= FIGNORECASE;
 243                 pn_alloc(&presrvd);
 244                 pp = &presrvd;
 245         }
 246
 247         if (auditing)
 248                 audit_anchorpath(pnp, vp == rootvp);
 249
 250         /*
 251          * Eliminate any trailing slashes in the pathname.
 252          * If there are any, we must follow all symlinks.
 253          * Also, we must guarantee that the last component is a directory.
 254          */
 255         if (pn_fixslash(pnp)) {
 256                 flags |= FOLLOW;
 257                 must_be_directory = 1;
 258         }
 259
 260         startvp = vp;
 261 next:
 262         retry_with_kcred = B_FALSE;
 263
 264         /*
 265          * Make sure we have a directory.
 266          */
 267         if (vp->v_type != VDIR) {
 268                 error = ENOTDIR;
 269                 goto bad;
 270         }
 271
 272         if (rpnp && VN_CMP(vp, rootvp))
 273                 (void) pn_set(rpnp, "/");
 274
 275         /*
 276          * Process the next component of the pathname.
 277          */
 278         if (error = pn_getcomponent(pnp, component)) {
 279                 goto bad;
 280         }
 281
 282         /*
 283          * Handle "..": two special cases.
 284          * 1. If we're at the root directory (e.g. after chroot or
 285          *    zone_enter) then change ".." to "." so we can't get
 286          *    out of this subtree.
 287          * 2. If this vnode is the root of a mounted file system,
 288          *    then replace it with the vnode that was mounted on
 289          *    so that we take the ".." in the other file system.
 290          */
 291         if (component[0] == '.' && component[1] == '.' && component[2] == 0) {
 292 checkforroot:
 293                 if (VN_CMP(vp, rootvp) || VN_CMP(vp, zonevp)) {
 294                         component[1] = '\0';
 295                 } else if (vp->v_flag & VROOT) {
 296                         vfs_t *vfsp;
 297                         cvp = vp;
 298
 299                         /*
 300                          * While we deal with the vfs pointer from the vnode
 301                          * the filesystem could have been forcefully unmounted
 302                          * and the vnode's v_vfsp could have been invalidated
 303                          * by VFS_UNMOUNT. Hence, we cache v_vfsp and use it
 304                          * with vfs_rlock_wait/vfs_unlock.
 305                          * It is safe to use the v_vfsp even it is freed by
 306                          * VFS_UNMOUNT because vfs_rlock_wait/vfs_unlock
 307                          * do not dereference v_vfsp. It is just used as a
 308                          * magic cookie.
 309                          * One more corner case here is the memory getting
 310                          * reused for another vfs structure. In this case
 311                          * lookuppnvp's vfs_rlock_wait will succeed, domount's
 312                          * vfs_lock will fail and domount will bail out with an
 313                          * error (EBUSY).
 314                          */
 315                         vfsp = cvp->v_vfsp;
 316
 317                         /*
 318                          * This lock is used to synchronize
 319                          * mounts/unmounts and lookups.
 320                          * Threads doing mounts/unmounts hold the
 321                          * writers version vfs_lock_wait().
 322                          */
 323
 324                         vfs_rlock_wait(vfsp);
 325
 326                         /*
 327                          * If this vnode is on a file system that
 328                          * has been forcibly unmounted,
 329                          * we can't proceed. Cancel this operation
 330                          * and return EIO.
 331                          *
 332                          * vfs_vnodecovered is NULL if unmounted.
 333                          * Currently, nfs uses VFS_UNMOUNTED to
 334                          * check if it's a forced-umount. Keep the
 335                          * same checking here as well even though it
 336                          * may not be needed.
 337                          */
 338                         if (((vp = cvp->v_vfsp->vfs_vnodecovered) == NULL) ||
 339                             (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
 340                                 vfs_unlock(vfsp);
 341                                 VN_RELE(cvp);
 342                                 if (pp)
 343                                         pn_free(pp);
 344                                 return (EIO);
 345                         }
 346                         VN_HOLD(vp);
 347                         vfs_unlock(vfsp);
 348                         VN_RELE(cvp);
 349                         cvp = NULL;
 350                         /*
 351                          * Crossing mount points. For eg: We are doing
 352                          * a lookup of ".." for file systems root vnode
 353                          * mounted here, and fop_lookup() (with covered vnode)
 354                          * will be on underlying file systems mount point
 355                          * vnode. Set retry_with_kcred flag as we might end
 356                          * up doing fop_lookup() with kcred if required.
 357                          */
 358                         retry_with_kcred = B_TRUE;
 359                         goto checkforroot;
 360                 }
 361         }
 362
 363         /*
 364          * LOOKUP_CHECKREAD is a private flag used by vnodetopath() to indicate
 365          * that we need to have read permission on every directory in the entire
 366          * path.  This is used to ensure that a forward-lookup of a cached value
 367          * has the same effect as a reverse-lookup when the cached value cannot
 368          * be found.
 369          */
 370         if ((flags & LOOKUP_CHECKREAD) &&
 371             (error = fop_access(vp, VREAD, 0, cr, NULL)) != 0)
 372                 goto bad;
 373
 374         /*
 375          * Perform a lookup in the current directory.
 376          */
 377         error = fop_lookup(vp, component, &cvp, pnp, lookup_flags,
 378             rootvp, cr, NULL, NULL, pp);
 379
 380         /*
 381          * Retry with kcred - If crossing mount points & error is EACCES.
 382          *
 383          * If we are crossing mount points here and doing ".." lookup,
 384          * fop_lookup() might fail if the underlying file systems
 385          * mount point has no execute permission. In cases like these,
 386          * we retry fop_lookup() by giving as much privilage as possible
 387          * by passing kcred credentials.
 388          *
 389          * In case of hierarchical file systems, passing kcred still may
 390          * or may not work.
 391          * For eg: UFS FS --> Mount NFS FS --> Again mount UFS on some
 392          *                      directory inside NFS FS.
 393          */
 394         if ((error == EACCES) && retry_with_kcred)
 395                 error = fop_lookup(vp, component, &cvp, pnp, lookup_flags,
 396                     rootvp, zone_kcred(), NULL, NULL, pp);
 397
 398         if (error) {
 399                 cvp = NULL;
 400                 /*
 401                  * On error, return hard error if
 402                  * (a) we're not at the end of the pathname yet, or
 403                  * (b) the caller didn't want the parent directory, or
 404                  * (c) we failed for some reason other than a missing entry.
 405                  */
 406                 if (pn_pathleft(pnp) || dirvpp == NULL || error != ENOENT)
 407                         goto bad;
 408                 if (auditing) { /* directory access */
 409                         if (error = audit_savepath(pnp, vp, vp, error, cr))
 410                                 goto bad_noaudit;
 411                 }
 412
 413                 pn_setlast(pnp);
 414                 /*
 415                  * We inform the caller that the desired entry must be
 416                  * a directory by adding a '/' to the component name.
 417                  */
 418                 if (must_be_directory && (error = pn_addslash(pnp)) != 0)
 419                         goto bad;
 420                 *dirvpp = vp;
 421                 if (compvpp != NULL)
 422                         *compvpp = NULL;
 423                 if (rootvp != rootdir)
 424                         VN_RELE(rootvp);
 425                 if (pp)
 426                         pn_free(pp);
 427                 return (0);
 428         }
 429
 430         /*
 431          * Traverse mount points.
 432          * XXX why don't we need to hold a read lock here (call vn_vfsrlock)?
 433          * What prevents a concurrent update to v_vfsmountedhere?
 434          *      Possible answer: if mounting, we might not see the mount
 435          *      if it is concurrently coming into existence, but that's
 436          *      really not much different from the thread running a bit slower.
 437          *      If unmounting, we may get into traverse() when we shouldn't,
 438          *      but traverse() will catch this case for us.
 439          *      (For this to work, fetching v_vfsmountedhere had better
 440          *      be atomic!)
 441          */
 442         if (vn_mountedvfs(cvp) != NULL) {
 443                 if ((error = traverse(&cvp)) != 0)
 444                         goto bad;
 445         }
 446
 447         /*
 448          * If we hit a symbolic link and there is more path to be
 449          * translated or this operation does not wish to apply
 450          * to a link, then place the contents of the link at the
 451          * front of the remaining pathname.
 452          */
 453         if (cvp->v_type == VLNK && ((flags & FOLLOW) || pn_pathleft(pnp))) {
 454                 struct pathname linkpath;
 455
 456                 if (++nlink > MAXSYMLINKS) {
 457                         error = ELOOP;
 458                         goto bad;
 459                 }
 460                 pn_alloc(&linkpath);
 461                 if (error = pn_getsymlink(cvp, &linkpath, cr)) {
 462                         pn_free(&linkpath);
 463                         goto bad;
 464                 }
 465
 466                 if (auditing)
 467                         audit_symlink(pnp, &linkpath);
 468
 469                 if (pn_pathleft(&linkpath) == 0)
 470                         (void) pn_set(&linkpath, ".");
 471                 error = pn_insert(pnp, &linkpath, strlen(component));
 472                 pn_free(&linkpath);
 473                 if (error)
 474                         goto bad;
 475                 VN_RELE(cvp);
 476                 cvp = NULL;
 477                 if (pnp->pn_pathlen == 0) {
 478                         error = ENOENT;
 479                         goto bad;
 480                 }
 481                 if (pnp->pn_path[0] == '/') {
 482                         do {
 483                                 pnp->pn_path++;
 484                                 pnp->pn_pathlen--;
 485                         } while (pnp->pn_path[0] == '/');
 486                         VN_RELE(vp);
 487                         vp = rootvp;
 488                         VN_HOLD(vp);
 489                 }
 490                 if (auditing)
 491                         audit_anchorpath(pnp, vp == rootvp);
 492                 if (pn_fixslash(pnp)) {
 493                         flags |= FOLLOW;
 494                         must_be_directory = 1;
 495                 }
 496                 goto next;
 497         }
 498
 499         /*
 500          * If rpnp is non-NULL, remember the resolved path name therein.
 501          * Do not include "." components.  Collapse occurrences of
 502          * "previous/..", so long as "previous" is not itself "..".
 503          * Exhausting rpnp results in error ENAMETOOLONG.
 504          */
 505         if (rpnp && strcmp(component, ".") != 0) {
 506                 size_t len;
 507
 508                 if (strcmp(component, "..") == 0 &&
 509                     rpnp->pn_pathlen != 0 &&
 510                     !((rpnp->pn_pathlen > 2 &&
 511                     strncmp(rpnp->pn_path+rpnp->pn_pathlen-3, "/..", 3) == 0) ||
 512                     (rpnp->pn_pathlen == 2 &&
 513                     strncmp(rpnp->pn_path, "..", 2) == 0))) {
 514                         while (rpnp->pn_pathlen &&
 515                             rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
 516                                 rpnp->pn_pathlen--;
 517                         if (rpnp->pn_pathlen > 1)
 518                                 rpnp->pn_pathlen--;
 519                         rpnp->pn_path[rpnp->pn_pathlen] = '\0';
 520                 } else {
 521                         if (rpnp->pn_pathlen != 0 &&
 522                             rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
 523                                 rpnp->pn_path[rpnp->pn_pathlen++] = '/';
 524                         if (flags & FIGNORECASE) {
 525                                 /*
 526                                  * Return the case-preserved name
 527                                  * within the resolved path.
 528                                  */
 529                                 error = copystr(pp->pn_buf,
 530                                     rpnp->pn_path + rpnp->pn_pathlen,
 531                                     rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
 532                         } else {
 533                                 error = copystr(component,
 534                                     rpnp->pn_path + rpnp->pn_pathlen,
 535                                     rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
 536                         }
 537                         if (error)      /* copystr() returns ENAMETOOLONG */
 538                                 goto bad;
 539                         rpnp->pn_pathlen += (len - 1);
 540                         ASSERT(rpnp->pn_bufsize > rpnp->pn_pathlen);
 541                 }
 542         }
 543
 544         /*
 545          * If no more components, return last directory (if wanted) and
 546          * last component (if wanted).
 547          */
 548         if (pn_pathleft(pnp) == 0) {
 549                 /*
 550                  * If there was a trailing slash in the pathname,
 551                  * make sure the last component is a directory.
 552                  */
 553                 if (must_be_directory && cvp->v_type != VDIR) {
 554                         error = ENOTDIR;
 555                         goto bad;
 556                 }
 557                 if (dirvpp != NULL) {
 558                         /*
 559                          * Check that we have the real parent and not
 560                          * an alias of the last component.
 561                          */
 562                         if (vn_compare(vp, cvp)) {
 563                                 if (auditing)
 564                                         (void) audit_savepath(pnp, cvp, vp,
 565                                             EINVAL, cr);
 566                                 pn_setlast(pnp);
 567                                 VN_RELE(vp);
 568                                 VN_RELE(cvp);
 569                                 if (rootvp != rootdir)
 570                                         VN_RELE(rootvp);
 571                                 if (pp)
 572                                         pn_free(pp);
 573                                 return (EINVAL);
 574                         }
 575                         *dirvpp = vp;
 576                 } else
 577                         VN_RELE(vp);
 578                 if (auditing)
 579                         (void) audit_savepath(pnp, cvp, vp, 0, cr);
 580                 if (pnp->pn_path == pnp->pn_buf)
 581                         (void) pn_set(pnp, ".");
 582                 else
 583                         pn_setlast(pnp);
 584                 if (rpnp) {
 585                         if (VN_CMP(cvp, rootvp))
 586                                 (void) pn_set(rpnp, "/");
 587                         else if (rpnp->pn_pathlen == 0)
 588                                 (void) pn_set(rpnp, ".");
 589                 }
 590
 591                 if (compvpp != NULL)
 592                         *compvpp = cvp;
 593                 else
 594                         VN_RELE(cvp);
 595                 if (rootvp != rootdir)
 596                         VN_RELE(rootvp);
 597                 if (pp)
 598                         pn_free(pp);
 599                 return (0);
 600         }
 601
 602         /*
 603          * Skip over slashes from end of last component.
 604          */
 605         while (pnp->pn_path[0] == '/') {
 606                 pnp->pn_path++;
 607                 pnp->pn_pathlen--;
 608         }
 609
 610         /*
 611          * Searched through another level of directory:
 612          * release previous directory handle and save new (result
 613          * of lookup) as current directory.
 614          */
 615         VN_RELE(vp);
 616         vp = cvp;
 617         cvp = NULL;
 618         goto next;
 619
 620 bad:
 621         if (auditing)   /* reached end of path */
 622                 (void) audit_savepath(pnp, cvp, vp, error, cr);
 623 bad_noaudit:
 624         /*
 625          * Error.  Release vnodes and return.
 626          */
 627         if (cvp)
 628                 VN_RELE(cvp);
 629         /*
 630          * If the error was ESTALE and the current directory to look in
 631          * was the root for this lookup, the root for a mounted file
 632          * system, or the starting directory for lookups, then
 633          * return ENOENT instead of ESTALE.  In this case, no recovery
 634          * is possible by the higher level.  If ESTALE was returned for
 635          * some intermediate directory along the path, then recovery
 636          * is potentially possible and retrying from the higher level
 637          * will either correct the situation by purging stale cache
 638          * entries or eventually get back to the point where no recovery
 639          * is possible.
 640          */
 641         if (error == ESTALE &&
 642             (VN_CMP(vp, rootvp) || (vp->v_flag & VROOT) || vp == startvp))
 643                 error = ENOENT;
 644         VN_RELE(vp);
 645         if (rootvp != rootdir)
 646                 VN_RELE(rootvp);
 647         if (pp)
 648                 pn_free(pp);
 649         return (error);
 650 }
 651
 652 /*
 653  * Traverse a mount point.  Routine accepts a vnode pointer as a reference
 654  * parameter and performs the indirection, releasing the original vnode.
 655  */
 656 int
 657 traverse(vnode_t **cvpp)
 658 {
 659         int error = 0;
 660         vnode_t *cvp;
 661         vnode_t *tvp;
 662         vfs_t *vfsp;
 663
 664         cvp = *cvpp;
 665
 666         /*
 667          * If this vnode is mounted on, then we transparently indirect
 668          * to the vnode which is the root of the mounted file system.
 669          * Before we do this we must check that an unmount is not in
 670          * progress on this vnode.
 671          */
 672
 673         for (;;) {
 674                 /*
 675                  * Try to read lock the vnode.  If this fails because
 676                  * the vnode is already write locked, then check to
 677                  * see whether it is the current thread which locked
 678                  * the vnode.  If it is not, then read lock the vnode
 679                  * by waiting to acquire the lock.
 680                  *
 681                  * The code path in domount() is an example of support
 682                  * which needs to look up two pathnames and locks one
 683                  * of them in between the two lookups.
 684                  */
 685                 error = vn_vfsrlock(cvp);
 686                 if (error) {
 687                         if (!vn_vfswlock_held(cvp))
 688                                 error = vn_vfsrlock_wait(cvp);
 689                         if (error != 0) {
 690                                 /*
 691                                  * lookuppn() expects a held vnode to be
 692                                  * returned because it promptly calls
 693                                  * VN_RELE after the error return
 694                                  */
 695                                 *cvpp = cvp;
 696                                 return (error);
 697                         }
 698                 }
 699
 700                 /*
 701                  * Reached the end of the mount chain?
 702                  */
 703                 vfsp = vn_mountedvfs(cvp);
 704                 if (vfsp == NULL) {
 705                         vn_vfsunlock(cvp);
 706                         break;
 707                 }
 708
 709                 /*
 710                  * The read lock must be held across the call to VFS_ROOT() to
 711                  * prevent a concurrent unmount from destroying the vfs.
 712                  */
 713                 error = VFS_ROOT(vfsp, &tvp);
 714                 vn_vfsunlock(cvp);
 715
 716                 if (error)
 717                         break;
 718
 719                 VN_RELE(cvp);
 720
 721                 cvp = tvp;
 722         }
 723
 724         *cvpp = cvp;
 725         return (error);
 726 }
 727
 728 /*
 729  * Return the lowermost vnode if this is a mountpoint.
 730  */
 731 static vnode_t *
 732 vn_under(vnode_t *vp)
 733 {
 734         vnode_t *uvp;
 735         vfs_t *vfsp;
 736
 737         while (vp->v_flag & VROOT) {
 738
 739                 vfsp = vp->v_vfsp;
 740                 vfs_rlock_wait(vfsp);
 741                 if ((uvp = vfsp->vfs_vnodecovered) == NULL ||
 742                     (vfsp->vfs_flag & VFS_UNMOUNTED)) {
 743                         vfs_unlock(vfsp);
 744                         break;
 745                 }
 746                 VN_HOLD(uvp);
 747                 vfs_unlock(vfsp);
 748                 VN_RELE(vp);
 749                 vp = uvp;
 750         }
 751
 752         return (vp);
 753 }
 754
 755 static int
 756 vnode_match(vnode_t *v1, vnode_t *v2, cred_t *cr)
 757 {
 758         vattr_t v1attr, v2attr;
 759
 760         /*
 761          * If we have a device file, check to see if is a cloned open of the
 762          * same device.  For self-cloning devices, the major numbers will match.
 763          * For devices cloned through the 'clone' driver, the minor number of
 764          * the source device will be the same as the major number of the cloned
 765          * device.
 766          */
 767         if ((v1->v_type == VCHR || v1->v_type == VBLK) &&
 768             v1->v_type == v2->v_type) {
 769                 if ((spec_is_selfclone(v1) || spec_is_selfclone(v2)) &&
 770                     getmajor(v1->v_rdev) == getmajor(v2->v_rdev))
 771                         return (1);
 772
 773                 if (spec_is_clone(v1) &&
 774                     getmajor(v1->v_rdev) == getminor(v2->v_rdev))
 775                         return (1);
 776
 777                 if (spec_is_clone(v2) &&
 778                     getmajor(v2->v_rdev) == getminor(v1->v_rdev))
 779                         return (1);
 780         }
 781
 782         v1attr.va_mask = v2attr.va_mask = AT_TYPE;
 783
 784         /*
 785          * This check for symbolic links handles the pseudo-symlinks in procfs.
 786          * These particular links have v_type of VDIR, but the attributes have a
 787          * type of VLNK.  We need to avoid these links because otherwise if we
 788          * are currently in '/proc/self/fd', then '/proc/self/cwd' will compare
 789          * as the same vnode.
 790          */
 791         if (fop_getattr(v1, &v1attr, 0, cr, NULL) != 0 ||
 792             fop_getattr(v2, &v2attr, 0, cr, NULL) != 0 ||
 793             v1attr.va_type == VLNK || v2attr.va_type == VLNK)
 794                 return (0);
 795
 796         v1attr.va_mask = v2attr.va_mask = AT_TYPE | AT_FSID | AT_NODEID;
 797
 798         if (fop_getattr(v1, &v1attr, ATTR_REAL, cr, NULL) != 0 ||
 799             fop_getattr(v2, &v2attr, ATTR_REAL, cr, NULL) != 0)
 800                 return (0);
 801
 802         return (v1attr.va_fsid == v2attr.va_fsid &&
 803             v1attr.va_nodeid == v2attr.va_nodeid);
 804 }
 805
 806
 807 /*
 808  * Find the entry in the directory corresponding to the target vnode.
 809  */
 810 int
 811 dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 812     size_t dlen, dirent64_t **rdp)
 813 {
 814         size_t dbuflen;
 815         struct iovec iov;
 816         struct uio uio;
 817         int error;
 818         int eof;
 819         vnode_t *cmpvp;
 820         struct dirent64 *dp;
 821         pathname_t pnp;
 822
 823         ASSERT(dvp->v_type == VDIR);
 824
 825         /*
 826          * This is necessary because of the strange semantics of fop_lookup().
 827          */
 828         bzero(&pnp, sizeof (pnp));
 829
 830         eof = 0;
 831
 832         uio.uio_iov = &iov;
 833         uio.uio_iovcnt = 1;
 834         uio.uio_segflg = UIO_SYSSPACE;
 835         uio.uio_fmode = 0;
 836         uio.uio_extflg = UIO_COPY_CACHED;
 837         uio.uio_loffset = 0;
 838
 839         if ((error = fop_access(dvp, VREAD, 0, cr, NULL)) != 0)
 840                 return (error);
 841
 842         while (!eof) {
 843                 uio.uio_resid = dlen;
 844                 iov.iov_base = dbuf;
 845                 iov.iov_len = dlen;
 846
 847                 (void) fop_rwlock(dvp, V_WRITELOCK_FALSE, NULL);
 848                 error = fop_readdir(dvp, &uio, cr, &eof, NULL, 0);
 849                 fop_rwunlock(dvp, V_WRITELOCK_FALSE, NULL);
 850
 851                 dbuflen = dlen - uio.uio_resid;
 852
 853                 if (error || dbuflen == 0)
 854                         break;
 855
 856                 dp = (dirent64_t *)dbuf;
 857                 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
 858                         /*
 859                          * Ignore '.' and '..' entries
 860                          */
 861                         if (strcmp(dp->d_name, ".") == 0 ||
 862                             strcmp(dp->d_name, "..") == 0) {
 863                                 dp = (dirent64_t *)((intptr_t)dp +
 864                                     dp->d_reclen);
 865                                 continue;
 866                         }
 867
 868                         error = fop_lookup(dvp, dp->d_name, &cmpvp, &pnp, 0,
 869                             vrootp, cr, NULL, NULL, NULL);
 870
 871                         /*
 872                          * We only want to bail out if there was an error other
 873                          * than ENOENT.  Otherwise, it could be that someone
 874                          * just removed an entry since the readdir() call, and
 875                          * the entry we want is further on in the directory.
 876                          */
 877                         if (error == 0) {
 878                                 if (vnode_match(tvp, cmpvp, cr)) {
 879                                         VN_RELE(cmpvp);
 880                                         *rdp = dp;
 881                                         return (0);
 882                                 }
 883
 884                                 VN_RELE(cmpvp);
 885                         } else if (error != ENOENT) {
 886                                 return (error);
 887                         }
 888
 889                         dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
 890                 }
 891         }
 892
 893         /*
 894          * Something strange has happened, this directory does not contain the
 895          * specified vnode.  This should never happen in the normal case, since
 896          * we ensured that dvp is the parent of vp.  This is possible in some
 897          * rare conditions (races and the special .zfs directory).
 898          */
 899         if (error == 0) {
 900                 error = fop_lookup(dvp, ".zfs", &cmpvp, &pnp, 0, vrootp, cr,
 901                     NULL, NULL, NULL);
 902                 if (error == 0) {
 903                         if (vnode_match(tvp, cmpvp, cr)) {
 904                                 (void) strcpy(dp->d_name, ".zfs");
 905                                 dp->d_reclen = strlen(".zfs");
 906                                 dp->d_off = 2;
 907                                 dp->d_ino = 1;
 908                                 *rdp = dp;
 909                         } else {
 910                                 error = ENOENT;
 911                         }
 912                         VN_RELE(cmpvp);
 913                 }
 914         }
 915
 916         return (error);
 917 }
 918
 919 /*
 920  * Given a global path (from rootdir), and a vnode that is the current root,
 921  * return the portion of the path that is beneath the current root or NULL on
 922  * failure.  The path MUST be a resolved path (no '..' entries or symlinks),
 923  * otherwise this function will fail.
 924  */
 925 static char *
 926 localpath(char *path, struct vnode *vrootp, cred_t *cr)
 927 {
 928         vnode_t *vp;
 929         vnode_t *cvp;
 930         char component[MAXNAMELEN];
 931         char *ret = NULL;
 932         pathname_t pn;
 933
 934         /*
 935          * We use vn_compare() instead of VN_CMP() in order to detect lofs
 936          * mounts and stacked vnodes.
 937          */
 938         if (vn_compare(vrootp, rootdir))
 939                 return (path);
 940
 941         if (pn_get(path, UIO_SYSSPACE, &pn) != 0)
 942                 return (NULL);
 943
 944         vp = rootdir;
 945         VN_HOLD(vp);
 946
 947         if (vn_ismntpt(vp) && traverse(&vp) != 0) {
 948                 VN_RELE(vp);
 949                 pn_free(&pn);
 950                 return (NULL);
 951         }
 952
 953         while (pn_pathleft(&pn)) {
 954                 pn_skipslash(&pn);
 955
 956                 if (pn_getcomponent(&pn, component) != 0)
 957                         break;
 958
 959                 if (fop_lookup(vp, component, &cvp, &pn, 0, rootdir, cr,
 960                     NULL, NULL, NULL) != 0)
 961                         break;
 962                 VN_RELE(vp);
 963                 vp = cvp;
 964
 965                 if (vn_ismntpt(vp) && traverse(&vp) != 0)
 966                         break;
 967
 968                 if (vn_compare(vp, vrootp)) {
 969                         ret = path + (pn.pn_path - pn.pn_buf);
 970                         break;
 971                 }
 972         }
 973
 974         VN_RELE(vp);
 975         pn_free(&pn);
 976
 977         return (ret);
 978 }
 979
 980 /*
 981  * Clean a stale v_path from a vnode.  This is only performed if the v_path has
 982  * not been altered since it was found to be stale
 983  */
 984 static void
 985 vnode_clear_vpath(vnode_t *vp, char *vpath_old)
 986 {
 987         mutex_enter(&vp->v_lock);
 988         if (vp->v_path != vn_vpath_empty && vp->v_path == vpath_old) {
 989                 vp->v_path = vn_vpath_empty;
 990                 mutex_exit(&vp->v_lock);
 991                 kmem_free(vpath_old, strlen(vpath_old) + 1);
 992         } else {
 993                 mutex_exit(&vp->v_lock);
 994         }
 995 }
 996
 997 /*
 998  * Validate that a pathname refers to a given vnode.
 999  */
1000 static int
1001 vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn,
1002     int flags, cred_t *cr)
1003 {
1004         vnode_t *compvp;
1005         /*
1006          * If we are in a zone or a chroot environment, then we have to
1007          * take additional steps, since the path to the root might not
1008          * be readable with the current credentials, even though the
1009          * process can legitmately access the file.  In this case, we
1010          * do the following:
1011          *
1012          * lookuppnvp() with all privileges to get the resolved path.
1013          * call localpath() to get the local portion of the path, and
1014          * continue as normal.
1015          *
1016          * If the the conversion to a local path fails, then we continue
1017          * as normal.  This is a heuristic to make process object file
1018          * paths available from within a zone.  Because lofs doesn't
1019          * support page operations, the vnode stored in the seg_t is
1020          * actually the underlying real vnode, not the lofs node itself.
1021          * Most of the time, the lofs path is the same as the underlying
1022          * vnode (for example, /usr/lib/libc.so.1).
1023          */
1024         if (vrootp != rootdir) {
1025                 char *local = NULL;
1026
1027                 VN_HOLD(rootdir);
1028                 if (lookuppnvp(pn, rpn, FOLLOW, NULL, &compvp, rootdir,
1029                     rootdir, kcred) == 0) {
1030                         local = localpath(rpn->pn_path, vrootp, kcred);
1031                         VN_RELE(compvp);
1032                 }
1033
1034                 /*
1035                  * The original pn was changed through lookuppnvp().
1036                  * Set it to local for next validation attempt.
1037                  */
1038                 if (local) {
1039                         (void) pn_set(pn, local);
1040                 } else {
1041                         return (1);
1042                 }
1043         }
1044
1045         /*
1046          * We should have a local path at this point, so start the search from
1047          * the root of the current process.
1048          */
1049         VN_HOLD(vrootp);
1050         if (vrootp != rootdir)
1051                 VN_HOLD(vrootp);
1052         if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp,
1053             cr) == 0) {
1054                 /*
1055                  * Check to see if the returned vnode is the same as the one we
1056                  * expect.
1057                  */
1058                 if (vn_compare(vp, compvp) ||
1059                     vnode_match(vp, compvp, cr)) {
1060                         VN_RELE(compvp);
1061                         return (0);
1062                 } else {
1063                         VN_RELE(compvp);
1064                 }
1065         }
1066
1067         return (1);
1068 }
1069
1070 /*
1071  * Struct for tracking vnodes with invalidated v_path entries during a
1072  * dirtopath reverse lookup.  By keeping adequate state, those vnodes can be
1073  * revisted to populate v_path.
1074  */
1075 struct dirpath_walk {
1076         struct dirpath_walk     *dw_next;
1077         vnode_t                 *dw_vnode;
1078         vnode_t                 *dw_pvnode;
1079         size_t                  dw_len;
1080         char                    *dw_name;
1081 };
1082
1083 /*
1084  * Given a directory, return the full, resolved path.  This looks up "..",
1085  * searches for the given vnode in the parent, appends the component, etc.  It
1086  * is used to implement vnodetopath() and getcwd() when the cached path fails.
1087  */
1088 static int
1089 dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
1090     cred_t *cr)
1091 {
1092         pathname_t      pn, rpn, emptypn;
1093         vnode_t         *pvp = NULL, *startvp = vp;
1094         int             err = 0;
1095         size_t          complen;
1096         dirent64_t      *dp;
1097         char            *bufloc, *dbuf;
1098         const size_t    dlen = DIRENT64_RECLEN(MAXPATHLEN);
1099         struct dirpath_walk *dw_chain = NULL, *dw_entry;
1100
1101         /* Operation only allowed on directories */
1102         ASSERT(vp->v_type == VDIR);
1103
1104         /* We must have at least enough space for "/" */
1105         if (buflen < 2)
1106                 return (ENAMETOOLONG);
1107
1108         /* Start at end of string with terminating null */
1109         bufloc = &buf[buflen - 1];
1110         *bufloc = '\0';
1111
1112         pn_alloc(&pn);
1113         pn_alloc(&rpn);
1114         dbuf = kmem_alloc(dlen, KM_SLEEP);
1115         bzero(&emptypn, sizeof (emptypn));
1116
1117         /*
1118          * Begin with an additional reference on vp.  This will be decremented
1119          * during the loop.
1120          */
1121         VN_HOLD(vp);
1122
1123         for (;;) {
1124                 int vprivs;
1125                 hrtime_t cached_stamp;
1126
1127                 /*
1128                  * Return if we've reached the root.  If the buffer is empty,
1129                  * return '/'.  We explicitly don't use vn_compare(), since it
1130                  * compares the real vnodes.  A lofs mount of '/' would produce
1131                  * incorrect results otherwise.
1132                  */
1133                 if (VN_CMP(vrootp, vp)) {
1134                         if (*bufloc == '\0')
1135                                 *--bufloc = '/';
1136                         break;
1137                 }
1138
1139                 /*
1140                  * If we've reached the VFS root, something has gone wrong.  We
1141                  * should have reached the root in the above check.  The only
1142                  * explantation is that 'vp' is not contained withing the given
1143                  * root, in which case we return EPERM.
1144                  */
1145                 if (VN_CMP(rootdir, vp)) {
1146                         err = EPERM;
1147                         goto out;
1148                 }
1149
1150                 /*
1151                  * Shortcut: see if this vnode has correct v_path. If so,
1152                  * we have the work done.
1153                  */
1154                 mutex_enter(&vp->v_lock);
1155                 if (vp->v_path != vn_vpath_empty &&
1156                     pn_set(&pn, vp->v_path) == 0) {
1157                         cached_stamp = vp->v_path_stamp;
1158                         mutex_exit(&vp->v_lock);
1159                         rpn.pn_path = rpn.pn_buf;
1160
1161                         /* Ensure the v_path pointing to correct vnode */
1162                         if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags,
1163                             cr) == 0) {
1164                                 complen = strlen(rpn.pn_path);
1165                                 bufloc -= complen;
1166                                 if (bufloc < buf) {
1167                                         err = ERANGE;
1168                                         goto out;
1169                                 }
1170                                 bcopy(rpn.pn_path, bufloc, complen);
1171                                 break;
1172                         } else {
1173                                 /*
1174                                  * Immediately nuke cached v_path entries known
1175                                  * to be invalid.
1176                                  */
1177                                 vn_clearpath(vp, cached_stamp);
1178                         }
1179                 } else {
1180                         mutex_exit(&vp->v_lock);
1181                 }
1182
1183                 /*
1184                  * Shortcuts failed, search for this vnode in its parent.  If
1185                  * this is a mountpoint, then get the vnode underneath.
1186                  */
1187                 if (vp->v_flag & VROOT)
1188                         vp = vn_under(vp);
1189                 if ((err = fop_lookup(vp, "..", &pvp, &emptypn, 0, vrootp, cr,
1190                     NULL, NULL, NULL)) != 0)
1191                         goto out;
1192
1193                 /*
1194                  * With extended attributes, it's possible for a directory to
1195                  * have a parent that is a regular file.  Check for that here.
1196                  */
1197                 if (pvp->v_type != VDIR) {
1198                         err = ENOTDIR;
1199                         goto out;
1200                 }
1201
1202                 /*
1203                  * If this is true, something strange has happened.  This is
1204                  * only true if we are the root of a filesystem, which should
1205                  * have been caught by the check above.
1206                  */
1207                 if (VN_CMP(pvp, vp)) {
1208                         err = ENOENT;
1209                         goto out;
1210                 }
1211
1212                 /*
1213                  * Check if we have read and search privilege so, that
1214                  * we can lookup the path in the directory
1215                  */
1216                 vprivs = (flags & LOOKUP_CHECKREAD) ? VREAD | VEXEC : VEXEC;
1217                 if ((err = fop_access(pvp, vprivs, 0, cr, NULL)) != 0) {
1218                         goto out;
1219                 }
1220
1221                 /*
1222                  * Search the parent directory for the entry corresponding to
1223                  * this vnode.
1224                  */
1225                 if ((err = dirfindvp(vrootp, pvp, vp, cr, dbuf, dlen, &dp))
1226                     != 0)
1227                         goto out;
1228                 complen = strlen(dp->d_name);
1229                 bufloc -= complen;
1230                 if (bufloc <= buf) {
1231                         err = ENAMETOOLONG;
1232                         goto out;
1233                 }
1234                 bcopy(dp->d_name, bufloc, complen);
1235
1236                 /* Prepend a slash to the current path.  */
1237                 *--bufloc = '/';
1238
1239                 /*
1240                  * Record the name and directory for later reconstruction and
1241                  * link it up with the others.
1242                  */
1243                 dw_entry = kmem_alloc(sizeof (*dw_entry), KM_SLEEP);
1244                 dw_entry->dw_name = kmem_alloc(complen + 1, KM_SLEEP);
1245                 VN_HOLD(dw_entry->dw_vnode = vp);
1246                 VN_HOLD(dw_entry->dw_pvnode = pvp);
1247                 bcopy(dp->d_name, dw_entry->dw_name, complen + 1);
1248                 dw_entry->dw_len = complen;
1249                 dw_entry->dw_next = dw_chain;
1250                 dw_chain = dw_entry;
1251
1252                 /* And continue with the next component */
1253                 VN_RELE(vp);
1254                 vp = pvp;
1255                 pvp = NULL;
1256         }
1257
1258         /*
1259          * Place the path at the beginning of the buffer.
1260          */
1261         if (bufloc != buf)
1262                 ovbcopy(bufloc, buf, buflen - (bufloc - buf));
1263
1264 out:
1265         /*
1266          * Walk over encountered directory entries which were afflicted with a
1267          * stale or absent v_path.  If the dirtopath was successful, we should
1268          * possess the necessary information to populate all of them with a
1269          * valid v_path.
1270          *
1271          * While processing this list, it is safe to call vn_setpath despite
1272          * the fact that racing vnode actions may have altered v_path entries
1273          * while the above loopwas still executing.  Any updated entries will
1274          * have a newer v_path_stamp value which prevents an invalid overwrite.
1275          *
1276          * If an error was encountered during the search, freeing the chain is
1277          * still required.
1278          */
1279         dw_entry = dw_chain;
1280         while (dw_entry != NULL) {
1281                 struct dirpath_walk *next = dw_entry->dw_next;
1282
1283                 if (err == 0) {
1284                         vn_setpath(NULL, dw_entry->dw_pvnode,
1285                             dw_entry->dw_vnode, dw_entry->dw_name,
1286                             dw_entry->dw_len);
1287                 }
1288
1289                 VN_RELE(dw_entry->dw_vnode);
1290                 VN_RELE(dw_entry->dw_pvnode);
1291                 kmem_free(dw_entry->dw_name, dw_entry->dw_len + 1);
1292                 kmem_free(dw_entry, sizeof (*dw_entry));
1293                 dw_entry = next;
1294         }
1295
1296         /*
1297          * If the error was ESTALE and the current directory to look in
1298          * was the root for this lookup, the root for a mounted file
1299          * system, or the starting directory for lookups, then
1300          * return ENOENT instead of ESTALE.  In this case, no recovery
1301          * is possible by the higher level.  If ESTALE was returned for
1302          * some intermediate directory along the path, then recovery
1303          * is potentially possible and retrying from the higher level
1304          * will either correct the situation by purging stale cache
1305          * entries or eventually get back to the point where no recovery
1306          * is possible.
1307          */
1308         if (err == ESTALE &&
1309             (VN_CMP(vp, vrootp) || (vp->v_flag & VROOT) || vp == startvp))
1310                 err = ENOENT;
1311
1312         kmem_free(dbuf, dlen);
1313         VN_RELE(vp);
1314         if (pvp)
1315                 VN_RELE(pvp);
1316         pn_free(&pn);
1317         pn_free(&rpn);
1318
1319         return (err);
1320 }
1321
1322 /*
1323  * The additional flag, LOOKUP_CHECKREAD, is used to enforce artificial
1324  * constraints in order to be standards compliant.  For example, if we have
1325  * the cached path of '/foo/bar', and '/foo' has permissions 100 (execute
1326  * only), then we can legitimately look up the path to the current working
1327  * directory without needing read permission.  Existing standards tests,
1328  * however, assume that we are determining the path by repeatedly looking up
1329  * "..".  We need to keep this behavior in order to maintain backwards
1330  * compatibility.
1331  */
1332 static int
1333 vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen,
1334     cred_t *cr, int flags)
1335 {
1336         pathname_t pn;
1337         int ret = 0;
1338         vnode_t *realvp;
1339         boolean_t doclose = B_FALSE;
1340
1341         /*
1342          * If vrootp is NULL, get the root for curproc.  Callers with any other
1343          * requirements should pass in a different vrootp.
1344          */
1345         if (vrootp == NULL) {
1346                 proc_t *p = curproc;
1347
1348                 mutex_enter(&p->p_lock);
1349                 if ((vrootp = PTOU(p)->u_rdir) == NULL)
1350                         vrootp = rootdir;
1351                 VN_HOLD(vrootp);
1352                 mutex_exit(&p->p_lock);
1353         } else {
1354                 VN_HOLD(vrootp);
1355         }
1356
1357         /*
1358          * This is to get around an annoying artifact of the /proc filesystem,
1359          * which is the behavior of {cwd/root}.  Trying to resolve this path
1360          * will result in /proc/pid/cwd instead of whatever the real working
1361          * directory is.  We can't rely on fop_realvp(), since that will break
1362          * lofs.  The only difference between procfs and lofs is that opening
1363          * the file will return the underling vnode in the case of procfs.
1364          */
1365         if (vp->v_type == VDIR && fop_realvp(vp, &realvp, NULL) == 0 &&
1366             realvp != vp) {
1367                 VN_HOLD(vp);
1368                 if (fop_open(&vp, FREAD, cr, NULL) == 0)
1369                         doclose = B_TRUE;
1370                 else
1371                         VN_RELE(vp);
1372         }
1373
1374         /*
1375          * Check to see if we have a valid cached path in the vnode.
1376          */
1377         pn_alloc(&pn);
1378         mutex_enter(&vp->v_lock);
1379         if (vp->v_path != vn_vpath_empty) {
1380                 hrtime_t cached_stamp;
1381                 pathname_t rpn;
1382
1383                 cached_stamp = vp->v_path_stamp;
1384                 (void) pn_set(&pn, vp->v_path);
1385                 mutex_exit(&vp->v_lock);
1386
1387                 /* We should only cache absolute paths */
1388                 ASSERT(pn.pn_buf[0] == '/');
1389
1390                 pn_alloc(&rpn);
1391                 if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) {
1392                         /* Return the result, if we're able. */
1393                         if (buflen > rpn.pn_pathlen) {
1394                                 bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1);
1395                         } else {
1396                                 ret = ENAMETOOLONG;
1397                         }
1398                         pn_free(&pn);
1399                         pn_free(&rpn);
1400                         goto out;
1401                 }
1402                 pn_free(&rpn);
1403                 vn_clearpath(vp, cached_stamp);
1404         } else {
1405                 mutex_exit(&vp->v_lock);
1406         }
1407         pn_free(&pn);
1408
1409         if (vp->v_type != VDIR) {
1410                 /*
1411                  * The reverse lookup tricks used by dirtopath aren't possible
1412                  * for non-directory entries.  The best which can be done is
1413                  * clearing any stale v_path so later lookups can potentially
1414                  * repopulate it with a valid path.
1415                  */
1416                 ret = ENOENT;
1417         } else {
1418                 ret = dirtopath(vrootp, vp, buf, buflen, flags, cr);
1419         }
1420
1421 out:
1422         VN_RELE(vrootp);
1423         if (doclose) {
1424                 (void) fop_close(vp, FREAD, 1, 0, cr, NULL);
1425                 VN_RELE(vp);
1426         }
1427
1428         return (ret);
1429 }
1430
1431 int
1432 vnodetopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, cred_t *cr)
1433 {
1434         return (vnodetopath_common(vrootp, vp, buf, buflen, cr, 0));
1435 }
1436
1437 int
1438 dogetcwd(char *buf, size_t buflen)
1439 {
1440         int ret;
1441         vnode_t *vp;
1442         vnode_t *compvp;
1443         refstr_t *cwd, *oldcwd;
1444         const char *value;
1445         pathname_t rpnp, pnp;
1446         proc_t *p = curproc;
1447
1448         /*
1449          * Check to see if there is a cached version of the cwd.  If so, lookup
1450          * the cached value and make sure it is the same vnode.
1451          */
1452         mutex_enter(&p->p_lock);
1453         if ((cwd = PTOU(p)->u_cwd) != NULL)
1454                 refstr_hold(cwd);
1455         vp = PTOU(p)->u_cdir;
1456         VN_HOLD(vp);
1457         mutex_exit(&p->p_lock);
1458
1459         /*
1460          * Make sure we have permission to access the current directory.
1461          */
1462         if ((ret = fop_access(vp, VEXEC, 0, CRED(), NULL)) != 0) {
1463                 if (cwd != NULL)
1464                         refstr_rele(cwd);
1465                 VN_RELE(vp);
1466                 return (ret);
1467         }
1468
1469         if (cwd) {
1470                 value = refstr_value(cwd);
1471                 if ((ret = pn_get((char *)value, UIO_SYSSPACE, &pnp)) != 0) {
1472                         refstr_rele(cwd);
1473                         VN_RELE(vp);
1474                         return (ret);
1475                 }
1476
1477                 pn_alloc(&rpnp);
1478
1479                 if (lookuppn(&pnp, &rpnp, NO_FOLLOW, NULL, &compvp) == 0) {
1480
1481                         if (VN_CMP(vp, compvp) &&
1482                             strcmp(value, rpnp.pn_path) == 0) {
1483                                 VN_RELE(compvp);
1484                                 VN_RELE(vp);
1485                                 pn_free(&pnp);
1486                                 pn_free(&rpnp);
1487                                 if (strlen(value) + 1 > buflen) {
1488                                         refstr_rele(cwd);
1489                                         return (ENAMETOOLONG);
1490                                 }
1491                                 bcopy(value, buf, strlen(value) + 1);
1492                                 refstr_rele(cwd);
1493                                 return (0);
1494                         }
1495
1496                         VN_RELE(compvp);
1497                 }
1498
1499                 pn_free(&rpnp);
1500                 pn_free(&pnp);
1501
1502                 refstr_rele(cwd);
1503         }
1504
1505         ret = vnodetopath_common(NULL, vp, buf, buflen, CRED(),
1506             LOOKUP_CHECKREAD);
1507
1508         VN_RELE(vp);
1509
1510         /*
1511          * Store the new cwd and replace the existing cached copy.
1512          */
1513         if (ret == 0)
1514                 cwd = refstr_alloc(buf);
1515         else
1516                 cwd = NULL;
1517
1518         mutex_enter(&p->p_lock);
1519         oldcwd = PTOU(p)->u_cwd;
1520         PTOU(p)->u_cwd = cwd;
1521         mutex_exit(&p->p_lock);
1522
1523         if (oldcwd)
1524                 refstr_rele(oldcwd);
1525
1526         return (ret);
1527 }