2 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/kern/vfs_nlookup.c,v 1.24 2008/05/09 17:52:17 dillon Exp $
37 * nlookup() is the 'new' namei interface. Rather than return directory and
38 * leaf vnodes (in various lock states) the new interface instead deals in
39 * namecache records. Namecache records may represent either a positive or
40 * a negative hit. The namespace is locked via the namecache record instead
41 * of via the vnode, and only the leaf namecache record (representing the
42 * filename) needs to be locked.
44 * This greatly improves filesystem parallelism and is a huge simplification
45 * of the API versus the old vnode locking / namei scheme.
47 * Filesystems must actively control the caching aspects of the namecache,
48 * and since namecache pointers are used as handles they are non-optional
49 * even for filesystems which do not generally wish to cache things. It is
50 * intended that a separate cache coherency API will be constructed to handle
54 #include "opt_ktrace.h"
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/vnode.h>
60 #include <sys/mount.h>
61 #include <sys/filedesc.h>
63 #include <sys/namei.h>
64 #include <sys/nlookup.h>
65 #include <sys/malloc.h>
67 #include <sys/objcache.h>
70 #include <sys/ktrace.h>
74 * Initialize a nlookup() structure, early error return for copyin faults
75 * or a degenerate empty string (which is not allowed).
77 * The first process proc0's credentials are used if the calling thread
78 * is not associated with a process context.
/*
 * nlookup_init() - fill in nd so that a later lookup can walk path.
 *
 *   nd    - lookup state; it is zeroed before any field is filled in.
 *   path  - pathname to copy into the lookup buffer.
 *   seg   - compared against UIO_SYSSPACE to choose between the copystr()
 *           and copyinstr() calls below.
 *   flags - ORed into nd->nl_flags at the end.
 *
 * The path buffer is taken from namei_oc and NLC_HASBUF is set to record
 * that nd holds it.  Two groups of statements then fill in the starting,
 * root and jail nchandles plus the credential: one reads them from
 * p->p_fd and p->p_ucred, the other falls back to rootnch and
 * proc0.p_ucred.
 *
 * NOTE(review): this listing has lost lines (return type, local
 * declarations, braces, else/return statements).  Which condition selects
 * each group, what happens when the copy fails or the path is empty, and
 * the value returned are not visible here -- confirm against the full file.
 */
81 nlookup_init(struct nlookupdata
*nd
,
82 const char *path
, enum uio_seg seg
, int flags
)
93 * note: the pathlen set by copy*str() includes the terminating \0.
95 bzero(nd
, sizeof(struct nlookupdata
));
/* Path buffer comes from namei_oc; NLC_HASBUF marks nd as holding it. */
96 nd
->nl_path
= objcache_get(namei_oc
, M_WAITOK
);
97 nd
->nl_flags
|= NLC_HASBUF
;
/* Both copy calls bound the copy at MAXPATHLEN and report pathlen. */
98 if (seg
== UIO_SYSSPACE
)
99 error
= copystr(path
, nd
->nl_path
, MAXPATHLEN
, &pathlen
);
101 error
= copyinstr(path
, nd
->nl_path
, MAXPATHLEN
, &pathlen
);
104 * Don't allow empty pathnames.
105 * POSIX.1 requirement: "" is not a vaild file name.
107 if (error
== 0 && pathlen
<= 1)
112 cache_copy(&p
->p_fd
->fd_ncdir
, &nd
->nl_nch
);
113 cache_copy(&p
->p_fd
->fd_nrdir
, &nd
->nl_rootnch
);
114 if (p
->p_fd
->fd_njdir
.ncp
)
115 cache_copy(&p
->p_fd
->fd_njdir
, &nd
->nl_jailnch
);
116 nd
->nl_cred
= crhold(p
->p_ucred
);
118 cache_copy(&rootnch
, &nd
->nl_nch
);
119 cache_copy(&nd
->nl_nch
, &nd
->nl_rootnch
);
120 cache_copy(&nd
->nl_nch
, &nd
->nl_jailnch
);
121 nd
->nl_cred
= crhold(proc0
.p_ucred
);
124 nd
->nl_flags
|= flags
;
132 * This works similarly to nlookup_init() but does not assume a process
133 * context. rootnch is always chosen for the root directory and the cred
134 * and starting directory are supplied in arguments.
/*
 * nlookup_init_raw() - like the process-based initializer, but every piece
 * of context is passed in by the caller.
 *
 *   nd      - lookup state; zeroed before use.
 *   path    - pathname to copy into the lookup buffer.
 *   seg     - compared against UIO_SYSSPACE to choose between the
 *             copystr() and copyinstr() calls below.
 *   flags   - ORed into nd->nl_flags at the end.
 *   cred    - credential; a reference from crhold() is stored in nl_cred.
 *   ncstart - starting nchandle, copied into nd->nl_nch.
 *
 * rootnch is copied into both nl_rootnch and nl_jailnch.
 *
 * NOTE(review): the listing has lost lines (return type, declarations,
 * braces, else/return statements), so the error path and the return value
 * are not visible here -- confirm against the full file.
 */
137 nlookup_init_raw(struct nlookupdata
*nd
,
138 const char *path
, enum uio_seg seg
, int flags
,
139 struct ucred
*cred
, struct nchandle
*ncstart
)
147 bzero(nd
, sizeof(struct nlookupdata
));
/* Path buffer comes from namei_oc; NLC_HASBUF marks nd as holding it. */
148 nd
->nl_path
= objcache_get(namei_oc
, M_WAITOK
);
149 nd
->nl_flags
|= NLC_HASBUF
;
150 if (seg
== UIO_SYSSPACE
)
151 error
= copystr(path
, nd
->nl_path
, MAXPATHLEN
, &pathlen
);
153 error
= copyinstr(path
, nd
->nl_path
, MAXPATHLEN
, &pathlen
);
156 * Don't allow empty pathnames.
157 * POSIX.1 requirement: "" is not a vaild file name.
159 if (error
== 0 && pathlen
<= 1)
163 cache_copy(ncstart
, &nd
->nl_nch
);
164 cache_copy(&rootnch
, &nd
->nl_rootnch
);
165 cache_copy(&rootnch
, &nd
->nl_jailnch
);
166 nd
->nl_cred
= crhold(cred
);
168 nd
->nl_flags
|= flags
;
176 * Set a different credential; this credential will be used by future
177 * operations performed on nd.nl_open_vp and nlookupdata structure.
/*
 * nlookup_set_cred() - switch nd over to the credential cred.
 *
 * Asserts that nd already carries a credential, then acts only when the
 * stored credential differs from cred.
 *
 * NOTE(review): the statements inside the if-block are missing from this
 * listing; presumably the old reference is released and a new one taken --
 * confirm against the full file.
 */
180 nlookup_set_cred(struct nlookupdata
*nd
, struct ucred
*cred
)
/* nd must already have a credential installed. */
182 KKASSERT(nd
->nl_cred
!= NULL
);
184 if (nd
->nl_cred
!= cred
) {
192 * Cleanup a nlookupdata structure after we are through with it. This may
193 * be called on any nlookupdata structure initialized with nlookup_init().
194 * Calling nlookup_done() is mandatory in all cases except where nlookup_init()
195 * returns an error, even if as a consumer you believe you have taken all
196 * dynamic elements out of the nlookupdata structure.
/*
 * nlookup_done() - release what nd still holds.
 *
 * Visible steps, in order:
 *   - nl_nch: unlocked first when NLC_NCPISLOCKED is set (the flag is
 *     cleared), then dropped;
 *   - nl_rootnch and nl_jailnch: dropped when their ncp is non-NULL;
 *   - nl_path: returned to namei_oc when NLC_HASBUF is set and the pointer
 *     is non-NULL;
 *   - nl_open_vp: unlocked when NLC_LOCKVP is set (the flag is cleared),
 *     closed with nl_vp_fmode and set to NULL;
 *   - nl_flags is reset to 0 last.
 *
 * NOTE(review): several original lines are missing from this listing, so
 * other cleanup (for example of the credential) may exist but cannot be
 * seen here -- confirm against the full file.
 */
199 nlookup_done(struct nlookupdata
*nd
)
201 if (nd
->nl_nch
.ncp
) {
/* The handle is unlocked before the reference is dropped. */
202 if (nd
->nl_flags
& NLC_NCPISLOCKED
) {
203 nd
->nl_flags
&= ~NLC_NCPISLOCKED
;
204 cache_unlock(&nd
->nl_nch
);
206 cache_drop(&nd
->nl_nch
);
208 if (nd
->nl_rootnch
.ncp
)
209 cache_drop(&nd
->nl_rootnch
);
210 if (nd
->nl_jailnch
.ncp
)
211 cache_drop(&nd
->nl_jailnch
);
/* Only give the buffer back when nd is marked as holding one. */
212 if ((nd
->nl_flags
& NLC_HASBUF
) && nd
->nl_path
) {
213 objcache_put(namei_oc
, nd
->nl_path
);
220 if (nd
->nl_open_vp
) {
/* The vnode is unlocked before it is closed. */
221 if (nd
->nl_flags
& NLC_LOCKVP
) {
222 vn_unlock(nd
->nl_open_vp
);
223 nd
->nl_flags
&= ~NLC_LOCKVP
;
225 vn_close(nd
->nl_open_vp
, nd
->nl_vp_fmode
);
226 nd
->nl_open_vp
= NULL
;
232 nd
->nl_flags
= 0; /* clear remaining flags (just clear everything) */
/*
 * nlookup_zero() - zero every byte of *nd.
 *
 * Nothing is released here; the structure is simply overwritten with
 * zeroes.
 */
236 nlookup_zero(struct nlookupdata
*nd
)
238 bzero(nd
, sizeof(struct nlookupdata
));
242 * Simple all-in-one nlookup. Returns a locked namecache structure or NULL
243 * if an error occurred.
245 * Note that the returned ncp is not checked for permissions, though VEXEC
246 * is checked on the directory path leading up to the result. The caller
247 * must call naccess() to check the permissions of the returned leaf.
/*
 * nlookup_simple() - initialize a private nlookupdata, run the lookup and
 * hand the resulting nchandle to the caller.
 *
 *   str     - pathname to look up.
 *   seg     - address space of str, passed through to nlookup_init().
 *   niflags - lookup flags, passed through to nlookup_init().
 *   error   - out parameter; receives the result of nlookup_init() and
 *             then of nlookup().
 *
 * On success the handle is copied out of nd and nd.nl_nch is zeroed, so
 * the reference held by nd moves to the copy.
 *
 * NOTE(review): the return statement, the cleanup of nd and the handling
 * of an nlookup_init() failure are missing from this listing -- confirm
 * against the full file.
 */
250 nlookup_simple(const char *str
, enum uio_seg seg
,
251 int niflags
, int *error
)
253 struct nlookupdata nd
;
256 *error
= nlookup_init(&nd
, str
, seg
, niflags
);
258 if ((*error
= nlookup(&nd
)) == 0) {
259 nch
= nd
.nl_nch
; /* keep hold ref from structure */
260 cache_zero(&nd
.nl_nch
); /* and NULL out */
272 * Do a generic nlookup. Note that the passed nd is not nlookup_done()'d
273 * on return, even if an error occurs. If no error occurs the returned
274 * nl_nch is always referenced and locked, otherwise it may or may not be.
276 * Intermediate directory elements, including the current directory, require
277 * execute (search) permission. nlookup does not examine the access
278 * permissions on the returned element.
280 * If NLC_CREATE or NLC_DELETE is set the last directory must allow node
281 * creation (VCREATE/VDELETE), and an error code of 0 will be returned for
282 * a nonexistent target. Otherwise a nonexistent target will cause
283 * ENOENT to be returned.
285 * If NLC_REFDVP is set nd->nl_dvp will be set to the directory vnode
286 * of the returned entry. The vnode will be referenced, but not locked,
287 * and will be released by nlookup_done() along with everything else.
/*
 * nlookup() - walk the path stored in nd->nl_path one component at a time.
 *
 * Stages that can be seen in this listing, in order:
 *   1. If KTRPOINT() reports KTR_NAMEI tracing, the path is logged with
 *      ktrnamei().
 *   2. If NLC_NCPISLOCKED is set on entry, the flag is cleared and nl_nch
 *      is unlocked before the loop begins.
 *   3. After a run of slash characters nl_rootnch is copied into nch and
 *      nl_nch is dropped.  In the fast-track termination case NLC_REFDVP
 *      is tested, nl_nch is locked and NLC_NCPISLOCKED is set.
 *   4. naccess() checks VEXEC on the current directory (nl_nch).
 *   5. The next component is delimited: nlc_nameptr marks its start and
 *      nlc_namelen is the distance to the next slash or the terminator.
 *   6. The component is looked up.  A single dot takes nl_nch itself.
 *      Dot-dot at the root (same mount and ncp as nl_rootnch) also takes
 *      nl_nch; otherwise the walk steps through mnt_ncmounton while
 *      positioned on a mnt_ncmountpt, then moves to nc_parent, which is
 *      asserted non-NULL.  Any other name goes through cache_nlookup(),
 *      repeated with a diagnostic kprintf while cache_resolve() returns
 *      EAGAIN.
 *   7. An NCF_UNRESOLVED result is resolved (EAGAIN is asserted not to
 *      occur); the nc_error assignment is also visible.
 *   8. ENOENT with NLC_CREATE set leads to a VCREATE check through
 *      naccess(); a dot or dot-dot component with NLC_DELETE set is
 *      tested separately.
 *   9. For an NCF_ISSYMLINK entry, when more path remains or NLC_FOLLOW is
 *      set: nl_loopcnt is compared with MAXSYMLINKS, nreadsymlink() reads
 *      the target, the unconsumed tail of the path is appended to it, and
 *      the result replaces nl_path (the old buffer goes back to namei_oc
 *      when NLC_HASBUF is set).  An empty link yields ENOENT and an
 *      over-long result ENAMETOOLONG.
 *  10. While the entry is NCF_ISMOUNTPT, NLC_NOCROSSMOUNT is clear and
 *      cache_findmount() finds a mount, nch is taken from mnt_ncmountpt;
 *      if that is unresolved it is resolved via vfs_busy(), VFS_ROOT()
 *      and cache_setvp().
 *  11. Slashes after the component are examined together with NCF_ISDIR
 *      and NLC_WILLBEDIR.
 *  12. If more path remains and the entry is a directory, nl_nch is
 *      dropped (continuation case).
 *  13. At the last element with NLC_DELETE set, naccess() checks VDELETE.
 *      With NLC_REFDVP set, cache_vref() stores a vnode in nl_dvp and a
 *      kprintf diagnostic exists for a failure to do so.  nl_nch is then
 *      dropped and NLC_NCPISLOCKED is set.
 *
 * NOTE(review): a large share of the original lines (declarations, loop
 * heads, braces, else/break/return statements, several assignments) is
 * missing from this listing.  The ordering above follows the surviving
 * statements; exact branch structure, error exits and the return value
 * must be confirmed against the full file.
 */
290 nlookup(struct nlookupdata
*nd
)
292 struct nlcomponent nlc
;
/* Optional ktrace record of the path being looked up. */
302 if (KTRPOINT(nd
->nl_td
, KTR_NAMEI
))
303 ktrnamei(nd
->nl_td
->td_lwp
, nd
->nl_path
);
/* Start from an all-zero component descriptor. */
305 bzero(&nlc
, sizeof(nlc
));
308 * Setup for the loop. The current working namecache element must
309 * be in a refd + unlocked state. This typically the case on entry except
310 * when stringing nlookup()'s along in a chain, since nlookup() always
311 * returns nl_nch in a locked state.
314 if (nd
->nl_flags
& NLC_NCPISLOCKED
) {
315 nd
->nl_flags
&= ~NLC_NCPISLOCKED
;
316 cache_unlock(&nd
->nl_nch
);
325 * Loop on the path components. At the top of the loop nd->nl_nch
326 * is ref'd and unlocked and represents our current position.
330 * Check if the root directory should replace the current
331 * directory. This is done at the start of a translation
332 * or after a symbolic link has been found. In other cases
333 * ptr will never be pointing at a '/'.
338 } while (*ptr
== '/');
339 cache_copy(&nd
->nl_rootnch
, &nch
);
340 cache_drop(&nd
->nl_nch
);
344 * Fast-track termination. There is no parent directory of
345 * the root in the same mount from the point of view of
346 * the caller so return EPERM if NLC_REFDVP is specified.
347 * e.g. 'rmdir /' is not allowed.
350 if (nd
->nl_flags
& NLC_REFDVP
) {
353 cache_lock(&nd
->nl_nch
);
354 nd
->nl_flags
|= NLC_NCPISLOCKED
;
363 * Check directory search permissions.
365 if ((error
= naccess(&nd
->nl_nch
, VEXEC
, nd
->nl_cred
)) != 0)
369 * Extract the path component
371 nlc
.nlc_nameptr
= ptr
;
372 while (*ptr
&& *ptr
!= '/')
374 nlc
.nlc_namelen
= ptr
- nlc
.nlc_nameptr
;
377 * Lookup the path component in the cache, creating an unresolved
378 * entry if necessary. We have to handle "." and ".." as special
381 * When handling ".." we have to detect a traversal back through a
382 * mount point. If we are at the root, ".." just returns the root.
384 * This subsection returns a locked, refd 'nch' unless it errors out.
385 * The namecache topology is not allowed to be disconnected, so
386 * encountering a NULL parent will generate EINVAL. This typically
387 * occurs when a directory is removed out from under a process.
389 * If NLC_DELETE is set neither '.' or '..' can be the last component
392 if (nlc
.nlc_namelen
== 1 && nlc
.nlc_nameptr
[0] == '.') {
393 cache_get(&nd
->nl_nch
, &nch
);
395 } else if (nlc
.nlc_namelen
== 2 &&
396 nlc
.nlc_nameptr
[0] == '.' && nlc
.nlc_nameptr
[1] == '.') {
397 if (nd
->nl_nch
.mount
== nd
->nl_rootnch
.mount
&&
398 nd
->nl_nch
.ncp
== nd
->nl_rootnch
.ncp
401 * ".." at the root returns the root
403 cache_get(&nd
->nl_nch
, &nch
);
406 * Locate the parent ncp. If we are at the root of a
407 * filesystem mount we have to skip to the mounted-on
408 * point in the underlying filesystem.
411 while (nch
.ncp
== nch
.mount
->mnt_ncmountpt
.ncp
)
412 nch
= nch
.mount
->mnt_ncmounton
;
413 nch
.ncp
= nch
.ncp
->nc_parent
;
414 KKASSERT(nch
.ncp
!= NULL
);
415 cache_get(&nch
, &nch
);
419 nch
= cache_nlookup(&nd
->nl_nch
, &nlc
);
420 while ((error
= cache_resolve(&nch
, nd
->nl_cred
)) == EAGAIN
) {
421 kprintf("[diagnostic] nlookup: relookup %*.*s\n",
422 nch
.ncp
->nc_nlen
, nch
.ncp
->nc_nlen
, nch
.ncp
->nc_name
);
424 nch
= cache_nlookup(&nd
->nl_nch
, &nlc
);
429 * [end of subsection] ncp is locked and ref'd. nd->nl_nch is ref'd
433 * Resolve the namespace if necessary. The ncp returned by
434 * cache_nlookup() is referenced and locked.
436 * XXX neither '.' nor '..' should return EAGAIN since they were
437 * previously resolved and thus cannot be newly created ncp's.
439 if (nch
.ncp
->nc_flag
& NCF_UNRESOLVED
) {
440 error
= cache_resolve(&nch
, nd
->nl_cred
);
441 KKASSERT(error
!= EAGAIN
);
443 error
= nch
.ncp
->nc_error
;
447 * Early completion. ENOENT is not an error if this is the last
448 * component and NLC_CREATE was requested. Note that ncp->nc_error
449 * is left as ENOENT in that case, which we check later on.
451 * Also handle invalid '.' or '..' components terminating a path
452 * during removal. The standard requires this and pax pretty
453 * stupidly depends on it.
455 for (xptr
= ptr
; *xptr
== '/'; ++xptr
)
458 if (error
== ENOENT
&& (nd
->nl_flags
& NLC_CREATE
))
459 error
= naccess(&nch
, VCREATE
, nd
->nl_cred
);
460 if (error
== 0 && wasdotordotdot
&& (nd
->nl_flags
& NLC_DELETE
))
465 * Early completion on error.
473 * If the element is a symlink and it is either not the last
474 * element or it is the last element and we are allowed to
475 * follow symlinks, resolve the symlink.
477 if ((nch
.ncp
->nc_flag
& NCF_ISSYMLINK
) &&
478 (*ptr
|| (nd
->nl_flags
& NLC_FOLLOW
))
480 if (nd
->nl_loopcnt
++ >= MAXSYMLINKS
) {
485 error
= nreadsymlink(nd
, &nch
, &nlc
);
491 * Concatenate trailing path elements onto the returned symlink.
492 * Note that if the path component (ptr) is not exhausted, it
493 * will being with a '/', so we do not have to add another one.
495 * The symlink may not be empty.
498 if (nlc
.nlc_namelen
== 0 || nlc
.nlc_namelen
+ len
>= MAXPATHLEN
) {
499 error
= nlc
.nlc_namelen
? ENAMETOOLONG
: ENOENT
;
500 objcache_put(namei_oc
, nlc
.nlc_nameptr
);
503 bcopy(ptr
, nlc
.nlc_nameptr
+ nlc
.nlc_namelen
, len
+ 1);
504 if (nd
->nl_flags
& NLC_HASBUF
)
505 objcache_put(namei_oc
, nd
->nl_path
);
506 nd
->nl_path
= nlc
.nlc_nameptr
;
507 nd
->nl_flags
|= NLC_HASBUF
;
511 * Go back up to the top to resolve any initial '/'s in the
518 * If the element is a directory and we are crossing a mount point,
521 while ((nch
.ncp
->nc_flag
& NCF_ISMOUNTPT
) &&
522 (nd
->nl_flags
& NLC_NOCROSSMOUNT
) == 0 &&
523 (mp
= cache_findmount(&nch
)) != NULL
528 cache_get(&mp
->mnt_ncmountpt
, &nch
);
530 if (nch
.ncp
->nc_flag
& NCF_UNRESOLVED
) {
531 while (vfs_busy(mp
, 0))
533 error
= VFS_ROOT(mp
, &tdp
);
537 cache_setvp(&nch
, tdp
);
547 * Skip any slashes to get to the next element. If there
548 * are any slashes at all the current element must be a
549 * directory or, in the create case, intended to become a directory.
550 * If it isn't we break without incrementing ptr and fall through
551 * to the failure case below.
553 while (*ptr
== '/') {
554 if ((nch
.ncp
->nc_flag
& NCF_ISDIR
) == 0 &&
555 !(nd
->nl_flags
& NLC_WILLBEDIR
)
563 * Continuation case: additional elements and the current
564 * element is a directory.
566 if (*ptr
&& (nch
.ncp
->nc_flag
& NCF_ISDIR
)) {
567 cache_drop(&nd
->nl_nch
);
574 * Failure case: additional elements and the current element
584 * Successful lookup of last element.
586 * Check directory permissions if a deletion is specified.
588 if (*ptr
== 0 && (nd
->nl_flags
& NLC_DELETE
)) {
589 if ((error
= naccess(&nch
, VDELETE
, nd
->nl_cred
)) != 0) {
596 * Termination: no more elements. If NLC_CREATE was set the
597 * ncp may represent a negative hit (ncp->nc_error will be ENOENT),
598 * but we still return an error code of 0.
600 * If NLC_REFDVP is set acquire a referenced parent dvp.
602 if (nd
->nl_flags
& NLC_REFDVP
) {
603 error
= cache_vref(&nd
->nl_nch
, nd
->nl_cred
, &nd
->nl_dvp
);
605 kprintf("NLC_REFDVP: Cannot ref dvp of %p\n", nch
.ncp
);
610 cache_drop(&nd
->nl_nch
);
612 nd
->nl_flags
|= NLC_NCPISLOCKED
;
620 * Resolve a mount point's glue ncp. This ncp creates the illusion
621 * of continuity in the namecache tree by connecting the ncp related to the
622 * vnode under the mount to the ncp related to the mount's root vnode.
624 * If no error occurred a locked, ref'd ncp is stored in *nch.
/*
 * nlookup_mp() - obtain the nchandle for the mount point of mp.
 *
 *   mp  - mount whose mnt_ncmountpt is fetched.
 *   nch - out parameter filled in by cache_get().
 *
 * If the fetched entry is still NCF_UNRESOLVED, the mount is busied with
 * vfs_busy(), its root vnode is requested through VFS_ROOT() and the
 * vnode is attached to the entry with cache_setvp().
 *
 * NOTE(review): the body of the vfs_busy() loop, the handling of a
 * VFS_ROOT() failure, any unbusy/vput calls and the return statement are
 * missing from this listing -- confirm against the full file.
 */
627 nlookup_mp(struct mount
*mp
, struct nchandle
*nch
)
633 cache_get(&mp
->mnt_ncmountpt
, nch
);
/* Only an unresolved entry needs the root vnode looked up. */
634 if (nch
->ncp
->nc_flag
& NCF_UNRESOLVED
) {
635 while (vfs_busy(mp
, 0))
637 error
= VFS_ROOT(mp
, &vp
);
642 cache_setvp(nch
, vp
);
650 * Read the contents of a symlink, allocate a path buffer out of the
651 * namei_oc and initialize the supplied nlcomponent with the result.
653 * If an error occurs no buffer will be allocated or returned in the nlc.
/*
 * nreadsymlink() - read the target of the symlink behind nch into a fresh
 * namei_oc buffer and describe it in nlc.
 *
 *   nd  - supplies the credential (nl_cred) and thread (nl_td).
 *   nch - namecache handle of the symlink.
 *   nlc - out parameter; reset to NULL/0 first, then set to the buffer and
 *         link length.
 *
 * The vnode is obtained with a shared lock through cache_vget().  The uio
 * is set up for a kernel-space read with an iovec length of MAXPATHLEN and
 * a residual of MAXPATHLEN - 1, so the link length is MAXPATHLEN - 1 minus
 * what VOP_READLINK() leaves in uio_resid.  The text is then passed through
 * varsymreplace() with the same MAXPATHLEN - 1 limit.
 *
 * NOTE(review): the conditions guarding ENAMETOOLONG, the success
 * assignments to nlc and the objcache_put() of the buffer, as well as the
 * vnode release and return statements, are missing from this listing --
 * confirm against the full file.
 */
656 nreadsymlink(struct nlookupdata
*nd
, struct nchandle
*nch
,
657 struct nlcomponent
*nlc
)
/* Start with an empty result so nlc never carries a stale buffer. */
666 nlc
->nlc_nameptr
= NULL
;
667 nlc
->nlc_namelen
= 0;
668 if (nch
->ncp
->nc_vp
== NULL
)
670 if ((error
= cache_vget(nch
, nd
->nl_cred
, LK_SHARED
, &vp
)) != 0)
672 cp
= objcache_get(namei_oc
, M_WAITOK
);
674 aiov
.iov_len
= MAXPATHLEN
;
675 auio
.uio_iov
= &aiov
;
678 auio
.uio_rw
= UIO_READ
;
679 auio
.uio_segflg
= UIO_SYSSPACE
;
680 auio
.uio_td
= nd
->nl_td
;
/* Residual is one byte short of the iovec length set above. */
681 auio
.uio_resid
= MAXPATHLEN
- 1;
682 error
= VOP_READLINK(vp
, &auio
, nd
->nl_cred
);
/* Bytes read = requested residual minus what is left. */
685 linklen
= MAXPATHLEN
- 1 - auio
.uio_resid
;
687 linklen
= varsymreplace(cp
, linklen
, MAXPATHLEN
- 1);
689 error
= ENAMETOOLONG
;
694 nlc
->nlc_nameptr
= cp
;
695 nlc
->nlc_namelen
= linklen
;
699 objcache_put(namei_oc
, cp
);
705 * Check access [XXX cache vattr!] [XXX quota]
707 * Generally check the V* access bits from sys/vnode.h. All specified bits
708 * must pass for this function to return 0.
710 * If VCREATE is specified and the target ncp represents a nonexistent
711 * file or dir, or if VDELETE is specified and the target exists, the parent
712 * directory is checked for VWRITE. If VEXCL is specified and the target
713 * ncp represents a positive hit, an error is returned.
715 * If VCREATE is not specified and the target does not exist (negative hit),
716 * ENOENT is returned. Note that nlookup() does not (and should not) return
717 * ENOENT for nonexistent leafs.
719 * The passed ncp may or may not be locked. The caller should use a
720 * locked ncp on leaf lookups, especially for VCREATE, VDELETE, and VEXCL
/*
 * naccess() - check the access bits in vmode against the entry behind nch
 * using cred.
 *
 *   nch   - namecache handle to check; resolved first if NCF_UNRESOLVED.
 *   vmode - requested V* access bits.
 *   cred  - credential used for the check.
 *
 * Visible flow:
 *   - error starts out as the entry's nc_error;
 *   - when VDELETE, VCREATE or VEXCL is requested: a VCREATE on an entry
 *     without a vnode, or a VDELETE on an entry with one, re-runs the
 *     check on the parent (nc_parent, same mount) for VWRITE; a VEXCL
 *     request is tested against an entry that already has a vnode;
 *   - otherwise the vnode is fetched with a shared lock, its attributes
 *     are read with VOP_GETATTR(), a VWRITE request looks at MNT_RDONLY on
 *     the mount, and naccess_va() evaluates the permission bits.
 *
 * NOTE(review): the statements that assign error codes in the NULL-parent,
 * VEXCL, ENOENT and read-only cases, plus the vnode release and return,
 * are missing from this listing -- confirm against the full file.
 */
724 naccess(struct nchandle
*nch
, int vmode
, struct ucred
*cred
)
731 if (nch
->ncp
->nc_flag
& NCF_UNRESOLVED
) {
733 cache_resolve(nch
, cred
);
736 error
= nch
->ncp
->nc_error
;
737 if (vmode
& (VDELETE
|VCREATE
|VEXCL
)) {
/* Create of a missing entry or delete of an existing one. */
738 if (((vmode
& VCREATE
) && nch
->ncp
->nc_vp
== NULL
) ||
739 ((vmode
& VDELETE
) && nch
->ncp
->nc_vp
!= NULL
)
741 if ((par
.ncp
= nch
->ncp
->nc_parent
) == NULL
) {
/* The parent handle reuses the mount of the entry being checked. */
745 par
.mount
= nch
->mount
;
747 error
= naccess(&par
, VWRITE
, cred
);
751 if ((vmode
& VEXCL
) && nch
->ncp
->nc_vp
!= NULL
)
755 error
= cache_vget(nch
, cred
, LK_SHARED
, &vp
);
756 if (error
== ENOENT
) {
759 } else if (error
== 0) {
760 /* XXX cache the va in the namecache or in the vnode */
761 if ((error
= VOP_GETATTR(vp
, &va
)) == 0) {
762 if ((vmode
& VWRITE
) && vp
->v_mount
) {
763 if (vp
->v_mount
->mnt_flag
& MNT_RDONLY
)
769 error
= naccess_va(&va
, vmode
, cred
);
776 * Check the requested access against the given vattr using cred.
/*
 * naccess_va() - evaluate vmode against the attributes in va for cred.
 *
 *   va    - attributes of the object (type, flags, uid, gid, mode).
 *   vmode - requested access bits.
 *   cred  - credential supplying uid and group list.
 *
 * Visible checks, in order:
 *   - for VWRITE or VDELETE on a VDIR, VLNK or VREG object, the IMMUTABLE
 *     flag is tested;
 *   - cr_uid 0 is tested next;
 *   - if cr_uid equals va_uid, all requested bits must be present in
 *     va_mode;
 *   - otherwise each of the cr_ngroups entries in cr_groups is compared
 *     with va_gid and the same bit test is applied on a match;
 *   - the same bit test appears a final time after the group loop.
 *
 * NOTE(review): the return statements and any adjustment of vmode between
 * the owner, group and world tests are missing from this listing, and the
 * function may continue past the last visible line -- confirm against the
 * full file.
 */
779 naccess_va(struct vattr
*va
, int vmode
, struct ucred
*cred
)
784 * Test the immutable bit for files, directories, and softlinks.
786 if (vmode
& (VWRITE
|VDELETE
)) {
787 if (va
->va_type
== VDIR
|| va
->va_type
== VLNK
|| va
->va_type
== VREG
) {
788 if (va
->va_flags
& IMMUTABLE
)
794 * root gets universal access
796 if (cred
->cr_uid
== 0)
800 * Check owner perms, group perms, and world perms
803 if (cred
->cr_uid
== va
->va_uid
) {
804 if ((vmode
& va
->va_mode
) != vmode
)
810 for (i
= 0; i
< cred
->cr_ngroups
; ++i
) {
811 if (va
->va_gid
== cred
->cr_groups
[i
]) {
812 if ((vmode
& va
->va_mode
) != vmode
)
819 if ((vmode
& va
->va_mode
) != vmode
)