2 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/kern/vfs_nlookup.c,v 1.23 2008/04/14 12:01:50 dillon Exp $
37 * nlookup() is the 'new' namei interface. Rather than return directory and
38 * leaf vnodes (in various lock states) the new interface instead deals in
39 * namecache records. Namecache records may represent both a positive or
40 * a negative hit. The namespace is locked via the namecache record instead
41 * of via the vnode, and only the leaf namecache record (representing the
42 * filename) needs to be locked.
44 * This greatly improves filesystem parallelism and is a huge simplification
45 * of the API versus the old vnode locking / namei scheme.
47 * Filesystems must actively control the caching aspects of the namecache,
48 * and since namecache pointers are used as handles they are non-optional
49 * even for filesystems which do not generally wish to cache things. It is
50 * intended that a separate cache coherency API will be constructed to handle
54 #include "opt_ktrace.h"
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/vnode.h>
60 #include <sys/mount.h>
61 #include <sys/filedesc.h>
63 #include <sys/namei.h>
64 #include <sys/nlookup.h>
65 #include <sys/malloc.h>
67 #include <sys/objcache.h>
70 #include <sys/ktrace.h>
74 * Initialize a nlookup() structure, early error return for copyin faults
75 * or a degenerate empty string (which is not allowed).
77 * The first process proc0's credentials are used if the calling thread
78 * is not associated with a process context.
81 nlookup_init(struct nlookupdata
*nd
,
82 const char *path
, enum uio_seg seg
, int flags
)
93 * note: the pathlen set by copy*str() includes the terminating \0.
95 bzero(nd
, sizeof(struct nlookupdata
));
96 nd
->nl_path
= objcache_get(namei_oc
, M_WAITOK
);
97 nd
->nl_flags
|= NLC_HASBUF
;
98 if (seg
== UIO_SYSSPACE
)
99 error
= copystr(path
, nd
->nl_path
, MAXPATHLEN
, &pathlen
);
101 error
= copyinstr(path
, nd
->nl_path
, MAXPATHLEN
, &pathlen
);
104 * Don't allow empty pathnames.
105 * POSIX.1 requirement: "" is not a valid file name.
107 if (error
== 0 && pathlen
<= 1)
112 cache_copy(&p
->p_fd
->fd_ncdir
, &nd
->nl_nch
);
113 cache_copy(&p
->p_fd
->fd_nrdir
, &nd
->nl_rootnch
);
114 if (p
->p_fd
->fd_njdir
.ncp
)
115 cache_copy(&p
->p_fd
->fd_njdir
, &nd
->nl_jailnch
);
116 nd
->nl_cred
= crhold(p
->p_ucred
);
118 cache_copy(&rootnch
, &nd
->nl_nch
);
119 cache_copy(&nd
->nl_nch
, &nd
->nl_rootnch
);
120 cache_copy(&nd
->nl_nch
, &nd
->nl_jailnch
);
121 nd
->nl_cred
= crhold(proc0
.p_ucred
);
124 nd
->nl_flags
|= flags
;
132 * This works similarly to nlookup_init() but does not assume a process
133 * context. rootnch is always chosen for the root directory and the cred
134 * and starting directory are supplied in arguments.
137 nlookup_init_raw(struct nlookupdata
*nd
,
138 const char *path
, enum uio_seg seg
, int flags
,
139 struct ucred
*cred
, struct nchandle
*ncstart
)
147 bzero(nd
, sizeof(struct nlookupdata
));
148 nd
->nl_path
= objcache_get(namei_oc
, M_WAITOK
);
149 nd
->nl_flags
|= NLC_HASBUF
;
150 if (seg
== UIO_SYSSPACE
)
151 error
= copystr(path
, nd
->nl_path
, MAXPATHLEN
, &pathlen
);
153 error
= copyinstr(path
, nd
->nl_path
, MAXPATHLEN
, &pathlen
);
156 * Don't allow empty pathnames.
157 * POSIX.1 requirement: "" is not a valid file name.
159 if (error
== 0 && pathlen
<= 1)
163 cache_copy(ncstart
, &nd
->nl_nch
);
164 cache_copy(&rootnch
, &nd
->nl_rootnch
);
165 cache_copy(&rootnch
, &nd
->nl_jailnch
);
166 nd
->nl_cred
= crhold(cred
);
168 nd
->nl_flags
|= flags
;
176 * Set a different credential; this credential will be used by future
177 * operations performed on nd.nl_open_vp and nlookupdata structure.
180 nlookup_set_cred(struct nlookupdata
*nd
, struct ucred
*cred
)
182 KKASSERT(nd
->nl_cred
!= NULL
);
184 if (nd
->nl_cred
!= cred
) {
192 * Cleanup a nlookupdata structure after we are through with it. This may
193 * be called on any nlookupdata structure initialized with nlookup_init().
194 * Calling nlookup_done() is mandatory in all cases except where nlookup_init()
195 * returns an error, even if as a consumer you believe you have taken all
196 * dynamic elements out of the nlookupdata structure.
199 nlookup_done(struct nlookupdata
*nd
)
201 if (nd
->nl_nch
.ncp
) {
202 if (nd
->nl_flags
& NLC_NCPISLOCKED
) {
203 nd
->nl_flags
&= ~NLC_NCPISLOCKED
;
204 cache_unlock(&nd
->nl_nch
);
206 cache_drop(&nd
->nl_nch
);
208 if (nd
->nl_rootnch
.ncp
)
209 cache_drop(&nd
->nl_rootnch
);
210 if (nd
->nl_jailnch
.ncp
)
211 cache_drop(&nd
->nl_jailnch
);
212 if ((nd
->nl_flags
& NLC_HASBUF
) && nd
->nl_path
) {
213 objcache_put(namei_oc
, nd
->nl_path
);
220 if (nd
->nl_open_vp
) {
221 if (nd
->nl_flags
& NLC_LOCKVP
) {
222 vn_unlock(nd
->nl_open_vp
);
223 nd
->nl_flags
&= ~NLC_LOCKVP
;
225 vn_close(nd
->nl_open_vp
, nd
->nl_vp_fmode
);
226 nd
->nl_open_vp
= NULL
;
228 nd
->nl_flags
= 0; /* clear remaining flags (just clear everything) */
232 nlookup_zero(struct nlookupdata
*nd
)
234 bzero(nd
, sizeof(struct nlookupdata
));
238 * Simple all-in-one nlookup. Returns a locked namecache structure or NULL
239 * if an error occurred.
241 * Note that the returned ncp is not checked for permissions, though VEXEC
242 * is checked on the directory path leading up to the result. The caller
243 * must call naccess() to check the permissions of the returned leaf.
246 nlookup_simple(const char *str
, enum uio_seg seg
,
247 int niflags
, int *error
)
249 struct nlookupdata nd
;
252 *error
= nlookup_init(&nd
, str
, seg
, niflags
);
254 if ((*error
= nlookup(&nd
)) == 0) {
255 nch
= nd
.nl_nch
; /* keep hold ref from structure */
256 cache_zero(&nd
.nl_nch
); /* and NULL out */
268 * Do a generic nlookup. Note that the passed nd is not nlookup_done()'d
269 * on return, even if an error occurs. If no error occurs the returned
270 * nl_nch is always referenced and locked, otherwise it may or may not be.
272 * Intermediate directory elements, including the current directory, require
273 * execute (search) permission. nlookup does not examine the access
274 * permissions on the returned element.
276 * If NLC_CREATE or NLC_DELETE is set the last directory must allow node
277 * creation (VCREATE/VDELETE), and an error code of 0 will be returned for
278 * a non-existent target. Otherwise a non-existent target will cause
279 * ENOENT to be returned.
282 nlookup(struct nlookupdata
*nd
)
284 struct nlcomponent nlc
;
294 if (KTRPOINT(nd
->nl_td
, KTR_NAMEI
))
295 ktrnamei(nd
->nl_td
->td_lwp
, nd
->nl_path
);
297 bzero(&nlc
, sizeof(nlc
));
300 * Setup for the loop. The current working namecache element must
301 * be in a refd + unlocked state. This is typically the case on entry except
302 * when stringing nlookup()'s along in a chain, since nlookup() always
303 * returns nl_nch in a locked state.
306 if (nd
->nl_flags
& NLC_NCPISLOCKED
) {
307 nd
->nl_flags
&= ~NLC_NCPISLOCKED
;
308 cache_unlock(&nd
->nl_nch
);
313 * Loop on the path components. At the top of the loop nd->nl_nch
314 * is ref'd and unlocked and represents our current position.
318 * Check if the root directory should replace the current
319 * directory. This is done at the start of a translation
320 * or after a symbolic link has been found. In other cases
321 * ptr will never be pointing at a '/'.
326 } while (*ptr
== '/');
327 cache_copy(&nd
->nl_rootnch
, &nch
);
328 cache_drop(&nd
->nl_nch
);
331 cache_lock(&nd
->nl_nch
);
332 nd
->nl_flags
|= NLC_NCPISLOCKED
;
340 * Check directory search permissions.
342 if ((error
= naccess(&nd
->nl_nch
, VEXEC
, nd
->nl_cred
)) != 0)
346 * Extract the path component
348 nlc
.nlc_nameptr
= ptr
;
349 while (*ptr
&& *ptr
!= '/')
351 nlc
.nlc_namelen
= ptr
- nlc
.nlc_nameptr
;
354 * Lookup the path component in the cache, creating an unresolved
355 * entry if necessary. We have to handle "." and ".." as special
358 * When handling ".." we have to detect a traversal back through a
359 * mount point. If we are at the root, ".." just returns the root.
361 * This subsection returns a locked, refd 'nch' unless it errors out.
362 * The namecache topology is not allowed to be disconnected, so
363 * encountering a NULL parent will generate EINVAL. This typically
364 * occurs when a directory is removed out from under a process.
366 * If NLC_DELETE is set neither '.' nor '..' can be the last component
369 if (nlc
.nlc_namelen
== 1 && nlc
.nlc_nameptr
[0] == '.') {
370 cache_get(&nd
->nl_nch
, &nch
);
372 } else if (nlc
.nlc_namelen
== 2 &&
373 nlc
.nlc_nameptr
[0] == '.' && nlc
.nlc_nameptr
[1] == '.') {
374 if (nd
->nl_nch
.mount
== nd
->nl_rootnch
.mount
&&
375 nd
->nl_nch
.ncp
== nd
->nl_rootnch
.ncp
378 * ".." at the root returns the root
380 cache_get(&nd
->nl_nch
, &nch
);
383 * Locate the parent ncp. If we are at the root of a
384 * filesystem mount we have to skip to the mounted-on
385 * point in the underlying filesystem.
388 while (nch
.ncp
== nch
.mount
->mnt_ncmountpt
.ncp
)
389 nch
= nch
.mount
->mnt_ncmounton
;
390 nch
.ncp
= nch
.ncp
->nc_parent
;
391 KKASSERT(nch
.ncp
!= NULL
);
392 cache_get(&nch
, &nch
);
396 nch
= cache_nlookup(&nd
->nl_nch
, &nlc
);
397 while ((error
= cache_resolve(&nch
, nd
->nl_cred
)) == EAGAIN
) {
398 kprintf("[diagnostic] nlookup: relookup %*.*s\n",
399 nch
.ncp
->nc_nlen
, nch
.ncp
->nc_nlen
, nch
.ncp
->nc_name
);
401 nch
= cache_nlookup(&nd
->nl_nch
, &nlc
);
406 * [end of subsection] ncp is locked and ref'd. nd->nl_nch is ref'd
410 * Resolve the namespace if necessary. The ncp returned by
411 * cache_nlookup() is referenced and locked.
413 * XXX neither '.' nor '..' should return EAGAIN since they were
414 * previously resolved and thus cannot be newly created ncp's.
416 if (nch
.ncp
->nc_flag
& NCF_UNRESOLVED
) {
417 error
= cache_resolve(&nch
, nd
->nl_cred
);
418 KKASSERT(error
!= EAGAIN
);
420 error
= nch
.ncp
->nc_error
;
424 * Early completion. ENOENT is not an error if this is the last
425 * component and NLC_CREATE was requested. Note that ncp->nc_error
426 * is left as ENOENT in that case, which we check later on.
428 * Also handle invalid '.' or '..' components terminating a path
429 * during removal. The standard requires this and pax pretty
430 * stupidly depends on it.
432 for (xptr
= ptr
; *xptr
== '/'; ++xptr
)
435 if (error
== ENOENT
&& (nd
->nl_flags
& NLC_CREATE
))
436 error
= naccess(&nch
, VCREATE
, nd
->nl_cred
);
437 if (error
== 0 && wasdotordotdot
&& (nd
->nl_flags
& NLC_DELETE
))
442 * Early completion on error.
450 * If the element is a symlink and it is either not the last
451 * element or it is the last element and we are allowed to
452 * follow symlinks, resolve the symlink.
454 if ((nch
.ncp
->nc_flag
& NCF_ISSYMLINK
) &&
455 (*ptr
|| (nd
->nl_flags
& NLC_FOLLOW
))
457 if (nd
->nl_loopcnt
++ >= MAXSYMLINKS
) {
462 error
= nreadsymlink(nd
, &nch
, &nlc
);
468 * Concatenate trailing path elements onto the returned symlink.
469 * Note that if the path component (ptr) is not exhausted, it
470 * will begin with a '/', so we do not have to add another one.
472 * The symlink may not be empty.
475 if (nlc
.nlc_namelen
== 0 || nlc
.nlc_namelen
+ len
>= MAXPATHLEN
) {
476 error
= nlc
.nlc_namelen
? ENAMETOOLONG
: ENOENT
;
477 objcache_put(namei_oc
, nlc
.nlc_nameptr
);
480 bcopy(ptr
, nlc
.nlc_nameptr
+ nlc
.nlc_namelen
, len
+ 1);
481 if (nd
->nl_flags
& NLC_HASBUF
)
482 objcache_put(namei_oc
, nd
->nl_path
);
483 nd
->nl_path
= nlc
.nlc_nameptr
;
484 nd
->nl_flags
|= NLC_HASBUF
;
488 * Go back up to the top to resolve any initial '/'s in the
495 * If the element is a directory and we are crossing a mount point,
498 while ((nch
.ncp
->nc_flag
& NCF_ISMOUNTPT
) &&
499 (nd
->nl_flags
& NLC_NOCROSSMOUNT
) == 0 &&
500 (mp
= cache_findmount(&nch
)) != NULL
505 cache_get(&mp
->mnt_ncmountpt
, &nch
);
507 if (nch
.ncp
->nc_flag
& NCF_UNRESOLVED
) {
508 while (vfs_busy(mp
, 0))
510 error
= VFS_ROOT(mp
, &tdp
);
514 cache_setvp(&nch
, tdp
);
524 * Skip any slashes to get to the next element. If there
525 * are any slashes at all the current element must be a
526 * directory or, in the create case, intended to become a directory.
527 * If it isn't we break without incrementing ptr and fall through
528 * to the failure case below.
530 while (*ptr
== '/') {
531 if ((nch
.ncp
->nc_flag
& NCF_ISDIR
) == 0 &&
532 !(nd
->nl_flags
& NLC_WILLBEDIR
)
540 * Continuation case: additional elements and the current
541 * element is a directory.
543 if (*ptr
&& (nch
.ncp
->nc_flag
& NCF_ISDIR
)) {
544 cache_drop(&nd
->nl_nch
);
551 * Failure case: additional elements and the current element
561 * Successful lookup of last element.
563 * Check directory permissions if a deletion is specified.
565 if (*ptr
== 0 && (nd
->nl_flags
& NLC_DELETE
)) {
566 if ((error
= naccess(&nch
, VDELETE
, nd
->nl_cred
)) != 0) {
573 * Termination: no more elements. If NLC_CREATE was set the
574 * ncp may represent a negative hit (ncp->nc_error will be ENOENT),
575 * but we still return an error code of 0.
577 cache_drop(&nd
->nl_nch
);
579 nd
->nl_flags
|= NLC_NCPISLOCKED
;
587 * Resolve a mount point's glue ncp. This ncp creates the illusion
588 * of continuity in the namecache tree by connecting the ncp related to the
589 * vnode under the mount to the ncp related to the mount's root vnode.
591 * If no error occurred a locked, ref'd ncp is stored in *nch.
594 nlookup_mp(struct mount
*mp
, struct nchandle
*nch
)
600 cache_get(&mp
->mnt_ncmountpt
, nch
);
601 if (nch
->ncp
->nc_flag
& NCF_UNRESOLVED
) {
602 while (vfs_busy(mp
, 0))
604 error
= VFS_ROOT(mp
, &vp
);
609 cache_setvp(nch
, vp
);
617 * Read the contents of a symlink, allocate a path buffer out of the
618 * namei_oc and initialize the supplied nlcomponent with the result.
620 * If an error occurs no buffer will be allocated or returned in the nlc.
623 nreadsymlink(struct nlookupdata
*nd
, struct nchandle
*nch
,
624 struct nlcomponent
*nlc
)
633 nlc
->nlc_nameptr
= NULL
;
634 nlc
->nlc_namelen
= 0;
635 if (nch
->ncp
->nc_vp
== NULL
)
637 if ((error
= cache_vget(nch
, nd
->nl_cred
, LK_SHARED
, &vp
)) != 0)
639 cp
= objcache_get(namei_oc
, M_WAITOK
);
641 aiov
.iov_len
= MAXPATHLEN
;
642 auio
.uio_iov
= &aiov
;
645 auio
.uio_rw
= UIO_READ
;
646 auio
.uio_segflg
= UIO_SYSSPACE
;
647 auio
.uio_td
= nd
->nl_td
;
648 auio
.uio_resid
= MAXPATHLEN
- 1;
649 error
= VOP_READLINK(vp
, &auio
, nd
->nl_cred
);
652 linklen
= MAXPATHLEN
- 1 - auio
.uio_resid
;
654 linklen
= varsymreplace(cp
, linklen
, MAXPATHLEN
- 1);
656 error
= ENAMETOOLONG
;
661 nlc
->nlc_nameptr
= cp
;
662 nlc
->nlc_namelen
= linklen
;
666 objcache_put(namei_oc
, cp
);
672 * Check access [XXX cache vattr!] [XXX quota]
674 * Generally check the V* access bits from sys/vnode.h. All specified bits
675 * must pass for this function to return 0.
677 * If VCREATE is specified and the target ncp represents a non-existent
678 * file or dir, or if VDELETE is specified and the target exists, the parent
679 * directory is checked for VWRITE. If VEXCL is specified and the target
680 * ncp represents a positive hit, an error is returned.
682 * If VCREATE is not specified and the target does not exist (negative hit),
683 * ENOENT is returned. Note that nlookup() does not (and should not) return
684 * ENOENT for non-existent leafs.
686 * The passed ncp may or may not be locked. The caller should use a
687 * locked ncp on leaf lookups, especially for VCREATE, VDELETE, and VEXCL
691 naccess(struct nchandle
*nch
, int vmode
, struct ucred
*cred
)
698 if (nch
->ncp
->nc_flag
& NCF_UNRESOLVED
) {
700 cache_resolve(nch
, cred
);
703 error
= nch
->ncp
->nc_error
;
704 if (vmode
& (VDELETE
|VCREATE
|VEXCL
)) {
705 if (((vmode
& VCREATE
) && nch
->ncp
->nc_vp
== NULL
) ||
706 ((vmode
& VDELETE
) && nch
->ncp
->nc_vp
!= NULL
)
708 if ((par
.ncp
= nch
->ncp
->nc_parent
) == NULL
) {
712 par
.mount
= nch
->mount
;
714 error
= naccess(&par
, VWRITE
, cred
);
718 if ((vmode
& VEXCL
) && nch
->ncp
->nc_vp
!= NULL
)
722 error
= cache_vget(nch
, cred
, LK_SHARED
, &vp
);
723 if (error
== ENOENT
) {
726 } else if (error
== 0) {
727 /* XXX cache the va in the namecache or in the vnode */
728 if ((error
= VOP_GETATTR(vp
, &va
)) == 0) {
729 if ((vmode
& VWRITE
) && vp
->v_mount
) {
730 if (vp
->v_mount
->mnt_flag
& MNT_RDONLY
)
736 error
= naccess_va(&va
, vmode
, cred
);
743 * Check the requested access against the given vattr using cred.
746 naccess_va(struct vattr
*va
, int vmode
, struct ucred
*cred
)
751 * Test the immutable bit for files, directories, and softlinks.
753 if (vmode
& (VWRITE
|VDELETE
)) {
754 if (va
->va_type
== VDIR
|| va
->va_type
== VLNK
|| va
->va_type
== VREG
) {
755 if (va
->va_flags
& IMMUTABLE
)
761 * root gets universal access
763 if (cred
->cr_uid
== 0)
767 * Check owner perms, group perms, and world perms
770 if (cred
->cr_uid
== va
->va_uid
) {
771 if ((vmode
& va
->va_mode
) != vmode
)
777 for (i
= 0; i
< cred
->cr_ngroups
; ++i
) {
778 if (va
->va_gid
== cred
->cr_groups
[i
]) {
779 if ((vmode
& va
->va_mode
) != vmode
)
786 if ((vmode
& va
->va_mode
) != vmode
)