/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/spinlock.h>
#include <sys/namei.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64
/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin.
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     issues.
 *
 *     The 1->0 transition can occur at almost any juncture and so cache_drop()
 *     deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (6) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */
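
/*
 * Illustrative example (not part of the original code): per rules (5) and
 * (6) above, entering a child onto a parent's nc_list takes the child's
 * lock first, then the parent's:
 *
 *	_cache_lock(child);			// child first ...
 *	_cache_lock(parent);			// ... then parent
 *	TAILQ_INSERT_HEAD(&parent->nc_list, child, nc_entry);
 *	_cache_unlock(parent);
 *	_cache_unlock(child);
 */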
/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])

#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

TAILQ_HEAD(nchash_list, namecache);
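
/*
 * Illustrative example (not part of the original code; it mirrors the hash
 * computation visible in cache_rename() below): a (parent, name) pair
 * selects a hash chain by FNV-hashing the name and then the parent pointer:
 *
 *	u_int32_t hash;
 *	struct nchash_head *nchpp;
 *
 *	hash = fnv_32_buf(ncp->nc_name, ncp->nc_nlen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&par, sizeof(par), hash);
 *	nchpp = NCHHASH(hash);		// per-chain spin lock in nchpp->spin
 */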
/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
 */
struct nchash_head {
	struct nchash_list list;	/* 16 bytes */
	struct spinlock	spin;		/* 8 bytes */
	long	pad01;			/* 8 bytes */
};
struct ncmount_cache {
	struct spinlock	spin;
	struct namecache *ncp;
	struct mount	*mp;
	struct mount	*mp_target;
};

struct pcpu_ncache {
	struct spinlock	umount_spin;	/* cache_findmount/interlock */
	struct spinlock	neg_spin;	/* for neg_list and neg_count */
	struct namecache_list	neg_list;
	long	neg_count;
	long	vfscache_negs;
	long	vfscache_count;
	long	vfscache_leafs;
};
__read_mostly static struct nchash_head	*nchashtbl;
__read_mostly static struct pcpu_ncache	*pcpu_ncache;
static struct ncmount_cache	ncmount_cache[NCMOUNT_NUMCACHE];
/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0	Only errors are reported
 * 1	Successes are reported
 * 2	Successes + the whole directory scan is reported
 * 3	Force the directory scan code run as if the parent vnode did not
 *	have a namecache record, even if it does have one.
 */
__read_mostly static int	ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");
__read_mostly static u_long	nchash;		/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

__read_mostly static int	ncnegflush = 10;  /* burst for negative flush */
SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
    "Batch flush negative entries");

__read_mostly static int	ncposflush = 10;  /* burst for positive flush */
SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
    "Batch flush positive entries");

__read_mostly static int	ncnegfactor = 16; /* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of namecache negative entries");

__read_mostly static int	nclockwarn;  /* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

__read_mostly static int	ncposlimit;  /* number of cache entries allocated */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Number of cache entries allocated");

__read_mostly static int	ncp_shared_lock_disable = 0;
SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
    &ncp_shared_lock_disable, 0, "Disable shared namecache locks");

SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

__read_mostly static int	ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
    &ncmount_cache_enable, 0, "mount point cache");
static __inline void _cache_drop(struct namecache *ncp);
static int cache_resolve_mp(struct mount *mp);
static int cache_findmount_callback(struct mount *mp, void *data);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long count);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);
/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of namecache leaf entries");
static long numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred cache entries");

struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * effectiveness.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i;
	int error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
					sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");

static void cache_zap(struct namecache *ncp);
/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
 */
#define MNTCACHE_COUNT	32	/* power of 2, multiple of SET */
#define MNTCACHE_SET	8	/* set associativity */

struct mntcache_elm {
	struct namecache *ncp;
	struct mount	*mp;
	int	ticks;
};

struct mntcache {
	struct mntcache_elm array[MNTCACHE_COUNT];
};

static struct mntcache	pcpu_mntcache[MAXCPU];
static __inline struct mntcache_elm *
_cache_mntcache_hash(void *ptr)
{
	struct mntcache_elm *elm;
	int hv;

	hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
	elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];

	return elm;
}
static void
_cache_mntref(struct mount *mp)
{
	struct mntcache_elm *elm;
	struct mount *mpr;
	int i;

	elm = _cache_mntcache_hash(mp);
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->mp == mp) {
			mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
			if (__predict_true(mpr == mp))
				return;
			if (mpr)
				atomic_add_int(&mpr->mnt_refs, -1);
		}
		++elm;
	}
	atomic_add_int(&mp->mnt_refs, 1);
}
static void
_cache_mntrel(struct mount *mp)
{
	struct mntcache_elm *elm;
	struct mntcache_elm *best;
	struct mount *mpr;
	int delta1;
	int delta2;
	int i;

	elm = _cache_mntcache_hash(mp);
	best = elm;
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->mp == NULL) {
			mpr = atomic_swap_ptr((void *)&elm->mp, mp);
			if (__predict_false(mpr != NULL)) {
				atomic_add_int(&mpr->mnt_refs, -1);
			}
			elm->ticks = ticks;
			return;
		}
		delta1 = ticks - best->ticks;
		delta2 = ticks - elm->ticks;
		if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
			best = elm;
		++elm;
	}
	mpr = atomic_swap_ptr((void *)&best->mp, mp);
	best->ticks = ticks;
	if (mpr)
		atomic_add_int(&mpr->mnt_refs, -1);
}
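
/*
 * Illustrative example (not part of the original code): callers treat the
 * pair above as drop-in replacements for the bare mnt_refs atomics.  A ref
 * released with _cache_mntrel() may be parked in the per-cpu cache and
 * recovered by a later _cache_mntref() without touching mnt_refs at all:
 *
 *	_cache_mntref(mp);		// may consume a cached ref
 *	... use mp ...
 *	_cache_mntrel(mp);		// may park the ref per-cpu
 */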
/*
 * Clears all cached mount points on all cpus.  This routine should only
 * be called when we are waiting for a mount to clear, e.g. so we can
 * unmount.
 */
void
cache_clearmntcache(struct mount *target __unused)
{
	int n;

	for (n = 0; n < ncpus; ++n) {
		struct mntcache *cache = &pcpu_mntcache[n];
		struct mntcache_elm *elm;
		struct namecache *ncp;
		struct mount *mp;
		int i;

		for (i = 0; i < MNTCACHE_COUNT; ++i) {
			elm = &cache->array[i];
			if (elm->mp) {
				mp = atomic_swap_ptr((void *)&elm->mp, NULL);
				if (mp)
					atomic_add_int(&mp->mnt_refs, -1);
			}
			if (elm->ncp) {
				ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
				if (ncp)
					_cache_drop(ncp);
			}
		}
	}
}
/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  The controlling entity
 * in a 1->0 transition does not need to lock the ncp to dispose of it,
 * as nobody else will have visibility to it at that point.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * An exclusive lock owner has full authority to associate/disassociate
 * vnodes and resolve/unresolve the locked ncp.
 *
 * A shared lock owner only has authority to acquire the underlying vnode,
 * if any.
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
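
/*
 * Illustrative example (not part of the original code) of the pattern the
 * WARNING above suggests when vnode recyclement matters:
 *
 *	cache_hold(&nch);		// stable ref required before locking
 *	cache_lock(&nch);
 *	cache_resolve(&nch, cred);	// revalidate after the lock is held
 *	...
 *	cache_put(&nch);		// unlock and drop in one call
 */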
static void
_cache_lock(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock: "
				"%s blocked on %p \"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}
/*
 * Release a previously acquired lock.
 *
 * A concurrent shared-lock acquisition or acquisition/release can
 * race bit 31 so only drop the ncp if bit 31 was set.
 */
static void
_cache_unlock(struct namecache *ncp)
{
	lockmgr(&ncp->nc_lock, LK_RELEASE);
}
/*
 * Lock ncp exclusively, non-blocking.  Return 0 on success
 * and EWOULDBLOCK on failure.
 */
static int
_cache_lock_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return (EWOULDBLOCK);
	}
	return (0);
}
/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 */
static int
_cache_lock_special(struct namecache *ncp)
{
	if (_cache_lock_nonblock(ncp) == 0) {
		if (lockmgr_oneexcl(&ncp->nc_lock)) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
				_cache_setunresolved(ncp);
			return (0);
		}
		_cache_unlock(ncp);
	}
	return (EWOULDBLOCK);
}
/*
 * Shared lock, guarantees vp held
 *
 * The shared lock holds vp on the 0->1 transition.  It is possible to race
 * another shared lock release, preventing the other release from dropping
 * the vnode and clearing bit 31.
 *
 * If it is not set then we are responsible for setting it, and this
 * responsibility does not race with anyone else.
 */
static void
_cache_lock_shared(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock_shared: "
				"%s blocked on %p \"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock_shared: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}
/*
 * Shared lock, guarantees vp held.  Non-blocking.  Returns 0 on success
 * and EWOULDBLOCK on failure.
 */
static int
_cache_lock_shared_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return (EWOULDBLOCK);
	}
	return (0);
}
/*
 * This function tries to get a shared lock but will back-off to an
 * exclusive lock if:
 *
 * (1) Some other thread is trying to obtain an exclusive lock
 *     (to prevent the exclusive requester from getting livelocked out
 *     by many shared locks).
 *
 * (2) The current thread already owns an exclusive lock (to avoid
 *     deadlocking).
 *
 * WARNING! On machines with lots of cores we really want to try hard to
 *	    get a shared lock or concurrent path lookups can chain-react
 *	    into a very high-latency exclusive lock.
 *
 *	    This is very evident in dsynth's initial scans.
 */
static int
_cache_lock_shared_special(struct namecache *ncp)
{
	/*
	 * Only honor a successful shared lock (returning 0) if there is
	 * no exclusive request pending and the vnode, if present, is not
	 * in a reclaimed state.
	 */
	if (_cache_lock_shared_nonblock(ncp) == 0) {
		if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
			if (ncp->nc_vp == NULL ||
			    (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
				return (0);
			}
		}
		_cache_unlock(ncp);
		return (EWOULDBLOCK);
	}

	/*
	 * Non-blocking shared lock failed.  If we already own the exclusive
	 * lock just acquire another exclusive lock (instead of deadlocking).
	 * Otherwise acquire a shared lock.
	 */
	if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
		_cache_lock(ncp);
		return (0);
	}
	_cache_lock_shared(ncp);
	return (0);
}
static int
_cache_lockstatus(struct namecache *ncp)
{
	int status;

	status = lockstatus(&ncp->nc_lock, curthread);
	if (status == 0 || status == LK_EXCLOTHER)
		status = -1;
	return (status);
}
/*
 * cache_hold() and cache_drop() prevent the premature deletion of a
 * namecache entry but do not prevent operations (such as zapping) on
 * that namecache entry.
 *
 * This routine may only be called from outside this source module if
 * nc_refs is already deterministically at least 1, such as being
 * associated with e.g. a process, file descriptor, or some other entity.
 *
 * Only the above situations, similar situations within this module where
 * the ref count is deterministically at least 1, or when the ncp is found
 * via the nchpp (hash table) lookup, can bump nc_refs.
 *
 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 * can still be removed from the nc_list, however, as long as the caller
 * can acquire its lock (in the wrong order).
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline struct namecache *
_cache_hold(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs > 0);
	atomic_add_int(&ncp->nc_refs, 1);

	return (ncp);
}
/*
 * Drop a cache entry.
 *
 * The 1->0 transition is special and requires the caller to destroy the
 * entry.  It means that the ncp is no longer on a nchpp list (since that
 * would mean there was still a ref).  The ncp could still be on a nc_list
 * but will not have any child of its own, again because nc_refs is now 0
 * and children would have a ref to their parent.
 *
 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 */
static __inline void
_cache_drop(struct namecache *ncp)
{
	if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
		/*
		 * Executed unlocked (no need to lock on last drop)
		 */
		_cache_setunresolved(ncp);

		ncp->nc_refs = -1;	/* safety */
		if (ncp->nc_name)
			kfree(ncp->nc_name, M_VFSCACHE);
		kfree(ncp, M_VFSCACHE);
	}
}
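
/*
 * Illustrative example (not part of the original code): a hold/drop pair
 * brackets any window where the caller dereferences the ncp without other
 * guarantees; the final drop is what triggers destruction:
 *
 *	_cache_hold(ncp);		// legal only while nc_refs >= 1
 *	... inspect ncp ...
 *	_cache_drop(ncp);		// a 1->0 transition frees the ncp
 */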
/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.  The reference is
 * transferred to the nchpp (and, most notably, NOT to the parent list).
 *
 * NOTE: The hash table spinlock is held across this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, 1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_leafs, -1);

		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);	/* add nc_parent ref */
}
/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.  XXX should not be necessary now since the
 * caller receives the ref from the nchpp linkage, assuming the ncp
 * was linked in the first place.
 *
 * ncp must be locked, which means that there won't be any nc_parent
 * removal races.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 */
static void
_cache_unlink_parent(struct namecache *ncp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct namecache *par;
	struct vnode *dropvp;
	struct nchash_head *nchpp;

	if ((par = ncp->nc_parent) != NULL) {
		dropvp = NULL;
		KKASSERT(ncp->nc_parent == par);

		/* don't add a ref, we drop the nchpp ref later */
		_cache_lock(par);
		nchpp = ncp->nc_head;
		spin_lock(&nchpp->spin);

		/*
		 * Remove from hash table and parent, adjust accounting
		 */
		TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_count, -1);
		if (TAILQ_EMPTY(&ncp->nc_list))
			atomic_add_long(&pn->vfscache_leafs, -1);

		if (TAILQ_EMPTY(&par->nc_list)) {
			atomic_add_long(&pn->vfscache_leafs, 1);
			if (par->nc_vp)
				dropvp = par->nc_vp;
		}
		ncp->nc_parent = NULL;
		ncp->nc_head = NULL;
		spin_unlock(&nchpp->spin);
		_cache_unlock(par);
		_cache_drop(par);	/* drop nc_parent ref */

		/*
		 * We can only safely vdrop with no spinlocks held.
		 */
		if (dropvp)
			vdrop(dropvp);
	}
}
/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 *
 * The returned ncp will be locked and referenced.  The ref is generally meant
 * to be transferred to the nchpp linkage.
 */
static struct namecache *
cache_alloc(int nlen)
{
	struct namecache *ncp;

	ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
	if (nlen)
		ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
	ncp->nc_nlen = nlen;
	ncp->nc_flag = NCF_UNRESOLVED;
	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
	ncp->nc_refs = 1;
	TAILQ_INIT(&ncp->nc_list);
	lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
	lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);

	return (ncp);
}
/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
 */
static void
_cache_free(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs == 1);
	if (ncp->nc_name)
		kfree(ncp->nc_name, M_VFSCACHE);
	kfree(ncp, M_VFSCACHE);
}
/*
 * [re]initialize a nchandle.
 */
void
cache_zero(struct nchandle *nch)
{
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Ref and deref a nchandle structure (ncp + mp)
 *
 * The caller must specify a stable ncp pointer, typically meaning the
 * ncp is already referenced but this can also occur indirectly through
 * e.g. holding a lock on a direct child.
 *
 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 *	    use read spinlocks here.
 */
struct nchandle *
cache_hold(struct nchandle *nch)
{
	_cache_hold(nch->ncp);
	_cache_mntref(nch->mount);

	return (nch);
}
/*
 * Create a copy of a namecache handle for an already-referenced
 * entry.
 */
void
cache_copy(struct nchandle *nch, struct nchandle *target)
{
	struct namecache *ncp;
	struct mount *mp;
	struct mntcache_elm *elm;
	struct namecache *ncpr;
	int i;

	ncp = nch->ncp;
	mp = nch->mount;
	target->ncp = ncp;
	target->mount = mp;

	elm = _cache_mntcache_hash(ncp);
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->ncp == ncp) {
			ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
			if (ncpr == ncp) {
				_cache_mntref(mp);
				return;
			}
			if (ncpr)
				_cache_drop(ncpr);
		}
		++elm;
	}
	if (ncp)
		_cache_hold(ncp);
	_cache_mntref(mp);
}
/*
 * Drop the nchandle, but try to cache the ref to avoid global atomic
 * ops.  This is typically done on the system root and jail root nchandles.
 */
void
cache_drop_and_cache(struct nchandle *nch, int elmno)
{
	struct mntcache_elm *elm;
	struct mntcache_elm *best;
	struct namecache *ncpr;
	int delta1;
	int delta2;
	int i;

	if (elmno > 4) {
		_cache_drop(nch->ncp);
		nch->ncp = NULL;
		_cache_mntrel(nch->mount);
		nch->mount = NULL;
		return;
	}

	elm = _cache_mntcache_hash(nch->ncp);
	best = elm;
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->ncp == NULL) {
			ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
			_cache_mntrel(nch->mount);
			elm->ticks = ticks;
			if (ncpr)
				_cache_drop(ncpr);
			nch->ncp = NULL;
			nch->mount = NULL;
			return;
		}
		delta1 = ticks - best->ticks;
		delta2 = ticks - elm->ticks;
		if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
			best = elm;
		++elm;
	}
	ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
	_cache_mntrel(nch->mount);
	best->ticks = ticks;
	if (ncpr)
		_cache_drop(ncpr);
	nch->ncp = NULL;
	nch->mount = NULL;
}
void
cache_changemount(struct nchandle *nch, struct mount *mp)
{
	_cache_mntref(mp);
	_cache_mntrel(nch->mount);
	nch->mount = mp;
}

void
cache_drop(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_drop(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}
int
cache_lockstatus(struct nchandle *nch)
{
	return(_cache_lockstatus(nch->ncp));
}

void
cache_lock(struct nchandle *nch)
{
	_cache_lock(nch->ncp);
}

void
cache_lock_maybe_shared(struct nchandle *nch, int excl)
{
	struct namecache *ncp = nch->ncp;

	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		_cache_lock(ncp);
	} else {
		_cache_lock_shared(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
				_cache_unlock(ncp);
				_cache_lock(ncp);
			}
		} else {
			_cache_unlock(ncp);
			_cache_lock(ncp);
		}
	}
}
/*
 * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 * is responsible for checking both for validity on return as they
 * may have become invalid.
 *
 * We have to deal with potential deadlocks here, just ping pong
 * the lock until we get it (we will always block somewhere when
 * looping so this is not cpu-intensive).
 *
 * which = 0	nch1 not locked, nch2 is locked
 * which = 1	nch1 is locked, nch2 is not locked
 */
void
cache_relock(struct nchandle *nch1, struct ucred *cred1,
	     struct nchandle *nch2, struct ucred *cred2)
{
	int which;

	which = 0;
	for (;;) {
		if (which == 0) {
			if (cache_lock_nonblock(nch1) == 0) {
				cache_resolve(nch1, cred1);
				break;
			}
			cache_unlock(nch2);
			cache_lock(nch1);
			cache_resolve(nch1, cred1);
			which = 1;
		} else {
			if (cache_lock_nonblock(nch2) == 0) {
				cache_resolve(nch2, cred2);
				break;
			}
			cache_unlock(nch1);
			cache_lock(nch2);
			cache_resolve(nch2, cred2);
			which = 0;
		}
	}
}
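
/*
 * Illustrative example (not part of the original code): a rename-style
 * caller that holds nch2 locked but lost nch1's lock regains both without
 * deadlocking by letting cache_relock() ping-pong them:
 *
 *	cache_relock(&fnch, cred, &tnch, cred);	// which = 0 case
 *	// both are now locked, but both must be revalidated by the caller
 */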
int
cache_lock_nonblock(struct nchandle *nch)
{
	return(_cache_lock_nonblock(nch->ncp));
}

void
cache_unlock(struct nchandle *nch)
{
	_cache_unlock(nch->ncp);
}
/*
 * ref-and-lock, unlock-and-deref functions.
 *
 * This function is primarily used by nlookup.  Even though cache_lock
 * holds the vnode, it is possible that the vnode may have already
 * initiated a recyclement.
 *
 * We want cache_get() to return a definitively usable vnode or a
 * definitively unresolved ncp.
 */
static struct namecache *
_cache_get(struct namecache *ncp)
{
	_cache_hold(ncp);
	_cache_lock(ncp);
	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
		_cache_setunresolved(ncp);
	return (ncp);
}

/*
 * Attempt to obtain a shared lock on the ncp.  A shared lock will only
 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
 * valid.  Otherwise an exclusive lock will be acquired instead.
 */
static struct namecache *
_cache_get_maybe_shared(struct namecache *ncp, int excl)
{
	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		return(_cache_get(ncp));
	}
	_cache_hold(ncp);
	_cache_lock_shared(ncp);
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
			_cache_unlock(ncp);
			_cache_drop(ncp);
			ncp = _cache_get(ncp);
		}
	} else {
		_cache_unlock(ncp);
		_cache_drop(ncp);
		ncp = _cache_get(ncp);
	}
	return (ncp);
}
/*
 * NOTE: The same nchandle can be passed for both arguments.
 */
void
cache_get(struct nchandle *nch, struct nchandle *target)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get(nch->ncp);
	_cache_mntref(target->mount);
}

void
cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
	_cache_mntref(target->mount);
}
/*
 * Release a held and locked ncp
 */
static __inline void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);
	_cache_drop(ncp);
}

void
cache_put(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_put(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}
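
/*
 * Illustrative example (not part of the original code): the usual
 * nlookup-style pairing of the get/put functions above:
 *
 *	struct nchandle nch;
 *
 *	cache_get(&par_nch, &nch);	// ref+lock, revalidates the vnode
 *	... operate on the locked nch.ncp ...
 *	cache_put(&nch);		// unlock, drop ncp ref and mount ref
 */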
/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * The ncp should be locked on entry and will remain locked on return.
 */
static void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
{
	KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
		 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
		 ncp->nc_vp == NULL);

	if (vp) {
		/*
		 * Any vp associated with an ncp which has children must
		 * be held.  Any vp associated with a locked ncp must be held.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			vhold(vp);
		spin_lock(&vp->v_spin);
		ncp->nc_vp = vp;
		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
		++vp->v_namecache_count;
		_cache_hold(ncp);	/* v_namecache assoc */
		spin_unlock(&vp->v_spin);
		vhold(vp);		/* nc_vp */

		/*
		 * Set auxiliary flags
		 */
		switch(vp->v_type) {
		case VDIR:
			ncp->nc_flag |= NCF_ISDIR;
			break;
		case VLNK:
			ncp->nc_flag |= NCF_ISSYMLINK;
			/* XXX cache the contents of the symlink */
			break;
		default:
			break;
		}

		ncp->nc_error = 0;

		/*
		 * XXX: this is a hack to work-around the lack of a real pfs vfs
		 * type
		 */
		if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
			vp->v_pfsmp = mp;
	} else {
		/*
		 * When creating a negative cache hit we set the
		 * namecache_gen.  A later resolve will clean out the
		 * negative cache hit if the mount point's namecache_gen
		 * has changed.  Used by devfs, could also be used by
		 * other remote FSs.
		 */
		struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

		ncp->nc_negcpu = mycpu->gd_cpuid;
		spin_lock(&pn->neg_spin);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);	/* neg_list assoc */
		++pn->neg_count;
		spin_unlock(&pn->neg_spin);
		atomic_add_long(&pn->vfscache_negs, 1);

		ncp->nc_error = ENOENT;
		if (mp)
			VFS_NCPGEN_SET(mp, ncp);
	}
	ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
	_cache_setvp(nch->mount, nch->ncp, vp);
}
void
cache_settimeout(struct nchandle *nch, int nticks)
{
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_timeout = ticks + nticks) == 0)
		ncp->nc_timeout = 1;
}
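
/*
 * Illustrative example (not part of the original code): a remote
 * filesystem resolver might give a freshly resolved entry a five second
 * lifetime, after which _cache_auto_unresolve_test() (below) zaps it:
 *
 *	cache_settimeout(&nch, 5 * hz);	// nticks is expressed in ticks
 */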
/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 */
static void
_cache_setunresolved(struct namecache *ncp)
{
	struct vnode *vp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		ncp->nc_flag |= NCF_UNRESOLVED;
		ncp->nc_timeout = 0;
		ncp->nc_error = ENOTCONN;
		if ((vp = ncp->nc_vp) != NULL) {
			spin_lock(&vp->v_spin);
			ncp->nc_vp = NULL;
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			--vp->v_namecache_count;
			spin_unlock(&vp->v_spin);

			/*
			 * Any vp associated with an ncp with children is
			 * held by that ncp.  Any vp associated with ncp
			 * is held by that ncp.  These conditions must be
			 * undone when the vp is cleared out from the ncp.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
			vdrop(vp);
		} else {
			struct pcpu_ncache *pn;

			pn = &pcpu_ncache[ncp->nc_negcpu];

			atomic_add_long(&pn->vfscache_negs, -1);
			spin_lock(&pn->neg_spin);
			TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
			--pn->neg_count;
			spin_unlock(&pn->neg_spin);
		}
		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
		_cache_drop(ncp);	/* from v_namecache or neg_list */
	}
}
/*
 * The cache_nresolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point namecache_gen
 * has changed.
 */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Try to zap entries that have timed out.  We have
	 * to be careful here because locked leafs may depend
	 * on the vnode remaining intact in a parent, so only
	 * do this under very specific conditions.
	 */
	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	    TAILQ_EMPTY(&ncp->nc_list)) {
		return 1;
	}

	/*
	 * If a resolved negative cache hit is invalid due to
	 * the mount's namecache generation being bumped, zap it.
	 */
	if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
		return 1;
	}

	/*
	 * Otherwise we are good
	 */
	return 0;
}

static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Already in an unresolved state, nothing to do.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (_cache_auto_unresolve_test(mp, ncp))
			_cache_setunresolved(ncp);
	}
}

void
cache_setunresolved(struct nchandle *nch)
{
	_cache_setunresolved(nch->ncp);
}
/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 */
static int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
	struct nchandle *nch = data;

	if (mp->mnt_ncmounton.ncp == nch->ncp)
		return(1);
	if (mp->mnt_ncmountpt.ncp == nch->ncp)
		return(1);
	return(0);
}

/*
 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
 * with a mount point.
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
			       MNTSCAN_NOUNLOCK);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}
/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and:
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */
struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			/*_cache_put(ncp2);*/
			_cache_unlock(ncp2);
			_cache_drop(ncp2);
		}
		_cache_lock(ncp);
	}
	return (r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}
/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}

	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.  Then lock the kid and check for
			 * races.
			 */
			_cache_unlock(ncp);
			if (track->resume_ncp) {
				_cache_drop(kid);
				if (nextkid)
					_cache_drop(nextkid);
				_cache_lock(ncp);
				break;
			}
			_cache_lock(kid);
			if (kid->nc_parent != ncp) {
				kprintf("cache_inval_internal "
					"restartB %s\n",
					ncp->nc_name);
				restart = 1;
				_cache_put(kid);
				if (nextkid)
					_cache_drop(nextkid);
				_cache_lock(ncp);
				break;
			}
			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
			    TAILQ_FIRST(&kid->nc_list)
			) {
				rcnt += _cache_inval_internal(kid,
						flags & ~CINV_DESTROY, track);
				/*_cache_unlock(kid);*/
				/*_cache_drop(kid);*/
				_cache_put(kid);
			} else {
				_cache_put(kid);
			}

			/*
			 * Relock parent to continue scan
			 */
			_cache_lock(ncp);
		}
		if (nextkid)
			_cache_drop(nextkid);
		--track->depth;
		if (restart == 0)
			break;
	}

	/*
	 * Someone could have gotten in there while ncp was unlocked,
	 * so count a re-resolved ncp as a detected revalidation.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
		++rcnt;
	return (rcnt);
}
/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completes.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
	struct namecache *ncp;
	struct namecache *next;

restart:
	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held and vp spin-locked */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		_cache_lock(ncp);
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto restart;
		}
		_cache_inval(ncp, flags);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto restart;
		}
	}
	spin_unlock(&vp->v_spin);
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
/*
 * This routine is used instead of the normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *next;

	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		_cache_inval(ncp, 0);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto done;
		}
	}
	spin_unlock(&vp->v_spin);
done:
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
/*
 * Clears the universal directory search 'ok' flag.  This flag allows
 * nlookup() to bypass normal vnode checks.  This flag is a cached flag
 * so clearing it simply forces revalidation.
 */
void
cache_inval_wxok(struct vnode *vp)
{
	struct namecache *ncp;

	spin_lock(&vp->v_spin);
	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
		if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX))
			atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
	}
	spin_unlock(&vp->v_spin);
}
/*
 * The source ncp has been renamed to the target ncp.  Both fncp and tncp
 * must be locked.  The target ncp is destroyed (as a normal rename-over
 * would destroy the target file or directory).
 *
 * Because there may be references to the source ncp we cannot copy its
 * contents to the target.  Instead the source ncp is relinked as the target
 * and the target ncp is removed from the namecache topology.
 */
void
cache_rename(struct nchandle *fnch, struct nchandle *tnch)
{
	struct namecache *fncp = fnch->ncp;
	struct namecache *tncp = tnch->ncp;
	struct namecache *tncp_par;
	struct nchash_head *nchpp;
	u_int32_t hash;
	char *oname;
	char *nname;

	++fncp->nc_generation;
	++tncp->nc_generation;
	if (tncp->nc_nlen) {
		nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
		bcopy(tncp->nc_name, nname, tncp->nc_nlen);
		nname[tncp->nc_nlen] = 0;
	} else {
		nname = NULL;
	}

	/*
	 * Rename fncp (unlink)
	 */
	_cache_unlink_parent(fncp);
	oname = fncp->nc_name;
	fncp->nc_name = nname;
	fncp->nc_nlen = tncp->nc_nlen;
	if (oname)
		kfree(oname, M_VFSCACHE);

	tncp_par = tncp->nc_parent;
	_cache_hold(tncp_par);
	_cache_lock(tncp_par);

	/*
	 * Rename fncp (relink)
	 */
	hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
	hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
	nchpp = NCHHASH(hash);

	spin_lock(&nchpp->spin);
	_cache_link_parent(fncp, tncp_par, nchpp);
	spin_unlock(&nchpp->spin);

	_cache_put(tncp_par);

	/*
	 * Get rid of the overwritten tncp (unlink)
	 */
	_cache_unlink(tncp);
}
/*
 * Perform actions consistent with unlinking a file.  The passed-in ncp
 * must be locked.
 *
 * The ncp is marked DESTROYED so it no longer shows up in searches,
 * and will be physically deleted when the vnode goes away.
 *
 * If the related vnode has no refs then we cycle it through vget()/vput()
 * to (possibly if we don't have a ref race) trigger a deactivation,
 * allowing the VFS to trivially detect and recycle the deleted vnode
 * via VOP_INACTIVE().
 *
 * NOTE: _cache_rename() will automatically call _cache_unlink() on the
 *	 target ncp.
 */
void
cache_unlink(struct nchandle *nch)
{
	_cache_unlink(nch->ncp);
}

static void
_cache_unlink(struct namecache *ncp)
{
	struct vnode *vp;

	/*
	 * Causes lookups to fail and allows another ncp with the same
	 * name to be created under ncp->nc_parent.
	 */
	ncp->nc_flag |= NCF_DESTROYED;
	++ncp->nc_generation;

	/*
	 * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
	 * force action on the 1->0 transition.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL) {
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
		if (VREFCNT(vp) <= 0) {
			if (vget(vp, LK_SHARED) == 0)
				vput(vp);
		}
	}
}
/*
 * Return non-zero if the nch might be associated with an open and/or mmap()'d
 * file.  The easy solution is to just return non-zero if the vnode has refs.
 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
 * force the reclaim).
 */
int
cache_isopen(struct nchandle *nch)
{
	struct vnode *vp;
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL &&
	    VREFCNT(vp)) {
		return 1;
	}
	return 0;
}
/*
 * vget the vnode associated with the namecache entry.  Resolve the namecache
 * entry if necessary.  The passed ncp must be referenced and locked.  If
 * the ncp is resolved it might be locked shared.
 *
 * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
 * (depending on the passed lk_type) vnode will be returned in *vpp with an
 * error of 0, or NULL will be returned in *vpp with a non-0 error code.
 * The most typical error is ENOENT, meaning that the ncp represents a
 * negative cache hit and there is no vnode to retrieve, but other errors
 * can occur too.
 *
 * The vget() can race a reclaim.  If this occurs we re-resolve the
 * namecache entry.
 *
 * There are numerous places in the kernel where vget() is called on a
 * vnode while one or more of its namecache entries is locked.  Releasing
 * a vnode never deadlocks against locked namecache entries (the vnode
 * will not get recycled while referenced ncp's exist).  This means we
 * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
 * lock when acquiring the vp lock or we might cause a deadlock.
 *
 * NOTE: The passed-in ncp must be locked exclusively if it is initially
 *	 unresolved.  If a reclaim race occurs the passed-in ncp will be
 *	 relocked exclusively before being re-resolved.
 */
int
cache_vget(struct nchandle *nch, struct ucred *cred,
	   int lk_type, struct vnode **vpp)
{
	struct namecache *ncp;
	struct vnode *vp;
	int error;

	ncp = nch->ncp;
again:
	vp = NULL;
	if (ncp->nc_flag & NCF_UNRESOLVED)
		error = cache_resolve(nch, cred);
	else
		error = 0;

	if (error == 0 && (vp = ncp->nc_vp) != NULL) {
		error = vget(vp, lk_type);
		if (error) {
			/*
			 * The ncp may have been locked shared, we must relock
			 * it exclusively before we can set it to unresolved.
			 */
			if (error == ENOENT) {
				kprintf("Warning: vnode reclaim race detected "
					"in cache_vget on %p (%s)\n",
					vp, ncp->nc_name);
				_cache_unlock(ncp);
				_cache_lock(ncp);
				_cache_setunresolved(ncp);
				goto again;
			}

			/*
			 * Not a reclaim race, some other error.
			 */
			KKASSERT(ncp->nc_vp == vp);
			vp = NULL;
		} else {
			KKASSERT(ncp->nc_vp == vp);
			KKASSERT((vp->v_flag & VRECLAIMED) == 0);
		}
	}
	if (error == 0 && vp == NULL)
		error = ENOENT;
	*vpp = vp;
	return(error);
}
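
/*
 * Illustrative example (not part of the original code): typical caller
 * pattern around cache_vget():
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = cache_vget(&nch, cred, LK_SHARED, &vp);
 *	if (error == 0) {
 *		... use the referenced, locked vp ...
 *		vput(vp);
 *	}
 */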
/*
 * Similar to cache_vget() but only acquires a ref on the vnode.  The vnode
 * is already held by virtue of the ncp being locked, but it might not be
 * referenced and while it is not referenced it can transition into the
 * VRECLAIMED state.
 *
 * NOTE: The passed-in ncp must be locked exclusively if it is initially
 *	 unresolved.  If a reclaim race occurs the passed-in ncp will be
 *	 relocked exclusively before being re-resolved.
 *
 * NOTE: At the moment we have to issue a vget() on the vnode, even though
 *	 we are going to immediately release the lock, in order to resolve
 *	 potential reclamation races.  Once we have a solid vnode ref that
 *	 was (at some point) interlocked via a vget(), the vnode will not
 *	 be reclaimed out from under us.
 *
 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
 */
int
cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
{
	struct namecache *ncp;
	struct vnode *vp;
	int error;
	int v;

	ncp = nch->ncp;
again:
	vp = NULL;
	if (ncp->nc_flag & NCF_UNRESOLVED)
		error = cache_resolve(nch, cred);
	else
		error = 0;

	while (error == 0 && (vp = ncp->nc_vp) != NULL) {
		/*
		 * Try a lockless ref of the vnode.  VRECLAIMED transitions
		 * use the vx_lock state and update-counter mechanism so we
		 * can detect if one is in-progress or occurred.
		 *
		 * If we can successfully ref the vnode and interlock against
		 * the update-counter mechanism, and VRECLAIMED is found to
		 * not be set after that, we should be good.
		 */
		v = spin_access_start_only(&vp->v_spin);
		if (__predict_true(spin_access_check_inprog(v) == 0)) {
			vref_special(vp);
			if (__predict_false(
				    spin_access_end_only(&vp->v_spin, v))) {
				vrele(vp);
				kprintf("CACHE_VREF: RACED %p\n", vp);
				continue;
			}
			if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
				break;
			}
			vrele(vp);
			kprintf("CACHE_VREF: IN-RECLAIM\n");
		}

		/*
		 * Do it the slow way
		 */
		error = vget(vp, LK_SHARED);
		if (error) {
			/*
			 * The ncp may have been locked shared, we must relock
			 * it exclusively before we can set it to unresolved.
			 */
			if (error == ENOENT) {
				kprintf("Warning: vnode reclaim race detected "
					"in cache_vget on %p (%s)\n",
					vp, ncp->nc_name);
				_cache_unlock(ncp);
				_cache_lock(ncp);
				_cache_setunresolved(ncp);
				goto again;
			}

			/*
			 * Not a reclaim race, some other error.
			 */
			KKASSERT(ncp->nc_vp == vp);
			vp = NULL;
		} else {
			KKASSERT(ncp->nc_vp == vp);
			KKASSERT((vp->v_flag & VRECLAIMED) == 0);
			/* caller does not want a lock */
			vn_unlock(vp);
		}
		break;
	}
	if (error == 0 && vp == NULL)
		error = ENOENT;
	*vpp = vp;
	return(error);
}
/*
 * Return a referenced vnode representing the parent directory of
 * ncp.
 *
 * Because the caller has locked the ncp it should not be possible for
 * the parent ncp to go away.  However, the parent can unresolve its
 * dvp at any time so we must be able to acquire a lock on the parent
 * to safely access nc_vp.
 *
 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
 * so use vhold()/vdrop() while holding the lock to prevent dvp from
 * getting destroyed.
 *
 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
 *	 lock on the ncp in question..
 */
static struct vnode *
cache_dvpref(struct namecache *ncp)
{
	struct namecache *par;
	struct vnode *dvp;

	dvp = NULL;
	if ((par = ncp->nc_parent) != NULL) {
		_cache_hold(par);
		_cache_lock(par);
		if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
			if ((dvp = par->nc_vp) != NULL)
				vhold(dvp);
		}
		_cache_unlock(par);
		if (dvp) {
			if (vget(dvp, LK_SHARED) == 0) {
				vn_unlock(dvp);
				vdrop(dvp);
				/* return refd, unlocked dvp */
			} else {
				vdrop(dvp);
				dvp = NULL;
			}
		}
		_cache_drop(par);
	}
	return(dvp);
}
/*
 * Convert a directory vnode to a namecache record without any other
 * knowledge of the topology.  This ONLY works with directory vnodes and
 * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
 * returned ncp (if not NULL) will be held and unlocked.
 *
 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
 * If 'makeit' is 1 we attempt to track-down and create the namecache topology
 * for dvp.  This will fail only if the directory has been deleted out from
 * under the caller.
 *
 * Callers must always check for a NULL return no matter the value of 'makeit'.
 *
 * To avoid underflowing the kernel stack each recursive call increments
 * the makeit variable.
 */
static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
				  struct vnode *dvp, char *fakename);
static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
			     struct vnode **saved_dvp);
int
cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
	      struct nchandle *nch)
{
	struct vnode *saved_dvp;
	struct vnode *pvp;
	char *fakename;
	int error;

	nch->ncp = NULL;
	nch->mount = dvp->v_mount;
	saved_dvp = NULL;
	fakename = NULL;

	/*
	 * Handle the makeit == 0 degenerate case
	 */
	if (makeit == 0) {
		spin_lock_shared(&dvp->v_spin);
		nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
		if (nch->ncp)
			cache_hold(nch);
		spin_unlock_shared(&dvp->v_spin);
	}

	/*
	 * Loop until resolution, inside code will break out on error.
	 */
	while (makeit) {
		/*
		 * Break out if we successfully acquire a working ncp.
		 */
		spin_lock_shared(&dvp->v_spin);
		nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
		if (nch->ncp) {
			cache_hold(nch);
			spin_unlock_shared(&dvp->v_spin);
			break;
		}
		spin_unlock_shared(&dvp->v_spin);

		/*
		 * If dvp is the root of its filesystem it should already
		 * have a namecache pointer associated with it as a side
		 * effect of the mount, but it may have been disassociated.
		 */
		if (dvp->v_flag & VROOT) {
			nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
			error = cache_resolve_mp(nch->mount);
			_cache_put(nch->ncp);
			if (ncvp_debug) {
				kprintf("cache_fromdvp: resolve root of mount %p error %d",
					dvp->v_mount, error);
			}
			if (error) {
				if (ncvp_debug)
					kprintf(" failed\n");
				nch->ncp = NULL;
				break;
			}
			if (ncvp_debug)
				kprintf(" succeeded\n");
			continue;
		}

		/*
		 * If we are recursed too deeply resort to an O(n^2)
		 * algorithm to resolve the namecache topology.  The
		 * resolved pvp is left referenced in saved_dvp to
		 * prevent the tree from being destroyed while we loop.
		 */
		if (makeit > 20) {
			error = cache_fromdvp_try(dvp, cred, &saved_dvp);
			if (error) {
				kprintf("lookupdotdot(longpath) failed %d "
					"dvp %p\n", error, dvp);
				nch->ncp = NULL;
				break;
			}
			continue;
		}

		/*
		 * Get the parent directory and resolve its ncp.
		 */
		if (fakename) {
			kfree(fakename, M_TEMP);
			fakename = NULL;
		}
		error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
					  &fakename);
		if (error) {
			kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
			break;
		}
		vn_unlock(pvp);

		/*
		 * Reuse makeit as a recursion depth counter.  On success
		 * nch will be fully referenced.
		 */
		cache_fromdvp(pvp, cred, makeit + 1, nch);
		vrele(pvp);
		if (nch->ncp == NULL)
			break;

		/*
		 * Do an inefficient scan of pvp (embodied by ncp) to look
		 * for dvp.  This will create a namecache record for dvp on
		 * success.  We loop up to recheck on success.
		 *
		 * ncp and dvp are both held but not locked.
		 */
		error = cache_inefficient_scan(nch, cred, dvp, fakename);
		if (error) {
			kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
				pvp, nch->ncp->nc_name, dvp);
			cache_drop(nch);
			/* nch was NULLed out, reload mount */
			nch->mount = dvp->v_mount;
			break;
		}
		if (ncvp_debug) {
			kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
				pvp, nch->ncp->nc_name);
		}
		cache_drop(nch);
		/* nch was NULLed out, reload mount */
		nch->mount = dvp->v_mount;
	}

	/*
	 * If nch->ncp is non-NULL it will have been held already.
	 */
	if (fakename)
		kfree(fakename, M_TEMP);
	if (saved_dvp)
		vrele(saved_dvp);
	if (nch->ncp)
		error = 0;
	else
		error = ENOENT;
	return (error);
}
/*
 * Go up the chain of parent directories until we find something
 * we can resolve into the namecache.  This is very inefficient.
 */
static int
cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
		  struct vnode **saved_dvp)
{
	struct nchandle nch;
	struct vnode *pvp;
	char *fakename;
	int error;
	static time_t last_fromdvp_report;

	/*
	 * Loop getting the parent directory vnode until we get something we
	 * can resolve in the namecache.
	 */
	vref(dvp);
	nch.mount = dvp->v_mount;
	nch.ncp = NULL;
	fakename = NULL;

	for (;;) {
		if (fakename) {
			kfree(fakename, M_TEMP);
			fakename = NULL;
		}
		error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
					  &fakename);
		if (error) {
			vrele(dvp);
			return (error);
		}
		vn_unlock(pvp);
		spin_lock_shared(&pvp->v_spin);
		if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
			_cache_hold(nch.ncp);
			spin_unlock_shared(&pvp->v_spin);
			vrele(pvp);
			break;
		}
		spin_unlock_shared(&pvp->v_spin);
		if (pvp->v_flag & VROOT) {
			nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
			error = cache_resolve_mp(nch.mount);
			_cache_unlock(nch.ncp);
			vrele(pvp);
			if (error) {
				_cache_drop(nch.ncp);
				nch.ncp = NULL;
				vrele(dvp);
				return (error);
			}
			break;
		}
		vrele(dvp);
		dvp = pvp;
	}
	if (last_fromdvp_report != time_uptime) {
		last_fromdvp_report = time_uptime;
		kprintf("Warning: extremely inefficient path "
			"resolution on %s\n",
			nch.ncp->nc_name);
	}
	error = cache_inefficient_scan(&nch, cred, dvp, fakename);

	/*
	 * Hopefully dvp now has a namecache record associated with
	 * it.  Leave it referenced to prevent the kernel from
	 * recycling the vnode.  Otherwise extremely long directory
	 * paths could result in endless recycling.
	 */
	if (*saved_dvp)
		vrele(*saved_dvp);
	*saved_dvp = dvp;
	_cache_drop(nch.ncp);

	if (fakename)
		kfree(fakename, M_TEMP);
	return (error);
}
/*
 * Do an inefficient scan of the directory represented by ncp looking for
 * the directory vnode dvp.  ncp must be held but not locked on entry and
 * will be held on return.  dvp must be refd but not locked on entry and
 * will remain refd on return.
 *
 * Why do this at all?  Well, due to its stateless nature the NFS server
 * converts file handles directly to vnodes without necessarily going through
 * the namecache ops that would otherwise create the namecache topology
 * leading to the vnode.  We could either (1) Change the namecache algorithms
 * to allow disconnected namecache records that are re-merged opportunistically,
 * or (2) Make the NFS server backtrack and scan to recover a connected
 * namecache topology in order to then be able to issue new API lookups.
 *
 * It turns out that (1) is a huge mess.  It takes a nice clean set of
 * namecache algorithms and introduces a lot of complication in every subsystem
 * that calls into the namecache to deal with the re-merge case, especially
 * since we are using the namecache to placehold negative lookups and the
 * vnode might not be immediately assigned.  (2) is certainly far less
 * efficient than (1), but since we are only talking about directories here
 * (which are likely to remain cached), the case does not actually run all
 * that often and has the supreme advantage of not polluting the namecache
 *
 * If a fakename is supplied just construct a namecache entry using the
 */
cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
		       struct vnode *dvp, char *fakename)
	struct nlcomponent nlc;
	struct nchandle rncp;

	vat.va_blocksize = 0;
	if ((error = VOP_GETATTR(dvp, &vat)) != 0)

	error = cache_vref(nch, cred, &pvp);

	kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
		"vattr fileid = %lld\n",
		nch->ncp, nch->ncp->nc_name,
		(long long)vat.va_fileid);

	/*
	 * Use the supplied fakename if not NULL.  Fake names are typically
	 * not in the actual filesystem hierarchy.  This is used by HAMMER
	 * to glue @@timestamp recursions together.
	 */
	nlc.nlc_nameptr = fakename;
	nlc.nlc_namelen = strlen(fakename);
	rncp = cache_nlookup(nch, &nlc);

	if ((blksize = vat.va_blocksize) == 0)
		blksize = DEV_BSIZE;
	rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);

	iov.iov_base = rbuf;
	iov.iov_len = blksize;

	uio.uio_resid = blksize;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;

	if (ncvp_debug >= 2)
		kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
	error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);

	den = (struct dirent *)rbuf;
	bytes = blksize - uio.uio_resid;

	if (ncvp_debug >= 2) {
		kprintf("cache_inefficient_scan: %*.*s\n",
			den->d_namlen, den->d_namlen,

	if (den->d_type != DT_WHT &&
	    den->d_ino == vat.va_fileid) {

		kprintf("cache_inefficient_scan: "
			"MATCHED inode %lld path %s/%*.*s\n",
			(long long)vat.va_fileid,
			den->d_namlen, den->d_namlen,

		nlc.nlc_nameptr = den->d_name;
		nlc.nlc_namelen = den->d_namlen;
		rncp = cache_nlookup(nch, &nlc);
		KKASSERT(rncp.ncp != NULL);

	bytes -= _DIRENT_DIRSIZ(den);
	den = _DIRENT_NEXT(den);

	if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)

	kfree(rbuf, M_TEMP);

	if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
		_cache_setvp(rncp.mount, rncp.ncp, dvp);
		if (ncvp_debug >= 2) {
			kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
				nch->ncp->nc_name, rncp.ncp->nc_name, dvp);

	if (ncvp_debug >= 2) {
		kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
			nch->ncp->nc_name, rncp.ncp->nc_name, dvp,

	if (rncp.ncp->nc_vp == NULL)
		error = rncp.ncp->nc_error;

	/*
	 * Release rncp after a successful nlookup.  rncp was fully
	 */

	kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
		dvp, nch->ncp->nc_name);
/*
 * This function must be called with the ncp held and locked and will unlock
 * and drop it during zapping.
 *
 * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list
 * and removes the related reference.  If the ncp can be removed, and the
 * parent can be zapped non-blocking, this function loops up.
 *
 * There will be one ref from the caller (which we now own).  The only
 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list,
 * so possibly 2 refs left.  Taking this into account, if there are no
 * additional refs and no children, the ncp will be removed from the topology
 *
 * References and/or children may exist if the ncp is in the middle of the
 * topology, preventing the ncp from being destroyed.
 *
 * If nonblock is non-zero and the parent ncp cannot be locked we give up.
 *
 * This function may return a held (but NOT locked) parent node which the
 * caller must drop in a loop.  Looping is one way to avoid unbounded recursion
 * due to deep namecache trees.
 *
 * WARNING!  For MPSAFE operation this routine must acquire up to three
 *	     spin locks to be able to safely test nc_refs.  Lock order is
 *
 *	     hash spinlock if on hash list
 *	     parent spinlock if child of parent
 *	     (the ncp is unresolved so there is no vnode association)
 */
cache_zap(struct namecache *ncp)
	struct namecache *par;
	struct vnode *dropvp;
	struct nchash_head *nchpp;
	int nonblock = 1;	/* XXX cleanup */

	/*
	 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
	 * This gets rid of any vp->v_namecache list or negative list and
	 */
	_cache_setunresolved(ncp);

	/*
	 * Try to scrap the entry and possibly tail-recurse on its parent.
	 * We only scrap unref'd (other than our ref) unresolved entries,
	 * we do not scrap 'live' entries.
	 *
	 * If nc_parent is non NULL we expect 2 references, else just 1.
	 * If there are more, someone else also holds the ncp and we cannot
	 */
	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
	KKASSERT(ncp->nc_refs > 0);

	/*
	 * If the ncp is linked to its parent it will also be in the hash
	 * table.  We have to be able to lock the parent and the hash table.
	 *
	 * Acquire locks.  Note that the parent can't go away while we hold
	 * a child locked.  If nc_parent is present, expect 2 refs instead
	 */
	if ((par = ncp->nc_parent) != NULL) {
		if (_cache_lock_nonblock(par)) {
			ncp->nc_flag |= NCF_DEFEREDZAP;
			&pcpu_ncache[mycpu->gd_cpuid].numdefered,
			_cache_drop(ncp);	/* caller's ref */

	nchpp = ncp->nc_head;
	spin_lock(&nchpp->spin);

	/*
	 * With the parent and nchpp locked, and the vnode removed
	 * (no vp->v_namecache), we expect 1 or 2 refs.  If there are
	 * more someone else has a ref and we cannot zap the entry.
	 *
	 * one for our parent link (parent also has one from the linkage)
	 *
	 * On failure undo the work we've done so far and drop the
	 * caller's ref and ncp.
	 */
	if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) {
		spin_unlock(&nchpp->spin);

	/*
	 * We own all the refs and with the spinlocks held no further
	 * refs can be acquired by others.
	 *
	 * Remove us from the hash list and parent list.  We have to
	 * drop a ref on the parent's vp if the parent's list becomes
	 */
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(nchpp == ncp->nc_head);
	TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
	TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
	atomic_add_long(&pn->vfscache_count, -1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, -1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		atomic_add_long(&pn->vfscache_leafs, 1);
		dropvp = par->nc_vp;

	ncp->nc_parent = NULL;
	ncp->nc_head = NULL;
	spin_unlock(&nchpp->spin);
	_cache_drop(par);	/* removal of ncp from par->nc_list */
	/*_cache_unlock(par);*/

	KKASSERT(ncp->nc_head == NULL);

	/*
	 * ncp should not have picked up any refs.  Physically
	 */
	if (ncp->nc_refs != refcmp) {
		panic("cache_zap: %p bad refs %d (expected %d)\n",
		      ncp, ncp->nc_refs, refcmp);

	/* _cache_unlock(ncp) not required */
	ncp->nc_refs = -1;	/* safety */

	kfree(ncp->nc_name, M_VFSCACHE);
	kfree(ncp, M_VFSCACHE);

	/*
	 * Delayed drop (we had to release our spinlocks)
	 */

	/*
	 * Loop up if we can recursively clean out the parent.
	 */
	refcmp = 1;		/* ref on parent */
	if (par->nc_parent)	/* par->par */

	par->nc_flag &= ~NCF_DEFEREDZAP;
	if ((par->nc_flag & NCF_UNRESOLVED) &&
	    par->nc_refs == refcmp &&
	    TAILQ_EMPTY(&par->nc_list)) {

/*
 * Clean up dangling negative cache and deferred-drop entries in the
 *
 * This routine is called in the critical path and also called from
 * vnlru().  When called from vnlru we use a lower limit to try to
 * deal with the negative cache before the critical path has to start
 */
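/*
 * Worked example (illustrative only, not from the original source): with an
 * assumed maxvnodes of 100000 and an assumed ncnegfactor of 16, the negative
 * cache limit is neglimit = 100000 / 16 = 6250 entries on the critical path.
 * When called from vnlru (critpath == 0) the limit is scaled by 8/10, i.e.
 * 6250 * 8 / 10 = 5000, so vnlru starts trimming the negative cache earlier
 * than the critical path would.
 */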
typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;

static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };

cache_hysteresis(int critpath)
	long neglimit = maxvnodes / ncnegfactor;
	long xnumcache = vfscache_leafs;

	neglimit = neglimit * 8 / 10;

	/*
	 * Don't cache too many negative hits.  We use hysteresis to reduce
	 * the impact on the critical path.
	 */
	switch(neg_cache_hysteresis_state[critpath]) {
		if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
			_cache_cleanneg(ncnegflush);
			_cache_cleanneg(ncnegflush +
					vfscache_negs - neglimit);
			neg_cache_hysteresis_state[critpath] = CHI_HIGH;

		if (vfscache_negs > MINNEG * 9 / 10 &&
		    vfscache_negs * 9 / 10 > neglimit
			_cache_cleanneg(ncnegflush);
			_cache_cleanneg(ncnegflush +
					vfscache_negs * 9 / 10 -
			neg_cache_hysteresis_state[critpath] = CHI_LOW;

	/*
	 * Don't cache too many positive hits.  We use hysteresis to reduce
	 * the impact on the critical path.
	 *
	 * Excessive positive hits can accumulate due to large numbers of
	 * hardlinks (the vnode cache will not prevent hl ncps from growing
	 */
	if ((poslimit = ncposlimit) == 0)
		poslimit = maxvnodes * 2;
	poslimit = poslimit * 8 / 10;

	switch(pos_cache_hysteresis_state[critpath]) {
		if (xnumcache > poslimit && xnumcache > MINPOS) {
			_cache_cleanpos(ncposflush);
			_cache_cleanpos(ncposflush +
					xnumcache - poslimit);
			pos_cache_hysteresis_state[critpath] = CHI_HIGH;

		if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
			_cache_cleanpos(ncposflush);
			_cache_cleanpos(ncposflush +
					xnumcache - poslimit * 5 / 6);
			pos_cache_hysteresis_state[critpath] = CHI_LOW;

	/*
	 * Clean out dangling deferred-zap ncps which could not be cleanly
	 * dropped if too many build up.  Note that numdefered is
	 * heuristical.  Make sure we are real-time for the current cpu,
	 * plus the global rollup.
	 */
	if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
		_cache_cleandefered();
/*
 * NEW NAMECACHE LOOKUP API
 *
 * Lookup an entry in the namecache.  The passed par_nch must be referenced
 * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
 * is ALWAYS returned, even if the supplied component is illegal.
 *
 * The resulting namecache entry should be returned to the system with
 * cache_put() or cache_unlock() + cache_drop().
 *
 * namecache locks are recursive but care must be taken to avoid lock order
 * reversals (hence why the passed par_nch must be unlocked).  Locking
 * rules are to order for parent traversals, not for child traversals.
 *
 * Nobody else will be able to manipulate the associated namespace (e.g.
 * create, delete, rename, rename-target) until the caller unlocks the
 *
 * The returned entry will be in one of three states: positive hit (non-null
 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
 * Unresolved entries must be resolved through the filesystem to associate the
 * vnode and/or determine whether a positive or negative hit has occurred.
 *
 * It is not necessary to lock a directory in order to lock namespace under
 * that directory.  In fact, it is explicitly not allowed to do that.  A
 * directory is typically only locked when being created, renamed, or
 *
 * The directory (par) may be unresolved, in which case any returned child
 * will likely also be marked unresolved.  Likely but not guaranteed.  Since
 * the filesystem lookup requires a resolved directory vnode the caller is
 * responsible for resolving the namecache chain top-down.  This API
 * specifically allows whole chains to be created in an unresolved state.
 */
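/*
 * Illustrative only (not part of the original source, wrapped in #if 0 so it
 * is not compiled): a minimal sketch of how a caller typically consumes the
 * lookup API described above, using a hypothetical helper name and
 * parameters.  The lookup returns a locked, referenced nchandle; unresolved
 * entries are pushed through cache_resolve() and the handle is released with
 * cache_put() when the caller is done with it.
 */
#if 0
static int
example_nlookup(struct nchandle *par_nch, const char *name,
		struct ucred *cred, struct nchandle *res)
{
	struct nlcomponent nlc;
	struct nchandle nch;
	int error = 0;

	nlc.nlc_nameptr = __DECONST(char *, name);
	nlc.nlc_namelen = strlen(name);

	nch = cache_nlookup(par_nch, &nlc);	/* locked + referenced */
	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
		error = cache_resolve(&nch, cred);
	if (error == 0)
		*res = nch;		/* caller later cache_put()s *res */
	else
		cache_put(&nch);
	return (error);
}
#endif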
cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
	struct nchandle nch;
	struct namecache *ncp;
	struct namecache *new_ncp;
	struct namecache *rep_ncp;	/* reuse a destroyed ncp */
	struct nchash_head *nchpp;

	mp = par_nch->mount;

	/*
	 * This is a good time to call it, no ncp's are locked by
	 */
	cache_hysteresis(1);

	/*
	 * Try to locate an existing entry
	 */
	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	nchpp = NCHHASH(hash);

	spin_lock(&nchpp->spin);

	spin_lock_shared(&nchpp->spin);

	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
		/*
		 * Break out if we find a matching entry.  Note that
		 * UNRESOLVED entries may match, but DESTROYED entries
		 *
		 * We may be able to reuse DESTROYED entries that we come
		 * across, even if the name does not match, as long as
		 * nc_nlen is correct and the only hold ref is from the nchpp
		 */
		if (ncp->nc_parent == par_nch->ncp &&
		    ncp->nc_nlen == nlc->nlc_namelen) {
			if (ncp->nc_flag & NCF_DESTROYED) {
				if (ncp->nc_refs == 1 && rep_ncp == NULL)

			if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))

			spin_unlock(&nchpp->spin);

			spin_unlock_shared(&nchpp->spin);

			_cache_unlock(par_nch->ncp);

			if (_cache_lock_special(ncp) == 0) {
				/*
				 * Successfully locked but we must re-test
				 * conditions that might have changed since
				 * we did not have the lock before.
				 */
				if (ncp->nc_parent != par_nch->ncp ||
				    ncp->nc_nlen != nlc->nlc_namelen ||
				    bcmp(ncp->nc_name, nlc->nlc_nameptr,
				    (ncp->nc_flag & NCF_DESTROYED)) {

				_cache_auto_unresolve(mp, ncp);

				_cache_free(new_ncp);

			_cache_get(ncp);	/* cycle the lock to block */

	/*
	 * We failed to locate the entry, try to resurrect a destroyed
	 * entry that we did find that is already correctly linked into
	 * nchpp and the parent.  We must re-test conditions after
	 * successfully locking rep_ncp.
	 *
	 * This case can occur under heavy loads due to not being able
	 * to safely lock the parent in cache_zap().  Nominally a repeated
	 * create/unlink load, but only the namelen needs to match.
	 */
	if (rep_ncp && new_ncp == NULL) {
		if (_cache_lock_nonblock(rep_ncp) == 0) {
			_cache_hold(rep_ncp);
			if (rep_ncp->nc_parent == par_nch->ncp &&
			    rep_ncp->nc_nlen == nlc->nlc_namelen &&
			    (rep_ncp->nc_flag & NCF_DESTROYED) &&
			    rep_ncp->nc_refs == 2) {
				/*
				 * Update nc_name as reuse as new.
				 */
				bcopy(nlc->nlc_nameptr, ncp->nc_name,
				spin_unlock_shared(&nchpp->spin);
				_cache_setunresolved(ncp);
				ncp->nc_flag = NCF_UNRESOLVED;
				ncp->nc_error = ENOTCONN;

			_cache_put(rep_ncp);

	/*
	 * Otherwise create a new entry and add it to the cache.  The parent
	 * ncp must also be locked so we can link into it.
	 *
	 * We have to relookup after possibly blocking in kmalloc or
	 * when locking par_nch.
	 *
	 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
	 *	 mount case, in which case nc_name will be NULL.
	 */
	if (new_ncp == NULL) {
		spin_unlock_shared(&nchpp->spin);
		new_ncp = cache_alloc(nlc->nlc_namelen);
		if (nlc->nlc_namelen) {
			bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
			new_ncp->nc_name[nlc->nlc_namelen] = 0;

	/*
	 * NOTE! The spinlock is held exclusively here because new_ncp
	 */
	if (par_locked == 0) {
		spin_unlock(&nchpp->spin);
		_cache_lock(par_nch->ncp);

	/*
	 * Link to parent (requires another ref, the one already in new_ncp
	 * is what we will return).
	 *
	 * WARNING! We still hold the spinlock.  We have to set the hash
	 *	    table entry atomically.
	 */
	_cache_link_parent(ncp, par_nch->ncp, nchpp);
	spin_unlock(&nchpp->spin);
	_cache_unlock(par_nch->ncp);
	/* par_locked = 0 - not used */

	/*
	 * stats and namecache size management
	 */
	if (ncp->nc_flag & NCF_UNRESOLVED)
		++gd->gd_nchstats->ncs_miss;
	else if (ncp->nc_vp)
		++gd->gd_nchstats->ncs_goodhits;

	++gd->gd_nchstats->ncs_neghits;

	_cache_mntref(nch.mount);
/*
 * Attempt to lookup a namecache entry and return with a shared namecache
 * lock.  This operates non-blocking.  EWOULDBLOCK is returned if excl is
 * set or we are unable to lock.
 */
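/*
 * Illustrative only (not part of the original source, compiled out): the
 * usual consumer pattern for this function, assuming the caller already has
 * par_nch, nlc and cred set up.  A shared-lock lookup is attempted first and
 * the caller falls back to the exclusive-lock cache_nlookup() path when
 * EWOULDBLOCK is returned.
 */
#if 0
	error = cache_nlookup_maybe_shared(&par_nch, &nlc, 0, &nch);
	if (error == EWOULDBLOCK) {
		nch = cache_nlookup(&par_nch, &nlc);	/* exclusive lock */
		if (nch.ncp->nc_flag & NCF_UNRESOLVED)
			error = cache_resolve(&nch, cred);
		else
			error = nch.ncp->nc_error;
	}
#endif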
cache_nlookup_maybe_shared(struct nchandle *par_nch,
			   struct nlcomponent *nlc,
			   int excl, struct nchandle *res_nch)
	struct namecache *ncp;
	struct nchash_head *nchpp;

	/*
	 * If exclusive requested or shared namecache locks are disabled,
	 */
	if (ncp_shared_lock_disable || excl)
		return(EWOULDBLOCK);

	mp = par_nch->mount;

	/*
	 * This is a good time to call it, no ncp's are locked by
	 */
	cache_hysteresis(1);

	/*
	 * Try to locate an existing entry
	 */
	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	nchpp = NCHHASH(hash);

	spin_lock_shared(&nchpp->spin);

	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
		/*
		 * Break out if we find a matching entry.  Note that
		 * UNRESOLVED entries may match, but DESTROYED entries
		 */
		if (ncp->nc_parent == par_nch->ncp &&
		    ncp->nc_nlen == nlc->nlc_namelen &&
		    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
		    (ncp->nc_flag & NCF_DESTROYED) == 0

			spin_unlock_shared(&nchpp->spin);

			if (_cache_lock_shared_special(ncp) == 0) {
				if (ncp->nc_parent == par_nch->ncp &&
				    ncp->nc_nlen == nlc->nlc_namelen &&
				    bcmp(ncp->nc_name, nlc->nlc_nameptr,
					 ncp->nc_nlen) == 0 &&
				    (ncp->nc_flag & NCF_DESTROYED) == 0 &&
				    (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
				    _cache_auto_unresolve_test(mp, ncp) == 0) {

			return(EWOULDBLOCK);

	spin_unlock_shared(&nchpp->spin);
	return(EWOULDBLOCK);

	/*
	 * Note that nc_error might be non-zero (e.g. ENOENT).
	 */
	res_nch->mount = mp;

	++gd->gd_nchstats->ncs_goodhits;
	_cache_mntref(res_nch->mount);

	KKASSERT(ncp->nc_error != EWOULDBLOCK);
	return(ncp->nc_error);
/*
 * This is a non-blocking version of cache_nlookup() used by
 * nfs_readdirplusrpc_uio().  It can fail for any reason and
 * will return nch.ncp == NULL in that case.
 */
3115 cache_nlookup_nonblock(struct nchandle
*par_nch
, struct nlcomponent
*nlc
)
3117 struct nchandle nch
;
3118 struct namecache
*ncp
;
3119 struct namecache
*new_ncp
;
3120 struct nchash_head
*nchpp
;
3127 mp
= par_nch
->mount
;
3131 * Try to locate an existing entry
3133 hash
= fnv_32_buf(nlc
->nlc_nameptr
, nlc
->nlc_namelen
, FNV1_32_INIT
);
3134 hash
= fnv_32_buf(&par_nch
->ncp
, sizeof(par_nch
->ncp
), hash
);
3136 nchpp
= NCHHASH(hash
);
3138 spin_lock(&nchpp
->spin
);
3139 TAILQ_FOREACH(ncp
, &nchpp
->list
, nc_hash
) {
3141 * Break out if we find a matching entry. Note that
3142 * UNRESOLVED entries may match, but DESTROYED entries
3145 if (ncp
->nc_parent
== par_nch
->ncp
&&
3146 ncp
->nc_nlen
== nlc
->nlc_namelen
&&
3147 bcmp(ncp
->nc_name
, nlc
->nlc_nameptr
, ncp
->nc_nlen
) == 0 &&
3148 (ncp
->nc_flag
& NCF_DESTROYED
) == 0
3151 spin_unlock(&nchpp
->spin
);
3153 _cache_unlock(par_nch
->ncp
);
3156 if (_cache_lock_special(ncp
) == 0) {
3157 if (ncp
->nc_parent
!= par_nch
->ncp
||
3158 ncp
->nc_nlen
!= nlc
->nlc_namelen
||
3159 bcmp(ncp
->nc_name
, nlc
->nlc_nameptr
, ncp
->nc_nlen
) ||
3160 (ncp
->nc_flag
& NCF_DESTROYED
)) {
3161 kprintf("cache_lookup_nonblock: "
3162 "ncp-race %p %*.*s\n",
3171 _cache_auto_unresolve(mp
, ncp
);
3173 _cache_free(new_ncp
);
3184 * We failed to locate an entry, create a new entry and add it to
3185 * the cache. The parent ncp must also be locked so we
3188 * We have to relookup after possibly blocking in kmalloc or
3189 * when locking par_nch.
3191 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3192 * mount case, in which case nc_name will be NULL.
3194 if (new_ncp
== NULL
) {
3195 spin_unlock(&nchpp
->spin
);
3196 new_ncp
= cache_alloc(nlc
->nlc_namelen
);
3197 if (nlc
->nlc_namelen
) {
3198 bcopy(nlc
->nlc_nameptr
, new_ncp
->nc_name
,
3200 new_ncp
->nc_name
[nlc
->nlc_namelen
] = 0;
3204 if (par_locked
== 0) {
3205 spin_unlock(&nchpp
->spin
);
3206 if (_cache_lock_nonblock(par_nch
->ncp
) == 0) {
/*
 * Link to parent (requires another ref, the one already in new_ncp
 * is what we will return).
 *
 * WARNING! We still hold the spinlock.  We have to set the hash
 *	    table entry atomically.
 */
3222 _cache_link_parent(ncp
, par_nch
->ncp
, nchpp
);
3223 spin_unlock(&nchpp
->spin
);
3224 _cache_unlock(par_nch
->ncp
);
3225 /* par_locked = 0 - not used */
3228 * stats and namecache size management
3230 if (ncp
->nc_flag
& NCF_UNRESOLVED
)
3231 ++gd
->gd_nchstats
->ncs_miss
;
3232 else if (ncp
->nc_vp
)
3233 ++gd
->gd_nchstats
->ncs_goodhits
;
3235 ++gd
->gd_nchstats
->ncs_neghits
;
3238 _cache_mntref(nch
.mount
);
3243 _cache_free(new_ncp
);
3252 * This version is non-locking. The caller must validate the result
3253 * for parent-to-child continuity.
3255 * It can fail for any reason and will return nch.ncp == NULL in that case.
3258 cache_nlookup_nonlocked(struct nchandle
*par_nch
, struct nlcomponent
*nlc
)
3260 struct nchandle nch
;
3261 struct namecache
*ncp
;
3262 struct nchash_head
*nchpp
;
3268 mp
= par_nch
->mount
;
3271 * Try to locate an existing entry
3273 hash
= fnv_32_buf(nlc
->nlc_nameptr
, nlc
->nlc_namelen
, FNV1_32_INIT
);
3274 hash
= fnv_32_buf(&par_nch
->ncp
, sizeof(par_nch
->ncp
), hash
);
3275 nchpp
= NCHHASH(hash
);
3277 spin_lock_shared(&nchpp
->spin
);
3278 TAILQ_FOREACH(ncp
, &nchpp
->list
, nc_hash
) {
3280 * Break out if we find a matching entry. Note that
3281 * UNRESOLVED entries may match, but DESTROYED entries
3284 * Resolved NFS entries which have timed out fail so the
3285 * caller can rerun with normal locking.
3287 if (ncp
->nc_parent
== par_nch
->ncp
&&
3288 ncp
->nc_nlen
== nlc
->nlc_namelen
&&
3289 bcmp(ncp
->nc_name
, nlc
->nlc_nameptr
, ncp
->nc_nlen
) == 0 &&
3290 (ncp
->nc_flag
& NCF_DESTROYED
) == 0
3292 if (_cache_auto_unresolve_test(par_nch
->mount
, ncp
))
3295 spin_unlock_shared(&nchpp
->spin
);
3299 spin_unlock_shared(&nchpp
->spin
);
3305 * stats and namecache size management
3307 if (ncp
->nc_flag
& NCF_UNRESOLVED
)
3308 ++gd
->gd_nchstats
->ncs_miss
;
3309 else if (ncp
->nc_vp
)
3310 ++gd
->gd_nchstats
->ncs_goodhits
;
3312 ++gd
->gd_nchstats
->ncs_neghits
;
3315 _cache_mntref(nch
.mount
);
/*
 * The namecache entry is marked as being used as a mount point.
 * Locate the mount if it is visible to the caller.  The DragonFly
 * mount system allows arbitrary loops in the topology and disentangles
 * those loops by matching against (mp, ncp) rather than just (ncp).
 * This means any given ncp can dive any number of mounts, depending
 * on the relative mount (e.g. nullfs) the caller is at in the topology.
 *
 * We use a very simple frontend cache to reduce SMP conflicts,
 * which we have to do because the mountlist scan needs an exclusive
 * lock around its ripout info list.  Not to mention that there might
 * be a lot of mounts.
 *
 * Because all mounts can potentially be accessed by all cpus, break the cpu's
 * down a bit to allow some contention rather than making the cache
 *
 * The hash table is split into per-cpu areas, is 4-way set-associative.
 */
{
3340 struct mount
*result
;
3341 struct mount
*nch_mount
;
3342 struct namecache
*nch_ncp
;
3346 struct ncmount_cache
*
3347 ncmount_cache_lookup4(struct mount
*mp
, struct namecache
*ncp
)
3351 hash
= iscsi_crc32(&mp
, sizeof(mp
));
3352 hash
= iscsi_crc32_ext(&ncp
, sizeof(ncp
), hash
);
3354 hash
= hash
& ((NCMOUNT_NUMCACHE
- 1) & ~(NCMOUNT_SET
- 1));
3356 return (&ncmount_cache
[hash
]);
3360 struct ncmount_cache
*
3361 ncmount_cache_lookup(struct mount
*mp
, struct namecache
*ncp
)
3363 struct ncmount_cache
*ncc
;
3364 struct ncmount_cache
*best
;
3369 ncc
= ncmount_cache_lookup4(mp
, ncp
);
3372 * NOTE: When checking for a ticks overflow implement a slop of
3373 * 2 ticks just to be safe, because ticks is accessed
3374 * non-atomically one CPU can increment it while another
3375 * is still using the old value.
3377 if (ncc
->ncp
== ncp
&& ncc
->mp
== mp
) /* 0 */
3379 delta
= (int)(ticks
- ncc
->ticks
); /* beware GCC opts */
3380 if (delta
< -2) /* overflow reset */
3385 for (i
= 1; i
< NCMOUNT_SET
; ++i
) { /* 1, 2, 3 */
3387 if (ncc
->ncp
== ncp
&& ncc
->mp
== mp
)
3389 delta
= (int)(ticks
- ncc
->ticks
);
3392 if (delta
> best_delta
) {
/*
 * pcpu-optimized mount search.  Locate the recursive mountpoint, avoid
 * doing an expensive mountlist_scan*() if possible.
 *
 * (mp, ncp) -> mountonpt.k
 *
 * Returns a referenced mount pointer or NULL
 *
 * General SMP operation uses a per-cpu umount_spin to interlock unmount
 * operations (that is, where the mp_target can be freed out from under us).
 *
 * Lookups use the ncc->updating counter to validate the contents in order
 * to avoid having to obtain the per cache-element spin-lock.  In addition,
 * the ticks field is only updated when it changes.  However, if our per-cpu
 * lock fails due to an unmount-in-progress, we fall-back to the
 * cache-element's spin-lock.
 */
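/*
 * Illustrative only (not from the original source, compiled out): the
 * general shape of the seqlock-style validation against ncc->updating that
 * the lookup path relies on.  Writers bump the counter to an odd value
 * before modifying the element and to an even value afterwards, so a reader
 * accepts a snapshot only when the counter was even and unchanged across the
 * copy.
 */
#if 0
	update = ncc->updating;
	cpu_lfence();			/* order the copy after the load */
	ncc_copy = *ncc;
	cpu_lfence();
	if ((update & 1) == 0 && ncc->updating == update) {
		/* ncc_copy is a consistent snapshot, safe to use */
	} else {
		/* an update raced us; retry or fall back to ncc->spin */
	}
#endif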
3418 cache_findmount(struct nchandle
*nch
)
3420 struct findmount_info info
;
3421 struct ncmount_cache
*ncc
;
3422 struct ncmount_cache ncc_copy
;
3423 struct mount
*target
;
3424 struct pcpu_ncache
*pcpu
;
3425 struct spinlock
*spinlk
;
3429 if (ncmount_cache_enable
== 0 || pcpu
== NULL
) {
3433 pcpu
+= mycpu
->gd_cpuid
;
3436 ncc
= ncmount_cache_lookup(nch
->mount
, nch
->ncp
);
3437 if (ncc
->ncp
== nch
->ncp
&& ncc
->mp
== nch
->mount
) {
3440 * This is a bit messy for now because we do not yet have
3441 * safe disposal of mount structures. We have to ref
3442 * ncc->mp_target but the 'update' counter only tell us
3443 * whether the cache has changed after the fact.
3445 * For now get a per-cpu spinlock that will only contend
3446 * against umount's. This is the best path. If it fails,
3447 * instead of waiting on the umount we fall-back to a
3448 * shared ncc->spin lock, which will generally only cost a
3451 update
= ncc
->updating
;
3452 if (__predict_true(spin_trylock(&pcpu
->umount_spin
))) {
3453 spinlk
= &pcpu
->umount_spin
;
3455 spinlk
= &ncc
->spin
;
3456 spin_lock_shared(spinlk
);
3458 if (update
& 1) { /* update in progress */
3459 spin_unlock_any(spinlk
);
3464 if (ncc
->updating
!= update
) { /* content changed */
3465 spin_unlock_any(spinlk
);
3468 if (ncc_copy
.ncp
!= nch
->ncp
|| ncc_copy
.mp
!= nch
->mount
) {
3469 spin_unlock_any(spinlk
);
3472 if (ncc_copy
.isneg
== 0) {
3473 target
= ncc_copy
.mp_target
;
3474 if (target
->mnt_ncmounton
.mount
== nch
->mount
&&
3475 target
->mnt_ncmounton
.ncp
== nch
->ncp
) {
3477 * Cache hit (positive) (avoid dirtying
3478 * the cache line if possible)
3480 if (ncc
->ticks
!= (int)ticks
)
3481 ncc
->ticks
= (int)ticks
;
3482 _cache_mntref(target
);
3486 * Cache hit (negative) (avoid dirtying
3487 * the cache line if possible)
3489 if (ncc
->ticks
!= (int)ticks
)
3490 ncc
->ticks
= (int)ticks
;
3493 spin_unlock_any(spinlk
);
3503 info
.nch_mount
= nch
->mount
;
3504 info
.nch_ncp
= nch
->ncp
;
3505 mountlist_scan(cache_findmount_callback
, &info
,
3506 MNTSCAN_FORWARD
| MNTSCAN_NOBUSY
| MNTSCAN_NOUNLOCK
);
3509 * To reduce multi-re-entry on the cache, relookup in the cache.
3510 * This can still race, obviously, but that's ok.
3512 ncc
= ncmount_cache_lookup(nch
->mount
, nch
->ncp
);
3513 if (ncc
->ncp
== nch
->ncp
&& ncc
->mp
== nch
->mount
) {
3515 atomic_add_int(&info
.result
->mnt_refs
, -1);
3522 if ((info
.result
== NULL
||
3523 (info
.result
->mnt_kern_flag
& MNTK_UNMOUNT
) == 0)) {
3524 spin_lock(&ncc
->spin
);
3525 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3527 KKASSERT(ncc
->updating
& 1);
3528 if (ncc
->mp
!= nch
->mount
) {
3530 atomic_add_int(&ncc
->mp
->mnt_refs
, -1);
3531 atomic_add_int(&nch
->mount
->mnt_refs
, 1);
3532 ncc
->mp
= nch
->mount
;
3534 ncc
->ncp
= nch
->ncp
; /* ptr compares only, not refd*/
3535 ncc
->ticks
= (int)ticks
;
3539 if (ncc
->mp_target
!= info
.result
) {
3541 atomic_add_int(&ncc
->mp_target
->mnt_refs
, -1);
3542 ncc
->mp_target
= info
.result
;
3543 atomic_add_int(&info
.result
->mnt_refs
, 1);
3547 if (ncc
->mp_target
) {
3548 atomic_add_int(&ncc
->mp_target
->mnt_refs
, -1);
3549 ncc
->mp_target
= NULL
;
3553 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3554 spin_unlock(&ncc
->spin
);
3556 return(info
.result
);
3561 cache_findmount_callback(struct mount
*mp
, void *data
)
3563 struct findmount_info
*info
= data
;
3566 * Check the mount's mounted-on point against the passed nch.
3568 if (mp
->mnt_ncmounton
.mount
== info
->nch_mount
&&
3569 mp
->mnt_ncmounton
.ncp
== info
->nch_ncp
3579 cache_dropmount(struct mount
*mp
)
3585 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive
3588 * A full scan is not required, but for now just do it anyway.
3591 cache_ismounting(struct mount
*mp
)
3593 struct ncmount_cache
*ncc
;
3594 struct mount
*ncc_mp
;
3597 if (pcpu_ncache
== NULL
)
3600 for (i
= 0; i
< NCMOUNT_NUMCACHE
; ++i
) {
3601 ncc
= &ncmount_cache
[i
];
3602 if (ncc
->mp
!= mp
->mnt_ncmounton
.mount
||
3603 ncc
->ncp
!= mp
->mnt_ncmounton
.ncp
) {
3606 spin_lock(&ncc
->spin
);
3607 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3609 KKASSERT(ncc
->updating
& 1);
3610 if (ncc
->mp
!= mp
->mnt_ncmounton
.mount
||
3611 ncc
->ncp
!= mp
->mnt_ncmounton
.ncp
) {
3614 spin_unlock(&ncc
->spin
);
3621 atomic_add_int(&ncc_mp
->mnt_refs
, -1);
3622 ncc_mp
= ncc
->mp_target
;
3623 ncc
->mp_target
= NULL
;
3625 atomic_add_int(&ncc_mp
->mnt_refs
, -1);
3626 ncc
->ticks
= (int)ticks
- hz
* 120;
3629 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3630 spin_unlock(&ncc
->spin
);
3634 * Pre-cache the mount point
3636 ncc
= ncmount_cache_lookup(mp
->mnt_ncmounton
.mount
,
3637 mp
->mnt_ncmounton
.ncp
);
3639 spin_lock(&ncc
->spin
);
3640 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3642 KKASSERT(ncc
->updating
& 1);
3645 atomic_add_int(&ncc
->mp
->mnt_refs
, -1);
3646 atomic_add_int(&mp
->mnt_ncmounton
.mount
->mnt_refs
, 1);
3647 ncc
->mp
= mp
->mnt_ncmounton
.mount
;
3648 ncc
->ncp
= mp
->mnt_ncmounton
.ncp
; /* ptr compares only */
3649 ncc
->ticks
= (int)ticks
;
3652 if (ncc
->mp_target
!= mp
) {
3654 atomic_add_int(&ncc
->mp_target
->mnt_refs
, -1);
3655 ncc
->mp_target
= mp
;
3656 atomic_add_int(&mp
->mnt_refs
, 1);
3659 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3660 spin_unlock(&ncc
->spin
);
3664 * Scrap any ncmount_cache entries related to mp. Not only do we need to
3665 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any
3666 * negative hits involving (mp, <any>).
3668 * A full scan is required.
3671 cache_unmounting(struct mount
*mp
)
3673 struct ncmount_cache
*ncc
;
3674 struct pcpu_ncache
*pcpu
;
3675 struct mount
*ncc_mp
;
3682 for (i
= 0; i
< ncpus
; ++i
)
3683 spin_lock(&pcpu
[i
].umount_spin
);
3685 for (i
= 0; i
< NCMOUNT_NUMCACHE
; ++i
) {
3686 ncc
= &ncmount_cache
[i
];
3687 if (ncc
->mp
!= mp
&& ncc
->mp_target
!= mp
)
3689 spin_lock(&ncc
->spin
);
3690 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3693 if (ncc
->mp
!= mp
&& ncc
->mp_target
!= mp
) {
3694 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3696 spin_unlock(&ncc
->spin
);
3703 atomic_add_int(&ncc_mp
->mnt_refs
, -1);
3704 ncc_mp
= ncc
->mp_target
;
3705 ncc
->mp_target
= NULL
;
3707 atomic_add_int(&ncc_mp
->mnt_refs
, -1);
3708 ncc
->ticks
= (int)ticks
- hz
* 120;
3711 atomic_add_int_nonlocked(&ncc
->updating
, 1);
3712 spin_unlock(&ncc
->spin
);
3715 for (i
= 0; i
< ncpus
; ++i
)
3716 spin_unlock(&pcpu
[i
].umount_spin
);
/*
 * Resolve an unresolved namecache entry, generally by looking it up.
 * The passed ncp must be locked and refd.
 *
 * Theoretically since a vnode cannot be recycled while held, and since
 * the nc_parent chain holds its vnode as long as children exist, the
 * direct parent of the cache entry we are trying to resolve should
 * have a valid vnode.  If not then generate an error that we can
 * determine is related to a resolver bug.
 *
 * However, if a vnode was in the middle of a recyclement when the NCP
 * got locked, ncp->nc_vp might point to a vnode that is about to become
 * invalid.  cache_resolve() handles this case by unresolving the entry
 * and then re-resolving it.
 *
 * Note that successful resolution does not necessarily return an error
 * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
 */
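/*
 * Illustrative only (not from the original source, compiled out): because a
 * negative hit reports ENOENT even though the entry is now resolved, callers
 * that care about the distinction typically check the resolved state rather
 * than treating ENOENT as an outright failure.
 */
#if 0
	error = cache_resolve(&nch, cred);
	if (error == ENOENT && (nch.ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		/* resolved to a negative entry: the name is known absent */
	}
#endif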
3739 cache_resolve(struct nchandle
*nch
, struct ucred
*cred
)
3741 struct namecache
*par_tmp
;
3742 struct namecache
*par
;
3743 struct namecache
*ncp
;
3744 struct nchandle nctmp
;
3751 KKASSERT(_cache_lockstatus(ncp
) == LK_EXCLUSIVE
);
 * If the ncp is already resolved we have nothing to do.  However,
 * we do want to guarantee that a usable vnode is returned when
 * a vnode is present, so make sure it hasn't been reclaimed.
3758 if ((ncp
->nc_flag
& NCF_UNRESOLVED
) == 0) {
3759 if (ncp
->nc_vp
&& (ncp
->nc_vp
->v_flag
& VRECLAIMED
))
3760 _cache_setunresolved(ncp
);
3761 if ((ncp
->nc_flag
& NCF_UNRESOLVED
) == 0)
3762 return (ncp
->nc_error
);
3766 * If the ncp was destroyed it will never resolve again. This
3767 * can basically only happen when someone is chdir'd into an
3768 * empty directory which is then rmdir'd. We want to catch this
3769 * here and not dive the VFS because the VFS might actually
3770 * have a way to re-resolve the disconnected ncp, which will
3771 * result in inconsistencies in the cdir/nch for proc->p_fd.
3773 if (ncp
->nc_flag
& NCF_DESTROYED
)
3777 * Mount points need special handling because the parent does not
3778 * belong to the same filesystem as the ncp.
3780 if (ncp
== mp
->mnt_ncmountpt
.ncp
)
3781 return (cache_resolve_mp(mp
));
3784 * We expect an unbroken chain of ncps to at least the mount point,
3785 * and even all the way to root (but this code doesn't have to go
3786 * past the mount point).
3788 if (ncp
->nc_parent
== NULL
) {
3789 kprintf("EXDEV case 1 %p %*.*s\n", ncp
,
3790 ncp
->nc_nlen
, ncp
->nc_nlen
, ncp
->nc_name
);
3791 ncp
->nc_error
= EXDEV
;
3792 return(ncp
->nc_error
);
 * The vp's of the parent directories in the chain are held via vhold()
 * due to the existence of the child, and should not disappear.
 * However, there are cases where they can disappear:
3800 * - due to filesystem I/O errors.
3801 * - due to NFS being stupid about tracking the namespace and
3802 * destroys the namespace for entire directories quite often.
3803 * - due to forced unmounts.
3804 * - due to an rmdir (parent will be marked DESTROYED)
3806 * When this occurs we have to track the chain backwards and resolve
3807 * it, looping until the resolver catches up to the current node. We
3808 * could recurse here but we might run ourselves out of kernel stack
3809 * so we do it in a more painful manner. This situation really should
3810 * not occur all that often, or if it does not have to go back too
3811 * many nodes to resolve the ncp.
3813 while ((dvp
= cache_dvpref(ncp
)) == NULL
) {
3815 * This case can occur if a process is CD'd into a
3816 * directory which is then rmdir'd. If the parent is marked
3817 * destroyed there is no point trying to resolve it.
3819 if (ncp
->nc_parent
->nc_flag
& NCF_DESTROYED
)
3821 par
= ncp
->nc_parent
;
3824 while ((par_tmp
= par
->nc_parent
) != NULL
&&
3825 par_tmp
->nc_vp
== NULL
) {
3826 _cache_hold(par_tmp
);
3827 _cache_lock(par_tmp
);
3831 if (par
->nc_parent
== NULL
) {
3832 kprintf("EXDEV case 2 %*.*s\n",
3833 par
->nc_nlen
, par
->nc_nlen
, par
->nc_name
);
3838 * The parent is not set in stone, ref and lock it to prevent
3839 * it from disappearing. Also note that due to renames it
3840 * is possible for our ncp to move and for par to no longer
3841 * be one of its parents. We resolve it anyway, the loop
3842 * will handle any moves.
3844 _cache_get(par
); /* additional hold/lock */
3845 _cache_put(par
); /* from earlier hold/lock */
3846 if (par
== nch
->mount
->mnt_ncmountpt
.ncp
) {
3847 cache_resolve_mp(nch
->mount
);
3848 } else if ((dvp
= cache_dvpref(par
)) == NULL
) {
3849 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
3850 par
->nc_nlen
, par
->nc_nlen
, par
->nc_name
);
3854 if (par
->nc_flag
& NCF_UNRESOLVED
) {
3857 par
->nc_error
= VOP_NRESOLVE(&nctmp
, dvp
, cred
);
3861 if ((error
= par
->nc_error
) != 0) {
3862 if (par
->nc_error
!= EAGAIN
) {
3863 kprintf("EXDEV case 3 %*.*s error %d\n",
3864 par
->nc_nlen
, par
->nc_nlen
, par
->nc_name
,
3869 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3870 par
, par
->nc_nlen
, par
->nc_nlen
, par
->nc_name
);
3877 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3878 * ncp's and reattach them. If this occurs the original ncp is marked
3879 * EAGAIN to force a relookup.
3881 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3882 * ncp must already be resolved.
3887 ncp
->nc_error
= VOP_NRESOLVE(&nctmp
, dvp
, cred
);
3890 ncp
->nc_error
= EPERM
;
3892 if (ncp
->nc_error
== EAGAIN
) {
3893 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3894 ncp
, ncp
->nc_nlen
, ncp
->nc_nlen
, ncp
->nc_name
);
3897 return(ncp
->nc_error
);
3901 * Resolve the ncp associated with a mount point. Such ncp's almost always
 * remain resolved and this routine is rarely called.  NFS MPs tend to force
3903 * re-resolution more often due to its mac-truck-smash-the-namecache
3904 * method of tracking namespace changes.
3906 * The semantics for this call is that the passed ncp must be locked on
3907 * entry and will be locked on return. However, if we actually have to
3908 * resolve the mount point we temporarily unlock the entry in order to
3909 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of
3910 * the unlock we have to recheck the flags after we relock.
3913 cache_resolve_mp(struct mount
*mp
)
3915 struct namecache
*ncp
= mp
->mnt_ncmountpt
.ncp
;
3919 KKASSERT(mp
!= NULL
);
 * If the ncp is already resolved we have nothing to do.  However,
 * we do want to guarantee that a usable vnode is returned when
 * a vnode is present, so make sure it hasn't been reclaimed.
3926 if ((ncp
->nc_flag
& NCF_UNRESOLVED
) == 0) {
3927 if (ncp
->nc_vp
&& (ncp
->nc_vp
->v_flag
& VRECLAIMED
))
3928 _cache_setunresolved(ncp
);
3931 if (ncp
->nc_flag
& NCF_UNRESOLVED
) {
3933 while (vfs_busy(mp
, 0))
3935 error
= VFS_ROOT(mp
, &vp
);
3939 * recheck the ncp state after relocking.
3941 if (ncp
->nc_flag
& NCF_UNRESOLVED
) {
3942 ncp
->nc_error
= error
;
3944 _cache_setvp(mp
, ncp
, vp
);
3947 kprintf("[diagnostic] cache_resolve_mp: failed"
3948 " to resolve mount %p err=%d ncp=%p\n",
3950 _cache_setvp(mp
, ncp
, NULL
);
3952 } else if (error
== 0) {
3957 return(ncp
->nc_error
);
3961 * Clean out negative cache entries when too many have accumulated.
3964 _cache_cleanneg(long count
)
3966 struct pcpu_ncache
*pn
;
3967 struct namecache
*ncp
;
3968 static uint32_t neg_rover
;
3972 n
= neg_rover
++; /* SMP heuristical, race ok */
3974 n
= n
% (uint32_t)ncpus
;
3977 * Normalize vfscache_negs and count. count is sometimes based
3978 * on vfscache_negs. vfscache_negs is heuristical and can sometimes
3979 * have crazy values.
3981 vnegs
= vfscache_negs
;
3983 if (vnegs
<= MINNEG
)
3988 pn
= &pcpu_ncache
[n
];
3989 spin_lock(&pn
->neg_spin
);
3990 count
= pn
->neg_count
* count
/ vnegs
+ 1;
3991 spin_unlock(&pn
->neg_spin
);
3994 * Attempt to clean out the specified number of negative cache
3998 spin_lock(&pn
->neg_spin
);
3999 ncp
= TAILQ_FIRST(&pn
->neg_list
);
4001 spin_unlock(&pn
->neg_spin
);
4004 TAILQ_REMOVE(&pn
->neg_list
, ncp
, nc_vnode
);
4005 TAILQ_INSERT_TAIL(&pn
->neg_list
, ncp
, nc_vnode
);
4007 spin_unlock(&pn
->neg_spin
);
4010 * This can race, so we must re-check that the ncp
4011 * is on the ncneg.list after successfully locking it.
4013 if (_cache_lock_special(ncp
) == 0) {
4014 if (ncp
->nc_vp
== NULL
&&
4015 (ncp
->nc_flag
& NCF_UNRESOLVED
) == 0) {
4029 * Clean out positive cache entries when too many have accumulated.
4032 _cache_cleanpos(long count
)
4034 static volatile int rover
;
4035 struct nchash_head
*nchpp
;
4036 struct namecache
*ncp
;
4040 * Attempt to clean out the specified number of negative cache
4044 rover_copy
= ++rover
; /* MPSAFEENOUGH */
4046 nchpp
= NCHHASH(rover_copy
);
4048 if (TAILQ_FIRST(&nchpp
->list
) == NULL
) {
4054 * Cycle ncp on list, ignore and do not move DUMMY
4055 * ncps. These are temporary list iterators.
4057 * We must cycle the ncp to the end of the list to
4058 * ensure that all ncp's have an equal chance of
4061 spin_lock(&nchpp
->spin
);
4062 ncp
= TAILQ_FIRST(&nchpp
->list
);
4063 while (ncp
&& (ncp
->nc_flag
& NCF_DUMMY
))
4064 ncp
= TAILQ_NEXT(ncp
, nc_hash
);
4066 TAILQ_REMOVE(&nchpp
->list
, ncp
, nc_hash
);
4067 TAILQ_INSERT_TAIL(&nchpp
->list
, ncp
, nc_hash
);
4070 spin_unlock(&nchpp
->spin
);
4073 if (_cache_lock_special(ncp
) == 0) {
4084 * This is a kitchen sink function to clean out ncps which we
4085 * tried to zap from cache_drop() but failed because we were
4086 * unable to acquire the parent lock.
4088 * Such entries can also be removed via cache_inval_vp(), such
4089 * as when unmounting.
4092 _cache_cleandefered(void)
4094 struct nchash_head
*nchpp
;
4095 struct namecache
*ncp
;
4096 struct namecache dummy
;
4100 * Create a list iterator. DUMMY indicates that this is a list
4101 * iterator, DESTROYED prevents matches by lookup functions.
4104 pcpu_ncache
[mycpu
->gd_cpuid
].numdefered
= 0;
4105 bzero(&dummy
, sizeof(dummy
));
4106 dummy
.nc_flag
= NCF_DESTROYED
| NCF_DUMMY
;
4109 for (i
= 0; i
<= nchash
; ++i
) {
4110 nchpp
= &nchashtbl
[i
];
4112 spin_lock(&nchpp
->spin
);
4113 TAILQ_INSERT_HEAD(&nchpp
->list
, &dummy
, nc_hash
);
4115 while ((ncp
= TAILQ_NEXT(ncp
, nc_hash
)) != NULL
) {
4116 if ((ncp
->nc_flag
& NCF_DEFEREDZAP
) == 0)
4118 TAILQ_REMOVE(&nchpp
->list
, &dummy
, nc_hash
);
4119 TAILQ_INSERT_AFTER(&nchpp
->list
, ncp
, &dummy
, nc_hash
);
4121 spin_unlock(&nchpp
->spin
);
4122 if (_cache_lock_nonblock(ncp
) == 0) {
4123 ncp
->nc_flag
&= ~NCF_DEFEREDZAP
;
4127 spin_lock(&nchpp
->spin
);
4130 TAILQ_REMOVE(&nchpp
->list
, &dummy
, nc_hash
);
4131 spin_unlock(&nchpp
->spin
);
4136 * Name cache initialization, from vfsinit() when we are booting
4141 struct pcpu_ncache
*pn
;
4146 * Per-cpu accounting and negative hit list
4148 pcpu_ncache
= kmalloc(sizeof(*pcpu_ncache
) * ncpus
,
4149 M_VFSCACHE
, M_WAITOK
|M_ZERO
);
4150 for (i
= 0; i
< ncpus
; ++i
) {
4151 pn
= &pcpu_ncache
[i
];
4152 TAILQ_INIT(&pn
->neg_list
);
4153 spin_init(&pn
->neg_spin
, "ncneg");
4154 spin_init(&pn
->umount_spin
, "ncumm");
4158 * Initialise per-cpu namecache effectiveness statistics.
4160 for (i
= 0; i
< ncpus
; ++i
) {
4161 gd
= globaldata_find(i
);
4162 gd
->gd_nchstats
= &nchstats
[i
];
4166 * Create a generous namecache hash table
4168 nchashtbl
= hashinit_ext(vfs_inodehashsize(),
4169 sizeof(struct nchash_head
),
4170 M_VFSCACHE
, &nchash
);
4171 for (i
= 0; i
<= (int)nchash
; ++i
) {
4172 TAILQ_INIT(&nchashtbl
[i
].list
);
4173 spin_init(&nchashtbl
[i
].spin
, "nchinit_hash");
4175 for (i
= 0; i
< NCMOUNT_NUMCACHE
; ++i
)
4176 spin_init(&ncmount_cache
[i
].spin
, "nchinit_cache");
4177 nclockwarn
= 5 * hz
;
4181 * Called from start_init() to bootstrap the root filesystem. Returns
4182 * a referenced, unlocked namecache record.
4185 cache_allocroot(struct nchandle
*nch
, struct mount
*mp
, struct vnode
*vp
)
4187 nch
->ncp
= cache_alloc(0);
4191 _cache_setvp(nch
->mount
, nch
->ncp
, vp
);
4195 * vfs_cache_setroot()
4197 * Create an association between the root of our namecache and
4198 * the root vnode. This routine may be called several times during
4201 * If the caller intends to save the returned namecache pointer somewhere
4202 * it must cache_hold() it.
4205 vfs_cache_setroot(struct vnode
*nvp
, struct nchandle
*nch
)
4208 struct nchandle onch
;
4216 cache_zero(&rootnch
);
4224 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache
4225 * topology and is being removed as quickly as possible. The new VOP_N*()
4226 * API calls are required to make specific adjustments using the supplied
 * ncp pointers rather than just bogusly purging random vnodes.
4229 * Invalidate all namecache entries to a particular vnode as well as
4230 * any direct children of that vnode in the namecache. This is a
4231 * 'catch all' purge used by filesystems that do not know any better.
4233 * Note that the linkage between the vnode and its namecache entries will
4234 * be removed, but the namecache entries themselves might stay put due to
 * active references from elsewhere in the system or due to the existence of
4236 * the children. The namecache topology is left intact even if we do not
4237 * know what the vnode association is. Such entries will be marked
4241 cache_purge(struct vnode
*vp
)
4243 cache_inval_vp(vp
, CINV_DESTROY
| CINV_CHILDREN
);
4246 static int disablecwd
;
4247 SYSCTL_INT(_debug
, OID_AUTO
, disablecwd
, CTLFLAG_RW
, &disablecwd
, 0,
4250 static u_long numcwdcalls
;
4251 SYSCTL_ULONG(_vfs_cache
, OID_AUTO
, numcwdcalls
, CTLFLAG_RD
, &numcwdcalls
, 0,
4252 "Number of current directory resolution calls");
4253 static u_long numcwdfailnf
;
4254 SYSCTL_ULONG(_vfs_cache
, OID_AUTO
, numcwdfailnf
, CTLFLAG_RD
, &numcwdfailnf
, 0,
4255 "Number of current directory failures due to lack of file");
4256 static u_long numcwdfailsz
;
4257 SYSCTL_ULONG(_vfs_cache
, OID_AUTO
, numcwdfailsz
, CTLFLAG_RD
, &numcwdfailsz
, 0,
4258 "Number of current directory failures due to large result");
4259 static u_long numcwdfound
;
4260 SYSCTL_ULONG(_vfs_cache
, OID_AUTO
, numcwdfound
, CTLFLAG_RD
, &numcwdfound
, 0,
4261 "Number of current directory resolution successes");
4267 sys___getcwd(struct __getcwd_args
*uap
)
4277 buflen
= uap
->buflen
;
4280 if (buflen
> MAXPATHLEN
)
4281 buflen
= MAXPATHLEN
;
4283 buf
= kmalloc(buflen
, M_TEMP
, M_WAITOK
);
4284 bp
= kern_getcwd(buf
, buflen
, &error
);
4286 error
= copyout(bp
, uap
->buf
, strlen(bp
) + 1);
4292 kern_getcwd(char *buf
, size_t buflen
, int *error
)
4294 struct proc
*p
= curproc
;
4296 int i
, slash_prefixed
;
4297 struct filedesc
*fdp
;
4298 struct nchandle nch
;
4299 struct namecache
*ncp
;
4308 nch
= fdp
->fd_ncdir
;
4313 while (ncp
&& (ncp
!= fdp
->fd_nrdir
.ncp
||
4314 nch
.mount
!= fdp
->fd_nrdir
.mount
)
4317 * While traversing upwards if we encounter the root
4318 * of the current mount we have to skip to the mount point
4319 * in the underlying filesystem.
4321 if (ncp
== nch
.mount
->mnt_ncmountpt
.ncp
) {
4322 nch
= nch
.mount
->mnt_ncmounton
;
4331 * Prepend the path segment
4333 for (i
= ncp
->nc_nlen
- 1; i
>= 0; i
--) {
4340 *--bp
= ncp
->nc_name
[i
];
4352 * Go up a directory. This isn't a mount point so we don't
4353 * have to check again.
4355 while ((nch
.ncp
= ncp
->nc_parent
) != NULL
) {
4356 if (ncp_shared_lock_disable
)
4359 _cache_lock_shared(ncp
);
4360 if (nch
.ncp
!= ncp
->nc_parent
) {
4364 _cache_hold(nch
.ncp
);
4377 if (!slash_prefixed
) {
4395 * Thus begins the fullpath magic.
4397 * The passed nchp is referenced but not locked.
4399 static int disablefullpath
;
4400 SYSCTL_INT(_debug
, OID_AUTO
, disablefullpath
, CTLFLAG_RW
,
4401 &disablefullpath
, 0,
4402 "Disable fullpath lookups");
4405 cache_fullpath(struct proc
*p
, struct nchandle
*nchp
, struct nchandle
*nchbase
,
4406 char **retbuf
, char **freebuf
, int guess
)
4408 struct nchandle fd_nrdir
;
4409 struct nchandle nch
;
4410 struct namecache
*ncp
;
4411 struct mount
*mp
, *new_mp
;
4420 buf
= kmalloc(MAXPATHLEN
, M_TEMP
, M_WAITOK
);
4421 bp
= buf
+ MAXPATHLEN
- 1;
4424 fd_nrdir
= *nchbase
;
4426 fd_nrdir
= p
->p_fd
->fd_nrdir
;
4436 while (ncp
&& (ncp
!= fd_nrdir
.ncp
|| mp
!= fd_nrdir
.mount
)) {
4440 * If we are asked to guess the upwards path, we do so whenever
4441 * we encounter an ncp marked as a mountpoint. We try to find
4442 * the actual mountpoint by finding the mountpoint with this
4445 if (guess
&& (ncp
->nc_flag
& NCF_ISMOUNTPT
)) {
4446 new_mp
= mount_get_by_nc(ncp
);
4449 * While traversing upwards if we encounter the root
4450 * of the current mount we have to skip to the mount point.
4452 if (ncp
== mp
->mnt_ncmountpt
.ncp
) {
4456 nch
= new_mp
->mnt_ncmounton
;
4466 * Prepend the path segment
4468 for (i
= ncp
->nc_nlen
- 1; i
>= 0; i
--) {
4474 *--bp
= ncp
->nc_name
[i
];
4485 * Go up a directory. This isn't a mount point so we don't
4486 * have to check again.
4488 * We can only safely access nc_parent with ncp held locked.
4490 while ((nch
.ncp
= ncp
->nc_parent
) != NULL
) {
4491 _cache_lock_shared(ncp
);
4492 if (nch
.ncp
!= ncp
->nc_parent
) {
4496 _cache_hold(nch
.ncp
);
4509 if (!slash_prefixed
) {
4527 vn_fullpath(struct proc
*p
, struct vnode
*vn
, char **retbuf
,
4528 char **freebuf
, int guess
)
4530 struct namecache
*ncp
;
4531 struct nchandle nch
;
4535 if (disablefullpath
)
4541 /* vn is NULL, client wants us to use p->p_textvp */
4543 if ((vn
= p
->p_textvp
) == NULL
)
4546 spin_lock_shared(&vn
->v_spin
);
4547 TAILQ_FOREACH(ncp
, &vn
->v_namecache
, nc_vnode
) {
4552 spin_unlock_shared(&vn
->v_spin
);
4556 spin_unlock_shared(&vn
->v_spin
);
4559 nch
.mount
= vn
->v_mount
;
4560 error
= cache_fullpath(p
, &nch
, NULL
, retbuf
, freebuf
, guess
);
4566 vfscache_rollup_cpu(struct globaldata
*gd
)
4568 struct pcpu_ncache
*pn
;
4571 if (pcpu_ncache
== NULL
)
4573 pn
= &pcpu_ncache
[gd
->gd_cpuid
];
4575 if (pn
->vfscache_count
) {
4576 count
= atomic_swap_long(&pn
->vfscache_count
, 0);
4577 atomic_add_long(&vfscache_count
, count
);
4579 if (pn
->vfscache_leafs
) {
4580 count
= atomic_swap_long(&pn
->vfscache_leafs
, 0);
4581 atomic_add_long(&vfscache_leafs
, count
);
4583 if (pn
->vfscache_negs
) {
4584 count
= atomic_swap_long(&pn
->vfscache_negs
, 0);
4585 atomic_add_long(&vfscache_negs
, count
);
4587 if (pn
->numdefered
) {
4588 count
= atomic_swap_long(&pn
->numdefered
, 0);
4589 atomic_add_long(&numdefered
, count
);