kernel - Rename spinlock counter trick API
[dragonfly.git] / sys/kern/vfs_cache.c (blob 784ce79976d525ad54931c647ae4e8a43450f329)
1 /*
2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
34 * Copyright (c) 1989, 1993, 1995
35 * The Regents of the University of California. All rights reserved.
37 * This code is derived from software contributed to Berkeley by
38 * Poul-Henning Kamp of the FreeBSD Project.
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 * 3. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/uio.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/mount.h>
71 #include <sys/vnode.h>
72 #include <sys/malloc.h>
73 #include <sys/sysproto.h>
74 #include <sys/spinlock.h>
75 #include <sys/proc.h>
76 #include <sys/namei.h>
77 #include <sys/nlookup.h>
78 #include <sys/filedesc.h>
79 #include <sys/fnv_hash.h>
80 #include <sys/globaldata.h>
81 #include <sys/kern_syscall.h>
82 #include <sys/dirent.h>
83 #include <ddb/ddb.h>
85 #include <sys/spinlock2.h>
87 #define MAX_RECURSION_DEPTH 64
90 * Random lookups in the cache are accomplished with a hash table using
91 * a hash key of (nc_src_vp, name). Each hash chain has its own spin lock,
92 * but we use the ncp->update counter trick to avoid acquiring any
93 * contestable spin-locks during a lookup.
95 * Negative entries may exist and correspond to resolved namecache
96 * structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT
97 * will be set if the entry corresponds to a whited-out directory entry
98 * (versus simply not finding the entry at all). pcpu_ncache[n].neg_list
99 * is locked via pcpu_ncache[n].neg_spin.
101 * MPSAFE RULES:
103 * (1) ncp's typically have at least a nc_refs of 1, and usually 2. One
104 * is applicable to direct lookups via the hash table nchpp or via
105 * nc_list (the two are added or removed together). Removal of the ncp
106 * from the hash table drops this reference. The second is applicable
107 * to vp->v_namecache linkages (or negative list linkages), and removal
108 * of the ncp from these lists drops this reference.
110 * On the 1->0 transition of nc_refs the ncp can no longer be referenced
111 * and must be destroyed. No other thread should have access to it at
112 * this point so it can be safely locked and freed without any deadlock
113 * fears.
115 * The 1->0 transition can occur at almost any juncture and so cache_drop()
116 * deals with it directly.
118 * (2) Once the 1->0 transition occurs, the entity that caused the transition
119 * will be responsible for destroying the ncp. The ncp cannot be on any
120 * list or hash at this time, or be held by anyone other than the caller
121 * responsible for the transition.
123 * (3) A ncp must be locked in order to modify it.
125 * (5) ncp locks are ordered, child-to-parent. Child first, then parent.
126 * This may seem backwards but forward-scans use the hash table and thus
127 * can hold the parent unlocked while traversing downward. Deletions,
128 * on the other hand, tend to propagate bottom-up since the ref on the
129 * parent is dropped as the children go away.
131 * (6) Both parent and child must be locked in order to enter the child onto
132 * the parent's nc_list.
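/*
 * Editor's illustration (not part of vfs_cache.c): a minimal user-space
 * sketch of the "update counter" idea referenced above, which lets a
 * lookup sample an entry without taking a contestable spin-lock.  The
 * reader retries until it sees the same even counter value on both
 * sides of the read.  All identifiers below (nce_example, nce_write,
 * nce_read) are hypothetical; DragonFly's real code drives a similar
 * counter through its spin_access_*() API (see cache_vref() later in
 * this file).  The writer is assumed to be serialized by the chain
 * lock, as it is in the real code.
 */
#include <stdatomic.h>
#include <stdint.h>

struct nce_example {
	atomic_uint      upd;	/* odd while a writer is mid-update */
	atomic_uintptr_t vp;	/* datum a lockless lookup wants to sample */
};

/* Writer: bump the counter to odd, modify the datum, bump back to even. */
static void nce_write(struct nce_example *nc, uintptr_t newvp)
{
	unsigned s = atomic_load_explicit(&nc->upd, memory_order_relaxed);

	atomic_store_explicit(&nc->upd, s + 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&nc->vp, newvp, memory_order_relaxed);
	atomic_store_explicit(&nc->upd, s + 2, memory_order_release);
}

/* Reader: retry until a stable, even counter brackets the sampled value. */
static uintptr_t nce_read(struct nce_example *nc)
{
	unsigned s1, s2;
	uintptr_t v;

	do {
		s1 = atomic_load_explicit(&nc->upd, memory_order_acquire);
		v  = atomic_load_explicit(&nc->vp, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
		s2 = atomic_load_explicit(&nc->upd, memory_order_relaxed);
	} while ((s1 & 1) || s1 != s2);
	return v;
}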
136 * Structures associated with name caching.
138 #define NCHHASH(hash) (&nchashtbl[(hash) & nchash])
139 #define MINNEG 1024
140 #define MINPOS 1024
141 #define NCMOUNT_NUMCACHE (16384) /* power of 2 */
142 #define NCMOUNT_SET (8) /* power of 2 */
144 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
146 TAILQ_HEAD(nchash_list, namecache);
149 * Don't cachealign, but at least pad to 32 bytes so entries
150 * don't cross a cache line.
152 struct nchash_head {
153 struct nchash_list list; /* 16 bytes */
154 struct spinlock spin; /* 8 bytes */
155 long pad01; /* 8 bytes */
158 struct ncmount_cache {
159 struct spinlock spin;
160 struct namecache *ncp;
161 struct mount *mp;
162 struct mount *mp_target;
163 int isneg;
164 int ticks;
165 int updating;
166 int unused01;
169 struct pcpu_ncache {
170 struct spinlock umount_spin; /* cache_findmount/interlock */
171 struct spinlock neg_spin; /* for neg_list and neg_count */
172 struct namecache_list neg_list;
173 long neg_count;
174 long vfscache_negs;
175 long vfscache_count;
176 long vfscache_leafs;
177 long numdefered;
178 } __cachealign;
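/*
 * Editor's note (illustration only, not in the original file): struct
 * nchash_head above is deliberately padded to 32 bytes (16-byte list
 * head + 8-byte spinlock + 8-byte pad) so that exactly two bucket heads
 * share each 64-byte cache line and none straddles a line boundary,
 * while struct pcpu_ncache gets a full line to itself via __cachealign.
 * A hypothetical compile-time check of the padding intent:
 */
_Static_assert(sizeof(struct nchash_head) == 32,
    "nchash_head must pack an integral number of heads per cache line");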
180 __read_mostly static struct nchash_head *nchashtbl;
181 __read_mostly static struct pcpu_ncache *pcpu_ncache;
182 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE];
185 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server
186 * to create the namecache infrastructure leading to a dangling vnode.
188 * 0 Only errors are reported
189 * 1 Successes are reported
190 * 2 Successes + the whole directory scan is reported
191 * 3 Force the directory scan code to run as if the parent vnode did not
192 * have a namecache record, even if it does have one.
194 __read_mostly static int ncvp_debug;
195 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
196 "Namecache debug level (0-3)");
198 __read_mostly static u_long nchash; /* size of hash table */
199 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
200 "Size of namecache hash table");
202 __read_mostly static int ncnegflush = 10; /* burst for negative flush */
203 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
204 "Batch flush negative entries");
206 __read_mostly static int ncposflush = 10; /* burst for positive flush */
207 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
208 "Batch flush positive entries");
210 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */
211 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
212 "Ratio of namecache negative entries");
214 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */
215 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
216 "Warn on locked namecache entries in ticks");
218 __read_mostly static int ncposlimit; /* number of cache entries allocated */
219 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
220 "Number of cache entries allocated");
222 __read_mostly static int ncp_shared_lock_disable = 0;
223 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
224 &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
226 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
227 "sizeof(struct vnode)");
228 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
229 "sizeof(struct namecache)");
231 __read_mostly static int ncmount_cache_enable = 1;
232 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
233 &ncmount_cache_enable, 0, "mount point cache");
235 static __inline void _cache_drop(struct namecache *ncp);
236 static int cache_resolve_mp(struct mount *mp);
237 static int cache_findmount_callback(struct mount *mp, void *data);
238 static void _cache_setunresolved(struct namecache *ncp);
239 static void _cache_cleanneg(long count);
240 static void _cache_cleanpos(long count);
241 static void _cache_cleandefered(void);
242 static void _cache_unlink(struct namecache *ncp);
245 * The new name cache statistics (these are rolled up globals and not
246 * modified in the critical path, see struct pcpu_ncache).
248 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
249 static long vfscache_negs;
250 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
251 "Number of negative namecache entries");
252 static long vfscache_count;
253 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
254 "Number of namecache entries");
255 static long vfscache_leafs;
256 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
257 "Number of namecache leaf entries");
258 static long numdefered;
259 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
260 "Number of cache entries allocated");
263 struct nchstats nchstats[SMP_MAXCPU];
265 * Export VFS cache effectiveness statistics to user-land.
267 * The statistics are left for aggregation to user-land so
268 * neat things can be achieved, like observing per-CPU cache
269 * distribution.
271 static int
272 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
274 struct globaldata *gd;
275 int i, error;
277 error = 0;
278 for (i = 0; i < ncpus; ++i) {
279 gd = globaldata_find(i);
280 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
281 sizeof(struct nchstats))))
282 break;
285 return (error);
287 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
288 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
290 static void cache_zap(struct namecache *ncp);
293 * Cache mount points and namecache records in order to avoid unnecessary
294 * atomic ops on mnt_refs and ncp->refs. This improves concurrent SMP
295 * performance and is particularly important on multi-socket systems to
296 * reduce cache-line ping-ponging.
298 * Try to keep the pcpu structure within one cache line (~64 bytes).
300 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */
301 #define MNTCACHE_SET 8 /* set associativity */
303 struct mntcache_elm {
304 struct namecache *ncp;
305 struct mount *mp;
306 int ticks;
307 int unused01;
310 struct mntcache {
311 struct mntcache_elm array[MNTCACHE_COUNT];
312 } __cachealign;
314 static struct mntcache pcpu_mntcache[MAXCPU];
316 static __inline
317 struct mntcache_elm *
318 _cache_mntcache_hash(void *ptr)
320 struct mntcache_elm *elm;
321 int hv;
323 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
324 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];
326 return elm;
329 static
330 void
331 _cache_mntref(struct mount *mp)
333 struct mntcache_elm *elm;
334 struct mount *mpr;
335 int i;
337 elm = _cache_mntcache_hash(mp);
338 for (i = 0; i < MNTCACHE_SET; ++i) {
339 if (elm->mp == mp) {
340 mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
341 if (__predict_true(mpr == mp))
342 return;
343 if (mpr)
344 atomic_add_int(&mpr->mnt_refs, -1);
346 ++elm;
348 atomic_add_int(&mp->mnt_refs, 1);
351 static
352 void
353 _cache_mntrel(struct mount *mp)
355 struct mntcache_elm *elm;
356 struct mntcache_elm *best;
357 struct mount *mpr;
358 int delta1;
359 int delta2;
360 int i;
362 elm = _cache_mntcache_hash(mp);
363 best = elm;
364 for (i = 0; i < MNTCACHE_SET; ++i) {
365 if (elm->mp == NULL) {
366 mpr = atomic_swap_ptr((void *)&elm->mp, mp);
367 if (__predict_false(mpr != NULL)) {
368 atomic_add_int(&mpr->mnt_refs, -1);
370 elm->ticks = ticks;
371 return;
373 delta1 = ticks - best->ticks;
374 delta2 = ticks - elm->ticks;
375 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
376 best = elm;
377 ++elm;
379 mpr = atomic_swap_ptr((void *)&best->mp, mp);
380 best->ticks = ticks;
381 if (mpr)
382 atomic_add_int(&mpr->mnt_refs, -1);
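/*
 * Editor's illustration (hypothetical user-space sketch, not part of
 * vfs_cache.c): the idea behind _cache_mntref()/_cache_mntrel() above.
 * A release parks the pointer, together with the reference it carries,
 * in a small per-thread slot cache; a later reference to the same
 * object steals the parked entry with a single atomic swap and never
 * touches the shared reference count.  All names below (obj_ref,
 * obj_rel, slot[]) are invented; the real code also hashes into a set
 * of MNTCACHE_SET slots and evicts the stalest slot by ticks, which
 * this sketch omits.
 */
#include <stdatomic.h>
#include <stddef.h>

#define EX_SLOTS	8		/* plays the role of MNTCACHE_SET */

struct obj { atomic_int refs; };

static _Thread_local _Atomic(struct obj *) slot[EX_SLOTS];

static void obj_ref(struct obj *o)
{
	for (int i = 0; i < EX_SLOTS; ++i) {
		if (atomic_load(&slot[i]) == o) {
			struct obj *prev = atomic_exchange(&slot[i], NULL);
			if (prev == o)
				return;		/* stole the parked ref */
			if (prev != NULL)	/* raced; return the stray ref */
				atomic_fetch_sub(&prev->refs, 1);
		}
	}
	atomic_fetch_add(&o->refs, 1);		/* slow path: real atomic op */
}

static void obj_rel(struct obj *o)
{
	for (int i = 0; i < EX_SLOTS; ++i) {
		struct obj *expected = NULL;
		if (atomic_compare_exchange_strong(&slot[i], &expected, o))
			return;			/* parked the ref for reuse */
	}
	atomic_fetch_sub(&o->refs, 1);		/* cache full: drop for real */
}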
386 * Clears all cached mount points on all cpus. This routine should only
387 * be called when we are waiting for a mount to clear, e.g. so we can
388 * unmount.
390 void
391 cache_clearmntcache(struct mount *target __unused)
393 int n;
395 for (n = 0; n < ncpus; ++n) {
396 struct mntcache *cache = &pcpu_mntcache[n];
397 struct mntcache_elm *elm;
398 struct namecache *ncp;
399 struct mount *mp;
400 int i;
402 for (i = 0; i < MNTCACHE_COUNT; ++i) {
403 elm = &cache->array[i];
404 if (elm->mp) {
405 mp = atomic_swap_ptr((void *)&elm->mp, NULL);
406 if (mp)
407 atomic_add_int(&mp->mnt_refs, -1);
409 if (elm->ncp) {
410 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
411 if (ncp)
412 _cache_drop(ncp);
419 * Namespace locking. The caller must already hold a reference to the
420 * namecache structure in order to lock/unlock it. The controlling entity
421 * in a 1->0 transition does not need to lock the ncp to dispose of it,
422 * as nobody else will have visibility to it at that point.
424 * Note that holding a locked namecache structure prevents other threads
425 * from making namespace changes (e.g. deleting or creating), prevents
426 * vnode association state changes by other threads, and prevents the
427 * namecache entry from being resolved or unresolved by other threads.
429 * An exclusive lock owner has full authority to associate/disassociate
430 * vnodes and resolve/unresolve the locked ncp.
432 * A shared lock owner only has authority to acquire the underlying vnode,
433 * if any.
435 * The primary lock field is nc_lockstatus. nc_locktd is set after the
436 * fact (when locking) or cleared prior to unlocking.
438 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed
439 * or recycled, but it does NOT help you if the vnode had already
440 * initiated a recyclement. If this is important, use cache_get()
441 * rather than cache_lock() (and deal with the differences in the
442 * way the refs counter is handled). Or, alternatively, make an
443 * unconditional call to cache_validate() or cache_resolve()
444 * after cache_lock() returns.
446 static __inline
447 void
448 _cache_lock(struct namecache *ncp)
450 int didwarn = 0;
451 int error;
453 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
454 while (__predict_false(error == EWOULDBLOCK)) {
455 if (didwarn == 0) {
456 didwarn = ticks - nclockwarn;
457 kprintf("[diagnostic] cache_lock: "
458 "%s blocked on %p "
459 "\"%*.*s\"\n",
460 curthread->td_comm, ncp,
461 ncp->nc_nlen, ncp->nc_nlen,
462 ncp->nc_name);
464 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
466 if (__predict_false(didwarn)) {
467 kprintf("[diagnostic] cache_lock: "
468 "%s unblocked %*.*s after %d secs\n",
469 curthread->td_comm,
470 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
471 (int)(ticks - didwarn) / hz);
476 * Release a previously acquired lock.
478 * A concurrent shared-lock acquisition or acquisition/release can
479 * race bit 31 so only drop the ncp if bit 31 was set.
481 static __inline
482 void
483 _cache_unlock(struct namecache *ncp)
485 lockmgr(&ncp->nc_lock, LK_RELEASE);
489 * Lock ncp exclusively, non-blocking. Return 0 on success.
491 static __inline
493 _cache_lock_nonblock(struct namecache *ncp)
495 int error;
497 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
498 if (__predict_false(error != 0)) {
499 return(EWOULDBLOCK);
501 return 0;
505 * This is a special form of _cache_lock() which only succeeds if
506 * it can get a pristine, non-recursive lock. The caller must have
507 * already ref'd the ncp.
509 * On success the ncp will be locked, on failure it will not. The
510 * ref count does not change either way.
512 * We want _cache_lock_special() (on success) to return a definitively
513 * usable vnode or a definitively unresolved ncp.
515 static __inline
517 _cache_lock_special(struct namecache *ncp)
519 if (_cache_lock_nonblock(ncp) == 0) {
520 if (lockmgr_oneexcl(&ncp->nc_lock)) {
521 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
522 _cache_setunresolved(ncp);
523 return 0;
525 _cache_unlock(ncp);
527 return EWOULDBLOCK;
531 * Shared lock, guarantees vp held
533 * The shared lock holds vp on the 0->1 transition. It is possible to race
534 * another shared lock release, preventing the other release from dropping
535 * the vnode and clearing bit 31.
537 * If it is not set then we are responsible for setting it, and this
538 * responsibility does not race with anyone else.
540 static __inline
541 void
542 _cache_lock_shared(struct namecache *ncp)
544 int didwarn = 0;
545 int error;
547 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
548 while (__predict_false(error == EWOULDBLOCK)) {
549 if (didwarn == 0) {
550 didwarn = ticks - nclockwarn;
551 kprintf("[diagnostic] cache_lock_shared: "
552 "%s blocked on %p "
553 "\"%*.*s\"\n",
554 curthread->td_comm, ncp,
555 ncp->nc_nlen, ncp->nc_nlen,
556 ncp->nc_name);
558 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
560 if (__predict_false(didwarn)) {
561 kprintf("[diagnostic] cache_lock_shared: "
562 "%s unblocked %*.*s after %d secs\n",
563 curthread->td_comm,
564 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
565 (int)(ticks - didwarn) / hz);
570 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success
572 static __inline
574 _cache_lock_shared_nonblock(struct namecache *ncp)
576 int error;
578 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
579 if (__predict_false(error != 0)) {
580 return(EWOULDBLOCK);
582 return 0;
586 * This function tries to get a shared lock but will back-off to an
587 * exclusive lock if:
589 * (1) Some other thread is trying to obtain an exclusive lock
590 * (to prevent the exclusive requester from getting livelocked out
591 * by many shared locks).
593 * (2) The current thread already owns an exclusive lock (to avoid
594 * deadlocking).
596 * WARNING! On machines with lots of cores we really want to try hard to
597 * get a shared lock or concurrent path lookups can chain-react
598 * into a very high-latency exclusive lock.
600 * This is very evident in dsynth's initial scans.
602 static __inline
604 _cache_lock_shared_special(struct namecache *ncp)
607 * Only honor a successful shared lock (returning 0) if there is
608 * no exclusive request pending and the vnode, if present, is not
609 * in a reclaimed state.
611 if (_cache_lock_shared_nonblock(ncp) == 0) {
612 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
613 if (ncp->nc_vp == NULL ||
614 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
615 return(0);
618 _cache_unlock(ncp);
619 return(EWOULDBLOCK);
623 * Non-blocking shared lock failed. If we already own the exclusive
624 * lock just acquire another exclusive lock (instead of deadlocking).
625 * Otherwise acquire a shared lock.
627 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
628 _cache_lock(ncp);
629 return(0);
631 _cache_lock_shared(ncp);
632 return(0);
635 static __inline
637 _cache_lockstatus(struct namecache *ncp)
639 int status;
641 status = lockstatus(&ncp->nc_lock, curthread);
642 if (status == 0 || status == LK_EXCLOTHER)
643 status = -1;
644 return status;
648 * cache_hold() and cache_drop() prevent the premature deletion of a
649 * namecache entry but do not prevent operations (such as zapping) on
650 * that namecache entry.
652 * This routine may only be called from outside this source module if
653 * nc_refs is already deterministically at least 1, such as being
654 * associated with e.g. a process, file descriptor, or some other entity.
656 * Only the above situations, similar situations within this module where
657 * the ref count is deterministically at least 1, or when the ncp is found
658 * via the nchpp (hash table) lookup, can bump nc_refs.
660 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs. It
661 * can still be removed from the nc_list, however, as long as the caller
662 * can acquire its lock (in the wrong order).
664 * This is a rare case where callers are allowed to hold a spinlock,
665 * so we can't acquire one ourselves.
667 static __inline
668 struct namecache *
669 _cache_hold(struct namecache *ncp)
671 KKASSERT(ncp->nc_refs > 0);
672 atomic_add_int(&ncp->nc_refs, 1);
674 return(ncp);
678 * Drop a cache entry.
680 * The 1->0 transition is special and requires the caller to destroy the
681 * entry. It means that the ncp is no longer on a nchpp list (since that
682 * would mean there was still a ref). The ncp could still be on a nc_list
683 * but will not have any child of its own, again because nc_refs is now 0
684 * and children would have a ref to their parent.
686 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
688 static __inline
689 void
690 _cache_drop(struct namecache *ncp)
692 if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
694 * Executed unlocked (no need to lock on last drop)
696 _cache_setunresolved(ncp);
699 * Scrap it.
701 ncp->nc_refs = -1; /* safety */
702 if (ncp->nc_name)
703 kfree(ncp->nc_name, M_VFSCACHE);
704 kfree(ncp, M_VFSCACHE);
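/*
 * Editor's illustration (user-space sketch, hypothetical names): the
 * 1->0 ownership rule that _cache_drop() relies on.  Because the atomic
 * fetch-and-subtract returns the previous value, exactly one thread
 * observes the 1->0 transition, and per rules (1)/(2) above that thread
 * alone is responsible for tearing the object down; no lock is needed
 * since nothing can find or re-reference it any more.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct refobj {
	atomic_int	 refs;		/* starts at 1 when published */
	char		*name;
};

static void refobj_drop(struct refobj *obj)
{
	if (atomic_fetch_sub_explicit(&obj->refs, 1,
	    memory_order_acq_rel) == 1) {
		/* this thread performed 1->0: it owns destruction */
		free(obj->name);
		free(obj);
	}
}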
709 * Link a new namecache entry to its parent and to the hash table. Be
710 * careful to avoid races if vhold() blocks in the future.
712 * Both ncp and par must be referenced and locked. The reference is
713 * transferred to the nchpp (and, most notably, NOT to the parent list).
715 * NOTE: The hash table spinlock is held across this call, we can't do
716 * anything fancy.
718 static void
719 _cache_link_parent(struct namecache *ncp, struct namecache *par,
720 struct nchash_head *nchpp)
722 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
724 KKASSERT(ncp->nc_parent == NULL);
725 ncp->nc_parent = par;
726 ncp->nc_head = nchpp;
729 * Set inheritance flags. Note that the parent flags may be
730 * stale due to getattr potentially not having been run yet
731 * (it gets run during nlookup()'s).
733 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
734 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
735 ncp->nc_flag |= NCF_SF_PNOCACHE;
736 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
737 ncp->nc_flag |= NCF_UF_PCACHE;
740 * Add to hash table and parent, adjust accounting
742 TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
743 atomic_add_long(&pn->vfscache_count, 1);
744 if (TAILQ_EMPTY(&ncp->nc_list))
745 atomic_add_long(&pn->vfscache_leafs, 1);
747 if (TAILQ_EMPTY(&par->nc_list)) {
748 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
749 atomic_add_long(&pn->vfscache_leafs, -1);
751 * Any vp associated with an ncp which has children must
752 * be held to prevent it from being recycled.
754 if (par->nc_vp)
755 vhold(par->nc_vp);
756 } else {
757 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
759 _cache_hold(par); /* add nc_parent ref */
763 * Remove the parent and hash associations from a namecache structure.
764 * Drop the ref-count on the parent. The caller receives the ref
765 * from the ncp's nchpp linkage that was removed and may forward that
766 * ref to a new linkage.
768 * The caller usually holds an additional ref on the ncp so the unlink
769 * cannot be the final drop. XXX should not be necessary now since the
770 * caller receives the ref from the nchpp linkage, assuming the ncp
771 * was linked in the first place.
773 * ncp must be locked, which means that there won't be any nc_parent
774 * removal races. This routine will acquire a temporary lock on
775 * the parent as well as the appropriate hash chain.
777 static void
778 _cache_unlink_parent(struct namecache *ncp)
780 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
781 struct namecache *par;
782 struct vnode *dropvp;
783 struct nchash_head *nchpp;
785 if ((par = ncp->nc_parent) != NULL) {
786 cpu_ccfence();
787 KKASSERT(ncp->nc_parent == par);
789 /* don't add a ref, we drop the nchpp ref later */
790 _cache_lock(par);
791 nchpp = ncp->nc_head;
792 spin_lock(&nchpp->spin);
795 * Remove from hash table and parent, adjust accounting
797 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
798 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
799 atomic_add_long(&pn->vfscache_count, -1);
800 if (TAILQ_EMPTY(&ncp->nc_list))
801 atomic_add_long(&pn->vfscache_leafs, -1);
803 dropvp = NULL;
804 if (TAILQ_EMPTY(&par->nc_list)) {
805 atomic_add_long(&pn->vfscache_leafs, 1);
806 if (par->nc_vp)
807 dropvp = par->nc_vp;
809 ncp->nc_parent = NULL;
810 ncp->nc_head = NULL;
811 spin_unlock(&nchpp->spin);
812 _cache_unlock(par);
813 _cache_drop(par); /* drop nc_parent ref */
816 * We can only safely vdrop with no spinlocks held.
818 if (dropvp)
819 vdrop(dropvp);
824 * Allocate a new namecache structure. Most of the code does not require
825 * zero-termination of the string but it makes vop_compat_ncreate() easier.
827 * The returned ncp will be locked and referenced. The ref is generally meant
828 * to be transferred to the nchpp linkage.
830 static struct namecache *
831 cache_alloc(int nlen)
833 struct namecache *ncp;
835 ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
836 if (nlen)
837 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
838 ncp->nc_nlen = nlen;
839 ncp->nc_flag = NCF_UNRESOLVED;
840 ncp->nc_error = ENOTCONN; /* needs to be resolved */
841 ncp->nc_refs = 1;
842 TAILQ_INIT(&ncp->nc_list);
843 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
844 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
846 return(ncp);
850 * Can only be called for the case where the ncp has never been
851 * associated with anything (so no spinlocks are needed).
853 static void
854 _cache_free(struct namecache *ncp)
856 KKASSERT(ncp->nc_refs == 1);
857 if (ncp->nc_name)
858 kfree(ncp->nc_name, M_VFSCACHE);
859 kfree(ncp, M_VFSCACHE);
863 * [re]initialize a nchandle.
865 void
866 cache_zero(struct nchandle *nch)
868 nch->ncp = NULL;
869 nch->mount = NULL;
873 * Ref and deref a nchandle structure (ncp + mp)
875 * The caller must specify a stable ncp pointer, typically meaning the
876 * ncp is already referenced but this can also occur indirectly through
877 * e.g. holding a lock on a direct child.
879 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
880 * use read spinlocks here.
882 struct nchandle *
883 cache_hold(struct nchandle *nch)
885 _cache_hold(nch->ncp);
886 _cache_mntref(nch->mount);
887 return(nch);
891 * Create a copy of a namecache handle for an already-referenced
892 * entry.
894 void
895 cache_copy(struct nchandle *nch, struct nchandle *target)
897 struct namecache *ncp;
898 struct mount *mp;
899 struct mntcache_elm *elm;
900 struct namecache *ncpr;
901 int i;
903 ncp = nch->ncp;
904 mp = nch->mount;
905 target->ncp = ncp;
906 target->mount = mp;
908 elm = _cache_mntcache_hash(ncp);
909 for (i = 0; i < MNTCACHE_SET; ++i) {
910 if (elm->ncp == ncp) {
911 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
912 if (ncpr == ncp) {
913 _cache_mntref(mp);
914 return;
916 if (ncpr)
917 _cache_drop(ncpr);
919 ++elm;
921 if (ncp)
922 _cache_hold(ncp);
923 _cache_mntref(mp);
927 * Drop the nchandle, but try to cache the ref to avoid global atomic
928 * ops. This is typically done on the system root and jail root nchandles.
930 void
931 cache_drop_and_cache(struct nchandle *nch, int elmno)
933 struct mntcache_elm *elm;
934 struct mntcache_elm *best;
935 struct namecache *ncpr;
936 int delta1;
937 int delta2;
938 int i;
940 if (elmno > 4) {
941 if (nch->ncp) {
942 _cache_drop(nch->ncp);
943 nch->ncp = NULL;
945 if (nch->mount) {
946 _cache_mntrel(nch->mount);
947 nch->mount = NULL;
949 return;
952 elm = _cache_mntcache_hash(nch->ncp);
953 best = elm;
954 for (i = 0; i < MNTCACHE_SET; ++i) {
955 if (elm->ncp == NULL) {
956 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
957 _cache_mntrel(nch->mount);
958 elm->ticks = ticks;
959 nch->mount = NULL;
960 nch->ncp = NULL;
961 if (ncpr)
962 _cache_drop(ncpr);
963 return;
965 delta1 = ticks - best->ticks;
966 delta2 = ticks - elm->ticks;
967 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
968 best = elm;
969 ++elm;
971 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
972 _cache_mntrel(nch->mount);
973 best->ticks = ticks;
974 nch->mount = NULL;
975 nch->ncp = NULL;
976 if (ncpr)
977 _cache_drop(ncpr);
980 void
981 cache_changemount(struct nchandle *nch, struct mount *mp)
983 _cache_mntref(mp);
984 _cache_mntrel(nch->mount);
985 nch->mount = mp;
988 void
989 cache_drop(struct nchandle *nch)
991 _cache_mntrel(nch->mount);
992 _cache_drop(nch->ncp);
993 nch->ncp = NULL;
994 nch->mount = NULL;
998 cache_lockstatus(struct nchandle *nch)
1000 return(_cache_lockstatus(nch->ncp));
1003 void
1004 cache_lock(struct nchandle *nch)
1006 _cache_lock(nch->ncp);
1009 void
1010 cache_lock_maybe_shared(struct nchandle *nch, int excl)
1012 struct namecache *ncp = nch->ncp;
1014 if (ncp_shared_lock_disable || excl ||
1015 (ncp->nc_flag & NCF_UNRESOLVED)) {
1016 _cache_lock(ncp);
1017 } else {
1018 _cache_lock_shared(ncp);
1019 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1020 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1021 _cache_unlock(ncp);
1022 _cache_lock(ncp);
1024 } else {
1025 _cache_unlock(ncp);
1026 _cache_lock(ncp);
1032 * Relock nch1 given an unlocked nch1 and a locked nch2. The caller
1033 * is responsible for checking both for validity on return as they
1034 * may have become invalid.
1036 * We have to deal with potential deadlocks here, so we just ping-pong
1037 * the locks until we get both (we will always block somewhere when
1038 * looping so this is not cpu-intensive).
1040 * which = 0 nch1 not locked, nch2 is locked
1041 * which = 1 nch1 is locked, nch2 is not locked
1043 void
1044 cache_relock(struct nchandle *nch1, struct ucred *cred1,
1045 struct nchandle *nch2, struct ucred *cred2)
1047 int which;
1049 which = 0;
1051 for (;;) {
1052 if (which == 0) {
1053 if (cache_lock_nonblock(nch1) == 0) {
1054 cache_resolve(nch1, cred1);
1055 break;
1057 cache_unlock(nch2);
1058 cache_lock(nch1);
1059 cache_resolve(nch1, cred1);
1060 which = 1;
1061 } else {
1062 if (cache_lock_nonblock(nch2) == 0) {
1063 cache_resolve(nch2, cred2);
1064 break;
1066 cache_unlock(nch1);
1067 cache_lock(nch2);
1068 cache_resolve(nch2, cred2);
1069 which = 0;
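/*
 * Editor's illustration (user-space pthreads sketch, hypothetical
 * helper): the deadlock-avoiding "ping-pong" used by cache_relock()
 * above.  One lock of the pair is always held; the other is tried
 * non-blocking, and on failure the held lock is released before
 * blocking, so two threads relocking the same pair in opposite order
 * cannot deadlock.  The real routine also re-resolves each ncp after
 * acquiring it, which this sketch omits.
 */
#include <pthread.h>

/* Enter with b held and a wanted; return with both a and b held. */
static void relock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	int have_a = 0;			/* which of the two we hold */

	for (;;) {
		if (!have_a) {
			if (pthread_mutex_trylock(a) == 0)
				return;		/* got a while holding b */
			pthread_mutex_unlock(b);
			pthread_mutex_lock(a);	/* block without holding b */
			have_a = 1;
		} else {
			if (pthread_mutex_trylock(b) == 0)
				return;		/* got b while holding a */
			pthread_mutex_unlock(a);
			pthread_mutex_lock(b);
			have_a = 0;
		}
	}
}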
1075 cache_lock_nonblock(struct nchandle *nch)
1077 return(_cache_lock_nonblock(nch->ncp));
1080 void
1081 cache_unlock(struct nchandle *nch)
1083 _cache_unlock(nch->ncp);
1087 * ref-and-lock, unlock-and-deref functions.
1089 * This function is primarily used by nlookup. Even though cache_lock
1090 * holds the vnode, it is possible that the vnode may have already
1091 * initiated a recyclement.
1093 * We want cache_get() to return a definitively usable vnode or a
1094 * definitively unresolved ncp.
1096 static
1097 struct namecache *
1098 _cache_get(struct namecache *ncp)
1100 _cache_hold(ncp);
1101 _cache_lock(ncp);
1102 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1103 _cache_setunresolved(ncp);
1104 return(ncp);
1108 * Attempt to obtain a shared lock on the ncp. A shared lock will only
1109 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1110 * valid. Otherwise an exclusive lock will be acquired instead.
1112 static
1113 struct namecache *
1114 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1116 if (ncp_shared_lock_disable || excl ||
1117 (ncp->nc_flag & NCF_UNRESOLVED)) {
1118 return(_cache_get(ncp));
1120 _cache_hold(ncp);
1121 _cache_lock_shared(ncp);
1122 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1123 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1124 _cache_unlock(ncp);
1125 ncp = _cache_get(ncp);
1126 _cache_drop(ncp);
1128 } else {
1129 _cache_unlock(ncp);
1130 ncp = _cache_get(ncp);
1131 _cache_drop(ncp);
1133 return(ncp);
1137 * NOTE: The same nchandle can be passed for both arguments.
1139 void
1140 cache_get(struct nchandle *nch, struct nchandle *target)
1142 KKASSERT(nch->ncp->nc_refs > 0);
1143 target->mount = nch->mount;
1144 target->ncp = _cache_get(nch->ncp);
1145 _cache_mntref(target->mount);
1148 void
1149 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1151 KKASSERT(nch->ncp->nc_refs > 0);
1152 target->mount = nch->mount;
1153 target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1154 _cache_mntref(target->mount);
1158 * Release a held and locked ncp
1160 static __inline
1161 void
1162 _cache_put(struct namecache *ncp)
1164 _cache_unlock(ncp);
1165 _cache_drop(ncp);
1168 void
1169 cache_put(struct nchandle *nch)
1171 _cache_mntrel(nch->mount);
1172 _cache_put(nch->ncp);
1173 nch->ncp = NULL;
1174 nch->mount = NULL;
1178 * Resolve an unresolved ncp by associating a vnode with it. If the
1179 * vnode is NULL, a negative cache entry is created.
1181 * The ncp should be locked on entry and will remain locked on return.
1183 static
1184 void
1185 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1187 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
1188 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
1189 ncp->nc_vp == NULL);
1191 if (vp) {
1193 * Any vp associated with an ncp which has children must
1194 * be held. Any vp associated with a locked ncp must be held.
1196 if (!TAILQ_EMPTY(&ncp->nc_list))
1197 vhold(vp);
1198 spin_lock(&vp->v_spin);
1199 ncp->nc_vp = vp;
1200 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1201 ++vp->v_namecache_count;
1202 _cache_hold(ncp); /* v_namecache assoc */
1203 spin_unlock(&vp->v_spin);
1204 vhold(vp); /* nc_vp */
1207 * Set auxiliary flags
1209 switch(vp->v_type) {
1210 case VDIR:
1211 ncp->nc_flag |= NCF_ISDIR;
1212 break;
1213 case VLNK:
1214 ncp->nc_flag |= NCF_ISSYMLINK;
1215 /* XXX cache the contents of the symlink */
1216 break;
1217 default:
1218 break;
1221 ncp->nc_error = 0;
1224 * XXX: this is a hack to work-around the lack of a real pfs vfs
1225 * implementation
1227 if (mp) {
1228 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1229 vp->v_pfsmp = mp;
1231 } else {
1233 * When creating a negative cache hit we set the
1234 * namecache_gen. A later resolve will clean out the
1235 * negative cache hit if the mount point's namecache_gen
1236 * has changed. Used by devfs, could also be used by
1237 * other remote FSs.
1239 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1241 ncp->nc_vp = NULL;
1242 ncp->nc_negcpu = mycpu->gd_cpuid;
1243 spin_lock(&pn->neg_spin);
1244 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
1245 _cache_hold(ncp); /* neg_list assoc */
1246 ++pn->neg_count;
1247 spin_unlock(&pn->neg_spin);
1248 atomic_add_long(&pn->vfscache_negs, 1);
1250 ncp->nc_error = ENOENT;
1251 if (mp)
1252 VFS_NCPGEN_SET(mp, ncp);
1254 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1257 void
1258 cache_setvp(struct nchandle *nch, struct vnode *vp)
1260 _cache_setvp(nch->mount, nch->ncp, vp);
1264 * Used for NFS
1266 void
1267 cache_settimeout(struct nchandle *nch, int nticks)
1269 struct namecache *ncp = nch->ncp;
1271 if ((ncp->nc_timeout = ticks + nticks) == 0)
1272 ncp->nc_timeout = 1;
1276 * Disassociate the vnode or negative-cache association and mark a
1277 * namecache entry as unresolved again. Note that the ncp is still
1278 * left in the hash table and still linked to its parent.
1280 * The ncp should be locked and refd on entry and will remain locked and refd
1281 * on return.
1283 * This routine is normally never called on a directory containing children.
1284 * However, NFS often does just that in its rename() code as a cop-out to
1285 * avoid complex namespace operations. This disconnects a directory vnode
1286 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1287 * sync.
1290 static
1291 void
1292 _cache_setunresolved(struct namecache *ncp)
1294 struct vnode *vp;
1296 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1297 ncp->nc_flag |= NCF_UNRESOLVED;
1298 ncp->nc_timeout = 0;
1299 ncp->nc_error = ENOTCONN;
1300 if ((vp = ncp->nc_vp) != NULL) {
1301 spin_lock(&vp->v_spin);
1302 ncp->nc_vp = NULL;
1303 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1304 --vp->v_namecache_count;
1305 spin_unlock(&vp->v_spin);
1308 * Any vp associated with an ncp with children is
1309 * held by that ncp. Any vp associated with an ncp
1310 * is held by that ncp. These conditions must be
1311 * undone when the vp is cleared out from the ncp.
1313 if (!TAILQ_EMPTY(&ncp->nc_list))
1314 vdrop(vp);
1315 vdrop(vp);
1316 } else {
1317 struct pcpu_ncache *pn;
1319 pn = &pcpu_ncache[ncp->nc_negcpu];
1321 atomic_add_long(&pn->vfscache_negs, -1);
1322 spin_lock(&pn->neg_spin);
1323 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
1324 --pn->neg_count;
1325 spin_unlock(&pn->neg_spin);
1327 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1328 _cache_drop(ncp); /* from v_namecache or neg_list */
1333 * The cache_nresolve() code calls this function to automatically
1334 * set a resolved cache element to unresolved if it has timed out
1335 * or if it is a negative cache hit and the mount point namecache_gen
1336 * has changed.
1338 static __inline int
1339 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1342 * Try to zap entries that have timed out. We have
1343 * to be careful here because locked leafs may depend
1344 * on the vnode remaining intact in a parent, so only
1345 * do this under very specific conditions.
1347 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1348 TAILQ_EMPTY(&ncp->nc_list)) {
1349 return 1;
1353 * If a resolved negative cache hit is invalid due to
1354 * the mount's namecache generation being bumped, zap it.
1356 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1357 return 1;
1361 * Otherwise we are good
1363 return 0;
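/*
 * Editor's note (illustration only): the timeout check above uses the
 * standard wrap-safe idiom for tick counters.  Subtracting in unsigned
 * arithmetic and casting the difference to int keeps the comparison
 * correct across counter wrap, provided the two values are less than
 * half the counter range apart.  Hypothetical helper:
 */
static int ticks_expired_sketch(unsigned int timeout, unsigned int now)
{
	return (int)(timeout - now) < 0;  /* true once 'now' passes 'timeout' */
}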
1366 static __inline void
1367 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1370 * Already in an unresolved state, nothing to do.
1372 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1373 if (_cache_auto_unresolve_test(mp, ncp))
1374 _cache_setunresolved(ncp);
1378 void
1379 cache_setunresolved(struct nchandle *nch)
1381 _cache_setunresolved(nch->ncp);
1385 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1386 * looking for matches. This flag tells the lookup code when it must
1387 * check for a mount linkage and also prevents the directories in question
1388 * from being deleted or renamed.
1390 static
1392 cache_clrmountpt_callback(struct mount *mp, void *data)
1394 struct nchandle *nch = data;
1396 if (mp->mnt_ncmounton.ncp == nch->ncp)
1397 return(1);
1398 if (mp->mnt_ncmountpt.ncp == nch->ncp)
1399 return(1);
1400 return(0);
1404 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
1405 * with a mount point.
1407 void
1408 cache_clrmountpt(struct nchandle *nch)
1410 int count;
1412 count = mountlist_scan(cache_clrmountpt_callback, nch,
1413 MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
1414 MNTSCAN_NOUNLOCK);
1415 if (count == 0)
1416 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1420 * Invalidate portions of the namecache topology given a starting entry.
1421 * The passed ncp is set to an unresolved state and:
1423 * The passed ncp must be referenced and locked. The routine may unlock
1424 * and relock ncp several times, and will recheck the children and loop
1425 * to catch races. When done the passed ncp will be returned with the
1426 * reference and lock intact.
1428 * CINV_DESTROY - Set a flag in the passed ncp entry indicating
1429 * that the physical underlying nodes have been
1430 * destroyed... as in deleted. For example, when
1431 * a directory is removed. This will cause record
1432 * lookups on the name to no longer be able to find
1433 * the record and tells the resolver to return failure
1434 * rather than trying to resolve through the parent.
1436 * The topology itself, including ncp->nc_name,
1437 * remains intact.
1439 * This only applies to the passed ncp, if CINV_CHILDREN
1440 * is specified the children are not flagged.
1442 * CINV_CHILDREN - Set all children (recursively) to an unresolved
1443 * state as well.
1445 * Note that this will also have the side effect of
1446 * cleaning out any unreferenced nodes in the topology
1447 * from the leaves up as the recursion backs out.
1449 * Note that the topology for any referenced nodes remains intact, but
1450 * the nodes will be marked as having been destroyed and will be set
1451 * to an unresolved state.
1453 * It is possible for cache_inval() to race a cache_resolve(), meaning that
1454 * the namecache entry may not actually be invalidated on return if it was
1455 * revalidated while recursing down into its children. This code guarantees
1456 * that the node(s) will go through an invalidation cycle, but does not
1457 * guarantee that they will remain in an invalidated state.
1459 * Returns non-zero if a revalidation was detected during the invalidation
1460 * recursion, zero otherwise. Note that since only the original ncp is
1461 * locked the revalidation ultimately can only indicate that the original ncp
1462 * *MIGHT* not have been re-resolved.
1464 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1465 * have to avoid blowing out the kernel stack. We do this by saving the
1466 * deep namecache node and aborting the recursion, then re-recursing at that
1467 * node using a depth-first algorithm in order to allow multiple deep
1468 * recursions to chain through each other, then we restart the invalidation
1469 * from scratch.
1472 struct cinvtrack {
1473 struct namecache *resume_ncp;
1474 int depth;
1477 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1479 static
1481 _cache_inval(struct namecache *ncp, int flags)
1483 struct cinvtrack track;
1484 struct namecache *ncp2;
1485 int r;
1487 track.depth = 0;
1488 track.resume_ncp = NULL;
1490 for (;;) {
1491 r = _cache_inval_internal(ncp, flags, &track);
1492 if (track.resume_ncp == NULL)
1493 break;
1494 _cache_unlock(ncp);
1495 while ((ncp2 = track.resume_ncp) != NULL) {
1496 track.resume_ncp = NULL;
1497 _cache_lock(ncp2);
1498 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1499 &track);
1500 /*_cache_put(ncp2);*/
1501 cache_zap(ncp2);
1503 _cache_lock(ncp);
1505 return(r);
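/*
 * Editor's illustration (hypothetical tree type, user-space sketch):
 * the stack-limiting strategy described above.  _cache_inval() itself
 * keeps a single resume_ncp and restarts the invalidation from scratch,
 * relying on zapped subtrees for forward progress; this sketch gets the
 * same bounded recursion depth with an explicit work list instead, so
 * the difference from the real algorithm is deliberate.
 */
#include <stddef.h>

#define EX_MAX_DEPTH	64		/* analogous to MAX_RECURSION_DEPTH */
#define EX_MAX_WORK	1024

struct exnode {
	struct exnode	*child;		/* first child */
	struct exnode	*next;		/* next sibling */
	int		 invalidated;
};

static struct exnode	*ex_work[EX_MAX_WORK];
static int		 ex_nwork;

static void ex_inval(struct exnode *n, int depth)
{
	n->invalidated = 1;
	if (depth > EX_MAX_DEPTH) {
		if (ex_nwork < EX_MAX_WORK)	/* a real version would chain */
			ex_work[ex_nwork++] = n; /* resume from here later */
		return;
	}
	for (struct exnode *k = n->child; k != NULL; k = k->next)
		ex_inval(k, depth + 1);
}

static void ex_inval_all(struct exnode *root)
{
	ex_inval(root, 0);
	while (ex_nwork > 0)			/* chase deep restart points */
		ex_inval(ex_work[--ex_nwork], 0);
}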
1509 cache_inval(struct nchandle *nch, int flags)
1511 return(_cache_inval(nch->ncp, flags));
1515 * Helper for _cache_inval(). The passed ncp is refd and locked and
1516 * remains that way on return, but may be unlocked/relocked multiple
1517 * times by the routine.
1519 static int
1520 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1522 struct namecache *nextkid;
1523 int rcnt = 0;
1525 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1527 _cache_setunresolved(ncp);
1528 if (flags & CINV_DESTROY) {
1529 ncp->nc_flag |= NCF_DESTROYED;
1530 ++ncp->nc_generation;
1533 while ((flags & CINV_CHILDREN) &&
1534 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1536 struct namecache *kid;
1537 int restart;
1539 restart = 0;
1540 _cache_hold(nextkid);
1541 if (++track->depth > MAX_RECURSION_DEPTH) {
1542 track->resume_ncp = ncp;
1543 _cache_hold(ncp);
1544 ++rcnt;
1546 while ((kid = nextkid) != NULL) {
1548 * Parent (ncp) must be locked for the iteration.
1550 nextkid = NULL;
1551 if (kid->nc_parent != ncp) {
1552 _cache_drop(kid);
1553 kprintf("cache_inval_internal restartA %s\n",
1554 ncp->nc_name);
1555 restart = 1;
1556 break;
1558 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1559 _cache_hold(nextkid);
1562 * Parent unlocked for this section to avoid
1563 * deadlocks. Then lock the kid and check for
1564 * races.
1566 _cache_unlock(ncp);
1567 if (track->resume_ncp) {
1568 _cache_drop(kid);
1569 _cache_lock(ncp);
1570 break;
1572 _cache_lock(kid);
1573 if (kid->nc_parent != ncp) {
1574 kprintf("cache_inval_internal "
1575 "restartB %s\n",
1576 ncp->nc_name);
1577 restart = 1;
1578 _cache_unlock(kid);
1579 _cache_drop(kid);
1580 _cache_lock(ncp);
1581 break;
1583 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1584 TAILQ_FIRST(&kid->nc_list)
1587 rcnt += _cache_inval_internal(kid,
1588 flags & ~CINV_DESTROY, track);
1589 /*_cache_unlock(kid);*/
1590 /*_cache_drop(kid);*/
1591 cache_zap(kid);
1592 } else {
1593 cache_zap(kid);
1597 * Relock parent to continue scan
1599 _cache_lock(ncp);
1601 if (nextkid)
1602 _cache_drop(nextkid);
1603 --track->depth;
1604 if (restart == 0)
1605 break;
1609 * Someone could have gotten in there while ncp was unlocked,
1610 * retry if so.
1612 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1613 ++rcnt;
1614 return (rcnt);
1618 * Invalidate a vnode's namecache associations. To avoid races against
1619 * the resolver we do not invalidate a node which we previously invalidated
1620 * but which was then re-resolved while we were in the invalidation loop.
1622 * Returns non-zero if any namecache entries remain after the invalidation
1623 * loop completed.
1625 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
1626 * be ripped out of the topology while held, the vnode's v_namecache
1627 * list has no such restriction. NCP's can be ripped out of the list
1628 * at virtually any time if not locked, even if held.
1630 * In addition, the v_namecache list itself must be locked via
1631 * the vnode's spinlock.
1634 cache_inval_vp(struct vnode *vp, int flags)
1636 struct namecache *ncp;
1637 struct namecache *next;
1639 restart:
1640 spin_lock(&vp->v_spin);
1641 ncp = TAILQ_FIRST(&vp->v_namecache);
1642 if (ncp)
1643 _cache_hold(ncp);
1644 while (ncp) {
1645 /* loop entered with ncp held and vp spin-locked */
1646 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1647 _cache_hold(next);
1648 spin_unlock(&vp->v_spin);
1649 _cache_lock(ncp);
1650 if (ncp->nc_vp != vp) {
1651 kprintf("Warning: cache_inval_vp: race-A detected on "
1652 "%s\n", ncp->nc_name);
1653 _cache_put(ncp);
1654 if (next)
1655 _cache_drop(next);
1656 goto restart;
1658 _cache_inval(ncp, flags);
1659 _cache_put(ncp); /* also releases reference */
1660 ncp = next;
1661 spin_lock(&vp->v_spin);
1662 if (ncp && ncp->nc_vp != vp) {
1663 spin_unlock(&vp->v_spin);
1664 kprintf("Warning: cache_inval_vp: race-B detected on "
1665 "%s\n", ncp->nc_name);
1666 _cache_drop(ncp);
1667 goto restart;
1670 spin_unlock(&vp->v_spin);
1671 return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1675 * This routine is used instead of the normal cache_inval_vp() when we
1676 * are trying to recycle otherwise good vnodes.
1678 * Return 0 on success, non-zero if not all namecache records could be
1679 * disassociated from the vnode (for various reasons).
1682 cache_inval_vp_nonblock(struct vnode *vp)
1684 struct namecache *ncp;
1685 struct namecache *next;
1687 spin_lock(&vp->v_spin);
1688 ncp = TAILQ_FIRST(&vp->v_namecache);
1689 if (ncp)
1690 _cache_hold(ncp);
1691 while (ncp) {
1692 /* loop entered with ncp held */
1693 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1694 _cache_hold(next);
1695 spin_unlock(&vp->v_spin);
1696 if (_cache_lock_nonblock(ncp)) {
1697 _cache_drop(ncp);
1698 if (next)
1699 _cache_drop(next);
1700 goto done;
1702 if (ncp->nc_vp != vp) {
1703 kprintf("Warning: cache_inval_vp: race-A detected on "
1704 "%s\n", ncp->nc_name);
1705 _cache_put(ncp);
1706 if (next)
1707 _cache_drop(next);
1708 goto done;
1710 _cache_inval(ncp, 0);
1711 _cache_put(ncp); /* also releases reference */
1712 ncp = next;
1713 spin_lock(&vp->v_spin);
1714 if (ncp && ncp->nc_vp != vp) {
1715 spin_unlock(&vp->v_spin);
1716 kprintf("Warning: cache_inval_vp: race-B detected on "
1717 "%s\n", ncp->nc_name);
1718 _cache_drop(ncp);
1719 goto done;
1722 spin_unlock(&vp->v_spin);
1723 done:
1724 return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1728 * Clears the universal directory search 'ok' flag. This flag allows
1729 * nlookup() to bypass normal vnode checks. This flag is a cached flag
1730 * so clearing it simply forces revalidation.
1732 void
1733 cache_inval_wxok(struct vnode *vp)
1735 struct namecache *ncp;
1737 spin_lock(&vp->v_spin);
1738 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1739 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX))
1740 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
1742 spin_unlock(&vp->v_spin);
1746 * The source ncp has been renamed to the target ncp. Both fncp and tncp
1747 * must be locked. The target ncp is destroyed (as a normal rename-over
1748 * would destroy the target file or directory).
1750 * Because there may be references to the source ncp we cannot copy its
1751 * contents to the target. Instead the source ncp is relinked as the target
1752 * and the target ncp is removed from the namecache topology.
1754 void
1755 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
1757 struct namecache *fncp = fnch->ncp;
1758 struct namecache *tncp = tnch->ncp;
1759 struct namecache *tncp_par;
1760 struct nchash_head *nchpp;
1761 u_int32_t hash;
1762 char *oname;
1763 char *nname;
1765 ++fncp->nc_generation;
1766 ++tncp->nc_generation;
1767 if (tncp->nc_nlen) {
1768 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
1769 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
1770 nname[tncp->nc_nlen] = 0;
1771 } else {
1772 nname = NULL;
1776 * Rename fncp (unlink)
1778 _cache_unlink_parent(fncp);
1779 oname = fncp->nc_name;
1780 fncp->nc_name = nname;
1781 fncp->nc_nlen = tncp->nc_nlen;
1782 if (oname)
1783 kfree(oname, M_VFSCACHE);
1785 tncp_par = tncp->nc_parent;
1786 _cache_hold(tncp_par);
1787 _cache_lock(tncp_par);
1790 * Rename fncp (relink)
1792 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
1793 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
1794 nchpp = NCHHASH(hash);
1796 spin_lock(&nchpp->spin);
1797 _cache_link_parent(fncp, tncp_par, nchpp);
1798 spin_unlock(&nchpp->spin);
1800 _cache_put(tncp_par);
1803 * Get rid of the overwritten tncp (unlink)
1805 _cache_unlink(tncp);
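/*
 * Editor's illustration (user-space sketch): how the relink above forms
 * the hash key: FNV-1 over the name, folded with the parent pointer,
 * then masked into a bucket index exactly as NCHHASH() does.  The
 * 32-bit FNV-1 prime is 0x01000193; the kernel takes its seed,
 * FNV1_32_INIT, from <sys/fnv_hash.h> (value not repeated here).  The
 * helper names are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t fnv_32_buf_sketch(const void *buf, size_t len, uint32_t hval)
{
	const unsigned char *s = buf;

	while (len-- != 0) {
		hval *= 0x01000193U;	/* FNV-1: multiply, then xor a byte */
		hval ^= *s++;
	}
	return hval;
}

/* Bucket index for (parent, name), mirroring cache_rename()'s relink. */
static uint32_t nc_bucket_sketch(const void *parent, const char *name,
    size_t nlen, uint32_t seed, uint32_t mask)
{
	uint32_t hash;

	hash = fnv_32_buf_sketch(name, nlen, seed);
	hash = fnv_32_buf_sketch(&parent, sizeof(parent), hash);
	return hash & mask;		/* what NCHHASH(hash) reduces to */
}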
1809 * Perform actions consistent with unlinking a file. The passed-in ncp
1810 * must be locked.
1812 * The ncp is marked DESTROYED so it no longer shows up in searches,
1813 * and will be physically deleted when the vnode goes away.
1815 * If the related vnode has no refs then we cycle it through vget()/vput()
1816 * to (possibly if we don't have a ref race) trigger a deactivation,
1817 * allowing the VFS to trivially detect and recycle the deleted vnode
1818 * via VOP_INACTIVE().
1820 * NOTE: _cache_rename() will automatically call _cache_unlink() on the
1821 * target ncp.
1823 void
1824 cache_unlink(struct nchandle *nch)
1826 _cache_unlink(nch->ncp);
1829 static void
1830 _cache_unlink(struct namecache *ncp)
1832 struct vnode *vp;
1835 * Causes lookups to fail and allows another ncp with the same
1836 * name to be created under ncp->nc_parent.
1838 ncp->nc_flag |= NCF_DESTROYED;
1839 ++ncp->nc_generation;
1842 * Attempt to trigger a deactivation. Set VREF_FINALIZE to
1843 * force action on the 1->0 transition.
1845 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1846 (vp = ncp->nc_vp) != NULL) {
1847 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
1848 if (VREFCNT(vp) <= 0) {
1849 if (vget(vp, LK_SHARED) == 0)
1850 vput(vp);
1856 * Return non-zero if the nch might be associated with an open and/or mmap()'d
1857 * file. The easy solution is to just return non-zero if the vnode has refs.
1858 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
1859 * force the reclaim).
1862 cache_isopen(struct nchandle *nch)
1864 struct vnode *vp;
1865 struct namecache *ncp = nch->ncp;
1867 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1868 (vp = ncp->nc_vp) != NULL &&
1869 VREFCNT(vp)) {
1870 return 1;
1872 return 0;
1877 * vget the vnode associated with the namecache entry. Resolve the namecache
1878 * entry if necessary. The passed ncp must be referenced and locked. If
1879 * the ncp is resolved it might be locked shared.
1881 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked
1882 * (depending on the passed lk_type) will be returned in *vpp with an error
1883 * of 0, or NULL will be returned in *vpp with a non-0 error code. The
1884 * most typical error is ENOENT, meaning that the ncp represents a negative
1885 * cache hit and there is no vnode to retrieve, but other errors can occur
1886 * too.
1888 * The vget() can race a reclaim. If this occurs we re-resolve the
1889 * namecache entry.
1891 * There are numerous places in the kernel where vget() is called on a
1892 * vnode while one or more of its namecache entries is locked. Releasing
1893 * a vnode never deadlocks against locked namecache entries (the vnode
1894 * will not get recycled while referenced ncp's exist). This means we
1895 * can safely acquire the vnode. In fact, we MUST NOT release the ncp
1896 * lock when acquiring the vp lock or we might cause a deadlock.
1898 * NOTE: The passed-in ncp must be locked exclusively if it is initially
1899 * unresolved. If a reclaim race occurs the passed-in ncp will be
1900 * relocked exclusively before being re-resolved.
1903 cache_vget(struct nchandle *nch, struct ucred *cred,
1904 int lk_type, struct vnode **vpp)
1906 struct namecache *ncp;
1907 struct vnode *vp;
1908 int error;
1910 ncp = nch->ncp;
1911 again:
1912 vp = NULL;
1913 if (ncp->nc_flag & NCF_UNRESOLVED)
1914 error = cache_resolve(nch, cred);
1915 else
1916 error = 0;
1918 if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1919 error = vget(vp, lk_type);
1920 if (error) {
1922 * VRECLAIM race
1924 * The ncp may have been locked shared, we must relock
1925 * it exclusively before we can set it to unresolved.
1927 if (error == ENOENT) {
1928 kprintf("Warning: vnode reclaim race detected "
1929 "in cache_vget on %p (%s)\n",
1930 vp, ncp->nc_name);
1931 _cache_unlock(ncp);
1932 _cache_lock(ncp);
1933 _cache_setunresolved(ncp);
1934 goto again;
1938 * Not a reclaim race, some other error.
1940 KKASSERT(ncp->nc_vp == vp);
1941 vp = NULL;
1942 } else {
1943 KKASSERT(ncp->nc_vp == vp);
1944 KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1947 if (error == 0 && vp == NULL)
1948 error = ENOENT;
1949 *vpp = vp;
1950 return(error);
1954 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode
1955 * is already held by virtue of the ncp being locked, but it might not be
1956 * referenced and while it is not referenced it can transition into the
1957 * VRECLAIMED state.
1959 * NOTE: The passed-in ncp must be locked exclusively if it is initially
1960 * unresolved. If a reclaim race occurs the passed-in ncp will be
1961 * relocked exclusively before being re-resolved.
1963 * NOTE: At the moment we have to issue a vget() on the vnode, even though
1964 * we are going to immediately release the lock, in order to resolve
1965 * potential reclamation races. Once we have a solid vnode ref that
1966 * was (at some point) interlocked via a vget(), the vnode will not
1967 * be reclaimed.
1969 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
1972 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
1974 struct namecache *ncp;
1975 struct vnode *vp;
1976 int error;
1977 int v;
1979 ncp = nch->ncp;
1980 again:
1981 vp = NULL;
1982 if (ncp->nc_flag & NCF_UNRESOLVED)
1983 error = cache_resolve(nch, cred);
1984 else
1985 error = 0;
1987 while (error == 0 && (vp = ncp->nc_vp) != NULL) {
1989 * Try a lockless ref of the vnode. VRECLAIMED transitions
1990 * use the vx_lock state and update-counter mechanism so we
1991 * can detect if one is in-progress or occurred.
1993 * If we can successfully ref the vnode and interlock against
1994 * the update-counter mechanism, and VRECLAIMED is found to
1995 * not be set after that, we should be good.
1997 v = spin_access_start_only(&vp->v_spin);
1998 if (__predict_true(spin_access_check_inprog(v) == 0)) {
1999 vref_special(vp);
2000 if (__predict_false(
2001 spin_access_end_only(&vp->v_spin, v))) {
2002 vrele(vp);
2003 kprintf("CACHE_VREF: RACED %p\n", vp);
2004 continue;
2006 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
2007 break;
2009 vrele(vp);
2010 kprintf("CACHE_VREF: IN-RECLAIM\n");
2014 * Do it the slow way
2016 error = vget(vp, LK_SHARED);
2017 if (error) {
2019 * VRECLAIM race
2021 if (error == ENOENT) {
2022 kprintf("Warning: vnode reclaim race detected "
2023 "in cache_vget on %p (%s)\n",
2024 vp, ncp->nc_name);
2025 _cache_unlock(ncp);
2026 _cache_lock(ncp);
2027 _cache_setunresolved(ncp);
2028 goto again;
2032 * Not a reclaim race, some other error.
2034 KKASSERT(ncp->nc_vp == vp);
2035 vp = NULL;
2036 } else {
2037 KKASSERT(ncp->nc_vp == vp);
2038 KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2039 /* caller does not want a lock */
2040 vn_unlock(vp);
2042 break;
2044 if (error == 0 && vp == NULL)
2045 error = ENOENT;
2046 *vpp = vp;
2048 return(error);
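/*
 * Hedged sketch of the update-counter access pattern used by the lockless
 * path above.  The spin_access_*() calls are the ones already used in
 * cache_vref(); the surrounding code and the 'obj' structure are
 * hypothetical and only illustrate the general shape of the technique.
 */
#if 0
	int v;
	int good = 0;

	v = spin_access_start_only(&obj->spin);	/* 'obj' is hypothetical */
	if (spin_access_check_inprog(v) == 0) {
		/* ... speculatively read fields or acquire a ref ... */
		if (spin_access_end_only(&obj->spin, v) == 0)
			good = 1;	/* no update raced the read */
		/* else: undo the speculative work */
	}
	if (good == 0) {
		/* fall back to the locked (slow) path */
	}
#endif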
2052 * Return a referenced vnode representing the parent directory of
2053 * ncp.
2055 * Because the caller has locked the ncp it should not be possible for
2056 * the parent ncp to go away. However, the parent can unresolve its
2057 * dvp at any time so we must be able to acquire a lock on the parent
2058 * to safely access nc_vp.
2060 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2061 * so use vhold()/vdrop() while holding the lock to prevent dvp from
2062 * getting destroyed.
2064 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2065 * lock on the ncp in question.
2067 struct vnode *
2068 cache_dvpref(struct namecache *ncp)
2070 struct namecache *par;
2071 struct vnode *dvp;
2073 dvp = NULL;
2074 if ((par = ncp->nc_parent) != NULL) {
2075 _cache_hold(par);
2076 _cache_lock(par);
2077 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
2078 if ((dvp = par->nc_vp) != NULL)
2079 vhold(dvp);
2081 _cache_unlock(par);
2082 if (dvp) {
2083 if (vget(dvp, LK_SHARED) == 0) {
2084 vn_unlock(dvp);
2085 vdrop(dvp);
2086 /* return refd, unlocked dvp */
2087 } else {
2088 vdrop(dvp);
2089 dvp = NULL;
2092 _cache_drop(par);
2094 return(dvp);
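/*
 * Hedged caller sketch (illustration only): cache_dvpref() returns a
 * referenced, unlocked parent directory vnode or NULL, and the caller
 * releases it with vrele() -- this mirrors how cache_resolve() below
 * uses it around VOP_NRESOLVE().
 */
#if 0
	struct vnode *dvp;

	dvp = cache_dvpref(ncp);	/* ncp is locked by the caller */
	if (dvp) {
		/* ... use the referenced, unlocked dvp ... */
		vrele(dvp);
	}
#endif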
2098 * Convert a directory vnode to a namecache record without any other
2099 * knowledge of the topology. This ONLY works with directory vnodes and
2100 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the
2101 * returned ncp (if not NULL) will be held and unlocked.
2103 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2104 * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2105 * for dvp. This will fail only if the directory has been deleted out from
2106 * under the caller.
2108 * Callers must always check for a NULL return no matter the value of 'makeit'.
2110 * To avoid underflowing the kernel stack each recursive call increments
2111 * the makeit variable.
2114 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2115 struct vnode *dvp, char *fakename);
2116 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2117 struct vnode **saved_dvp);
2120 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2121 struct nchandle *nch)
2123 struct vnode *saved_dvp;
2124 struct vnode *pvp;
2125 char *fakename;
2126 int error;
2128 nch->ncp = NULL;
2129 nch->mount = dvp->v_mount;
2130 saved_dvp = NULL;
2131 fakename = NULL;
2134 * Handle the makeit == 0 degenerate case
2136 if (makeit == 0) {
2137 spin_lock_shared(&dvp->v_spin);
2138 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2139 if (nch->ncp)
2140 cache_hold(nch);
2141 spin_unlock_shared(&dvp->v_spin);
2145 * Loop until resolution, inside code will break out on error.
2147 while (makeit) {
2149 * Break out if we successfully acquire a working ncp.
2151 spin_lock_shared(&dvp->v_spin);
2152 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2153 if (nch->ncp) {
2154 cache_hold(nch);
2155 spin_unlock_shared(&dvp->v_spin);
2156 break;
2158 spin_unlock_shared(&dvp->v_spin);
2161 * If dvp is the root of its filesystem it should already
2162 * have a namecache pointer associated with it as a side
2163 * effect of the mount, but it may have been disassociated.
2165 if (dvp->v_flag & VROOT) {
2166 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2167 error = cache_resolve_mp(nch->mount);
2168 _cache_put(nch->ncp);
2169 if (ncvp_debug) {
2170 kprintf("cache_fromdvp: resolve root of mount %p error %d",
2171 dvp->v_mount, error);
2173 if (error) {
2174 if (ncvp_debug)
2175 kprintf(" failed\n");
2176 nch->ncp = NULL;
2177 break;
2179 if (ncvp_debug)
2180 kprintf(" succeeded\n");
2181 continue;
2185 * If we are recursed too deeply resort to an O(n^2)
2186 * algorithm to resolve the namecache topology. The
2187 * resolved pvp is left referenced in saved_dvp to
2188 * prevent the tree from being destroyed while we loop.
2190 if (makeit > 20) {
2191 error = cache_fromdvp_try(dvp, cred, &saved_dvp);
2192 if (error) {
2193 kprintf("lookupdotdot(longpath) failed %d "
2194 "dvp %p\n", error, dvp);
2195 nch->ncp = NULL;
2196 break;
2198 continue;
2202 * Get the parent directory and resolve its ncp.
2204 if (fakename) {
2205 kfree(fakename, M_TEMP);
2206 fakename = NULL;
2208 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2209 &fakename);
2210 if (error) {
2211 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
2212 break;
2214 vn_unlock(pvp);
2217 * Reuse makeit as a recursion depth counter. On success
2218 * nch will be fully referenced.
2220 cache_fromdvp(pvp, cred, makeit + 1, nch);
2221 vrele(pvp);
2222 if (nch->ncp == NULL)
2223 break;
2226 * Do an inefficient scan of pvp (embodied by ncp) to look
2227 * for dvp. This will create a namecache record for dvp on
2228 * success. We loop up to recheck on success.
2230 * ncp and dvp are both held but not locked.
2232 error = cache_inefficient_scan(nch, cred, dvp, fakename);
2233 if (error) {
2234 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
2235 pvp, nch->ncp->nc_name, dvp);
2236 cache_drop(nch);
2237 /* nch was NULLed out, reload mount */
2238 nch->mount = dvp->v_mount;
2239 break;
2241 if (ncvp_debug) {
2242 kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
2243 pvp, nch->ncp->nc_name);
2245 cache_drop(nch);
2246 /* nch was NULLed out, reload mount */
2247 nch->mount = dvp->v_mount;
2251 * If nch->ncp is non-NULL it will have been held already.
2253 if (fakename)
2254 kfree(fakename, M_TEMP);
2255 if (saved_dvp)
2256 vrele(saved_dvp);
2257 if (nch->ncp)
2258 return (0);
2259 return (EINVAL);
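/*
 * Hedged usage sketch for the NFS-server style conversion above (the
 * helper name example_dvp_to_nch() is hypothetical): dvp is passed in
 * referenced but unlocked, and the resulting nchandle is held but not
 * locked and must be released with cache_drop().
 */
#if 0
static int
example_dvp_to_nch(struct vnode *dvp, struct ucred *cred)
{
	struct nchandle nch;
	int error;

	error = cache_fromdvp(dvp, cred, 1, &nch);
	if (error == 0 && nch.ncp != NULL) {
		/* ... use the held, unlocked nch ... */
		cache_drop(&nch);
	}
	return (error);
}
#endif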
2263 * Go up the chain of parent directories until we find something
2264 * we can resolve into the namecache. This is very inefficient.
2266 static
2268 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2269 struct vnode **saved_dvp)
2271 struct nchandle nch;
2272 struct vnode *pvp;
2273 int error;
2274 static time_t last_fromdvp_report;
2275 char *fakename;
2278 * Loop getting the parent directory vnode until we get something we
2279 * can resolve in the namecache.
2281 vref(dvp);
2282 nch.mount = dvp->v_mount;
2283 nch.ncp = NULL;
2284 fakename = NULL;
2286 for (;;) {
2287 if (fakename) {
2288 kfree(fakename, M_TEMP);
2289 fakename = NULL;
2291 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2292 &fakename);
2293 if (error) {
2294 vrele(dvp);
2295 break;
2297 vn_unlock(pvp);
2298 spin_lock_shared(&pvp->v_spin);
2299 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2300 _cache_hold(nch.ncp);
2301 spin_unlock_shared(&pvp->v_spin);
2302 vrele(pvp);
2303 break;
2305 spin_unlock_shared(&pvp->v_spin);
2306 if (pvp->v_flag & VROOT) {
2307 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2308 error = cache_resolve_mp(nch.mount);
2309 _cache_unlock(nch.ncp);
2310 vrele(pvp);
2311 if (error) {
2312 _cache_drop(nch.ncp);
2313 nch.ncp = NULL;
2314 vrele(dvp);
2316 break;
2318 vrele(dvp);
2319 dvp = pvp;
2321 if (error == 0) {
2322 if (last_fromdvp_report != time_uptime) {
2323 last_fromdvp_report = time_uptime;
2324 kprintf("Warning: extremely inefficient path "
2325 "resolution on %s\n",
2326 nch.ncp->nc_name);
2328 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2331 * Hopefully dvp now has a namecache record associated with
2332 * it. Leave it referenced to prevent the kernel from
2333 * recycling the vnode. Otherwise extremely long directory
2334 * paths could result in endless recycling.
2336 if (*saved_dvp)
2337 vrele(*saved_dvp);
2338 *saved_dvp = dvp;
2339 _cache_drop(nch.ncp);
2341 if (fakename)
2342 kfree(fakename, M_TEMP);
2343 return (error);
2347 * Do an inefficient scan of the directory represented by ncp looking for
2348 * the directory vnode dvp. ncp must be held but not locked on entry and
2349 * will be held on return. dvp must be refd but not locked on entry and
2350 * will remain refd on return.
2352 * Why do this at all? Well, due to its stateless nature the NFS server
2353 * converts file handles directly to vnodes without necessarily going through
2354 * the namecache ops that would otherwise create the namecache topology
2355 * leading to the vnode. We could either (1) Change the namecache algorithms
2356 * to allow disconnected namecache records that are re-merged opportunistically,
2357 * or (2) Make the NFS server backtrack and scan to recover a connected
2358 * namecache topology in order to then be able to issue new API lookups.
2360 * It turns out that (1) is a huge mess. It takes a nice clean set of
2361 * namecache algorithms and introduces a lot of complication in every subsystem
2362 * that calls into the namecache to deal with the re-merge case, especially
2363 * since we are using the namecache to placehold negative lookups and the
2364 * vnode might not be immediately assigned. (2) is certainly far less
2365 * efficient than (1), but since we are only talking about directories here
2366 * (which are likely to remain cached), the case does not actually run all
2367 * that often and has the supreme advantage of not polluting the namecache
2368 * algorithms.
2370 * If a fakename is supplied just construct a namecache entry using the
2371 * fake name.
2373 static int
2374 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2375 struct vnode *dvp, char *fakename)
2377 struct nlcomponent nlc;
2378 struct nchandle rncp;
2379 struct dirent *den;
2380 struct vnode *pvp;
2381 struct vattr vat;
2382 struct iovec iov;
2383 struct uio uio;
2384 int blksize;
2385 int eofflag;
2386 int bytes;
2387 char *rbuf;
2388 int error;
2390 vat.va_blocksize = 0;
2391 if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2392 return (error);
2393 cache_lock(nch);
2394 error = cache_vref(nch, cred, &pvp);
2395 cache_unlock(nch);
2396 if (error)
2397 return (error);
2398 if (ncvp_debug) {
2399 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2400 "vattr fileid = %lld\n",
2401 nch->ncp, nch->ncp->nc_name,
2402 vat.va_blocksize,
2403 (long long)vat.va_fileid);
2407 * Use the supplied fakename if not NULL. Fake names are typically
2408 * not in the actual filesystem hierarchy. This is used by HAMMER
2409 * to glue @@timestamp recursions together.
2411 if (fakename) {
2412 nlc.nlc_nameptr = fakename;
2413 nlc.nlc_namelen = strlen(fakename);
2414 rncp = cache_nlookup(nch, &nlc);
2415 goto done;
2418 if ((blksize = vat.va_blocksize) == 0)
2419 blksize = DEV_BSIZE;
2420 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2421 rncp.ncp = NULL;
2423 eofflag = 0;
2424 uio.uio_offset = 0;
2425 again:
2426 iov.iov_base = rbuf;
2427 iov.iov_len = blksize;
2428 uio.uio_iov = &iov;
2429 uio.uio_iovcnt = 1;
2430 uio.uio_resid = blksize;
2431 uio.uio_segflg = UIO_SYSSPACE;
2432 uio.uio_rw = UIO_READ;
2433 uio.uio_td = curthread;
2435 if (ncvp_debug >= 2)
2436 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2437 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2438 if (error == 0) {
2439 den = (struct dirent *)rbuf;
2440 bytes = blksize - uio.uio_resid;
2442 while (bytes > 0) {
2443 if (ncvp_debug >= 2) {
2444 kprintf("cache_inefficient_scan: %*.*s\n",
2445 den->d_namlen, den->d_namlen,
2446 den->d_name);
2448 if (den->d_type != DT_WHT &&
2449 den->d_ino == vat.va_fileid) {
2450 if (ncvp_debug) {
2451 kprintf("cache_inefficient_scan: "
2452 "MATCHED inode %lld path %s/%*.*s\n",
2453 (long long)vat.va_fileid,
2454 nch->ncp->nc_name,
2455 den->d_namlen, den->d_namlen,
2456 den->d_name);
2458 nlc.nlc_nameptr = den->d_name;
2459 nlc.nlc_namelen = den->d_namlen;
2460 rncp = cache_nlookup(nch, &nlc);
2461 KKASSERT(rncp.ncp != NULL);
2462 break;
2464 bytes -= _DIRENT_DIRSIZ(den);
2465 den = _DIRENT_NEXT(den);
2467 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2468 goto again;
2470 kfree(rbuf, M_TEMP);
2471 done:
2472 vrele(pvp);
2473 if (rncp.ncp) {
2474 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2475 _cache_setvp(rncp.mount, rncp.ncp, dvp);
2476 if (ncvp_debug >= 2) {
2477 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2478 nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2480 } else {
2481 if (ncvp_debug >= 2) {
2482 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2483 nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2484 rncp.ncp->nc_vp);
2487 if (rncp.ncp->nc_vp == NULL)
2488 error = rncp.ncp->nc_error;
2490 * Release rncp after a successful nlookup. rncp was fully
2491 * referenced.
2493 cache_put(&rncp);
2494 } else {
2495 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2496 dvp, nch->ncp->nc_name);
2497 error = ENOENT;
2499 return (error);
2503 * This function must be called with the ncp held and locked and will unlock
2504 * and drop it during zapping.
2506 * Zap a namecache entry. The ncp is unconditionally set to an unresolved
2507 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list
2508 * and removes the related reference. If the ncp can be removed, and the
2509 * parent can be zapped non-blocking, this function loops up.
2511 * There will be one ref from the caller (which we now own). The only
2512 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list,
2513 * so possibly 2 refs left. Taking this into account, if there are no
2514 * additional refs and no children, the ncp will be removed from the topology
2515 * and destroyed.
2517 * References and/or children may exist if the ncp is in the middle of the
2518 * topology, preventing the ncp from being destroyed.
2520 * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2522 * This function may return a held (but NOT locked) parent node which the
2523 * caller must drop in a loop. Looping is one way to avoid unbounded recursion
2524 * due to deep namecache trees.
2526 * WARNING! For MPSAFE operation this routine must acquire up to three
2527 * spin locks to be able to safely test nc_refs. Lock order is
2528 * very important.
2530 * hash spinlock if on hash list
2531 * parent spinlock if child of parent
2532 * (the ncp is unresolved so there is no vnode association)
2534 static void
2535 cache_zap(struct namecache *ncp)
2537 struct namecache *par;
2538 struct vnode *dropvp;
2539 struct nchash_head *nchpp;
2540 int refcmp;
2541 int nonblock = 1; /* XXX cleanup */
2543 again:
2545 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2546 * This gets rid of any vp->v_namecache list or negative list and
2547 * the related ref.
2549 _cache_setunresolved(ncp);
2552 * Try to scrap the entry and possibly tail-recurse on its parent.
2553 * We only scrap unref'd (other than our ref) unresolved entries,
2554 * we do not scrap 'live' entries.
2556 * If nc_parent is non NULL we expect 2 references, else just 1.
2557 * If there are more, someone else also holds the ncp and we cannot
2558 * destroy it.
2560 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2561 KKASSERT(ncp->nc_refs > 0);
2564 * If the ncp is linked to its parent it will also be in the hash
2565 * table. We have to be able to lock the parent and the hash table.
2567 * Acquire locks. Note that the parent can't go away while we hold
2568 * a child locked. If nc_parent is present, expect 2 refs instead
2569 * of 1.
2571 nchpp = NULL;
2572 if ((par = ncp->nc_parent) != NULL) {
2573 if (nonblock) {
2574 if (_cache_lock_nonblock(par)) {
2575 /* lock failed */
2576 ncp->nc_flag |= NCF_DEFEREDZAP;
2577 atomic_add_long(
2578 &pcpu_ncache[mycpu->gd_cpuid].numdefered,
2580 _cache_unlock(ncp);
2581 _cache_drop(ncp); /* caller's ref */
2582 return;
2584 _cache_hold(par);
2585 } else {
2586 _cache_hold(par);
2587 _cache_lock(par);
2589 nchpp = ncp->nc_head;
2590 spin_lock(&nchpp->spin);
2594 * With the parent and nchpp locked, and the vnode removed
2595 * (no vp->v_namecache), we expect 1 or 2 refs. If there are
2596 * more someone else has a ref and we cannot zap the entry.
2598 * one for our hold
2599 * one for our parent link (parent also has one from the linkage)
2601 if (par)
2602 refcmp = 2;
2603 else
2604 refcmp = 1;
2607 * On failure undo the work we've done so far and drop the
2608 * caller's ref and ncp.
2610 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) {
2611 if (par) {
2612 spin_unlock(&nchpp->spin);
2613 _cache_put(par);
2615 _cache_unlock(ncp);
2616 _cache_drop(ncp);
2617 return;
2621 * We own all the refs and with the spinlocks held no further
2622 * refs can be acquired by others.
2624 * Remove us from the hash list and parent list. We have to
2625 * drop a ref on the parent's vp if the parent's list becomes
2626 * empty.
2628 dropvp = NULL;
2629 if (par) {
2630 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
2632 KKASSERT(nchpp == ncp->nc_head);
2633 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
2634 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2635 atomic_add_long(&pn->vfscache_count, -1);
2636 if (TAILQ_EMPTY(&ncp->nc_list))
2637 atomic_add_long(&pn->vfscache_leafs, -1);
2639 if (TAILQ_EMPTY(&par->nc_list)) {
2640 atomic_add_long(&pn->vfscache_leafs, 1);
2641 if (par->nc_vp)
2642 dropvp = par->nc_vp;
2644 ncp->nc_parent = NULL;
2645 ncp->nc_head = NULL;
2646 spin_unlock(&nchpp->spin);
2647 _cache_drop(par); /* removal of ncp from par->nc_list */
2648 /*_cache_unlock(par);*/
2649 } else {
2650 KKASSERT(ncp->nc_head == NULL);
2654 * ncp should not have picked up any refs. Physically
2655 * destroy the ncp.
2657 if (ncp->nc_refs != refcmp) {
2658 panic("cache_zap: %p bad refs %d (expected %d)\n",
2659 ncp, ncp->nc_refs, refcmp);
2661 /* _cache_unlock(ncp) not required */
2662 ncp->nc_refs = -1; /* safety */
2663 if (ncp->nc_name)
2664 kfree(ncp->nc_name, M_VFSCACHE);
2665 kfree(ncp, M_VFSCACHE);
2668 * Delayed drop (we had to release our spinlocks)
2670 if (dropvp)
2671 vdrop(dropvp);
2674 * Loop up if we can recursively clean out the parent.
2676 if (par) {
2677 refcmp = 1; /* ref on parent */
2678 if (par->nc_parent) /* par->par */
2679 ++refcmp;
2680 par->nc_flag &= ~NCF_DEFEREDZAP;
2681 if ((par->nc_flag & NCF_UNRESOLVED) &&
2682 par->nc_refs == refcmp &&
2683 TAILQ_EMPTY(&par->nc_list)) {
2684 ncp = par;
2685 goto again;
2687 _cache_unlock(par);
2688 _cache_drop(par);
2693 * Clean up dangling negative cache and deferred-drop entries in the
2694 * namecache.
2696 * This routine is called in the critical path and also called from
2697 * vnlru(). When called from vnlru we use a lower limit to try to
2698 * deal with the negative cache before the critical path has to start
2699 * dealing with it.
2701 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2703 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2704 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2706 void
2707 cache_hysteresis(int critpath)
2709 long poslimit;
2710 long neglimit = maxvnodes / ncnegfactor;
2711 long xnumcache = vfscache_leafs;
2713 if (critpath == 0)
2714 neglimit = neglimit * 8 / 10;
2717 * Don't cache too many negative hits. We use hysteresis to reduce
2718 * the impact on the critical path.
2720 switch(neg_cache_hysteresis_state[critpath]) {
2721 case CHI_LOW:
2722 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
2723 if (critpath)
2724 _cache_cleanneg(ncnegflush);
2725 else
2726 _cache_cleanneg(ncnegflush +
2727 vfscache_negs - neglimit);
2728 neg_cache_hysteresis_state[critpath] = CHI_HIGH;
2730 break;
2731 case CHI_HIGH:
2732 if (vfscache_negs > MINNEG * 9 / 10 &&
2733 vfscache_negs * 9 / 10 > neglimit
2735 if (critpath)
2736 _cache_cleanneg(ncnegflush);
2737 else
2738 _cache_cleanneg(ncnegflush +
2739 vfscache_negs * 9 / 10 -
2740 neglimit);
2741 } else {
2742 neg_cache_hysteresis_state[critpath] = CHI_LOW;
2744 break;
2748 * Don't cache too many positive hits. We use hysteresis to reduce
2749 * the impact on the critical path.
2751 * Excessive positive hits can accumulate due to large numbers of
2752 * hardlinks (the vnode cache will not prevent hl ncps from growing
2753 * into infinity).
2755 if ((poslimit = ncposlimit) == 0)
2756 poslimit = maxvnodes * 2;
2757 if (critpath == 0)
2758 poslimit = poslimit * 8 / 10;
2760 switch(pos_cache_hysteresis_state[critpath]) {
2761 case CHI_LOW:
2762 if (xnumcache > poslimit && xnumcache > MINPOS) {
2763 if (critpath)
2764 _cache_cleanpos(ncposflush);
2765 else
2766 _cache_cleanpos(ncposflush +
2767 xnumcache - poslimit);
2768 pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2770 break;
2771 case CHI_HIGH:
2772 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2773 if (critpath)
2774 _cache_cleanpos(ncposflush);
2775 else
2776 _cache_cleanpos(ncposflush +
2777 xnumcache - poslimit * 5 / 6);
2778 } else {
2779 pos_cache_hysteresis_state[critpath] = CHI_LOW;
2781 break;
2785 * Clean out dangling deferred-zap ncps which could not be cleanly
2786 * dropped if too many build up. Note that numdefered is
2787 * heuristic. Make sure we are real-time for the current cpu,
2788 * plus the global rollup.
2790 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
2791 _cache_cleandefered();
2796 * NEW NAMECACHE LOOKUP API
2798 * Lookup an entry in the namecache. The passed par_nch must be referenced
2799 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp
2800 * is ALWAYS returned, even if the supplied component is illegal.
2802 * The resulting namecache entry should be returned to the system with
2803 * cache_put() or cache_unlock() + cache_drop().
2805 * namecache locks are recursive but care must be taken to avoid lock order
2806 * reversals (hence why the passed par_nch must be unlocked). Locking
2807 * rules are ordered for parent traversals, not for child traversals.
2809 * Nobody else will be able to manipulate the associated namespace (e.g.
2810 * create, delete, rename, rename-target) until the caller unlocks the
2811 * entry.
2813 * The returned entry will be in one of three states: positive hit (non-null
2814 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2815 * Unresolved entries must be resolved through the filesystem to associate the
2816 * vnode and/or determine whether a positive or negative hit has occurred.
2818 * It is not necessary to lock a directory in order to lock namespace under
2819 * that directory. In fact, it is explicitly not allowed to do that. A
2820 * directory is typically only locked when being created, renamed, or
2821 * destroyed.
2823 * The directory (par) may be unresolved, in which case any returned child
2824 * will likely also be marked unresolved. Likely but not guaranteed. Since
2825 * the filesystem lookup requires a resolved directory vnode the caller is
2826 * responsible for resolving the namecache chain top-down. This API
2827 * specifically allows whole chains to be created in an unresolved state.
2829 struct nchandle
2830 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2832 struct nchandle nch;
2833 struct namecache *ncp;
2834 struct namecache *new_ncp;
2835 struct namecache *rep_ncp; /* reuse a destroyed ncp */
2836 struct nchash_head *nchpp;
2837 struct mount *mp;
2838 u_int32_t hash;
2839 globaldata_t gd;
2840 int par_locked;
2842 gd = mycpu;
2843 mp = par_nch->mount;
2844 par_locked = 0;
2847 * This is a good time to call it, no ncp's are locked by
2848 * the caller or us.
2850 cache_hysteresis(1);
2853 * Try to locate an existing entry
2855 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2856 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2857 new_ncp = NULL;
2858 nchpp = NCHHASH(hash);
2859 restart:
2860 rep_ncp = NULL;
2861 if (new_ncp)
2862 spin_lock(&nchpp->spin);
2863 else
2864 spin_lock_shared(&nchpp->spin);
2866 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
2868 * Break out if we find a matching entry. Note that
2869 * UNRESOLVED entries may match, but DESTROYED entries
2870 * do not.
2872 * We may be able to reuse DESTROYED entries that we come
2873 * across, even if the name does not match, as long as
2874 * nc_nlen is correct and the only hold ref is from the nchpp
2875 * list itself.
2877 if (ncp->nc_parent == par_nch->ncp &&
2878 ncp->nc_nlen == nlc->nlc_namelen) {
2879 if (ncp->nc_flag & NCF_DESTROYED) {
2880 if (ncp->nc_refs == 1 && rep_ncp == NULL)
2881 rep_ncp = ncp;
2882 continue;
2884 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))
2885 continue;
2886 _cache_hold(ncp);
2887 if (new_ncp)
2888 spin_unlock(&nchpp->spin);
2889 else
2890 spin_unlock_shared(&nchpp->spin);
2891 if (par_locked) {
2892 _cache_unlock(par_nch->ncp);
2893 par_locked = 0;
2895 if (_cache_lock_special(ncp) == 0) {
2897 * Successfully locked but we must re-test
2898 * conditions that might have changed since
2899 * we did not have the lock before.
2901 if (ncp->nc_parent != par_nch->ncp ||
2902 ncp->nc_nlen != nlc->nlc_namelen ||
2903 bcmp(ncp->nc_name, nlc->nlc_nameptr,
2904 ncp->nc_nlen) ||
2905 (ncp->nc_flag & NCF_DESTROYED)) {
2906 _cache_put(ncp);
2907 goto restart;
2909 _cache_auto_unresolve(mp, ncp);
2910 if (new_ncp)
2911 _cache_free(new_ncp);
2912 goto found;
2914 _cache_get(ncp); /* cycle the lock to block */
2915 _cache_put(ncp);
2916 _cache_drop(ncp);
2917 goto restart;
2922 * We failed to locate the entry, try to resurrect a destroyed
2923 * entry that we did find that is already correctly linked into
2924 * nchpp and the parent. We must re-test conditions after
2925 * successfully locking rep_ncp.
2927 * This case can occur under heavy loads due to not being able
2928 * to safely lock the parent in cache_zap(). Nominally a repeated
2929 * create/unlink load, but only the namelen needs to match.
2931 if (rep_ncp && new_ncp == NULL) {
2932 if (_cache_lock_nonblock(rep_ncp) == 0) {
2933 _cache_hold(rep_ncp);
2934 if (rep_ncp->nc_parent == par_nch->ncp &&
2935 rep_ncp->nc_nlen == nlc->nlc_namelen &&
2936 (rep_ncp->nc_flag & NCF_DESTROYED) &&
2937 rep_ncp->nc_refs == 2) {
2939 * Update nc_name, reusing the destroyed entry as if it were new.
2941 ncp = rep_ncp;
2942 bcopy(nlc->nlc_nameptr, ncp->nc_name,
2943 nlc->nlc_namelen);
2944 spin_unlock_shared(&nchpp->spin);
2945 _cache_setunresolved(ncp);
2946 ncp->nc_flag = NCF_UNRESOLVED;
2947 ncp->nc_error = ENOTCONN;
2948 goto found;
2950 _cache_put(rep_ncp);
2955 * Otherwise create a new entry and add it to the cache. The parent
2956 * ncp must also be locked so we can link into it.
2958 * We have to relookup after possibly blocking in kmalloc or
2959 * when locking par_nch.
2961 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2962 * mount case, in which case nc_name will be NULL.
2964 if (new_ncp == NULL) {
2965 spin_unlock_shared(&nchpp->spin);
2966 new_ncp = cache_alloc(nlc->nlc_namelen);
2967 if (nlc->nlc_namelen) {
2968 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2969 nlc->nlc_namelen);
2970 new_ncp->nc_name[nlc->nlc_namelen] = 0;
2972 goto restart;
2976 * NOTE! The spinlock is held exclusively here because new_ncp
2977 * is non-NULL.
2979 if (par_locked == 0) {
2980 spin_unlock(&nchpp->spin);
2981 _cache_lock(par_nch->ncp);
2982 par_locked = 1;
2983 goto restart;
2987 * Link to parent (requires another ref, the one already in new_ncp
2988 * is what we will return).
2990 * WARNING! We still hold the spinlock. We have to set the hash
2991 * table entry atomically.
2993 ncp = new_ncp;
2994 ++ncp->nc_refs;
2995 _cache_link_parent(ncp, par_nch->ncp, nchpp);
2996 spin_unlock(&nchpp->spin);
2997 _cache_unlock(par_nch->ncp);
2998 /* par_locked = 0 - not used */
2999 found:
3001 * stats and namecache size management
3003 if (ncp->nc_flag & NCF_UNRESOLVED)
3004 ++gd->gd_nchstats->ncs_miss;
3005 else if (ncp->nc_vp)
3006 ++gd->gd_nchstats->ncs_goodhits;
3007 else
3008 ++gd->gd_nchstats->ncs_neghits;
3009 nch.mount = mp;
3010 nch.ncp = ncp;
3011 _cache_mntref(nch.mount);
3013 return(nch);
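/*
 * Hedged lookup sketch (illustration only; example_lookup_child() is a
 * hypothetical helper): cache_nlookup() always returns a referenced,
 * locked entry, which can then be handed to cache_vget() and must be
 * released with cache_put() (or cache_unlock() + cache_drop()).
 */
#if 0
static int
example_lookup_child(struct nchandle *par_nch, char *name,
		     struct ucred *cred, struct vnode **vpp)
{
	struct nlcomponent nlc;
	struct nchandle nch;
	int error;

	nlc.nlc_nameptr = name;
	nlc.nlc_namelen = (int)strlen(name);

	/* par_nch must be referenced and unlocked */
	nch = cache_nlookup(par_nch, &nlc);
	error = cache_vget(&nch, cred, LK_SHARED, vpp);
	cache_put(&nch);

	return (error);		/* *vpp is locked and ref'd on success */
}
#endif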
3017 * Attempt to lookup a namecache entry and return with a shared namecache
3018 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is
3019 * set or we are unable to lock.
3022 cache_nlookup_maybe_shared(struct nchandle *par_nch,
3023 struct nlcomponent *nlc,
3024 int excl, struct nchandle *res_nch)
3026 struct namecache *ncp;
3027 struct nchash_head *nchpp;
3028 struct mount *mp;
3029 u_int32_t hash;
3030 globaldata_t gd;
3033 * If exclusive requested or shared namecache locks are disabled,
3034 * return failure.
3036 if (ncp_shared_lock_disable || excl)
3037 return(EWOULDBLOCK);
3039 gd = mycpu;
3040 mp = par_nch->mount;
3043 * This is a good time to call it, no ncp's are locked by
3044 * the caller or us.
3046 cache_hysteresis(1);
3049 * Try to locate an existing entry
3051 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3052 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3053 nchpp = NCHHASH(hash);
3055 spin_lock_shared(&nchpp->spin);
3057 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3059 * Break out if we find a matching entry. Note that
3060 * UNRESOLVED entries may match, but DESTROYED entries
3061 * do not.
3063 if (ncp->nc_parent == par_nch->ncp &&
3064 ncp->nc_nlen == nlc->nlc_namelen &&
3065 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3066 (ncp->nc_flag & NCF_DESTROYED) == 0
3068 _cache_hold(ncp);
3069 spin_unlock_shared(&nchpp->spin);
3071 if (_cache_lock_shared_special(ncp) == 0) {
3072 if (ncp->nc_parent == par_nch->ncp &&
3073 ncp->nc_nlen == nlc->nlc_namelen &&
3074 bcmp(ncp->nc_name, nlc->nlc_nameptr,
3075 ncp->nc_nlen) == 0 &&
3076 (ncp->nc_flag & NCF_DESTROYED) == 0 &&
3077 (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
3078 _cache_auto_unresolve_test(mp, ncp) == 0) {
3079 goto found;
3081 _cache_unlock(ncp);
3083 _cache_drop(ncp);
3084 return(EWOULDBLOCK);
3089 * Failure
3091 spin_unlock_shared(&nchpp->spin);
3092 return(EWOULDBLOCK);
3095 * Success
3097 * Note that nc_error might be non-zero (e.g. ENOENT).
3099 found:
3100 res_nch->mount = mp;
3101 res_nch->ncp = ncp;
3102 ++gd->gd_nchstats->ncs_goodhits;
3103 _cache_mntref(res_nch->mount);
3105 KKASSERT(ncp->nc_error != EWOULDBLOCK);
3106 return(ncp->nc_error);
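/*
 * Hedged sketch of the intended fallback pattern (illustration only;
 * par_nch and nlc are assumed from the caller's context): try the
 * non-blocking shared-lock lookup first and fall back to the normal
 * cache_nlookup() path when it returns EWOULDBLOCK.
 */
#if 0
	struct nchandle nch;
	int error;

	error = cache_nlookup_maybe_shared(par_nch, &nlc, 0, &nch);
	if (error == EWOULDBLOCK) {
		nch = cache_nlookup(par_nch, &nlc);
		error = nch.ncp->nc_error;
	}
	/* nch is referenced and locked in both cases; cache_put() when done */
#endif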
3110 * This is a non-blocking version of cache_nlookup() used by
3111 * nfs_readdirplusrpc_uio(). It can fail for any reason and
3112 * will return nch.ncp == NULL in that case.
3114 struct nchandle
3115 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3117 struct nchandle nch;
3118 struct namecache *ncp;
3119 struct namecache *new_ncp;
3120 struct nchash_head *nchpp;
3121 struct mount *mp;
3122 u_int32_t hash;
3123 globaldata_t gd;
3124 int par_locked;
3126 gd = mycpu;
3127 mp = par_nch->mount;
3128 par_locked = 0;
3131 * Try to locate an existing entry
3133 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3134 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3135 new_ncp = NULL;
3136 nchpp = NCHHASH(hash);
3137 restart:
3138 spin_lock(&nchpp->spin);
3139 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3141 * Break out if we find a matching entry. Note that
3142 * UNRESOLVED entries may match, but DESTROYED entries
3143 * do not.
3145 if (ncp->nc_parent == par_nch->ncp &&
3146 ncp->nc_nlen == nlc->nlc_namelen &&
3147 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3148 (ncp->nc_flag & NCF_DESTROYED) == 0
3150 _cache_hold(ncp);
3151 spin_unlock(&nchpp->spin);
3152 if (par_locked) {
3153 _cache_unlock(par_nch->ncp);
3154 par_locked = 0;
3156 if (_cache_lock_special(ncp) == 0) {
3157 if (ncp->nc_parent != par_nch->ncp ||
3158 ncp->nc_nlen != nlc->nlc_namelen ||
3159 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3160 (ncp->nc_flag & NCF_DESTROYED)) {
3161 kprintf("cache_nlookup_nonblock: "
3162 "ncp-race %p %*.*s\n",
3163 ncp,
3164 nlc->nlc_namelen,
3165 nlc->nlc_namelen,
3166 nlc->nlc_nameptr);
3167 _cache_unlock(ncp);
3168 _cache_drop(ncp);
3169 goto failed;
3171 _cache_auto_unresolve(mp, ncp);
3172 if (new_ncp) {
3173 _cache_free(new_ncp);
3174 new_ncp = NULL;
3176 goto found;
3178 _cache_drop(ncp);
3179 goto failed;
3184 * We failed to locate an entry, create a new entry and add it to
3185 * the cache. The parent ncp must also be locked so we
3186 * can link into it.
3188 * We have to relookup after possibly blocking in kmalloc or
3189 * when locking par_nch.
3191 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3192 * mount case, in which case nc_name will be NULL.
3194 if (new_ncp == NULL) {
3195 spin_unlock(&nchpp->spin);
3196 new_ncp = cache_alloc(nlc->nlc_namelen);
3197 if (nlc->nlc_namelen) {
3198 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3199 nlc->nlc_namelen);
3200 new_ncp->nc_name[nlc->nlc_namelen] = 0;
3202 goto restart;
3204 if (par_locked == 0) {
3205 spin_unlock(&nchpp->spin);
3206 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3207 par_locked = 1;
3208 goto restart;
3210 goto failed;
3214 * Link to parent (requires another ref, the one already in new_ncp
3215 * is what we will return).
3217 * WARNING! We still hold the spinlock. We have to set the hash
3218 * table entry atomically.
3220 ncp = new_ncp;
3221 ++ncp->nc_refs;
3222 _cache_link_parent(ncp, par_nch->ncp, nchpp);
3223 spin_unlock(&nchpp->spin);
3224 _cache_unlock(par_nch->ncp);
3225 /* par_locked = 0 - not used */
3226 found:
3228 * stats and namecache size management
3230 if (ncp->nc_flag & NCF_UNRESOLVED)
3231 ++gd->gd_nchstats->ncs_miss;
3232 else if (ncp->nc_vp)
3233 ++gd->gd_nchstats->ncs_goodhits;
3234 else
3235 ++gd->gd_nchstats->ncs_neghits;
3236 nch.mount = mp;
3237 nch.ncp = ncp;
3238 _cache_mntref(nch.mount);
3240 return(nch);
3241 failed:
3242 if (new_ncp) {
3243 _cache_free(new_ncp);
3244 new_ncp = NULL;
3246 nch.mount = NULL;
3247 nch.ncp = NULL;
3248 return(nch);
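/*
 * Hedged caller sketch (illustration only; par_nch and nlc come from the
 * caller's context): the non-blocking lookup either returns a referenced,
 * locked entry or nch.ncp == NULL, in which case the caller (e.g. the NFS
 * readdirplus path mentioned above) simply skips the optimization.
 */
#if 0
	struct nchandle nch;

	nch = cache_nlookup_nonblock(par_nch, &nlc);
	if (nch.ncp) {
		/* ... same handling as a cache_nlookup() result ... */
		cache_put(&nch);
	}
#endif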
3252 * This version is non-locking. The caller must validate the result
3253 * for parent-to-child continuity.
3255 * It can fail for any reason and will return nch.ncp == NULL in that case.
3257 struct nchandle
3258 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc)
3260 struct nchandle nch;
3261 struct namecache *ncp;
3262 struct nchash_head *nchpp;
3263 struct mount *mp;
3264 u_int32_t hash;
3265 globaldata_t gd;
3267 gd = mycpu;
3268 mp = par_nch->mount;
3271 * Try to locate an existing entry
3273 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3274 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3275 nchpp = NCHHASH(hash);
3277 spin_lock_shared(&nchpp->spin);
3278 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3280 * Break out if we find a matching entry. Note that
3281 * UNRESOLVED entries may match, but DESTROYED entries
3282 * do not.
3284 * Resolved NFS entries which have timed out fail so the
3285 * caller can rerun with normal locking.
3287 if (ncp->nc_parent == par_nch->ncp &&
3288 ncp->nc_nlen == nlc->nlc_namelen &&
3289 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3290 (ncp->nc_flag & NCF_DESTROYED) == 0
3292 if (_cache_auto_unresolve_test(par_nch->mount, ncp))
3293 break;
3294 _cache_hold(ncp);
3295 spin_unlock_shared(&nchpp->spin);
3296 goto found;
3299 spin_unlock_shared(&nchpp->spin);
3300 nch.mount = NULL;
3301 nch.ncp = NULL;
3302 return nch;
3303 found:
3305 * stats and namecache size management
3307 if (ncp->nc_flag & NCF_UNRESOLVED)
3308 ++gd->gd_nchstats->ncs_miss;
3309 else if (ncp->nc_vp)
3310 ++gd->gd_nchstats->ncs_goodhits;
3311 else
3312 ++gd->gd_nchstats->ncs_neghits;
3313 nch.mount = mp;
3314 nch.ncp = ncp;
3315 _cache_mntref(nch.mount);
3317 return(nch);
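/*
 * Hedged validation sketch (illustration only; par_nch and nlc come from
 * the caller's context): the non-locking lookup returns a held but
 * unlocked entry, so one plausible continuity check, per the note above,
 * is to re-verify the parent linkage before trusting the result and to
 * drop it if the check fails.
 */
#if 0
	struct nchandle nch;

	nch = cache_nlookup_nonlocked(par_nch, &nlc);
	if (nch.ncp && nch.ncp->nc_parent != par_nch->ncp) {
		cache_drop(&nch);
		nch.ncp = NULL;
	}
#endif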
3321 * The namecache entry is marked as being used as a mount point.
3322 * Locate the mount if it is visible to the caller. The DragonFly
3323 * mount system allows arbitrary loops in the topology and disentangles
3324 * those loops by matching against (mp, ncp) rather than just (ncp).
3325 * This means any given ncp can dive any number of mounts, depending
3326 * on the relative mount (e.g. nullfs) the caller is at in the topology.
3328 * We use a very simple frontend cache to reduce SMP conflicts,
3329 * which we have to do because the mountlist scan needs an exclusive
3330 * lock around its ripout info list. Not to mention that there might
3331 * be a lot of mounts.
3333 * Because all mounts can potentially be accessed by all cpus, break the cpus
3334 * down a bit to allow some contention rather than making the cache
3335 * excessively huge.
3337 * The hash table is split into per-cpu areas and is 4-way set-associative.
3339 struct findmount_info {
3340 struct mount *result;
3341 struct mount *nch_mount;
3342 struct namecache *nch_ncp;
3345 static __inline
3346 struct ncmount_cache *
3347 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp)
3349 uint32_t hash;
3351 hash = iscsi_crc32(&mp, sizeof(mp));
3352 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash);
3353 hash ^= hash >> 16;
3354 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1));
3356 return (&ncmount_cache[hash]);
3359 static
3360 struct ncmount_cache *
3361 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3363 struct ncmount_cache *ncc;
3364 struct ncmount_cache *best;
3365 int delta;
3366 int best_delta;
3367 int i;
3369 ncc = ncmount_cache_lookup4(mp, ncp);
3372 * NOTE: When checking for a ticks overflow implement a slop of
3373 * 2 ticks just to be safe, because ticks is accessed
3374 * non-atomically and one CPU can increment it while another
3375 * is still using the old value.
3377 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */
3378 return ncc;
3379 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */
3380 if (delta < -2) /* overflow reset */
3381 ncc->ticks = ticks;
3382 best = ncc;
3383 best_delta = delta;
3385 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */
3386 ++ncc;
3387 if (ncc->ncp == ncp && ncc->mp == mp)
3388 return ncc;
3389 delta = (int)(ticks - ncc->ticks);
3390 if (delta < -2)
3391 ncc->ticks = ticks;
3392 if (delta > best_delta) {
3393 best_delta = delta;
3394 best = ncc;
3397 return best;
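/*
 * Hedged illustration of the set selection above (the numeric values are
 * hypothetical): masking the hash with
 * (NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1) aligns the index to the
 * start of an NCMOUNT_SET-entry set, which ncmount_cache_lookup() then
 * scans linearly, returning the stalest entry (largest ticks delta) as
 * the replacement candidate on a miss.
 */
#if 0
	/* e.g. with NCMOUNT_NUMCACHE == 1024 and NCMOUNT_SET == 4 */
	uint32_t hash = 0x12345677;		/* made-up hash value */

	hash &= (1024 - 1) & ~(4 - 1);		/* -> 0x274, set-aligned */
	/* entries ncmount_cache[hash .. hash + 3] form one 4-way set */
#endif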
3401 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid
3402 * doing an expensive mountlist_scan*() if possible.
3404 * (mp, ncp) -> mountonpt.k
3406 * Returns a referenced mount pointer or NULL
3408 * General SMP operation uses a per-cpu umount_spin to interlock unmount
3409 * operations (that is, where the mp_target can be freed out from under us).
3411 * Lookups use the ncc->updating counter to validate the contents in order
3412 * to avoid having to obtain the per cache-element spin-lock. In addition,
3413 * the ticks field is only updated when it changes. However, if our per-cpu
3414 * lock fails due to an unmount-in-progress, we fall back to the
3415 * cache-element's spin-lock.
3417 struct mount *
3418 cache_findmount(struct nchandle *nch)
3420 struct findmount_info info;
3421 struct ncmount_cache *ncc;
3422 struct ncmount_cache ncc_copy;
3423 struct mount *target;
3424 struct pcpu_ncache *pcpu;
3425 struct spinlock *spinlk;
3426 int update;
3428 pcpu = pcpu_ncache;
3429 if (ncmount_cache_enable == 0 || pcpu == NULL) {
3430 ncc = NULL;
3431 goto skip;
3433 pcpu += mycpu->gd_cpuid;
3435 again:
3436 ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3437 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3438 found:
3440 * This is a bit messy for now because we do not yet have
3441 * safe disposal of mount structures. We have to ref
3442 * ncc->mp_target but the 'update' counter only tells us
3443 * whether the cache has changed after the fact.
3445 * For now get a per-cpu spinlock that will only contend
3446 * against umount's. This is the best path. If it fails,
3447 * instead of waiting on the umount we fall back to a
3448 * shared ncc->spin lock, which will generally only cost a
3449 * cache ping-pong.
3451 update = ncc->updating;
3452 if (__predict_true(spin_trylock(&pcpu->umount_spin))) {
3453 spinlk = &pcpu->umount_spin;
3454 } else {
3455 spinlk = &ncc->spin;
3456 spin_lock_shared(spinlk);
3458 if (update & 1) { /* update in progress */
3459 spin_unlock_any(spinlk);
3460 goto skip;
3462 ncc_copy = *ncc;
3463 cpu_lfence();
3464 if (ncc->updating != update) { /* content changed */
3465 spin_unlock_any(spinlk);
3466 goto again;
3468 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) {
3469 spin_unlock_any(spinlk);
3470 goto again;
3472 if (ncc_copy.isneg == 0) {
3473 target = ncc_copy.mp_target;
3474 if (target->mnt_ncmounton.mount == nch->mount &&
3475 target->mnt_ncmounton.ncp == nch->ncp) {
3477 * Cache hit (positive) (avoid dirtying
3478 * the cache line if possible)
3480 if (ncc->ticks != (int)ticks)
3481 ncc->ticks = (int)ticks;
3482 _cache_mntref(target);
3484 } else {
3486 * Cache hit (negative) (avoid dirtying
3487 * the cache line if possible)
3489 if (ncc->ticks != (int)ticks)
3490 ncc->ticks = (int)ticks;
3491 target = NULL;
3493 spin_unlock_any(spinlk);
3495 return target;
3497 skip:
3500 * Slow
3502 info.result = NULL;
3503 info.nch_mount = nch->mount;
3504 info.nch_ncp = nch->ncp;
3505 mountlist_scan(cache_findmount_callback, &info,
3506 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK);
3509 * To reduce multi-re-entry on the cache, relookup in the cache.
3510 * This can still race, obviously, but that's ok.
3512 ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3513 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3514 if (info.result)
3515 atomic_add_int(&info.result->mnt_refs, -1);
3516 goto found;
3520 * Cache the result.
3522 if ((info.result == NULL ||
3523 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) {
3524 spin_lock(&ncc->spin);
3525 atomic_add_int_nonlocked(&ncc->updating, 1);
3526 cpu_sfence();
3527 KKASSERT(ncc->updating & 1);
3528 if (ncc->mp != nch->mount) {
3529 if (ncc->mp)
3530 atomic_add_int(&ncc->mp->mnt_refs, -1);
3531 atomic_add_int(&nch->mount->mnt_refs, 1);
3532 ncc->mp = nch->mount;
3534 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/
3535 ncc->ticks = (int)ticks;
3537 if (info.result) {
3538 ncc->isneg = 0;
3539 if (ncc->mp_target != info.result) {
3540 if (ncc->mp_target)
3541 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3542 ncc->mp_target = info.result;
3543 atomic_add_int(&info.result->mnt_refs, 1);
3545 } else {
3546 ncc->isneg = 1;
3547 if (ncc->mp_target) {
3548 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3549 ncc->mp_target = NULL;
3552 cpu_sfence();
3553 atomic_add_int_nonlocked(&ncc->updating, 1);
3554 spin_unlock(&ncc->spin);
3556 return(info.result);
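/*
 * Hedged caller sketch (illustration only; nch comes from the caller's
 * context): a lookup that lands on a mount point checks for a covering
 * mount and, if one is found, takes the returned ref and later releases
 * it with cache_dropmount().
 */
#if 0
	struct mount *mp;

	mp = cache_findmount(&nch);
	if (mp) {
		/* ... cross into the mounted filesystem via mp ... */
		cache_dropmount(mp);
	}
#endif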
3559 static
3561 cache_findmount_callback(struct mount *mp, void *data)
3563 struct findmount_info *info = data;
3566 * Check the mount's mounted-on point against the passed nch.
3568 if (mp->mnt_ncmounton.mount == info->nch_mount &&
3569 mp->mnt_ncmounton.ncp == info->nch_ncp
3571 info->result = mp;
3572 _cache_mntref(mp);
3573 return(-1);
3575 return(0);
3578 void
3579 cache_dropmount(struct mount *mp)
3581 _cache_mntrel(mp);
3585 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive
3586 * or negative).
3588 * A full scan is not required, but for now just do it anyway.
3590 void
3591 cache_ismounting(struct mount *mp)
3593 struct ncmount_cache *ncc;
3594 struct mount *ncc_mp;
3595 int i;
3597 if (pcpu_ncache == NULL)
3598 return;
3600 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
3601 ncc = &ncmount_cache[i];
3602 if (ncc->mp != mp->mnt_ncmounton.mount ||
3603 ncc->ncp != mp->mnt_ncmounton.ncp) {
3604 continue;
3606 spin_lock(&ncc->spin);
3607 atomic_add_int_nonlocked(&ncc->updating, 1);
3608 cpu_sfence();
3609 KKASSERT(ncc->updating & 1);
3610 if (ncc->mp != mp->mnt_ncmounton.mount ||
3611 ncc->ncp != mp->mnt_ncmounton.ncp) {
3612 cpu_sfence();
3613 ++ncc->updating;
3614 spin_unlock(&ncc->spin);
3615 continue;
3617 ncc_mp = ncc->mp;
3618 ncc->ncp = NULL;
3619 ncc->mp = NULL;
3620 if (ncc_mp)
3621 atomic_add_int(&ncc_mp->mnt_refs, -1);
3622 ncc_mp = ncc->mp_target;
3623 ncc->mp_target = NULL;
3624 if (ncc_mp)
3625 atomic_add_int(&ncc_mp->mnt_refs, -1);
3626 ncc->ticks = (int)ticks - hz * 120;
3628 cpu_sfence();
3629 atomic_add_int_nonlocked(&ncc->updating, 1);
3630 spin_unlock(&ncc->spin);
3634 * Pre-cache the mount point
3636 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount,
3637 mp->mnt_ncmounton.ncp);
3639 spin_lock(&ncc->spin);
3640 atomic_add_int_nonlocked(&ncc->updating, 1);
3641 cpu_sfence();
3642 KKASSERT(ncc->updating & 1);
3644 if (ncc->mp)
3645 atomic_add_int(&ncc->mp->mnt_refs, -1);
3646 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1);
3647 ncc->mp = mp->mnt_ncmounton.mount;
3648 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */
3649 ncc->ticks = (int)ticks;
3651 ncc->isneg = 0;
3652 if (ncc->mp_target != mp) {
3653 if (ncc->mp_target)
3654 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3655 ncc->mp_target = mp;
3656 atomic_add_int(&mp->mnt_refs, 1);
3658 cpu_sfence();
3659 atomic_add_int_nonlocked(&ncc->updating, 1);
3660 spin_unlock(&ncc->spin);
3664 * Scrap any ncmount_cache entries related to mp. Not only do we need to
3665 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any
3666 * negative hits involving (mp, <any>).
3668 * A full scan is required.
3670 void
3671 cache_unmounting(struct mount *mp)
3673 struct ncmount_cache *ncc;
3674 struct pcpu_ncache *pcpu;
3675 struct mount *ncc_mp;
3676 int i;
3678 pcpu = pcpu_ncache;
3679 if (pcpu == NULL)
3680 return;
3682 for (i = 0; i < ncpus; ++i)
3683 spin_lock(&pcpu[i].umount_spin);
3685 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
3686 ncc = &ncmount_cache[i];
3687 if (ncc->mp != mp && ncc->mp_target != mp)
3688 continue;
3689 spin_lock(&ncc->spin);
3690 atomic_add_int_nonlocked(&ncc->updating, 1);
3691 cpu_sfence();
3693 if (ncc->mp != mp && ncc->mp_target != mp) {
3694 atomic_add_int_nonlocked(&ncc->updating, 1);
3695 cpu_sfence();
3696 spin_unlock(&ncc->spin);
3697 continue;
3699 ncc_mp = ncc->mp;
3700 ncc->ncp = NULL;
3701 ncc->mp = NULL;
3702 if (ncc_mp)
3703 atomic_add_int(&ncc_mp->mnt_refs, -1);
3704 ncc_mp = ncc->mp_target;
3705 ncc->mp_target = NULL;
3706 if (ncc_mp)
3707 atomic_add_int(&ncc_mp->mnt_refs, -1);
3708 ncc->ticks = (int)ticks - hz * 120;
3710 cpu_sfence();
3711 atomic_add_int_nonlocked(&ncc->updating, 1);
3712 spin_unlock(&ncc->spin);
3715 for (i = 0; i < ncpus; ++i)
3716 spin_unlock(&pcpu[i].umount_spin);
3720 * Resolve an unresolved namecache entry, generally by looking it up.
3721 * The passed ncp must be locked and refd.
3723 * Theoretically since a vnode cannot be recycled while held, and since
3724 * the nc_parent chain holds its vnode as long as children exist, the
3725 * direct parent of the cache entry we are trying to resolve should
3726 * have a valid vnode. If not then generate an error that we can
3727 * determine is related to a resolver bug.
3729 * However, if a vnode was in the middle of a recyclement when the NCP
3730 * got locked, ncp->nc_vp might point to a vnode that is about to become
3731 * invalid. cache_resolve() handles this case by unresolving the entry
3732 * and then re-resolving it.
3734 * Note that successful resolution does not necessarily return an error
3735 * code of 0. If the ncp resolves to a negative cache hit then ENOENT
3736 * will be returned.
3739 cache_resolve(struct nchandle *nch, struct ucred *cred)
3741 struct namecache *par_tmp;
3742 struct namecache *par;
3743 struct namecache *ncp;
3744 struct nchandle nctmp;
3745 struct mount *mp;
3746 struct vnode *dvp;
3747 int error;
3749 ncp = nch->ncp;
3750 mp = nch->mount;
3751 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
3752 restart:
3754 * If the ncp is already resolved we have nothing to do. However,
3755 * we do want to guarantee that a usable vnode is returned when
3756 * a vnode is present, so make sure it hasn't been reclaimed.
3758 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3759 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3760 _cache_setunresolved(ncp);
3761 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3762 return (ncp->nc_error);
3766 * If the ncp was destroyed it will never resolve again. This
3767 * can basically only happen when someone is chdir'd into an
3768 * empty directory which is then rmdir'd. We want to catch this
3769 * here and not dive the VFS because the VFS might actually
3770 * have a way to re-resolve the disconnected ncp, which will
3771 * result in inconsistencies in the cdir/nch for proc->p_fd.
3773 if (ncp->nc_flag & NCF_DESTROYED)
3774 return(EINVAL);
3777 * Mount points need special handling because the parent does not
3778 * belong to the same filesystem as the ncp.
3780 if (ncp == mp->mnt_ncmountpt.ncp)
3781 return (cache_resolve_mp(mp));
3784 * We expect an unbroken chain of ncps to at least the mount point,
3785 * and even all the way to root (but this code doesn't have to go
3786 * past the mount point).
3788 if (ncp->nc_parent == NULL) {
3789 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3790 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3791 ncp->nc_error = EXDEV;
3792 return(ncp->nc_error);
3796 * The vp's of the parent directories in the chain are held via vhold()
3797 * due to the existence of the child, and should not disappear.
3798 * However, there are cases where they can disappear:
3800 * - due to filesystem I/O errors.
3801 * - due to NFS being stupid about tracking the namespace and
3802 * destroying the namespace for entire directories quite often.
3803 * - due to forced unmounts.
3804 * - due to an rmdir (parent will be marked DESTROYED)
3806 * When this occurs we have to track the chain backwards and resolve
3807 * it, looping until the resolver catches up to the current node. We
3808 * could recurse here but we might run ourselves out of kernel stack
3809 * so we do it in a more painful manner. This situation really should
3810 * not occur all that often, or if it does not have to go back too
3811 * many nodes to resolve the ncp.
3813 while ((dvp = cache_dvpref(ncp)) == NULL) {
3815 * This case can occur if a process is CD'd into a
3816 * directory which is then rmdir'd. If the parent is marked
3817 * destroyed there is no point trying to resolve it.
3819 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3820 return(ENOENT);
3821 par = ncp->nc_parent;
3822 _cache_hold(par);
3823 _cache_lock(par);
3824 while ((par_tmp = par->nc_parent) != NULL &&
3825 par_tmp->nc_vp == NULL) {
3826 _cache_hold(par_tmp);
3827 _cache_lock(par_tmp);
3828 _cache_put(par);
3829 par = par_tmp;
3831 if (par->nc_parent == NULL) {
3832 kprintf("EXDEV case 2 %*.*s\n",
3833 par->nc_nlen, par->nc_nlen, par->nc_name);
3834 _cache_put(par);
3835 return (EXDEV);
3838 * The parent is not set in stone, ref and lock it to prevent
3839 * it from disappearing. Also note that due to renames it
3840 * is possible for our ncp to move and for par to no longer
3841 * be one of its parents. We resolve it anyway, the loop
3842 * will handle any moves.
3844 _cache_get(par); /* additional hold/lock */
3845 _cache_put(par); /* from earlier hold/lock */
3846 if (par == nch->mount->mnt_ncmountpt.ncp) {
3847 cache_resolve_mp(nch->mount);
3848 } else if ((dvp = cache_dvpref(par)) == NULL) {
3849 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
3850 par->nc_nlen, par->nc_nlen, par->nc_name);
3851 _cache_put(par);
3852 continue;
3853 } else {
3854 if (par->nc_flag & NCF_UNRESOLVED) {
3855 nctmp.mount = mp;
3856 nctmp.ncp = par;
3857 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3859 vrele(dvp);
3861 if ((error = par->nc_error) != 0) {
3862 if (par->nc_error != EAGAIN) {
3863 kprintf("EXDEV case 3 %*.*s error %d\n",
3864 par->nc_nlen, par->nc_nlen, par->nc_name,
3865 par->nc_error);
3866 _cache_put(par);
3867 return(error);
3869 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3870 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3872 _cache_put(par);
3873 /* loop */
3877 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3878 * ncp's and reattach them. If this occurs the original ncp is marked
3879 * EAGAIN to force a relookup.
3881 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3882 * ncp must already be resolved.
3884 if (dvp) {
3885 nctmp.mount = mp;
3886 nctmp.ncp = ncp;
3887 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3888 vrele(dvp);
3889 } else {
3890 ncp->nc_error = EPERM;
3892 if (ncp->nc_error == EAGAIN) {
3893 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3894 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3895 goto restart;
3897 return(ncp->nc_error);
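/*
 * Hedged caller sketch (illustration only; nch and cred come from the
 * caller's context): interpreting the return value of cache_resolve()
 * per the comment above -- 0 is a positive hit, ENOENT a successfully
 * resolved negative hit, anything else a hard error.
 */
#if 0
	int error;

	/* nch.ncp must be referenced and exclusively locked */
	error = cache_resolve(&nch, cred);
	if (error == 0) {
		/* positive hit: nch.ncp->nc_vp is usable */
	} else if (error == ENOENT) {
		/* negative hit: resolved, but no vnode exists */
	} else {
		/* resolver error, e.g. an I/O failure */
	}
#endif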
3901 * Resolve the ncp associated with a mount point. Such ncp's almost always
3902 * remain resolved and this routine is rarely called. NFS MPs tend to force
3903 * re-resolution more often due to their Mack-truck-smash-the-namecache
3904 * method of tracking namespace changes.
3906 * The semantics for this call is that the passed ncp must be locked on
3907 * entry and will be locked on return. However, if we actually have to
3908 * resolve the mount point we temporarily unlock the entry in order to
3909 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of
3910 * the unlock we have to recheck the flags after we relock.
3912 static int
3913 cache_resolve_mp(struct mount *mp)
3915 struct namecache *ncp = mp->mnt_ncmountpt.ncp;
3916 struct vnode *vp;
3917 int error;
3919 KKASSERT(mp != NULL);
3922 * If the ncp is already resolved we have nothing to do. However,
3923 * we do want to guarantee that a usable vnode is returned when
3924 * a vnode is present, so make sure it hasn't been reclaimed.
3926 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3927 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3928 _cache_setunresolved(ncp);
3931 if (ncp->nc_flag & NCF_UNRESOLVED) {
3932 _cache_unlock(ncp);
3933 while (vfs_busy(mp, 0))
3935 error = VFS_ROOT(mp, &vp);
3936 _cache_lock(ncp);
3939 * recheck the ncp state after relocking.
3941 if (ncp->nc_flag & NCF_UNRESOLVED) {
3942 ncp->nc_error = error;
3943 if (error == 0) {
3944 _cache_setvp(mp, ncp, vp);
3945 vput(vp);
3946 } else {
3947 kprintf("[diagnostic] cache_resolve_mp: failed"
3948 " to resolve mount %p err=%d ncp=%p\n",
3949 mp, error, ncp);
3950 _cache_setvp(mp, ncp, NULL);
3952 } else if (error == 0) {
3953 vput(vp);
3955 vfs_unbusy(mp);
3957 return(ncp->nc_error);
3961 * Clean out negative cache entries when too many have accumulated.
3963 static void
3964 _cache_cleanneg(long count)
3966 struct pcpu_ncache *pn;
3967 struct namecache *ncp;
3968 static uint32_t neg_rover;
3969 uint32_t n;
3970 long vnegs;
3972 n = neg_rover++; /* SMP heuristic, race ok */
3973 cpu_ccfence();
3974 n = n % (uint32_t)ncpus;
3977 * Normalize vfscache_negs and count. count is sometimes based
3978 * on vfscache_negs. vfscache_negs is heuristic and can sometimes
3979 * have crazy values.
3981 vnegs = vfscache_negs;
3982 cpu_ccfence();
3983 if (vnegs <= MINNEG)
3984 vnegs = MINNEG;
3985 if (count < 1)
3986 count = 1;
3988 pn = &pcpu_ncache[n];
3989 spin_lock(&pn->neg_spin);
3990 count = pn->neg_count * count / vnegs + 1;
3991 spin_unlock(&pn->neg_spin);
3994 * Attempt to clean out the specified number of negative cache
3995 * entries.
3997 while (count > 0) {
3998 spin_lock(&pn->neg_spin);
3999 ncp = TAILQ_FIRST(&pn->neg_list);
4000 if (ncp == NULL) {
4001 spin_unlock(&pn->neg_spin);
4002 break;
4004 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
4005 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
4006 _cache_hold(ncp);
4007 spin_unlock(&pn->neg_spin);
4010 * This can race, so we must re-check that the ncp
4011 * is on the ncneg.list after successfully locking it.
4013 if (_cache_lock_special(ncp) == 0) {
4014 if (ncp->nc_vp == NULL &&
4015 (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4016 cache_zap(ncp);
4017 } else {
4018 _cache_unlock(ncp);
4019 _cache_drop(ncp);
4021 } else {
4022 _cache_drop(ncp);
4024 --count;
4029 * Clean out positive cache entries when too many have accumulated.
4031 static void
4032 _cache_cleanpos(long count)
4034 static volatile int rover;
4035 struct nchash_head *nchpp;
4036 struct namecache *ncp;
4037 int rover_copy;
4040          * Attempt to clean out the specified number of positive cache
4041          * entries.
4043 while (count > 0) {
4044 rover_copy = ++rover; /* MPSAFEENOUGH */
4045 cpu_ccfence();
4046 nchpp = NCHHASH(rover_copy);
4048 if (TAILQ_FIRST(&nchpp->list) == NULL) {
4049 --count;
4050 continue;
4054 * Cycle ncp on list, ignore and do not move DUMMY
4055 * ncps. These are temporary list iterators.
4057 * We must cycle the ncp to the end of the list to
4058 * ensure that all ncp's have an equal chance of
4059 * being removed.
4061 spin_lock(&nchpp->spin);
4062 ncp = TAILQ_FIRST(&nchpp->list);
4063 while (ncp && (ncp->nc_flag & NCF_DUMMY))
4064 ncp = TAILQ_NEXT(ncp, nc_hash);
4065 if (ncp) {
4066 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
4067 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
4068 _cache_hold(ncp);
4070 spin_unlock(&nchpp->spin);
4072 if (ncp) {
4073 if (_cache_lock_special(ncp) == 0) {
4074 cache_zap(ncp);
4075 } else {
4076 _cache_drop(ncp);
4079 --count;
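/*
 * Illustrative sketch (not part of vfs_cache.c): the "cycle to the
 * tail" scan used by _cache_cleanpos() above.  The head element is
 * moved to the tail before being examined, so repeated partial scans
 * eventually visit every element.  demo_* names are invented.
 */
#include <sys/queue.h>
#include <stdio.h>

struct demo_node {
	TAILQ_ENTRY(demo_node)	link;
	int			id;
};
TAILQ_HEAD(demo_list, demo_node);

static struct demo_node *
demo_rotate_and_pick(struct demo_list *list)
{
	struct demo_node *n;

	n = TAILQ_FIRST(list);
	if (n != NULL) {
		/* Rotate the head to the tail, then hand it back. */
		TAILQ_REMOVE(list, n, link);
		TAILQ_INSERT_TAIL(list, n, link);
	}
	return (n);
}

int
main(void)
{
	struct demo_list list;
	struct demo_node nodes[3];
	int i;

	TAILQ_INIT(&list);
	for (i = 0; i < 3; i++) {
		nodes[i].id = i;
		TAILQ_INSERT_TAIL(&list, &nodes[i], link);
	}
	/* Prints 0 1 2 0 1: the scan wraps around fairly. */
	for (i = 0; i < 5; i++)
		printf("picked %d\n", demo_rotate_and_pick(&list)->id);
	return (0);
}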
4084 * This is a kitchen sink function to clean out ncps which we
4085 * tried to zap from cache_drop() but failed because we were
4086 * unable to acquire the parent lock.
4088 * Such entries can also be removed via cache_inval_vp(), such
4089 * as when unmounting.
4091 static void
4092 _cache_cleandefered(void)
4094 struct nchash_head *nchpp;
4095 struct namecache *ncp;
4096 struct namecache dummy;
4097 int i;
4100          * Create a list iterator. DUMMY indicates that this is a list
4101          * iterator, and DESTROYED prevents matches by lookup functions.
4103 numdefered = 0;
4104 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
4105 bzero(&dummy, sizeof(dummy));
4106 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
4107 dummy.nc_refs = 1;
4109 for (i = 0; i <= nchash; ++i) {
4110 nchpp = &nchashtbl[i];
4112 spin_lock(&nchpp->spin);
4113 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
4114 ncp = &dummy;
4115 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
4116 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
4117 continue;
4118 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4119 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash);
4120 _cache_hold(ncp);
4121 spin_unlock(&nchpp->spin);
4122 if (_cache_lock_nonblock(ncp) == 0) {
4123 ncp->nc_flag &= ~NCF_DEFEREDZAP;
4124 _cache_unlock(ncp);
4126 _cache_drop(ncp);
4127 spin_lock(&nchpp->spin);
4128 ncp = &dummy;
4130 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4131 spin_unlock(&nchpp->spin);
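/*
 * Illustrative sketch (not part of vfs_cache.c): the marker-node
 * ("DUMMY") iteration used by _cache_cleandefered() above.  A dummy
 * element stays on the list so the scan position survives dropping the
 * list lock while an element is processed.  demo_* names are invented.
 */
#include <sys/queue.h>
#include <pthread.h>
#include <stdbool.h>

struct demo_node {
	TAILQ_ENTRY(demo_node)	link;
	bool			is_marker;
	bool			deferred;
};
TAILQ_HEAD(demo_list, demo_node);

static void
demo_scan_deferred(struct demo_list *list, pthread_mutex_t *lock,
		   void (*process)(struct demo_node *))
{
	struct demo_node marker = { .is_marker = true };
	struct demo_node *n;

	pthread_mutex_lock(lock);
	TAILQ_INSERT_HEAD(list, &marker, link);
	n = &marker;
	while ((n = TAILQ_NEXT(n, link)) != NULL) {
		if (n->is_marker || !n->deferred)
			continue;
		/* Re-anchor the marker just past n, then drop the lock. */
		TAILQ_REMOVE(list, &marker, link);
		TAILQ_INSERT_AFTER(list, n, &marker, link);
		pthread_mutex_unlock(lock);
		process(n);			/* may block */
		pthread_mutex_lock(lock);
		n = &marker;			/* resume from the marker */
	}
	TAILQ_REMOVE(list, &marker, link);
	pthread_mutex_unlock(lock);
}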
4136 * Name cache initialization, from vfsinit() when we are booting
4138 void
4139 nchinit(void)
4141 struct pcpu_ncache *pn;
4142 globaldata_t gd;
4143 int i;
4146 * Per-cpu accounting and negative hit list
4148 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
4149 M_VFSCACHE, M_WAITOK|M_ZERO);
4150 for (i = 0; i < ncpus; ++i) {
4151 pn = &pcpu_ncache[i];
4152 TAILQ_INIT(&pn->neg_list);
4153 spin_init(&pn->neg_spin, "ncneg");
4154 spin_init(&pn->umount_spin, "ncumm");
4158          * Initialize per-cpu namecache effectiveness statistics.
4160 for (i = 0; i < ncpus; ++i) {
4161 gd = globaldata_find(i);
4162 gd->gd_nchstats = &nchstats[i];
4166 * Create a generous namecache hash table
4168 nchashtbl = hashinit_ext(vfs_inodehashsize(),
4169 sizeof(struct nchash_head),
4170 M_VFSCACHE, &nchash);
4171 for (i = 0; i <= (int)nchash; ++i) {
4172 TAILQ_INIT(&nchashtbl[i].list);
4173 spin_init(&nchashtbl[i].spin, "nchinit_hash");
4175 for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
4176 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
4177 nclockwarn = 5 * hz;
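/*
 * Illustrative sketch (not part of vfs_cache.c): a mask-style hash
 * table like the one built above.  The bucket count is a power of two
 * and the table is addressed with "hash & mask", consistent with the
 * "i <= nchash" initialization loops above which treat nchash as a
 * mask rather than a count.  demo_* names are invented; the round-up
 * sizing policy is a choice of this sketch, not a claim about
 * hashinit_ext().
 */
#include <sys/queue.h>
#include <stdlib.h>

struct demo_node;

struct demo_bucket {
	TAILQ_HEAD(, demo_node)	list;
};

static struct demo_bucket *
demo_hashinit(unsigned long elements, unsigned long *maskp)
{
	struct demo_bucket *tbl;
	unsigned long n, i;

	for (n = 1; n < elements; n <<= 1)	/* round up to power of 2 */
		;
	tbl = calloc(n, sizeof(*tbl));
	if (tbl == NULL)
		return (NULL);
	for (i = 0; i < n; i++)
		TAILQ_INIT(&tbl[i].list);
	*maskp = n - 1;		/* valid bucket indexes are 0..mask */
	return (tbl);
}

/* Bucket selection mirrors the NCHHASH() idea: index = hash & mask. */
static struct demo_bucket *
demo_hash_bucket(struct demo_bucket *tbl, unsigned long mask,
		 unsigned long hash)
{
	return (&tbl[hash & mask]);
}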
4181 * Called from start_init() to bootstrap the root filesystem. Returns
4182 * a referenced, unlocked namecache record.
4184 void
4185 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4187 nch->ncp = cache_alloc(0);
4188 nch->mount = mp;
4189 _cache_mntref(mp);
4190 if (vp)
4191 _cache_setvp(nch->mount, nch->ncp, vp);
4195 * vfs_cache_setroot()
4197 * Create an association between the root of our namecache and
4198 * the root vnode. This routine may be called several times during
4199 * booting.
4201 * If the caller intends to save the returned namecache pointer somewhere
4202 * it must cache_hold() it.
4204 void
4205 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4207 struct vnode *ovp;
4208 struct nchandle onch;
4210 ovp = rootvnode;
4211 onch = rootnch;
4212 rootvnode = nvp;
4213 if (nch)
4214 rootnch = *nch;
4215 else
4216 cache_zero(&rootnch);
4217 if (ovp)
4218 vrele(ovp);
4219 if (onch.ncp)
4220 cache_drop(&onch);
4224 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache
4225 * topology and is being removed as quickly as possible. The new VOP_N*()
4226 * API calls are required to make specific adjustments using the supplied
4227  * ncp pointers rather than just bogusly purging random vnodes.
4229 * Invalidate all namecache entries to a particular vnode as well as
4230 * any direct children of that vnode in the namecache. This is a
4231 * 'catch all' purge used by filesystems that do not know any better.
4233 * Note that the linkage between the vnode and its namecache entries will
4234 * be removed, but the namecache entries themselves might stay put due to
4235  * active references from elsewhere in the system or due to the existence of
4236 * the children. The namecache topology is left intact even if we do not
4237 * know what the vnode association is. Such entries will be marked
4238 * NCF_UNRESOLVED.
4240 void
4241 cache_purge(struct vnode *vp)
4243 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
4246 static int disablecwd;
4247 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
4248 "Disable getcwd");
4250 static u_long numcwdcalls;
4251 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
4252 "Number of current directory resolution calls");
4253 static u_long numcwdfailnf;
4254 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
4255 "Number of current directory failures due to lack of file");
4256 static u_long numcwdfailsz;
4257 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
4258 "Number of current directory failures due to large result");
4259 static u_long numcwdfound;
4260 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
4261 "Number of current directory resolution successes");
4264 * MPALMOSTSAFE
4267 sys___getcwd(struct __getcwd_args *uap)
4269 u_int buflen;
4270 int error;
4271 char *buf;
4272 char *bp;
4274 if (disablecwd)
4275 return (ENODEV);
4277 buflen = uap->buflen;
4278 if (buflen == 0)
4279 return (EINVAL);
4280 if (buflen > MAXPATHLEN)
4281 buflen = MAXPATHLEN;
4283 buf = kmalloc(buflen, M_TEMP, M_WAITOK);
4284 bp = kern_getcwd(buf, buflen, &error);
4285 if (error == 0)
4286 error = copyout(bp, uap->buf, strlen(bp) + 1);
4287 kfree(buf, M_TEMP);
4288 return (error);
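/*
 * Illustrative userland sketch (not part of vfs_cache.c): on the BSDs,
 * getcwd(3) is typically a thin wrapper around the __getcwd() system
 * call handled above, with a slower fallback path in libc.  A short
 * result fits; an undersized buffer yields ERANGE, matching the
 * kernel-side size checks.
 */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[PATH_MAX];

	if (getcwd(buf, sizeof(buf)) != NULL)
		printf("cwd: %s\n", buf);
	else
		perror("getcwd");

	/* A 1-byte buffer cannot hold even "/" plus a NUL. */
	if (getcwd(buf, 1) == NULL && errno == ERANGE)
		printf("1-byte buffer correctly rejected with ERANGE\n");
	return (0);
}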
4291 char *
4292 kern_getcwd(char *buf, size_t buflen, int *error)
4294 struct proc *p = curproc;
4295 char *bp;
4296 int i, slash_prefixed;
4297 struct filedesc *fdp;
4298 struct nchandle nch;
4299 struct namecache *ncp;
4301 numcwdcalls++;
4302 bp = buf;
4303 bp += buflen - 1;
4304 *bp = '\0';
4305 fdp = p->p_fd;
4306 slash_prefixed = 0;
4308 nch = fdp->fd_ncdir;
4309 ncp = nch.ncp;
4310 if (ncp)
4311 _cache_hold(ncp);
4313 while (ncp && (ncp != fdp->fd_nrdir.ncp ||
4314 nch.mount != fdp->fd_nrdir.mount)
4317                  * While traversing upwards, if we encounter the root
4318                  * of the current mount we have to skip to the mount point
4319                  * in the underlying filesystem.
4321 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
4322 nch = nch.mount->mnt_ncmounton;
4323 _cache_drop(ncp);
4324 ncp = nch.ncp;
4325 if (ncp)
4326 _cache_hold(ncp);
4327 continue;
4331 * Prepend the path segment
4333 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4334 if (bp == buf) {
4335 numcwdfailsz++;
4336 *error = ERANGE;
4337 bp = NULL;
4338 goto done;
4340 *--bp = ncp->nc_name[i];
4342 if (bp == buf) {
4343 numcwdfailsz++;
4344 *error = ERANGE;
4345 bp = NULL;
4346 goto done;
4348 *--bp = '/';
4349 slash_prefixed = 1;
4352 * Go up a directory. This isn't a mount point so we don't
4353 * have to check again.
4355 while ((nch.ncp = ncp->nc_parent) != NULL) {
4356 if (ncp_shared_lock_disable)
4357 _cache_lock(ncp);
4358 else
4359 _cache_lock_shared(ncp);
4360 if (nch.ncp != ncp->nc_parent) {
4361 _cache_unlock(ncp);
4362 continue;
4364 _cache_hold(nch.ncp);
4365 _cache_unlock(ncp);
4366 break;
4368 _cache_drop(ncp);
4369 ncp = nch.ncp;
4371 if (ncp == NULL) {
4372 numcwdfailnf++;
4373 *error = ENOENT;
4374 bp = NULL;
4375 goto done;
4377 if (!slash_prefixed) {
4378 if (bp == buf) {
4379 numcwdfailsz++;
4380 *error = ERANGE;
4381 bp = NULL;
4382 goto done;
4384 *--bp = '/';
4386 numcwdfound++;
4387 *error = 0;
4388 done:
4389 if (ncp)
4390 _cache_drop(ncp);
4391 return (bp);
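/*
 * Illustrative sketch (not part of vfs_cache.c): the prepend-from-the-
 * end construction used by kern_getcwd() above.  Components are written
 * right-to-left into the tail of a fixed buffer, so no final reversal
 * or memmove is needed.  demo_* names are invented.
 */
#include <stdio.h>
#include <string.h>

static char *
demo_prepend(char *buf, char *bp, const char *name, size_t len)
{
	if ((size_t)(bp - buf) < len + 1)
		return (NULL);		/* would overflow: cf. ERANGE above */
	bp -= len;
	memcpy(bp, name, len);
	*--bp = '/';
	return (bp);
}

int
main(void)
{
	char buf[64];
	char *bp = buf + sizeof(buf) - 1;
	const char *parts[] = { "file", "dir", "usr" };	/* leaf to root */
	size_t i;

	*bp = '\0';
	for (i = 0; i < 3 && bp != NULL; i++)
		bp = demo_prepend(buf, bp, parts[i], strlen(parts[i]));
	if (bp != NULL)
		printf("%s\n", bp);	/* prints /usr/dir/file */
	return (0);
}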
4395 * Thus begins the fullpath magic.
4397 * The passed nchp is referenced but not locked.
4399 static int disablefullpath;
4400 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
4401 &disablefullpath, 0,
4402 "Disable fullpath lookups");
4405 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
4406 char **retbuf, char **freebuf, int guess)
4408 struct nchandle fd_nrdir;
4409 struct nchandle nch;
4410 struct namecache *ncp;
4411 struct mount *mp, *new_mp;
4412 char *bp, *buf;
4413 int slash_prefixed;
4414 int error = 0;
4415 int i;
4417 *retbuf = NULL;
4418 *freebuf = NULL;
4420 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
4421 bp = buf + MAXPATHLEN - 1;
4422 *bp = '\0';
4423 if (nchbase)
4424 fd_nrdir = *nchbase;
4425 else if (p != NULL)
4426 fd_nrdir = p->p_fd->fd_nrdir;
4427 else
4428 fd_nrdir = rootnch;
4429 slash_prefixed = 0;
4430 nch = *nchp;
4431 ncp = nch.ncp;
4432 if (ncp)
4433 _cache_hold(ncp);
4434 mp = nch.mount;
4436 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
4437 new_mp = NULL;
4440                  * If we are asked to guess the upwards path, we do so whenever
4441                  * we encounter an ncp marked as a mountpoint. We try to locate
4442                  * the actual mount by finding the mount whose mount point is
4443                  * this ncp.
4445 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
4446 new_mp = mount_get_by_nc(ncp);
4449                  * While traversing upwards, if we encounter the root
4450                  * of the current mount we have to skip to the mount point.
4452 if (ncp == mp->mnt_ncmountpt.ncp) {
4453 new_mp = mp;
4455 if (new_mp) {
4456 nch = new_mp->mnt_ncmounton;
4457 _cache_drop(ncp);
4458 ncp = nch.ncp;
4459 if (ncp)
4460 _cache_hold(ncp);
4461 mp = nch.mount;
4462 continue;
4466 * Prepend the path segment
4468 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4469 if (bp == buf) {
4470 kfree(buf, M_TEMP);
4471 error = ENOMEM;
4472 goto done;
4474 *--bp = ncp->nc_name[i];
4476 if (bp == buf) {
4477 kfree(buf, M_TEMP);
4478 error = ENOMEM;
4479 goto done;
4481 *--bp = '/';
4482 slash_prefixed = 1;
4485 * Go up a directory. This isn't a mount point so we don't
4486 * have to check again.
4488 * We can only safely access nc_parent with ncp held locked.
4490 while ((nch.ncp = ncp->nc_parent) != NULL) {
4491 _cache_lock_shared(ncp);
4492 if (nch.ncp != ncp->nc_parent) {
4493 _cache_unlock(ncp);
4494 continue;
4496 _cache_hold(nch.ncp);
4497 _cache_unlock(ncp);
4498 break;
4500 _cache_drop(ncp);
4501 ncp = nch.ncp;
4503 if (ncp == NULL) {
4504 kfree(buf, M_TEMP);
4505 error = ENOENT;
4506 goto done;
4509 if (!slash_prefixed) {
4510 if (bp == buf) {
4511 kfree(buf, M_TEMP);
4512 error = ENOMEM;
4513 goto done;
4515 *--bp = '/';
4517 *retbuf = bp;
4518 *freebuf = buf;
4519 error = 0;
4520 done:
4521 if (ncp)
4522 _cache_drop(ncp);
4523 return(error);
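/*
 * Illustrative sketch (not part of vfs_cache.c): the sample / lock /
 * revalidate step used above when walking nc_parent, which may only be
 * trusted while the child is locked.  demo_* names are invented and
 * refcounting is reduced to a bare atomic counter.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct demo_node {
	pthread_mutex_t			lock;
	_Atomic(struct demo_node *)	parent;
	atomic_int			refs;
};

static struct demo_node *
demo_hold_parent(struct demo_node *node)
{
	struct demo_node *parent;

	for (;;) {
		parent = node->parent;		/* unlocked sample */
		if (parent == NULL)
			return (NULL);
		pthread_mutex_lock(&node->lock);
		if (parent == node->parent) {
			/* Sample still valid: take a reference. */
			atomic_fetch_add(&parent->refs, 1);
			pthread_mutex_unlock(&node->lock);
			return (parent);
		}
		/* Parent changed while we were locking: retry. */
		pthread_mutex_unlock(&node->lock);
	}
}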
4527 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
4528 char **freebuf, int guess)
4530 struct namecache *ncp;
4531 struct nchandle nch;
4532 int error;
4534 *freebuf = NULL;
4535 if (disablefullpath)
4536 return (ENODEV);
4538 if (p == NULL)
4539 return (EINVAL);
4541         /* if vn is NULL, the caller wants us to use p->p_textvp */
4542 if (vn == NULL) {
4543 if ((vn = p->p_textvp) == NULL)
4544 return (EINVAL);
4546 spin_lock_shared(&vn->v_spin);
4547 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
4548 if (ncp->nc_nlen)
4549 break;
4551 if (ncp == NULL) {
4552 spin_unlock_shared(&vn->v_spin);
4553 return (EINVAL);
4555 _cache_hold(ncp);
4556 spin_unlock_shared(&vn->v_spin);
4558 nch.ncp = ncp;
4559 nch.mount = vn->v_mount;
4560 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
4561 _cache_drop(ncp);
4562 return (error);
4565 void
4566 vfscache_rollup_cpu(struct globaldata *gd)
4568 struct pcpu_ncache *pn;
4569 long count;
4571 if (pcpu_ncache == NULL)
4572 return;
4573 pn = &pcpu_ncache[gd->gd_cpuid];
4575 if (pn->vfscache_count) {
4576 count = atomic_swap_long(&pn->vfscache_count, 0);
4577 atomic_add_long(&vfscache_count, count);
4579 if (pn->vfscache_leafs) {
4580 count = atomic_swap_long(&pn->vfscache_leafs, 0);
4581 atomic_add_long(&vfscache_leafs, count);
4583 if (pn->vfscache_negs) {
4584 count = atomic_swap_long(&pn->vfscache_negs, 0);
4585 atomic_add_long(&vfscache_negs, count);
4587 if (pn->numdefered) {
4588 count = atomic_swap_long(&pn->numdefered, 0);
4589 atomic_add_long(&numdefered, count);