sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/uio.h>
  68 #include <sys/kernel.h>
  69 #include <sys/sysctl.h>
  70 #include <sys/mount.h>
  71 #include <sys/vnode.h>
  72 #include <sys/malloc.h>
  73 #include <sys/sysproto.h>
  74 #include <sys/spinlock.h>
  75 #include <sys/proc.h>
  76 #include <sys/namei.h>
  77 #include <sys/nlookup.h>
  78 #include <sys/filedesc.h>
  79 #include <sys/fnv_hash.h>
  80 #include <sys/globaldata.h>
  81 #include <sys/kern_syscall.h>
  82 #include <sys/dirent.h>
  83 #include <ddb/ddb.h>
  84
  85 #include <sys/spinlock2.h>
  86
  87 #define MAX_RECURSION_DEPTH     64
  88
  89 /*
  90  * Random lookups in the cache are accomplished with a hash table using
  91  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
  92  * but we use the ncp->update counter trick to avoid acquiring any
  93  * contestable spin-locks during a lookup.
  94  *
  95  * Negative entries may exist and correspond to resolved namecache
  96  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
  97  * will be set if the entry corresponds to a whited-out directory entry
  98  * (verses simply not finding the entry at all).  pcpu_ncache[n].neg_list
  99  * is locked via pcpu_ncache[n].neg_spin;
 100  *
 101  * MPSAFE RULES:
 102  *
 103  * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 104  *     is applicable to direct lookups via the hash table nchpp or via
 105  *     nc_list (the two are added or removed together).  Removal of the ncp
 106  *     from the hash table drops this reference.  The second is applicable
 107  *     to vp->v_namecache linkages (or negative list linkages), and removal
 108  *     of the ncp from these lists drops this reference.
 109  *
 110  *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 111  *     and must be destroyed.  No other thread should have access to it at
 112  *     this point so it can be safely locked and freed without any deadlock
 113  *     fears.
 114  *
 115  *     The 1->0 transition can occur at almost any juncture and so cache_drop()
 116  *     deals with it directly.
 117  *
 118  * (2) Once the 1->0 transition occurs, the entity that caused the transition
 119  *     will be responsible for destroying the ncp.  The ncp cannot be on any
 120  *     list or hash at this time, or be held by anyone other than the caller
 121  *     responsible for the transition.
 122  *
 123  * (3) A ncp must be locked in order to modify it.
 124  *
 125  * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 126  *     This may seem backwards but forward-scans use the hash table and thus
 127  *     can hold the parent unlocked while traversing downward.  Deletions,
 128  *     on the other-hand, tend to propagate bottom-up since the ref on the
 129  *     is dropped as the children go away.
 130  *
 131  * (6) Both parent and child must be locked in order to enter the child onto
 132  *     the parent's nc_list.
 133  */
 134
 135 /*
 136  * Structures associated with name cacheing.
 137  */
 138 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
 139 #define MINNEG                  1024
 140 #define MINPOS                  1024
 141 #define NCMOUNT_NUMCACHE        (16384) /* power of 2 */
 142 #define NCMOUNT_SET             (8)     /* power of 2 */
 143
 144 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 145
 146 TAILQ_HEAD(nchash_list, namecache);
 147
 148 /*
 149  * Don't cachealign, but at least pad to 32 bytes so entries
 150  * don't cross a cache line.
 151  */
 152 struct nchash_head {
 153        struct nchash_list list; /* 16 bytes */
 154        struct spinlock  spin;   /* 8 bytes */
 155        long     pad01;          /* 8 bytes */
 156 };
 157
 158 struct ncmount_cache {
 159         struct spinlock spin;
 160         struct namecache *ncp;
 161         struct mount *mp;
 162         struct mount *mp_target;
 163         int isneg;
 164         int ticks;
 165         int updating;
 166         int unused01;
 167 };
 168
 169 struct pcpu_ncache {
 170         struct spinlock         umount_spin;    /* cache_findmount/interlock */
 171         struct spinlock         neg_spin;       /* for neg_list and neg_count */
 172         struct namecache_list   neg_list;
 173         long                    neg_count;
 174         long                    vfscache_negs;
 175         long                    vfscache_count;
 176         long                    vfscache_leafs;
 177         long                    numdefered;
 178 } __cachealign;
 179
 180 __read_mostly static struct nchash_head *nchashtbl;
 181 __read_mostly static struct pcpu_ncache *pcpu_ncache;
 182 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
 183
 184 /*
 185  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 186  * to create the namecache infrastructure leading to a dangling vnode.
 187  *
 188  * 0    Only errors are reported
 189  * 1    Successes are reported
 190  * 2    Successes + the whole directory scan is reported
 191  * 3    Force the directory scan code run as if the parent vnode did not
 192  *      have a namecache record, even if it does have one.
 193  */
 194 __read_mostly static int        ncvp_debug;
 195 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
 196     "Namecache debug level (0-3)");
 197
 198 __read_mostly static u_long nchash;             /* size of hash table */
 199 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 200     "Size of namecache hash table");
 201
 202 __read_mostly static int ncnegflush = 10;       /* burst for negative flush */
 203 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
 204     "Batch flush negative entries");
 205
 206 __read_mostly static int ncposflush = 10;       /* burst for positive flush */
 207 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
 208     "Batch flush positive entries");
 209
 210 __read_mostly static int ncnegfactor = 16;      /* ratio of negative entries */
 211 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 212     "Ratio of namecache negative entries");
 213
 214 __read_mostly static int nclockwarn;    /* warn on locked entries in ticks */
 215 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
 216     "Warn on locked namecache entries in ticks");
 217
 218 __read_mostly static int ncposlimit;    /* number of cache entries allocated */
 219 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
 220     "Number of cache entries allocated");
 221
 222 __read_mostly static int ncp_shared_lock_disable = 0;
 223 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
 224            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
 225
 226 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
 227     "sizeof(struct vnode)");
 228 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
 229     "sizeof(struct namecache)");
 230
 231 __read_mostly static int ncmount_cache_enable = 1;
 232 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
 233            &ncmount_cache_enable, 0, "mount point cache");
 234
 235 static __inline void _cache_drop(struct namecache *ncp);
 236 static int cache_resolve_mp(struct mount *mp);
 237 static int cache_findmount_callback(struct mount *mp, void *data);
 238 static void _cache_setunresolved(struct namecache *ncp);
 239 static void _cache_cleanneg(long count);
 240 static void _cache_cleanpos(long count);
 241 static void _cache_cleandefered(void);
 242 static void _cache_unlink(struct namecache *ncp);
 243
 244 /*
 245  * The new name cache statistics (these are rolled up globals and not
 246  * modified in the critical path, see struct pcpu_ncache).
 247  */
 248 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
 249 static long vfscache_negs;
 250 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
 251     "Number of negative namecache entries");
 252 static long vfscache_count;
 253 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
 254     "Number of namecaches entries");
 255 static long vfscache_leafs;
 256 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
 257     "Number of namecaches entries");
 258 static long     numdefered;
 259 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
 260     "Number of cache entries allocated");
 261
 262
 263 struct nchstats nchstats[SMP_MAXCPU];
 264 /*
 265  * Export VFS cache effectiveness statistics to user-land.
 266  *
 267  * The statistics are left for aggregation to user-land so
 268  * neat things can be achieved, like observing per-CPU cache
 269  * distribution.
 270  */
 271 static int
 272 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 273 {
 274         struct globaldata *gd;
 275         int i, error;
 276
 277         error = 0;
 278         for (i = 0; i < ncpus; ++i) {
 279                 gd = globaldata_find(i);
 280                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 281                         sizeof(struct nchstats))))
 282                         break;
 283         }
 284
 285         return (error);
 286 }
 287 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
 288   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 289
 290 static void cache_zap(struct namecache *ncp);
 291
 292 /*
 293  * Cache mount points and namecache records in order to avoid unnecessary
 294  * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 295  * performance and is particularly important on multi-socket systems to
 296  * reduce cache-line ping-ponging.
 297  *
 298  * Try to keep the pcpu structure within one cache line (~64 bytes).
 299  */
 300 #define MNTCACHE_COUNT  32      /* power of 2, multiple of SET */
 301 #define MNTCACHE_SET    8       /* set associativity */
 302
 303 struct mntcache_elm {
 304         struct namecache *ncp;
 305         struct mount     *mp;
 306         int     ticks;
 307         int     unused01;
 308 };
 309
 310 struct mntcache {
 311         struct mntcache_elm array[MNTCACHE_COUNT];
 312 } __cachealign;
 313
 314 static struct mntcache  pcpu_mntcache[MAXCPU];
 315
 316 static __inline
 317 struct mntcache_elm *
 318 _cache_mntcache_hash(void *ptr)
 319 {
 320         struct mntcache_elm *elm;
 321         int hv;
 322
 323         hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
 324         elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];
 325
 326         return elm;
 327 }
 328
 329 static
 330 void
 331 _cache_mntref(struct mount *mp)
 332 {
 333         struct mntcache_elm *elm;
 334         struct mount *mpr;
 335         int i;
 336
 337         elm = _cache_mntcache_hash(mp);
 338         for (i = 0; i < MNTCACHE_SET; ++i) {
 339                 if (elm->mp == mp) {
 340                         mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
 341                         if (__predict_true(mpr == mp))
 342                                 return;
 343                         if (mpr)
 344                                 atomic_add_int(&mpr->mnt_refs, -1);
 345                 }
 346                 ++elm;
 347         }
 348         atomic_add_int(&mp->mnt_refs, 1);
 349 }
 350
 351 static
 352 void
 353 _cache_mntrel(struct mount *mp)
 354 {
 355         struct mntcache_elm *elm;
 356         struct mntcache_elm *best;
 357         struct mount *mpr;
 358         int delta1;
 359         int delta2;
 360         int i;
 361
 362         elm = _cache_mntcache_hash(mp);
 363         best = elm;
 364         for (i = 0; i < MNTCACHE_SET; ++i) {
 365                 if (elm->mp == NULL) {
 366                         mpr = atomic_swap_ptr((void *)&elm->mp, mp);
 367                         if (__predict_false(mpr != NULL)) {
 368                                 atomic_add_int(&mpr->mnt_refs, -1);
 369                         }
 370                         elm->ticks = ticks;
 371                         return;
 372                 }
 373                 delta1 = ticks - best->ticks;
 374                 delta2 = ticks - elm->ticks;
 375                 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
 376                         best = elm;
 377                 ++elm;
 378         }
 379         mpr = atomic_swap_ptr((void *)&best->mp, mp);
 380         best->ticks = ticks;
 381         if (mpr)
 382                 atomic_add_int(&mpr->mnt_refs, -1);
 383 }
 384
 385 /*
 386  * Clears all cached mount points on all cpus.  This routine should only
 387  * be called when we are waiting for a mount to clear, e.g. so we can
 388  * unmount.
 389  */
 390 void
 391 cache_clearmntcache(struct mount *target __unused)
 392 {
 393         int n;
 394
 395         for (n = 0; n < ncpus; ++n) {
 396                 struct mntcache *cache = &pcpu_mntcache[n];
 397                 struct mntcache_elm *elm;
 398                 struct namecache *ncp;
 399                 struct mount *mp;
 400                 int i;
 401
 402                 for (i = 0; i < MNTCACHE_COUNT; ++i) {
 403                         elm = &cache->array[i];
 404                         if (elm->mp) {
 405                                 mp = atomic_swap_ptr((void *)&elm->mp, NULL);
 406                                 if (mp)
 407                                         atomic_add_int(&mp->mnt_refs, -1);
 408                         }
 409                         if (elm->ncp) {
 410                                 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
 411                                 if (ncp)
 412                                         _cache_drop(ncp);
 413                         }
 414                 }
 415         }
 416 }
 417
 418 /*
 419  * Namespace locking.  The caller must already hold a reference to the
 420  * namecache structure in order to lock/unlock it.  The controlling entity
 421  * in a 1->0 transition does not need to lock the ncp to dispose of it,
 422  * as nobody else will have visiblity to it at that point.
 423  *
 424  * Note that holding a locked namecache structure prevents other threads
 425  * from making namespace changes (e.g. deleting or creating), prevents
 426  * vnode association state changes by other threads, and prevents the
 427  * namecache entry from being resolved or unresolved by other threads.
 428  *
 429  * An exclusive lock owner has full authority to associate/disassociate
 430  * vnodes and resolve/unresolve the locked ncp.
 431  *
 432  * A shared lock owner only has authority to acquire the underlying vnode,
 433  * if any.
 434  *
 435  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 436  * fact (when locking) or cleared prior to unlocking.
 437  *
 438  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 439  *           or recycled, but it does NOT help you if the vnode had already
 440  *           initiated a recyclement.  If this is important, use cache_get()
 441  *           rather then cache_lock() (and deal with the differences in the
 442  *           way the refs counter is handled).  Or, alternatively, make an
 443  *           unconditional call to cache_validate() or cache_resolve()
 444  *           after cache_lock() returns.
 445  */
 446 static __inline
 447 void
 448 _cache_lock(struct namecache *ncp)
 449 {
 450         int didwarn = 0;
 451         int error;
 452
 453         error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
 454         while (__predict_false(error == EWOULDBLOCK)) {
 455                 if (didwarn == 0) {
 456                         didwarn = ticks - nclockwarn;
 457                         kprintf("[diagnostic] cache_lock: "
 458                                 "%s blocked on %p "
 459                                 "\"%*.*s\"\n",
 460                                 curthread->td_comm, ncp,
 461                                 ncp->nc_nlen, ncp->nc_nlen,
 462                                 ncp->nc_name);
 463                 }
 464                 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
 465         }
 466         if (__predict_false(didwarn)) {
 467                 kprintf("[diagnostic] cache_lock: "
 468                         "%s unblocked %*.*s after %d secs\n",
 469                         curthread->td_comm,
 470                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 471                         (int)(ticks - didwarn) / hz);
 472         }
 473 }
 474
 475 /*
 476  * Release a previously acquired lock.
 477  *
 478  * A concurrent shared-lock acquisition or acquisition/release can
 479  * race bit 31 so only drop the ncp if bit 31 was set.
 480  */
 481 static __inline
 482 void
 483 _cache_unlock(struct namecache *ncp)
 484 {
 485         lockmgr(&ncp->nc_lock, LK_RELEASE);
 486 }
 487
 488 /*
 489  * Lock ncp exclusively, non-blocking.  Return 0 on success.
 490  */
 491 static __inline
 492 int
 493 _cache_lock_nonblock(struct namecache *ncp)
 494 {
 495         int error;
 496
 497         error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
 498         if (__predict_false(error != 0)) {
 499                 return(EWOULDBLOCK);
 500         }
 501         return 0;
 502 }
 503
 504 /*
 505  * This is a special form of _cache_lock() which only succeeds if
 506  * it can get a pristine, non-recursive lock.  The caller must have
 507  * already ref'd the ncp.
 508  *
 509  * On success the ncp will be locked, on failure it will not.  The
 510  * ref count does not change either way.
 511  *
 512  * We want _cache_lock_special() (on success) to return a definitively
 513  * usable vnode or a definitively unresolved ncp.
 514  */
 515 static __inline
 516 int
 517 _cache_lock_special(struct namecache *ncp)
 518 {
 519         if (_cache_lock_nonblock(ncp) == 0) {
 520                 if (lockmgr_oneexcl(&ncp->nc_lock)) {
 521                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 522                                 _cache_setunresolved(ncp);
 523                         return 0;
 524                 }
 525                 _cache_unlock(ncp);
 526         }
 527         return EWOULDBLOCK;
 528 }
 529
 530 /*
 531  * Shared lock, guarantees vp held
 532  *
 533  * The shared lock holds vp on the 0->1 transition.  It is possible to race
 534  * another shared lock release, preventing the other release from dropping
 535  * the vnode and clearing bit 31.
 536  *
 537  * If it is not set then we are responsible for setting it, and this
 538  * responsibility does not race with anyone else.
 539  */
 540 static __inline
 541 void
 542 _cache_lock_shared(struct namecache *ncp)
 543 {
 544         int didwarn = 0;
 545         int error;
 546
 547         error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
 548         while (__predict_false(error == EWOULDBLOCK)) {
 549                 if (didwarn == 0) {
 550                         didwarn = ticks - nclockwarn;
 551                         kprintf("[diagnostic] cache_lock_shared: "
 552                                 "%s blocked on %p "
 553                                 "\"%*.*s\"\n",
 554                                 curthread->td_comm, ncp,
 555                                 ncp->nc_nlen, ncp->nc_nlen,
 556                                 ncp->nc_name);
 557                 }
 558                 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
 559         }
 560         if (__predict_false(didwarn)) {
 561                 kprintf("[diagnostic] cache_lock_shared: "
 562                         "%s unblocked %*.*s after %d secs\n",
 563                         curthread->td_comm,
 564                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 565                         (int)(ticks - didwarn) / hz);
 566         }
 567 }
 568
 569 /*
 570  * Shared lock, guarantees vp held.  Non-blocking.  Returns 0 on success
 571  */
 572 static __inline
 573 int
 574 _cache_lock_shared_nonblock(struct namecache *ncp)
 575 {
 576         int error;
 577
 578         error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
 579         if (__predict_false(error != 0)) {
 580                 return(EWOULDBLOCK);
 581         }
 582         return 0;
 583 }
 584
 585 /*
 586  * This function tries to get a shared lock but will back-off to an
 587  * exclusive lock if:
 588  *
 589  * (1) Some other thread is trying to obtain an exclusive lock
 590  *     (to prevent the exclusive requester from getting livelocked out
 591  *     by many shared locks).
 592  *
 593  * (2) The current thread already owns an exclusive lock (to avoid
 594  *     deadlocking).
 595  *
 596  * WARNING! On machines with lots of cores we really want to try hard to
 597  *          get a shared lock or concurrent path lookups can chain-react
 598  *          into a very high-latency exclusive lock.
 599  *
 600  *          This is very evident in dsynth's initial scans.
 601  */
 602 static __inline
 603 int
 604 _cache_lock_shared_special(struct namecache *ncp)
 605 {
 606         /*
 607          * Only honor a successful shared lock (returning 0) if there is
 608          * no exclusive request pending and the vnode, if present, is not
 609          * in a reclaimed state.
 610          */
 611         if (_cache_lock_shared_nonblock(ncp) == 0) {
 612                 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
 613                         if (ncp->nc_vp == NULL ||
 614                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
 615                                 return(0);
 616                         }
 617                 }
 618                 _cache_unlock(ncp);
 619                 return(EWOULDBLOCK);
 620         }
 621
 622         /*
 623          * Non-blocking shared lock failed.  If we already own the exclusive
 624          * lock just acquire another exclusive lock (instead of deadlocking).
 625          * Otherwise acquire a shared lock.
 626          */
 627         if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
 628                 _cache_lock(ncp);
 629                 return(0);
 630         }
 631         _cache_lock_shared(ncp);
 632         return(0);
 633 }
 634
 635 static __inline
 636 int
 637 _cache_lockstatus(struct namecache *ncp)
 638 {
 639         int status;
 640
 641         status = lockstatus(&ncp->nc_lock, curthread);
 642         if (status == 0 || status == LK_EXCLOTHER)
 643                 status = -1;
 644         return status;
 645 }
 646
 647 /*
 648  * cache_hold() and cache_drop() prevent the premature deletion of a
 649  * namecache entry but do not prevent operations (such as zapping) on
 650  * that namecache entry.
 651  *
 652  * This routine may only be called from outside this source module if
 653  * nc_refs is already deterministically at least 1, such as being
 654  * associated with e.g. a process, file descriptor, or some other entity.
 655  *
 656  * Only the above situations, similar situations within this module where
 657  * the ref count is deterministically at least 1, or when the ncp is found
 658  * via the nchpp (hash table) lookup, can bump nc_refs.
 659  *
 660  * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 661  * can still be removed from the nc_list, however, as long as the caller
 662  * can acquire its lock (in the wrong order).
 663  *
 664  * This is a rare case where callers are allowed to hold a spinlock,
 665  * so we can't ourselves.
 666  */
 667 static __inline
 668 struct namecache *
 669 _cache_hold(struct namecache *ncp)
 670 {
 671         KKASSERT(ncp->nc_refs > 0);
 672         atomic_add_int(&ncp->nc_refs, 1);
 673
 674         return(ncp);
 675 }
 676
 677 /*
 678  * Drop a cache entry.
 679  *
 680  * The 1->0 transition is special and requires the caller to destroy the
 681  * entry.  It means that the ncp is no longer on a nchpp list (since that
 682  * would mean there was stilla ref).  The ncp could still be on a nc_list
 683  * but will not have any child of its own, again because nc_refs is now 0
 684  * and children would have a ref to their parent.
 685  *
 686  * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 687  */
 688 static __inline
 689 void
 690 _cache_drop(struct namecache *ncp)
 691 {
 692         if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
 693                 /*
 694                  * Executed unlocked (no need to lock on last drop)
 695                  */
 696                 _cache_setunresolved(ncp);
 697
 698                 /*
 699                  * Scrap it.
 700                  */
 701                 ncp->nc_refs = -1;      /* safety */
 702                 if (ncp->nc_name)
 703                         kfree(ncp->nc_name, M_VFSCACHE);
 704                 kfree(ncp, M_VFSCACHE);
 705         }
 706 }
 707
 708 /*
 709  * Link a new namecache entry to its parent and to the hash table.  Be
 710  * careful to avoid races if vhold() blocks in the future.
 711  *
 712  * Both ncp and par must be referenced and locked.  The reference is
 713  * transfered to the nchpp (and, most notably, NOT to the parent list).
 714  *
 715  * NOTE: The hash table spinlock is held across this call, we can't do
 716  *       anything fancy.
 717  */
 718 static void
 719 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 720                    struct nchash_head *nchpp)
 721 {
 722         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 723
 724         KKASSERT(ncp->nc_parent == NULL);
 725         ncp->nc_parent = par;
 726         ncp->nc_head = nchpp;
 727
 728         /*
 729          * Set inheritance flags.  Note that the parent flags may be
 730          * stale due to getattr potentially not having been run yet
 731          * (it gets run during nlookup()'s).
 732          */
 733         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 734         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 735                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 736         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 737                 ncp->nc_flag |= NCF_UF_PCACHE;
 738
 739         /*
 740          * Add to hash table and parent, adjust accounting
 741          */
 742         TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 743         atomic_add_long(&pn->vfscache_count, 1);
 744         if (TAILQ_EMPTY(&ncp->nc_list))
 745                 atomic_add_long(&pn->vfscache_leafs, 1);
 746
 747         if (TAILQ_EMPTY(&par->nc_list)) {
 748                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 749                 atomic_add_long(&pn->vfscache_leafs, -1);
 750                 /*
 751                  * Any vp associated with an ncp which has children must
 752                  * be held to prevent it from being recycled.
 753                  */
 754                 if (par->nc_vp)
 755                         vhold(par->nc_vp);
 756         } else {
 757                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 758         }
 759         _cache_hold(par);       /* add nc_parent ref */
 760 }
 761
 762 /*
 763  * Remove the parent and hash associations from a namecache structure.
 764  * Drop the ref-count on the parent.  The caller receives the ref
 765  * from the ncp's nchpp linkage that was removed and may forward that
 766  * ref to a new linkage.
 767
 768  * The caller usually holds an additional ref * on the ncp so the unlink
 769  * cannot be the final drop.  XXX should not be necessary now since the
 770  * caller receives the ref from the nchpp linkage, assuming the ncp
 771  * was linked in the first place.
 772  *
 773  * ncp must be locked, which means that there won't be any nc_parent
 774  * removal races.  This routine will acquire a temporary lock on
 775  * the parent as well as the appropriate hash chain.
 776  */
 777 static void
 778 _cache_unlink_parent(struct namecache *ncp)
 779 {
 780         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 781         struct namecache *par;
 782         struct vnode *dropvp;
 783         struct nchash_head *nchpp;
 784
 785         if ((par = ncp->nc_parent) != NULL) {
 786                 cpu_ccfence();
 787                 KKASSERT(ncp->nc_parent == par);
 788
 789                 /* don't add a ref, we drop the nchpp ref later */
 790                 _cache_lock(par);
 791                 nchpp = ncp->nc_head;
 792                 spin_lock(&nchpp->spin);
 793
 794                 /*
 795                  * Remove from hash table and parent, adjust accounting
 796                  */
 797                 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
 798                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
 799                 atomic_add_long(&pn->vfscache_count, -1);
 800                 if (TAILQ_EMPTY(&ncp->nc_list))
 801                         atomic_add_long(&pn->vfscache_leafs, -1);
 802
 803                 dropvp = NULL;
 804                 if (TAILQ_EMPTY(&par->nc_list)) {
 805                         atomic_add_long(&pn->vfscache_leafs, 1);
 806                         if (par->nc_vp)
 807                                 dropvp = par->nc_vp;
 808                 }
 809                 ncp->nc_parent = NULL;
 810                 ncp->nc_head = NULL;
 811                 spin_unlock(&nchpp->spin);
 812                 _cache_unlock(par);
 813                 _cache_drop(par);       /* drop nc_parent ref */
 814
 815                 /*
 816                  * We can only safely vdrop with no spinlocks held.
 817                  */
 818                 if (dropvp)
 819                         vdrop(dropvp);
 820         }
 821 }
 822
 823 /*
 824  * Allocate a new namecache structure.  Most of the code does not require
 825  * zero-termination of the string but it makes vop_compat_ncreate() easier.
 826  *
 827  * The returned ncp will be locked and referenced.  The ref is generally meant
 828  * to be transfered to the nchpp linkage.
 829  */
 830 static struct namecache *
 831 cache_alloc(int nlen)
 832 {
 833         struct namecache *ncp;
 834
 835         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 836         if (nlen)
 837                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
 838         ncp->nc_nlen = nlen;
 839         ncp->nc_flag = NCF_UNRESOLVED;
 840         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 841         ncp->nc_refs = 1;
 842         TAILQ_INIT(&ncp->nc_list);
 843         lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
 844         lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
 845
 846         return(ncp);
 847 }
 848
 849 /*
 850  * Can only be called for the case where the ncp has never been
 851  * associated with anything (so no spinlocks are needed).
 852  */
 853 static void
 854 _cache_free(struct namecache *ncp)
 855 {
 856         KKASSERT(ncp->nc_refs == 1);
 857         if (ncp->nc_name)
 858                 kfree(ncp->nc_name, M_VFSCACHE);
 859         kfree(ncp, M_VFSCACHE);
 860 }
 861
 862 /*
 863  * [re]initialize a nchandle.
 864  */
 865 void
 866 cache_zero(struct nchandle *nch)
 867 {
 868         nch->ncp = NULL;
 869         nch->mount = NULL;
 870 }
 871
 872 /*
 873  * Ref and deref a nchandle structure (ncp + mp)
 874  *
 875  * The caller must specify a stable ncp pointer, typically meaning the
 876  * ncp is already referenced but this can also occur indirectly through
 877  * e.g. holding a lock on a direct child.
 878  *
 879  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 880  *          use read spinlocks here.
 881  */
 882 struct nchandle *
 883 cache_hold(struct nchandle *nch)
 884 {
 885         _cache_hold(nch->ncp);
 886         _cache_mntref(nch->mount);
 887         return(nch);
 888 }
 889
 890 /*
 891  * Create a copy of a namecache handle for an already-referenced
 892  * entry.
 893  */
 894 void
 895 cache_copy(struct nchandle *nch, struct nchandle *target)
 896 {
 897         struct namecache *ncp;
 898         struct mount *mp;
 899         struct mntcache_elm *elm;
 900         struct namecache *ncpr;
 901         int i;
 902
 903         ncp = nch->ncp;
 904         mp = nch->mount;
 905         target->ncp = ncp;
 906         target->mount = mp;
 907
 908         elm = _cache_mntcache_hash(ncp);
 909         for (i = 0; i < MNTCACHE_SET; ++i) {
 910                 if (elm->ncp == ncp) {
 911                         ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
 912                         if (ncpr == ncp) {
 913                                 _cache_mntref(mp);
 914                                 return;
 915                         }
 916                         if (ncpr)
 917                                 _cache_drop(ncpr);
 918                 }
 919                 ++elm;
 920         }
 921         if (ncp)
 922                 _cache_hold(ncp);
 923         _cache_mntref(mp);
 924 }
 925
 926 /*
 927  * Drop the nchandle, but try to cache the ref to avoid global atomic
 928  * ops.  This is typically done on the system root and jail root nchandles.
 929  */
 930 void
 931 cache_drop_and_cache(struct nchandle *nch, int elmno)
 932 {
 933         struct mntcache_elm *elm;
 934         struct mntcache_elm *best;
 935         struct namecache *ncpr;
 936         int delta1;
 937         int delta2;
 938         int i;
 939
 940         if (elmno > 4) {
 941                 if (nch->ncp) {
 942                         _cache_drop(nch->ncp);
 943                         nch->ncp = NULL;
 944                 }
 945                 if (nch->mount) {
 946                         _cache_mntrel(nch->mount);
 947                         nch->mount = NULL;
 948                 }
 949                 return;
 950         }
 951
 952         elm = _cache_mntcache_hash(nch->ncp);
 953         best = elm;
 954         for (i = 0; i < MNTCACHE_SET; ++i) {
 955                 if (elm->ncp == NULL) {
 956                         ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
 957                         _cache_mntrel(nch->mount);
 958                         elm->ticks = ticks;
 959                         nch->mount = NULL;
 960                         nch->ncp = NULL;
 961                         if (ncpr)
 962                                 _cache_drop(ncpr);
 963                         return;
 964                 }
 965                 delta1 = ticks - best->ticks;
 966                 delta2 = ticks - elm->ticks;
 967                 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
 968                         best = elm;
 969                 ++elm;
 970         }
 971         ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
 972         _cache_mntrel(nch->mount);
 973         best->ticks = ticks;
 974         nch->mount = NULL;
 975         nch->ncp = NULL;
 976         if (ncpr)
 977                 _cache_drop(ncpr);
 978 }
 979
 980 void
 981 cache_changemount(struct nchandle *nch, struct mount *mp)
 982 {
 983         _cache_mntref(mp);
 984         _cache_mntrel(nch->mount);
 985         nch->mount = mp;
 986 }
 987
 988 void
 989 cache_drop(struct nchandle *nch)
 990 {
 991         _cache_mntrel(nch->mount);
 992         _cache_drop(nch->ncp);
 993         nch->ncp = NULL;
 994         nch->mount = NULL;
 995 }
 996
 997 int
 998 cache_lockstatus(struct nchandle *nch)
 999 {
1000         return(_cache_lockstatus(nch->ncp));
1001 }
1002
1003 void
1004 cache_lock(struct nchandle *nch)
1005 {
1006         _cache_lock(nch->ncp);
1007 }
1008
1009 void
1010 cache_lock_maybe_shared(struct nchandle *nch, int excl)
1011 {
1012         struct namecache *ncp = nch->ncp;
1013
1014         if (ncp_shared_lock_disable || excl ||
1015             (ncp->nc_flag & NCF_UNRESOLVED)) {
1016                 _cache_lock(ncp);
1017         } else {
1018                 _cache_lock_shared(ncp);
1019                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1020                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1021                                 _cache_unlock(ncp);
1022                                 _cache_lock(ncp);
1023                         }
1024                 } else {
1025                         _cache_unlock(ncp);
1026                         _cache_lock(ncp);
1027                 }
1028         }
1029 }
1030
1031 /*
1032  * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
1033  * is responsible for checking both for validity on return as they
1034  * may have become invalid.
1035  *
1036  * We have to deal with potential deadlocks here, just ping pong
1037  * the lock until we get it (we will always block somewhere when
1038  * looping so this is not cpu-intensive).
1039  *
1040  * which = 0    nch1 not locked, nch2 is locked
1041  * which = 1    nch1 is locked, nch2 is not locked
1042  */
1043 void
1044 cache_relock(struct nchandle *nch1, struct ucred *cred1,
1045              struct nchandle *nch2, struct ucred *cred2)
1046 {
1047         int which;
1048
1049         which = 0;
1050
1051         for (;;) {
1052                 if (which == 0) {
1053                         if (cache_lock_nonblock(nch1) == 0) {
1054                                 cache_resolve(nch1, cred1);
1055                                 break;
1056                         }
1057                         cache_unlock(nch2);
1058                         cache_lock(nch1);
1059                         cache_resolve(nch1, cred1);
1060                         which = 1;
1061                 } else {
1062                         if (cache_lock_nonblock(nch2) == 0) {
1063                                 cache_resolve(nch2, cred2);
1064                                 break;
1065                         }
1066                         cache_unlock(nch1);
1067                         cache_lock(nch2);
1068                         cache_resolve(nch2, cred2);
1069                         which = 0;
1070                 }
1071         }
1072 }
1073
1074 int
1075 cache_lock_nonblock(struct nchandle *nch)
1076 {
1077         return(_cache_lock_nonblock(nch->ncp));
1078 }
1079
1080 void
1081 cache_unlock(struct nchandle *nch)
1082 {
1083         _cache_unlock(nch->ncp);
1084 }
1085
1086 /*
1087  * ref-and-lock, unlock-and-deref functions.
1088  *
1089  * This function is primarily used by nlookup.  Even though cache_lock
1090  * holds the vnode, it is possible that the vnode may have already
1091  * initiated a recyclement.
1092  *
1093  * We want cache_get() to return a definitively usable vnode or a
1094  * definitively unresolved ncp.
1095  */
1096 static
1097 struct namecache *
1098 _cache_get(struct namecache *ncp)
1099 {
1100         _cache_hold(ncp);
1101         _cache_lock(ncp);
1102         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1103                 _cache_setunresolved(ncp);
1104         return(ncp);
1105 }
1106
1107 /*
1108  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
1109  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1110  * valid.  Otherwise an exclusive lock will be acquired instead.
1111  */
1112 static
1113 struct namecache *
1114 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1115 {
1116         if (ncp_shared_lock_disable || excl ||
1117             (ncp->nc_flag & NCF_UNRESOLVED)) {
1118                 return(_cache_get(ncp));
1119         }
1120         _cache_hold(ncp);
1121         _cache_lock_shared(ncp);
1122         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1123                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1124                         _cache_unlock(ncp);
1125                         ncp = _cache_get(ncp);
1126                         _cache_drop(ncp);
1127                 }
1128         } else {
1129                 _cache_unlock(ncp);
1130                 ncp = _cache_get(ncp);
1131                 _cache_drop(ncp);
1132         }
1133         return(ncp);
1134 }
1135
1136 /*
1137  * NOTE: The same nchandle can be passed for both arguments.
1138  */
1139 void
1140 cache_get(struct nchandle *nch, struct nchandle *target)
1141 {
1142         KKASSERT(nch->ncp->nc_refs > 0);
1143         target->mount = nch->mount;
1144         target->ncp = _cache_get(nch->ncp);
1145         _cache_mntref(target->mount);
1146 }
1147
1148 void
1149 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1150 {
1151         KKASSERT(nch->ncp->nc_refs > 0);
1152         target->mount = nch->mount;
1153         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1154         _cache_mntref(target->mount);
1155 }
1156
1157 /*
1158  * Release a held and locked ncp
1159  */
1160 static __inline
1161 void
1162 _cache_put(struct namecache *ncp)
1163 {
1164         _cache_unlock(ncp);
1165         _cache_drop(ncp);
1166 }
1167
1168 void
1169 cache_put(struct nchandle *nch)
1170 {
1171         _cache_mntrel(nch->mount);
1172         _cache_put(nch->ncp);
1173         nch->ncp = NULL;
1174         nch->mount = NULL;
1175 }
1176
1177 /*
1178  * Resolve an unresolved ncp by associating a vnode with it.  If the
1179  * vnode is NULL, a negative cache entry is created.
1180  *
1181  * The ncp should be locked on entry and will remain locked on return.
1182  */
1183 static
1184 void
1185 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1186 {
1187         KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
1188                  (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
1189                  ncp->nc_vp == NULL);
1190
1191         if (vp) {
1192                 /*
1193                  * Any vp associated with an ncp which has children must
1194                  * be held.  Any vp associated with a locked ncp must be held.
1195                  */
1196                 if (!TAILQ_EMPTY(&ncp->nc_list))
1197                         vhold(vp);
1198                 spin_lock(&vp->v_spin);
1199                 ncp->nc_vp = vp;
1200                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1201                 ++vp->v_namecache_count;
1202                 _cache_hold(ncp);               /* v_namecache assoc */
1203                 spin_unlock(&vp->v_spin);
1204                 vhold(vp);                      /* nc_vp */
1205
1206                 /*
1207                  * Set auxiliary flags
1208                  */
1209                 switch(vp->v_type) {
1210                 case VDIR:
1211                         ncp->nc_flag |= NCF_ISDIR;
1212                         break;
1213                 case VLNK:
1214                         ncp->nc_flag |= NCF_ISSYMLINK;
1215                         /* XXX cache the contents of the symlink */
1216                         break;
1217                 default:
1218                         break;
1219                 }
1220
1221                 ncp->nc_error = 0;
1222
1223                 /*
1224                  * XXX: this is a hack to work-around the lack of a real pfs vfs
1225                  * implementation
1226                  */
1227                 if (mp) {
1228                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1229                                 vp->v_pfsmp = mp;
1230                 }
1231         } else {
1232                 /*
1233                  * When creating a negative cache hit we set the
1234                  * namecache_gen.  A later resolve will clean out the
1235                  * negative cache hit if the mount point's namecache_gen
1236                  * has changed.  Used by devfs, could also be used by
1237                  * other remote FSs.
1238                  */
1239                 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1240
1241                 ncp->nc_vp = NULL;
1242                 ncp->nc_negcpu = mycpu->gd_cpuid;
1243                 spin_lock(&pn->neg_spin);
1244                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
1245                 _cache_hold(ncp);       /* neg_list assoc */
1246                 ++pn->neg_count;
1247                 spin_unlock(&pn->neg_spin);
1248                 atomic_add_long(&pn->vfscache_negs, 1);
1249
1250                 ncp->nc_error = ENOENT;
1251                 if (mp)
1252                         VFS_NCPGEN_SET(mp, ncp);
1253         }
1254         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1255 }
1256
1257 void
1258 cache_setvp(struct nchandle *nch, struct vnode *vp)
1259 {
1260         _cache_setvp(nch->mount, nch->ncp, vp);
1261 }
1262
1263 /*
1264  * Used for NFS
1265  */
1266 void
1267 cache_settimeout(struct nchandle *nch, int nticks)
1268 {
1269         struct namecache *ncp = nch->ncp;
1270
1271         if ((ncp->nc_timeout = ticks + nticks) == 0)
1272                 ncp->nc_timeout = 1;
1273 }
1274
1275 /*
1276  * Disassociate the vnode or negative-cache association and mark a
1277  * namecache entry as unresolved again.  Note that the ncp is still
1278  * left in the hash table and still linked to its parent.
1279  *
1280  * The ncp should be locked and refd on entry and will remain locked and refd
1281  * on return.
1282  *
1283  * This routine is normally never called on a directory containing children.
1284  * However, NFS often does just that in its rename() code as a cop-out to
1285  * avoid complex namespace operations.  This disconnects a directory vnode
1286  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1287  * sync.
1288  *
1289  */
1290 static
1291 void
1292 _cache_setunresolved(struct namecache *ncp)
1293 {
1294         struct vnode *vp;
1295
1296         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1297                 ncp->nc_flag |= NCF_UNRESOLVED;
1298                 ncp->nc_timeout = 0;
1299                 ncp->nc_error = ENOTCONN;
1300                 if ((vp = ncp->nc_vp) != NULL) {
1301                         spin_lock(&vp->v_spin);
1302                         ncp->nc_vp = NULL;
1303                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1304                         --vp->v_namecache_count;
1305                         spin_unlock(&vp->v_spin);
1306
1307                         /*
1308                          * Any vp associated with an ncp with children is
1309                          * held by that ncp.  Any vp associated with  ncp
1310                          * is held by that ncp.  These conditions must be
1311                          * undone when the vp is cleared out from the ncp.
1312                          */
1313                         if (!TAILQ_EMPTY(&ncp->nc_list))
1314                                 vdrop(vp);
1315                         vdrop(vp);
1316                 } else {
1317                         struct pcpu_ncache *pn;
1318
1319                         pn = &pcpu_ncache[ncp->nc_negcpu];
1320
1321                         atomic_add_long(&pn->vfscache_negs, -1);
1322                         spin_lock(&pn->neg_spin);
1323                         TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
1324                         --pn->neg_count;
1325                         spin_unlock(&pn->neg_spin);
1326                 }
1327                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1328                 _cache_drop(ncp);       /* from v_namecache or neg_list */
1329         }
1330 }
1331
1332 /*
1333  * The cache_nresolve() code calls this function to automatically
1334  * set a resolved cache element to unresolved if it has timed out
1335  * or if it is a negative cache hit and the mount point namecache_gen
1336  * has changed.
1337  */
1338 static __inline int
1339 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1340 {
1341         /*
1342          * Try to zap entries that have timed out.  We have
1343          * to be careful here because locked leafs may depend
1344          * on the vnode remaining intact in a parent, so only
1345          * do this under very specific conditions.
1346          */
1347         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1348             TAILQ_EMPTY(&ncp->nc_list)) {
1349                 return 1;
1350         }
1351
1352         /*
1353          * If a resolved negative cache hit is invalid due to
1354          * the mount's namecache generation being bumped, zap it.
1355          */
1356         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1357                 return 1;
1358         }
1359
1360         /*
1361          * Otherwise we are good
1362          */
1363         return 0;
1364 }
1365
1366 static __inline void
1367 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1368 {
1369         /*
1370          * Already in an unresolved state, nothing to do.
1371          */
1372         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1373                 if (_cache_auto_unresolve_test(mp, ncp))
1374                         _cache_setunresolved(ncp);
1375         }
1376 }
1377
1378 void
1379 cache_setunresolved(struct nchandle *nch)
1380 {
1381         _cache_setunresolved(nch->ncp);
1382 }
1383
1384 /*
1385  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1386  * looking for matches.  This flag tells the lookup code when it must
1387  * check for a mount linkage and also prevents the directories in question
1388  * from being deleted or renamed.
1389  */
1390 static
1391 int
1392 cache_clrmountpt_callback(struct mount *mp, void *data)
1393 {
1394         struct nchandle *nch = data;
1395
1396         if (mp->mnt_ncmounton.ncp == nch->ncp)
1397                 return(1);
1398         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1399                 return(1);
1400         return(0);
1401 }
1402
1403 /*
1404  * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
1405  * with a mount point.
1406  */
1407 void
1408 cache_clrmountpt(struct nchandle *nch)
1409 {
1410         int count;
1411
1412         count = mountlist_scan(cache_clrmountpt_callback, nch,
1413                                MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
1414                                MNTSCAN_NOUNLOCK);
1415         if (count == 0)
1416                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1417 }
1418
1419 /*
1420  * Invalidate portions of the namecache topology given a starting entry.
1421  * The passed ncp is set to an unresolved state and:
1422  *
1423  * The passed ncp must be referenced and locked.  The routine may unlock
1424  * and relock ncp several times, and will recheck the children and loop
1425  * to catch races.  When done the passed ncp will be returned with the
1426  * reference and lock intact.
1427  *
1428  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1429  *                        that the physical underlying nodes have been
1430  *                        destroyed... as in deleted.  For example, when
1431  *                        a directory is removed.  This will cause record
1432  *                        lookups on the name to no longer be able to find
1433  *                        the record and tells the resolver to return failure
1434  *                        rather then trying to resolve through the parent.
1435  *
1436  *                        The topology itself, including ncp->nc_name,
1437  *                        remains intact.
1438  *
1439  *                        This only applies to the passed ncp, if CINV_CHILDREN
1440  *                        is specified the children are not flagged.
1441  *
1442  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1443  *                        state as well.
1444  *
1445  *                        Note that this will also have the side effect of
1446  *                        cleaning out any unreferenced nodes in the topology
1447  *                        from the leaves up as the recursion backs out.
1448  *
1449  * Note that the topology for any referenced nodes remains intact, but
1450  * the nodes will be marked as having been destroyed and will be set
1451  * to an unresolved state.
1452  *
1453  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1454  * the namecache entry may not actually be invalidated on return if it was
1455  * revalidated while recursing down into its children.  This code guarentees
1456  * that the node(s) will go through an invalidation cycle, but does not
1457  * guarentee that they will remain in an invalidated state.
1458  *
1459  * Returns non-zero if a revalidation was detected during the invalidation
1460  * recursion, zero otherwise.  Note that since only the original ncp is
1461  * locked the revalidation ultimately can only indicate that the original ncp
1462  * *MIGHT* no have been reresolved.
1463  *
1464  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1465  * have to avoid blowing out the kernel stack.  We do this by saving the
1466  * deep namecache node and aborting the recursion, then re-recursing at that
1467  * node using a depth-first algorithm in order to allow multiple deep
1468  * recursions to chain through each other, then we restart the invalidation
1469  * from scratch.
1470  */
1471
1472 struct cinvtrack {
1473         struct namecache *resume_ncp;
1474         int depth;
1475 };
1476
1477 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1478
1479 static
1480 int
1481 _cache_inval(struct namecache *ncp, int flags)
1482 {
1483         struct cinvtrack track;
1484         struct namecache *ncp2;
1485         int r;
1486
1487         track.depth = 0;
1488         track.resume_ncp = NULL;
1489
1490         for (;;) {
1491                 r = _cache_inval_internal(ncp, flags, &track);
1492                 if (track.resume_ncp == NULL)
1493                         break;
1494                 _cache_unlock(ncp);
1495                 while ((ncp2 = track.resume_ncp) != NULL) {
1496                         track.resume_ncp = NULL;
1497                         _cache_lock(ncp2);
1498                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1499                                              &track);
1500                         /*_cache_put(ncp2);*/
1501                         cache_zap(ncp2);
1502                 }
1503                 _cache_lock(ncp);
1504         }
1505         return(r);
1506 }
1507
1508 int
1509 cache_inval(struct nchandle *nch, int flags)
1510 {
1511         return(_cache_inval(nch->ncp, flags));
1512 }
1513
1514 /*
1515  * Helper for _cache_inval().  The passed ncp is refd and locked and
1516  * remains that way on return, but may be unlocked/relocked multiple
1517  * times by the routine.
1518  */
1519 static int
1520 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1521 {
1522         struct namecache *nextkid;
1523         int rcnt = 0;
1524
1525         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1526
1527         _cache_setunresolved(ncp);
1528         if (flags & CINV_DESTROY) {
1529                 ncp->nc_flag |= NCF_DESTROYED;
1530                 ++ncp->nc_generation;
1531         }
1532
1533         while ((flags & CINV_CHILDREN) &&
1534                (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1535         ) {
1536                 struct namecache *kid;
1537                 int restart;
1538
1539                 restart = 0;
1540                 _cache_hold(nextkid);
1541                 if (++track->depth > MAX_RECURSION_DEPTH) {
1542                         track->resume_ncp = ncp;
1543                         _cache_hold(ncp);
1544                         ++rcnt;
1545                 }
1546                 while ((kid = nextkid) != NULL) {
1547                         /*
1548                          * Parent (ncp) must be locked for the iteration.
1549                          */
1550                         nextkid = NULL;
1551                         if (kid->nc_parent != ncp) {
1552                                 _cache_drop(kid);
1553                                 kprintf("cache_inval_internal restartA %s\n",
1554                                         ncp->nc_name);
1555                                 restart = 1;
1556                                 break;
1557                         }
1558                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1559                                 _cache_hold(nextkid);
1560
1561                         /*
1562                          * Parent unlocked for this section to avoid
1563                          * deadlocks.  Then lock the kid and check for
1564                          * races.
1565                          */
1566                         _cache_unlock(ncp);
1567                         if (track->resume_ncp) {
1568                                 _cache_drop(kid);
1569                                 _cache_lock(ncp);
1570                                 break;
1571                         }
1572                         _cache_lock(kid);
1573                         if (kid->nc_parent != ncp) {
1574                                 kprintf("cache_inval_internal "
1575                                         "restartB %s\n",
1576                                         ncp->nc_name);
1577                                 restart = 1;
1578                                 _cache_unlock(kid);
1579                                 _cache_drop(kid);
1580                                 _cache_lock(ncp);
1581                                 break;
1582                         }
1583                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1584                             TAILQ_FIRST(&kid->nc_list)
1585                         ) {
1586
1587                                 rcnt += _cache_inval_internal(kid,
1588                                                 flags & ~CINV_DESTROY, track);
1589                                 /*_cache_unlock(kid);*/
1590                                 /*_cache_drop(kid);*/
1591                                 cache_zap(kid);
1592                         } else {
1593                                 cache_zap(kid);
1594                         }
1595
1596                         /*
1597                          * Relock parent to continue scan
1598                          */
1599                         _cache_lock(ncp);
1600                 }
1601                 if (nextkid)
1602                         _cache_drop(nextkid);
1603                 --track->depth;
1604                 if (restart == 0)
1605                         break;
1606         }
1607
1608         /*
1609          * Someone could have gotten in there while ncp was unlocked,
1610          * retry if so.
1611          */
1612         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1613                 ++rcnt;
1614         return (rcnt);
1615 }
1616
1617 /*
1618  * Invalidate a vnode's namecache associations.  To avoid races against
1619  * the resolver we do not invalidate a node which we previously invalidated
1620  * but which was then re-resolved while we were in the invalidation loop.
1621  *
1622  * Returns non-zero if any namecache entries remain after the invalidation
1623  * loop completed.
1624  *
1625  * NOTE: Unlike the namecache topology which guarentees that ncp's will not
1626  *       be ripped out of the topology while held, the vnode's v_namecache
1627  *       list has no such restriction.  NCP's can be ripped out of the list
1628  *       at virtually any time if not locked, even if held.
1629  *
1630  *       In addition, the v_namecache list itself must be locked via
1631  *       the vnode's spinlock.
1632  */
1633 int
1634 cache_inval_vp(struct vnode *vp, int flags)
1635 {
1636         struct namecache *ncp;
1637         struct namecache *next;
1638
1639 restart:
1640         spin_lock(&vp->v_spin);
1641         ncp = TAILQ_FIRST(&vp->v_namecache);
1642         if (ncp)
1643                 _cache_hold(ncp);
1644         while (ncp) {
1645                 /* loop entered with ncp held and vp spin-locked */
1646                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1647                         _cache_hold(next);
1648                 spin_unlock(&vp->v_spin);
1649                 _cache_lock(ncp);
1650                 if (ncp->nc_vp != vp) {
1651                         kprintf("Warning: cache_inval_vp: race-A detected on "
1652                                 "%s\n", ncp->nc_name);
1653                         _cache_put(ncp);
1654                         if (next)
1655                                 _cache_drop(next);
1656                         goto restart;
1657                 }
1658                 _cache_inval(ncp, flags);
1659                 _cache_put(ncp);                /* also releases reference */
1660                 ncp = next;
1661                 spin_lock(&vp->v_spin);
1662                 if (ncp && ncp->nc_vp != vp) {
1663                         spin_unlock(&vp->v_spin);
1664                         kprintf("Warning: cache_inval_vp: race-B detected on "
1665                                 "%s\n", ncp->nc_name);
1666                         _cache_drop(ncp);
1667                         goto restart;
1668                 }
1669         }
1670         spin_unlock(&vp->v_spin);
1671         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1672 }
1673
1674 /*
1675  * This routine is used instead of the normal cache_inval_vp() when we
1676  * are trying to recycle otherwise good vnodes.
1677  *
1678  * Return 0 on success, non-zero if not all namecache records could be
1679  * disassociated from the vnode (for various reasons).
1680  */
1681 int
1682 cache_inval_vp_nonblock(struct vnode *vp)
1683 {
1684         struct namecache *ncp;
1685         struct namecache *next;
1686
1687         spin_lock(&vp->v_spin);
1688         ncp = TAILQ_FIRST(&vp->v_namecache);
1689         if (ncp)
1690                 _cache_hold(ncp);
1691         while (ncp) {
1692                 /* loop entered with ncp held */
1693                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1694                         _cache_hold(next);
1695                 spin_unlock(&vp->v_spin);
1696                 if (_cache_lock_nonblock(ncp)) {
1697                         _cache_drop(ncp);
1698                         if (next)
1699                                 _cache_drop(next);
1700                         goto done;
1701                 }
1702                 if (ncp->nc_vp != vp) {
1703                         kprintf("Warning: cache_inval_vp: race-A detected on "
1704                                 "%s\n", ncp->nc_name);
1705                         _cache_put(ncp);
1706                         if (next)
1707                                 _cache_drop(next);
1708                         goto done;
1709                 }
1710                 _cache_inval(ncp, 0);
1711                 _cache_put(ncp);                /* also releases reference */
1712                 ncp = next;
1713                 spin_lock(&vp->v_spin);
1714                 if (ncp && ncp->nc_vp != vp) {
1715                         spin_unlock(&vp->v_spin);
1716                         kprintf("Warning: cache_inval_vp: race-B detected on "
1717                                 "%s\n", ncp->nc_name);
1718                         _cache_drop(ncp);
1719                         goto done;
1720                 }
1721         }
1722         spin_unlock(&vp->v_spin);
1723 done:
1724         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1725 }
1726
1727 /*
1728  * Clears the universal directory search 'ok' flag.  This flag allows
1729  * nlookup() to bypass normal vnode checks.  This flag is a cached flag
1730  * so clearing it simply forces revalidation.
1731  */
1732 void
1733 cache_inval_wxok(struct vnode *vp)
1734 {
1735         struct namecache *ncp;
1736
1737         spin_lock(&vp->v_spin);
1738         TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1739                 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX))
1740                         atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
1741         }
1742         spin_unlock(&vp->v_spin);
1743 }
1744
1745 /*
1746  * The source ncp has been renamed to the target ncp.  Both fncp and tncp
1747  * must be locked.  The target ncp is destroyed (as a normal rename-over
1748  * would destroy the target file or directory).
1749  *
1750  * Because there may be references to the source ncp we cannot copy its
1751  * contents to the target.  Instead the source ncp is relinked as the target
1752  * and the target ncp is removed from the namecache topology.
1753  */
1754 void
1755 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
1756 {
1757         struct namecache *fncp = fnch->ncp;
1758         struct namecache *tncp = tnch->ncp;
1759         struct namecache *tncp_par;
1760         struct nchash_head *nchpp;
1761         u_int32_t hash;
1762         char *oname;
1763         char *nname;
1764
1765         ++fncp->nc_generation;
1766         ++tncp->nc_generation;
1767         if (tncp->nc_nlen) {
1768                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
1769                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
1770                 nname[tncp->nc_nlen] = 0;
1771         } else {
1772                 nname = NULL;
1773         }
1774
1775         /*
1776          * Rename fncp (unlink)
1777          */
1778         _cache_unlink_parent(fncp);
1779         oname = fncp->nc_name;
1780         fncp->nc_name = nname;
1781         fncp->nc_nlen = tncp->nc_nlen;
1782         if (oname)
1783                 kfree(oname, M_VFSCACHE);
1784
1785         tncp_par = tncp->nc_parent;
1786         _cache_hold(tncp_par);
1787         _cache_lock(tncp_par);
1788
1789         /*
1790          * Rename fncp (relink)
1791          */
1792         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
1793         hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
1794         nchpp = NCHHASH(hash);
1795
1796         spin_lock(&nchpp->spin);
1797         _cache_link_parent(fncp, tncp_par, nchpp);
1798         spin_unlock(&nchpp->spin);
1799
1800         _cache_put(tncp_par);
1801
1802         /*
1803          * Get rid of the overwritten tncp (unlink)
1804          */
1805         _cache_unlink(tncp);
1806 }
1807
1808 /*
1809  * Perform actions consistent with unlinking a file.  The passed-in ncp
1810  * must be locked.
1811  *
1812  * The ncp is marked DESTROYED so it no longer shows up in searches,
1813  * and will be physically deleted when the vnode goes away.
1814  *
1815  * If the related vnode has no refs then we cycle it through vget()/vput()
1816  * to (possibly if we don't have a ref race) trigger a deactivation,
1817  * allowing the VFS to trivially detect and recycle the deleted vnode
1818  * via VOP_INACTIVE().
1819  *
1820  * NOTE: _cache_rename() will automatically call _cache_unlink() on the
1821  *       target ncp.
1822  */
1823 void
1824 cache_unlink(struct nchandle *nch)
1825 {
1826         _cache_unlink(nch->ncp);
1827 }
1828
1829 static void
1830 _cache_unlink(struct namecache *ncp)
1831 {
1832         struct vnode *vp;
1833
1834         /*
1835          * Causes lookups to fail and allows another ncp with the same
1836          * name to be created under ncp->nc_parent.
1837          */
1838         ncp->nc_flag |= NCF_DESTROYED;
1839         ++ncp->nc_generation;
1840
1841         /*
1842          * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
1843          * force action on the 1->0 transition.
1844          */
1845         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1846             (vp = ncp->nc_vp) != NULL) {
1847                 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
1848                 if (VREFCNT(vp) <= 0) {
1849                         if (vget(vp, LK_SHARED) == 0)
1850                                 vput(vp);
1851                 }
1852         }
1853 }
1854
1855 /*
1856  * Return non-zero if the nch might be associated with an open and/or mmap()'d
1857  * file.  The easy solution is to just return non-zero if the vnode has refs.
1858  * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
1859  * force the reclaim).
1860  */
1861 int
1862 cache_isopen(struct nchandle *nch)
1863 {
1864         struct vnode *vp;
1865         struct namecache *ncp = nch->ncp;
1866
1867         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1868             (vp = ncp->nc_vp) != NULL &&
1869             VREFCNT(vp)) {
1870                 return 1;
1871         }
1872         return 0;
1873 }
1874
1875
1876 /*
1877  * vget the vnode associated with the namecache entry.  Resolve the namecache
1878  * entry if necessary.  The passed ncp must be referenced and locked.  If
1879  * the ncp is resolved it might be locked shared.
1880  *
1881  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
1882  * (depending on the passed lk_type) will be returned in *vpp with an error
1883  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
1884  * most typical error is ENOENT, meaning that the ncp represents a negative
1885  * cache hit and there is no vnode to retrieve, but other errors can occur
1886  * too.
1887  *
1888  * The vget() can race a reclaim.  If this occurs we re-resolve the
1889  * namecache entry.
1890  *
1891  * There are numerous places in the kernel where vget() is called on a
1892  * vnode while one or more of its namecache entries is locked.  Releasing
1893  * a vnode never deadlocks against locked namecache entries (the vnode
1894  * will not get recycled while referenced ncp's exist).  This means we
1895  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
1896  * lock when acquiring the vp lock or we might cause a deadlock.
1897  *
1898  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1899  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1900  *       relocked exclusively before being re-resolved.
1901  */
1902 int
1903 cache_vget(struct nchandle *nch, struct ucred *cred,
1904            int lk_type, struct vnode **vpp)
1905 {
1906         struct namecache *ncp;
1907         struct vnode *vp;
1908         int error;
1909
1910         ncp = nch->ncp;
1911 again:
1912         vp = NULL;
1913         if (ncp->nc_flag & NCF_UNRESOLVED)
1914                 error = cache_resolve(nch, cred);
1915         else
1916                 error = 0;
1917
1918         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1919                 error = vget(vp, lk_type);
1920                 if (error) {
1921                         /*
1922                          * VRECLAIM race
1923                          *
1924                          * The ncp may have been locked shared, we must relock
1925                          * it exclusively before we can set it to unresolved.
1926                          */
1927                         if (error == ENOENT) {
1928                                 kprintf("Warning: vnode reclaim race detected "
1929                                         "in cache_vget on %p (%s)\n",
1930                                         vp, ncp->nc_name);
1931                                 _cache_unlock(ncp);
1932                                 _cache_lock(ncp);
1933                                 _cache_setunresolved(ncp);
1934                                 goto again;
1935                         }
1936
1937                         /*
1938                          * Not a reclaim race, some other error.
1939                          */
1940                         KKASSERT(ncp->nc_vp == vp);
1941                         vp = NULL;
1942                 } else {
1943                         KKASSERT(ncp->nc_vp == vp);
1944                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1945                 }
1946         }
1947         if (error == 0 && vp == NULL)
1948                 error = ENOENT;
1949         *vpp = vp;
1950         return(error);
1951 }
1952
1953 /*
1954  * Similar to cache_vget() but only acquires a ref on the vnode.  The vnode
1955  * is already held by virtuue of the ncp being locked, but it might not be
1956  * referenced and while it is not referenced it can transition into the
1957  * VRECLAIMED state.
1958  *
1959  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1960  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1961  *       relocked exclusively before being re-resolved.
1962  *
1963  * NOTE: At the moment we have to issue a vget() on the vnode, even though
1964  *       we are going to immediately release the lock, in order to resolve
1965  *       potential reclamation races.  Once we have a solid vnode ref that
1966  *       was (at some point) interlocked via a vget(), the vnode will not
1967  *       be reclaimed.
1968  *
1969  * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
1970  */
1971 int
1972 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
1973 {
1974         struct namecache *ncp;
1975         struct vnode *vp;
1976         int error;
1977         int v;
1978
1979         ncp = nch->ncp;
1980 again:
1981         vp = NULL;
1982         if (ncp->nc_flag & NCF_UNRESOLVED)
1983                 error = cache_resolve(nch, cred);
1984         else
1985                 error = 0;
1986
1987         while (error == 0 && (vp = ncp->nc_vp) != NULL) {
1988                 /*
1989                  * Try a lockless ref of the vnode.  VRECLAIMED transitions
1990                  * use the vx_lock state and update-counter mechanism so we
1991                  * can detect if one is in-progress or occurred.
1992                  *
1993                  * If we can successfully ref the vnode and interlock against
1994                  * the update-counter mechanism, and VRECLAIMED is found to
1995                  * not be set after that, we should be good.
1996                  */
1997                 v = spin_access_start_only(&vp->v_spin);
1998                 if (__predict_true(spin_access_check_inprog(v) == 0)) {
1999                         vref_special(vp);
2000                         if (__predict_false(
2001                                     spin_access_end_only(&vp->v_spin, v))) {
2002                                 vrele(vp);
2003                                 kprintf("CACHE_VREF: RACED %p\n", vp);
2004                                 continue;
2005                         }
2006                         if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
2007                                 break;
2008                         }
2009                         vrele(vp);
2010                         kprintf("CACHE_VREF: IN-RECLAIM\n");
2011                 }
2012
2013                 /*
2014                  * Do it the slow way
2015                  */
2016                 error = vget(vp, LK_SHARED);
2017                 if (error) {
2018                         /*
2019                          * VRECLAIM race
2020                          */
2021                         if (error == ENOENT) {
2022                                 kprintf("Warning: vnode reclaim race detected "
2023                                         "in cache_vget on %p (%s)\n",
2024                                         vp, ncp->nc_name);
2025                                 _cache_unlock(ncp);
2026                                 _cache_lock(ncp);
2027                                 _cache_setunresolved(ncp);
2028                                 goto again;
2029                         }
2030
2031                         /*
2032                          * Not a reclaim race, some other error.
2033                          */
2034                         KKASSERT(ncp->nc_vp == vp);
2035                         vp = NULL;
2036                 } else {
2037                         KKASSERT(ncp->nc_vp == vp);
2038                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2039                         /* caller does not want a lock */
2040                         vn_unlock(vp);
2041                 }
2042                 break;
2043         }
2044         if (error == 0 && vp == NULL)
2045                 error = ENOENT;
2046         *vpp = vp;
2047
2048         return(error);
2049 }
2050
2051 /*
2052  * Return a referenced vnode representing the parent directory of
2053  * ncp.
2054  *
2055  * Because the caller has locked the ncp it should not be possible for
2056  * the parent ncp to go away.  However, the parent can unresolve its
2057  * dvp at any time so we must be able to acquire a lock on the parent
2058  * to safely access nc_vp.
2059  *
2060  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2061  * so use vhold()/vdrop() while holding the lock to prevent dvp from
2062  * getting destroyed.
2063  *
2064  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2065  *       lock on the ncp in question..
2066  */
2067 struct vnode *
2068 cache_dvpref(struct namecache *ncp)
2069 {
2070         struct namecache *par;
2071         struct vnode *dvp;
2072
2073         dvp = NULL;
2074         if ((par = ncp->nc_parent) != NULL) {
2075                 _cache_hold(par);
2076                 _cache_lock(par);
2077                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
2078                         if ((dvp = par->nc_vp) != NULL)
2079                                 vhold(dvp);
2080                 }
2081                 _cache_unlock(par);
2082                 if (dvp) {
2083                         if (vget(dvp, LK_SHARED) == 0) {
2084                                 vn_unlock(dvp);
2085                                 vdrop(dvp);
2086                                 /* return refd, unlocked dvp */
2087                         } else {
2088                                 vdrop(dvp);
2089                                 dvp = NULL;
2090                         }
2091                 }
2092                 _cache_drop(par);
2093         }
2094         return(dvp);
2095 }
2096
2097 /*
2098  * Convert a directory vnode to a namecache record without any other
2099  * knowledge of the topology.  This ONLY works with directory vnodes and
2100  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
2101  * returned ncp (if not NULL) will be held and unlocked.
2102  *
2103  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2104  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2105  * for dvp.  This will fail only if the directory has been deleted out from
2106  * under the caller.
2107  *
2108  * Callers must always check for a NULL return no matter the value of 'makeit'.
2109  *
2110  * To avoid underflowing the kernel stack each recursive call increments
2111  * the makeit variable.
2112  */
2113
2114 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2115                                   struct vnode *dvp, char *fakename);
2116 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2117                                   struct vnode **saved_dvp);
2118
2119 int
2120 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2121               struct nchandle *nch)
2122 {
2123         struct vnode *saved_dvp;
2124         struct vnode *pvp;
2125         char *fakename;
2126         int error;
2127
2128         nch->ncp = NULL;
2129         nch->mount = dvp->v_mount;
2130         saved_dvp = NULL;
2131         fakename = NULL;
2132
2133         /*
2134          * Handle the makeit == 0 degenerate case
2135          */
2136         if (makeit == 0) {
2137                 spin_lock_shared(&dvp->v_spin);
2138                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2139                 if (nch->ncp)
2140                         cache_hold(nch);
2141                 spin_unlock_shared(&dvp->v_spin);
2142         }
2143
2144         /*
2145          * Loop until resolution, inside code will break out on error.
2146          */
2147         while (makeit) {
2148                 /*
2149                  * Break out if we successfully acquire a working ncp.
2150                  */
2151                 spin_lock_shared(&dvp->v_spin);
2152                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2153                 if (nch->ncp) {
2154                         cache_hold(nch);
2155                         spin_unlock_shared(&dvp->v_spin);
2156                         break;
2157                 }
2158                 spin_unlock_shared(&dvp->v_spin);
2159
2160                 /*
2161                  * If dvp is the root of its filesystem it should already
2162                  * have a namecache pointer associated with it as a side
2163                  * effect of the mount, but it may have been disassociated.
2164                  */
2165                 if (dvp->v_flag & VROOT) {
2166                         nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2167                         error = cache_resolve_mp(nch->mount);
2168                         _cache_put(nch->ncp);
2169                         if (ncvp_debug) {
2170                                 kprintf("cache_fromdvp: resolve root of mount %p error %d",
2171                                         dvp->v_mount, error);
2172                         }
2173                         if (error) {
2174                                 if (ncvp_debug)
2175                                         kprintf(" failed\n");
2176                                 nch->ncp = NULL;
2177                                 break;
2178                         }
2179                         if (ncvp_debug)
2180                                 kprintf(" succeeded\n");
2181                         continue;
2182                 }
2183
2184                 /*
2185                  * If we are recursed too deeply resort to an O(n^2)
2186                  * algorithm to resolve the namecache topology.  The
2187                  * resolved pvp is left referenced in saved_dvp to
2188                  * prevent the tree from being destroyed while we loop.
2189                  */
2190                 if (makeit > 20) {
2191                         error = cache_fromdvp_try(dvp, cred, &saved_dvp);
2192                         if (error) {
2193                                 kprintf("lookupdotdot(longpath) failed %d "
2194                                        "dvp %p\n", error, dvp);
2195                                 nch->ncp = NULL;
2196                                 break;
2197                         }
2198                         continue;
2199                 }
2200
2201                 /*
2202                  * Get the parent directory and resolve its ncp.
2203                  */
2204                 if (fakename) {
2205                         kfree(fakename, M_TEMP);
2206                         fakename = NULL;
2207                 }
2208                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2209                                           &fakename);
2210                 if (error) {
2211                         kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
2212                         break;
2213                 }
2214                 vn_unlock(pvp);
2215
2216                 /*
2217                  * Reuse makeit as a recursion depth counter.  On success
2218                  * nch will be fully referenced.
2219                  */
2220                 cache_fromdvp(pvp, cred, makeit + 1, nch);
2221                 vrele(pvp);
2222                 if (nch->ncp == NULL)
2223                         break;
2224
2225                 /*
2226                  * Do an inefficient scan of pvp (embodied by ncp) to look
2227                  * for dvp.  This will create a namecache record for dvp on
2228                  * success.  We loop up to recheck on success.
2229                  *
2230                  * ncp and dvp are both held but not locked.
2231                  */
2232                 error = cache_inefficient_scan(nch, cred, dvp, fakename);
2233                 if (error) {
2234                         kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
2235                                 pvp, nch->ncp->nc_name, dvp);
2236                         cache_drop(nch);
2237                         /* nch was NULLed out, reload mount */
2238                         nch->mount = dvp->v_mount;
2239                         break;
2240                 }
2241                 if (ncvp_debug) {
2242                         kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
2243                                 pvp, nch->ncp->nc_name);
2244                 }
2245                 cache_drop(nch);
2246                 /* nch was NULLed out, reload mount */
2247                 nch->mount = dvp->v_mount;
2248         }
2249
2250         /*
2251          * If nch->ncp is non-NULL it will have been held already.
2252          */
2253         if (fakename)
2254                 kfree(fakename, M_TEMP);
2255         if (saved_dvp)
2256                 vrele(saved_dvp);
2257         if (nch->ncp)
2258                 return (0);
2259         return (EINVAL);
2260 }
2261
2262 /*
2263  * Go up the chain of parent directories until we find something
2264  * we can resolve into the namecache.  This is very inefficient.
2265  */
2266 static
2267 int
2268 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2269                   struct vnode **saved_dvp)
2270 {
2271         struct nchandle nch;
2272         struct vnode *pvp;
2273         int error;
2274         static time_t last_fromdvp_report;
2275         char *fakename;
2276
2277         /*
2278          * Loop getting the parent directory vnode until we get something we
2279          * can resolve in the namecache.
2280          */
2281         vref(dvp);
2282         nch.mount = dvp->v_mount;
2283         nch.ncp = NULL;
2284         fakename = NULL;
2285
2286         for (;;) {
2287                 if (fakename) {
2288                         kfree(fakename, M_TEMP);
2289                         fakename = NULL;
2290                 }
2291                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2292                                           &fakename);
2293                 if (error) {
2294                         vrele(dvp);
2295                         break;
2296                 }
2297                 vn_unlock(pvp);
2298                 spin_lock_shared(&pvp->v_spin);
2299                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2300                         _cache_hold(nch.ncp);
2301                         spin_unlock_shared(&pvp->v_spin);
2302                         vrele(pvp);
2303                         break;
2304                 }
2305                 spin_unlock_shared(&pvp->v_spin);
2306                 if (pvp->v_flag & VROOT) {
2307                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2308                         error = cache_resolve_mp(nch.mount);
2309                         _cache_unlock(nch.ncp);
2310                         vrele(pvp);
2311                         if (error) {
2312                                 _cache_drop(nch.ncp);
2313                                 nch.ncp = NULL;
2314                                 vrele(dvp);
2315                         }
2316                         break;
2317                 }
2318                 vrele(dvp);
2319                 dvp = pvp;
2320         }
2321         if (error == 0) {
2322                 if (last_fromdvp_report != time_uptime) {
2323                         last_fromdvp_report = time_uptime;
2324                         kprintf("Warning: extremely inefficient path "
2325                                 "resolution on %s\n",
2326                                 nch.ncp->nc_name);
2327                 }
2328                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2329
2330                 /*
2331                  * Hopefully dvp now has a namecache record associated with
2332                  * it.  Leave it referenced to prevent the kernel from
2333                  * recycling the vnode.  Otherwise extremely long directory
2334                  * paths could result in endless recycling.
2335                  */
2336                 if (*saved_dvp)
2337                     vrele(*saved_dvp);
2338                 *saved_dvp = dvp;
2339                 _cache_drop(nch.ncp);
2340         }
2341         if (fakename)
2342                 kfree(fakename, M_TEMP);
2343         return (error);
2344 }
2345
2346 /*
2347  * Do an inefficient scan of the directory represented by ncp looking for
2348  * the directory vnode dvp.  ncp must be held but not locked on entry and
2349  * will be held on return.  dvp must be refd but not locked on entry and
2350  * will remain refd on return.
2351  *
2352  * Why do this at all?  Well, due to its stateless nature the NFS server
2353  * converts file handles directly to vnodes without necessarily going through
2354  * the namecache ops that would otherwise create the namecache topology
2355  * leading to the vnode.  We could either (1) Change the namecache algorithms
2356  * to allow disconnect namecache records that are re-merged opportunistically,
2357  * or (2) Make the NFS server backtrack and scan to recover a connected
2358  * namecache topology in order to then be able to issue new API lookups.
2359  *
2360  * It turns out that (1) is a huge mess.  It takes a nice clean set of
2361  * namecache algorithms and introduces a lot of complication in every subsystem
2362  * that calls into the namecache to deal with the re-merge case, especially
2363  * since we are using the namecache to placehold negative lookups and the
2364  * vnode might not be immediately assigned. (2) is certainly far less
2365  * efficient then (1), but since we are only talking about directories here
2366  * (which are likely to remain cached), the case does not actually run all
2367  * that often and has the supreme advantage of not polluting the namecache
2368  * algorithms.
2369  *
2370  * If a fakename is supplied just construct a namecache entry using the
2371  * fake name.
2372  */
2373 static int
2374 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2375                        struct vnode *dvp, char *fakename)
2376 {
2377         struct nlcomponent nlc;
2378         struct nchandle rncp;
2379         struct dirent *den;
2380         struct vnode *pvp;
2381         struct vattr vat;
2382         struct iovec iov;
2383         struct uio uio;
2384         int blksize;
2385         int eofflag;
2386         int bytes;
2387         char *rbuf;
2388         int error;
2389
2390         vat.va_blocksize = 0;
2391         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2392                 return (error);
2393         cache_lock(nch);
2394         error = cache_vref(nch, cred, &pvp);
2395         cache_unlock(nch);
2396         if (error)
2397                 return (error);
2398         if (ncvp_debug) {
2399                 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2400                         "vattr fileid = %lld\n",
2401                         nch->ncp, nch->ncp->nc_name,
2402                         vat.va_blocksize,
2403                         (long long)vat.va_fileid);
2404         }
2405
2406         /*
2407          * Use the supplied fakename if not NULL.  Fake names are typically
2408          * not in the actual filesystem hierarchy.  This is used by HAMMER
2409          * to glue @@timestamp recursions together.
2410          */
2411         if (fakename) {
2412                 nlc.nlc_nameptr = fakename;
2413                 nlc.nlc_namelen = strlen(fakename);
2414                 rncp = cache_nlookup(nch, &nlc);
2415                 goto done;
2416         }
2417
2418         if ((blksize = vat.va_blocksize) == 0)
2419                 blksize = DEV_BSIZE;
2420         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2421         rncp.ncp = NULL;
2422
2423         eofflag = 0;
2424         uio.uio_offset = 0;
2425 again:
2426         iov.iov_base = rbuf;
2427         iov.iov_len = blksize;
2428         uio.uio_iov = &iov;
2429         uio.uio_iovcnt = 1;
2430         uio.uio_resid = blksize;
2431         uio.uio_segflg = UIO_SYSSPACE;
2432         uio.uio_rw = UIO_READ;
2433         uio.uio_td = curthread;
2434
2435         if (ncvp_debug >= 2)
2436                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2437         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2438         if (error == 0) {
2439                 den = (struct dirent *)rbuf;
2440                 bytes = blksize - uio.uio_resid;
2441
2442                 while (bytes > 0) {
2443                         if (ncvp_debug >= 2) {
2444                                 kprintf("cache_inefficient_scan: %*.*s\n",
2445                                         den->d_namlen, den->d_namlen,
2446                                         den->d_name);
2447                         }
2448                         if (den->d_type != DT_WHT &&
2449                             den->d_ino == vat.va_fileid) {
2450                                 if (ncvp_debug) {
2451                                         kprintf("cache_inefficient_scan: "
2452                                                "MATCHED inode %lld path %s/%*.*s\n",
2453                                                (long long)vat.va_fileid,
2454                                                nch->ncp->nc_name,
2455                                                den->d_namlen, den->d_namlen,
2456                                                den->d_name);
2457                                 }
2458                                 nlc.nlc_nameptr = den->d_name;
2459                                 nlc.nlc_namelen = den->d_namlen;
2460                                 rncp = cache_nlookup(nch, &nlc);
2461                                 KKASSERT(rncp.ncp != NULL);
2462                                 break;
2463                         }
2464                         bytes -= _DIRENT_DIRSIZ(den);
2465                         den = _DIRENT_NEXT(den);
2466                 }
2467                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2468                         goto again;
2469         }
2470         kfree(rbuf, M_TEMP);
2471 done:
2472         vrele(pvp);
2473         if (rncp.ncp) {
2474                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2475                         _cache_setvp(rncp.mount, rncp.ncp, dvp);
2476                         if (ncvp_debug >= 2) {
2477                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2478                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2479                         }
2480                 } else {
2481                         if (ncvp_debug >= 2) {
2482                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2483                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2484                                         rncp.ncp->nc_vp);
2485                         }
2486                 }
2487                 if (rncp.ncp->nc_vp == NULL)
2488                         error = rncp.ncp->nc_error;
2489                 /*
2490                  * Release rncp after a successful nlookup.  rncp was fully
2491                  * referenced.
2492                  */
2493                 cache_put(&rncp);
2494         } else {
2495                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2496                         dvp, nch->ncp->nc_name);
2497                 error = ENOENT;
2498         }
2499         return (error);
2500 }
2501
2502 /*
2503  * This function must be called with the ncp held and locked and will unlock
2504  * and drop it during zapping.
2505  *
2506  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2507  * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list
2508  * and removes the related reference.  If the ncp can be removed, and the
2509  * parent can be zapped non-blocking, this function loops up.
2510  *
2511  * There will be one ref from the caller (which we now own).  The only
2512  * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list,
2513  * so possibly 2 refs left.  Taking this into account, if there are no
2514  * additional refs and no children, the ncp will be removed from the topology
2515  * and destroyed.
2516  *
2517  * References and/or children may exist if the ncp is in the middle of the
2518  * topology, preventing the ncp from being destroyed.
2519  *
2520  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2521  *
2522  * This function may return a held (but NOT locked) parent node which the
2523  * caller must drop in a loop.  Looping is one way to avoid unbounded recursion
2524  * due to deep namecache trees.
2525  *
2526  * WARNING!  For MPSAFE operation this routine must acquire up to three
2527  *           spin locks to be able to safely test nc_refs.  Lock order is
2528  *           very important.
2529  *
2530  *           hash spinlock if on hash list
2531  *           parent spinlock if child of parent
2532  *           (the ncp is unresolved so there is no vnode association)
2533  */
2534 static void
2535 cache_zap(struct namecache *ncp)
2536 {
2537         struct namecache *par;
2538         struct vnode *dropvp;
2539         struct nchash_head *nchpp;
2540         int refcmp;
2541         int nonblock = 1;       /* XXX cleanup */
2542
2543 again:
2544         /*
2545          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2546          * This gets rid of any vp->v_namecache list or negative list and
2547          * the related ref.
2548          */
2549         _cache_setunresolved(ncp);
2550
2551         /*
2552          * Try to scrap the entry and possibly tail-recurse on its parent.
2553          * We only scrap unref'd (other then our ref) unresolved entries,
2554          * we do not scrap 'live' entries.
2555          *
2556          * If nc_parent is non NULL we expect 2 references, else just 1.
2557          * If there are more, someone else also holds the ncp and we cannot
2558          * destroy it.
2559          */
2560         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2561         KKASSERT(ncp->nc_refs > 0);
2562
2563         /*
2564          * If the ncp is linked to its parent it will also be in the hash
2565          * table.  We have to be able to lock the parent and the hash table.
2566          *
2567          * Acquire locks.  Note that the parent can't go away while we hold
2568          * a child locked.  If nc_parent is present, expect 2 refs instead
2569          * of 1.
2570          */
2571         nchpp = NULL;
2572         if ((par = ncp->nc_parent) != NULL) {
2573                 if (nonblock) {
2574                         if (_cache_lock_nonblock(par)) {
2575                                 /* lock failed */
2576                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2577                                 atomic_add_long(
2578                                     &pcpu_ncache[mycpu->gd_cpuid].numdefered,
2579                                     1);
2580                                 _cache_unlock(ncp);
2581                                 _cache_drop(ncp);       /* caller's ref */
2582                                 return;
2583                         }
2584                         _cache_hold(par);
2585                 } else {
2586                         _cache_hold(par);
2587                         _cache_lock(par);
2588                 }
2589                 nchpp = ncp->nc_head;
2590                 spin_lock(&nchpp->spin);
2591         }
2592
2593         /*
2594          * With the parent and nchpp locked, and the vnode removed
2595          * (no vp->v_namecache), we expect 1 or 2 refs.  If there are
2596          * more someone else has a ref and we cannot zap the entry.
2597          *
2598          * one for our hold
2599          * one for our parent link (parent also has one from the linkage)
2600          */
2601         if (par)
2602                 refcmp = 2;
2603         else
2604                 refcmp = 1;
2605
2606         /*
2607          * On failure undo the work we've done so far and drop the
2608          * caller's ref and ncp.
2609          */
2610         if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) {
2611                 if (par) {
2612                         spin_unlock(&nchpp->spin);
2613                         _cache_put(par);
2614                 }
2615                 _cache_unlock(ncp);
2616                 _cache_drop(ncp);
2617                 return;
2618         }
2619
2620         /*
2621          * We own all the refs and with the spinlocks held no further
2622          * refs can be acquired by others.
2623          *
2624          * Remove us from the hash list and parent list.  We have to
2625          * drop a ref on the parent's vp if the parent's list becomes
2626          * empty.
2627          */
2628         dropvp = NULL;
2629         if (par) {
2630                 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
2631
2632                 KKASSERT(nchpp == ncp->nc_head);
2633                 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
2634                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2635                 atomic_add_long(&pn->vfscache_count, -1);
2636                 if (TAILQ_EMPTY(&ncp->nc_list))
2637                         atomic_add_long(&pn->vfscache_leafs, -1);
2638
2639                 if (TAILQ_EMPTY(&par->nc_list)) {
2640                         atomic_add_long(&pn->vfscache_leafs, 1);
2641                         if (par->nc_vp)
2642                                 dropvp = par->nc_vp;
2643                 }
2644                 ncp->nc_parent = NULL;
2645                 ncp->nc_head = NULL;
2646                 spin_unlock(&nchpp->spin);
2647                 _cache_drop(par);       /* removal of ncp from par->nc_list */
2648                 /*_cache_unlock(par);*/
2649         } else {
2650                 KKASSERT(ncp->nc_head == NULL);
2651         }
2652
2653         /*
2654          * ncp should not have picked up any refs.  Physically
2655          * destroy the ncp.
2656          */
2657         if (ncp->nc_refs != refcmp) {
2658                 panic("cache_zap: %p bad refs %d (expected %d)\n",
2659                         ncp, ncp->nc_refs, refcmp);
2660         }
2661         /* _cache_unlock(ncp) not required */
2662         ncp->nc_refs = -1;      /* safety */
2663         if (ncp->nc_name)
2664                 kfree(ncp->nc_name, M_VFSCACHE);
2665         kfree(ncp, M_VFSCACHE);
2666
2667         /*
2668          * Delayed drop (we had to release our spinlocks)
2669          */
2670         if (dropvp)
2671                 vdrop(dropvp);
2672
2673         /*
2674          * Loop up if we can recursively clean out the parent.
2675          */
2676         if (par) {
2677                 refcmp = 1;             /* ref on parent */
2678                 if (par->nc_parent)     /* par->par */
2679                         ++refcmp;
2680                 par->nc_flag &= ~NCF_DEFEREDZAP;
2681                 if ((par->nc_flag & NCF_UNRESOLVED) &&
2682                     par->nc_refs == refcmp &&
2683                     TAILQ_EMPTY(&par->nc_list)) {
2684                         ncp = par;
2685                         goto again;
2686                 }
2687                 _cache_unlock(par);
2688                 _cache_drop(par);
2689         }
2690 }
2691
2692 /*
2693  * Clean up dangling negative cache and defered-drop entries in the
2694  * namecache.
2695  *
2696  * This routine is called in the critical path and also called from
2697  * vnlru().  When called from vnlru we use a lower limit to try to
2698  * deal with the negative cache before the critical path has to start
2699  * dealing with it.
2700  */
2701 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2702
2703 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2704 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2705
2706 void
2707 cache_hysteresis(int critpath)
2708 {
2709         long poslimit;
2710         long neglimit = maxvnodes / ncnegfactor;
2711         long xnumcache = vfscache_leafs;
2712
2713         if (critpath == 0)
2714                 neglimit = neglimit * 8 / 10;
2715
2716         /*
2717          * Don't cache too many negative hits.  We use hysteresis to reduce
2718          * the impact on the critical path.
2719          */
2720         switch(neg_cache_hysteresis_state[critpath]) {
2721         case CHI_LOW:
2722                 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
2723                         if (critpath)
2724                                 _cache_cleanneg(ncnegflush);
2725                         else
2726                                 _cache_cleanneg(ncnegflush +
2727                                                 vfscache_negs - neglimit);
2728                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
2729                 }
2730                 break;
2731         case CHI_HIGH:
2732                 if (vfscache_negs > MINNEG * 9 / 10 &&
2733                     vfscache_negs * 9 / 10 > neglimit
2734                 ) {
2735                         if (critpath)
2736                                 _cache_cleanneg(ncnegflush);
2737                         else
2738                                 _cache_cleanneg(ncnegflush +
2739                                                 vfscache_negs * 9 / 10 -
2740                                                 neglimit);
2741                 } else {
2742                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
2743                 }
2744                 break;
2745         }
2746
2747         /*
2748          * Don't cache too many positive hits.  We use hysteresis to reduce
2749          * the impact on the critical path.
2750          *
2751          * Excessive positive hits can accumulate due to large numbers of
2752          * hardlinks (the vnode cache will not prevent hl ncps from growing
2753          * into infinity).
2754          */
2755         if ((poslimit = ncposlimit) == 0)
2756                 poslimit = maxvnodes * 2;
2757         if (critpath == 0)
2758                 poslimit = poslimit * 8 / 10;
2759
2760         switch(pos_cache_hysteresis_state[critpath]) {
2761         case CHI_LOW:
2762                 if (xnumcache > poslimit && xnumcache > MINPOS) {
2763                         if (critpath)
2764                                 _cache_cleanpos(ncposflush);
2765                         else
2766                                 _cache_cleanpos(ncposflush +
2767                                                 xnumcache - poslimit);
2768                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2769                 }
2770                 break;
2771         case CHI_HIGH:
2772                 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2773                         if (critpath)
2774                                 _cache_cleanpos(ncposflush);
2775                         else
2776                                 _cache_cleanpos(ncposflush +
2777                                                 xnumcache - poslimit * 5 / 6);
2778                 } else {
2779                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
2780                 }
2781                 break;
2782         }
2783
2784         /*
2785          * Clean out dangling defered-zap ncps which could not be cleanly
2786          * dropped if too many build up.  Note that numdefered is
2787          * heuristical.  Make sure we are real-time for the current cpu,
2788          * plus the global rollup.
2789          */
2790         if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
2791                 _cache_cleandefered();
2792         }
2793 }
2794
2795 /*
2796  * NEW NAMECACHE LOOKUP API
2797  *
2798  * Lookup an entry in the namecache.  The passed par_nch must be referenced
2799  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
2800  * is ALWAYS returned, eve if the supplied component is illegal.
2801  *
2802  * The resulting namecache entry should be returned to the system with
2803  * cache_put() or cache_unlock() + cache_drop().
2804  *
2805  * namecache locks are recursive but care must be taken to avoid lock order
2806  * reversals (hence why the passed par_nch must be unlocked).  Locking
2807  * rules are to order for parent traversals, not for child traversals.
2808  *
2809  * Nobody else will be able to manipulate the associated namespace (e.g.
2810  * create, delete, rename, rename-target) until the caller unlocks the
2811  * entry.
2812  *
2813  * The returned entry will be in one of three states:  positive hit (non-null
2814  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2815  * Unresolved entries must be resolved through the filesystem to associate the
2816  * vnode and/or determine whether a positive or negative hit has occured.
2817  *
2818  * It is not necessary to lock a directory in order to lock namespace under
2819  * that directory.  In fact, it is explicitly not allowed to do that.  A
2820  * directory is typically only locked when being created, renamed, or
2821  * destroyed.
2822  *
2823  * The directory (par) may be unresolved, in which case any returned child
2824  * will likely also be marked unresolved.  Likely but not guarenteed.  Since
2825  * the filesystem lookup requires a resolved directory vnode the caller is
2826  * responsible for resolving the namecache chain top-down.  This API
2827  * specifically allows whole chains to be created in an unresolved state.
2828  */
2829 struct nchandle
2830 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2831 {
2832         struct nchandle nch;
2833         struct namecache *ncp;
2834         struct namecache *new_ncp;
2835         struct namecache *rep_ncp;      /* reuse a destroyed ncp */
2836         struct nchash_head *nchpp;
2837         struct mount *mp;
2838         u_int32_t hash;
2839         globaldata_t gd;
2840         int par_locked;
2841
2842         gd = mycpu;
2843         mp = par_nch->mount;
2844         par_locked = 0;
2845
2846         /*
2847          * This is a good time to call it, no ncp's are locked by
2848          * the caller or us.
2849          */
2850         cache_hysteresis(1);
2851
2852         /*
2853          * Try to locate an existing entry
2854          */
2855         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2856         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2857         new_ncp = NULL;
2858         nchpp = NCHHASH(hash);
2859 restart:
2860         rep_ncp = NULL;
2861         if (new_ncp)
2862                 spin_lock(&nchpp->spin);
2863         else
2864                 spin_lock_shared(&nchpp->spin);
2865
2866         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
2867                 /*
2868                  * Break out if we find a matching entry.  Note that
2869                  * UNRESOLVED entries may match, but DESTROYED entries
2870                  * do not.
2871                  *
2872                  * We may be able to reuse DESTROYED entries that we come
2873                  * across, even if the name does not match, as long as
2874                  * nc_nlen is correct and the only hold ref is from the nchpp
2875                  * list itself.
2876                  */
2877                 if (ncp->nc_parent == par_nch->ncp &&
2878                     ncp->nc_nlen == nlc->nlc_namelen) {
2879                         if (ncp->nc_flag & NCF_DESTROYED) {
2880                                 if (ncp->nc_refs == 1 && rep_ncp == NULL)
2881                                         rep_ncp = ncp;
2882                                 continue;
2883                         }
2884                         if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))
2885                                 continue;
2886                         _cache_hold(ncp);
2887                         if (new_ncp)
2888                                 spin_unlock(&nchpp->spin);
2889                         else
2890                                 spin_unlock_shared(&nchpp->spin);
2891                         if (par_locked) {
2892                                 _cache_unlock(par_nch->ncp);
2893                                 par_locked = 0;
2894                         }
2895                         if (_cache_lock_special(ncp) == 0) {
2896                                 /*
2897                                  * Successfully locked but we must re-test
2898                                  * conditions that might have changed since
2899                                  * we did not have the lock before.
2900                                  */
2901                                 if (ncp->nc_parent != par_nch->ncp ||
2902                                     ncp->nc_nlen != nlc->nlc_namelen ||
2903                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
2904                                          ncp->nc_nlen) ||
2905                                     (ncp->nc_flag & NCF_DESTROYED)) {
2906                                         _cache_put(ncp);
2907                                         goto restart;
2908                                 }
2909                                 _cache_auto_unresolve(mp, ncp);
2910                                 if (new_ncp)
2911                                         _cache_free(new_ncp);
2912                                 goto found;
2913                         }
2914                         _cache_get(ncp);        /* cycle the lock to block */
2915                         _cache_put(ncp);
2916                         _cache_drop(ncp);
2917                         goto restart;
2918                 }
2919         }
2920
2921         /*
2922          * We failed to locate the entry, try to resurrect a destroyed
2923          * entry that we did find that is already correctly linked into
2924          * nchpp and the parent.  We must re-test conditions after
2925          * successfully locking rep_ncp.
2926          *
2927          * This case can occur under heavy loads due to not being able
2928          * to safely lock the parent in cache_zap().  Nominally a repeated
2929          * create/unlink load, but only the namelen needs to match.
2930          */
2931         if (rep_ncp && new_ncp == NULL) {
2932                 if (_cache_lock_nonblock(rep_ncp) == 0) {
2933                         _cache_hold(rep_ncp);
2934                         if (rep_ncp->nc_parent == par_nch->ncp &&
2935                             rep_ncp->nc_nlen == nlc->nlc_namelen &&
2936                             (rep_ncp->nc_flag & NCF_DESTROYED) &&
2937                             rep_ncp->nc_refs == 2) {
2938                                 /*
2939                                  * Update nc_name as reuse as new.
2940                                  */
2941                                 ncp = rep_ncp;
2942                                 bcopy(nlc->nlc_nameptr, ncp->nc_name,
2943                                       nlc->nlc_namelen);
2944                                 spin_unlock_shared(&nchpp->spin);
2945                                 _cache_setunresolved(ncp);
2946                                 ncp->nc_flag = NCF_UNRESOLVED;
2947                                 ncp->nc_error = ENOTCONN;
2948                                 goto found;
2949                         }
2950                         _cache_put(rep_ncp);
2951                 }
2952         }
2953
2954         /*
2955          * Otherwise create a new entry and add it to the cache.  The parent
2956          * ncp must also be locked so we can link into it.
2957          *
2958          * We have to relookup after possibly blocking in kmalloc or
2959          * when locking par_nch.
2960          *
2961          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2962          *       mount case, in which case nc_name will be NULL.
2963          */
2964         if (new_ncp == NULL) {
2965                 spin_unlock_shared(&nchpp->spin);
2966                 new_ncp = cache_alloc(nlc->nlc_namelen);
2967                 if (nlc->nlc_namelen) {
2968                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2969                               nlc->nlc_namelen);
2970                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
2971                 }
2972                 goto restart;
2973         }
2974
2975         /*
2976          * NOTE! The spinlock is held exclusively here because new_ncp
2977          *       is non-NULL.
2978          */
2979         if (par_locked == 0) {
2980                 spin_unlock(&nchpp->spin);
2981                 _cache_lock(par_nch->ncp);
2982                 par_locked = 1;
2983                 goto restart;
2984         }
2985
2986         /*
2987          * Link to parent (requires another ref, the one already in new_ncp
2988          * is what we wil lreturn).
2989          *
2990          * WARNING!  We still hold the spinlock.  We have to set the hash
2991          *           table entry atomically.
2992          */
2993         ncp = new_ncp;
2994         ++ncp->nc_refs;
2995         _cache_link_parent(ncp, par_nch->ncp, nchpp);
2996         spin_unlock(&nchpp->spin);
2997         _cache_unlock(par_nch->ncp);
2998         /* par_locked = 0 - not used */
2999 found:
3000         /*
3001          * stats and namecache size management
3002          */
3003         if (ncp->nc_flag & NCF_UNRESOLVED)
3004                 ++gd->gd_nchstats->ncs_miss;
3005         else if (ncp->nc_vp)
3006                 ++gd->gd_nchstats->ncs_goodhits;
3007         else
3008                 ++gd->gd_nchstats->ncs_neghits;
3009         nch.mount = mp;
3010         nch.ncp = ncp;
3011         _cache_mntref(nch.mount);
3012
3013         return(nch);
3014 }
3015
3016 /*
3017  * Attempt to lookup a namecache entry and return with a shared namecache
3018  * lock.  This operates non-blocking.  EWOULDBLOCK is returned if excl is
3019  * set or we are unable to lock.
3020  */
3021 int
3022 cache_nlookup_maybe_shared(struct nchandle *par_nch,
3023                            struct nlcomponent *nlc,
3024                            int excl, struct nchandle *res_nch)
3025 {
3026         struct namecache *ncp;
3027         struct nchash_head *nchpp;
3028         struct mount *mp;
3029         u_int32_t hash;
3030         globaldata_t gd;
3031
3032         /*
3033          * If exclusive requested or shared namecache locks are disabled,
3034          * return failure.
3035          */
3036         if (ncp_shared_lock_disable || excl)
3037                 return(EWOULDBLOCK);
3038
3039         gd = mycpu;
3040         mp = par_nch->mount;
3041
3042         /*
3043          * This is a good time to call it, no ncp's are locked by
3044          * the caller or us.
3045          */
3046         cache_hysteresis(1);
3047
3048         /*
3049          * Try to locate an existing entry
3050          */
3051         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3052         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3053         nchpp = NCHHASH(hash);
3054
3055         spin_lock_shared(&nchpp->spin);
3056
3057         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3058                 /*
3059                  * Break out if we find a matching entry.  Note that
3060                  * UNRESOLVED entries may match, but DESTROYED entries
3061                  * do not.
3062                  */
3063                 if (ncp->nc_parent == par_nch->ncp &&
3064                     ncp->nc_nlen == nlc->nlc_namelen &&
3065                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3066                     (ncp->nc_flag & NCF_DESTROYED) == 0
3067                 ) {
3068                         _cache_hold(ncp);
3069                         spin_unlock_shared(&nchpp->spin);
3070
3071                         if (_cache_lock_shared_special(ncp) == 0) {
3072                                 if (ncp->nc_parent == par_nch->ncp &&
3073                                     ncp->nc_nlen == nlc->nlc_namelen &&
3074                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3075                                          ncp->nc_nlen) == 0 &&
3076                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
3077                                     (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
3078                                     _cache_auto_unresolve_test(mp, ncp) == 0) {
3079                                         goto found;
3080                                 }
3081                                 _cache_unlock(ncp);
3082                         }
3083                         _cache_drop(ncp);
3084                         return(EWOULDBLOCK);
3085                 }
3086         }
3087
3088         /*
3089          * Failure
3090          */
3091         spin_unlock_shared(&nchpp->spin);
3092         return(EWOULDBLOCK);
3093
3094         /*
3095          * Success
3096          *
3097          * Note that nc_error might be non-zero (e.g ENOENT).
3098          */
3099 found:
3100         res_nch->mount = mp;
3101         res_nch->ncp = ncp;
3102         ++gd->gd_nchstats->ncs_goodhits;
3103         _cache_mntref(res_nch->mount);
3104
3105         KKASSERT(ncp->nc_error != EWOULDBLOCK);
3106         return(ncp->nc_error);
3107 }
3108
3109 /*
3110  * This is a non-blocking verison of cache_nlookup() used by
3111  * nfs_readdirplusrpc_uio().  It can fail for any reason and
3112  * will return nch.ncp == NULL in that case.
3113  */
3114 struct nchandle
3115 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3116 {
3117         struct nchandle nch;
3118         struct namecache *ncp;
3119         struct namecache *new_ncp;
3120         struct nchash_head *nchpp;
3121         struct mount *mp;
3122         u_int32_t hash;
3123         globaldata_t gd;
3124         int par_locked;
3125
3126         gd = mycpu;
3127         mp = par_nch->mount;
3128         par_locked = 0;
3129
3130         /*
3131          * Try to locate an existing entry
3132          */
3133         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3134         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3135         new_ncp = NULL;
3136         nchpp = NCHHASH(hash);
3137 restart:
3138         spin_lock(&nchpp->spin);
3139         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3140                 /*
3141                  * Break out if we find a matching entry.  Note that
3142                  * UNRESOLVED entries may match, but DESTROYED entries
3143                  * do not.
3144                  */
3145                 if (ncp->nc_parent == par_nch->ncp &&
3146                     ncp->nc_nlen == nlc->nlc_namelen &&
3147                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3148                     (ncp->nc_flag & NCF_DESTROYED) == 0
3149                 ) {
3150                         _cache_hold(ncp);
3151                         spin_unlock(&nchpp->spin);
3152                         if (par_locked) {
3153                                 _cache_unlock(par_nch->ncp);
3154                                 par_locked = 0;
3155                         }
3156                         if (_cache_lock_special(ncp) == 0) {
3157                                 if (ncp->nc_parent != par_nch->ncp ||
3158                                     ncp->nc_nlen != nlc->nlc_namelen ||
3159                                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3160                                     (ncp->nc_flag & NCF_DESTROYED)) {
3161                                         kprintf("cache_lookup_nonblock: "
3162                                                 "ncp-race %p %*.*s\n",
3163                                                 ncp,
3164                                                 nlc->nlc_namelen,
3165                                                 nlc->nlc_namelen,
3166                                                 nlc->nlc_nameptr);
3167                                         _cache_unlock(ncp);
3168                                         _cache_drop(ncp);
3169                                         goto failed;
3170                                 }
3171                                 _cache_auto_unresolve(mp, ncp);
3172                                 if (new_ncp) {
3173                                         _cache_free(new_ncp);
3174                                         new_ncp = NULL;
3175                                 }
3176                                 goto found;
3177                         }
3178                         _cache_drop(ncp);
3179                         goto failed;
3180                 }
3181         }
3182
3183         /*
3184          * We failed to locate an entry, create a new entry and add it to
3185          * the cache.  The parent ncp must also be locked so we
3186          * can link into it.
3187          *
3188          * We have to relookup after possibly blocking in kmalloc or
3189          * when locking par_nch.
3190          *
3191          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3192          *       mount case, in which case nc_name will be NULL.
3193          */
3194         if (new_ncp == NULL) {
3195                 spin_unlock(&nchpp->spin);
3196                 new_ncp = cache_alloc(nlc->nlc_namelen);
3197                 if (nlc->nlc_namelen) {
3198                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3199                               nlc->nlc_namelen);
3200                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3201                 }
3202                 goto restart;
3203         }
3204         if (par_locked == 0) {
3205                 spin_unlock(&nchpp->spin);
3206                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3207                         par_locked = 1;
3208                         goto restart;
3209                 }
3210                 goto failed;
3211         }
3212
3213         /*
3214          * Link to parent (requires another ref, the one already in new_ncp
3215          * is what we wil lreturn).
3216          *
3217          * WARNING!  We still hold the spinlock.  We have to set the hash
3218          *           table entry atomically.
3219          */
3220         ncp = new_ncp;
3221         ++ncp->nc_refs;
3222         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3223         spin_unlock(&nchpp->spin);
3224         _cache_unlock(par_nch->ncp);
3225         /* par_locked = 0 - not used */
3226 found:
3227         /*
3228          * stats and namecache size management
3229          */
3230         if (ncp->nc_flag & NCF_UNRESOLVED)
3231                 ++gd->gd_nchstats->ncs_miss;
3232         else if (ncp->nc_vp)
3233                 ++gd->gd_nchstats->ncs_goodhits;
3234         else
3235                 ++gd->gd_nchstats->ncs_neghits;
3236         nch.mount = mp;
3237         nch.ncp = ncp;
3238         _cache_mntref(nch.mount);
3239
3240         return(nch);
3241 failed:
3242         if (new_ncp) {
3243                 _cache_free(new_ncp);
3244                 new_ncp = NULL;
3245         }
3246         nch.mount = NULL;
3247         nch.ncp = NULL;
3248         return(nch);
3249 }
3250
3251 /*
3252  * This version is non-locking.  The caller must validate the result
3253  * for parent-to-child continuity.
3254  *
3255  * It can fail for any reason and will return nch.ncp == NULL in that case.
3256  */
3257 struct nchandle
3258 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc)
3259 {
3260         struct nchandle nch;
3261         struct namecache *ncp;
3262         struct nchash_head *nchpp;
3263         struct mount *mp;
3264         u_int32_t hash;
3265         globaldata_t gd;
3266
3267         gd = mycpu;
3268         mp = par_nch->mount;
3269
3270         /*
3271          * Try to locate an existing entry
3272          */
3273         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3274         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3275         nchpp = NCHHASH(hash);
3276
3277         spin_lock_shared(&nchpp->spin);
3278         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3279                 /*
3280                  * Break out if we find a matching entry.  Note that
3281                  * UNRESOLVED entries may match, but DESTROYED entries
3282                  * do not.
3283                  *
3284                  * Resolved NFS entries which have timed out fail so the
3285                  * caller can rerun with normal locking.
3286                  */
3287                 if (ncp->nc_parent == par_nch->ncp &&
3288                     ncp->nc_nlen == nlc->nlc_namelen &&
3289                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3290                     (ncp->nc_flag & NCF_DESTROYED) == 0
3291                 ) {
3292                         if (_cache_auto_unresolve_test(par_nch->mount, ncp))
3293                                 break;
3294                         _cache_hold(ncp);
3295                         spin_unlock_shared(&nchpp->spin);
3296                         goto found;
3297                 }
3298         }
3299         spin_unlock_shared(&nchpp->spin);
3300         nch.mount = NULL;
3301         nch.ncp = NULL;
3302         return nch;
3303 found:
3304         /*
3305          * stats and namecache size management
3306          */
3307         if (ncp->nc_flag & NCF_UNRESOLVED)
3308                 ++gd->gd_nchstats->ncs_miss;
3309         else if (ncp->nc_vp)
3310                 ++gd->gd_nchstats->ncs_goodhits;
3311         else
3312                 ++gd->gd_nchstats->ncs_neghits;
3313         nch.mount = mp;
3314         nch.ncp = ncp;
3315         _cache_mntref(nch.mount);
3316
3317         return(nch);
3318 }
3319
3320 /*
3321  * The namecache entry is marked as being used as a mount point.
3322  * Locate the mount if it is visible to the caller.  The DragonFly
3323  * mount system allows arbitrary loops in the topology and disentangles
3324  * those loops by matching against (mp, ncp) rather than just (ncp).
3325  * This means any given ncp can dive any number of mounts, depending
3326  * on the relative mount (e.g. nullfs) the caller is at in the topology.
3327  *
3328  * We use a very simple frontend cache to reduce SMP conflicts,
3329  * which we have to do because the mountlist scan needs an exclusive
3330  * lock around its ripout info list.  Not to mention that there might
3331  * be a lot of mounts.
3332  *
3333  * Because all mounts can potentially be accessed by all cpus, break the cpu's
3334  * down a bit to allow some contention rather than making the cache
3335  * excessively huge.
3336  *
3337  * The hash table is split into per-cpu areas, is 4-way set-associative.
3338  */
3339 struct findmount_info {
3340         struct mount *result;
3341         struct mount *nch_mount;
3342         struct namecache *nch_ncp;
3343 };
3344
3345 static __inline
3346 struct ncmount_cache *
3347 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp)
3348 {
3349         uint32_t hash;
3350
3351         hash = iscsi_crc32(&mp, sizeof(mp));
3352         hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash);
3353         hash ^= hash >> 16;
3354         hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1));
3355
3356         return (&ncmount_cache[hash]);
3357 }
3358
3359 static
3360 struct ncmount_cache *
3361 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3362 {
3363         struct ncmount_cache *ncc;
3364         struct ncmount_cache *best;
3365         int delta;
3366         int best_delta;
3367         int i;
3368
3369         ncc = ncmount_cache_lookup4(mp, ncp);
3370
3371         /*
3372          * NOTE: When checking for a ticks overflow implement a slop of
3373          *       2 ticks just to be safe, because ticks is accessed
3374          *       non-atomically one CPU can increment it while another
3375          *       is still using the old value.
3376          */
3377         if (ncc->ncp == ncp && ncc->mp == mp)   /* 0 */
3378                 return ncc;
3379         delta = (int)(ticks - ncc->ticks);      /* beware GCC opts */
3380         if (delta < -2)                         /* overflow reset */
3381                 ncc->ticks = ticks;
3382         best = ncc;
3383         best_delta = delta;
3384
3385         for (i = 1; i < NCMOUNT_SET; ++i) {     /* 1, 2, 3 */
3386                 ++ncc;
3387                 if (ncc->ncp == ncp && ncc->mp == mp)
3388                         return ncc;
3389                 delta = (int)(ticks - ncc->ticks);
3390                 if (delta < -2)
3391                         ncc->ticks = ticks;
3392                 if (delta > best_delta) {
3393                         best_delta = delta;
3394                         best = ncc;
3395                 }
3396         }
3397         return best;
3398 }
3399
3400 /*
3401  * pcpu-optimized mount search.  Locate the recursive mountpoint, avoid
3402  * doing an expensive mountlist_scan*() if possible.
3403  *
3404  * (mp, ncp) -> mountonpt.k
3405  *
3406  * Returns a referenced mount pointer or NULL
3407  *
3408  * General SMP operation uses a per-cpu umount_spin to interlock unmount
3409  * operations (that is, where the mp_target can be freed out from under us).
3410  *
3411  * Lookups use the ncc->updating counter to validate the contents in order
3412  * to avoid having to obtain the per cache-element spin-lock.  In addition,
3413  * the ticks field is only updated when it changes.  However, if our per-cpu
3414  * lock fails due to an unmount-in-progress, we fall-back to the
3415  * cache-element's spin-lock.
3416  */
3417 struct mount *
3418 cache_findmount(struct nchandle *nch)
3419 {
3420         struct findmount_info info;
3421         struct ncmount_cache *ncc;
3422         struct ncmount_cache ncc_copy;
3423         struct mount *target;
3424         struct pcpu_ncache *pcpu;
3425         struct spinlock *spinlk;
3426         int update;
3427
3428         pcpu = pcpu_ncache;
3429         if (ncmount_cache_enable == 0 || pcpu == NULL) {
3430                 ncc = NULL;
3431                 goto skip;
3432         }
3433         pcpu += mycpu->gd_cpuid;
3434
3435 again:
3436         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3437         if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3438 found:
3439                 /*
3440                  * This is a bit messy for now because we do not yet have
3441                  * safe disposal of mount structures.  We have to ref
3442                  * ncc->mp_target but the 'update' counter only tell us
3443                  * whether the cache has changed after the fact.
3444                  *
3445                  * For now get a per-cpu spinlock that will only contend
3446                  * against umount's.  This is the best path.  If it fails,
3447                  * instead of waiting on the umount we fall-back to a
3448                  * shared ncc->spin lock, which will generally only cost a
3449                  * cache ping-pong.
3450                  */
3451                 update = ncc->updating;
3452                 if (__predict_true(spin_trylock(&pcpu->umount_spin))) {
3453                         spinlk = &pcpu->umount_spin;
3454                 } else {
3455                         spinlk = &ncc->spin;
3456                         spin_lock_shared(spinlk);
3457                 }
3458                 if (update & 1) {               /* update in progress */
3459                         spin_unlock_any(spinlk);
3460                         goto skip;
3461                 }
3462                 ncc_copy = *ncc;
3463                 cpu_lfence();
3464                 if (ncc->updating != update) {  /* content changed */
3465                         spin_unlock_any(spinlk);
3466                         goto again;
3467                 }
3468                 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) {
3469                         spin_unlock_any(spinlk);
3470                         goto again;
3471                 }
3472                 if (ncc_copy.isneg == 0) {
3473                         target = ncc_copy.mp_target;
3474                         if (target->mnt_ncmounton.mount == nch->mount &&
3475                             target->mnt_ncmounton.ncp == nch->ncp) {
3476                                 /*
3477                                  * Cache hit (positive) (avoid dirtying
3478                                  * the cache line if possible)
3479                                  */
3480                                 if (ncc->ticks != (int)ticks)
3481                                         ncc->ticks = (int)ticks;
3482                                 _cache_mntref(target);
3483                         }
3484                 } else {
3485                         /*
3486                          * Cache hit (negative) (avoid dirtying
3487                          * the cache line if possible)
3488                          */
3489                         if (ncc->ticks != (int)ticks)
3490                                 ncc->ticks = (int)ticks;
3491                         target = NULL;
3492                 }
3493                 spin_unlock_any(spinlk);
3494
3495                 return target;
3496         }
3497 skip:
3498
3499         /*
3500          * Slow
3501          */
3502         info.result = NULL;
3503         info.nch_mount = nch->mount;
3504         info.nch_ncp = nch->ncp;
3505         mountlist_scan(cache_findmount_callback, &info,
3506                        MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK);
3507
3508         /*
3509          * To reduce multi-re-entry on the cache, relookup in the cache.
3510          * This can still race, obviously, but that's ok.
3511          */
3512         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3513         if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3514                 if (info.result)
3515                         atomic_add_int(&info.result->mnt_refs, -1);
3516                 goto found;
3517         }
3518
3519         /*
3520          * Cache the result.
3521          */
3522         if ((info.result == NULL ||
3523             (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) {
3524                 spin_lock(&ncc->spin);
3525                 atomic_add_int_nonlocked(&ncc->updating, 1);
3526                 cpu_sfence();
3527                 KKASSERT(ncc->updating & 1);
3528                 if (ncc->mp != nch->mount) {
3529                         if (ncc->mp)
3530                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3531                         atomic_add_int(&nch->mount->mnt_refs, 1);
3532                         ncc->mp = nch->mount;
3533                 }
3534                 ncc->ncp = nch->ncp;    /* ptr compares only, not refd*/
3535                 ncc->ticks = (int)ticks;
3536
3537                 if (info.result) {
3538                         ncc->isneg = 0;
3539                         if (ncc->mp_target != info.result) {
3540                                 if (ncc->mp_target)
3541                                         atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3542                                 ncc->mp_target = info.result;
3543                                 atomic_add_int(&info.result->mnt_refs, 1);
3544                         }
3545                 } else {
3546                         ncc->isneg = 1;
3547                         if (ncc->mp_target) {
3548                                 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3549                                 ncc->mp_target = NULL;
3550                         }
3551                 }
3552                 cpu_sfence();
3553                 atomic_add_int_nonlocked(&ncc->updating, 1);
3554                 spin_unlock(&ncc->spin);
3555         }
3556         return(info.result);
3557 }
3558
3559 static
3560 int
3561 cache_findmount_callback(struct mount *mp, void *data)
3562 {
3563         struct findmount_info *info = data;
3564
3565         /*
3566          * Check the mount's mounted-on point against the passed nch.
3567          */
3568         if (mp->mnt_ncmounton.mount == info->nch_mount &&
3569             mp->mnt_ncmounton.ncp == info->nch_ncp
3570         ) {
3571             info->result = mp;
3572             _cache_mntref(mp);
3573             return(-1);
3574         }
3575         return(0);
3576 }
3577
3578 void
3579 cache_dropmount(struct mount *mp)
3580 {
3581         _cache_mntrel(mp);
3582 }
3583
3584 /*
3585  * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive
3586  * or negative).
3587  *
3588  * A full scan is not required, but for now just do it anyway.
3589  */
3590 void
3591 cache_ismounting(struct mount *mp)
3592 {
3593         struct ncmount_cache *ncc;
3594         struct mount *ncc_mp;
3595         int i;
3596
3597         if (pcpu_ncache == NULL)
3598                 return;
3599
3600         for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
3601                 ncc = &ncmount_cache[i];
3602                 if (ncc->mp != mp->mnt_ncmounton.mount ||
3603                     ncc->ncp != mp->mnt_ncmounton.ncp) {
3604                         continue;
3605                 }
3606                 spin_lock(&ncc->spin);
3607                 atomic_add_int_nonlocked(&ncc->updating, 1);
3608                 cpu_sfence();
3609                 KKASSERT(ncc->updating & 1);
3610                 if (ncc->mp != mp->mnt_ncmounton.mount ||
3611                     ncc->ncp != mp->mnt_ncmounton.ncp) {
3612                         cpu_sfence();
3613                         ++ncc->updating;
3614                         spin_unlock(&ncc->spin);
3615                         continue;
3616                 }
3617                 ncc_mp = ncc->mp;
3618                 ncc->ncp = NULL;
3619                 ncc->mp = NULL;
3620                 if (ncc_mp)
3621                         atomic_add_int(&ncc_mp->mnt_refs, -1);
3622                 ncc_mp = ncc->mp_target;
3623                 ncc->mp_target = NULL;
3624                 if (ncc_mp)
3625                         atomic_add_int(&ncc_mp->mnt_refs, -1);
3626                 ncc->ticks = (int)ticks - hz * 120;
3627
3628                 cpu_sfence();
3629                 atomic_add_int_nonlocked(&ncc->updating, 1);
3630                 spin_unlock(&ncc->spin);
3631         }
3632
3633         /*
3634          * Pre-cache the mount point
3635          */
3636         ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount,
3637                                    mp->mnt_ncmounton.ncp);
3638
3639         spin_lock(&ncc->spin);
3640         atomic_add_int_nonlocked(&ncc->updating, 1);
3641         cpu_sfence();
3642         KKASSERT(ncc->updating & 1);
3643
3644         if (ncc->mp)
3645                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3646         atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1);
3647         ncc->mp = mp->mnt_ncmounton.mount;
3648         ncc->ncp = mp->mnt_ncmounton.ncp;       /* ptr compares only */
3649         ncc->ticks = (int)ticks;
3650
3651         ncc->isneg = 0;
3652         if (ncc->mp_target != mp) {
3653                 if (ncc->mp_target)
3654                         atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3655                 ncc->mp_target = mp;
3656                 atomic_add_int(&mp->mnt_refs, 1);
3657         }
3658         cpu_sfence();
3659         atomic_add_int_nonlocked(&ncc->updating, 1);
3660         spin_unlock(&ncc->spin);
3661 }
3662
3663 /*
3664  * Scrap any ncmount_cache entries related to mp.  Not only do we need to
3665  * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any
3666  * negative hits involving (mp, <any>).
3667  *
3668  * A full scan is required.
3669  */
3670 void
3671 cache_unmounting(struct mount *mp)
3672 {
3673         struct ncmount_cache *ncc;
3674         struct pcpu_ncache *pcpu;
3675         struct mount *ncc_mp;
3676         int i;
3677
3678         pcpu = pcpu_ncache;
3679         if (pcpu == NULL)
3680                 return;
3681
3682         for (i = 0; i < ncpus; ++i)
3683                 spin_lock(&pcpu[i].umount_spin);
3684
3685         for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
3686                 ncc = &ncmount_cache[i];
3687                 if (ncc->mp != mp && ncc->mp_target != mp)
3688                         continue;
3689                 spin_lock(&ncc->spin);
3690                 atomic_add_int_nonlocked(&ncc->updating, 1);
3691                 cpu_sfence();
3692
3693                 if (ncc->mp != mp && ncc->mp_target != mp) {
3694                         atomic_add_int_nonlocked(&ncc->updating, 1);
3695                         cpu_sfence();
3696                         spin_unlock(&ncc->spin);
3697                         continue;
3698                 }
3699                 ncc_mp = ncc->mp;
3700                 ncc->ncp = NULL;
3701                 ncc->mp = NULL;
3702                 if (ncc_mp)
3703                         atomic_add_int(&ncc_mp->mnt_refs, -1);
3704                 ncc_mp = ncc->mp_target;
3705                 ncc->mp_target = NULL;
3706                 if (ncc_mp)
3707                         atomic_add_int(&ncc_mp->mnt_refs, -1);
3708                 ncc->ticks = (int)ticks - hz * 120;
3709
3710                 cpu_sfence();
3711                 atomic_add_int_nonlocked(&ncc->updating, 1);
3712                 spin_unlock(&ncc->spin);
3713         }
3714
3715         for (i = 0; i < ncpus; ++i)
3716                 spin_unlock(&pcpu[i].umount_spin);
3717 }
3718
3719 /*
3720  * Resolve an unresolved namecache entry, generally by looking it up.
3721  * The passed ncp must be locked and refd.
3722  *
3723  * Theoretically since a vnode cannot be recycled while held, and since
3724  * the nc_parent chain holds its vnode as long as children exist, the
3725  * direct parent of the cache entry we are trying to resolve should
3726  * have a valid vnode.  If not then generate an error that we can
3727  * determine is related to a resolver bug.
3728  *
3729  * However, if a vnode was in the middle of a recyclement when the NCP
3730  * got locked, ncp->nc_vp might point to a vnode that is about to become
3731  * invalid.  cache_resolve() handles this case by unresolving the entry
3732  * and then re-resolving it.
3733  *
3734  * Note that successful resolution does not necessarily return an error
3735  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
3736  * will be returned.
3737  */
3738 int
3739 cache_resolve(struct nchandle *nch, struct ucred *cred)
3740 {
3741         struct namecache *par_tmp;
3742         struct namecache *par;
3743         struct namecache *ncp;
3744         struct nchandle nctmp;
3745         struct mount *mp;
3746         struct vnode *dvp;
3747         int error;
3748
3749         ncp = nch->ncp;
3750         mp = nch->mount;
3751         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
3752 restart:
3753         /*
3754          * If the ncp is already resolved we have nothing to do.  However,
3755          * we do want to guarentee that a usable vnode is returned when
3756          * a vnode is present, so make sure it hasn't been reclaimed.
3757          */
3758         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3759                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3760                         _cache_setunresolved(ncp);
3761                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3762                         return (ncp->nc_error);
3763         }
3764
3765         /*
3766          * If the ncp was destroyed it will never resolve again.  This
3767          * can basically only happen when someone is chdir'd into an
3768          * empty directory which is then rmdir'd.  We want to catch this
3769          * here and not dive the VFS because the VFS might actually
3770          * have a way to re-resolve the disconnected ncp, which will
3771          * result in inconsistencies in the cdir/nch for proc->p_fd.
3772          */
3773         if (ncp->nc_flag & NCF_DESTROYED)
3774                 return(EINVAL);
3775
3776         /*
3777          * Mount points need special handling because the parent does not
3778          * belong to the same filesystem as the ncp.
3779          */
3780         if (ncp == mp->mnt_ncmountpt.ncp)
3781                 return (cache_resolve_mp(mp));
3782
3783         /*
3784          * We expect an unbroken chain of ncps to at least the mount point,
3785          * and even all the way to root (but this code doesn't have to go
3786          * past the mount point).
3787          */
3788         if (ncp->nc_parent == NULL) {
3789                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3790                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3791                 ncp->nc_error = EXDEV;
3792                 return(ncp->nc_error);
3793         }
3794
3795         /*
3796          * The vp's of the parent directories in the chain are held via vhold()
3797          * due to the existance of the child, and should not disappear.
3798          * However, there are cases where they can disappear:
3799          *
3800          *      - due to filesystem I/O errors.
3801          *      - due to NFS being stupid about tracking the namespace and
3802          *        destroys the namespace for entire directories quite often.
3803          *      - due to forced unmounts.
3804          *      - due to an rmdir (parent will be marked DESTROYED)
3805          *
3806          * When this occurs we have to track the chain backwards and resolve
3807          * it, looping until the resolver catches up to the current node.  We
3808          * could recurse here but we might run ourselves out of kernel stack
3809          * so we do it in a more painful manner.  This situation really should
3810          * not occur all that often, or if it does not have to go back too
3811          * many nodes to resolve the ncp.
3812          */
3813         while ((dvp = cache_dvpref(ncp)) == NULL) {
3814                 /*
3815                  * This case can occur if a process is CD'd into a
3816                  * directory which is then rmdir'd.  If the parent is marked
3817                  * destroyed there is no point trying to resolve it.
3818                  */
3819                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3820                         return(ENOENT);
3821                 par = ncp->nc_parent;
3822                 _cache_hold(par);
3823                 _cache_lock(par);
3824                 while ((par_tmp = par->nc_parent) != NULL &&
3825                        par_tmp->nc_vp == NULL) {
3826                         _cache_hold(par_tmp);
3827                         _cache_lock(par_tmp);
3828                         _cache_put(par);
3829                         par = par_tmp;
3830                 }
3831                 if (par->nc_parent == NULL) {
3832                         kprintf("EXDEV case 2 %*.*s\n",
3833                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3834                         _cache_put(par);
3835                         return (EXDEV);
3836                 }
3837                 /*
3838                  * The parent is not set in stone, ref and lock it to prevent
3839                  * it from disappearing.  Also note that due to renames it
3840                  * is possible for our ncp to move and for par to no longer
3841                  * be one of its parents.  We resolve it anyway, the loop
3842                  * will handle any moves.
3843                  */
3844                 _cache_get(par);        /* additional hold/lock */
3845                 _cache_put(par);        /* from earlier hold/lock */
3846                 if (par == nch->mount->mnt_ncmountpt.ncp) {
3847                         cache_resolve_mp(nch->mount);
3848                 } else if ((dvp = cache_dvpref(par)) == NULL) {
3849                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
3850                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3851                         _cache_put(par);
3852                         continue;
3853                 } else {
3854                         if (par->nc_flag & NCF_UNRESOLVED) {
3855                                 nctmp.mount = mp;
3856                                 nctmp.ncp = par;
3857                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3858                         }
3859                         vrele(dvp);
3860                 }
3861                 if ((error = par->nc_error) != 0) {
3862                         if (par->nc_error != EAGAIN) {
3863                                 kprintf("EXDEV case 3 %*.*s error %d\n",
3864                                     par->nc_nlen, par->nc_nlen, par->nc_name,
3865                                     par->nc_error);
3866                                 _cache_put(par);
3867                                 return(error);
3868                         }
3869                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3870                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3871                 }
3872                 _cache_put(par);
3873                 /* loop */
3874         }
3875
3876         /*
3877          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3878          * ncp's and reattach them.  If this occurs the original ncp is marked
3879          * EAGAIN to force a relookup.
3880          *
3881          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3882          * ncp must already be resolved.
3883          */
3884         if (dvp) {
3885                 nctmp.mount = mp;
3886                 nctmp.ncp = ncp;
3887                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3888                 vrele(dvp);
3889         } else {
3890                 ncp->nc_error = EPERM;
3891         }
3892         if (ncp->nc_error == EAGAIN) {
3893                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3894                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3895                 goto restart;
3896         }
3897         return(ncp->nc_error);
3898 }
3899
3900 /*
3901  * Resolve the ncp associated with a mount point.  Such ncp's almost always
3902  * remain resolved and this routine is rarely called.  NFS MPs tends to force
3903  * re-resolution more often due to its mac-truck-smash-the-namecache
3904  * method of tracking namespace changes.
3905  *
3906  * The semantics for this call is that the passed ncp must be locked on
3907  * entry and will be locked on return.  However, if we actually have to
3908  * resolve the mount point we temporarily unlock the entry in order to
3909  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
3910  * the unlock we have to recheck the flags after we relock.
3911  */
3912 static int
3913 cache_resolve_mp(struct mount *mp)
3914 {
3915         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
3916         struct vnode *vp;
3917         int error;
3918
3919         KKASSERT(mp != NULL);
3920
3921         /*
3922          * If the ncp is already resolved we have nothing to do.  However,
3923          * we do want to guarentee that a usable vnode is returned when
3924          * a vnode is present, so make sure it hasn't been reclaimed.
3925          */
3926         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3927                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3928                         _cache_setunresolved(ncp);
3929         }
3930
3931         if (ncp->nc_flag & NCF_UNRESOLVED) {
3932                 _cache_unlock(ncp);
3933                 while (vfs_busy(mp, 0))
3934                         ;
3935                 error = VFS_ROOT(mp, &vp);
3936                 _cache_lock(ncp);
3937
3938                 /*
3939                  * recheck the ncp state after relocking.
3940                  */
3941                 if (ncp->nc_flag & NCF_UNRESOLVED) {
3942                         ncp->nc_error = error;
3943                         if (error == 0) {
3944                                 _cache_setvp(mp, ncp, vp);
3945                                 vput(vp);
3946                         } else {
3947                                 kprintf("[diagnostic] cache_resolve_mp: failed"
3948                                         " to resolve mount %p err=%d ncp=%p\n",
3949                                         mp, error, ncp);
3950                                 _cache_setvp(mp, ncp, NULL);
3951                         }
3952                 } else if (error == 0) {
3953                         vput(vp);
3954                 }
3955                 vfs_unbusy(mp);
3956         }
3957         return(ncp->nc_error);
3958 }
3959
3960 /*
3961  * Clean out negative cache entries when too many have accumulated.
3962  */
3963 static void
3964 _cache_cleanneg(long count)
3965 {
3966         struct pcpu_ncache *pn;
3967         struct namecache *ncp;
3968         static uint32_t neg_rover;
3969         uint32_t n;
3970         long vnegs;
3971
3972         n = neg_rover++;        /* SMP heuristical, race ok */
3973         cpu_ccfence();
3974         n = n % (uint32_t)ncpus;
3975
3976         /*
3977          * Normalize vfscache_negs and count.  count is sometimes based
3978          * on vfscache_negs.  vfscache_negs is heuristical and can sometimes
3979          * have crazy values.
3980          */
3981         vnegs = vfscache_negs;
3982         cpu_ccfence();
3983         if (vnegs <= MINNEG)
3984                 vnegs = MINNEG;
3985         if (count < 1)
3986                 count = 1;
3987
3988         pn = &pcpu_ncache[n];
3989         spin_lock(&pn->neg_spin);
3990         count = pn->neg_count * count / vnegs + 1;
3991         spin_unlock(&pn->neg_spin);
3992
3993         /*
3994          * Attempt to clean out the specified number of negative cache
3995          * entries.
3996          */
3997         while (count > 0) {
3998                 spin_lock(&pn->neg_spin);
3999                 ncp = TAILQ_FIRST(&pn->neg_list);
4000                 if (ncp == NULL) {
4001                         spin_unlock(&pn->neg_spin);
4002                         break;
4003                 }
4004                 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
4005                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
4006                 _cache_hold(ncp);
4007                 spin_unlock(&pn->neg_spin);
4008
4009                 /*
4010                  * This can race, so we must re-check that the ncp
4011                  * is on the ncneg.list after successfully locking it.
4012                  */
4013                 if (_cache_lock_special(ncp) == 0) {
4014                         if (ncp->nc_vp == NULL &&
4015                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4016                                 cache_zap(ncp);
4017                         } else {
4018                                 _cache_unlock(ncp);
4019                                 _cache_drop(ncp);
4020                         }
4021                 } else {
4022                         _cache_drop(ncp);
4023                 }
4024                 --count;
4025         }
4026 }
4027
4028 /*
4029  * Clean out positive cache entries when too many have accumulated.
4030  */
4031 static void
4032 _cache_cleanpos(long count)
4033 {
4034         static volatile int rover;
4035         struct nchash_head *nchpp;
4036         struct namecache *ncp;
4037         int rover_copy;
4038
4039         /*
4040          * Attempt to clean out the specified number of negative cache
4041          * entries.
4042          */
4043         while (count > 0) {
4044                 rover_copy = ++rover;   /* MPSAFEENOUGH */
4045                 cpu_ccfence();
4046                 nchpp = NCHHASH(rover_copy);
4047
4048                 if (TAILQ_FIRST(&nchpp->list) == NULL) {
4049                         --count;
4050                         continue;
4051                 }
4052
4053                 /*
4054                  * Cycle ncp on list, ignore and do not move DUMMY
4055                  * ncps.  These are temporary list iterators.
4056                  *
4057                  * We must cycle the ncp to the end of the list to
4058                  * ensure that all ncp's have an equal chance of
4059                  * being removed.
4060                  */
4061                 spin_lock(&nchpp->spin);
4062                 ncp = TAILQ_FIRST(&nchpp->list);
4063                 while (ncp && (ncp->nc_flag & NCF_DUMMY))
4064                         ncp = TAILQ_NEXT(ncp, nc_hash);
4065                 if (ncp) {
4066                         TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
4067                         TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
4068                         _cache_hold(ncp);
4069                 }
4070                 spin_unlock(&nchpp->spin);
4071
4072                 if (ncp) {
4073                         if (_cache_lock_special(ncp) == 0) {
4074                                 cache_zap(ncp);
4075                         } else {
4076                                 _cache_drop(ncp);
4077                         }
4078                 }
4079                 --count;
4080         }
4081 }
4082
4083 /*
4084  * This is a kitchen sink function to clean out ncps which we
4085  * tried to zap from cache_drop() but failed because we were
4086  * unable to acquire the parent lock.
4087  *
4088  * Such entries can also be removed via cache_inval_vp(), such
4089  * as when unmounting.
4090  */
4091 static void
4092 _cache_cleandefered(void)
4093 {
4094         struct nchash_head *nchpp;
4095         struct namecache *ncp;
4096         struct namecache dummy;
4097         int i;
4098
4099         /*
4100          * Create a list iterator.  DUMMY indicates that this is a list
4101          * iterator, DESTROYED prevents matches by lookup functions.
4102          */
4103         numdefered = 0;
4104         pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
4105         bzero(&dummy, sizeof(dummy));
4106         dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
4107         dummy.nc_refs = 1;
4108
4109         for (i = 0; i <= nchash; ++i) {
4110                 nchpp = &nchashtbl[i];
4111
4112                 spin_lock(&nchpp->spin);
4113                 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
4114                 ncp = &dummy;
4115                 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
4116                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
4117                                 continue;
4118                         TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4119                         TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash);
4120                         _cache_hold(ncp);
4121                         spin_unlock(&nchpp->spin);
4122                         if (_cache_lock_nonblock(ncp) == 0) {
4123                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
4124                                 _cache_unlock(ncp);
4125                         }
4126                         _cache_drop(ncp);
4127                         spin_lock(&nchpp->spin);
4128                         ncp = &dummy;
4129                 }
4130                 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4131                 spin_unlock(&nchpp->spin);
4132         }
4133 }
4134
4135 /*
4136  * Name cache initialization, from vfsinit() when we are booting
4137  */
4138 void
4139 nchinit(void)
4140 {
4141         struct pcpu_ncache *pn;
4142         globaldata_t gd;
4143         int i;
4144
4145         /*
4146          * Per-cpu accounting and negative hit list
4147          */
4148         pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
4149                               M_VFSCACHE, M_WAITOK|M_ZERO);
4150         for (i = 0; i < ncpus; ++i) {
4151                 pn = &pcpu_ncache[i];
4152                 TAILQ_INIT(&pn->neg_list);
4153                 spin_init(&pn->neg_spin, "ncneg");
4154                 spin_init(&pn->umount_spin, "ncumm");
4155         }
4156
4157         /*
4158          * Initialise per-cpu namecache effectiveness statistics.
4159          */
4160         for (i = 0; i < ncpus; ++i) {
4161                 gd = globaldata_find(i);
4162                 gd->gd_nchstats = &nchstats[i];
4163         }
4164
4165         /*
4166          * Create a generous namecache hash table
4167          */
4168         nchashtbl = hashinit_ext(vfs_inodehashsize(),
4169                                  sizeof(struct nchash_head),
4170                                  M_VFSCACHE, &nchash);
4171         for (i = 0; i <= (int)nchash; ++i) {
4172                 TAILQ_INIT(&nchashtbl[i].list);
4173                 spin_init(&nchashtbl[i].spin, "nchinit_hash");
4174         }
4175         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
4176                 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
4177         nclockwarn = 5 * hz;
4178 }
4179
4180 /*
4181  * Called from start_init() to bootstrap the root filesystem.  Returns
4182  * a referenced, unlocked namecache record.
4183  */
4184 void
4185 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4186 {
4187         nch->ncp = cache_alloc(0);
4188         nch->mount = mp;
4189         _cache_mntref(mp);
4190         if (vp)
4191                 _cache_setvp(nch->mount, nch->ncp, vp);
4192 }
4193
4194 /*
4195  * vfs_cache_setroot()
4196  *
4197  *      Create an association between the root of our namecache and
4198  *      the root vnode.  This routine may be called several times during
4199  *      booting.
4200  *
4201  *      If the caller intends to save the returned namecache pointer somewhere
4202  *      it must cache_hold() it.
4203  */
4204 void
4205 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4206 {
4207         struct vnode *ovp;
4208         struct nchandle onch;
4209
4210         ovp = rootvnode;
4211         onch = rootnch;
4212         rootvnode = nvp;
4213         if (nch)
4214                 rootnch = *nch;
4215         else
4216                 cache_zero(&rootnch);
4217         if (ovp)
4218                 vrele(ovp);
4219         if (onch.ncp)
4220                 cache_drop(&onch);
4221 }
4222
4223 /*
4224  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
4225  * topology and is being removed as quickly as possible.  The new VOP_N*()
4226  * API calls are required to make specific adjustments using the supplied
4227  * ncp pointers rather then just bogusly purging random vnodes.
4228  *
4229  * Invalidate all namecache entries to a particular vnode as well as
4230  * any direct children of that vnode in the namecache.  This is a
4231  * 'catch all' purge used by filesystems that do not know any better.
4232  *
4233  * Note that the linkage between the vnode and its namecache entries will
4234  * be removed, but the namecache entries themselves might stay put due to
4235  * active references from elsewhere in the system or due to the existance of
4236  * the children.   The namecache topology is left intact even if we do not
4237  * know what the vnode association is.  Such entries will be marked
4238  * NCF_UNRESOLVED.
4239  */
4240 void
4241 cache_purge(struct vnode *vp)
4242 {
4243         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
4244 }
4245
4246 static int disablecwd;
4247 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
4248     "Disable getcwd");
4249
4250 static u_long numcwdcalls;
4251 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
4252     "Number of current directory resolution calls");
4253 static u_long numcwdfailnf;
4254 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
4255     "Number of current directory failures due to lack of file");
4256 static u_long numcwdfailsz;
4257 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
4258     "Number of current directory failures due to large result");
4259 static u_long numcwdfound;
4260 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
4261     "Number of current directory resolution successes");
4262
4263 /*
4264  * MPALMOSTSAFE
4265  */
4266 int
4267 sys___getcwd(struct __getcwd_args *uap)
4268 {
4269         u_int buflen;
4270         int error;
4271         char *buf;
4272         char *bp;
4273
4274         if (disablecwd)
4275                 return (ENODEV);
4276
4277         buflen = uap->buflen;
4278         if (buflen == 0)
4279                 return (EINVAL);
4280         if (buflen > MAXPATHLEN)
4281                 buflen = MAXPATHLEN;
4282
4283         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
4284         bp = kern_getcwd(buf, buflen, &error);
4285         if (error == 0)
4286                 error = copyout(bp, uap->buf, strlen(bp) + 1);
4287         kfree(buf, M_TEMP);
4288         return (error);
4289 }
4290
4291 char *
4292 kern_getcwd(char *buf, size_t buflen, int *error)
4293 {
4294         struct proc *p = curproc;
4295         char *bp;
4296         int i, slash_prefixed;
4297         struct filedesc *fdp;
4298         struct nchandle nch;
4299         struct namecache *ncp;
4300
4301         numcwdcalls++;
4302         bp = buf;
4303         bp += buflen - 1;
4304         *bp = '\0';
4305         fdp = p->p_fd;
4306         slash_prefixed = 0;
4307
4308         nch = fdp->fd_ncdir;
4309         ncp = nch.ncp;
4310         if (ncp)
4311                 _cache_hold(ncp);
4312
4313         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
4314                nch.mount != fdp->fd_nrdir.mount)
4315         ) {
4316                 /*
4317                  * While traversing upwards if we encounter the root
4318                  * of the current mount we have to skip to the mount point
4319                  * in the underlying filesystem.
4320                  */
4321                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
4322                         nch = nch.mount->mnt_ncmounton;
4323                         _cache_drop(ncp);
4324                         ncp = nch.ncp;
4325                         if (ncp)
4326                                 _cache_hold(ncp);
4327                         continue;
4328                 }
4329
4330                 /*
4331                  * Prepend the path segment
4332                  */
4333                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4334                         if (bp == buf) {
4335                                 numcwdfailsz++;
4336                                 *error = ERANGE;
4337                                 bp = NULL;
4338                                 goto done;
4339                         }
4340                         *--bp = ncp->nc_name[i];
4341                 }
4342                 if (bp == buf) {
4343                         numcwdfailsz++;
4344                         *error = ERANGE;
4345                         bp = NULL;
4346                         goto done;
4347                 }
4348                 *--bp = '/';
4349                 slash_prefixed = 1;
4350
4351                 /*
4352                  * Go up a directory.  This isn't a mount point so we don't
4353                  * have to check again.
4354                  */
4355                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4356                         if (ncp_shared_lock_disable)
4357                                 _cache_lock(ncp);
4358                         else
4359                                 _cache_lock_shared(ncp);
4360                         if (nch.ncp != ncp->nc_parent) {
4361                                 _cache_unlock(ncp);
4362                                 continue;
4363                         }
4364                         _cache_hold(nch.ncp);
4365                         _cache_unlock(ncp);
4366                         break;
4367                 }
4368                 _cache_drop(ncp);
4369                 ncp = nch.ncp;
4370         }
4371         if (ncp == NULL) {
4372                 numcwdfailnf++;
4373                 *error = ENOENT;
4374                 bp = NULL;
4375                 goto done;
4376         }
4377         if (!slash_prefixed) {
4378                 if (bp == buf) {
4379                         numcwdfailsz++;
4380                         *error = ERANGE;
4381                         bp = NULL;
4382                         goto done;
4383                 }
4384                 *--bp = '/';
4385         }
4386         numcwdfound++;
4387         *error = 0;
4388 done:
4389         if (ncp)
4390                 _cache_drop(ncp);
4391         return (bp);
4392 }
4393
4394 /*
4395  * Thus begins the fullpath magic.
4396  *
4397  * The passed nchp is referenced but not locked.
4398  */
4399 static int disablefullpath;
4400 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
4401     &disablefullpath, 0,
4402     "Disable fullpath lookups");
4403
4404 int
4405 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
4406                char **retbuf, char **freebuf, int guess)
4407 {
4408         struct nchandle fd_nrdir;
4409         struct nchandle nch;
4410         struct namecache *ncp;
4411         struct mount *mp, *new_mp;
4412         char *bp, *buf;
4413         int slash_prefixed;
4414         int error = 0;
4415         int i;
4416
4417         *retbuf = NULL;
4418         *freebuf = NULL;
4419
4420         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
4421         bp = buf + MAXPATHLEN - 1;
4422         *bp = '\0';
4423         if (nchbase)
4424                 fd_nrdir = *nchbase;
4425         else if (p != NULL)
4426                 fd_nrdir = p->p_fd->fd_nrdir;
4427         else
4428                 fd_nrdir = rootnch;
4429         slash_prefixed = 0;
4430         nch = *nchp;
4431         ncp = nch.ncp;
4432         if (ncp)
4433                 _cache_hold(ncp);
4434         mp = nch.mount;
4435
4436         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
4437                 new_mp = NULL;
4438
4439                 /*
4440                  * If we are asked to guess the upwards path, we do so whenever
4441                  * we encounter an ncp marked as a mountpoint. We try to find
4442                  * the actual mountpoint by finding the mountpoint with this
4443                  * ncp.
4444                  */
4445                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
4446                         new_mp = mount_get_by_nc(ncp);
4447                 }
4448                 /*
4449                  * While traversing upwards if we encounter the root
4450                  * of the current mount we have to skip to the mount point.
4451                  */
4452                 if (ncp == mp->mnt_ncmountpt.ncp) {
4453                         new_mp = mp;
4454                 }
4455                 if (new_mp) {
4456                         nch = new_mp->mnt_ncmounton;
4457                         _cache_drop(ncp);
4458                         ncp = nch.ncp;
4459                         if (ncp)
4460                                 _cache_hold(ncp);
4461                         mp = nch.mount;
4462                         continue;
4463                 }
4464
4465                 /*
4466                  * Prepend the path segment
4467                  */
4468                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4469                         if (bp == buf) {
4470                                 kfree(buf, M_TEMP);
4471                                 error = ENOMEM;
4472                                 goto done;
4473                         }
4474                         *--bp = ncp->nc_name[i];
4475                 }
4476                 if (bp == buf) {
4477                         kfree(buf, M_TEMP);
4478                         error = ENOMEM;
4479                         goto done;
4480                 }
4481                 *--bp = '/';
4482                 slash_prefixed = 1;
4483
4484                 /*
4485                  * Go up a directory.  This isn't a mount point so we don't
4486                  * have to check again.
4487                  *
4488                  * We can only safely access nc_parent with ncp held locked.
4489                  */
4490                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4491                         _cache_lock_shared(ncp);
4492                         if (nch.ncp != ncp->nc_parent) {
4493                                 _cache_unlock(ncp);
4494                                 continue;
4495                         }
4496                         _cache_hold(nch.ncp);
4497                         _cache_unlock(ncp);
4498                         break;
4499                 }
4500                 _cache_drop(ncp);
4501                 ncp = nch.ncp;
4502         }
4503         if (ncp == NULL) {
4504                 kfree(buf, M_TEMP);
4505                 error = ENOENT;
4506                 goto done;
4507         }
4508
4509         if (!slash_prefixed) {
4510                 if (bp == buf) {
4511                         kfree(buf, M_TEMP);
4512                         error = ENOMEM;
4513                         goto done;
4514                 }
4515                 *--bp = '/';
4516         }
4517         *retbuf = bp;
4518         *freebuf = buf;
4519         error = 0;
4520 done:
4521         if (ncp)
4522                 _cache_drop(ncp);
4523         return(error);
4524 }
4525
4526 int
4527 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
4528             char **freebuf, int guess)
4529 {
4530         struct namecache *ncp;
4531         struct nchandle nch;
4532         int error;
4533
4534         *freebuf = NULL;
4535         if (disablefullpath)
4536                 return (ENODEV);
4537
4538         if (p == NULL)
4539                 return (EINVAL);
4540
4541         /* vn is NULL, client wants us to use p->p_textvp */
4542         if (vn == NULL) {
4543                 if ((vn = p->p_textvp) == NULL)
4544                         return (EINVAL);
4545         }
4546         spin_lock_shared(&vn->v_spin);
4547         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
4548                 if (ncp->nc_nlen)
4549                         break;
4550         }
4551         if (ncp == NULL) {
4552                 spin_unlock_shared(&vn->v_spin);
4553                 return (EINVAL);
4554         }
4555         _cache_hold(ncp);
4556         spin_unlock_shared(&vn->v_spin);
4557
4558         nch.ncp = ncp;
4559         nch.mount = vn->v_mount;
4560         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
4561         _cache_drop(ncp);
4562         return (error);
4563 }
4564
4565 void
4566 vfscache_rollup_cpu(struct globaldata *gd)
4567 {
4568         struct pcpu_ncache *pn;
4569         long count;
4570
4571         if (pcpu_ncache == NULL)
4572                 return;
4573         pn = &pcpu_ncache[gd->gd_cpuid];
4574
4575         if (pn->vfscache_count) {
4576                 count = atomic_swap_long(&pn->vfscache_count, 0);
4577                 atomic_add_long(&vfscache_count, count);
4578         }
4579         if (pn->vfscache_leafs) {
4580                 count = atomic_swap_long(&pn->vfscache_leafs, 0);
4581                 atomic_add_long(&vfscache_leafs, count);
4582         }
4583         if (pn->vfscache_negs) {
4584                 count = atomic_swap_long(&pn->vfscache_negs, 0);
4585                 atomic_add_long(&vfscache_negs, count);
4586         }
4587         if (pn->numdefered) {
4588                 count = atomic_swap_long(&pn->numdefered, 0);
4589                 atomic_add_long(&numdefered, count);
4590         }
4591 }